├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── publish-to-pypi.yml │ ├── pytest.yml │ └── sphinx.yml ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── create_uml.sh ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── modules.rst ├── pyregexp.rst └── uml │ ├── classes.engine.png │ ├── classes.lexer.png │ ├── classes.match.png │ ├── classes.pyregexp.png │ ├── classes.pyrser.png │ ├── classes.re_ast.png │ ├── classes.tokens.png │ └── packages.pyregexp.png ├── grammar.txt ├── print_coverage.sh ├── pyregexp ├── __init__.py ├── engine.py ├── lexer.py ├── match.py ├── pyrser.py ├── re_ast.py └── tokens.py ├── pytest.ini ├── regex.py ├── regex.sh ├── requirements.txt ├── setup.cfg ├── setup.py └── test ├── __init__.py ├── test_engine.py ├── test_engine2.py ├── test_lexer.py ├── test_parser.py ├── test_re_ast.py └── test_tokens.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.9 15 | - name: Install pypa/build 16 | run: >- 17 | python -m 18 | pip install 19 | build 20 | --user 21 | - name: Build a binary wheel and a source tarball 22 | run: >- 23 | python -m 24 | build 25 | --sdist 26 | --wheel 27 | --outdir dist/ 28 | . 29 | - name: Publish distribution 📦 to Test PyPI 30 | uses: pypa/gh-action-pypi-publish@master 31 | with: 32 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 33 | repository_url: https://test.pypi.org/legacy/ 34 | skip_existing: true 35 | - name: Publish distribution 📦 to PyPI 36 | if: startsWith(github.ref, 'refs/tags') 37 | uses: pypa/gh-action-pypi-publish@master 38 | with: 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | name: Run Pytest 2 | 3 | # Controls when the workflow will run 4 | on: 5 | # Triggers the workflow on push or pull request events but only for the master branch 6 | push: 7 | branches: [master] 8 | pull_request: 9 | branches: [master] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 15 | jobs: 16 | # This workflow contains a single job called "build" 17 | build: 18 | # The type of runner that the job will run on 19 | runs-on: ubuntu-latest 20 | 21 | # Steps represent a sequence of tasks that will be executed as part of the job 22 | steps: 23 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 24 | - uses: actions/checkout@v2 25 | - name: Set up Python 3.9 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: 3.9 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install flake8 pytest 33 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 34 | 35 | - name: Test with pytest 36 | run: | 37 | if [ -d tests ] || [ -d test ]; then python -m pytest; fi 38 | -------------------------------------------------------------------------------- /.github/workflows/sphinx.yml: -------------------------------------------------------------------------------- 1 | name: Pages 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/setup-python@v2 11 | - uses: actions/checkout@master 12 | with: 13 | fetch-depth: 0 # otherwise, you will failed to push refs to dest repo 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install furo 18 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 19 | - name: Build and Commit 20 | uses: sphinx-notes/pages@v2 21 | - name: Push changes 22 | uses: ad-m/github-push-action@master 23 | with: 24 | github_token: ${{ secrets.GITHUB_TOKEN }} 25 | branch: gh-pages 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | 3 | __pycache__/ 4 | *.py[cod] 5 | 6 | # Distribution / packaging 7 | bin/ 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | eggs/ 12 | lib/ 13 | lib64/ 14 | parts/ 15 | sdist/ 16 | var/ 17 | *.egg-info/ 18 | .installed.cfg 19 | *.egg 20 | 21 | # Installer logs 22 | pip-log.txt 23 | pip-delete-this-directory.txt 24 | 25 | # Unit test / coverage reports 26 | .tox/ 27 | .coverage 28 | .cache 29 | nosetests.xml 30 | coverage.xml 31 | 32 | .vscode 33 | .pytest_cache/ 34 | .coverage 35 | 36 | # Sphinx documentation 37 | docs/_build/* 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021-2022 Lorenzo Felletti 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyregex(p) 2 | 3 | ## What is it 4 | 5 | Pyregex(p) is a backtracking Regex Engine complete with all major regular-expressions' features. 6 | 7 | It is composed of a Lexer, a Parser (a TDRD parser) and finally the Engine. 8 | 9 | Features implemented includes: 10 | | Feature | Syntax | 11 | |-|-| 12 | | match start | ^... | 13 | | match end | ...$ | 14 | | escaping | \\ | 15 | | grouping | (...) | 16 | | named group | (?\...) | 17 | | non-capturing group | (?:...) | 18 | | alternative | a\|b | 19 | | wildcard | . | 20 | | space | \s | 21 | | quantifiers | ? \* + | 22 | | curly brace quantification | {exact} {min,max} {,max} {min,} | 23 | | range element | [^a-zA-Z059] | 24 | 25 | 26 | ## Play with the engine: 27 | 28 | ```Python 29 | from pyregexp.engine import RegexEngine 30 | 31 | reng = RegexEngine() 32 | 33 | reng.match('^my_(beautiful_)+regex', '^my_beautiful_beautiful_beautiful_regex') 34 | ``` 35 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/__init__.py -------------------------------------------------------------------------------- /create_uml.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pyreverse -o re_ast.png -A -S -mn -f ALL ./pyregexp/re_ast.py 3 | mv classes.re_ast.png docs/uml 4 | 5 | pyreverse -o engine.png -A -S -mn -f ALL ./pyregexp/engine.py 6 | mv classes.engine.png docs/uml 7 | 8 | pyreverse -o lexer.png -A -S -mn -f ALL ./pyregexp/lexer.py 9 | mv classes.lexer.png docs/uml 10 | 11 | pyreverse -o match.png -A -S -mn -f ALL ./pyregexp/match.py 12 | mv classes.match.png docs/uml 13 | 14 | pyreverse -o pyrser.png -A -S -mn -f ALL ./pyregexp/pyrser.py 15 | mv classes.pyrser.png docs/uml 16 | 17 | pyreverse -o tokens.png -A -S -mn -f ALL ./pyregexp/tokens.py 18 | mv classes.tokens.png docs/uml 19 | 20 | pyreverse -o pyregexp.png -A -S -mn ./pyregexp/* 21 | mv classes.pyregexp.png docs/uml 22 | mv packages.pyregexp.png docs/uml 23 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('..')) 16 | 17 | import pyregexp 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pyregexp' 21 | copyright = '2022, Lorenzo Felletti' 22 | author = 'Lorenzo Felletti' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | 'sphinx.ext.autodoc', 32 | 'sphinx.ext.napoleon', 33 | 'sphinx.ext.viewcode', 34 | 'sphinx.ext.githubpages', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 44 | 45 | 46 | # -- Options for HTML output ------------------------------------------------- 47 | 48 | # The theme to use for HTML and HTML Help pages. See the documentation for 49 | # a list of builtin themes. 50 | # 51 | html_theme = 'furo' 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 56 | html_static_path = ['_static'] 57 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. pyregexp documentation master file, created by 2 | sphinx-quickstart on Wed Mar 23 17:24:22 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to pyregexp's documentation! 7 | ==================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | Indices and tables 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | pyregexp 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | pyregexp 8 | -------------------------------------------------------------------------------- /docs/pyregexp.rst: -------------------------------------------------------------------------------- 1 | pyregexp package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyregexp.engine module 8 | ---------------------- 9 | 10 | .. figure:: /uml/classes.engine.png 11 | :alt: engine.py uml diagram 12 | :width: 100% 13 | 14 | *UML of all pyregexp.engine classes.* 15 | 16 | .. automodule:: pyregexp.engine 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | 21 | pyregexp.lexer module 22 | --------------------- 23 | 24 | .. figure:: /uml/classes.lexer.png 25 | :alt: lexer.py uml diagram 26 | :width: 25% 27 | 28 | *UML of all pyregexp.lexer classes.* 29 | 30 | .. automodule:: pyregexp.lexer 31 | :members: 32 | :undoc-members: 33 | :show-inheritance: 34 | 35 | pyregexp.match module 36 | --------------------- 37 | 38 | .. figure:: /uml/classes.match.png 39 | :alt: match.py uml diagram 40 | :width: 70% 41 | 42 | *UML of all pyregexp.match classes.* 43 | 44 | .. automodule:: pyregexp.match 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | pyregexp.pyrser module 50 | ---------------------- 51 | 52 | .. figure:: /uml/classes.pyrser.png 53 | :alt: pyrser.py uml diagram 54 | :width: 20% 55 | 56 | *UML of all pyregexp.pyrser classes.* 57 | 58 | .. automodule:: pyregexp.pyrser 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | pyregexp.re\_ast module 64 | ----------------------- 65 | 66 | .. figure:: /uml/classes.re_ast.png 67 | :alt: re_ast.py uml diagram 68 | :width: 100% 69 | 70 | *UML of all pyregexp.re_ast classes.* 71 | 72 | .. automodule:: pyregexp.re_ast 73 | :members: 74 | :undoc-members: 75 | :show-inheritance: 76 | 77 | pyregexp.tokens module 78 | ---------------------- 79 | 80 | .. figure:: /uml/classes.tokens.png 81 | :alt: tokens.py uml diagram 82 | :width: 100% 83 | 84 | *UML of all pyregexp.tokens classes.* 85 | 86 | .. automodule:: pyregexp.tokens 87 | :members: 88 | :undoc-members: 89 | :show-inheritance: 90 | 91 | Module contents 92 | --------------- 93 | 94 | .. figure:: /uml/classes.pyregexp.png 95 | :alt: pyregexp uml diagram 96 | :width: 100% 97 | 98 | *UML of all pyregexp classes.* 99 | 100 | .. figure:: /uml/packages.pyregexp.png 101 | :alt: packages uml diagram 102 | :scale: 70% 103 | 104 | *UML of pyregexp packages.* 105 | 106 | .. automodule:: pyregexp 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | -------------------------------------------------------------------------------- /docs/uml/classes.engine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.engine.png -------------------------------------------------------------------------------- /docs/uml/classes.lexer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.lexer.png -------------------------------------------------------------------------------- /docs/uml/classes.match.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.match.png -------------------------------------------------------------------------------- /docs/uml/classes.pyregexp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.pyregexp.png -------------------------------------------------------------------------------- /docs/uml/classes.pyrser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.pyrser.png -------------------------------------------------------------------------------- /docs/uml/classes.re_ast.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.re_ast.png -------------------------------------------------------------------------------- /docs/uml/classes.tokens.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/classes.tokens.png -------------------------------------------------------------------------------- /docs/uml/packages.pyregexp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/docs/uml/packages.pyregexp.png -------------------------------------------------------------------------------- /grammar.txt: -------------------------------------------------------------------------------- 1 | REGEX GRAMMAR 2 | 3 | REGEX GRAMMAR recognized: 4 | RE ::= RE_SEQ 5 | RE_SEQ ::= '^'? GROUP '$'? ('|' RE_SEQ)? 6 | GROUP ::= (RANGE_EL QTIFIER?)+ 7 | RANGE_EL ::= EL | '[' '^'? INNER_EL ']' 8 | EL ::= '\\'? (ch | SPECIAL) | '(' ('?:')? RE_SEQ ')' 9 | 10 | QTIFIER ::= '*' | '+' | '?' | '{' (num)? ',' num '}' | '{' num '}' 11 | INNER_EL ::= ch+ | ch '-' ch INNER_EL 12 | SPECIAL ::= '(' | ')' | '+' | '{' | '[' | '|' | '.' | '^' | '$' | ... 13 | -------------------------------------------------------------------------------- /print_coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | (source venv/bin/activate;coverage run --omit 'venv/*,test/*' -m pytest;coverage report "$@";deactivate) 4 | -------------------------------------------------------------------------------- /pyregexp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/pyregexp/__init__.py -------------------------------------------------------------------------------- /pyregexp/engine.py: -------------------------------------------------------------------------------- 1 | """Module containing the RegexEngine class. 2 | 3 | The RegexEngine class implements a regular expressions engine. 4 | 5 | Example: 6 | Matching a regex with some test string:: 7 | 8 | reng = RegexEngine() 9 | result, consumed = reng.match(r"a+bx", "aabx") 10 | """ 11 | 12 | 13 | from collections import deque 14 | from typing import Callable, Deque, Union, Tuple, List 15 | import unicodedata 16 | from .pyrser import Pyrser 17 | from .match import Match 18 | from .re_ast import RE, GroupNode, LeafNode, OrNode, EndElement, StartElement 19 | 20 | 21 | class RegexEngine: 22 | """ Regular Expressions Engine. 23 | 24 | This class contains all the necessary to recognize regular expressions in a test string. 25 | """ 26 | 27 | def __init__(self): 28 | self.parser: Pyrser = Pyrser() 29 | self.prev_re: str = None 30 | self.prev_ast: RE = None 31 | 32 | def match(self, re: str, string: str, return_matches: bool = False, continue_after_match: bool = False, ignore_case: int = 0) -> Union[Tuple[bool, int, List[Deque[Match]]], Tuple[bool, int]]: 33 | """ Searches a regex in a test string. 34 | 35 | Searches the passed regular expression in the passed test string and 36 | returns the result. 37 | 38 | It is possible to customize both the returned value and the search 39 | method. 40 | 41 | The ignore_case flag may cause unexpected results in the returned 42 | number of matched characters, and also in the returned matches, e.g. 43 | when the character ẞ is present in either the regex or the test string. 44 | 45 | Args: 46 | re (str): the regular expression to search 47 | string (str): the test string 48 | return_matches (bool): if True a data structure containing the 49 | matches - the whole match and the subgroups matched 50 | (default is False) 51 | continue_after_match (bool): if True the engine continues 52 | matching until the whole input is consumed 53 | (default is False) 54 | ignore_case (int): when 0 the case is not ignored, when 1 a "soft" 55 | case ignoring is performed, when 2 casefolding is performed. 56 | (default is 0) 57 | 58 | Returns: 59 | A tuple containing whether a match was found or not, the last 60 | matched character index, and, if return_matches is True, a 61 | list of deques of Match, where each list of matches represents 62 | in the first position the whole match, and in the subsequent 63 | positions all the group and subgroups matched. 64 | """ 65 | 66 | def return_fnc(res: bool, consumed: int, all_matches: List[Deque[Match]], return_matches: bool) -> Union[Tuple[bool, int, List[Deque[Match]]], Tuple[bool, int]]: 67 | """ Create the Tuple to return.""" 68 | if return_matches: 69 | return res, consumed, all_matches 70 | else: 71 | return res, consumed 72 | 73 | if ignore_case == 1: 74 | re = unicodedata.normalize("NFKD", re).lower() 75 | string = unicodedata.normalize("NFKD", string).casefold() 76 | elif ignore_case == 2: 77 | re = unicodedata.normalize("NFKD", re).casefold() 78 | string = unicodedata.normalize("NFKD", string).casefold() 79 | 80 | ast = self.parser.parse(re=re) if self.prev_re != re else self.prev_ast 81 | self.prev_re = re 82 | self.prev_ast = ast 83 | 84 | # variables holding the matched groups list for each matched substring in the test string 85 | all_matches: List[Deque[Match]] = [] 86 | highest_matched_idx: int = 0 # holds the highest matched string's index 87 | 88 | res, consumed, matches = self.__match__(ast, string, 0) 89 | if res: 90 | highest_matched_idx = consumed 91 | all_matches.append(matches) 92 | else: 93 | return return_fnc(res, highest_matched_idx, all_matches, return_matches) 94 | 95 | if not continue_after_match or not consumed > 0: 96 | return return_fnc(res, highest_matched_idx, all_matches, return_matches) 97 | 98 | while True: 99 | res, consumed, matches = self.__match__(ast, string, consumed) 100 | 101 | # if consumed is not grater than highest_matched_idx this means the new match 102 | # consumed 0 characters, so there is really nothing more to match 103 | if res and consumed > highest_matched_idx: 104 | highest_matched_idx = consumed 105 | all_matches.append(matches) 106 | else: 107 | return return_fnc(True, highest_matched_idx, all_matches, return_matches) 108 | 109 | def __match__(self, ast: RE, string: str, start_str_i: int) -> Tuple[bool, int, Deque[Match]]: 110 | """ Same as match, but always returns after the first match.""" 111 | matches: Deque[Match] = deque() 112 | 113 | # used to restore the left match of a ornode if necessary 114 | last_match: Match = None 115 | 116 | # str_i represents the matched characters so far. It is inizialized to 117 | # the value of the input parameter start_str_i because the match could 118 | # be to be searched starting at an index different from 0, e.g. in the 119 | # case this function is called to search a second match in the test 120 | # string. 121 | str_i = start_str_i 122 | 123 | # max_matched_idx represents the "upper limit" of the match. 124 | # It is necessary when backtracking in the presence of nested 125 | # quantifiers, because we need a way to "tell" the group that 126 | # is causing the fail by being too greedy to stop earlier if 127 | # possible. 128 | max_matched_idx = -1 129 | 130 | def return_fnc(res: bool, str_i: int) -> Tuple[bool, int, Deque[Match]]: 131 | """ Returns the Tuple to be returned by __match__.""" 132 | nonlocal matches 133 | return res, str_i, matches 134 | 135 | def save_matches(match_group: Callable, ast: Union[RE, GroupNode], string: str, start_idx: int, max_matched_idx=-1) -> Tuple[bool, int]: 136 | """ Save the matches of capturing groups. 137 | 138 | Args: 139 | match_group (Callable): the function to use to match the group 140 | ast (Union[RE, GroupNode]): the group to match 141 | string (str): the string to match 142 | start_idx (int): the starting index 143 | 144 | Returns: 145 | A tuple of the boolean result of the match, and the last matched 146 | index. 147 | """ 148 | nonlocal matches 149 | nonlocal last_match 150 | 151 | res, end_idx = match_group(ast, string, max_matched_idx) 152 | 153 | if ast.is_capturing() and res == True: 154 | for i in range(0, len(matches)): 155 | if matches[i].group_id == ast.group_id: 156 | last_match = matches[i] 157 | matches.remove(matches[i]) 158 | break 159 | matches.appendleft( 160 | Match(ast.group_id, start_idx, end_idx, string, ast.group_name)) 161 | 162 | return res, end_idx 163 | 164 | def remove_leftmost_match(): 165 | """ Used when matching an OrNode. 166 | 167 | When matching an OrNode the right children is always saved instead 168 | of saving the left one when the chosen path goes left. By calling 169 | this function you remove the leftmost match (the one created by the 170 | right child). 171 | """ 172 | nonlocal matches 173 | matches.popleft() 174 | 175 | def appendleft_last_match(): 176 | """ Used when matching an OrNode. 177 | 178 | When matching an OrNode the right children is always saved instead 179 | of saving the left one when the chosen path goes left. By calling 180 | this function you restore the left match. 181 | """ 182 | nonlocal matches 183 | matches.appendleft(last_match) 184 | 185 | 186 | def match_group(ast: Union[RE, GroupNode, OrNode], string: str, max_matched_idx: int = -1) -> Tuple[bool, int]: 187 | """ 188 | Match a group, which is always the case.s 189 | 190 | Returns the match state (True or False) and the new string i, that is the 191 | number of matched characters in the string so far. 192 | """ 193 | nonlocal start_str_i 194 | nonlocal str_i 195 | backtrack_stack: List[Tuple[int, int, int, List[int]]] = [] 196 | 197 | def backtrack(str_i: int, curr_child_i: int, recursive: bool = False) -> Tuple[bool, int, int]: 198 | """ Returns whether it is possible to backtrack and the state to backtrack to. 199 | 200 | Takes as input the current state of the engine and returns whether 201 | or not it is possible to backtrack. 202 | 203 | Args: 204 | str_i (int): the current considered index of the test string 205 | curr_child_i (int): the index of the GroupNode children considered 206 | 207 | Returns: 208 | A Tuple containing a bool, True if it is possible to backtrack, 209 | the new string index, and the new node children index to which 210 | backtrack to. Note that the last two parameters only have a 211 | meaning in the case it is possible to backtrack (the bool is 212 | True). 213 | """ 214 | nonlocal backtrack_stack 215 | nonlocal max_matched_idx 216 | nonlocal ast 217 | 218 | if len(backtrack_stack) == 0: 219 | return False, str_i, curr_child_i 220 | 221 | # the fist step is to pop the last tuple from the backtrack_stack 222 | popped_child_i, min_, matched_times, consumed_list = backtrack_stack.pop() 223 | 224 | if matched_times == min_: 225 | # if a node is already matched the minimum number of times, the 226 | # chance you have to potentially be able to backtrack is to is 227 | # to delete the entry from the stack and then search for a new 228 | # possibility (recursively calling this function). 229 | # But, before the recursion, you have to calculate what the 230 | # string index (str_i) value was before the node was matched 231 | # even once. Thus, you have to decrease the string index 232 | # of each consumption in the consumed_list. 233 | 234 | # calculate_the new str_i 235 | before_str_i = str_i 236 | for consumption in consumed_list: 237 | str_i -= consumption 238 | if max_matched_idx == -1 or isinstance(ast.children[popped_child_i], LeafNode) or before_str_i == str_i: 239 | # recursive call 240 | return backtrack(str_i, popped_child_i, True) 241 | else: 242 | # case of backtracking from nested quantifier 243 | # returns "not recursive" because if it is the case 244 | # of a recursive call, this is outside of the case of 245 | # simply nested quantifiers, and in I cannot backtrack 246 | # anymore 247 | return not recursive, str_i, popped_child_i 248 | else: 249 | # the node was matched more times than its min, so you just 250 | # need to remove the last consumption from the list, 251 | # decrease the str_i by that amount, decrease the times the node 252 | # was matched - matched_times - by 1, and then append the stack 253 | # the tuple with the new matched_times and consumed_list. 254 | last_consumed = consumed_list.pop() 255 | new_str_i = str_i - last_consumed 256 | if max_matched_idx == -1 or isinstance(ast.children[popped_child_i], LeafNode): 257 | backtrack_stack.append( 258 | (popped_child_i, min_, matched_times - 1, consumed_list)) 259 | # lastly, you return that the backtracking is possible, and 260 | # the state to which backtrack to. 261 | return True, new_str_i, curr_child_i 262 | else: 263 | # case of backtracking from nested quantifier 264 | return not recursive, new_str_i, popped_child_i 265 | 266 | def remove_this_node_from_stack(curr_child_i: int, str_i: int) -> int: 267 | """ Removes node from stack and returns the new str_i. 268 | """ 269 | nonlocal backtrack_stack 270 | popped_child_i, min_, matched_times, consumed_list = backtrack_stack.pop() 271 | if popped_child_i == curr_child_i: 272 | for consumption in consumed_list: 273 | str_i -= consumption 274 | else: 275 | backtrack_stack.append((popped_child_i, min_, matched_times, consumed_list)) 276 | return str_i 277 | 278 | curr_node = ast.children[0] if len(ast.children) > 0 else None 279 | i = 0 # the children i'm iterating, not to confuse with str_i 280 | 281 | if isinstance(ast, OrNode): 282 | # matcha il primo, se matcha return true 283 | # se no matcha il secondo 284 | # se matcha return true, altrimenti false 285 | tmp_str_i = str_i 286 | res, new_str_i = save_matches( 287 | match_group, curr_node, string, str_i, max_matched_idx) if not isinstance(curr_node, OrNode) else match_group(curr_node, string, max_matched_idx) 288 | if not res: 289 | str_i = tmp_str_i 290 | curr_node = ast.right 291 | res, new_str_i = save_matches( 292 | match_group, curr_node, string, str_i, max_matched_idx) if not isinstance(curr_node, OrNode) else match_group(curr_node, string, max_matched_idx) 293 | str_i = new_str_i 294 | return res, str_i 295 | 296 | # the passed ast can't be a Leaf 297 | while i < len(ast.children): 298 | curr_node = ast.children[i] 299 | 300 | # if is OrNode I evaluate the sub-groups with a recursive call 301 | if isinstance(curr_node, OrNode): 302 | before_str_i = str_i 303 | min_, max_ = curr_node.min, curr_node.max 304 | j = 0 305 | consumed_list = [] 306 | 307 | backtracking = False 308 | while j < max_: 309 | tmp_str_i = str_i 310 | 311 | save_match_left = isinstance(curr_node.left, GroupNode) 312 | res_left, str_i_left = save_matches(match_group, curr_node.left, string, str_i, max_matched_idx) if save_match_left else match_group(curr_node.left, string, max_matched_idx) 313 | 314 | str_i = tmp_str_i 315 | 316 | save_match_right = isinstance(curr_node.right, GroupNode) 317 | res_right, str_i_right = save_matches(match_group, curr_node.right, string, str_i, max_matched_idx) if save_match_right else match_group(curr_node.right, string, max_matched_idx) 318 | 319 | if res_left and res_right: 320 | # choose the one that consumed the most character 321 | # unless it exceeds the max_matched_idx 322 | chose_left = (str_i_left >= str_i_right) 323 | str_i = str_i_left if chose_left else str_i_right 324 | if max_matched_idx != -1 and str_i > max_matched_idx: 325 | # tries to stay below the max_matched_idx threshold 326 | str_i = str_i_right if chose_left else str_i_left 327 | if chose_left: 328 | if save_match_right: 329 | remove_leftmost_match() 330 | if save_match_left: 331 | appendleft_last_match() 332 | else: 333 | # chose right 334 | if save_match_left and not save_match_right: 335 | # there is a spurious match originated from 336 | # the left child 337 | remove_leftmost_match() 338 | 339 | elif res_left and not res_right: 340 | str_i = str_i_left 341 | elif not res_left and res_right: 342 | str_i = str_i_right 343 | 344 | res = (res_left or res_right) 345 | 346 | if res == True and (max_matched_idx == -1 or str_i <= max_matched_idx): 347 | if (str_i - tmp_str_i == 0) and j >= min_: 348 | max_matched_idx = -1 349 | break 350 | consumed_list.append(str_i - tmp_str_i) 351 | else: 352 | if min_ <= j: 353 | max_matched_idx = -1 354 | break 355 | if i > 0 and not isinstance(ast.children[i-1], LeafNode): 356 | str_i = remove_this_node_from_stack(i, str_i) 357 | if str_i == start_str_i: 358 | return False, str_i 359 | max_matched_idx = str_i - 1 if max_matched_idx == -1 else max_matched_idx - 1 360 | can_bt, bt_str_i, bt_i = backtrack(str_i, i) 361 | if can_bt: 362 | i = bt_i 363 | str_i = bt_str_i 364 | backtracking = True 365 | break # retry to match the current node 366 | else: 367 | return False, str_i 368 | j += 1 369 | if not backtracking: 370 | backtrack_stack.append( 371 | (i, min_, j, consumed_list)) 372 | max_matched_idx = -1 373 | i += 1 374 | continue 375 | 376 | elif isinstance(curr_node, GroupNode): 377 | min_, max_ = curr_node.min, curr_node.max 378 | j = 0 379 | consumed_list = [] 380 | before_str_i = str_i 381 | 382 | backtracking = False 383 | while j < max_: 384 | tmp_str_i = str_i 385 | 386 | res, new_str_i = save_matches( 387 | match_group, curr_node, string, str_i, max_matched_idx) 388 | if res == True and (max_matched_idx == -1 or new_str_i <= max_matched_idx): 389 | # i must use tmp_str_i because str_i is changed by the match_group 390 | # call, so (new_str_i - str_i) would be always 0 391 | if (new_str_i - tmp_str_i == 0) and j >= min_: 392 | max_matched_idx = -1 393 | break 394 | consumed_list.append(new_str_i - tmp_str_i) 395 | #str_i = new_str_i 396 | else: 397 | if min_ <= j: 398 | # i did the bare minimum or more 399 | max_matched_idx = -1 400 | break 401 | if i > 0 and not isinstance(ast.children[i-1], LeafNode): 402 | str_i = remove_this_node_from_stack(i, str_i) 403 | if str_i == start_str_i: 404 | return False, str_i 405 | max_matched_idx = str_i - 1 if max_matched_idx == -1 else max_matched_idx - 1 406 | can_bt, bt_str_i, bt_i = backtrack(str_i, i) 407 | if can_bt: 408 | i = bt_i 409 | str_i = bt_str_i 410 | backtracking = True 411 | break # retry to match the current node 412 | else: 413 | return False, str_i 414 | j += 1 415 | 416 | # if NOT backtracking iterate the next element, and put the 417 | # current on the backtrack_stack, otherwise don't increment i, don't put on the 418 | # stack so to retry the current one (just continue) 419 | if not backtracking: 420 | backtrack_stack.append( 421 | (i, min_, j, consumed_list)) 422 | max_matched_idx = -1 423 | i += 1 424 | 425 | continue 426 | 427 | elif isinstance(curr_node, LeafNode): 428 | # it is a LeafNode obviously now 429 | min_, max_ = curr_node.min, curr_node.max 430 | j = 0 431 | 432 | consumed_list = [] 433 | 434 | before_str_i = str_i # to discard changes made in case i need to bt 435 | 436 | backtracking = False 437 | while j < max_: 438 | if str_i < len(string): # i still have input to match 439 | if curr_node.is_match(ch=string[str_i], str_i=str_i, str_len=len(string)) and (max_matched_idx == -1 or str_i < max_matched_idx): 440 | if not (isinstance(curr_node, StartElement) or isinstance(curr_node, EndElement)): 441 | consumed_list.append(1) 442 | str_i += 1 443 | else: 444 | if min_ <= j: # I already met the minimum requirement for match 445 | break 446 | if i > 0 and not isinstance(ast.children[i-1], LeafNode): 447 | str_i = remove_this_node_from_stack(i, str_i) 448 | if str_i == start_str_i: 449 | return False, str_i 450 | max_matched_idx = str_i - 1 451 | can_bt, bt_str_i, bt_i = backtrack( 452 | before_str_i, i) 453 | if can_bt: 454 | i = bt_i 455 | str_i = bt_str_i 456 | backtracking = True 457 | break 458 | else: 459 | return False, str_i 460 | else: # finished input 461 | if isinstance(curr_node, StartElement) or isinstance(curr_node, EndElement) and curr_node.is_match(str_i=str_i, str_len=len(string)): 462 | pass 463 | # finished input w/o finishing the regex tree 464 | elif min_ <= j: 465 | break 466 | else: 467 | # i have more states, but the input is finished 468 | can_bt, bt_str_i, bt_i = backtrack( 469 | before_str_i, i) 470 | if can_bt: 471 | i = bt_i 472 | str_i = bt_str_i 473 | backtracking = True 474 | break 475 | else: 476 | return False, str_i 477 | j += 1 478 | if not backtracking: 479 | backtrack_stack.append( 480 | (i, min_, j, consumed_list)) 481 | i += 1 482 | continue 483 | else: 484 | return False, str_i 485 | 486 | return True, str_i 487 | 488 | i = str_i 489 | 490 | if len(string) == 0: 491 | res, consumed = save_matches( 492 | match_group=match_group, ast=ast, string=string, start_idx=str_i) 493 | return return_fnc(res, consumed) 494 | 495 | while str_i < len(string): 496 | res, _ = save_matches(match_group=match_group, 497 | ast=ast, string=string, start_idx=str_i) 498 | i += 1 499 | if res: 500 | return return_fnc(True, str_i) 501 | else: 502 | matches = deque() 503 | str_i = i 504 | return return_fnc(False, str_i) 505 | -------------------------------------------------------------------------------- /pyregexp/lexer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from .tokens import * 3 | 4 | 5 | class Lexer: 6 | """ Lexer for the pyregexp library. 7 | 8 | This class contains the method to scan a regular expression string producing the corresponding tokens. 9 | """ 10 | 11 | def __init__(self) -> None: 12 | self.__digits__ = '0123456789' 13 | 14 | def __is_digit__(self, ch: str) -> bool: 15 | return self.__digits__.find(ch) > -1 16 | 17 | def scan(self, re: str) -> List[Token]: 18 | """ Regular expressions scanner. 19 | 20 | Scans the regular expression in input and produces the list of recognized Tokens in output. 21 | It raises an Exception if there are errors in the regular expression. 22 | 23 | Args: 24 | re (str): the regular expression to scan 25 | 26 | Returns: 27 | List[Token]: the list of tokens recognized in the passed regex 28 | """ 29 | tokens = [] 30 | 31 | def append(elem: Token) -> None: 32 | nonlocal tokens 33 | tokens.append(elem) 34 | 35 | i = 0 36 | escape_found = False 37 | while i < len(re): 38 | ch = re[i] 39 | if escape_found: 40 | if ch == 't': 41 | append(ElementToken(char='\t')) 42 | if ch == 's': 43 | # \s matches a space character 44 | append(SpaceToken(char=ch)) 45 | else: 46 | append(ElementToken(char=ch)) 47 | elif ch == '\\': 48 | escape_found = True 49 | i += 1 # otherwise i won't be incremented bc of continue 50 | continue 51 | elif ch == '.': 52 | append(Wildcard()) 53 | elif ch == '(': 54 | append(LeftParenthesis()) 55 | elif ch == ')': 56 | append(RightParenthesis()) 57 | elif ch == '[': 58 | append(LeftBracket()) 59 | elif ch == '-': 60 | append(Dash()) 61 | elif ch == ']': 62 | append(RightBracket()) 63 | elif ch == '{': 64 | append(LeftCurlyBrace()) 65 | i += 1 66 | while i < len(re): 67 | ch = re[i] 68 | if ch == ',': 69 | append(Comma()) 70 | elif self.__is_digit__(ch): 71 | append(ElementToken(char=ch)) 72 | elif ch == '}': 73 | append(RightCurlyBrace()) 74 | break 75 | else: 76 | raise Exception("Bad token at index ${}.".format(i)) 77 | i += 1 78 | elif ch == '^': 79 | if i == 0: 80 | append(Start()) 81 | else: 82 | append(Circumflex()) 83 | elif ch == '$': 84 | append(End()) 85 | elif ch == '?': 86 | append(QuestionMark()) 87 | elif ch == '*': 88 | append(Asterisk()) 89 | elif ch == '+': 90 | append(Plus()) 91 | elif ch == '|': 92 | append(VerticalBar()) 93 | elif ch == '}': 94 | append(RightCurlyBrace()) 95 | else: 96 | append(ElementToken(char=ch)) 97 | 98 | escape_found = False 99 | i += 1 100 | 101 | return tokens 102 | -------------------------------------------------------------------------------- /pyregexp/match.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Match: 4 | """ Contains the information of a match in a regular expression.""" 5 | 6 | def __init__(self, group_id: int, start_idx: int, end_idx: int, string: str, name: str) -> None: 7 | self.group_id: int = group_id 8 | self.name: str = name 9 | self.start_idx: int = start_idx 10 | self.end_idx: int = end_idx 11 | self.match: str = string[start_idx:end_idx] 12 | -------------------------------------------------------------------------------- /pyregexp/pyrser.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Callable 2 | import itertools 3 | import math 4 | from .lexer import Lexer 5 | from .tokens import * 6 | from .re_ast import * 7 | 8 | 9 | class Pyrser: 10 | """ Regular Expression Parser. 11 | 12 | Pyrser instances can parse regular expressions and return the corresponding AST. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | self.lxr: Lexer = Lexer() 17 | 18 | def parse(self, re: str) -> RE: 19 | """ Parses a regular expression. 20 | 21 | Parses a regex and returns the corresponding AST. 22 | If the regex contains errors raises an Exception. 23 | 24 | Args: 25 | re (str): a regular expression 26 | 27 | Returns: 28 | RE: the root node of the regular expression's AST 29 | """ 30 | 31 | def get_range_str(start: str, end: str) -> str: 32 | result = '' 33 | i = ord(start) 34 | while i <= ord(end): 35 | result += chr(i) 36 | i += 1 37 | return result 38 | 39 | def next_tkn_initializer(re: str) -> Callable[[bool], Union[Token, None]]: 40 | tokens = self.lxr.scan(re=re) 41 | 42 | i = -1 43 | 44 | def next_tkn(without_consuming: bool = False) -> Union[Token, None]: 45 | nonlocal i 46 | nonlocal tokens 47 | nonlocal curr_tkn 48 | 49 | if without_consuming: 50 | return tokens[i+1] if len(tokens) > i+1 else None 51 | 52 | i += 1 53 | if i < len(tokens): 54 | curr_tkn = tokens[i] 55 | else: 56 | curr_tkn = None 57 | 58 | return next_tkn 59 | 60 | def parse_re() -> RE: 61 | return RE(parse_re_seq()) 62 | 63 | def parse_re_seq(capturing: bool = True, group_name: str = None, group_id: int = None) -> Union[OrNode, GroupNode]: 64 | match_start, match_end = False, False 65 | if type(curr_tkn) is Start or type(curr_tkn) is Circumflex: 66 | next_tkn() 67 | match_start = True 68 | 69 | node = parse_group(capturing=capturing, 70 | group_name=group_name, group_id=group_id) 71 | 72 | if isinstance(curr_tkn, EndToken): 73 | next_tkn() 74 | match_end = True 75 | else: 76 | match_end = False 77 | 78 | if match_start: 79 | node.children.appendleft(StartElement()) 80 | if match_end: 81 | node.children.append(EndElement()) 82 | 83 | if isinstance(curr_tkn, OrToken): 84 | next_tkn() 85 | node = OrNode(left=node, right=parse_re_seq( 86 | group_name=node.group_name, group_id=node.group_id)) 87 | 88 | return node 89 | 90 | def parse_group(capturing: bool = True, group_name: str = None, group_id: int = None) -> GroupNode: 91 | nonlocal groups_counter 92 | if group_id is None: 93 | group_id = next(groups_counter) 94 | 95 | elements = deque() # holds the children of the GroupNode 96 | 97 | while curr_tkn is not None and not isinstance(curr_tkn, OrToken) and \ 98 | not isinstance(curr_tkn, RightParenthesis) and \ 99 | not isinstance(curr_tkn, EndToken): 100 | new_el = parse_range_el() 101 | 102 | next_tkn() 103 | 104 | if isinstance(curr_tkn, EndToken): 105 | elements.append(new_el) 106 | break 107 | 108 | if isinstance(curr_tkn, Quantifier): 109 | if isinstance(curr_tkn, ZeroOrOne): 110 | new_el.min, new_el.max = 0, 1 111 | elif isinstance(curr_tkn, ZeroOrMore): 112 | new_el.min, new_el.max = 0, math.inf 113 | else: 114 | # suppose it's 1+ 115 | new_el.min, new_el.max = 1, math.inf 116 | next_tkn() 117 | elif isinstance(curr_tkn, LeftCurlyBrace): 118 | parse_curly(new_el) 119 | 120 | elements.append(new_el) 121 | 122 | return GroupNode(children=elements, capturing=capturing, group_name=group_name, group_id=group_id) 123 | 124 | def parse_curly(new_el: ASTNode) -> None: 125 | # move past the left brace 126 | next_tkn() 127 | 128 | # find val_1, val_2 129 | val_1, val_2 = '', '' 130 | try: 131 | while isinstance(curr_tkn, ElementToken): 132 | val_1 += curr_tkn.char 133 | next_tkn() 134 | if val_1 == '': 135 | val_1 == 0 136 | else: 137 | val_1 = int(val_1) 138 | 139 | if isinstance(curr_tkn, RightCurlyBrace): 140 | # case {exact} 141 | if type(val_1) is int: 142 | new_el.min, new_el.max = val_1, val_1 143 | next_tkn() # skip the closing brace 144 | return 145 | else: 146 | raise Exception("Invalid curly brace syntax.") 147 | 148 | next_tkn() 149 | while isinstance(curr_tkn, ElementToken): 150 | val_2 += curr_tkn.char 151 | next_tkn() 152 | if val_2 == '': 153 | val_2 == math.inf 154 | else: 155 | val_2 = int(val_2) 156 | 157 | next_tkn() # skip the closing brace 158 | 159 | new_el.min = val_1 if type(val_1) is int else 0 160 | new_el.max = val_2 if type(val_2) is int else math.inf 161 | 162 | except Exception as e: 163 | raise Exception("Invalid curly brace syntax.") 164 | 165 | def parse_range_el() -> ASTNode: 166 | if isinstance(curr_tkn, LeftBracket): 167 | next_tkn() 168 | element = parse_inner_el() 169 | if isinstance(curr_tkn, RightBracket): 170 | return element 171 | else: 172 | raise Exception( 173 | "Missing closing ']'.") 174 | else: 175 | return parse_el() 176 | 177 | def parse_inner_el() -> RangeElement: 178 | # parse_inner_el creates a single RangeElement with all the matches 179 | nonlocal curr_tkn 180 | match_str = '' 181 | if curr_tkn is None: 182 | raise Exception( 183 | "Missing closing ']'.") 184 | 185 | positive_logic = True 186 | if isinstance(curr_tkn, NotToken): 187 | positive_logic = False 188 | next_tkn() 189 | 190 | prev_char = None 191 | while curr_tkn is not None: 192 | if isinstance(curr_tkn, RightBracket): 193 | break 194 | 195 | if isinstance(curr_tkn, SpaceToken): 196 | match_str += curr_tkn.char 197 | next_tkn() 198 | continue 199 | 200 | # every character inside it must be treated as an element 201 | if not isinstance(curr_tkn, ElementToken): 202 | curr_tkn = ElementToken(char=curr_tkn.char) 203 | 204 | if next_tkn(without_consuming=True) is None: 205 | raise Exception("Missing closing ']'.") 206 | elif isinstance(next_tkn(without_consuming=True), Dash): 207 | # it may be a range (like a-z, A-M, 0-9, ...) 208 | prev_char = curr_tkn.char 209 | next_tkn() # current token is now the Dash 210 | if isinstance(next_tkn(without_consuming=True), RightBracket) or isinstance(next_tkn(without_consuming=True), SpaceToken): 211 | # we're in one of these scenarios: "-]" "-\s" 212 | # the dash and previous character must be interpreted as single elements 213 | match_str += prev_char + curr_tkn.char 214 | else: 215 | # we're in the case of an actual range (or next_tkn is none) 216 | next_tkn() # curr_tkn is now the one after the dash 217 | if next_tkn is None: 218 | raise Exception("Missing closing ']'.") 219 | elif ord(prev_char) > ord(curr_tkn.char): 220 | raise Exception( 221 | f"Range values reversed. Start '{prev_char}' char code is greater than end '{curr_tkn.char}' char code.") 222 | else: 223 | match_str += get_range_str(prev_char, 224 | curr_tkn.char) 225 | else: 226 | # no range, no missing ']', just a char to add to match_str 227 | match_str += curr_tkn.char 228 | next_tkn() 229 | 230 | return RangeElement(match_str="".join(sorted(set(match_str))), is_positive_logic=positive_logic) 231 | 232 | def parse_el() -> Union[Element, OrNode, GroupNode]: 233 | group_name: Union[str, None] 234 | group_name = None 235 | if isinstance(curr_tkn, ElementToken): 236 | return Element(match_ch=curr_tkn.char) 237 | elif isinstance(curr_tkn, Wildcard): 238 | return WildcardElement() 239 | elif isinstance(curr_tkn, SpaceToken): 240 | return SpaceElement() 241 | elif isinstance(curr_tkn, LeftParenthesis): 242 | next_tkn() 243 | # (?: for non-capturing group 244 | capturing = True 245 | if type(curr_tkn) is QuestionMark: 246 | next_tkn() 247 | if curr_tkn.char == ':': 248 | capturing = False 249 | next_tkn() 250 | elif curr_tkn.char == '<': 251 | next_tkn() 252 | group_name = parse_group_name() 253 | else: 254 | if curr_tkn is None: 255 | raise Exception("Unterminated group.") 256 | else: 257 | raise Exception( 258 | f"Invalid group: '{{?{curr_tkn.char}'.") 259 | res = parse_re_seq(capturing=capturing, group_name=group_name) 260 | if isinstance(curr_tkn, RightParenthesis): 261 | # next_tkn() not needed (parse_group's while loop will eat the parenthesis) 262 | return res 263 | else: 264 | raise Exception("Missing closing group parenthesis ')'.") 265 | else: 266 | raise Exception( 267 | "Unescaped special character {}.".format(curr_tkn.char)) 268 | 269 | def parse_group_name() -> str: 270 | if curr_tkn is None: 271 | raise Exception("Unterminated named group name.") 272 | group_name = '' 273 | while curr_tkn.char != '>': 274 | group_name += curr_tkn.char 275 | next_tkn() 276 | if curr_tkn is None: 277 | raise Exception("Unterminated named group name.") 278 | if len(group_name) == 0: 279 | raise Exception("Unexpected empty named group name.") 280 | next_tkn() # consumes '>' 281 | return group_name 282 | 283 | groups_counter = itertools.count(start=0) 284 | 285 | curr_tkn = None 286 | next_tkn = next_tkn_initializer(re) 287 | next_tkn() 288 | 289 | ast = parse_re() 290 | if curr_tkn is not None: 291 | raise Exception( 292 | "Unable to parse the regex.") 293 | return ast 294 | -------------------------------------------------------------------------------- /pyregexp/re_ast.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | from typing import Deque, List, Union 3 | 4 | 5 | class ASTNode: 6 | """ AST nodes base class. 7 | 8 | Abstract Syntax Tree classes hierarchy base class. 9 | """ 10 | 11 | def __init__(self) -> None: 12 | pass 13 | 14 | 15 | class RE(ASTNode): 16 | """ Entry point of the AST. 17 | 18 | This class acts as the entry point for a regular expression's AST. 19 | """ 20 | 21 | def __init__(self, child: ASTNode, capturing: bool = False, group_name: str = "RegEx") -> None: 22 | super().__init__() 23 | self.__capturing__: bool = capturing 24 | self.group_name: str = group_name 25 | self.group_id: int = -1 26 | self.child: Union[GroupNode, OrNode] = child 27 | self.children: List[Union[GroupNode, OrNode]] = deque([child]) 28 | 29 | def is_capturing(self) -> bool: 30 | return self.__capturing__ 31 | 32 | 33 | class LeafNode(ASTNode): 34 | """ AST class defining the leaf nodes. 35 | 36 | Every leaf node inherits from this class. 37 | """ 38 | 39 | def __init__(self) -> None: 40 | super().__init__() 41 | 42 | def is_match(self, ch: str = None, str_i: int = None, str_len: int = None) -> bool: 43 | """ 44 | Returns whether the passed inputs matches with the node. 45 | 46 | For example, if the node matches the character "a" and the passed ch is 47 | "b" the method will return False, but if the passed ch was "a" then the 48 | result would have been True. 49 | 50 | Args: 51 | ch (str): the char you want to match 52 | str_i (int): the string index you are considering 53 | str_len (int): the test string length 54 | 55 | Returns: 56 | bool: represents whether there is a match between the node and the 57 | passed parameters or not. 58 | """ 59 | return False 60 | 61 | 62 | class Element(LeafNode): 63 | """ AST Element. 64 | 65 | Specialization of the LeafNode class. This class models the elements of a regex. 66 | """ 67 | 68 | def __init__(self, match_ch: str = None) -> None: 69 | super().__init__() 70 | self.match: str = match_ch 71 | self.min: Union[int, float] = 1 72 | self.max: Union[int, float] = 1 73 | 74 | def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool: 75 | return self.match == ch 76 | 77 | 78 | class WildcardElement(Element): 79 | """ AST WildcardElement. 80 | 81 | Specialization of the Element class to model the wildcard behavior. 82 | """ 83 | 84 | def __init__(self) -> None: 85 | super().__init__(match_ch='anything') 86 | self.match = None 87 | 88 | def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool: 89 | return ch != '\n' 90 | 91 | 92 | class SpaceElement(Element): 93 | """ AST SpaceElement. 94 | 95 | Specialization of the element class to model the match-space behavior. 96 | """ 97 | 98 | def __init__(self) -> None: 99 | super().__init__() 100 | self.match = None 101 | 102 | def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool: 103 | return ch.isspace() and len(ch) == 1 104 | 105 | 106 | class RangeElement(LeafNode): 107 | """ AST RangeElement. 108 | 109 | Specialization of the LeafNode class modeling the range-element behavior, 110 | that is that it matches with more than one character. 111 | """ 112 | 113 | def __init__(self, match_str: str, is_positive_logic: bool = True) -> None: 114 | super().__init__() 115 | self.match: str = match_str 116 | self.min: Union[int, float] = 1 117 | self.max: Union[int, float] = 1 118 | self.is_positive_logic: bool = is_positive_logic 119 | 120 | def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool: 121 | # XNOR of whether the ch is found and the logic (positive/negative) 122 | return not((ch in self.match) ^ self.is_positive_logic) 123 | 124 | 125 | class StartElement(LeafNode): 126 | """ AST StartElement. 127 | 128 | Inherits from LeafNode and models the match-start-element behavior. 129 | """ 130 | 131 | def __init__(self) -> None: 132 | super().__init__() 133 | self.match = None 134 | self.min: Union[int, float] = 1 135 | self.max: Union[int, float] = 1 136 | 137 | def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool: 138 | return str_i == 0 139 | 140 | 141 | class EndElement(LeafNode): 142 | """ AST EndElement. 143 | 144 | Inherits from LeafNode and models the match-end-element behavior. 145 | """ 146 | 147 | def __init__(self) -> None: 148 | super().__init__() 149 | self.match = '' 150 | self.min: Union[int, float] = 1 151 | self.max: Union[int, float] = 1 152 | 153 | def is_match(self, ch: str = None, str_i: int = 0, str_len: int = 0) -> bool: 154 | return str_i == str_len 155 | 156 | 157 | class OrNode(ASTNode): 158 | """ AST OrNode. 159 | 160 | Inherits from ASTNode and models the or-nodes, that is the nodes that 161 | divides the regex into two possible matching paths. 162 | """ 163 | 164 | def __init__(self, left: ASTNode, right: ASTNode) -> None: 165 | super().__init__() 166 | self.left: ASTNode = left 167 | self.right: ASTNode = right 168 | self.children: List[ASTNode] = [left, right] 169 | self.min: Union[int, float] = 1 170 | self.max: Union[int, float] = 1 171 | 172 | 173 | # unused 174 | class NotNode(ASTNode): 175 | """ AST NotNode. 176 | 177 | Inherits from ASTNode and models the not-node behavior. 178 | """ 179 | 180 | def __init__(self, child: ASTNode) -> None: 181 | super().__init__() 182 | self.child: ASTNode = child 183 | self.children: Deque[ASTNode] = deque([child]) 184 | 185 | 186 | class GroupNode(ASTNode): 187 | """ AST GroupNode. 188 | 189 | Inherits from ASTNode and models the group in a regex. 190 | """ 191 | 192 | def __init__(self, children: Deque[ASTNode], capturing: bool = False, group_name: str = None, group_id: int = -1) -> None: 193 | super().__init__() 194 | self.__capturing__: bool = capturing 195 | self.group_id: int = group_id 196 | self.group_name: str = group_name if group_name is not None else "Group " + \ 197 | str(self.group_id) 198 | self.children: Deque[ASTNode] = children 199 | self.min: Union[int, float] = 1 200 | self.max: Union[int, float] = 1 201 | 202 | def is_capturing(self) -> bool: 203 | """ Returns whether the GroupNode is capturing. 204 | 205 | Returns: 206 | bool: True if the group is capturing, False otherwise 207 | """ 208 | return self.__capturing__ 209 | -------------------------------------------------------------------------------- /pyregexp/tokens.py: -------------------------------------------------------------------------------- 1 | import string 2 | from typing import Literal 3 | 4 | 5 | class Token: 6 | """ Token base class.""" 7 | 8 | def __init__(self) -> None: 9 | self.char: str = '' 10 | pass 11 | 12 | 13 | class ElementToken(Token): 14 | """ Token that are not associated to special meaning.""" 15 | 16 | def __init__(self, char: str): 17 | super().__init__() 18 | self.char: str = char 19 | 20 | 21 | class WildcardToken(Token): 22 | """ Token of a wildcard.""" 23 | 24 | def __init__(self, char: str): 25 | super().__init__() 26 | self.char: str = char 27 | 28 | 29 | class SpaceToken(Token): 30 | """ Token of a space.""" 31 | 32 | def __init__(self, char: str) -> None: 33 | super().__init__() 34 | self.char: str = string.whitespace 35 | 36 | 37 | class Wildcard(WildcardToken): 38 | """ Token using '.' as wildcard.""" 39 | 40 | def __init__(self): 41 | super().__init__(char='.') 42 | 43 | 44 | class StartToken(Token): 45 | """ Token of match start.""" 46 | 47 | def __init__(self, char: str): 48 | super().__init__() 49 | self.char: str = char 50 | 51 | 52 | class Start(StartToken): 53 | """ Token using '^' to match start.""" 54 | 55 | def __init__(self): 56 | super().__init__(char='^') 57 | 58 | 59 | class EndToken(Token): 60 | """ Token of match end.""" 61 | 62 | def __init__(self, char: str): 63 | super().__init__() 64 | self.char: str = char 65 | 66 | 67 | class End(EndToken): 68 | """ Token using '$' to match end.""" 69 | 70 | def __init__(self): 71 | super().__init__(char='$') 72 | 73 | 74 | class Escape(Token): 75 | """ Token of the escape character.""" 76 | 77 | def __init__(self): 78 | super().__init__() 79 | self.char = '\\' 80 | 81 | 82 | class Comma(Token): 83 | """ Token of a comma.""" 84 | 85 | def __init__(self): 86 | super().__init__() 87 | self.char = ',' 88 | 89 | 90 | class Parenthesis(Token): 91 | """ Token of a parenthesis.""" 92 | 93 | def __init__(self): 94 | super().__init__() 95 | 96 | 97 | class LeftParenthesis(Parenthesis): 98 | """ Left parenthesis token.""" 99 | 100 | def __init__(self): 101 | super().__init__() 102 | self.char = '(' 103 | 104 | 105 | class RightParenthesis(Parenthesis): 106 | """ Right parenthesis token.""" 107 | 108 | def __init__(self): 109 | super().__init__() 110 | self.char = ')' 111 | 112 | 113 | class CurlyBrace(Token): 114 | """ Curly brace token.""" 115 | 116 | def __init__(self): 117 | super().__init__() 118 | 119 | 120 | class LeftCurlyBrace(CurlyBrace): 121 | """ Left curly brace token.""" 122 | 123 | def __init__(self): 124 | super().__init__() 125 | self.char = '{' 126 | 127 | 128 | class RightCurlyBrace(CurlyBrace): 129 | """ Right curly brace token.""" 130 | 131 | def __init__(self): 132 | super().__init__() 133 | self.char = '}' 134 | 135 | 136 | class Bracket(Token): 137 | """ Brackets token.""" 138 | 139 | def __init__(self): 140 | super().__init__() 141 | 142 | 143 | class LeftBracket(Bracket): 144 | """ Left bracke token.""" 145 | 146 | def __init__(self): 147 | super().__init__() 148 | self.char = '[' 149 | 150 | 151 | class RightBracket(Bracket): 152 | """ Right bracket token.""" 153 | 154 | def __init__(self): 155 | super().__init__() 156 | self.char = ']' 157 | 158 | 159 | class Quantifier(Token): 160 | """ Quantifier token.""" 161 | 162 | def __init__(self, char: str): 163 | super().__init__() 164 | self.char: str = char 165 | 166 | 167 | class ZeroOrMore(Quantifier): 168 | """ Quantifier 'zero or more' token.""" 169 | 170 | def __init__(self, char: str): 171 | super().__init__(char=char) 172 | 173 | 174 | class OneOrMore(Quantifier): 175 | """ Quantifier 'one or more' token.""" 176 | 177 | def __init__(self, char: str): 178 | super().__init__(char=char) 179 | 180 | 181 | class ZeroOrOne(Quantifier): 182 | """ Quantifier 'zero or one' token.""" 183 | 184 | def __init__(self, char: str): 185 | super().__init__(char=char) 186 | 187 | 188 | class Asterisk(ZeroOrMore): 189 | """ Quantifier 'zero or more' token using character '*'.""" 190 | 191 | def __init__(self): 192 | super().__init__(char='*') 193 | 194 | 195 | class Plus(OneOrMore): 196 | """ Quantifier 'one or more' token using character '+'.""" 197 | 198 | def __init__(self): 199 | super().__init__(char='+') 200 | 201 | 202 | class QuestionMark(ZeroOrOne): 203 | """ Quantifier 'zero or one' token using character '?'.""" 204 | 205 | def __init__(self): 206 | super().__init__(char='?') 207 | 208 | 209 | class OrToken(Token): 210 | """ Token of the or.""" 211 | 212 | def __init__(self, char: str): 213 | super().__init__() 214 | self.char: str = char 215 | 216 | 217 | class VerticalBar(OrToken): 218 | """ Token of the or using '|'.""" 219 | 220 | def __init__(self): 221 | super().__init__(char='|') 222 | 223 | 224 | class NotToken(Token): 225 | """ Token of the negation.""" 226 | 227 | def __init__(self, char: str): 228 | super().__init__() 229 | self.char: str = char 230 | 231 | 232 | class Circumflex(NotToken): 233 | """ Token of the negation using '^'.""" 234 | 235 | def __init__(self): 236 | super().__init__(char='^') 237 | 238 | 239 | class Dash(Token): 240 | """ Token of the dash '-'.""" 241 | 242 | def __init__(self): 243 | super().__init__() 244 | self.char = '-' 245 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | minversion = 6.0 4 | testpaths = 5 | test 6 | -------------------------------------------------------------------------------- /regex.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from time import perf_counter_ns 3 | from pyregexp.engine import RegexEngine 4 | 5 | 6 | def usage(): 7 | print("usage: {} regex test_string1 [test_string2 ...]".format( 8 | sys.argv[0])) 9 | pass 10 | 11 | 12 | reng = RegexEngine() 13 | 14 | if __name__ == "__main__": 15 | if len(sys.argv) == 2 and sys.argv[1] == '--usage' or sys.argv[1] == '--help' or sys.argv[1] == '-u' or sys.argv[1] == '-h': 16 | usage() 17 | exit(0) 18 | else: 19 | if len(sys.argv) < 3: 20 | print("Missing arguments.") 21 | usage() 22 | exit(-1) 23 | 24 | regex = sys.argv[1] 25 | print("Regular expression: '{}'".format(regex)) 26 | 27 | i = 2 28 | while i < len(sys.argv): 29 | test_str = sys.argv[i] 30 | start_time = perf_counter_ns() 31 | res, _ = reng.match(regex, test_str) 32 | stop_time = perf_counter_ns() 33 | print(f'Execution time: {stop_time - start_time} ns.') 34 | print_string = f"'{test_str}' match with the regex" if res == True else f"'{test_str}' doesn't match the given regex" 35 | print(print_string) 36 | i += 1 37 | -------------------------------------------------------------------------------- /regex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | source venv/bin/activate 3 | 4 | python3 regex.py "$@" 5 | 6 | deactivate 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==0.7.12 2 | astroid==2.9.3 3 | attrs==20.3.0 4 | autopep8==1.6.0 5 | Babel==2.9.1 6 | beautifulsoup4==4.10.0 7 | certifi==2021.10.8 8 | charset-normalizer==2.0.12 9 | coverage==5.5 10 | docutils==0.17.1 11 | furo==2022.3.4 12 | idna==3.3 13 | imagesize==1.3.0 14 | importlib-metadata==4.11.3 15 | iniconfig==1.1.1 16 | isort==5.10.1 17 | Jinja2==3.0.3 18 | lazy-object-proxy==1.7.1 19 | MarkupSafe==2.1.1 20 | mccabe==0.6.1 21 | packaging==20.9 22 | platformdirs==2.5.1 23 | pluggy==0.13.1 24 | py==1.10.0 25 | pycodestyle==2.8.0 26 | Pygments==2.11.2 27 | pylint==2.12.2 28 | pyparsing==2.4.7 29 | pytest==6.2.3 30 | pytz==2022.1 31 | requests==2.27.1 32 | rope==0.19.0 33 | snowballstemmer==2.2.0 34 | soupsieve==2.3.1 35 | Sphinx==4.4.0 36 | sphinxcontrib-applehelp==1.0.2 37 | sphinxcontrib-devhelp==1.0.2 38 | sphinxcontrib-htmlhelp==2.0.0 39 | sphinxcontrib-jsmath==1.0.1 40 | sphinxcontrib-qthelp==1.0.3 41 | sphinxcontrib-serializinghtml==1.1.5 42 | toml==0.10.2 43 | typing-extensions==4.1.1 44 | urllib3==1.26.9 45 | wrapt==1.13.3 46 | zipp==3.7.0 47 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = LICENSE 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from pathlib import Path 4 | this_directory = Path(__file__).parent 5 | long_description = (this_directory / "README.md").read_text() 6 | 7 | setup( 8 | name='pyregexp', 9 | packages=['pyregexp'], 10 | version='0.3.1', 11 | license='MIT', 12 | description='Simple regex library', 13 | long_description=long_description, 14 | long_description_content_type='text/markdown', 15 | author='Lorenzo Felletti', 16 | url='https://github.com/lorenzofelletti/pyregex', 17 | download_url='https://github.com/lorenzofelletti/pyregex/archive/v0.3.1.tar.gz', 18 | keywords=['regex', 'regexp', 'engine'], 19 | install_requires=[], 20 | classifiers=[ 21 | 'Development Status :: 3 - Alpha', 22 | 'Intended Audience :: Developers', 23 | 'Operating System :: OS Independent', 24 | 'Topic :: Scientific/Engineering :: Information Analysis', 25 | 'Topic :: Software Development :: Libraries :: Python Modules', 26 | 'Topic :: Text Processing', 27 | 'Topic :: Text Processing :: General', 28 | 'License :: OSI Approved :: MIT License', 29 | 'Programming Language :: Python :: 3', 30 | 'Programming Language :: Python :: 3.8', 31 | 'Programming Language :: Python :: 3.9', 32 | 'Programming Language :: Python :: 3.10', 33 | ], 34 | ) 35 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lorenzofelletti/pyregex/1fde1086bf5c6b24258a7942b862e5a41394f172/test/__init__.py -------------------------------------------------------------------------------- /test/test_engine.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ..pyregexp.engine import RegexEngine 3 | 4 | 5 | @pytest.fixture 6 | def reng() -> RegexEngine: 7 | return RegexEngine() 8 | 9 | 10 | def test_simplest(reng: RegexEngine): 11 | assert (True, 1) == reng.match('a', 'a') 12 | 13 | 14 | def test_simplest_with_wildcard(reng: RegexEngine): 15 | assert (True, 1) == reng.match('.', 'a') 16 | 17 | 18 | def test_simplest_but_longer(reng: RegexEngine): 19 | assert (True, 3) == reng.match('a.c', 'abc') 20 | 21 | 22 | def test_wildcard(reng: RegexEngine): 23 | assert (True, 2) == reng.match('.*a', 'aa') 24 | 25 | 26 | def test_backtracking(reng: RegexEngine): 27 | assert (True, 4) == reng.match('a*a', 'aaaa') 28 | 29 | 30 | def test_or(reng: RegexEngine): 31 | assert (True, 1) == reng.match('a.*|b', 'b') 32 | 33 | 34 | def test_or_no_match(reng: RegexEngine): 35 | res, _ = reng.match('^a|b$', 'c') 36 | assert res == False 37 | 38 | 39 | def test_or_no_match_with_bt(reng: RegexEngine): 40 | res, _ = reng.match('a|b', 'c') 41 | assert res == False 42 | 43 | 44 | def test_bt_no_match(reng: RegexEngine): 45 | res, _ = reng.match('a{5}a', 'aaaaa') 46 | assert res == False 47 | 48 | 49 | def test_match_group_zero_or_more(reng: RegexEngine): 50 | res, consumed = reng.match('(a)*', 'aa') 51 | assert (True, 2) == (res, consumed) 52 | 53 | 54 | def test_fail_group_one_or_more(reng: RegexEngine): 55 | res, _ = reng.match('^(a)+', 'b') 56 | assert res == False 57 | 58 | 59 | def test_complex_match(reng: RegexEngine): 60 | res, _ = reng.match('^(a|b+c)?[n-z]{2}', 'axx') 61 | assert res == True 62 | 63 | 64 | def test_complex_match_2(reng: RegexEngine): 65 | res, _ = reng.match('^(a|b+c)?[n-z]{2}', 'xx') 66 | assert res == True 67 | 68 | 69 | def test_match_mail_simple(reng: RegexEngine): 70 | res, _ = reng.match(r'.*@.*\.(com|it)', 'vr@gmail.com') 71 | assert res == True 72 | 73 | 74 | def test_bt_index_leaf(reng: RegexEngine): 75 | res, _ = reng.match(r'^aaaa.*a$', 'aaaaa') 76 | assert res == True 77 | 78 | 79 | def test_bt_index_or(reng: RegexEngine): 80 | res, _ = reng.match(r'^x(a|b)?bc$', 'xbc') 81 | assert res == True 82 | 83 | 84 | def test_bt_index_group(reng: RegexEngine): 85 | res, _ = reng.match(r'^x(a)?ac$', 'xac') 86 | assert res == True 87 | 88 | 89 | def test_match_or_left(reng: RegexEngine): 90 | res, _ = reng.match('na|nb', 'na') 91 | assert res == True 92 | 93 | 94 | def test_match_or_right(reng: RegexEngine): 95 | res, _ = reng.match('na|nb', 'nb') 96 | assert res == True 97 | 98 | 99 | def test_match_or_right_at_start_end(reng: RegexEngine): 100 | res, _ = reng.match('^na|nb$', 'nb') 101 | assert res == True 102 | 103 | 104 | def test_no_match_after_end(reng: RegexEngine): 105 | res, _ = reng.match('^na|nb$', 'nb ') 106 | assert res == False 107 | 108 | 109 | def test_match_sequence_with_start_end_correctly(reng: RegexEngine): 110 | res, _ = reng.match('^a|b$', 'a ') 111 | assert res == True 112 | 113 | res, _ = reng.match('^a|b$', ' a ') 114 | assert res == False 115 | 116 | res, _ = reng.match('^a|b$', ' b') 117 | assert res == True 118 | 119 | res, _ = reng.match('^a|b$', ' b ') 120 | assert res == False 121 | 122 | 123 | def test_complex_match_3(reng: RegexEngine): 124 | res, _ = reng.match('a(b|[c-n])+b{3}.{2}', 'ahhbbbbbb') 125 | assert res == True 126 | 127 | 128 | def test_bit_less_complex_match_3(reng: RegexEngine): 129 | res, _ = reng.match('a(b|[c-n])+b{3}', 'ahhbbbbbb') 130 | assert res == True 131 | 132 | 133 | def test_unescaped_special_ch(reng: RegexEngine): 134 | with pytest.raises(Exception): 135 | reng.match('$a^', 'aa') 136 | 137 | 138 | def test_various_emails(reng: RegexEngine): 139 | res, _ = reng.match(r'.*@(gmail|hotmail)\.(com|it)', 'baa.aa@hotmail.it') 140 | assert res == True 141 | res, _ = reng.match(r'.*@(gmail|hotmail)\.(com|it)', 'baa.aa@gmail.com') 142 | assert res == True 143 | res, _ = reng.match(r'.*@(gmail|hotmail)\.(com|it)', 'baa.aa@hotmaila.com') 144 | assert res == False 145 | 146 | 147 | def test_match_empty(reng: RegexEngine): 148 | res, _ = reng.match('^$', '') 149 | assert res == True 150 | res, _ = reng.match('$', '') 151 | assert res == True 152 | res, _ = reng.match('^', '') 153 | assert res == True 154 | 155 | 156 | def test_match_space(reng: RegexEngine): 157 | res, _ = reng.match(r'\s', r' ') 158 | assert res == True 159 | res, _ = reng.match(r'\s', '\t') 160 | assert res == True 161 | res, _ = reng.match(r'\s', '\r') 162 | assert res == True 163 | res, _ = reng.match(r'\s', '\f') 164 | assert res == True 165 | res, _ = reng.match(r'\s', '\n') 166 | assert res == True 167 | res, _ = reng.match(r'\s', '\v') 168 | assert res == True 169 | 170 | 171 | def test_match_space_2(reng: RegexEngine): 172 | res, _ = reng.match(r'\s+', '\r\t\n \f \v') 173 | assert res == True 174 | res, _ = reng.match(r'^\s$', '\r\t') 175 | assert res == False 176 | 177 | 178 | def test_return_matches_simple(reng: RegexEngine): 179 | res, _, matches = reng.match(r'a\s', r'a ', return_matches=True) 180 | assert res == True 181 | assert len(matches[0]) == 1 182 | 183 | 184 | def test_return_matches_two(reng: RegexEngine): 185 | res, _m, matches = reng.match(r'a(b)+a', r'abba', return_matches=True) 186 | assert res == True 187 | assert len(matches[0]) == 2 188 | 189 | 190 | def test_non_capturing_group(reng: RegexEngine): 191 | res, _, matches = reng.match(r'a(?:b)+a', r'abba', return_matches=True) 192 | assert res == True 193 | assert len(matches[0]) == 1 194 | 195 | 196 | def test_continue_after_match_and_return_matches_simple(reng: RegexEngine): 197 | string = 'abba' 198 | res, consumed, matches = reng.match( 199 | r'a', string, continue_after_match=True, return_matches=True) 200 | assert consumed == len(string) 201 | assert len(matches) == 2 202 | assert len(matches[0]) == 1 203 | x = matches[0] 204 | assert matches[0][0].match == 'a' 205 | assert len(matches[1]) == 1 206 | assert matches[1][0].match == 'a' 207 | 208 | 209 | def test_continue_after_match_and_return_matches_2(reng: RegexEngine): 210 | string = 'abbai' 211 | res, consumed, matches = reng.match( 212 | r'a', string, continue_after_match=True, return_matches=True) 213 | assert consumed == len(string)-1 214 | assert len(matches) == 2 215 | assert len(matches[0]) == 1 216 | x = matches[0] 217 | assert matches[0][0].match == 'a' 218 | assert len(matches[1]) == 1 219 | assert matches[1][0].match == 'a' 220 | 221 | 222 | def test_question_mark(reng: RegexEngine): 223 | res, _ = reng.match(r'https?://', r'http://') 224 | assert res == True 225 | res, _ = reng.match(r'https?://', r'https://') 226 | assert res == True 227 | 228 | 229 | def test_engine_1(reng: RegexEngine): 230 | with pytest.raises(Exception): 231 | res, _ = reng.match("$^", '') 232 | 233 | 234 | def test_engine_2(reng: RegexEngine): 235 | regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" 236 | 237 | mail = "lorenzo.felletti@mail.com" 238 | res, consumed = reng.match(regex, mail) 239 | assert res == True 240 | assert consumed == len(mail) 241 | 242 | mail = "lorenzo.felletti@mail.c" 243 | res, _ = reng.match(regex, mail) 244 | assert res == False 245 | 246 | mail = "lorenzo.fellettimail.com" 247 | res, _ = reng.match(regex, mail) 248 | assert res == False 249 | 250 | mail = "lorenz^^o.felletti@mymail.com" 251 | res, _ = reng.match(regex, mail) 252 | assert res == False 253 | 254 | mail = "lorenz0.%+-@mymail.com" 255 | res, _ = reng.match(regex, mail) 256 | assert res == True 257 | 258 | 259 | def test_engine_3(reng: RegexEngine): 260 | string = "lorem ipsum" 261 | res, consumed = reng.match(r"m", string, continue_after_match=True) 262 | assert res == True 263 | assert consumed == len(string) 264 | 265 | 266 | def test_engine_4(reng: RegexEngine): 267 | string = "lorem ipsum" 268 | res, consumed, matches = reng.match( 269 | r"m", string, continue_after_match=True, return_matches=True) 270 | assert res == True 271 | assert consumed == len(string) 272 | 273 | assert len(matches) == 2 274 | assert matches[0][0].match == 'm' 275 | assert matches[1][0].match == 'm' 276 | 277 | 278 | def test_engine_5(reng: RegexEngine): 279 | match_1 = "lor.fel@ah.ha" 280 | match_2 = "fel.log@ha.ah" 281 | string = match_1 + " " + match_2 282 | res, consumed, matches = reng.match( 283 | r"[a-z.]+@[a-z]+\.[a-z]{2}", string, continue_after_match=True, return_matches=True) 284 | assert res == True 285 | assert consumed == len(string) 286 | 287 | assert len(matches) == 2 288 | assert matches[0][0].match == match_1 289 | assert matches[1][0].match == match_2 290 | 291 | 292 | def test_engine_6(reng: RegexEngine): 293 | res, consumed = reng.match(r'[\abc]', r'\\') 294 | assert res == False 295 | assert consumed == 0 296 | 297 | res, _ = reng.match(r'[\\abc]', r'\\') 298 | assert res == True 299 | 300 | 301 | def test_engine_7(reng: RegexEngine): 302 | res, _ = reng.match(r'(a)+(a)?(a{2}|b)+', 'aaabbaa') 303 | assert res == True 304 | 305 | 306 | def test_engine_8(reng: RegexEngine): 307 | res, _ = reng.match(r'(a){2}', r'a') 308 | assert res == False 309 | 310 | res, _ = reng.match(r'(aa){1,2}', r'aa') 311 | assert res == True 312 | 313 | 314 | def test_named_group(reng: RegexEngine): 315 | res, _, matches = reng.match( 316 | r'(?clancy)', r'clancy', return_matches=True) 317 | assert res == True 318 | assert matches[0][1].name == 'fancy' 319 | 320 | 321 | def test_named_group_fail_1(reng: RegexEngine): 322 | with pytest.raises(Exception): 323 | res, _ = reng.match(r"(?<)", '') 324 | 325 | 326 | def test_named_group_fail_2(reng: RegexEngine): 327 | with pytest.raises(Exception): 328 | res, _ = reng.match(r"(?asf)", '') 334 | 335 | 336 | def test_matches_indexes(reng: RegexEngine): 337 | test_str = "abbabbab" 338 | res, consumed, matches = reng.match( 339 | r"a", test_str, continue_after_match=True, return_matches=True) 340 | assert res == True 341 | assert consumed == len(test_str) - 1 342 | assert len(matches) == 3 343 | assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1 344 | assert matches[1][0].start_idx == 3 and matches[1][0].end_idx == 4 345 | assert matches[2][0].start_idx == 6 and matches[2][0].end_idx == 7 346 | 347 | 348 | def test_returned_matches_indexes(reng: RegexEngine): 349 | regex = r"(a)(a)(a)(a)(a)(a)" 350 | test_str = "aaaaaaaaaacccaaaaaac" 351 | res, consumed, matches = reng.match(regex, test_str, True, True) 352 | 353 | assert res == True 354 | assert consumed == len(test_str)-1 355 | assert matches is not None and len(matches) == 2 356 | assert len(matches[0]) == 7 357 | assert len(matches[1]) == 7 358 | assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == 6 359 | assert matches[0][1].start_idx == 5 and matches[0][1].end_idx == 6 360 | assert matches[0][2].start_idx == 4 and matches[0][2].end_idx == 5 361 | assert matches[0][3].start_idx == 3 and matches[0][3].end_idx == 4 362 | assert matches[0][4].start_idx == 2 and matches[0][4].end_idx == 3 363 | assert matches[0][5].start_idx == 1 and matches[0][5].end_idx == 2 364 | assert matches[0][6].start_idx == 0 and matches[0][6].end_idx == 1 365 | 366 | assert matches[1][0].start_idx == 13 and matches[1][0].end_idx == 19 367 | assert matches[1][1].start_idx == 18 and matches[1][1].end_idx == 19 368 | assert matches[1][2].start_idx == 17 and matches[1][2].end_idx == 18 369 | assert matches[1][3].start_idx == 16 and matches[1][3].end_idx == 17 370 | assert matches[1][4].start_idx == 15 and matches[1][4].end_idx == 16 371 | assert matches[1][5].start_idx == 14 and matches[1][5].end_idx == 15 372 | assert matches[1][6].start_idx == 13 and matches[1][6].end_idx == 14 373 | 374 | 375 | # this one loops 376 | def test_returned_groups(reng: RegexEngine): 377 | # group e will not be matched due to the greediness of the engine, 378 | # .* "eats" the "e" in test_str 379 | regex = r"a(b).*(e)?c(c)(c)c" 380 | test_str = "abxxecccc" 381 | res, consumed, matches = reng.match(regex, test_str, True, True) 382 | 383 | assert res == True 384 | assert consumed == len(test_str) 385 | assert len(matches) == 1 386 | assert len(matches[0]) == 4 387 | assert matches[0][0].match == test_str 388 | assert matches[0][1].match == "c" and matches[0][1].start_idx == len( 389 | test_str) - 2 390 | assert matches[0][2].match == "c" and matches[0][2].start_idx == len( 391 | test_str) - 3 392 | assert matches[0][3].match == "b" and matches[0][3].start_idx == 1 393 | 394 | 395 | def test_on_long_string(reng: RegexEngine): 396 | regex = r"a(b)?.{0,10}c(d)" 397 | test_str = "abcd dcvrsbshpeuiògjAAwdew ac abc vcsweacscweflllacd" 398 | res, _, matches = reng.match(regex, test_str, True, True) 399 | 400 | assert res == True 401 | assert len(matches) == 2 402 | 403 | assert len(matches[0]) == 3 404 | assert matches[0][0].start_idx == 0 and \ 405 | matches[0][0].end_idx == 4 406 | assert matches[0][1].start_idx == 3 and \ 407 | matches[0][1].end_idx == 4 408 | assert matches[0][2].start_idx == 1 and \ 409 | matches[0][2].end_idx == 2 410 | 411 | len(matches[1]) == 2 412 | assert matches[1][0].start_idx == 39 and \ 413 | matches[1][0].end_idx == len(test_str) 414 | assert matches[1][1].start_idx == len(test_str)-1 and \ 415 | matches[1][1].end_idx == len(test_str) 416 | 417 | 418 | def test_ignore_case_no_casefolding(reng: RegexEngine): 419 | regex = r"ss" 420 | test_str = "SS" 421 | res, _ = reng.match(regex, test_str, ignore_case=1) 422 | assert res == True 423 | 424 | regex = r"ÄCHER" 425 | test_str = "ächer" 426 | res, _ = reng.match(regex, test_str, ignore_case=1) 427 | assert res == True 428 | 429 | regex = r"ÄCHER" 430 | test_str = "acher" 431 | res, _ = reng.match(regex, test_str, ignore_case=1) 432 | assert res == False 433 | 434 | 435 | def test_ignore_case_casefolding(reng: RegexEngine): 436 | regex = r"ẞ" 437 | test_str = "SS" 438 | res, _ = reng.match(regex, test_str, ignore_case=2) 439 | assert res == True 440 | 441 | regex = r"ÄCHER" 442 | test_str = "ächer" 443 | res, _ = reng.match(regex, test_str, ignore_case=2) 444 | assert res == True 445 | 446 | regex = r"ÄCHER" 447 | test_str = "acher" 448 | res, _ = reng.match(regex, test_str, ignore_case=2) 449 | assert res == False 450 | 451 | 452 | def test_empty_regex(reng: RegexEngine): 453 | regex = r"" 454 | test_str = "aaaa" 455 | 456 | # repeate the test with different optional parameters configurations 457 | res, _ = reng.match(regex, test_str) 458 | assert res == True 459 | 460 | res, _ = reng.match(regex, test_str, ignore_case=1) 461 | assert res == True 462 | 463 | res, _ = reng.match(regex, test_str, ignore_case=2) 464 | assert res == True 465 | 466 | res, _ = reng.match(regex, test_str, continue_after_match=True) 467 | assert res == True 468 | 469 | res, _, matches = reng.match(regex, test_str, return_matches=True) 470 | assert res == True 471 | assert len(matches) == 1 and len(matches[0]) == 1 472 | assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0 473 | 474 | res, _, matches = reng.match(regex, test_str, True, True, 0) 475 | assert res == True 476 | assert len(matches) == 1 and len(matches[0]) == 1 477 | assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0 478 | 479 | res, _, matches = reng.match(regex, test_str, True, True, 1) 480 | assert res == True 481 | assert len(matches) == 1 and len(matches[0]) == 1 482 | assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0 483 | 484 | res, _, matches = reng.match(regex, test_str, True, True, 2) 485 | assert res == True 486 | assert len(matches) == 1 and len(matches[0]) == 1 487 | assert matches[0][0].match == "" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 0 488 | 489 | 490 | def test_empty_test_str(reng: RegexEngine): 491 | regex = r"a" 492 | test_str = "" 493 | res, _ = reng.match(regex, test_str) 494 | assert res == False 495 | 496 | 497 | def test_empty_regex_and_test_str(reng: RegexEngine): 498 | regex = r"" 499 | test_str = "" 500 | res, _ = reng.match(regex, test_str) 501 | assert res == True 502 | 503 | 504 | def test_regex_with_rigth_empty_group(reng: RegexEngine): 505 | regex = r"a|" 506 | test_str = "ab" 507 | 508 | # repeate the test with different optional parameters configurations 509 | res, _ = reng.match(regex, test_str) 510 | assert res == True 511 | 512 | res, _ = reng.match(regex, test_str, ignore_case=1) 513 | assert res == True 514 | 515 | res, _ = reng.match(regex, test_str, ignore_case=2) 516 | assert res == True 517 | 518 | res, _ = reng.match(regex, test_str, continue_after_match=True) 519 | assert res == True 520 | 521 | res, _, matches = reng.match(regex, test_str, return_matches=True) 522 | assert res == True 523 | assert len(matches) == 1 and len(matches[0]) == 1 524 | assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1 525 | 526 | res, _, matches = reng.match(regex, test_str, True, True, 0) 527 | assert res == True 528 | assert len(matches) == 1 and len(matches[0]) == 1 529 | assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1 530 | 531 | res, _, matches = reng.match(regex, test_str, True, True, 1) 532 | assert res == True 533 | assert len(matches) == 1 and len(matches[0]) == 1 534 | assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1 535 | 536 | res, _, matches = reng.match(regex, test_str, True, True, 2) 537 | assert res == True 538 | assert len(matches) == 1 and len(matches[0]) == 1 539 | assert matches[0][0].match == "a" and matches[0][0].start_idx == 0 and matches[0][0].end_idx == 1 540 | 541 | 542 | def test_empty_group_quantified(reng: RegexEngine): 543 | regex = r'()+' 544 | test_str = 'ab' 545 | res, _ = reng.match(regex, test_str) 546 | assert res == True 547 | 548 | 549 | def test_nested_quantifiers(reng: RegexEngine): 550 | regex = r'(a*)+ab' 551 | test_str = 'aab' 552 | res, _ = reng.match(regex, test_str) 553 | assert res == True 554 | 555 | regex = r'(a+)+ab' 556 | test_str = 'ab' 557 | res, _ = reng.match(regex, test_str) 558 | assert res == False 559 | 560 | 561 | def test_nested_quantifiers_with_or_node(reng: RegexEngine): 562 | regex = r'(a*|b*)*ab' 563 | test_str = 'ab' 564 | res, _ = reng.match(regex, test_str) 565 | assert res == True 566 | 567 | regex = r'(a*|b*)+ab' 568 | test_str = 'ab' 569 | res, _ = reng.match(regex, test_str) 570 | assert res == True 571 | 572 | regex = r'(a+|b+)+ab' 573 | test_str = 'ab' 574 | res, _ = reng.match(regex, test_str) 575 | assert res == False 576 | 577 | 578 | def test_multiple_named_groups(reng: RegexEngine): 579 | regex = r"(?[a-z]+)(?i)(?l)" 580 | test_str = "nostril" 581 | res, _, _ = reng.match(regex, test_str, True, True, 0) 582 | assert res == True 583 | 584 | 585 | def test_one_named_group(reng: RegexEngine): 586 | regex = r"[a-z]+(?l)" 587 | test_str = "nostril" 588 | res, _, matches = reng.match(regex, test_str, True, True, 0) 589 | assert res == True 590 | 591 | 592 | def test_two_separated_named_group(reng: RegexEngine): 593 | regex = r"(?n)[a-z]+(?l)" 594 | test_str = "nostril" 595 | res, _, matches = reng.match(regex, test_str, True, True, 0) 596 | assert res == True 597 | assert len(matches) == 1 598 | assert len(matches[0]) == 3 599 | assert matches[0][0].match == "nostril" 600 | assert matches[0][1].match == "l" 601 | assert matches[0][2].match == "n" 602 | 603 | 604 | def test_match_contiguous_named_groups(reng: RegexEngine): 605 | regex = r"(?n)(?l)" 606 | test_str = "nl" 607 | res, _, matches = reng.match(regex, test_str, True, True, 0) 608 | assert res == True 609 | assert len(matches) == 1 610 | assert len(matches[0]) == 3 611 | assert matches[0][0].match == "nl" 612 | assert matches[0][1].match == "l" 613 | assert matches[0][2].match == "n" 614 | 615 | 616 | def test_named_group_with_range_element(reng: RegexEngine): 617 | regex = r"(?[a-z])(?l)" 618 | test_str = "nl" 619 | res, _, matches = reng.match(regex, test_str, True, True, 0) 620 | assert res == True 621 | assert len(matches) == 1 622 | assert len(matches[0]) == 3 623 | assert matches[0][0].match == "nl" 624 | assert matches[0][1].match == "l" 625 | assert matches[0][2].match == "n" 626 | 627 | 628 | def test_named_group_with_range_element_and_quantifier(reng: RegexEngine): 629 | regex = r"(?[a-z]+)(?l)" 630 | test_str = "nl" 631 | res, _, matches = reng.match(regex, test_str, True, True, 0) 632 | assert res == True 633 | assert len(matches) == 1 634 | assert len(matches[0]) == 3 635 | assert matches[0][0].match == "nl" 636 | assert matches[0][1].match == "l" 637 | assert matches[0][2].match == "n" 638 | 639 | 640 | def test_backtracking_or_node_inside_group_node(reng: RegexEngine): 641 | regex = r"(?b{1,2}|[a-z]+)(?l)" 642 | test_str = "bnl" 643 | 644 | res, _, matches = reng.match(regex, test_str, True, True, 0) 645 | assert res == True 646 | assert len(matches) == 1 647 | assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == len(test_str) 648 | assert matches[0][1].start_idx == 2 and matches[0][1].end_idx == len(test_str) 649 | assert matches[0][2].start_idx == 0 and matches[0][2].end_idx == 2 650 | 651 | regex = r"(?[a-z]+|b{1,2})(?l)" 652 | res, _, matches = reng.match(regex, test_str, True, True, 0) 653 | assert res == True 654 | assert len(matches) == 1 655 | assert matches[0][0].start_idx == 0 and matches[0][0].end_idx == len(test_str) 656 | assert matches[0][1].start_idx == 2 and matches[0][1].end_idx == len(test_str) 657 | assert matches[0][2].start_idx == 0 and matches[0][2].end_idx == 2 658 | 659 | 660 | def test_double_or_nodes_with_wildcard_in_between(reng: RegexEngine): 661 | res, _ = reng.match(r'@(gm|ho).(com|it)', '@hoa.com') 662 | assert res == False 663 | -------------------------------------------------------------------------------- /test/test_engine2.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ..pyregexp.engine import RegexEngine 3 | 4 | 5 | @pytest.fixture 6 | def reng() -> RegexEngine: 7 | return RegexEngine() 8 | 9 | 10 | def test_1(reng: RegexEngine): 11 | regex = r"(ad+a)*a" 12 | test_str = "adaa" 13 | 14 | res, consumed, matches = reng.match(regex, test_str, True, True) 15 | 16 | assert res == True 17 | consumed == len(test_str) 18 | assert len(matches) == 1 19 | 20 | 21 | def test_2(reng: RegexEngine): 22 | regex = r"0|1|2|3" 23 | test_str = "3210" 24 | 25 | res, consumed, matches = reng.match(regex, test_str, True, True) 26 | 27 | assert res == True 28 | consumed == len(test_str) 29 | assert len(matches) == 4 30 | -------------------------------------------------------------------------------- /test/test_lexer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ..pyregexp.tokens import * 3 | from ..pyregexp.lexer import Lexer 4 | 5 | 6 | @pytest.fixture 7 | def lexer(): 8 | return Lexer() 9 | 10 | 11 | def test_simple_re_lexing(lexer: Lexer): 12 | tokens = lexer.scan('a') 13 | assert tokens[0].char == 'a' 14 | 15 | 16 | def test_escaping_char(lexer: Lexer): 17 | tokens = lexer.scan(r'a\\a\\t\.') 18 | assert type(tokens[1]) is ElementToken and tokens[1].char == '\\' 19 | 20 | 21 | def test_escaping_get_tab(lexer: Lexer): 22 | tokens = lexer.scan(r'a\h\t') 23 | assert type(tokens[2]) is ElementToken and tokens[2].char == '\t' 24 | 25 | 26 | def test_escaping_wildcard(lexer: Lexer): 27 | tokens = lexer.scan(r'\.') 28 | assert type(tokens[0]) is ElementToken and tokens[0].char == '.' 29 | 30 | 31 | def test_get_comma(lexer: Lexer): 32 | tokens = lexer.scan('a{3,5}') 33 | assert type(tokens[3]) is Comma 34 | 35 | 36 | def test_comma_is_element(lexer: Lexer): 37 | tokens = lexer.scan('a,') 38 | assert type(tokens[1]) is ElementToken 39 | 40 | 41 | def test_match_start(lexer: Lexer): 42 | tokens = lexer.scan('^a') 43 | assert type(tokens[0]) is Start 44 | 45 | 46 | def test_match_end(lexer: Lexer): 47 | tokens = lexer.scan(r'fdsad\$cs$') 48 | assert type(tokens[len(tokens) - 1]) is End 49 | 50 | 51 | def test_fail_curly(lexer: Lexer): 52 | with pytest.raises(Exception): 53 | lexer.scan('advfe{a}') 54 | 55 | 56 | def test_lexer_1(lexer: Lexer): 57 | tokens = lexer.scan(r'-\\\/\s~') 58 | assert len(tokens) == 5 59 | assert type(tokens[0]) is Dash 60 | assert type(tokens[1]) is ElementToken 61 | assert type(tokens[2]) is ElementToken 62 | assert type(tokens[3]) is SpaceToken 63 | assert type(tokens[4]) is ElementToken 64 | -------------------------------------------------------------------------------- /test/test_parser.py: -------------------------------------------------------------------------------- 1 | import math 2 | import pytest 3 | from ..pyregexp.re_ast import RE, EndElement, GroupNode, Element, OrNode, RangeElement, SpaceElement, StartElement 4 | from ..pyregexp.pyrser import Pyrser 5 | 6 | 7 | @pytest.fixture 8 | def parser(): 9 | return Pyrser() 10 | 11 | 12 | def test_simple_regex(parser: Pyrser): 13 | ast = parser.parse('a') 14 | print(ast) 15 | assert type(ast) is RE 16 | assert type(ast.child) is GroupNode 17 | assert type(ast.child.children[0]) is Element 18 | 19 | 20 | def test_grouping(parser: Pyrser): 21 | ast = parser.parse('a(b)c') 22 | 23 | # top level group 24 | assert len(ast.child.children) == 3 25 | assert type(ast.child.children[0]) is Element 26 | assert type(ast.child.children[1]) is GroupNode 27 | assert type(ast.child.children[2]) is Element 28 | 29 | # ast.child.children[1] group '(a)' 30 | assert len(ast.child.children[1].children) == 1 31 | assert type(ast.child.children[1].children[0]) is Element 32 | 33 | 34 | def test_curly_braces_1(parser: Pyrser): 35 | ast = parser.parse(r'a{5}b') 36 | assert len(ast.child.children) == 2 37 | 38 | 39 | def test_fail_curly(parser: Pyrser): 40 | with pytest.raises(Exception): 41 | parser.parse('a{3,3}}') 42 | 43 | 44 | def test_fail_no_closing_par(parser: Pyrser): 45 | with pytest.raises(Exception): 46 | parser.parse('a[d]((vfw)') 47 | 48 | 49 | def test_parse_match_start_end(parser: Pyrser): 50 | ast = parser.parse('^aaaa.*a$') 51 | assert len(ast.child.children) == 8 52 | 53 | 54 | def test_complex_regex(parser: Pyrser): 55 | ast = parser.parse(r'^[a-zA-Z]{1,20}@[a-zA-Z]\.[a-z]{1,3}$') 56 | assert len(ast.child.children) == 7 57 | 58 | assert type(ast.child.children[0]) is StartElement 59 | 60 | assert type(ast.child.children[1]) is RangeElement 61 | assert ast.child.children[1].min == 1 62 | assert ast.child.children[1].max == 20 63 | 64 | assert type(ast.child.children[2]) is Element 65 | 66 | assert type(ast.child.children[3]) is RangeElement 67 | 68 | assert type(ast.child.children[4]) is Element 69 | 70 | assert type(ast.child.children[5]) is RangeElement 71 | assert ast.child.children[5].min == 1 72 | assert ast.child.children[5].max == 3 73 | 74 | assert type(ast.child.children[6]) is EndElement 75 | 76 | 77 | def test_space_element(parser: Pyrser): 78 | ast = parser.parse(r'\s') 79 | assert len(ast.child.children) == 1 80 | assert type(ast.child.children[0]) is SpaceElement 81 | 82 | 83 | def test_range_1(parser: Pyrser): 84 | ast = parser.parse('[^a-z]') 85 | assert len(ast.child.children) == 1 86 | assert type(ast.child.children[0]) is RangeElement 87 | assert ast.child.children[0].is_match('a') == False 88 | 89 | 90 | def test_range_2(parser: Pyrser): 91 | ast = parser.parse(r'[^a-z-\s-]') 92 | assert len(ast.child.children) == 1 93 | assert type(ast.child.children[0]) is RangeElement 94 | assert ast.child.children[0].is_match('a') == False 95 | assert ast.child.children[0].is_match('-') == False 96 | ast.child.children[0].is_match(' ') == False 97 | 98 | 99 | def test_range_3(parser: Pyrser): 100 | ast = parser.parse(r'[a-z-\s-]') 101 | assert len(ast.child.children) == 1 102 | assert type(ast.child.children[0]) is RangeElement 103 | assert ast.child.children[0].is_match('a') == True 104 | assert ast.child.children[0].is_match('-') == True 105 | ast.child.children[0].is_match(' ') == True 106 | 107 | 108 | def test_range_2(parser: Pyrser): 109 | ast = parser.parse(r'[\]]') 110 | assert len(ast.child.children) == 1 111 | assert type(ast.child.children[0]) is RangeElement 112 | assert ast.child.children[0].is_match(']') == True 113 | 114 | 115 | def test_parse_curly_1(parser: Pyrser): 116 | ast = parser.parse(r'a{2}') 117 | assert len(ast.child.children) == 1 118 | assert type(ast.child.children[0]) is Element 119 | assert ast.child.children[0].is_match('a') == True 120 | assert ast.child.children[0].min == 2 121 | ast.child.children[0].max == 2 122 | 123 | 124 | def test_parse_curly_2(parser: Pyrser): 125 | ast = parser.parse(r'a{,2}') 126 | assert len(ast.child.children) == 1 127 | assert type(ast.child.children[0]) is Element 128 | assert ast.child.children[0].is_match('a') == True 129 | assert ast.child.children[0].min == 0 130 | ast.child.children[0].max == 2 131 | 132 | 133 | def test_parse_curly_3(parser: Pyrser): 134 | ast = parser.parse(r'a{2,}') 135 | assert len(ast.child.children) == 1 136 | assert type(ast.child.children[0]) is Element 137 | assert ast.child.children[0].is_match('a') == True 138 | assert ast.child.children[0].min == 2 139 | ast.child.children[0].max == math.inf 140 | 141 | 142 | def test_parse_curly_4(parser: Pyrser): 143 | ast = parser.parse(r'a{,}') 144 | assert len(ast.child.children) == 1 145 | assert type(ast.child.children[0]) is Element 146 | assert ast.child.children[0].is_match('a') == True 147 | assert ast.child.children[0].min == 0 148 | ast.child.children[0].max == math.inf 149 | 150 | 151 | def test_parse_fail_empty_curly(parser: Pyrser): 152 | with pytest.raises(Exception): 153 | ast = parser.parse(r'a{}') 154 | 155 | 156 | def test_fail_quatifier_unescaped(parser: Pyrser): 157 | with pytest.raises(Exception): 158 | ast = parser.parse(r'?') 159 | 160 | 161 | def test_parse_fail_missing_clising_bracket(parser: Pyrser): 162 | with pytest.raises(Exception): 163 | ast = parser.parse(r'a[abc') 164 | 165 | 166 | def test_parse_fail_unescaped_closing_bracket(parser: Pyrser): 167 | with pytest.raises(Exception): 168 | ast = parser.parse(r'abc]') 169 | 170 | 171 | def test_parse_fail_unescaped_closing_parenthesis(parser: Pyrser): 172 | with pytest.raises(Exception): 173 | ast = parser.parse(r'a)') 174 | 175 | 176 | def test_parse_fail_unescaped_start(parser: Pyrser): 177 | with pytest.raises(Exception): 178 | ast = parser.parse(r'^^') 179 | 180 | 181 | def test_parse_fail_unescaped_end(parser: Pyrser): 182 | with pytest.raises(Exception): 183 | ast = parser.parse(r'$$') 184 | 185 | 186 | def test_parse_fail_swapped_range(parser: Pyrser): 187 | with pytest.raises(Exception): 188 | ast = parser.parse(r'[z-a]') 189 | 190 | 191 | def test_parse_fail_non_capturing_group(parser: Pyrser): 192 | with pytest.raises(Exception): 193 | parser.parse(r'(?') 194 | 195 | with pytest.raises(Exception): 196 | parser.parse(r'(?aa') 197 | 198 | 199 | def test_parse_fail_non_closed_range(parser: Pyrser): 200 | with pytest.raises(Exception): 201 | parser.parse(r'[a') 202 | 203 | with pytest.raises(Exception): 204 | parser.parse(r'[') 205 | 206 | 207 | def test_parse_onrnode_groups_names(parser: Pyrser): 208 | regex = r'a|b' 209 | ast = parser.parse(regex) 210 | assert len(ast.children) == 1 211 | assert isinstance(ast.child, OrNode) 212 | assert isinstance(ast.child.left, GroupNode) 213 | assert isinstance(ast.child.right, GroupNode) 214 | assert ast.child.left.group_name == ast.child.right.group_name 215 | assert ast.child.left.group_id == ast.child.right.group_id 216 | 217 | 218 | def test_groups_names_double_ornode(parser: Pyrser): 219 | regex = r'a|b|c' 220 | ast = parser.parse(regex) 221 | assert len(ast.children) == 1 222 | assert isinstance(ast.child, OrNode) 223 | assert isinstance(ast.child.left, GroupNode) 224 | leftmost_gid = ast.child.left.group_id 225 | leftmost_gname = ast.child.left.group_name 226 | 227 | assert isinstance(ast.child.right, OrNode) 228 | assert isinstance(ast.child.right.left, GroupNode) 229 | central_gid = ast.child.right.left.group_id 230 | central_gname = ast.child.right.left.group_name 231 | 232 | assert isinstance(ast.child.right.right, GroupNode) 233 | rightmost_gid = ast.child.right.right.group_id 234 | rightmost_gname = ast.child.right.right.group_name 235 | 236 | assert leftmost_gid == central_gid 237 | assert central_gid == rightmost_gid 238 | assert leftmost_gname == central_gname 239 | assert central_gname == rightmost_gname 240 | -------------------------------------------------------------------------------- /test/test_re_ast.py: -------------------------------------------------------------------------------- 1 | from ..pyregexp.re_ast import ASTNode, RE, LeafNode, Element, WildcardElement, SpaceElement, RangeElement, StartElement, EndElement, OrNode, NotNode, GroupNode 2 | 3 | 4 | def test_ASTNode(): 5 | ast_node = ASTNode() 6 | assert ast_node is not None 7 | 8 | 9 | def test_RE(): 10 | re = RE(child=Element(match_ch='e')) 11 | assert re is not None 12 | 13 | assert hasattr(re, 'child') 14 | assert hasattr(re, 'children') 15 | 16 | assert re.child is re.children[0] 17 | 18 | 19 | def test_NotNode(): 20 | not_node = NotNode(child=Element(match_ch='e')) 21 | assert not_node is not None 22 | 23 | assert hasattr(not_node, 'child') 24 | assert hasattr(not_node, 'children') 25 | 26 | assert not_node.child is not_node.children[0] 27 | 28 | 29 | def test_LeafNode(): 30 | ln = LeafNode() 31 | assert ln is not None 32 | assert hasattr(ln, 'is_match') 33 | 34 | assert ln.is_match() == False 35 | 36 | 37 | def test_WildcardElement(): 38 | we = WildcardElement() 39 | assert we is not None 40 | 41 | 42 | def test_SpaceElement(): 43 | se = SpaceElement() 44 | assert se is not None 45 | assert hasattr(se, 'is_match') 46 | 47 | assert se.is_match(" ") 48 | assert se.is_match("\t") 49 | assert se.is_match("\n") 50 | assert se.is_match("\f") 51 | assert se.is_match("\r") 52 | assert se.is_match("t") == False 53 | 54 | 55 | def test_RangeElement_positive_logic(): 56 | re = RangeElement("abc", True) 57 | assert re is not None 58 | assert re.is_positive_logic == True 59 | 60 | assert re.is_match("a") == True 61 | assert re.is_match("x") == False 62 | 63 | 64 | def test_RangeElement_negative_logic(): 65 | nre = RangeElement("abc", False) 66 | assert nre is not None 67 | assert nre.is_positive_logic == False 68 | 69 | assert nre.is_match("a") == False 70 | assert nre.is_match("x") == True 71 | -------------------------------------------------------------------------------- /test/test_tokens.py: -------------------------------------------------------------------------------- 1 | from ..pyregexp.tokens import Asterisk, Bracket, Circumflex, Comma, CurlyBrace, Dash, ElementToken, End, EndToken, Escape, LeftBracket, LeftCurlyBrace, LeftParenthesis, NotToken, OneOrMore, OrToken, Parenthesis, Plus, Quantifier, QuestionMark, RightBracket, RightCurlyBrace, RightParenthesis, SpaceToken, Start, StartToken, Token, VerticalBar, Wildcard, WildcardToken, ZeroOrMore, ZeroOrOne 2 | 3 | 4 | def test_Asterisk(): 5 | assert issubclass(Asterisk, ZeroOrMore) 6 | 7 | a = Asterisk() 8 | assert a is not None 9 | 10 | assert type(a) == Asterisk 11 | 12 | 13 | def test_NotToken(): 14 | assert issubclass(NotToken, Token) == True 15 | 16 | nt = NotToken(char='^') 17 | assert nt is not None 18 | assert nt.char == '^' 19 | 20 | 21 | def test_Bracket(): 22 | br = Bracket() 23 | assert br is not None 24 | br = LeftBracket() 25 | assert br is not None 26 | br = RightBracket() 27 | assert br is not None 28 | 29 | 30 | def test_Escape(): 31 | escape = Escape() 32 | assert escape is not None 33 | --------------------------------------------------------------------------------