├── requirements.dev.txt ├── .editorconfig ├── .isort.cfg ├── .flake8 ├── main.py ├── CHANGELOG.md ├── greenery ├── __init__.py ├── mult_test.py ├── conc_test.py ├── bound.py ├── bound_test.py ├── multiplier_test.py ├── pattern_test.py ├── multiplier.py ├── parse_test.py ├── charclass_test.py ├── parse.py ├── charclass.py ├── rxelems.py └── fsm.py ├── .gitattributes ├── mypy.ini ├── setup.py ├── LICENSE.txt ├── .github └── workflows │ └── workflow-1.yml ├── .gitignore ├── README.md └── .pylintrc /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | build 3 | flake8 4 | isort 5 | mypy 6 | pylint 7 | pytest 8 | setuptools 9 | twine 10 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | insert_final_newline = true 3 | 4 | [*.py] 5 | charset = utf-8 6 | indent_style = space 7 | indent_size = 4 8 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | add_imports = from __future__ import annotations 4 | remove_redundant_aliases = true 5 | combine_as_imports = true 6 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = 4 | # "black" formatter has slightly different operator spacing rules than 5 | # flake8's defaults. 6 | # whitespace before ‘,’, ‘;’, or ‘:’ 7 | E203, 8 | # line break before binary operator 9 | W503, 10 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from greenery import INF, PLUS, QM, STAR, Bound, Multiplier, parse 4 | 5 | pattern = parse("a") 6 | print(pattern) # "a" 7 | 8 | pattern = pattern * PLUS * QM * STAR * Multiplier(Bound(3), INF) 9 | print(pattern) # "((((a)+)?)*){3,}" 10 | 11 | pattern = pattern.reduce() 12 | print(pattern) # "a*" 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## 4.2.0 4 | 5 | https://github.com/qntm/greenery/pull/106 6 | 7 | ## 4.1.0 8 | 9 | https://github.com/qntm/greenery/pull/99 10 | 11 | ## 4.0.0 12 | 13 | https://github.com/qntm/greenery/pull/67 14 | 15 | ## 3.0 16 | 17 | https://github.com/qntm/greenery/commit/347760c730232b2f0c243917f34bdf596288984a 18 | 19 | ## 2.0 20 | 21 | https://github.com/qntm/greenery/pull/10 22 | 23 | ## 1.0 24 | 25 | Initial release. 
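main.py above stops at parsing, multiplication and reduce(). A minimal sketch of the matching side of the API, assuming Pattern exposes the same to_fsm()/accepts() round-trip that mult_test.py below exercises for Mult:

from greenery import parse

# Compile a parsed pattern down to a finite state machine and test membership.
# (to_fsm() on a Pattern is an assumption of this sketch; the files below only
# show it being called on Mult.)
fsm = parse("a{3,}").to_fsm()
print(fsm.accepts("aa"))    # False - fewer than three "a"s
print(fsm.accepts("aaaa"))  # True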
26 | -------------------------------------------------------------------------------- /greenery/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "Bound", 5 | "INF", 6 | "Multiplier", 7 | "PLUS", 8 | "Pattern", 9 | "QM", 10 | "STAR", 11 | "parse", 12 | "Fsm", 13 | "EPSILON", 14 | "NULL", 15 | "Charclass", 16 | ) 17 | 18 | from .bound import INF, Bound 19 | from .fsm import EPSILON, NULL, Charclass, Fsm 20 | from .multiplier import PLUS, QM, STAR, Multiplier 21 | from .parse import parse 22 | from .rxelems import Pattern 23 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | warn_return_any = True 4 | warn_unused_configs = True 5 | no_implicit_optional = True 6 | 7 | # strict typing 8 | strict_optional = True 9 | disallow_untyped_calls = True 10 | disallow_untyped_defs = True 11 | disallow_incomplete_defs = True 12 | check_untyped_defs = True 13 | disallow_untyped_decorators = True 14 | 15 | disallow_any_generics = True 16 | disallow_subclassing_any = True 17 | no_implicit_reexport = True 18 | strict_concatenate = True 19 | strict_equality = True 20 | warn_redundant_casts = True 21 | warn_unused_ignores = True 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="greenery", 7 | version="4.2.2", 8 | tests_require=["pytest"], 9 | packages=["greenery"], 10 | package_dir={"greenery": "greenery"}, 11 | author="qntm", 12 | author_email="qntm ", 13 | description="Greenery allows manipulation of regular expressions", 14 | license="MIT License", 15 | keywords=" ".join( 16 | [ 17 | "re", 18 | "regex", 19 | "regexp", 20 | "regular", 21 | "expression", 22 | "deterministic", 23 | "finite", 24 | "state", 25 | "machine", 26 | "automaton", 27 | "fsm", 28 | "dfsm", 29 | "fsa", 30 | "dfsa", 31 | "greenery", 32 | ] 33 | ), 34 | url="https://github.com/qntm/greenery", 35 | classifiers=[ 36 | "License :: OSI Approved :: MIT License", 37 | "Programming Language :: Python :: 3.8", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 qntm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in 
the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/workflow-1.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | pull_request: 8 | branches: 9 | - '**' 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 3.8 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: "3.8" 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r requirements.dev.txt 26 | - name: Lint with isort 27 | run: | 28 | isort --quiet --diff --check . 29 | - name: Check formatting with black 30 | run: | 31 | black --diff --check . 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 --count --statistics --show-source --select=E9,F63,F7,F82 . 36 | # exit-zero treats all errors as warnings 37 | flake8 --count --statistics --exit-zero . 38 | - name: Lint with pylint 39 | run: | 40 | pylint --recursive=true . 41 | - name: Check with mypy 42 | run: | 43 | mypy greenery 44 | - name: Test with pytest 45 | run: | 46 | pytest 47 | -------------------------------------------------------------------------------- /greenery/mult_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .bound import INF, Bound 4 | from .charclass import DIGIT, Charclass 5 | from .multiplier import ONE, PLUS, QM, STAR, Multiplier 6 | from .rxelems import Mult 7 | 8 | 9 | def test_mult_equality() -> None: 10 | a = Mult(Charclass("a"), ONE) 11 | # pylint: disable=comparison-with-itself 12 | assert a == a 13 | assert a != Mult(Charclass("b"), ONE) 14 | assert a != Mult(Charclass("a"), QM) 15 | assert a != Mult(Charclass("a"), Multiplier(Bound(1), Bound(2))) 16 | 17 | 18 | def test_mult_str() -> None: 19 | a = Charclass("a") 20 | assert str(Mult(a, ONE)) == "a" 21 | assert str(Mult(a, Multiplier(Bound(2), Bound(2)))) == "a{2}" 22 | assert str(Mult(a, Multiplier(Bound(3), Bound(3)))) == "a{3}" 23 | assert str(Mult(a, Multiplier(Bound(4), Bound(4)))) == "a{4}" 24 | assert str(Mult(a, Multiplier(Bound(5), Bound(5)))) == "a{5}" 25 | assert str(Mult(a, QM)) == "a?" 
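# (The "?", "*" and "+" spellings come from the `symbolic` lookup table in
# multiplier.py; multipliers without a symbolic form fall back to the
# "{m}" / "{m,n}" renderings asserted above and below.)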
26 | assert str(Mult(a, STAR)) == "a*" 27 | assert str(Mult(a, PLUS)) == "a+" 28 | assert str(Mult(a, Multiplier(Bound(2), Bound(5)))) == "a{2,5}" 29 | assert str(Mult(a, Multiplier(Bound(2), INF))) == "a{2,}" 30 | 31 | assert str(Mult(DIGIT, ONE)) == "\\d" 32 | assert str(Mult(DIGIT, Multiplier(Bound(2), Bound(2)))) == "\\d{2}" 33 | assert str(Mult(DIGIT, Multiplier(Bound(3), Bound(3)))) == "\\d{3}" 34 | 35 | 36 | def test_odd_bug() -> None: 37 | # pylint: disable=invalid-name 38 | 39 | # Odd bug with ([bc]*c)?[ab]* 40 | int5A = Mult( 41 | Charclass("bc"), 42 | STAR, 43 | ).to_fsm() 44 | assert int5A.accepts("") 45 | 46 | int5B = Mult( 47 | Charclass("c"), 48 | ONE, 49 | ).to_fsm() 50 | assert int5B.accepts("c") 51 | 52 | int5C = int5A.concatenate(int5B) 53 | assert int5C.accepts("c") 54 | 55 | 56 | def test_mult_common() -> None: 57 | a = Charclass("a") 58 | assert Mult(a, Multiplier(Bound(3), Bound(4))).common( 59 | Mult(a, Multiplier(Bound(2), Bound(5))) 60 | ) == Mult(a, Multiplier(Bound(2), Bound(3))) 61 | assert Mult(a, Multiplier(Bound(2), INF)).common( 62 | Mult(a, Multiplier(Bound(1), Bound(5))) 63 | ) == Mult(a, Multiplier(Bound(1), Bound(5))) 64 | assert Mult(a, Multiplier(Bound(3), INF)).common( 65 | Mult(a, Multiplier(Bound(2), INF)) 66 | ) == Mult(a, Multiplier(Bound(2), INF)) 67 | 68 | 69 | def test_mult_dock() -> None: 70 | a = Charclass("a") 71 | assert Mult(a, Multiplier(Bound(4), Bound(5))).dock( 72 | Mult(a, Multiplier(Bound(3), Bound(3))) 73 | ) == Mult(a, Multiplier(Bound(1), Bound(2))) 74 | -------------------------------------------------------------------------------- /greenery/conc_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import Bound 6 | from .charclass import Charclass 7 | from .multiplier import ONE, PLUS, QM, STAR, ZERO, Multiplier 8 | from .rxelems import Conc, Mult 9 | 10 | 11 | def test_conc_equality() -> None: 12 | a = Conc(Mult(Charclass("a"), ONE)) 13 | assert a == Conc(Mult(Charclass("a"), ONE)) 14 | assert a != Conc(Mult(Charclass("b"), ONE)) 15 | assert a != Conc(Mult(Charclass("a"), QM)) 16 | assert a != Conc(Mult(Charclass("a"), Multiplier(Bound(1), Bound(2)))) 17 | assert a != Conc() 18 | 19 | 20 | def test_conc_str() -> None: 21 | assert ( 22 | str( 23 | Conc( 24 | Mult(Charclass("a"), ONE), 25 | Mult(Charclass("b"), ONE), 26 | Mult(Charclass("c"), ONE), 27 | Mult(Charclass("d"), ONE), 28 | Mult(Charclass("e"), ONE), 29 | Mult(~Charclass("fg"), STAR), 30 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 31 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 32 | ) 33 | ) 34 | == "abcde[^fg]*h{5}[a-z]+" 35 | ) 36 | 37 | 38 | def test_conc_common() -> None: 39 | a = Mult(Charclass("A"), ONE) 40 | b = Mult(Charclass("B"), ONE) 41 | c = Mult(Charclass("C"), ONE) 42 | y = Mult(Charclass("y"), ONE) 43 | z = Mult(Charclass("Z"), ONE) 44 | zstar = Mult(Charclass("Z"), STAR) 45 | 46 | assert Conc(a, a, z, y).common(Conc(b, b, z, y), suffix=True) == Conc(z, y) 47 | assert Conc(c, z).common(Conc(c, z), suffix=True) == Conc(c, z) 48 | assert Conc(c, y).common(Conc(c, z), suffix=True) == Conc() 49 | assert Conc(a, z).common(Conc(b, z), suffix=True) == Conc(z) 50 | assert Conc(a, zstar).common(Conc(b, z), suffix=True) == Conc() 51 | assert Conc(a).common(Conc(b), suffix=True) == Conc() 52 | 53 | 54 | def test_conc_dock() -> None: 55 | a = Mult(Charclass("A"), ONE) 56 | b = Mult(Charclass("B"), ONE) 57 | x = 
Mult(Charclass("X"), ONE) 58 | x_twice = Mult(Charclass("X"), Multiplier(Bound(2), Bound(2))) 59 | yplus = Mult(Charclass("y"), PLUS) 60 | z = Mult(Charclass("Z"), ONE) 61 | 62 | assert Conc(a, z).dock(Conc(z)) == Conc(a) 63 | assert Conc(a, b, x, yplus, z).dock(Conc(x, yplus, z)) == Conc(a, b) 64 | assert Conc(a, b, x, yplus, z).behead(Conc(a, b, x, yplus)) == Conc(z) 65 | assert Conc(a).dock(Conc()) == Conc(a) 66 | 67 | with pytest.raises(ArithmeticError, match="Can't subtract"): 68 | Conc(x_twice, yplus, z).behead(Conc(x, yplus)) 69 | 70 | 71 | def test_mult_reduction_easy() -> None: 72 | assert Conc(Mult(Charclass("a"), ZERO)).reduce() == Conc() 73 | -------------------------------------------------------------------------------- /greenery/bound.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "Bound", 5 | "INF", 6 | ) 7 | 8 | from dataclasses import dataclass 9 | 10 | 11 | @dataclass(frozen=True) 12 | class Bound: 13 | """An integer but sometimes also possibly infinite (None)""" 14 | 15 | v: int | None 16 | 17 | def __post_init__(self, /) -> None: 18 | if self.v is not None and self.v < 0: 19 | raise ValueError(f"Invalid bound: {self.v!r}") 20 | 21 | def __repr__(self, /) -> str: 22 | return f"Bound({self.v!r})" 23 | 24 | def __str__(self, /) -> str: 25 | if self.v is None: 26 | # This only happens for an unlimited upper bound 27 | return "" 28 | return str(self.v) 29 | 30 | def __eq__(self, other: object, /) -> bool: 31 | if not isinstance(other, type(self)): 32 | return NotImplemented 33 | return self.v == other.v 34 | 35 | def __hash__(self, /) -> int: 36 | return hash(self.v) 37 | 38 | def __lt__(self, other: Bound, /) -> bool: 39 | if self.v is None: 40 | return False 41 | if other.v is None: 42 | return True 43 | return self.v < other.v 44 | 45 | def __ge__(self, other: Bound, /) -> bool: 46 | return not self < other 47 | 48 | def __mul__(self, other: Bound, /) -> Bound: 49 | """Multiply this bound by another""" 50 | if Bound(0) in (self, other): 51 | return Bound(0) 52 | if self.v is None or other.v is None: 53 | return INF 54 | return Bound(self.v * other.v) 55 | 56 | def __add__(self, other: Bound, /) -> Bound: 57 | """Add this bound to another""" 58 | if self.v is None or other.v is None: 59 | return INF 60 | return Bound(self.v + other.v) 61 | 62 | def __sub__(self, other: Bound, /) -> Bound: 63 | """ 64 | Subtract another bound from this one. 65 | Caution: this operation is not meaningful for all bounds. 66 | """ 67 | if other.v is None: 68 | if self.v is not None: 69 | raise ArithmeticError(f"Can't subtract {other!r} from {self!r}") 70 | 71 | # Infinity minus infinity is zero. 
This has to be true so that 72 | # we can for example subtract Multiplier(Bound(0), INF) from 73 | # Multiplier(Bound(1), INF) to get Multiplier(Bound(1), Bound(1)) 74 | return Bound(0) 75 | if self.v is None: 76 | return INF 77 | try: 78 | return Bound(self.v - other.v) 79 | except ValueError as e: 80 | raise ArithmeticError(*e.args) from e 81 | 82 | def copy(self, /) -> Bound: 83 | return Bound(self.v) 84 | 85 | 86 | # Use this for cases where no upper bound is needed 87 | INF = Bound(None) 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | [Dd]ebug/ 46 | [Rr]elease/ 47 | *_i.c 48 | *_p.c 49 | *.ilk 50 | *.meta 51 | *.obj 52 | *.pch 53 | *.pdb 54 | *.pgc 55 | *.pgd 56 | *.rsp 57 | *.sbr 58 | *.tlb 59 | *.tli 60 | *.tlh 61 | *.tmp 62 | *.vspscc 63 | .builds 64 | *.dotCover 65 | 66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 67 | #packages/ 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | 76 | # Visual Studio profiler 77 | *.psess 78 | *.vsp 79 | 80 | # ReSharper is a .NET coding add-in 81 | _ReSharper* 82 | 83 | # Installshield output folder 84 | [Ee]xpress 85 | 86 | # DocProject is a documentation generator add-in 87 | DocProject/buildhelp/ 88 | DocProject/Help/*.HxT 89 | DocProject/Help/*.HxC 90 | DocProject/Help/*.hhc 91 | DocProject/Help/*.hhk 92 | DocProject/Help/*.hhp 93 | DocProject/Help/Html2 94 | DocProject/Help/html 95 | 96 | # Click-Once directory 97 | publish 98 | 99 | # Others 100 | [Bb]in 101 | [Oo]bj 102 | sql 103 | TestResults 104 | *.Cache 105 | ClientBin 106 | stylecop.* 107 | ~$* 108 | *.dbmdl 109 | Generated_Code #added for RIA/Silverlight projects 110 | 111 | # Backup & report files from converting an old project file to a newer 112 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 113 | _UpgradeReport_Files/ 114 | Backup*/ 115 | UpgradeLog*.XML 116 | 117 | 118 | 119 | ############ 120 | ## Windows 121 | ############ 122 | 123 | # Windows image file caches 124 | Thumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | 130 | ############# 131 | ## Python 132 | ############# 133 | 134 | *.py[co] 135 | 136 | # Packages 137 | *.egg 138 | *.egg-info 139 | dist 140 | build 141 | eggs 142 | parts 143 | bin 144 | var 145 | sdist 146 | develop-eggs 147 | .installed.cfg 148 | 149 | # Installer logs 150 | pip-log.txt 151 | 152 | # Unit test / coverage reports 153 | .coverage 154 | .tox 155 | 156 | #Translations 157 | *.mo 158 | 159 | #Mr Developer 160 | .mr.developer.cfg 161 | 162 | # Mac crap 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /greenery/bound_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import INF, Bound 6 | 7 | 8 | def test_ctor() -> None: 9 | assert Bound(None) == INF 10 | 11 | Bound(0) 12 | Bound(1) 13 | Bound(2) 14 | 15 | with pytest.raises(ValueError): 16 | Bound(-1) 17 | 18 | 19 | def test_eq_neq() -> None: 20 | # pylint: disable=comparison-with-itself 21 | assert Bound(0) == Bound(0) 22 | assert INF == INF 23 | assert Bound(0) != Bound(1) 24 | assert Bound(0) != INF 25 | assert Bound(1) == Bound(1) 26 | assert Bound(None) == INF 27 | 28 | 29 | def test_eq_neq_heterogeneous() -> None: 30 | assert Bound(1) != "blah" 31 | 32 | 33 | def test_comparisons() -> None: 34 | # pylint: disable=comparison-with-itself 35 | # pylint: disable=unneeded-not 36 | 37 | assert Bound(0) < Bound(1) 38 | assert Bound(0) < INF 39 | assert Bound(1) < INF 40 | assert not INF < INF 41 | 42 | 43 | def test_multiplication() -> None: 44 | assert Bound(0) * Bound(0) == Bound(0) 45 | assert Bound(0) * Bound(1) == Bound(0) 46 | assert Bound(0) * Bound(2) == Bound(0) 47 | assert Bound(0) * Bound(5) == Bound(0) 48 | assert Bound(0) * INF == Bound(0) 49 | 50 | assert Bound(1) * Bound(5) == Bound(5) 51 | assert Bound(2) * Bound(5) == Bound(10) 52 | assert Bound(0) * INF == Bound(0) 53 | assert Bound(2) * INF == INF 54 | assert INF * INF == INF 55 | assert INF * Bound(0) == Bound(0) 56 | assert Bound(1) * Bound(0) == Bound(0) 57 | 58 | 59 | def test_addition() -> None: 60 | assert Bound(0) + Bound(0) == Bound(0) 61 | assert Bound(0) + Bound(1) == Bound(1) 62 | assert Bound(0) + Bound(5) == Bound(5) 63 | assert Bound(0) + INF == INF 64 | 65 | assert Bound(1) + Bound(0) == Bound(1) 66 | assert Bound(1) + Bound(1) == Bound(2) 67 | assert Bound(1) + Bound(5) == Bound(6) 68 | assert Bound(1) + INF == INF 69 | 70 | assert INF + Bound(0) == INF 71 | assert INF + Bound(1) == INF 72 | assert INF + INF == INF 73 | 74 | 75 | def test_subtraction() -> None: 76 | assert Bound(0) - Bound(0) == Bound(0) 77 | assert Bound(1) - Bound(0) == Bound(1) 78 | assert Bound(6) - Bound(4) == Bound(2) 79 | assert Bound(5) - Bound(5) == Bound(0) 80 | 81 | assert INF - Bound(0) == INF 82 | assert INF - Bound(1) == INF 83 | assert INF - Bound(1000) == INF 84 | assert INF - INF == Bound(0) 85 | 86 | with pytest.raises(ArithmeticError): 87 | _ = Bound(5) - Bound(6) 88 | 89 | with pytest.raises(ArithmeticError): 90 | _ = Bound(0) - Bound(1) 91 | 92 | with pytest.raises(ArithmeticError): 93 | _ = Bound(0) - INF 94 | 95 | with pytest.raises(ArithmeticError): 96 | _ = Bound(10) - INF 
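# Illustration (not part of the original suite) of why bound.py defines
# INF - INF == Bound(0): it is what lets Multiplier subtraction strip an
# open-ended suffix, e.g.
#
#     from .multiplier import Multiplier
#     assert Multiplier(Bound(1), INF) - Multiplier(Bound(0), INF) \
#         == Multiplier(Bound(1), Bound(1))
#
# which is exactly the example given in Bound.__sub__'s comment.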
97 | 98 | 99 | def test_copy() -> None: 100 | assert INF.copy() == INF 101 | 102 | b = Bound(6) 103 | assert b.copy() == b 104 | 105 | 106 | def test_bound_str() -> None: 107 | assert str(Bound(2)) == "2" 108 | 109 | # pylint: disable-next=compare-to-empty-string 110 | assert str(INF) == "" 111 | 112 | 113 | def test_bound() -> None: 114 | assert min(Bound(0), INF) == Bound(0) 115 | assert min(Bound(1), INF) == Bound(1) 116 | -------------------------------------------------------------------------------- /greenery/multiplier_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import INF, Bound 6 | from .multiplier import ONE, PLUS, QM, STAR, ZERO, Multiplier 7 | 8 | 9 | def test_multiplier_str() -> None: 10 | assert str(Multiplier(Bound(2), INF)) == "{2,}" 11 | assert str(Multiplier(Bound(0), Bound(0))) == "{0}" 12 | assert str(Multiplier(Bound(2), Bound(2))) == "{2}" 13 | assert str(Multiplier(Bound(2), Bound(5))) == "{2,5}" 14 | 15 | 16 | def test_bound_qm() -> None: 17 | assert QM.mandatory == Bound(0) 18 | assert QM.optional == Bound(1) 19 | 20 | 21 | def test_eq() -> None: 22 | assert ZERO == Multiplier(Bound(0), Bound(0)) 23 | assert ONE == Multiplier(Bound(1), Bound(1)) 24 | assert STAR == Multiplier(Bound(0), INF) 25 | assert Multiplier(Bound(1), Bound(2)) == Multiplier(Bound(1), Bound(2)) 26 | 27 | assert ZERO != ONE 28 | assert STAR != QM 29 | 30 | 31 | def test_eq_het() -> None: 32 | assert ZERO != "goldfish" 33 | 34 | 35 | def test_multiplier_common() -> None: 36 | assert ZERO.common(ZERO) == ZERO 37 | assert ZERO.common(QM) == ZERO 38 | assert ZERO.common(ONE) == ZERO 39 | assert ZERO.common(STAR) == ZERO 40 | assert ZERO.common(PLUS) == ZERO 41 | assert QM.common(ZERO) == ZERO 42 | assert QM.common(QM) == QM 43 | assert QM.common(ONE) == ZERO 44 | assert QM.common(STAR) == QM 45 | assert QM.common(PLUS) == QM 46 | assert ONE.common(ZERO) == ZERO 47 | assert ONE.common(QM) == ZERO 48 | assert ONE.common(ONE) == ONE 49 | assert ONE.common(STAR) == ZERO 50 | assert ONE.common(PLUS) == ONE 51 | assert STAR.common(ZERO) == ZERO 52 | assert STAR.common(QM) == QM 53 | assert STAR.common(ONE) == ZERO 54 | assert STAR.common(STAR) == STAR 55 | assert STAR.common(PLUS) == STAR 56 | assert PLUS.common(ZERO) == ZERO 57 | assert PLUS.common(QM) == QM 58 | assert PLUS.common(ONE) == ONE 59 | assert PLUS.common(STAR) == STAR 60 | assert PLUS.common(PLUS) == PLUS 61 | 62 | 63 | def test_multiplier_subtraction() -> None: 64 | # a{3,4}, a{2,5} -> a{2,3} (with a{1,1}, a{0,2} left over) 65 | assert Multiplier(Bound(3), Bound(4)).common( 66 | Multiplier(Bound(2), Bound(5)) 67 | ) == Multiplier(Bound(2), Bound(3)) 68 | assert Multiplier(Bound(3), Bound(4)) - Multiplier(Bound(2), Bound(3)) == ONE 69 | assert Multiplier(Bound(2), Bound(5)) - Multiplier( 70 | Bound(2), Bound(3) 71 | ) == Multiplier(Bound(0), Bound(2)) 72 | 73 | # a{2,}, a{1,5} -> a{1,5} (with a{1,}, a{0,0} left over) 74 | assert Multiplier(Bound(2), INF).common( 75 | Multiplier(Bound(1), Bound(5)) 76 | ) == Multiplier(Bound(1), Bound(5)) 77 | assert Multiplier(Bound(2), INF) - Multiplier(Bound(1), Bound(5)) == PLUS 78 | assert Multiplier(Bound(1), Bound(5)) - Multiplier(Bound(1), Bound(5)) == ZERO 79 | 80 | # a{3,}, a{2,} -> a{2,} (with a, epsilon left over) 81 | assert Multiplier(Bound(3), INF).common(Multiplier(Bound(2), INF)) == Multiplier( 82 | Bound(2), INF 83 | ) 84 | assert Multiplier(Bound(3), INF) - 
Multiplier(Bound(2), INF) == ONE 85 | assert Multiplier(Bound(2), INF) - Multiplier(Bound(2), INF) == ZERO 86 | 87 | # a{3,}, a{3,} -> a{3,} (with ZERO, ZERO left over) 88 | assert Multiplier(Bound(3), INF).common(Multiplier(Bound(3), INF)) == Multiplier( 89 | Bound(3), INF 90 | ) 91 | assert Multiplier(Bound(3), INF) - Multiplier(Bound(3), INF) == ZERO 92 | 93 | 94 | def test_multiplier_union() -> None: 95 | assert ZERO | ZERO == ZERO 96 | assert ZERO | QM == QM 97 | assert ZERO | ONE == QM 98 | assert ZERO | STAR == STAR 99 | assert ZERO | PLUS == STAR 100 | assert QM | ZERO == QM 101 | assert QM | QM == QM 102 | assert QM | ONE == QM 103 | assert QM | STAR == STAR 104 | assert QM | PLUS == STAR 105 | assert ONE | ZERO == QM 106 | assert ONE | QM == QM 107 | assert ONE | ONE == ONE 108 | assert ONE | STAR == STAR 109 | assert ONE | PLUS == PLUS 110 | assert STAR | ZERO == STAR 111 | assert STAR | QM == STAR 112 | assert STAR | ONE == STAR 113 | assert STAR | STAR == STAR 114 | assert STAR | PLUS == STAR 115 | assert PLUS | ZERO == STAR 116 | assert PLUS | QM == STAR 117 | assert PLUS | ONE == PLUS 118 | assert PLUS | STAR == STAR 119 | assert PLUS | PLUS == PLUS 120 | assert not ZERO.canunion(Multiplier(Bound(2), INF)) 121 | assert not ONE.canunion(Multiplier(Bound(3), Bound(4))) 122 | assert not Multiplier(Bound(8), INF).canunion(Multiplier(Bound(3), Bound(4))) 123 | 124 | with pytest.raises(ArithmeticError, match="Can't compute the union"): 125 | _ = ZERO | Multiplier(Bound(7), Bound(8)) 126 | -------------------------------------------------------------------------------- /greenery/pattern_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .charclass import Charclass 4 | from .multiplier import ONE, ZERO 5 | from .parse import parse 6 | from .rxelems import Conc, Mult, Pattern 7 | 8 | 9 | def test_pattern_equality() -> None: 10 | assert Pattern( 11 | Conc(Mult(Charclass("a"), ONE)), 12 | Conc(Mult(Charclass("b"), ONE)), 13 | ) == Pattern( 14 | Conc(Mult(Charclass("b"), ONE)), 15 | Conc(Mult(Charclass("a"), ONE)), 16 | ) 17 | assert Pattern( 18 | Conc(Mult(Charclass("a"), ONE)), 19 | Conc(Mult(Charclass("a"), ONE)), 20 | ) == Pattern( 21 | Conc(Mult(Charclass("a"), ONE)), 22 | ) 23 | 24 | 25 | def test_pattern_str() -> None: 26 | assert ( 27 | str( 28 | Pattern( 29 | Conc(Mult(Charclass("a"), ONE)), 30 | Conc(Mult(Charclass("b"), ONE)), 31 | ) 32 | ) 33 | == "a|b" 34 | ) 35 | assert ( 36 | str( 37 | Pattern( 38 | Conc(Mult(Charclass("a"), ONE)), 39 | Conc(Mult(Charclass("a"), ONE)), 40 | ) 41 | ) 42 | == "a" 43 | ) 44 | assert ( 45 | str( 46 | Pattern( 47 | Conc( 48 | Mult(Charclass("a"), ONE), 49 | Mult(Charclass("b"), ONE), 50 | Mult(Charclass("c"), ONE), 51 | ), 52 | Conc( 53 | Mult(Charclass("d"), ONE), 54 | Mult(Charclass("e"), ONE), 55 | Mult(Charclass("f"), ONE), 56 | Mult( 57 | Pattern( 58 | Conc( 59 | Mult(Charclass("g"), ONE), 60 | Mult(Charclass("h"), ONE), 61 | Mult(Charclass("i"), ONE), 62 | ), 63 | Conc( 64 | Mult(Charclass("j"), ONE), 65 | Mult(Charclass("k"), ONE), 66 | Mult(Charclass("l"), ONE), 67 | ), 68 | ), 69 | ONE, 70 | ), 71 | ), 72 | ) 73 | ) 74 | == "abc|def(ghi|jkl)" 75 | ) 76 | 77 | 78 | def test_empty() -> None: 79 | assert Pattern().empty() 80 | 81 | 82 | def test_mult_reduction_easy() -> None: 83 | assert Pattern(Conc()).reduce() == Pattern(Conc()) 84 | assert Pattern( 85 | Conc( 86 | Mult( 87 | Charclass("a"), 88 | ZERO, 89 | ) 90 | ) 91 | ).reduce() == 
Pattern(Conc()) 92 | 93 | assert str( 94 | # pylint: disable-next=compare-to-empty-string 95 | Pattern( 96 | Conc( 97 | Mult( 98 | Charclass("a"), 99 | ZERO, 100 | ) 101 | ).reduce() 102 | ) 103 | == "" 104 | ) 105 | 106 | 107 | def test_empty_pattern_reduction() -> None: 108 | assert str(Pattern().reduce()) == "[]" 109 | 110 | 111 | def test_empty_conc_suppression() -> None: 112 | assert ( 113 | str( 114 | Pattern( 115 | Conc( 116 | # this `Mult` can never actually match anything 117 | Mult(Pattern(), ONE), 118 | Mult(Charclass("0"), ONE), 119 | Mult(Charclass("0123456789"), ONE), 120 | ) # so neither can this `Conc` 121 | ).reduce() 122 | ) 123 | == "[]" 124 | ) 125 | 126 | 127 | def test_pattern_dock() -> None: 128 | a = Mult(Charclass("a"), ONE) 129 | c = Mult(Charclass("c"), ONE) 130 | f = Mult(Charclass("f"), ONE) 131 | 132 | assert parse("a|bc").dock(Conc()) == parse("a|bc") 133 | assert parse("aa|bca").dock(Conc(a)) == parse("a|bc") 134 | assert parse("xyza|abca|a").dock(Conc(a)) == parse("xyz|abc|") 135 | assert parse("f{2,3}c|fc").dock(Conc(f, c)) == parse("f{1,2}|") 136 | assert parse("aa").dock(Conc(a, a)) == parse("") 137 | 138 | 139 | def test_pattern_beheading() -> None: 140 | a = Mult(Charclass("a"), ONE) 141 | c = Mult(Charclass("c"), ONE) 142 | f = Mult(Charclass("f"), ONE) 143 | z = Mult(Charclass("Z"), ONE) 144 | 145 | assert parse("aa").behead(Conc(a)) == parse("a") 146 | assert parse("abc|aa").behead(Conc(a)) == parse("a|bc") 147 | assert parse("cf{1,2}|cf").behead(Conc(c)) == parse("f{1,2}|f") 148 | assert parse("aa|aa").behead(Conc(a, a)) == parse("") 149 | assert parse("abc|aa").behead(Conc(a)) == parse("a|bc") 150 | assert parse("a|bc").behead(Conc()) == parse("a|bc") 151 | assert parse("cf{1,2}|cf").behead(Conc(c, f)) == parse("f?|") 152 | assert parse("ZA|ZB|ZC").behead(Conc(z)) == parse("A|B|C") 153 | assert parse("Z+A|ZB|ZZC").behead(Conc(z)) == parse("Z*A|B|ZC") 154 | assert parse("a{2}b|a+c").behead(Conc(a)) == parse("ab|a*c") 155 | -------------------------------------------------------------------------------- /greenery/multiplier.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "Multiplier", 5 | "ONE", 6 | "PLUS", 7 | "QM", 8 | "STAR", 9 | "ZERO", 10 | "symbolic", 11 | ) 12 | 13 | from dataclasses import dataclass, field 14 | from typing import Mapping 15 | 16 | from .bound import INF, Bound 17 | 18 | 19 | @dataclass(frozen=True) 20 | class Multiplier: 21 | """ 22 | A min and a max. The vast majority of characters in regular expressions 23 | occur without a specific multiplier, which is implicitly equivalent to 24 | a min of 1 and a max of 1, but many more have explicit multipliers like 25 | "*" (min = 0, max = inf) and so on. 26 | 27 | Although it seems odd and can lead to some confusing edge cases, we do 28 | also permit a max of 0 (iff min is 0 too). This allows the multiplier 29 | `ZERO` to exist, which actually are quite useful in their own special 30 | way. 31 | """ 32 | 33 | min: Bound 34 | max: Bound 35 | mandatory: Bound = field(init=False) 36 | optional: Bound = field(init=False) 37 | 38 | def __post_init__(self, /) -> None: 39 | if self.min == INF: 40 | raise ValueError(f"Minimum bound of a multiplier can't be {INF!r}") 41 | if self.min > self.max: 42 | raise ValueError( 43 | f"Invalid multiplier bounds: {self.min!r} and {self.max!r}" 44 | ) 45 | 46 | # More useful than "min" and "max" in many situations 47 | # are "mandatory" and "optional". 
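# (For example, "{2,5}" has mandatory = Bound(2) and optional = Bound(3),
# while PLUS, i.e. "{1,}", has mandatory = Bound(1) and optional = INF.)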
48 | object.__setattr__(self, "mandatory", self.min) 49 | object.__setattr__(self, "optional", self.max - self.min) 50 | 51 | def __eq__(self, other: object, /) -> bool: 52 | if not isinstance(other, type(self)): 53 | return NotImplemented 54 | return self.min == other.min and self.max == other.max 55 | 56 | def __hash__(self, /) -> int: 57 | return hash((self.min, self.max)) 58 | 59 | def __repr__(self, /) -> str: 60 | return f"Multiplier({self.min!r}, {self.max!r})" 61 | 62 | def __str__(self, /) -> str: 63 | try: 64 | return symbolic[self] 65 | except LookupError: 66 | pass 67 | 68 | if self.min == self.max: 69 | return "{" + str(self.min) + "}" 70 | return "{" + str(self.min) + "," + str(self.max) + "}" 71 | 72 | def canmultiplyby(self, other: Multiplier, /) -> bool: 73 | """ 74 | Multiplication is not well-defined for all pairs of multipliers 75 | because the resulting possibilities do not necessarily form a 76 | continuous range. 77 | 78 | For example: 79 | {0,x} * {0,y} = {0,x*y} 80 | {2} * {3} = {6} 81 | {2} * {1,2} = ERROR 82 | 83 | The proof isn't simple but suffice it to say that {p,p+q} * {r,r+s} 84 | is equal to {pr, (p+q)(r+s)} only if s=0 or qr+1 >= p. If not, then 85 | at least one gap appears in the range. The first inaccessible 86 | number is (p+q)r+1. And no, multiplication is not commutative 87 | """ 88 | return ( 89 | other.optional == Bound(0) 90 | or self.optional * other.mandatory + Bound(1) >= self.mandatory 91 | ) 92 | 93 | def __mul__(self, other: Multiplier, /) -> Multiplier: 94 | """Multiply this multiplier by another""" 95 | if not self.canmultiplyby(other): 96 | raise ArithmeticError(f"Can't multiply {self!r} by {other!r}") 97 | return Multiplier(self.min * other.min, self.max * other.max) 98 | 99 | def __add__(self, other: Multiplier, /) -> Multiplier: 100 | """Add two multipliers together""" 101 | return Multiplier(self.min + other.min, self.max + other.max) 102 | 103 | def __sub__(self, other: Multiplier, /) -> Multiplier: 104 | """ 105 | Subtract another multiplier from this one. 106 | Caution: multipliers are not totally ordered. 107 | This operation is not meaningful for all pairs of multipliers. 108 | """ 109 | mandatory = self.mandatory - other.mandatory 110 | optional = self.optional - other.optional 111 | return Multiplier(mandatory, mandatory + optional) 112 | 113 | def canintersect(self, other: Multiplier, /) -> bool: 114 | """ 115 | Intersection is not well-defined for all pairs of multipliers. 116 | For example: 117 | {2,3} & {3,4} = {3} 118 | {2,} & {1,7} = {2,7} 119 | {2} & {5} = ERROR 120 | """ 121 | return not (self.max < other.min or other.max < self.min) 122 | 123 | def __and__(self, other: Multiplier, /) -> Multiplier: 124 | """ 125 | Find the intersection of two multipliers: that is, a third 126 | multiplier expressing the range covered by both of the originals. 127 | This is not defined for all multipliers since they may not overlap. 128 | """ 129 | if not self.canintersect(other): 130 | raise ArithmeticError( 131 | f"Can't compute intersection of {self!r} and {other!r}" 132 | ) 133 | a = max(self.min, other.min) 134 | b = min(self.max, other.max) 135 | return Multiplier(a, b) 136 | 137 | def canunion(self, other: Multiplier, /) -> bool: 138 | """ 139 | Union is not defined for all pairs of multipliers. 140 | E.g. 
{0,1} | {3,4} -> nope 141 | """ 142 | return not (self.max + Bound(1) < other.min or other.max + Bound(1) < self.min) 143 | 144 | def __or__(self, other: Multiplier, /) -> Multiplier: 145 | """ 146 | Find the union of two multipliers: that is, a third multiplier 147 | expressing the range covered by either of the originals. This is 148 | not defined for all multipliers since they may not intersect. 149 | """ 150 | if not self.canunion(other): 151 | raise ArithmeticError(f"Can't compute the union of {self!r} and {other!r}") 152 | a = min(self.min, other.min) 153 | b = max(self.max, other.max) 154 | return Multiplier(a, b) 155 | 156 | def common(self, other: Multiplier, /) -> Multiplier: 157 | """ 158 | Find the shared part of two multipliers. This is the largest 159 | multiplier which can be safely subtracted from both the originals. 160 | This may return the `ZERO` multiplier. 161 | """ 162 | mandatory = min(self.mandatory, other.mandatory) 163 | optional = min(self.optional, other.optional) 164 | return Multiplier(mandatory, mandatory + optional) 165 | 166 | def copy(self, /) -> Multiplier: 167 | return Multiplier(self.min.copy(), self.max.copy()) 168 | 169 | 170 | # Preset multipliers. These get used ALL THE TIME in unit tests 171 | ZERO = Multiplier(Bound(0), Bound(0)) # has some occasional uses 172 | QM = Multiplier(Bound(0), Bound(1)) 173 | ONE = Multiplier(Bound(1), Bound(1)) 174 | STAR = Multiplier(Bound(0), INF) 175 | PLUS = Multiplier(Bound(1), INF) 176 | 177 | # Symbol lookup table for preset multipliers. 178 | symbolic: Mapping[Multiplier, str] = { 179 | QM: "?", 180 | ONE: "", 181 | STAR: "*", 182 | PLUS: "+", 183 | } 184 | -------------------------------------------------------------------------------- /greenery/parse_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import INF, Bound 6 | from .charclass import DIGIT, DOT, NULLCHARCLASS, Charclass 7 | from .multiplier import ONE, PLUS, STAR, Multiplier 8 | 9 | # noinspection PyProtectedMember 10 | from .parse import NoMatch, match_charclass, match_mult, parse 11 | from .rxelems import Conc, Mult, Pattern 12 | 13 | if __name__ == "__main__": 14 | raise RuntimeError( 15 | "Test files can't be run directly. 
Use `python -m pytest greenery`" 16 | ) 17 | 18 | 19 | def test_charclass_matching() -> None: 20 | assert match_charclass("a", 0) == (Charclass("a"), 1) 21 | assert match_charclass("aa", 1) == (Charclass("a"), 2) 22 | assert match_charclass("a$", 1) == (Charclass("$"), 2) 23 | assert match_charclass(".", 0) == (DOT, 1) 24 | 25 | with pytest.raises(IndexError): 26 | match_charclass("[", 0) 27 | 28 | with pytest.raises(NoMatch): 29 | match_charclass("a", 1) 30 | 31 | assert match_charclass("[\\d]", 0) == (DIGIT, 4) 32 | 33 | 34 | def test_negatives_inside_charclasses() -> None: 35 | assert match_charclass("[\\D]", 0) == (~DIGIT, 4) 36 | assert match_charclass("[a\\D]", 0) == (~DIGIT, 5) 37 | assert match_charclass("[a1\\D]", 0) == (~Charclass("023456789"), 6) 38 | assert match_charclass("[1a\\D]", 0) == (~Charclass("023456789"), 6) 39 | assert match_charclass("[1\\D]", 0) == (~Charclass("023456789"), 5) 40 | assert match_charclass("[\\Da]", 0) == (~DIGIT, 5) 41 | assert match_charclass("[\\D1]", 0) == (~Charclass("023456789"), 5) 42 | assert match_charclass("[\\D1a]", 0) == (~Charclass("023456789"), 6) 43 | assert match_charclass("[\\D\\d]", 0) == (DOT, 6) 44 | assert match_charclass("[\\D\\D]", 0) == (~DIGIT, 6) 45 | 46 | # "Either non-whitespace or a non-digit" matches _anything_. 47 | assert match_charclass("[\\S\\D]", 0) == (DOT, 6) 48 | assert match_charclass("[\\S \\D]", 0) == (DOT, 7) 49 | 50 | 51 | def test_negated_negatives_inside_charclasses() -> None: 52 | assert match_charclass("[^\\D]", 0) == (DIGIT, 5) 53 | assert match_charclass("[^a\\D]", 0) == (DIGIT, 6) 54 | assert match_charclass("[^a1\\D]", 0) == (Charclass("023456789"), 7) 55 | assert match_charclass("[^1a\\D]", 0) == (Charclass("023456789"), 7) 56 | assert match_charclass("[^1\\D]", 0) == (Charclass("023456789"), 6) 57 | assert match_charclass("[^\\Da]", 0) == (DIGIT, 6) 58 | assert match_charclass("[^\\D1]", 0) == (Charclass("023456789"), 6) 59 | assert match_charclass("[^\\D1a]", 0) == (Charclass("023456789"), 7) 60 | assert match_charclass("[^\\D\\d]", 0) == (NULLCHARCLASS, 7) 61 | assert match_charclass("[^\\D\\D]", 0) == (DIGIT, 7) 62 | 63 | # "Anything but non-whitespace and non-digit" matches _nothing_. 
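# (By De Morgan, [^\S\D] is the complement of the union of \S and \D, i.e.
# the intersection of \s and \d; no whitespace character is a digit, so the
# class collapses to NULLCHARCLASS.)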
64 | assert match_charclass("[^\\S\\D]", 0) == (NULLCHARCLASS, 7) 65 | assert match_charclass("[^\\S \\D]", 0) == (NULLCHARCLASS, 8) 66 | 67 | 68 | def test_match_nightmare_charclass() -> None: 69 | assert match_charclass("[\t\n\r -\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", 0) == ( 70 | Charclass( 71 | ( 72 | ("\t", "\t"), 73 | ("\n", "\n"), 74 | ("\r", "\r"), 75 | (" ", "\uD7FF"), 76 | ("\uE000", "\uFFFD"), 77 | ("\U00010000", "\U0010FFFF"), 78 | ) 79 | ), 80 | 14, 81 | ) 82 | 83 | 84 | def test_mult_matching() -> None: 85 | assert match_mult("abcde[^fg]*", 5) == (Mult(~Charclass("fg"), STAR), 11) 86 | assert match_mult("abcde[^fg]*h{5}[a-z]+", 11) == ( 87 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 88 | 15, 89 | ) 90 | assert match_mult("abcde[^fg]*h{5}[a-z]+T{1,}", 15) == ( 91 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 92 | 21, 93 | ) 94 | assert match_mult("abcde[^fg]*h{5}[a-z]+T{2,}", 21) == ( 95 | Mult(Charclass("T"), Multiplier(Bound(2), INF)), 96 | 26, 97 | ) 98 | 99 | 100 | def test_lazy_multipliers() -> None: 101 | assert match_mult("abcde[^fg]*?", 5) == (Mult(~Charclass("fg"), STAR), 12) 102 | assert match_mult("abcde[^fg]*?h{5}?[a-z]+", 12) == ( 103 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 104 | 17, 105 | ) 106 | assert match_mult("abcde[^fg]*?h{5}?[a-z]+?T{1,}", 17) == ( 107 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 108 | 24, 109 | ) 110 | assert match_mult("abcde[^fg]*?h{5}?[a-z]+?T{2,}?", 24) == ( 111 | Mult(Charclass("T"), Multiplier(Bound(2), INF)), 112 | 30, 113 | ) 114 | 115 | 116 | def test_charclass_ranges() -> None: 117 | # Should accept arbitrary ranges of characters in charclasses. No longer 118 | # limited to alphanumerics. (User beware...) 119 | assert parse("[z{|}~]") == parse("[z-~]") 120 | assert parse("[\\w:;<=>?@\\[\\\\\\]\\^`]") == parse("[0-z]") 121 | 122 | 123 | def test_hex_escapes() -> None: 124 | # Should be able to parse e.g. 
"\\x40" 125 | assert parse("\\x00") == parse("\x00") 126 | assert parse("\\x40") == parse("@") 127 | assert parse("[\\x40]") == parse("[@]") 128 | assert parse("[\\x41-\\x5a]") == parse("[A-Z]") 129 | 130 | 131 | def test_w_d_s() -> None: 132 | # Allow "\w", "\d" and "\s" in charclasses 133 | assert parse("\\w") == parse("[0-9A-Z_a-z]") 134 | assert parse("[\\w~]") == parse("[0-9A-Z_a-z~]") 135 | assert parse("[\\da]") == parse("[0123456789a]") 136 | assert parse("[\\s]") == parse("[\t\n\r\f\v ]") 137 | 138 | 139 | def test_mult_parsing() -> None: 140 | assert parse("[a-g]+") == Pattern(Conc(Mult(Charclass("abcdefg"), PLUS))) 141 | assert parse("[a-g0-8$%]+") == Pattern( 142 | Conc(Mult(Charclass("abcdefg012345678$%"), PLUS)) 143 | ) 144 | assert parse("[a-g0-8$%\\^]+") == Pattern( 145 | Conc(Mult(Charclass("abcdefg012345678$%^"), PLUS)) 146 | ) 147 | 148 | 149 | def test_lazy_mult_parsing() -> None: 150 | assert parse("[a-g]+?") == Pattern(Conc(Mult(Charclass("abcdefg"), PLUS))) 151 | 152 | 153 | def test_conc_parsing() -> None: 154 | assert parse("abcde[^fg]*h{5}[a-z]+") == Pattern( 155 | Conc( 156 | Mult(Charclass("a"), ONE), 157 | Mult(Charclass("b"), ONE), 158 | Mult(Charclass("c"), ONE), 159 | Mult(Charclass("d"), ONE), 160 | Mult(Charclass("e"), ONE), 161 | Mult(~Charclass("fg"), STAR), 162 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 163 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 164 | ) 165 | ) 166 | assert parse("[bc]*[ab]*") == Pattern( 167 | Conc( 168 | Mult(Charclass("bc"), STAR), 169 | Mult(Charclass("ab"), STAR), 170 | ) 171 | ) 172 | assert parse("abc...") == Pattern( 173 | Conc( 174 | Mult(Charclass("a"), ONE), 175 | Mult(Charclass("b"), ONE), 176 | Mult(Charclass("c"), ONE), 177 | Mult(DOT, ONE), 178 | Mult(DOT, ONE), 179 | Mult(DOT, ONE), 180 | ) 181 | ) 182 | assert parse("\\d{4}-\\d{2}-\\d{2}") == Pattern( 183 | Conc( 184 | Mult(DIGIT, Multiplier(Bound(4), Bound(4))), 185 | Mult(Charclass("-"), ONE), 186 | Mult(DIGIT, Multiplier(Bound(2), Bound(2))), 187 | Mult(Charclass("-"), ONE), 188 | Mult(DIGIT, Multiplier(Bound(2), Bound(2))), 189 | ) 190 | ) 191 | 192 | 193 | def test_pattern_parsing() -> None: 194 | assert parse("abc|def(ghi|jkl)") == Pattern( 195 | Conc( 196 | Mult(Charclass("a"), ONE), 197 | Mult(Charclass("b"), ONE), 198 | Mult(Charclass("c"), ONE), 199 | ), 200 | Conc( 201 | Mult(Charclass("d"), ONE), 202 | Mult(Charclass("e"), ONE), 203 | Mult(Charclass("f"), ONE), 204 | Mult( 205 | Pattern( 206 | Conc( 207 | Mult(Charclass("g"), ONE), 208 | Mult(Charclass("h"), ONE), 209 | Mult(Charclass("i"), ONE), 210 | ), 211 | Conc( 212 | Mult(Charclass("j"), ONE), 213 | Mult(Charclass("k"), ONE), 214 | Mult(Charclass("l"), ONE), 215 | ), 216 | ), 217 | ONE, 218 | ), 219 | ), 220 | ) 221 | 222 | # Accept the "non-capturing group" syntax, "(?: ... 
)" but give it no 223 | # special significance 224 | assert parse("(?:)") == parse("()") 225 | assert parse("(?:abc|def)") == parse("(abc|def)") 226 | parse("(:abc)") # should give no problems 227 | 228 | # Named groups 229 | assert parse("(?Pabc)") == parse("(abc)") 230 | 231 | 232 | def test_nightmare_pattern() -> None: 233 | assert parse("[\t\n\r -\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]*") == Pattern( 234 | Conc( 235 | Mult( 236 | Charclass( 237 | ( 238 | ("\t", "\t"), 239 | ("\n", "\n"), 240 | ("\r", "\r"), 241 | (" ", "\uD7FF"), 242 | ("\uE000", "\uFFFD"), 243 | ("\U00010000", "\U0010FFFF"), 244 | ) 245 | ), 246 | STAR, 247 | ) 248 | ) 249 | ) 250 | -------------------------------------------------------------------------------- /greenery/charclass_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unicodedata 4 | 5 | from .charclass import ( 6 | DIGIT, 7 | DOT, 8 | NONDIGITCHAR, 9 | NONSPACECHAR, 10 | NONWORDCHAR, 11 | NULLCHARCLASS, 12 | SPACECHAR, 13 | WORDCHAR, 14 | Charclass, 15 | collapse_ord_ranges, 16 | repartition, 17 | ) 18 | 19 | 20 | def test_collapse_ord_ranges_0() -> None: 21 | assert collapse_ord_ranges([(1, 2)]) == [(1, 2)] 22 | 23 | 24 | def test_collapse_ord_ranges_1a() -> None: 25 | assert collapse_ord_ranges( 26 | [(1, 1), (3, 4), (10, 11), (13, 17), (7, 7)], 27 | ) == [(1, 1), (3, 4), (7, 7), (10, 11), (13, 17)] 28 | 29 | 30 | def test_collapse_ord_ranges_1b() -> None: 31 | assert collapse_ord_ranges([(5, 16), (1, 1)]) == [(1, 1), (5, 16)] 32 | assert collapse_ord_ranges([(5, 16), (1, 2)]) == [(1, 2), (5, 16)] 33 | assert collapse_ord_ranges([(5, 16), (1, 3)]) == [(1, 3), (5, 16)] 34 | assert collapse_ord_ranges([(5, 16), (1, 4)]) == [(1, 16)] 35 | assert collapse_ord_ranges([(5, 16), (1, 5)]) == [(1, 16)] 36 | assert collapse_ord_ranges([(5, 16), (1, 16)]) == [(1, 16)] 37 | assert collapse_ord_ranges([(5, 16), (1, 17)]) == [(1, 17)] 38 | assert collapse_ord_ranges([(5, 16), (1, 18)]) == [(1, 18)] 39 | assert collapse_ord_ranges([(5, 16), (4, 4)]) == [(4, 16)] 40 | assert collapse_ord_ranges([(5, 16), (5, 5)]) == [(5, 16)] 41 | assert collapse_ord_ranges([(5, 16), (5, 18)]) == [(5, 18)] 42 | assert collapse_ord_ranges([(5, 16), (7, 8)]) == [(5, 16)] 43 | assert collapse_ord_ranges([(5, 16), (10, 20)]) == [(5, 20)] 44 | assert collapse_ord_ranges([(5, 16), (16, 20)]) == [(5, 20)] 45 | assert collapse_ord_ranges([(5, 16), (17, 20)]) == [(5, 20)] 46 | assert collapse_ord_ranges([(5, 16), (18, 20)]) == [(5, 16), (18, 20)] 47 | 48 | 49 | def test_collapse_ord_ranges_2() -> None: 50 | assert collapse_ord_ranges([(1, 2), (11, 12), (5, 6)]) == [(1, 2), (5, 6), (11, 12)] 51 | assert collapse_ord_ranges([(1, 2), (11, 12), (3, 6)]) == [(1, 6), (11, 12)] 52 | assert collapse_ord_ranges([(1, 2), (11, 12), (2, 6)]) == [(1, 6), (11, 12)] 53 | assert collapse_ord_ranges([(1, 2), (11, 12), (5, 9)]) == [(1, 2), (5, 9), (11, 12)] 54 | assert collapse_ord_ranges([(1, 2), (11, 12), (5, 10)]) == [(1, 2), (5, 12)] 55 | assert collapse_ord_ranges([(1, 2), (11, 12), (-2, -1)]) == [ 56 | (-2, -1), 57 | (1, 2), 58 | (11, 12), 59 | ] 60 | assert collapse_ord_ranges([(1, 2), (11, 12), (0, 20)]) == [(0, 20)] 61 | 62 | 63 | def test_charclass_equality() -> None: 64 | assert Charclass("a") == Charclass("a") 65 | assert ~Charclass("a") == ~Charclass("a") 66 | assert ~Charclass("a") != Charclass("a") 67 | assert Charclass("ab") == Charclass("ba") 68 | 69 | 70 | def test_charclass_ctor() -> None: 71 | 
assert not Charclass("ab").negated 72 | assert not Charclass("ab", negated=False).negated 73 | assert Charclass("ab", negated=True).negated 74 | 75 | 76 | def test_repr() -> None: 77 | assert repr(~Charclass("a")) == "~Charclass((('a', 'a'),))" 78 | 79 | 80 | def test_issubset() -> None: 81 | assert Charclass("a").issubset(Charclass("a")) 82 | assert not Charclass("a").issubset(Charclass("b")) 83 | assert Charclass("a").issubset(Charclass((("a", "b"),))) 84 | assert Charclass("a").issubset(~Charclass("b")) 85 | assert not (~Charclass("a")).issubset(Charclass("b")) 86 | assert (~Charclass("a")).issubset(DOT) 87 | 88 | 89 | def test_charclass_str() -> None: 90 | assert str(WORDCHAR) == "\\w" 91 | assert str(DIGIT) == "\\d" 92 | assert str(SPACECHAR) == "\\s" 93 | assert str(Charclass("a")) == "a" 94 | assert str(Charclass("{")) == "\\{" 95 | assert str(Charclass("\t")) == "\\t" 96 | assert str(Charclass("ab")) == "[ab]" 97 | assert str(Charclass("a{")) == "[a{]" 98 | assert str(Charclass("a\t")) == "[\\ta]" 99 | assert str(Charclass("a-")) == "[\\-a]" 100 | assert str(Charclass("a[")) == "[\\[a]" 101 | assert str(Charclass("a]")) == "[\\]a]" 102 | assert str(Charclass("ab")) == "[ab]" 103 | assert str(Charclass("abc")) == "[abc]" 104 | assert str(Charclass("abcd")) == "[a-d]" 105 | assert str(Charclass("abcdfghi")) == "[a-df-i]" 106 | assert str(Charclass("^")) == "^" 107 | assert str(Charclass("\\")) == "\\\\" 108 | assert str(Charclass("a^")) == "[\\^a]" 109 | assert str(Charclass("0123456789a")) == "[0-9a]" 110 | assert str(Charclass("\t\v\r A")) == "[\\t\\v\\r A]" 111 | assert str(Charclass("\n\f A")) == "[\\n\\f A]" 112 | assert str(Charclass("\t\n\v\f\r A")) == "[\\t-\\r A]" 113 | assert ( 114 | str( 115 | Charclass( 116 | "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz|" 117 | ) 118 | ) 119 | == "[0-9A-Z_a-z|]" 120 | ) 121 | assert str(NONWORDCHAR) == "\\W" 122 | assert str(NONDIGITCHAR) == "\\D" 123 | assert str(NONSPACECHAR) == "\\S" 124 | assert str(DOT) == "." 125 | assert str(~Charclass("")) == "." 
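# (~Charclass("") negates the empty class, so it matches every character
# and prints the same way as DOT in the assertion above.)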
126 | assert str(~Charclass("a")) == "[^a]" 127 | assert str(~Charclass("{")) == "[^{]" 128 | assert str(~Charclass("\t")) == "[^\\t]" 129 | assert str(~Charclass("^")) == "[^\\^]" 130 | 131 | 132 | def test_charclass_negation() -> None: 133 | assert ~~Charclass("a") == Charclass("a") 134 | assert Charclass("a") == ~~Charclass("a") 135 | 136 | 137 | def test_charclass_union() -> None: 138 | # [ab] ∪ [bc] = [abc] 139 | assert Charclass("ab") | Charclass("bc") == Charclass("abc") 140 | # [ab] ∪ [^bc] = [^c] 141 | assert Charclass("ab") | ~Charclass("bc") == ~Charclass("c") 142 | # [^ab] ∪ [bc] = [^a] 143 | assert ~Charclass("ab") | Charclass("bc") == ~Charclass("a") 144 | # [^ab] ∪ [^bc] = [^b] 145 | assert ~Charclass("ab") | ~Charclass("bc") == ~Charclass("b") 146 | 147 | 148 | def test_charclass_intersection() -> None: 149 | # [ab] ∩ [bc] = [b] 150 | assert Charclass("ab") & Charclass("bc") == Charclass("b") 151 | # [ab] ∩ [^bc] = [a] 152 | assert Charclass("ab") & ~Charclass("bc") == Charclass("a") 153 | # [^ab] ∩ [bc] = [c] 154 | assert ~Charclass("ab") & Charclass("bc") == Charclass("c") 155 | # [^ab] ∩ [^bc] = [^abc] 156 | assert ~Charclass("ab") & ~Charclass("bc") == ~Charclass("abc") 157 | 158 | assert (Charclass("ab") & Charclass("bcd") & Charclass("abcde")) == Charclass("b") 159 | 160 | 161 | def test_empty() -> None: 162 | assert NULLCHARCLASS.empty() 163 | assert not DOT.empty() 164 | 165 | 166 | def test_repartition_elementary() -> None: 167 | assert repartition([Charclass("a")]) == { 168 | Charclass("a"): [Charclass("a")], 169 | } 170 | 171 | 172 | def test_repartition_elementary_2() -> None: 173 | assert repartition([Charclass("a"), ~Charclass("a")]) == { 174 | Charclass("a"): [Charclass("a")], 175 | ~Charclass("a"): [~Charclass("a")], 176 | } 177 | 178 | 179 | def test_repartition_basic() -> None: 180 | assert repartition([Charclass("a"), Charclass("abc")]) == { 181 | Charclass("a"): [ 182 | Charclass("a"), 183 | ], 184 | Charclass("abc"): [ 185 | Charclass("a"), 186 | Charclass("bc"), 187 | ], 188 | } 189 | 190 | 191 | def test_repartition_negation() -> None: 192 | assert repartition([Charclass("ab"), Charclass("a"), ~Charclass("ab")]) == { 193 | Charclass("ab"): [ 194 | Charclass("a"), 195 | Charclass("b"), 196 | ], 197 | Charclass("a"): [ 198 | Charclass("a"), 199 | ], 200 | ~Charclass("ab"): [ 201 | ~Charclass("ab"), 202 | ], 203 | } 204 | 205 | 206 | def test_repartition_negation_2() -> None: 207 | assert repartition([Charclass("ab"), Charclass("abc"), ~Charclass("ab")]) == { 208 | Charclass("ab"): [ 209 | Charclass("ab"), 210 | ], 211 | Charclass("abc"): [ 212 | Charclass("ab"), 213 | Charclass("c"), 214 | ], 215 | ~Charclass("ab"): [ 216 | ~Charclass("abc"), 217 | Charclass("c"), 218 | ], 219 | } 220 | assert repartition( 221 | [ 222 | ~Charclass("a"), 223 | ~Charclass("ab"), 224 | ~Charclass("abc"), 225 | ] 226 | ) == { 227 | ~Charclass("a"): [ 228 | ~Charclass("abc"), 229 | Charclass("b"), 230 | Charclass("c"), 231 | ], 232 | ~Charclass("ab"): [ 233 | ~Charclass("abc"), 234 | Charclass("c"), 235 | ], 236 | ~Charclass("abc"): [ 237 | ~Charclass("abc"), 238 | ], 239 | } 240 | 241 | 242 | def test_repartition_advanced() -> None: 243 | assert repartition( 244 | [ 245 | Charclass("a"), 246 | Charclass("bcdef"), 247 | ~Charclass("abcdef"), 248 | Charclass("abcd"), 249 | ~Charclass("abcd"), 250 | ] 251 | ) == { 252 | Charclass("a"): [Charclass("a")], 253 | Charclass("bcdef"): [ 254 | Charclass("bcd"), 255 | Charclass("ef"), 256 | ], 257 | ~Charclass("abcdef"): [ 258 | 
~Charclass("abcdef"), 259 | ], 260 | Charclass("abcd"): [ 261 | Charclass("a"), 262 | Charclass("bcd"), 263 | ], 264 | ~Charclass("abcd"): [ 265 | ~Charclass("abcdef"), 266 | Charclass("ef"), 267 | ], 268 | } 269 | 270 | 271 | def test_repartition_advanced_2() -> None: 272 | assert repartition([WORDCHAR, DIGIT, DOT, NONDIGITCHAR, NULLCHARCLASS]) == { 273 | WORDCHAR: [ 274 | DIGIT, 275 | Charclass("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"), 276 | ], 277 | DIGIT: [DIGIT], 278 | DOT: [ 279 | ~Charclass((("0", "z"),)), 280 | DIGIT, 281 | Charclass(((":", "@"), ("[", "^"), ("`", "`"))), 282 | Charclass("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"), 283 | ], 284 | NONDIGITCHAR: [ 285 | ~Charclass((("0", "z"),)), 286 | Charclass(((":", "@"), ("[", "^"), ("`", "`"))), 287 | Charclass("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"), 288 | ], 289 | NULLCHARCLASS: [ 290 | # Yup, there's nothing here! 291 | # This should be impossible or at least cause no problems in practice 292 | ], 293 | } 294 | 295 | 296 | # This should take a reasonable amount of time 297 | # It was previously taking forever 298 | def test_charclass_by_category() -> None: 299 | out = {} 300 | for i in range(0x101000): 301 | c = chr(i) 302 | cat = unicodedata.category(c) 303 | if cat not in out: 304 | out[cat] = [c] 305 | else: 306 | out[cat].append(c) 307 | for cat, cs in out.items(): 308 | Charclass("".join(cs)) 309 | -------------------------------------------------------------------------------- /greenery/parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "parse", 5 | "NoMatch", 6 | ) 7 | 8 | from typing import Collection, Tuple, TypeVar 9 | 10 | from .bound import INF, Bound 11 | from .charclass import ( 12 | DIGIT, 13 | NONDIGITCHAR, 14 | NONSPACECHAR, 15 | NONWORDCHAR, 16 | SPACECHAR, 17 | WORDCHAR, 18 | Charclass, 19 | escapes, 20 | shorthand, 21 | ) 22 | from .multiplier import ONE, Multiplier, symbolic 23 | from .rxelems import Conc, Mult, Pattern 24 | 25 | # Currently many statements are grouped by `try/except NoMatch` in order to try 26 | # multiple matching functions in sequence. They can be refactored into smaller 27 | # functions to remove this suppression. 28 | # pylint: disable=too-many-try-statements 29 | 30 | T_co = TypeVar("T_co", covariant=True) 31 | 32 | 33 | class NoMatch(Exception): 34 | """ 35 | Thrown when parsing fails. 36 | Almost always caught and almost never fatal 37 | """ 38 | 39 | 40 | MatchResult = Tuple[T_co, int] 41 | 42 | 43 | def read_until(string: str, i: int, stop_char: str) -> MatchResult[str]: 44 | start = i 45 | while True: 46 | if i >= len(string): 47 | raise NoMatch 48 | if string[i] == stop_char: 49 | break 50 | i += 1 51 | return string[start:i], i + 1 52 | 53 | 54 | def static(haystack: str, i: int, needle: str) -> int: 55 | j = i + len(needle) 56 | if haystack[i:j] == needle: 57 | return j 58 | raise NoMatch 59 | 60 | 61 | def select_static(haystack: str, i: int, *needles: str) -> MatchResult[str]: 62 | for needle in needles: 63 | j = i + len(needle) 64 | if haystack[i:j] == needle: 65 | return needle, j 66 | raise NoMatch 67 | 68 | 69 | def unescape_hex(string: str, i: int) -> MatchResult[str]: 70 | """Turn e.g. "\\x40" into "@". Exactly two hex digits""" 71 | hex_digits = "0123456789AaBbCcDdEeFf" 72 | 73 | j = static(string, i, "\\x") 74 | 75 | hex1 = string[j] # e.g. 
"4" 76 | if hex1 not in hex_digits: 77 | raise NoMatch 78 | j += len(hex1) 79 | 80 | hex2 = string[j] # e.g. "0" 81 | if hex2 not in hex_digits: 82 | raise NoMatch 83 | j += len(hex2) 84 | 85 | codepoint = int(hex1 + hex2, 16) # e.g. 64 86 | char = chr(codepoint) # "@" 87 | return char, j 88 | 89 | 90 | def match_internal_char(string: str, i: int) -> MatchResult[str]: 91 | # e.g. if we see "\\t", return "\t" 92 | for char, escaped_mnemonic in escapes.items(): 93 | try: 94 | return char, static(string, i, escaped_mnemonic) 95 | except NoMatch: 96 | pass 97 | 98 | # special chars e.g. "\\-" returns "-" 99 | for char in Charclass.classSpecial: 100 | try: 101 | return char, static(string, i, "\\" + char) 102 | except NoMatch: 103 | pass 104 | 105 | # hex escape e.g. "\\x40" returns "@" 106 | try: 107 | return unescape_hex(string, i) 108 | except NoMatch: 109 | pass 110 | 111 | # single non-special character, not contained 112 | # inside square brackets 113 | char, j = string[i], i + 1 114 | if char in Charclass.classSpecial: 115 | raise NoMatch 116 | 117 | return char, j 118 | 119 | 120 | def match_inner_charclass( 121 | string: str, 122 | i: int, 123 | ) -> MatchResult[Charclass]: 124 | """ 125 | We have to return several ranges, because of \\\\w etc. 126 | """ 127 | # Attempt 1: shorthand 128 | inner_shorthand = { 129 | "\\w": WORDCHAR, 130 | "\\d": DIGIT, 131 | "\\s": SPACECHAR, 132 | "\\W": NONWORDCHAR, 133 | "\\D": NONDIGITCHAR, 134 | "\\S": NONSPACECHAR, 135 | # no ".": DOT, 136 | } 137 | 138 | for cc_shorthand, charclass in inner_shorthand.items(): 139 | try: 140 | return charclass, static(string, i, cc_shorthand) 141 | except NoMatch: 142 | pass 143 | 144 | # Attempt 2: a range e.g. "d-h" 145 | try: 146 | first, j = match_internal_char(string, i) # `first` is "d" 147 | k = static(string, j, "-") 148 | last, k = match_internal_char(string, k) # `last` is "h" 149 | return Charclass(((first, last),)), k 150 | except NoMatch: 151 | pass 152 | 153 | # Attempt 3: just a character on its own 154 | char, j = match_internal_char(string, i) 155 | return Charclass(((char, char),)), j 156 | 157 | 158 | def match_class_interior(string: str, i: int) -> MatchResult[Charclass]: 159 | inner_charclasses = [] 160 | try: 161 | while True: 162 | # Match an internal character, range, or other charclass predicate. 163 | inner_charclass, i = match_inner_charclass(string, i) 164 | inner_charclasses.append(inner_charclass) 165 | except NoMatch: 166 | pass 167 | 168 | # Use the existing Charclass union functionality 169 | charclass = Charclass() 170 | for inner_charclass in inner_charclasses: 171 | charclass |= inner_charclass 172 | 173 | return charclass, i 174 | 175 | 176 | def match_charclass(string: str, i: int) -> MatchResult[Charclass]: 177 | # pylint: disable=too-many-return-statements 178 | 179 | if i >= len(string): 180 | raise NoMatch 181 | 182 | # wildcard ".", "\\w", "\\d", etc. 183 | for shorthand_charclass, shorthand_abbrev in shorthand.items(): 184 | try: 185 | return shorthand_charclass, static(string, i, shorthand_abbrev) 186 | except NoMatch: 187 | pass 188 | 189 | # "[^dsgsdg]" 190 | try: 191 | j = static(string, i, "[^") 192 | result, j = match_class_interior(string, j) 193 | j = static(string, j, "]") 194 | return ~result, j 195 | except NoMatch: 196 | pass 197 | 198 | # "[sdfsf]" 199 | try: 200 | j = static(string, i, "[") 201 | result, j = match_class_interior(string, j) 202 | j = static(string, j, "]") 203 | return result, j 204 | except NoMatch: 205 | pass 206 | 207 | # e.g. 
if seeing "\\t", return "\t" 208 | for char, escaped_mnemonic in escapes.items(): 209 | try: 210 | return Charclass(((char, char),)), static(string, i, escaped_mnemonic) 211 | except NoMatch: 212 | pass 213 | 214 | # e.g. if seeing "\\{", return "{" 215 | for char in Charclass.allSpecial: 216 | try: 217 | return Charclass(((char, char),)), static(string, i, "\\" + char) 218 | except NoMatch: 219 | pass 220 | 221 | # e.g. if seeing "\\x40", return "@" 222 | try: 223 | char, j = unescape_hex(string, i) 224 | return Charclass(((char, char),)), j 225 | except NoMatch: 226 | pass 227 | 228 | # single non-special character, not contained inside square brackets 229 | char, i = string[i], i + 1 230 | if char in Charclass.allSpecial: 231 | raise NoMatch 232 | 233 | return Charclass(((char, char),)), i 234 | 235 | 236 | def match_multiplicand(string: str, i: int) -> MatchResult[Pattern | Charclass]: 237 | # explicitly non-capturing "(?:...)" syntax. No special significance 238 | try: 239 | j = static(string, i, "(?") 240 | opts, j = select_static(string, j, ":", "P<") 241 | if opts == "P<": 242 | _group_name, j = read_until(string, j, ">") 243 | pattern, j = match_pattern(string, j) 244 | j = static(string, j, ")") 245 | return pattern, j 246 | except NoMatch: 247 | pass 248 | 249 | # normal "(...)" syntax 250 | try: 251 | j = static(string, i, "(") 252 | pattern, j = match_pattern(string, j) 253 | j = static(string, j, ")") 254 | return pattern, j 255 | except NoMatch: 256 | pass 257 | 258 | # Just a `Charclass` on its own 259 | charclass, j = match_charclass(string, i) 260 | return charclass, j 261 | 262 | 263 | def match_any_of(string: str, i: int, collection: Collection[str]) -> MatchResult[str]: 264 | for char in collection: 265 | try: 266 | return char, static(string, i, char) 267 | except NoMatch: 268 | pass 269 | raise NoMatch 270 | 271 | 272 | def match_bound(string: str, i: int) -> MatchResult[Bound]: 273 | # "0" 274 | try: 275 | return Bound(0), static(string, i, "0") 276 | except NoMatch: 277 | pass 278 | 279 | # "1", etc. 
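    # A leading non-zero digit followed by any further digits, accumulated
    # left to right: e.g. the "25" in "{25,}" becomes Bound(25).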
280 | try: 281 | digit, j = match_any_of(string, i, "123456789") 282 | integer = int(digit) 283 | try: 284 | while True: 285 | digit, j = match_any_of(string, j, "0123456789") 286 | integer *= 10 287 | integer += int(digit) 288 | except NoMatch: 289 | return Bound(integer), j 290 | except NoMatch: 291 | pass 292 | 293 | # "" empty string = infinite bound as in "{4,}" 294 | return INF, i 295 | 296 | 297 | def match_nonempty_greedy_multiplier(string: str, i: int) -> MatchResult[Multiplier]: 298 | """ 299 | Any multiplier which isn't the default empty string (equivalent to `{1,1}`) 300 | """ 301 | # {2,3} or {2,} 302 | try: 303 | j = static(string, i, "{") 304 | min_, j = match_bound(string, j) 305 | j = static(string, j, ",") 306 | max_, j = match_bound(string, j) 307 | j = static(string, j, "}") 308 | return Multiplier(min_, max_), j 309 | except NoMatch: 310 | pass 311 | 312 | # {2} 313 | try: 314 | j = static(string, i, "{") 315 | min_, j = match_bound(string, j) 316 | j = static(string, j, "}") 317 | return Multiplier(min_, min_), j 318 | except NoMatch: 319 | pass 320 | 321 | # "?"/"*"/"+" 322 | for mult, symbol in symbolic.items(): 323 | if not symbol: 324 | continue 325 | try: 326 | return mult, static(string, i, symbol) 327 | except NoMatch: 328 | pass 329 | 330 | raise NoMatch 331 | 332 | 333 | def match_nonempty_multiplier(string: str, i: int) -> MatchResult[Multiplier]: 334 | multiplier, j = match_nonempty_greedy_multiplier(string, i) 335 | try: 336 | j = static(string, j, "?") 337 | except NoMatch: 338 | pass 339 | return multiplier, j 340 | 341 | 342 | def match_multiplier(string: str, i: int) -> MatchResult[Multiplier]: 343 | try: 344 | return match_nonempty_multiplier(string, i) 345 | except NoMatch: 346 | return ONE, i 347 | 348 | 349 | def match_mult(string: str, i: int) -> MatchResult[Mult]: 350 | multiplicand, j = match_multiplicand(string, i) 351 | multiplier, j = match_multiplier(string, j) 352 | return Mult(multiplicand, multiplier), j 353 | 354 | 355 | def match_conc(string: str, i: int) -> MatchResult[Conc]: 356 | mults = [] 357 | try: 358 | while True: 359 | m, i = match_mult(string, i) 360 | mults.append(m) 361 | except NoMatch: 362 | pass 363 | return Conc(*mults), i 364 | 365 | 366 | def match_pattern(string: str, i: int) -> MatchResult[Pattern]: 367 | concs = [] 368 | 369 | # first one 370 | c, i = match_conc(string, i) 371 | concs.append(c) 372 | 373 | # the rest 374 | while True: 375 | try: 376 | i = static(string, i, "|") 377 | c, i = match_conc(string, i) 378 | concs.append(c) 379 | except NoMatch: 380 | return Pattern(*concs), i 381 | 382 | 383 | def parse(string: str) -> Pattern: 384 | """ 385 | Parse a full string and return a `Pattern` object. 
Fail if
386 |     the whole string wasn't parsed
387 |     """
388 |     obj, i = match_pattern(string, 0)
389 |     if i != len(string):
390 |         raise NoMatch(f"Could not parse {string!r} beyond index {i}")
391 |     return obj
392 | 
--------------------------------------------------------------------------------
/greenery/charclass.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=fixme,too-many-locals,too-many-branches
2 | 
3 | from __future__ import annotations
4 | 
5 | __all__ = (
6 |     "Charclass",
7 |     "DIGIT",
8 |     "DOT",
9 |     "NONDIGITCHAR",
10 |     "NONSPACECHAR",
11 |     "NONWORDCHAR",
12 |     "NULLCHARCLASS",
13 |     "SPACECHAR",
14 |     "WORDCHAR",
15 |     "escapes",
16 |     "negate",
17 |     "shorthand",
18 |     "repartition",
19 | )
20 | 
21 | from dataclasses import dataclass
22 | from typing import ClassVar, Dict, Iterable, Iterator, List, Mapping, Tuple
23 | 
24 | NUM_UNICODE_CHARS = (1 << 16) + (1 << 20)
25 | 
26 | 
27 | def negate(ord_ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
28 |     u = 0
29 |     negated = []
30 |     for ord_range in ord_ranges:
31 |         if u < ord_range[0]:
32 |             negated.append((u, ord_range[0] - 1))
33 |         u = ord_range[1] + 1
34 |     if u < NUM_UNICODE_CHARS:
35 |         negated.append((u, NUM_UNICODE_CHARS - 1))
36 |     return negated
37 | 
38 | 
39 | def collapse_ord_ranges(ord_ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
40 |     """
41 |     Sort the given ord ranges and merge any overlapping or adjacent ones,
42 |     so the result has no cases like [[12, 17], [2, 3]] or [[4, 6], [7, 8]].
43 |     """
44 |     collapsed: List[Tuple[int, int]] = []
45 | 
46 |     for ord_range in sorted(ord_ranges):
47 |         if not collapsed or collapsed[-1][1] + 1 < ord_range[0]:
48 |             collapsed.append(ord_range)
49 |         elif ord_range[1] > collapsed[-1][1]:
50 |             # merge into previous
51 |             collapsed[-1] = (collapsed[-1][0], ord_range[1])
52 | 
53 |     return collapsed
54 | 
55 | 
56 | @dataclass(frozen=True, init=False)
57 | class Charclass:
58 |     """
59 |     A `Charclass` is essentially a set of symbols, stored as inclusive
60 |     ranges of Unicode code points. A `Charclass` with the `negated` flag
61 |     set is assumed to contain every symbol in the full alphabet except
62 |     those explicitly listed, e.g. [^a]. This is very handy
63 |     if the full alphabet is extremely large, but also requires dedicated
64 |     combination functions.
65 |     """
66 | 
67 |     ord_ranges: List[Tuple[int, int]]
68 |     negated: bool
69 | 
70 |     def __init__(
71 |         self, ranges: str | Tuple[Tuple[str, str], ...] = "", negated: bool = False
72 |     ):
73 |         if isinstance(ranges, str):
74 |             ranges = tuple((char, char) for char in ranges)
75 |         if not isinstance(ranges, tuple):
76 |             raise TypeError(f"Bad ranges: {ranges!r}")
77 |         for r in ranges:
78 |             if len(r) != 2 or r[0] > r[1]:
79 |                 raise ValueError(f"Bad range: {r!r}")
80 |             for char in r:
81 |                 if not isinstance(char, str):
82 |                     raise TypeError(f"Can't put {char!r} in a `Charclass`", char)
83 |                 if len(char) != 1:
84 |                     raise ValueError("`Charclass` can only contain single chars", char)
85 | 
86 |         # Rebalance ranges!
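        # Convert the inclusive character ranges into inclusive ranges of
        # Unicode code points, then merge any overlapping or adjacent
        # ranges into a single sorted, canonical form.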
87 | ord_ranges = [(ord(first), ord(last)) for first, last in ranges] 88 | ord_ranges = collapse_ord_ranges(ord_ranges) 89 | 90 | object.__setattr__(self, "ord_ranges", tuple(ord_ranges)) 91 | object.__setattr__(self, "negated", negated) 92 | 93 | def __lt__(self, other: Charclass, /) -> bool: 94 | if self.negated < other.negated: 95 | return True 96 | if ( 97 | self.negated == other.negated 98 | and self.ord_ranges[0][0] < other.ord_ranges[0][0] 99 | ): 100 | return True 101 | return False 102 | 103 | def __eq__(self, other: object, /) -> bool: 104 | return ( 105 | isinstance(other, Charclass) 106 | and self.ord_ranges == other.ord_ranges 107 | and self.negated == other.negated 108 | ) 109 | 110 | def __hash__(self, /) -> int: 111 | return hash((self.ord_ranges, self.negated)) 112 | 113 | # These are the characters carrying special meanings when they appear 114 | # "outdoors" within a regular expression. To be interpreted literally, they 115 | # must be escaped with a backslash. 116 | allSpecial: ClassVar[frozenset[str]] = frozenset("\\[]|().?*+{}") 117 | 118 | # These are the characters carrying special meanings when they appear 119 | # INSIDE a character class (delimited by square brackets) within a regular 120 | # expression. To be interpreted literally, they must be escaped with a 121 | # backslash. Notice how much smaller this class is than the one above; note 122 | # also that the hyphen and caret do NOT appear above. 123 | classSpecial: ClassVar[frozenset[str]] = frozenset("\\[]^-") 124 | 125 | def __str__(self, /) -> str: 126 | # pylint: disable=too-many-return-statements 127 | 128 | # e.g. \w 129 | if self in shorthand: 130 | return shorthand[self] 131 | 132 | # e.g. [^a] 133 | if self.negated: 134 | return f"[^{self.escape()}]" 135 | 136 | # single character, not contained inside square brackets. 137 | if len(self.ord_ranges) == 1 and self.ord_ranges[0][0] == self.ord_ranges[0][1]: 138 | u = self.ord_ranges[0][0] 139 | char = chr(u) 140 | 141 | # e.g. if char is "\t", return "\\t" 142 | if char in escapes: 143 | return escapes[char] 144 | 145 | if char in Charclass.allSpecial: 146 | return f"\\{char}" 147 | 148 | # If char is an ASCII control character, don't print it directly, 149 | # return a hex escape sequence e.g. "\\x00". Note that this 150 | # includes tab and other characters already handled above 151 | if 0 <= u <= 0x1F or u == 0x7F: 152 | return f"\\x{u:02x}" 153 | 154 | return char 155 | 156 | # multiple characters (or possibly 0 characters) 157 | return f"[{self.escape()}]" 158 | 159 | def escape(self, /) -> str: 160 | def escape_char(char: str, /) -> str: 161 | if char in Charclass.classSpecial: 162 | return f"\\{char}" 163 | if char in escapes: 164 | return escapes[char] 165 | 166 | # If char is an ASCII control character, don't print it directly, 167 | # return a hex escape sequence e.g. "\\x00". Note that this 168 | # includes tab and other characters already handled above 169 | if 0 <= ord(char) <= 0x1F or ord(char) == 0x7F: 170 | return f"\\x{ord(char):02x}" 171 | 172 | return char 173 | 174 | output = "" 175 | 176 | for first_u, last_u in self.ord_ranges: 177 | # there's no point in putting a range when the whole thing is 178 | # 3 characters or fewer. 
"abc" -> "abc" but "abcd" -> "a-d" 179 | if last_u <= first_u + 2: 180 | # "a" or "ab" or "abc" or "abcd" 181 | for u in range(first_u, last_u + 1): 182 | output += escape_char(chr(u)) 183 | else: 184 | # "a-b" or "a-c" or "a-d" 185 | output += escape_char(chr(first_u)) + "-" + escape_char(chr(last_u)) 186 | 187 | return output 188 | 189 | def __repr__(self, /) -> str: 190 | sign = "~" if self.negated else "" 191 | ranges = tuple( 192 | (chr(first_u), chr(last_u)) for (first_u, last_u) in self.ord_ranges 193 | ) 194 | return f"{sign}Charclass({ranges!r})" 195 | 196 | def reduce(self, /) -> Charclass: 197 | # `Charclass`es cannot be reduced. 198 | return self 199 | 200 | def empty(self, /) -> bool: 201 | return not self.ord_ranges and not self.negated 202 | 203 | # set operations 204 | def negate(self, /) -> Charclass: 205 | """ 206 | Negate the current `Charclass`. e.g. [ab] becomes [^ab]. Call 207 | using "charclass2 = ~charclass1" 208 | """ 209 | ranges = tuple( 210 | (chr(first_u), chr(last_u)) for (first_u, last_u) in self.ord_ranges 211 | ) 212 | return Charclass(ranges, negated=not self.negated) 213 | 214 | def __invert__(self, /) -> Charclass: 215 | return self.negate() 216 | 217 | def get_chars(self, /) -> Iterator[str]: 218 | """ 219 | Use this with caution, it can iterate over 1,000,000+ characters 220 | """ 221 | for first_u, last_u in self.ord_ranges: 222 | for u in range(first_u, last_u + 1): 223 | yield chr(u) 224 | 225 | def num_chars(self, /) -> int: 226 | num = 0 227 | for first_u, last_u in self.ord_ranges: 228 | num += last_u + 1 - first_u 229 | return NUM_UNICODE_CHARS - num if self.negated else num 230 | 231 | def accepts(self, char: str, /) -> bool: 232 | u = ord(char) 233 | for first_u, last_u in self.ord_ranges: 234 | if first_u <= u <= last_u: 235 | return not self.negated 236 | return self.negated 237 | 238 | def reversed(self, /) -> Charclass: 239 | return self 240 | 241 | def union(self, other: Charclass, /) -> Charclass: 242 | # TODO: make this able to efficiently unite many Charclasses at once, 243 | # again 244 | self_ord_ranges = list(self.ord_ranges) 245 | if self.negated: 246 | self_ord_ranges = negate(self_ord_ranges) 247 | 248 | other_ord_ranges = list(other.ord_ranges) 249 | if other.negated: 250 | other_ord_ranges = negate(other_ord_ranges) 251 | 252 | new_ord_ranges = [] 253 | new_ord_ranges.extend(self_ord_ranges) 254 | new_ord_ranges.extend(other_ord_ranges) 255 | new_ord_ranges = collapse_ord_ranges(new_ord_ranges) 256 | 257 | new_negated = self.negated or other.negated 258 | if new_negated: 259 | new_ord_ranges = negate(new_ord_ranges) 260 | new_ranges = tuple( 261 | (chr(first_u), chr(last_u)) for (first_u, last_u) in new_ord_ranges 262 | ) 263 | return Charclass(new_ranges, new_negated) 264 | 265 | __or__ = union 266 | 267 | def issubset(self, other: Charclass, /) -> bool: 268 | return self | other == other 269 | 270 | def intersection(self, other: Charclass, /) -> Charclass: 271 | # TODO: is this actually efficient? 272 | # TODO: make this able to efficiently intersect many Charclasses at once, 273 | # again 274 | return ~(~self | ~other) 275 | 276 | __and__ = intersection 277 | 278 | 279 | # Standard character classes 280 | WORDCHAR = Charclass("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz") 281 | DIGIT = Charclass("0123456789") 282 | SPACECHAR = Charclass("\t\n\v\f\r ") 283 | 284 | # This `Charclass` expresses "no possibilities at all" 285 | # and can never match anything. 
286 | NULLCHARCLASS = Charclass() 287 | 288 | NONWORDCHAR = ~WORDCHAR 289 | NONDIGITCHAR = ~DIGIT 290 | NONSPACECHAR = ~SPACECHAR 291 | DOT = ~NULLCHARCLASS 292 | 293 | # Textual representations of standard character classes 294 | shorthand: Mapping[Charclass, str] = { 295 | WORDCHAR: "\\w", 296 | DIGIT: "\\d", 297 | SPACECHAR: "\\s", 298 | NONWORDCHAR: "\\W", 299 | NONDIGITCHAR: "\\D", 300 | NONSPACECHAR: "\\S", 301 | DOT: ".", 302 | } 303 | 304 | # Characters which users may escape in a regex instead of inserting them 305 | # literally. In ASCII order: 306 | escapes: Mapping[str, str] = { 307 | "\t": "\\t", # tab 308 | "\n": "\\n", # line feed 309 | "\v": "\\v", # vertical tab 310 | "\f": "\\f", # form feed 311 | "\r": "\\r", # carriage return 312 | } 313 | 314 | 315 | def repartition( 316 | charclasses: Iterable[Charclass], 317 | ) -> Mapping[Charclass, Iterable[Charclass]]: 318 | """ 319 | Accept an iterable of `Charclass`es which may overlap somewhat. 320 | Construct a minimal collection of `Charclass`es which partition the space 321 | of all possible characters and can be combined to create all of the 322 | originals. 323 | Return a map from each original `Charclass` to its constituent pieces. 324 | """ 325 | ord_range_boundaries = set() 326 | for charclass in charclasses: 327 | for first_u, last_u in charclass.ord_ranges: 328 | ord_range_boundaries.add(first_u) 329 | ord_range_boundaries.add(last_u + 1) 330 | ord_range_boundaries_2 = sorted(ord_range_boundaries) 331 | 332 | ord_ranges = [] 333 | for i, ord_range_boundary in enumerate(ord_range_boundaries_2): 334 | if i + 1 < len(ord_range_boundaries_2): 335 | ord_ranges.append((ord_range_boundary, ord_range_boundaries_2[i + 1] - 1)) 336 | 337 | # Group all of the possible ranges by "signature". 338 | # A signature is a tuple of Booleans telling us which character classes 339 | # a particular range is mentioned in. 340 | # (Whether it's *accepted* is actually not relevant.) 341 | signatures: Dict[Tuple[bool, ...], List[Tuple[int, int]]] = {} 342 | for ord_range in ord_ranges: 343 | signature = [] 344 | for charclass in charclasses: 345 | ord_range_in_charclass = False 346 | for x in charclass.ord_ranges: 347 | if x[0] <= ord_range[0] and ord_range[1] <= x[1]: 348 | ord_range_in_charclass = True 349 | break 350 | signature.append(ord_range_in_charclass) 351 | signature2 = tuple(signature) 352 | if signature2 not in signatures: 353 | signatures[signature2] = [] 354 | signatures[signature2].append(ord_range) 355 | 356 | # From the signatures we can gather the new Charclasses 357 | newcharclasses = [] 358 | newcharclasses.append( 359 | ~Charclass( 360 | tuple((chr(first_u), chr(last_u)) for (first_u, last_u) in ord_ranges) 361 | ) 362 | ) 363 | for ord_ranges2 in signatures.values(): 364 | newcharclasses.append( 365 | Charclass( 366 | tuple((chr(first_u), chr(last_u)) for (first_u, last_u) in ord_ranges2) 367 | ) 368 | ) 369 | 370 | # Now compute the breakdowns 371 | partition: Dict[Charclass, List[Charclass]] = {} 372 | for charclass in charclasses: 373 | partition[charclass] = [] 374 | for newcharclass in newcharclasses: 375 | if newcharclass.issubset(charclass): 376 | partition[charclass].append(newcharclass) 377 | 378 | return partition 379 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # greenery 2 | 3 | Tools for parsing and manipulating regular expressions. 
Note that this is a very different concept from that of simply *creating and using* those regular expressions, functionality which is present in basically every programming language in the world, [Python included](http://docs.python.org/library/re.html). 4 | 5 | This project was undertaken because I wanted to be able to **compute the intersection between two regular expressions**. The "intersection" is the set of strings which both regular expressions will accept, represented as a third regular expression. 6 | 7 | ## Installation 8 | 9 | ```sh 10 | pip install greenery 11 | ``` 12 | 13 | ## Example 14 | 15 | ```python 16 | from greenery import parse 17 | 18 | print(parse("abc...") & parse("...def")) 19 | # "abcdef" 20 | 21 | print(parse("\d{4}-\d{2}-\d{2}") & parse("19.*")) 22 | # "19\d{2}-\d{2}-\d{2}" 23 | 24 | print(parse("\W*") & parse("[a-g0-8$%\^]+") & parse("[^d]{2,8}")) 25 | # "[$%\^]{2,8}" 26 | 27 | print(parse("[bc]*[ab]*") & parse("[ab]*[bc]*")) 28 | # "([ab]*a|[bc]*c)?b*" 29 | 30 | print(parse("a*") & parse("b*")) 31 | # "" 32 | 33 | print(parse("a") & parse("b")) 34 | # "[]" 35 | ``` 36 | 37 | In the penultimate example, the empty string is returned, because only the empty string is in both of the regular languages `a*` and `b*`. In the final example, an empty character class has been returned. An empty character class can never match anything, which means `greenery` can use this to represent a regular expression which matches no strings at all. Note that this is different from only matching the empty string. 38 | 39 | Internally, `greenery` works by converting regular expressions to finite state machines, computing the intersection of the two FSMs as a third FSM, and using the Brzozowski algebraic method (*q.v.*) to convert the third FSM back to a regular expression. 40 | 41 | ## API 42 | 43 | ### parse(string) 44 | 45 | This function takes a regular expression (_i.e._ a string) as input and returns a `Pattern` object (see below) representing that regular expression. 46 | 47 | The following metacharacters and formations have their usual meanings: `.`, `*`, `+`, `?`, `{m}`, `{m,}`, `{m,n}`, `()`, `|`, `[]`, `^` within `[]` character ranges only, `-` within `[]` character ranges only, and `\` to escape any of the preceding characters or itself. 48 | 49 | These character escapes are possible: `\t`, `\r`, `\n`, `\f`, `\v`. 50 | 51 | These predefined character sets also have their usual meanings: `\w`, `\d`, `\s` and their negations `\W`, `\D`, `\S`. `.` matches any character, including new line characters and carriage returns. 52 | 53 | An empty charclass `[]` is legal and matches no characters: when used in a regular expression, the regular expression may match no strings. 54 | 55 | #### Unsupported constructs 56 | 57 | * This method is intentionally rigorously simple, and tolerates no ambiguity. For example, a hyphen must be escaped in a character class even if it appears first or last. `[-abc]` is a syntax error, write `[\-abc]`. Escaping something which doesn't need it is a syntax error too: `[\ab]` resolves to neither `[\\ab]` nor `[ab]`. 58 | 59 | * The `^` and `$` metacharacters are not supported. By default, `greenery` assumes that all regexes are anchored at the start and end of any input string. Carets and dollar signs will be parsed as themselves. If you want to *not* anchor at the start or end of the string, put `.*` at the start or end of your regex respectively. 
60 | 61 | This is because computing the intersection between `.*a.*` and `.*b.*` (1) is largely pointless and (2) usually results in gibberish coming out of the program. 62 | 63 | * The non-greedy operators `*?`, `+?`, `??` and `{m,n}?` are permitted but do nothing. This is because they do not alter the regular language. For example, `abc{0,5}def` and `abc{0,5}?def` represent precisely the same set of strings. 64 | 65 | * Parentheses are used to alternate between multiple possibilities e.g. `(a|bc)` only, not for capture grouping. Here's why: 66 | 67 | ```python 68 | print(parse("(ab)c") & parse("a(bc)")) 69 | # "abc" 70 | ``` 71 | 72 | * The `(?:...)` syntax for non-capturing groups is permitted, but does nothing. 73 | 74 | * Other `(?...)` constructs are not supported (and most are not [regular in the computer science sense](http://en.wikipedia.org/wiki/Regular_language)). 75 | 76 | * Back-references, such as `([aeiou])\1`, are not regular. 77 | 78 | ### Pattern 79 | 80 | A `Pattern` represents a regular expression and exposes various methods for manipulating it and combining it with other regular expressions. `Pattern`s are immutable. 81 | 82 | A regular language is a possibly-infinite set of strings. With this in mind, `Pattern` implements numerous [methods like those on `frozenset`](https://docs.python.org/3/library/stdtypes.html#frozenset), as well as many regular expression-specific methods. 83 | 84 | It's not intended that you construct new `Pattern` instances directly; use `parse(string)`, above. 85 | 86 | Method | Behaviour 87 | ---|--- 88 | `pattern.matches("a")`
`"a" in pattern` | Returns `True` if the regular expression matches the string or `False` if not. 89 | `pattern.strings()`
`for string in pattern` | Returns a generator of all the strings that this regular expression matches. 90 | `pattern.empty()` | Returns `True` if this regular expression matches no strings, otherwise `False`. 91 | `pattern.cardinality()`
`len(pattern)` | Returns the number of strings which the regular expression matches. Throws an `OverflowError` if this number is infinite. 92 | `pattern1.equivalent(pattern2)` | Returns `True` if the two regular expressions match exactly the same strings, otherwise `False`. 93 | `pattern.copy()` | Returns a shallow copy of `pattern`. 94 | `pattern.everythingbut()` | Returns a regular expression which matches every string not matched by the original. `pattern.everythingbut().everythingbut()` matches the same strings as `pattern`, but is not necessarily identical in structure. 95 | `pattern.reversed()`
`reversed(pattern)` | Returns a reversed regular expression. For each string that `pattern` matched, `reversed(pattern)` will match the reversed string. `reversed(reversed(pattern))` matches the same strings as `pattern`, but is not necessarily identical. 96 | `pattern.times(star)`
`pattern * star` | Returns the input regular expression multiplied by any `Multiplier` (see below). 97 | `pattern1.concatenate(pattern2, ...)`
`pattern1 + pattern2 + ...` | Returns a regular expression which matches any string of the form *a·b·...* where *a* is a string matched by `pattern1`, *b* is a string matched by `pattern2` and so on. 98 | `pattern1.union(pattern2, ...)`
`pattern1 \| pattern2 \| ...` | Returns a regular expression matching any string matched by any of the input regular expressions. This is also called *alternation*. 99 | `pattern1.intersection(pattern2, ...)`
`pattern1 & pattern2 & ...` | Returns a regular expression matching any string matched by all input regular expressions. The successful implementation of this method was the ultimate goal of this entire project. 100 | `pattern1.difference(pattern2, ...)`
`pattern1 - pattern2 - ...` | Subtract the set of strings matched by `pattern2` onwards from those matched by `pattern1` and return the resulting regular expression. 101 | `pattern1.symmetric_difference(pattern2, ...)`
`pattern1 ^ pattern2 ^ ...` | Returns a regular expression matching any string accepted by `pattern1` or `pattern2` but not both.
102 | `pattern.derive("a")` | Return the [Brzozowski derivative](https://en.wikipedia.org/wiki/Brzozowski_derivative) of the input regular expression with respect to "a".
103 | `pattern.reduce()` | Returns a regular expression which is equivalent to `pattern` (*i.e.* matches exactly the same strings) but is simplified as far as possible. See dedicated section below.
104 | 
105 | #### pattern.reduce()
106 | 
107 | Call this method to try to simplify the regular expression object. The following simplification heuristics are supported:
108 | 
109 | * `(ab|cd|ef|)g` to `(ab|cd|ef)?g`
110 | * `([ab])*` to `[ab]*`
111 | * `ab?b?c` to `ab{0,2}c`
112 | * `aa` to `a{2}`
113 | * `a(d(ab|a*c))` to `ad(ab|a*c)`
114 | * `0|[2-9]` to `[02-9]`
115 | * `abc|ade` to `a(bc|de)`
116 | * `xyz|stz` to `(xy|st)z`
117 | * `abc()def` to `abcdef`
118 | * `a{1,2}|a{3,4}` to `a{1,4}`
119 | 
120 | The value returned is a new `Pattern` object.
121 | 
122 | Note that in a few cases this does *not* result in a shorter regular expression.
123 | 
124 | ### Multiplier
125 | 
126 | A combination of a finite lower `Bound` (see below) and a possibly-infinite upper `Bound`.
127 | 
128 | ```python
129 | from greenery import parse, Bound, INF, Multiplier
130 | 
131 | print(parse("a") * Multiplier(Bound(3), INF))  # "a{3,}"
132 | ```
133 | 
134 | ### STAR
135 | 
136 | Special `Multiplier`, equal to `Multiplier(Bound(0), INF)`. When it appears in a regular expression, this is `{0,}` or the [Kleene star](https://en.wikipedia.org/wiki/Kleene_star) `*`.
137 | 
138 | ### QM
139 | 
140 | Special `Multiplier`, equal to `Multiplier(Bound(0), Bound(1))`. When it appears in a regular expression, this is `{0,1}` or `?`.
141 | 
142 | ### PLUS
143 | 
144 | Special `Multiplier`, equal to `Multiplier(Bound(1), INF)`. When it appears in a regular expression, this is `{1,}` or `+`.
145 | 
146 | ### Bound
147 | 
148 | Represents a non-negative integer or infinity.
149 | 
150 | ### INF
151 | 
152 | Special `Bound` representing no limit. Can be used as an upper bound only.
153 | 
154 | ### Charclass
155 | 
156 | This class represents a _character class_ such as `a`, `\w`, `.`, `[A-Za-z0-9_]`, and so on. `Charclass`es must be constructed longhand, either using a string containing all the desired characters, or a tuple of ranges, where each range is a pair of characters to be used as the range's inclusive endpoints. Use `~` to negate a `Charclass`.
157 | 
158 | * `a` = `Charclass("a")`
159 | * `[abyz]` = `Charclass("abyz")`
160 | * `[a-z]` = `Charclass("abcdefghijklmnopqrstuvwxyz")` or `Charclass((("a", "z"),))`
161 | * `\w` = `Charclass((("a", "z"), ("A", "Z"), ("0", "9"), ("_", "_")))`
162 | * `[^x]` = `~Charclass("x")`
163 | * `\D` = `~Charclass("0123456789")`
164 | * `.` = `~Charclass(())`
165 | 
166 | ### Fsm
167 | 
168 | An `Fsm` is a finite state machine which accepts strings (or more generally iterables of Unicode characters) as input. This is used internally by `Pattern` for most regular expression manipulation operations.
169 | 
170 | In theory, accepting strings as input means that every `Fsm`'s alphabet is the same: the set of all 1,114,112 possible Unicode characters which can make up a string. But this is a very large alphabet and would result in extremely large transition maps and very poor performance. So, in practice, `Fsm` uses not single characters but `Charclass`es (see above) for its alphabet and its map transitions.
171 | 172 | ```python 173 | # FSM accepting only the string "a" 174 | a = Fsm( 175 | alphabet={Charclass("a"), ~Charclass("a")}, 176 | states={0, 1, 2}, 177 | initial=0, 178 | finals={1}, 179 | map={ 180 | 0: {Charclass("a"): 1, ~Charclass("a"): 2}, 181 | 1: {Charclass("a"): 2, ~Charclass("a"): 2}, 182 | 2: {Charclass("a"): 2, ~Charclass("a"): 2}, 183 | }, 184 | ) 185 | ``` 186 | 187 | Notes: 188 | 189 | * The `Charclass`es which make up the alphabet must _partition_ the space of all Unicode characters - every Unicode character must be a member of exactly one `Charclass` in the alphabet. 190 | * States must be integers. 191 | * The map must be complete. Omitting transition symbols or states is not permitted. 192 | 193 | A regular language is a possibly-infinite set of strings. With this in mind, `Fsm` implements several [methods like those on `frozenset`](https://docs.python.org/3/library/stdtypes.html#frozenset). 194 | 195 | Method | Behaviour 196 | ---|--- 197 | `fsm.accepts("a")` | Returns `True` if the FSM accepts string or `False` if not. 198 | `fsm.strings()` | Returns a generator of all the strings which this FSM accepts. 199 | `fsm.empty()` | Returns `True` if this FSM accepts no strings, otherwise `False`. 200 | `fsm.cardinality()` | Returns the number of strings which the FSM accepts. Throws an `OverflowError` if this number is infinite. 201 | `fsm1.equivalent(fsm2)` | Returns `True` if the two FSMs accept exactly the same strings, otherwise `False`. 202 | `fsm.copy()` | Returns a shallow copy of `fsm`. 203 | `fsm.everythingbut()` | Returns an FSM which accepts every string not matched by the original. `fsm.everythingbut().everythingbut()` matches the same strings as `fsm`. 204 | `fsm1.concatenate(fsm2, ...)` | Returns an FSM which accepts any string of the form *a·b·...* where *a* is a string accepted by `fsm1`, *b* is a string accepted by `fsm2` and so on. 205 | `fsm.times(multiplier)` | Returns the input FSM concatenated with itself `multiplier` times. `multiplier` must be a non-negative integer. 206 | `fsm.star()` | Returns an FSM which is the Kleene star closure of the original. 207 | `fsm1.union(fsm2, ...)` | Returns an FSM accepting any string matched by any of the input FSMs. This is also called *alternation*. 208 | `fsm1.intersection(fsm2, ...)` | Returns an FSM accepting any string matched by all input FSMs. 209 | `fsm1.difference(fsm2, ...)` | Subtract the set of strings matched by `fsm2` onwards from those matched by `fsm1` and return the resulting FSM. 210 | `fsm1.symmetric_difference(fsm2, ...)` | Returns an FSM matching any string accepted by `fsm1` or `fsm2` but not both. 211 | `fsm.derive(string)` | Return the [Brzozowski derivative](https://en.wikipedia.org/wiki/Brzozowski_derivative) of the input FSM with respect to the input string. 212 | `fsm.reduce()` | Returns an FSM which is equivalent to `fsm` (*i.e.* accepts exactly the same strings) but has a minimal number of states. 213 | 214 | Note that methods combining FSMs usually output new FSMs with modified alphabets. For example, concatenating an FSM with alphabet `{Charclass("a"), ~Charclass("a")}` and another FSM with alphabet `{Charclass("abc"), ~Charclass("abc")}` usually results in a third FSM with a _repartitioned_ alphabet of `{Charclass("a"), Charclass("bc"), ~Charclass("abc")}`. Notice how all three alphabets partition the space of all Unicode characters. 215 | 216 | Several other methods on `Fsm` instances are available - these should not be used, they're subject to change. 
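As a rough, illustrative sketch of how the set-like methods above combine `Fsm`s, the snippet below reuses the `a` FSM constructed earlier in this section and builds an analogous FSM accepting only the string `"b"`. The names `b` and `either` are just illustrative, and the repartitioned alphabet mentioned in the final comment is an example of the behaviour described above rather than a guaranteed output.

```python
from greenery import Charclass, Fsm

# An FSM accepting only the string "b", built the same way as `a` above.
b = Fsm(
    alphabet={Charclass("b"), ~Charclass("b")},
    states={0, 1, 2},
    initial=0,
    finals={1},
    map={
        0: {Charclass("b"): 1, ~Charclass("b"): 2},
        1: {Charclass("b"): 2, ~Charclass("b"): 2},
        2: {Charclass("b"): 2, ~Charclass("b"): 2},
    },
)

either = a.union(b)               # accepts "a" and "b" and nothing else
print(either.accepts("a"))        # True
print(either.accepts("b"))        # True
print(either.accepts("ab"))       # False
print(a.intersection(b).empty())  # True: no string is accepted by both

# The combined FSM's alphabet is repartitioned along the lines described
# above, e.g. {Charclass("a"), Charclass("b"), ~Charclass("ab")}.
```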
217 | 218 | ### EPSILON 219 | 220 | Special `Fsm` which accepts only the empty string. 221 | 222 | ### NULL 223 | 224 | Special `Fsm` which accepts no strings. 225 | 226 | ## Development 227 | 228 | ### Running tests 229 | 230 | ```sh 231 | pip install -r requirements.dev.txt 232 | isort . 233 | black . 234 | mypy greenery 235 | flake8 --count --statistics --show-source --select=E9,F63,F7,F82 . 236 | flake8 --count --statistics --exit-zero --max-complexity=10 . 237 | pylint --recursive=true . 238 | pytest 239 | ``` 240 | 241 | ### Building and publishing new versions 242 | 243 | * Update the version in `./setup.py` 244 | * Trash `./dist` 245 | * `python -m build` - creates a `./dist` directory with some stuff in it 246 | * `python -m twine upload dist/*` 247 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-allow-list= 7 | math, 8 | 9 | # A comma-separated list of package or module names from where C extensions may 10 | # be loaded. Extensions are loading into the active Python interpreter and may 11 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 12 | # for backward compatibility.) 13 | extension-pkg-whitelist= 14 | 15 | # Return non-zero exit code if any of these messages/categories are detected, 16 | # even if score is above --fail-under value. Syntax same as enable. Messages 17 | # specified are enabled, while categories only check already-enabled messages. 18 | fail-on= 19 | 20 | # Specify a score threshold to be exceeded before program exits with error. 21 | fail-under=10.0 22 | 23 | # Files or directories to be skipped. They should be base names, not paths. 24 | ignore= 25 | dist, 26 | env, 27 | venv, 28 | 29 | # Add files or directories matching the regex patterns to the ignore-list. The 30 | # regex matches against paths. 31 | ignore-paths= 32 | 33 | # Files or directories matching the regex patterns are skipped. The regex 34 | # matches against base names, not paths. 35 | ignore-patterns= 36 | # Anything beginning with a dot. 37 | ^\..+$, 38 | # Anything ending in `.venv` (e.g. dev.venv) 39 | \.venv$, 40 | # Anything ending in `.egg-info` 41 | \.egg-info$, 42 | # Anything like __pycache__ or __mypycache__ or whatever. 43 | ^__.*cache.*__$, 44 | 45 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 46 | # number of processors available to use. 47 | jobs=0 48 | 49 | # Control the amount of potential inferred values when inferring a single 50 | # object. This can help the performance when dealing with large functions or 51 | # complex, nested conditions. 52 | limit-inference-results=100 53 | 54 | # List of plugins (as comma separated values of python module names) to load, 55 | # usually to register additional checkers. 
56 | load-plugins= 57 | pylint.extensions.bad_builtin, 58 | pylint.extensions.broad_try_clause, 59 | pylint.extensions.check_elif, 60 | pylint.extensions.code_style, 61 | pylint.extensions.comparison_placement, 62 | pylint.extensions.confusing_elif, 63 | pylint.extensions.consider_ternary_expression, 64 | pylint.extensions.docparams, 65 | pylint.extensions.docstyle, 66 | pylint.extensions.empty_comment, 67 | pylint.extensions.eq_without_hash, 68 | pylint.extensions.for_any_all, 69 | pylint.extensions.mccabe, 70 | pylint.extensions.no_self_use, 71 | pylint.extensions.overlapping_exceptions, 72 | pylint.extensions.private_import, 73 | pylint.extensions.redefined_loop_name, 74 | pylint.extensions.redefined_variable_type, 75 | pylint.extensions.set_membership, 76 | pylint.extensions.typing, 77 | pylint.extensions.while_used, 78 | 79 | # Pickle collected data for later comparisons. 80 | persistent=yes 81 | 82 | # Min Python version to use for version dependend checks. Will default to the 83 | # version used to run pylint. 84 | py-version=3.8 85 | 86 | # When enabled, pylint would attempt to guess common misconfiguration and emit 87 | # user-friendly hints instead of false-positive error messages. 88 | suggestion-mode=yes 89 | 90 | # Allow loading of arbitrary C extensions. Extensions are imported into the 91 | # active Python interpreter and may run arbitrary code. 92 | unsafe-load-any-extension=no 93 | 94 | 95 | [MESSAGES CONTROL] 96 | 97 | # Only show warnings with the listed confidence levels. Leave empty to show 98 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 99 | confidence= 100 | 101 | # Disable the message, report, category or checker with the given id(s). You 102 | # can either give multiple identifiers separated by comma (,) or put this 103 | # option multiple times (only on the command line, not in the configuration 104 | # file where it should appear only once). You can also use "--disable=all" to 105 | # disable everything first and then reenable specific checks. For example, if 106 | # you want to run only the similarities checker, you can use "--disable=all 107 | # --enable=similarities". If you want to run only the classes checker, but have 108 | # no Warning level messages displayed, use "--disable=all --enable=classes 109 | # --disable=W". 110 | disable= 111 | consider-using-assignment-expr, 112 | consider-using-augmented-assign, 113 | docstring-first-line-empty, 114 | duplicate-code, 115 | file-ignored, 116 | locally-disabled, 117 | missing-function-docstring, 118 | missing-module-docstring, 119 | no-method-argument, 120 | no-self-argument, 121 | similarities, 122 | suppressed-message, 123 | too-complex, 124 | too-few-public-methods, 125 | while-used, 126 | 127 | # Enable the message, report, category or checker with the given id(s). You can 128 | # either give multiple identifier separated by comma (,) or put this option 129 | # multiple time (only on the command line, not in the configuration file where 130 | # it should appear only once). See also the "--disable" option for examples. 131 | enable= 132 | bad-inline-option, 133 | c-extension-no-member, 134 | deprecated-pragma, 135 | raw-checker-failed, 136 | use-symbolic-message-instead, 137 | useless-suppression, 138 | use-implicit-booleaness-not-comparison-to-zero, 139 | use-implicit-booleaness-not-comparison-to-string, 140 | 141 | 142 | [REPORTS] 143 | 144 | # Python expression which should return a score less than or equal to 10. 
You 145 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 146 | # which contain the number of messages in each category, as well as 'statement' 147 | # which is the total number of statements analyzed. This score is used by the 148 | # global evaluation report (RP0004). 149 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 150 | 151 | # Template used to display messages. This is a python new-style format string 152 | # used to format the message information. See doc for all details. 153 | #msg-template= 154 | 155 | # Set the output format. Available formats are text, parseable, colorized, json 156 | # and msvs (visual studio). You can also give a reporter class, e.g. 157 | # mypackage.mymodule.MyReporterClass. 158 | output-format=text 159 | 160 | # Tells whether to display a full report or only the messages. 161 | reports=no 162 | 163 | # Activate the evaluation score. 164 | score=no 165 | 166 | 167 | [REFACTORING] 168 | 169 | # Maximum number of nested blocks for function / method body 170 | max-nested-blocks=5 171 | 172 | # Complete name of functions that never returns. When checking for 173 | # inconsistent-return-statements if a never returning function is called then 174 | # it will be considered as an explicit return statement and no message will be 175 | # printed. 176 | never-returning-functions=sys.exit,argparse.parse_error 177 | 178 | 179 | [LOGGING] 180 | 181 | # The type of string formatting that logging methods do. `old` means using % 182 | # formatting, `new` is for `{}` formatting. 183 | logging-format-style=old 184 | 185 | # Logging modules to check that the string format arguments are in logging 186 | # function parameter format. 187 | logging-modules=logging 188 | 189 | 190 | [SPELLING] 191 | 192 | # Limits count of emitted suggestions for spelling mistakes. 193 | max-spelling-suggestions=4 194 | 195 | # Spelling dictionary name. Available dictionaries: none. To make it work, 196 | # install the 'python-enchant' package. 197 | spelling-dict= 198 | 199 | # List of comma separated words that should be considered directives if they 200 | # appear and the beginning of a comment and should not be checked. 201 | spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: 202 | 203 | # List of comma separated words that should not be checked. 204 | spelling-ignore-words= 205 | 206 | # A path to a file that contains the private dictionary; one word per line. 207 | spelling-private-dict-file= 208 | 209 | # Tells whether to store unknown words to the private dictionary (see the 210 | # --spelling-private-dict-file option) instead of raising a message. 211 | spelling-store-unknown-words=no 212 | 213 | 214 | [MISCELLANEOUS] 215 | 216 | # List of note tags to take in consideration, separated by a comma. 217 | notes=FIXME, 218 | XXX, 219 | TODO 220 | 221 | # Regular expression of note tags to take in consideration. 222 | #notes-rgx= 223 | 224 | 225 | [TYPECHECK] 226 | 227 | # List of decorators that produce context managers, such as 228 | # contextlib.contextmanager. Add to this list to register other decorators that 229 | # produce valid context managers. 230 | contextmanager-decorators=contextlib.contextmanager 231 | 232 | # List of members which are set dynamically and missed by pylint inference 233 | # system, and so shouldn't trigger E1101 when accessed. Python regular 234 | # expressions are accepted. 
235 | generated-members= 236 | 237 | # Tells whether missing members accessed in mixin class should be ignored. A 238 | # mixin class is detected if its name ends with "mixin" (case insensitive). 239 | ignore-mixin-members=yes 240 | 241 | # Tells whether to warn about missing members when the owner of the attribute 242 | # is inferred to be None. 243 | ignore-none=yes 244 | 245 | # This flag controls whether pylint should warn about no-member and similar 246 | # checks whenever an opaque object is returned when inferring. The inference 247 | # can return multiple potential results while evaluating a Python object, but 248 | # some branches might not be evaluated, which results in partial inference. In 249 | # that case, it might be useful to still emit no-member and other checks for 250 | # the rest of the inferred objects. 251 | ignore-on-opaque-inference=yes 252 | 253 | # List of class names for which member attributes should not be checked (useful 254 | # for classes with dynamically set attributes). This supports the use of 255 | # qualified names. 256 | ignored-classes=optparse.Values,thread._local,_thread._local 257 | 258 | # List of module names for which member attributes should not be checked 259 | # (useful for modules/projects where namespaces are manipulated during runtime 260 | # and thus existing member attributes cannot be deduced by static analysis). It 261 | # supports qualified module names, as well as Unix pattern matching. 262 | ignored-modules= 263 | 264 | # Show a hint with possible names when a member name was not found. The aspect 265 | # of finding the hint is based on edit distance. 266 | missing-member-hint=yes 267 | 268 | # The minimum edit distance a name should have in order to be considered a 269 | # similar match for a missing member name. 270 | missing-member-hint-distance=1 271 | 272 | # The total number of similar names that should be taken in consideration when 273 | # showing a hint for a missing member. 274 | missing-member-max-choices=1 275 | 276 | # List of decorators that change the signature of a decorated function. 277 | signature-mutators= 278 | 279 | 280 | [VARIABLES] 281 | 282 | # List of additional names supposed to be defined in builtins. Remember that 283 | # you should avoid defining new builtins when possible. 284 | additional-builtins= 285 | 286 | # Tells whether unused global variables should be treated as a violation. 287 | allow-global-unused-variables=yes 288 | 289 | # List of names allowed to shadow builtins 290 | allowed-redefined-builtins= 291 | 292 | # List of strings which can identify a callback function by name. A callback 293 | # name must start or end with one of those strings. 294 | callbacks=cb_, 295 | _cb 296 | 297 | # A regular expression matching the name of dummy variables (i.e. expected to 298 | # not be used). 299 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 300 | 301 | # Argument names that match this expression will be ignored. Default to name 302 | # with leading underscore. 303 | ignored-argument-names=_.*|^ignored_|^unused_ 304 | 305 | # Tells whether we should check for unused import in __init__ files. 306 | init-import=no 307 | 308 | # List of qualified module names which can have objects that can redefine 309 | # builtins. 310 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 311 | 312 | 313 | [FORMAT] 314 | 315 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
316 | expected-line-ending-format= 317 | 318 | # Regexp for a line that is allowed to be longer than the limit. 319 | ignore-long-lines=^\s*(# )??$ 320 | 321 | # Number of spaces of indent required inside a hanging or continued line. 322 | indent-after-paren=4 323 | 324 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 325 | # tab). 326 | indent-string=' ' 327 | 328 | # Maximum number of characters on a single line. 329 | max-line-length=88 330 | 331 | # Maximum number of lines in a module. 332 | max-module-lines=1000 333 | 334 | # Allow the body of a class to be on the same line as the declaration if body 335 | # contains single statement. 336 | single-line-class-stmt=no 337 | 338 | # Allow the body of an if to be on the same line as the test if there is no 339 | # else. 340 | single-line-if-stmt=no 341 | 342 | 343 | [SIMILARITIES] 344 | 345 | # Comments are removed from the similarity computation 346 | ignore-comments=yes 347 | 348 | # Docstrings are removed from the similarity computation 349 | ignore-docstrings=yes 350 | 351 | # Imports are removed from the similarity computation 352 | ignore-imports=no 353 | 354 | # Signatures are removed from the similarity computation 355 | ignore-signatures=no 356 | 357 | # Minimum lines number of a similarity. 358 | min-similarity-lines=4 359 | 360 | 361 | [BASIC] 362 | 363 | # Naming style matching correct argument names. 364 | argument-naming-style=snake_case 365 | 366 | # Regular expression matching correct argument names. Overrides argument- 367 | # naming-style. 368 | #argument-rgx= 369 | 370 | # Naming style matching correct attribute names. 371 | attr-naming-style=snake_case 372 | 373 | # Regular expression matching correct attribute names. Overrides attr-naming- 374 | # style. 375 | #attr-rgx= 376 | 377 | # Bad variable names which should always be refused, separated by a comma. 378 | bad-names=foo, 379 | bar, 380 | baz, 381 | toto, 382 | tutu, 383 | tata 384 | 385 | # Bad variable names regexes, separated by a comma. If names match any regex, 386 | # they will always be refused 387 | bad-names-rgxs= 388 | 389 | # Naming style matching correct class attribute names. 390 | class-attribute-naming-style=any 391 | 392 | # Regular expression matching correct class attribute names. Overrides class- 393 | # attribute-naming-style. 394 | #class-attribute-rgx= 395 | 396 | # Naming style matching correct class constant names. 397 | class-const-naming-style=UPPER_CASE 398 | 399 | # Regular expression matching correct class constant names. Overrides class- 400 | # const-naming-style. 401 | #class-const-rgx= 402 | 403 | # Naming style matching correct class names. 404 | class-naming-style=PascalCase 405 | 406 | # Regular expression matching correct class names. Overrides class-naming- 407 | # style. 408 | #class-rgx= 409 | 410 | # Naming style matching correct constant names. 411 | const-naming-style=UPPER_CASE 412 | 413 | # Regular expression matching correct constant names. Overrides const-naming- 414 | # style. 415 | #const-rgx= 416 | 417 | # Minimum line length for functions/classes that require docstrings, shorter 418 | # ones are exempt. 419 | docstring-min-length=-1 420 | 421 | # Naming style matching correct function names. 422 | function-naming-style=snake_case 423 | 424 | # Regular expression matching correct function names. Overrides function- 425 | # naming-style. 426 | #function-rgx= 427 | 428 | # Good variable names which should always be accepted, separated by a comma. 
429 | good-names= 430 | # Exception 431 | ex, 432 | # "Element" 433 | el, 434 | # "Function" 435 | fn, 436 | # "it" as a trivial lambda argument; or an iterator. 437 | it, 438 | # "Operation" 439 | op, 440 | Run, 441 | _, 442 | 443 | # Good variable names regexes, separated by a comma. If names match any regex, 444 | # they will always be accepted 445 | good-names-rgxs= 446 | # Any lowercase single letter except "l" and "o". 447 | # May be pluralized. 448 | ^[abcdefghijkmnpqrstuvwxyz]s?$, 449 | 450 | # Type variables have slightly different conventions. 451 | typevar-rgx=_?[A-Z]\d*(_co|_contra)? 452 | 453 | # Include a hint for the correct naming format with invalid-name. 454 | include-naming-hint=no 455 | 456 | # Naming style matching correct inline iteration names. 457 | inlinevar-naming-style=any 458 | 459 | # Regular expression matching correct inline iteration names. Overrides 460 | # inlinevar-naming-style. 461 | #inlinevar-rgx= 462 | 463 | # Naming style matching correct method names. 464 | method-naming-style=snake_case 465 | 466 | # Regular expression matching correct method names. Overrides method-naming- 467 | # style. 468 | #method-rgx= 469 | 470 | # Naming style matching correct module names. 471 | module-naming-style=snake_case 472 | 473 | # Regular expression matching correct module names. Overrides module-naming- 474 | # style. 475 | #module-rgx= 476 | 477 | # Colon-delimited sets of names that determine each other's naming style when 478 | # the name regexes allow several styles. 479 | name-group= 480 | 481 | # Regular expression which should only match function or class names that do 482 | # not require a docstring. 483 | no-docstring-rgx=^_ 484 | 485 | # List of decorators that produce properties, such as abc.abstractproperty. Add 486 | # to this list to register other decorators that produce valid properties. 487 | # These decorators are taken in consideration only for invalid-name. 488 | property-classes=abc.abstractproperty 489 | 490 | # Naming style matching correct variable names. 491 | variable-naming-style=snake_case 492 | 493 | # Regular expression matching correct variable names. Overrides variable- 494 | # naming-style. 495 | #variable-rgx= 496 | 497 | 498 | [STRING] 499 | 500 | # This flag controls whether inconsistent-quotes generates a warning when the 501 | # character used as a quote delimiter is used inconsistently within a module. 502 | check-quote-consistency=yes 503 | 504 | # This flag controls whether the implicit-str-concat should generate a warning 505 | # on implicit string concatenation in sequences defined over several lines. 506 | check-str-concat-over-line-jumps=yes 507 | 508 | 509 | [IMPORTS] 510 | 511 | # List of modules that can be imported at any level, not just the top level 512 | # one. 513 | allow-any-import-level= 514 | 515 | # Allow wildcard imports from modules that define __all__. 516 | allow-wildcard-with-all=no 517 | 518 | # Analyse import fallback blocks. This can be used to support both Python 2 and 519 | # 3 compatible code, which means that the block might have code that exists 520 | # only in one or another interpreter, leading to false positives when analysed. 521 | analyse-fallback-blocks=no 522 | 523 | # Deprecated modules which should not be used, separated by a comma. 524 | deprecated-modules= 525 | 526 | # Output a graph (.gv or any supported image format) of external dependencies 527 | # to the given file (report RP0402 must not be disabled). 
528 | ext-import-graph= 529 | 530 | # Output a graph (.gv or any supported image format) of all (i.e. internal and 531 | # external) dependencies to the given file (report RP0402 must not be 532 | # disabled). 533 | import-graph= 534 | 535 | # Output a graph (.gv or any supported image format) of internal dependencies 536 | # to the given file (report RP0402 must not be disabled). 537 | int-import-graph= 538 | 539 | # Force import order to recognize a module as part of the standard 540 | # compatibility libraries. 541 | known-standard-library= 542 | 543 | # Force import order to recognize a module as part of a third party library. 544 | known-third-party=enchant 545 | 546 | # Couples of modules and preferred modules, separated by a comma. 547 | preferred-modules= 548 | 549 | 550 | [CLASSES] 551 | 552 | # Warn about protected attribute access inside special methods 553 | check-protected-access-in-special-methods=no 554 | 555 | # List of method names used to declare (i.e. assign) instance attributes. 556 | defining-attr-methods=__init__, 557 | __new__, 558 | setUp, 559 | __post_init__ 560 | 561 | # List of member names, which should be excluded from the protected access 562 | # warning. 563 | exclude-protected=_asdict, 564 | _fields, 565 | _replace, 566 | _source, 567 | _make 568 | 569 | # List of valid names for the first argument in a class method. 570 | valid-classmethod-first-arg=cls 571 | 572 | # List of valid names for the first argument in a metaclass class method. 573 | valid-metaclass-classmethod-first-arg=cls 574 | 575 | 576 | [DESIGN] 577 | 578 | # List of qualified class names to ignore when counting class parents (see 579 | # R0901) 580 | ignored-parents= 581 | 582 | # Maximum number of arguments for function / method. 583 | max-args=5 584 | 585 | # Maximum number of attributes for a class (see R0902). 586 | max-attributes=7 587 | 588 | # Maximum number of boolean expressions in an if statement (see R0916). 589 | max-bool-expr=5 590 | 591 | # Maximum number of branch for function / method body. 592 | max-branches=12 593 | 594 | # Maximum number of locals for function / method body. 595 | max-locals=15 596 | 597 | # Maximum number of parents for a class (see R0901). 598 | max-parents=7 599 | 600 | # Maximum number of public methods for a class (see R0904). 601 | max-public-methods=20 602 | 603 | # Maximum number of return / yield for function / method body. 604 | max-returns=6 605 | 606 | # Maximum number of statements in function / method body. 607 | max-statements=50 608 | 609 | # Minimum number of public methods for a class (see R0903). 610 | min-public-methods=2 611 | 612 | 613 | [EXCEPTIONS] 614 | 615 | # Exceptions that will emit a warning when being caught. Defaults to 616 | # "BaseException, Exception". 617 | overgeneral-exceptions= 618 | builtins.BaseException, 619 | builtins.Exception, 620 | -------------------------------------------------------------------------------- /greenery/rxelems.py: -------------------------------------------------------------------------------- 1 | """ 2 | Because of the circularity between `Pattern`, `Conc` and `Mult`, all three 3 | need to be in the same source file? 
4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | __all__ = ( 9 | "Conc", 10 | "Mult", 11 | "Pattern", 12 | "from_fsm", 13 | ) 14 | 15 | from dataclasses import dataclass 16 | from enum import Enum, auto 17 | from functools import reduce 18 | from typing import Iterator 19 | 20 | from .bound import INF, Bound 21 | from .charclass import NULLCHARCLASS, Charclass 22 | from .fsm import EPSILON, NULL, Fsm, StateType, from_charclass 23 | from .multiplier import ONE, QM, STAR, ZERO, Multiplier 24 | 25 | 26 | @dataclass(frozen=True) 27 | class Conc: 28 | """ 29 | A `Conc` (short for "concatenation") is a tuple of `Mult`s i.e. an 30 | unbroken string of mults occurring one after the other. 31 | e.g. abcde[^fg]*h{4}[a-z]+(subpattern)(subpattern2) 32 | To express the empty string, use an empty `Conc`, Conc(). 33 | """ 34 | 35 | mults: tuple[Mult, ...] 36 | 37 | def __init__(self, /, *mults: Mult): 38 | object.__setattr__(self, "mults", tuple(mults)) 39 | 40 | def __eq__(self, other: object, /) -> bool: 41 | if not isinstance(other, type(self)): 42 | return NotImplemented 43 | return self.mults == other.mults 44 | 45 | def __hash__(self, /) -> int: 46 | return hash(self.mults) 47 | 48 | def __repr__(self, /) -> str: 49 | args = ", ".join(repr(mult) for mult in self.mults) 50 | return f"Conc({args})" 51 | 52 | def reduce(self) -> Conc: 53 | # pylint: disable=too-many-branches 54 | # pylint: disable=too-many-return-statements 55 | 56 | if self == NULLCONC: 57 | return self 58 | 59 | if self.empty(): 60 | return NULLCONC 61 | 62 | # Try recursively reducing our mults 63 | reduced = tuple(mult.reduce() for mult in self.mults) 64 | if reduced != self.mults: 65 | return Conc(*reduced).reduce() 66 | 67 | # strip out mults which can only match the empty string 68 | for i, mult in enumerate(self.mults): 69 | if ( 70 | # Conc contains "()" (i.e. a `Mult` containing only a `Pattern` 71 | # containing the empty string)? That can be removed 72 | # e.g. "a()b" -> "ab" 73 | mult.multiplicand == Pattern(EMPTYSTRING) 74 | # If a `Mult` has an empty multiplicand, we can only match it 75 | # zero times => empty string => remove it entirely 76 | # e.g. "a[]{0,3}b" -> "ab" 77 | or (mult.multiplicand.empty() and mult.multiplier.min == Bound(0)) 78 | # Failing that, we have a positive multiplicand which we 79 | # intend to match zero times. In this case the only possible 80 | # match is the empty string => remove it 81 | # e.g. "a[XYZ]{0}b" -> "ab" 82 | or mult.multiplier == ZERO 83 | ): 84 | new = self.mults[:i] + self.mults[i + 1 :] 85 | return Conc(*new).reduce() 86 | 87 | # We might be able to combine some mults together or at least simplify 88 | # the multiplier on one of them. 89 | if len(self.mults) > 1: 90 | for i in range(len(self.mults) - 1): 91 | r = self.mults[i] 92 | s = self.mults[i + 1] 93 | 94 | def to_pattern(multiplicand: Pattern | Charclass, /) -> Pattern: 95 | if isinstance(multiplicand, Pattern): 96 | return multiplicand 97 | return Pattern(Conc(Mult(multiplicand, ONE))) 98 | 99 | # so we can do intersection 100 | rm_pattern = to_pattern(r.multiplicand) 101 | sm_pattern = to_pattern(s.multiplicand) 102 | rm_sm_intersection = None 103 | 104 | # If R = S, then we can squish the multipliers together 105 | # e.g. 
ab?b?c -> ab{0,2}c 106 | if rm_pattern == sm_pattern: 107 | squished = Mult(rm_pattern, r.multiplier + s.multiplier) 108 | new = self.mults[:i] + (squished,) + self.mults[i + 2 :] 109 | return Conc(*new).reduce() 110 | 111 | # If R's language is a subset of S's, then R{a,b}S{c,} reduces 112 | # to R{a}S{c,}... 113 | # e.g. \d+\w+ -> \d\w+ 114 | # Do the cheapest checks first 115 | if r.multiplier.min < r.multiplier.max and s.multiplier.max == INF: 116 | rm_sm_intersection = rm_pattern & sm_pattern 117 | if rm_sm_intersection.equivalent(rm_pattern): 118 | trimmed = Mult( 119 | rm_pattern, 120 | Multiplier(r.multiplier.min, r.multiplier.min), 121 | ) 122 | new = self.mults[:i] + (trimmed, s) + self.mults[i + 2 :] 123 | return Conc(*new).reduce() 124 | 125 | # Conversely, if R is superset of S, then R{c,}S{a,b} reduces 126 | # to R{c,}S{a}. 127 | # e.g. [ab]+a? -> [ab]+ 128 | # Do the cheapest checks first 129 | if r.multiplier.max == INF and s.multiplier.min < s.multiplier.max: 130 | if rm_sm_intersection is None: 131 | rm_sm_intersection = rm_pattern & sm_pattern 132 | if rm_sm_intersection.equivalent(sm_pattern): 133 | trimmed = Mult( 134 | sm_pattern, 135 | Multiplier(s.multiplier.min, s.multiplier.min), 136 | ) 137 | new = self.mults[:i] + (r, trimmed) + self.mults[i + 2 :] 138 | return Conc(*new).reduce() 139 | 140 | # Conc contains (among other things) a *singleton* `Mult` containing 141 | # `Pattern` with only one internal `Conc`? Flatten out. 142 | # e.g. "a(d(ab|a*c))" -> "ad(ab|a*c)" 143 | # BUT NOT "a(d(ab|a*c)){2,}" 144 | # AND NOT "a(d(ab|a*c)|y)" 145 | for i, mult in enumerate(self.mults): 146 | if ( 147 | mult.multiplier == ONE 148 | and isinstance(mult.multiplicand, Pattern) 149 | and len(mult.multiplicand.concs) == 1 150 | ): 151 | (conc,) = mult.multiplicand.concs 152 | new = self.mults[:i] + conc.mults + self.mults[i + 1 :] 153 | return Conc(*new).reduce() 154 | 155 | return self 156 | 157 | def to_fsm(self, /) -> Fsm: 158 | return Fsm.concatenate(EPSILON, *(mult.to_fsm() for mult in self.mults)) 159 | 160 | def empty(self, /) -> bool: 161 | return any(mult.empty() for mult in self.mults) 162 | 163 | def __str__(self, /) -> str: 164 | return "".join(str(m) for m in self.mults) 165 | 166 | def common(self, other: Conc, /, suffix: bool = False) -> Conc: 167 | """ 168 | Return the common prefix of these two `Conc`s; that is, the largest 169 | `Conc` which can be safely beheaded() from the front of both. The 170 | result could be `EMPTYSTRING`. 171 | "ZYAA, ZYBB" -> "ZY" 172 | "CZ, CZ" -> "CZ" 173 | "YC, ZC" -> "" 174 | 175 | With the "suffix" flag set, works from the end. E.g.: 176 | "AAZY, BBZY" -> "ZY" 177 | "CZ, CZ" -> "CZ" 178 | "CY, CZ" -> "" 179 | """ 180 | mults = [] 181 | 182 | indices = list(range(min(len(self.mults), len(other.mults)))) 183 | # e.g. [0, 1, 2, 3] 184 | 185 | # Work backwards from the end of both `Conc`s instead. 186 | if suffix: 187 | indices = [-i - 1 for i in indices] # e.g. [-1, -2, -3, -4] 188 | 189 | for i in indices: 190 | x = self.mults[i] 191 | y = other.mults[i] 192 | common = x.common(y) 193 | 194 | # Happens when multiplicands disagree (e.g. "A.common(B)") or if 195 | # the multiplicand is shared but the common multiplier is `ZERO` 196 | # (e.g. "ABZ*.common(CZ)".) 197 | if common.multiplier == ZERO: 198 | break 199 | 200 | mults.append(common) 201 | 202 | # If we did not remove the entirety of both mults, we cannot 203 | # continue. 
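# Illustrative example (assumed, not from the original comments, using literal charclasses A, B, C): for "A{2}B".common("A{3}C"), the first pair of mults contributes A{2}; since that is not the whole of A{3}, the check below breaks out of the loop before B is ever compared with C.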
204 | if common != x or common != y: 205 | break 206 | 207 | if suffix: 208 | mults = mults[::-1] 209 | 210 | return Conc(*mults) 211 | 212 | def dock(self, other: Conc, /) -> Conc: 213 | """ 214 | Subtract another `Conc` from this one. 215 | This is the opposite of concatenation. 216 | For example, if ABC + DEF = ABCDEF, 217 | then logically ABCDEF - DEF = ABC. 218 | """ 219 | 220 | # e.g. self has mults at indices [0, 1, 2, 3, 4, 5, 6] len=7 221 | # e.g. other has mults at indices [0, 1, 2] len=3 222 | new = list(self.mults) 223 | for i in reversed(range(len(other.mults))): # [2, 1, 0] 224 | # e.g. i = 1, j = 7 - 3 + 1 = 5 225 | j = len(self.mults) - len(other.mults) + i 226 | new[j] = new[j].dock(other.mults[i]) 227 | 228 | if new[j].multiplier == ZERO: 229 | # omit that `Mult` entirely since it has been factored out 230 | del new[j] 231 | 232 | # If the subtraction is incomplete but there is more to 233 | # other.mults, then we have a problem. For example, "ABC{2} - BC" 234 | # subtracts the C successfully but leaves something behind, 235 | # then tries to subtract the B too, which isn't possible 236 | elif i: 237 | raise ArithmeticError(f"Can't subtract {other!r} from {self!r}") 238 | 239 | return Conc(*new) 240 | 241 | def behead(self, other: Conc, /) -> Conc: 242 | """ 243 | As with dock() but the other way around. For example, if 244 | ABC + DEF = ABCDEF, then ABCDEF.behead(AB) = CDEF. 245 | """ 246 | # Observe that FEDCBA - BA = FEDC. 247 | return self.reversed().dock(other.reversed()).reversed() 248 | 249 | def reversed(self, /) -> Conc: 250 | return Conc(*[mult.reversed() for mult in reversed(self.mults)]) 251 | 252 | 253 | # We need a new state not already used. 254 | class _Outside(Enum): 255 | """Marker state for use in `from_fsm`.""" 256 | 257 | TOKEN = auto() 258 | 259 | 260 | def from_fsm(f: Fsm) -> Pattern: 261 | """ 262 | Turn the supplied finite state machine into a `Pattern`. This is 263 | accomplished using the Brzozowski algebraic method. 264 | """ 265 | # pylint: disable=too-many-branches 266 | 267 | outside = _Outside.TOKEN 268 | 269 | # The set of strings that would be accepted by this FSM if you started 270 | # at state i is represented by the regex R_i. 271 | # If state i has a sole transition "a" to state j, then we know 272 | # R_i = a R_j. 273 | # If state i is final, then the empty string is also accepted by this 274 | # regex. 275 | # And so on... 276 | 277 | # From this we can build a set of simultaneous equations in len(f.states) 278 | # variables. This system is easily solved for all variables, but we only 279 | # need one: R_a, where a is the starting state. 280 | 281 | # The first thing we need to do is organise the states into order of depth, 282 | # so that when we perform our back-substitutions, we can start with the 283 | # last (deepest) state and therefore finish with R_a. 
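# Illustrative sketch (an assumed two-state machine, ignoring the dead-state terms): if state 0 is initial, state 1 is the only final state, 0 goes to 1 on "a" and 1 loops to itself on "b", the equations are roughly R_0 = a R_1 and R_1 = b R_1 | (empty string). Resolving the self-loop gives R_1 = b*, and back-substituting gives R_0 = ab*, which is the pattern returned for the machine.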
284 | states = [f.initial] 285 | i = 0 286 | while i < len(states): 287 | current = states[i] 288 | if current in f.map: 289 | for symbol in sorted(f.map[current]): 290 | next_state = f.map[current][symbol] 291 | if next_state not in states: 292 | states.append(next_state) 293 | i += 1 294 | 295 | # Our system of equations is represented like so: 296 | brz: dict[StateType, dict[StateType | _Outside, Pattern]] = {} 297 | 298 | for a in f.states: 299 | brz[a] = {} 300 | for b in f.states: 301 | brz[a][b] = NULLPATTERN 302 | 303 | if a in f.finals: 304 | brz[a][outside] = Pattern(EMPTYSTRING) 305 | else: 306 | brz[a][outside] = NULLPATTERN 307 | 308 | # Populate it with some initial data. 309 | for a in f.map: 310 | for charclass in f.map[a]: 311 | b = f.map[a][charclass] 312 | brz[a][b] = Pattern(*brz[a][b].concs, Conc(Mult(charclass, ONE))).reduce() 313 | 314 | # Now perform our back-substitution 315 | for i in reversed(range(len(states))): 316 | a = states[i] 317 | 318 | # Before the equation for R_a can be substituted into the other 319 | # equations, we need to resolve the self-transition (if any). 320 | # e.g. R_a = 0 R_a | 1 R_b | 2 R_c 321 | # becomes R_a = 0*1 R_b | 0*2 R_c 322 | loop = Mult(brz[a][a], STAR) # i.e. "0*" 323 | del brz[a][a] 324 | 325 | for right in brz[a]: 326 | brz[a][right] = Pattern(Conc(loop, Mult(brz[a][right], ONE))).reduce() 327 | 328 | # Note: even if we're down to our final equation, the above step still 329 | # needs to be performed before anything is returned. 330 | 331 | # Now we can substitute this equation into all of the previous ones. 332 | for j in range(i): 333 | b = states[j] 334 | 335 | # e.g. substituting R_a = 0*1 R_b | 0*2 R_c 336 | # into R_b = 3 R_a | 4 R_c | 5 R_d 337 | # yields R_b = 30*1 R_b | (30*2|4) R_c | 5 R_d 338 | univ = brz[b][a] # i.e. "3" 339 | del brz[b][a] 340 | 341 | for right in brz[a]: 342 | brz[b][right] = Pattern( 343 | *brz[b][right].concs, 344 | Conc(Mult(univ, ONE), Mult(brz[a][right], ONE)), 345 | ).reduce() 346 | 347 | return brz[f.initial][outside].reduce() 348 | 349 | 350 | @dataclass(frozen=True) 351 | class Pattern: 352 | """ 353 | A `Pattern` (also known as an "alt", short for "alternation") is a 354 | set of `Conc`s. A `Pattern` expresses multiple alternate possibilities. 355 | When written out as a regex, these would be separated by pipes. A 356 | `Pattern` containing no possibilities is possible and represents a 357 | regular expression matching no strings whatsoever (there is no 358 | conventional string form for this). 359 | 360 | e.g. "abc|def(ghi|jkl)" is an alt containing two `Conc`s: "abc" and 361 | "def(ghi|jkl)". The latter is a `Conc` containing four `Mult`s: "d", 362 | "e", "f" and "(ghi|jkl)". The latter in turn is a `Mult` consisting of 363 | an upper bound 1, a lower bound 1, and a multiplicand which is a new 364 | subpattern, "ghi|jkl". This new subpattern again consists of two 365 | `Conc`s: "ghi" and "jkl".
366 | """ 367 | 368 | concs: frozenset[Conc] 369 | 370 | def __init__(self, /, *concs: Conc): 371 | object.__setattr__(self, "concs", frozenset(concs)) 372 | 373 | def __eq__(self, other: object, /) -> bool: 374 | if not isinstance(other, type(self)): 375 | return NotImplemented 376 | return self.concs == other.concs 377 | 378 | def __hash__(self, /) -> int: 379 | return hash(self.concs) 380 | 381 | def __repr__(self, /) -> str: 382 | args = ", ".join(repr(conc) for conc in self.concs) 383 | return f"Pattern({args})" 384 | 385 | def empty(self, /) -> bool: 386 | return all(conc.empty() for conc in self.concs) 387 | 388 | def intersection(self, other: Pattern, /) -> Pattern: 389 | combined = self.to_fsm() & other.to_fsm() 390 | return from_fsm(combined) 391 | 392 | def __and__(self, other: Pattern, /) -> Pattern: 393 | return self.intersection(other) 394 | 395 | def difference(*elems: Pattern) -> Pattern: 396 | """ 397 | Return a regular expression which matches any string which `self` 398 | matches but none of the strings which `other` matches. 399 | """ 400 | return from_fsm(Fsm.difference(*(elem.to_fsm() for elem in elems))) 401 | 402 | def __sub__(self, other: Pattern, /) -> Pattern: 403 | return self.difference(other) 404 | 405 | def union(self, other: Pattern, /) -> Pattern: 406 | return Pattern(*(self.concs | other.concs)) 407 | 408 | def __or__(self, other: Pattern, /) -> Pattern: 409 | return self.union(other) 410 | 411 | def __str__(self, /) -> str: 412 | if not self.concs: 413 | raise ValueError(f"Can't serialise {self!r}") 414 | return "|".join(sorted(str(conc) for conc in self.concs)) 415 | 416 | def reduce(self, /) -> Pattern: 417 | # pylint: disable=too-many-branches 418 | # pylint: disable=too-many-locals 419 | # pylint: disable=too-many-return-statements 420 | 421 | if self == NULLPATTERN: 422 | return self 423 | 424 | if self.empty(): 425 | return NULLPATTERN 426 | 427 | # Try recursively reducing our internal `Conc`s. 428 | reduced = frozenset(c.reduce() for c in self.concs) 429 | if reduced != self.concs: 430 | return Pattern(*reduced).reduce() 431 | 432 | # If one of our internal concs is empty, remove it 433 | for conc in self.concs: 434 | if conc.empty(): 435 | new = self.concs - {conc} 436 | return Pattern(*new).reduce() 437 | 438 | # If we have just one `Conc` with just one `Mult` with a multiplier of 439 | # 1, and the multiplicand is a `Pattern`, pull that up 440 | if len(self.concs) == 1: 441 | (conc,) = self.concs 442 | if ( 443 | len(conc.mults) == 1 444 | and conc.mults[0].multiplier == ONE 445 | and isinstance(conc.mults[0].multiplicand, Pattern) 446 | ): 447 | return conc.mults[0].multiplicand.reduce() 448 | 449 | # If this `Pattern` contains several `Conc`s each containing just 1 450 | # `Mult` and their multiplicands agree, we may be able to merge the 451 | # multipliers. 452 | # e.g. 
"a{1,2}|a{3,4}|bc" -> "a{1,4}|bc" 453 | oldconcs = list(self.concs) # so we can index the things 454 | for i, conc1 in enumerate(oldconcs): 455 | if len(conc1.mults) != 1: 456 | continue 457 | multiplicand1 = conc1.mults[0].multiplicand 458 | for j in range(i + 1, len(oldconcs)): 459 | conc2 = oldconcs[j] 460 | if len(conc2.mults) != 1: 461 | continue 462 | multiplicand2 = conc2.mults[0].multiplicand 463 | if multiplicand2 != multiplicand1: 464 | continue 465 | multiplicand = multiplicand1 466 | multiplier1 = conc1.mults[0].multiplier 467 | multiplier2 = conc2.mults[0].multiplier 468 | if not multiplier1.canunion(multiplier2): 469 | continue 470 | multiplier = multiplier1 | multiplier2 471 | newconcs = ( 472 | oldconcs[:i] 473 | + oldconcs[i + 1 : j] 474 | + oldconcs[j + 1 :] 475 | + [Conc(Mult(multiplicand, multiplier))] 476 | ) 477 | return Pattern(*newconcs).reduce() 478 | 479 | # If this `Pattern` contains several `Conc`s each containing just 1 480 | # `Mult` each containing just a `Charclass`, with a multiplier of 1, 481 | # then we can merge those `Charclass`es together. 482 | # e.g. "0|[1-9]|ab" -> "[0-9]|ab" 483 | merged_charclass = NULLCHARCLASS 484 | num_merged = 0 485 | rest = [] 486 | for conc in self.concs: 487 | if ( 488 | len(conc.mults) == 1 489 | and conc.mults[0].multiplier == ONE 490 | and isinstance(conc.mults[0].multiplicand, Charclass) 491 | ): 492 | merged_charclass |= conc.mults[0].multiplicand 493 | num_merged += 1 494 | else: 495 | rest.append(conc) 496 | if num_merged >= 2: 497 | rest.append(Conc(Mult(merged_charclass, ONE))) 498 | return Pattern(*rest).reduce() 499 | 500 | # If one of the present `Pattern`'s `Conc`s is the empty string... 501 | if EMPTYSTRING in self.concs: 502 | for conc in self.concs: 503 | # ...and there is another `Conc` 504 | # with a single `Mult` whose lower bound is 0... 505 | if len(conc.mults) == 1 and conc.mults[0].multiplier.min == Bound(0): 506 | # Then we can omit the empty string. 507 | # E.g. "|(ab)*|def" => "(ab)*|def". 508 | return Pattern(*(self.concs - {EMPTYSTRING})).reduce() 509 | 510 | for conc in self.concs: 511 | # ...and there is another `Conc` 512 | # with a single `Mult` whose lower bound is 1... 513 | if len(conc.mults) == 1 and conc.mults[0].multiplier.min == Bound(1): 514 | # Then we can merge the empty string into that. 515 | # E.g. "|(ab)+|def" => "(ab)*|def". 516 | merged_conc = Conc( 517 | Mult(conc.mults[0].multiplicand, conc.mults[0].multiplier * QM) 518 | ) 519 | return Pattern( 520 | *(self.concs - {EMPTYSTRING, conc} | {merged_conc}) 521 | ).reduce() 522 | 523 | # If the present `Pattern`'s `Conc`s all have a common prefix, split 524 | # that out. This increases the depth of the object 525 | # but it is still arguably simpler/ripe for further reduction 526 | # e.g. "abc|ade" -> a(bc|de)" 527 | if len(self.concs) > 1: 528 | prefix = self._commonconc() 529 | if prefix != EMPTYSTRING: 530 | leftovers = self.behead(prefix) 531 | mults = prefix.mults + (Mult(leftovers, ONE),) 532 | return Pattern(Conc(*mults)).reduce() 533 | 534 | # Same but for suffixes. 535 | # e.g. "xyz|stz -> (xy|st)z" 536 | suffix = self._commonconc(suffix=True) 537 | if suffix != EMPTYSTRING: 538 | leftovers = self.dock(suffix) 539 | mults = (Mult(leftovers, ONE),) + suffix.mults 540 | return Pattern(Conc(*mults)).reduce() 541 | 542 | return self 543 | 544 | def symmetric_difference(*elems: Pattern) -> Pattern: 545 | """ 546 | Return a regular expression matching only the strings recognised by 547 | `self` or `other` but not both. 
548 | """ 549 | return from_fsm(Fsm.symmetric_difference(*(elem.to_fsm() for elem in elems))) 550 | 551 | def __xor__(self, other: Pattern, /) -> Pattern: 552 | return self.symmetric_difference(other) 553 | 554 | def dock(self, other: Conc, /) -> Pattern: 555 | """ 556 | The opposite of concatenation. Remove a common suffix from the 557 | present `Pattern`; that is, from each of its constituent concs. 558 | 559 | AYZ|BYZ|CYZ - YZ -> A|B|C. 560 | """ 561 | return Pattern(*[conc.dock(other) for conc in self.concs]) 562 | 563 | def behead(self, other: Conc, /) -> Pattern: 564 | """ 565 | Like dock() but the other way around. Remove a common prefix from 566 | the present `Pattern`; that is, from each of its constituent concs. 567 | 568 | ZA|ZB|ZC.behead(Z) -> A|B|C 569 | """ 570 | return Pattern(*[conc.behead(other) for conc in self.concs]) 571 | 572 | def _commonconc(self, /, suffix: bool = False) -> Conc: 573 | """ 574 | Find the longest `Conc` which acts as prefix to every `Conc` in 575 | this `Pattern`. This could be `EMPTYSTRING`. Return the common 576 | prefix along with all the leftovers after truncating that common 577 | prefix from each `Conc`. 578 | 579 | "ZA|ZB|ZC" -> "Z", "(A|B|C)" 580 | "ZA|ZB|ZC|Z" -> "Z", "(A|B|C|)" 581 | "CZ|CZ" -> "CZ", "()" 582 | 583 | If "suffix" is True, the same result but for suffixes. 584 | """ 585 | if not self.concs: 586 | raise ValueError(f"Can't call _commonconc on {self!r}") 587 | 588 | return reduce(lambda x, y: x.common(y, suffix=suffix), self.concs) 589 | 590 | def to_fsm(self, /) -> Fsm: 591 | return Fsm.union(NULL, *(conc.to_fsm() for conc in self.concs)) 592 | 593 | def reversed(self, /) -> Pattern: 594 | return Pattern(*(c.reversed() for c in self.concs)) 595 | 596 | def copy(self, /) -> Pattern: 597 | """ 598 | For completeness only, since `set.copy()` also exists. `Pattern`s 599 | are immutable, so I can see only very odd reasons to need this 600 | """ 601 | return Pattern(*self.concs) 602 | 603 | def equivalent(self, other: Pattern, /) -> bool: 604 | """ 605 | Two `Pattern`s are equivalent if they recognise the same strings. 606 | Note that in the general case this is actually quite an intensive 607 | calculation, but far from unsolvable, as we demonstrate here: 608 | """ 609 | return self.to_fsm().equivalent(other.to_fsm()) 610 | 611 | def times(self, multiplier: Multiplier, /) -> Pattern: 612 | """ 613 | Equivalent to repeated concatenation. Multiplier consists of a 614 | minimum and a maximum; maximum may be infinite (for Kleene star 615 | closure). Call using "a = b * qm" 616 | """ 617 | return Pattern(Conc(Mult(self, multiplier))) 618 | 619 | def __mul__(self, multiplier: Multiplier, /) -> Pattern: 620 | return self.times(multiplier) 621 | 622 | def everythingbut(self, /) -> Pattern: 623 | """ 624 | Return a `Pattern` which will match any string not matched by 625 | `self`, and which will not match any string matched by `self`. 626 | Another task which is very difficult in general (and typically 627 | returns utter garbage when actually printed), but becomes trivial 628 | to code thanks to FSM routines. 
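For example (illustrative only): the complement of "a" matches the empty string, every other single character, and every string of length two or more (including "aa").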
629 | """ 630 | return from_fsm(self.to_fsm().everythingbut()) 631 | 632 | def derive(self, string: str, /) -> Pattern: 633 | return from_fsm(self.to_fsm().derive(string)) 634 | 635 | def isdisjoint(self, other: Pattern, /) -> bool: 636 | """ 637 | Treat `self` and `other` as sets of strings and see if they are 638 | disjoint 639 | """ 640 | return self.to_fsm().isdisjoint(other.to_fsm()) 641 | 642 | def matches(self, string: str, /) -> bool: 643 | return self.to_fsm().accepts(string) 644 | 645 | def __contains__(self, string: str, /) -> bool: 646 | """ 647 | This lets you use the syntax `"a" in pattern` to see whether the 648 | string "a" is in the set of strings matched by `pattern`. 649 | """ 650 | return self.matches(string) 651 | 652 | # pylint: disable=fixme 653 | # TODO: this is a misuse of __reversed__ 654 | # and should be removed next major version 655 | def __reversed__(self, /) -> Pattern: 656 | return self.reversed() 657 | 658 | def cardinality(self, /) -> int: 659 | """ 660 | Consider the regular expression as a set of strings and return the 661 | cardinality of that set, or raise an OverflowError if there are 662 | infinitely many. 663 | """ 664 | # There is no way to do this other than converting to an FSM, because 665 | # the `Pattern` may allow duplicate routes, such as "a|a". 666 | return self.to_fsm().cardinality() 667 | 668 | def __len__(self, /) -> int: 669 | return self.cardinality() 670 | 671 | def strings(self, /, *, otherchar: str | None = None) -> Iterator[str]: 672 | """ 673 | Each time next() is called on this iterator, a new string is 674 | returned which this `Pattern` can match. `StopIteration` 675 | is raised once all such strings have been returned, although a 676 | regex with a * in may match infinitely many strings. 677 | """ 678 | otherchars = [] if otherchar is None else [otherchar] 679 | return self.to_fsm().strings(otherchars) 680 | 681 | def __iter__(self, /) -> Iterator[str]: 682 | """ 683 | This allows you to do `for string in pattern` as a list 684 | comprehension! 685 | """ 686 | return self.strings() 687 | 688 | 689 | @dataclass(frozen=True) 690 | class Mult: 691 | """ 692 | A `Mult` is a combination of a multiplicand with a multiplier (a min 693 | and a max). The vast majority of characters in regular expressions 694 | occur without a specific multiplier, which is implicitly equivalent to 695 | a min of 1 and a max of 1, but many more have explicit multipliers like 696 | "*" (min = 0, max = INF) and so on. 697 | 698 | e.g. a, b{2}, c?, d*, [efg]{2,5}, f{2,}, (anysubpattern)+, .*, ... 699 | """ 700 | 701 | multiplicand: Charclass | Pattern 702 | multiplier: Multiplier 703 | 704 | def __eq__(self, other: object, /) -> bool: 705 | if not isinstance(other, type(self)): 706 | return NotImplemented 707 | return ( 708 | self.multiplicand == other.multiplicand 709 | and self.multiplier == other.multiplier 710 | ) 711 | 712 | def __hash__(self, /) -> int: 713 | return hash((self.multiplicand, self.multiplier)) 714 | 715 | def __repr__(self, /) -> str: 716 | return f"Mult({self.multiplicand!r}, {self.multiplier!r})" 717 | 718 | def dock(self, other: Mult, /) -> Mult: 719 | """ 720 | "Dock" another `Mult` from this one (i.e. remove part of the tail) 721 | and return the result. The reverse of concatenation. This is a lot 722 | trickier. 723 | e.g. 
a{4,5} - a{3} = a{1,2} 724 | """ 725 | if other.multiplicand != self.multiplicand: 726 | raise ArithmeticError(f"Can't subtract {other!r} from {self!r}") 727 | return Mult(self.multiplicand, self.multiplier - other.multiplier) 728 | 729 | def common(self, other: Mult, /) -> Mult: 730 | """ 731 | Return the common part of these two mults. This is the largest 732 | `Mult` which can be safely subtracted from both the originals. The 733 | multiplier on this `Mult` could be `ZERO`: this is the case if, for 734 | example, the multiplicands disagree. 735 | """ 736 | if self.multiplicand == other.multiplicand: 737 | return Mult(self.multiplicand, self.multiplier.common(other.multiplier)) 738 | 739 | # Multiplicands disagree, no common part at all. 740 | return Mult(NULLCHARCLASS, ZERO) 741 | 742 | def empty(self, /) -> bool: 743 | return self.multiplicand.empty() and self.multiplier.min > Bound(0) 744 | 745 | def reduce(self, /) -> Mult: 746 | if self == NULLMULT: 747 | return self 748 | 749 | # Can't match anything: reduce to empty `Mult` 750 | if self.empty(): 751 | return NULLMULT 752 | 753 | # Try recursively reducing our multiplicand 754 | reduced = self.multiplicand.reduce() 755 | if reduced != self.multiplicand: 756 | return Mult(reduced, self.multiplier).reduce() 757 | 758 | # If our multiplicand is a `Pattern` containing an empty `Conc` 759 | # we can pull that "optional" bit out into our own multiplier 760 | # instead. 761 | # e.g. (A|B|C|) -> (A|B|C)? 762 | # e.g. (A|B|C|){2} -> (A|B|C){0,2} 763 | if ( 764 | isinstance(self.multiplicand, Pattern) 765 | and EMPTYSTRING in self.multiplicand.concs 766 | and self.multiplier.canmultiplyby(QM) 767 | ): 768 | return Mult( 769 | Pattern(*(conc for conc in self.multiplicand.concs if conc.mults)), 770 | self.multiplier * QM, 771 | ).reduce() 772 | 773 | # If our multiplicand is a `Pattern` containing a single `Conc` 774 | # containing a single `Mult`, we can scrap the `Pattern` in favour of 775 | # that `Mult`'s multiplicand 776 | # e.g. ([ab])* -> [ab]* 777 | # e.g. ((a))* -> (a)* -> a* 778 | # NOTE: this logic lives here at the `Mult` level, NOT in 779 | # `Pattern.reduce` because we want to return another `Mult` (same type) 780 | if isinstance(self.multiplicand, Pattern) and len(self.multiplicand.concs) == 1: 781 | (conc,) = self.multiplicand.concs 782 | if len(conc.mults) == 1 and conc.mults[0].multiplier.canmultiplyby( 783 | self.multiplier 784 | ): 785 | return Mult( 786 | conc.mults[0].multiplicand, 787 | conc.mults[0].multiplier * self.multiplier, 788 | ).reduce() 789 | 790 | # no reduction possible 791 | return self 792 | 793 | def __str__(self, /) -> str: 794 | if isinstance(self.multiplicand, Pattern): 795 | return f"({self.multiplicand}){self.multiplier}" 796 | if isinstance(self.multiplicand, Charclass): 797 | return f"{self.multiplicand}{self.multiplier}" 798 | raise TypeError(f"Unknown type {type(self.multiplicand)}") 799 | 800 | def to_fsm(self, /) -> Fsm: 801 | # worked example: (min, max) = (5, 7) or (5, INF) 802 | # (mandatory, optional) = (5, 2) or (5, INF) 803 | 804 | unit = ( 805 | from_charclass(self.multiplicand) 806 | if isinstance(self.multiplicand, Charclass) 807 | else self.multiplicand.to_fsm() 808 | ) 809 | # accepts e.g. "ab" 810 | 811 | # Yuck. `mandatory` cannot be infinite: it's just a natural number. 812 | # However, it uses `Bound`, which describes co-naturals. 
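# (Purely illustrative, continuing the worked example above: with a unit accepting "ab" and a multiplier of {5,7}, `mandatory` below accepts exactly five copies of "ab" and `optional` accepts zero, one or two further copies.)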
813 | assert self.multiplier.mandatory.v is not None 814 | 815 | # accepts "ababababab" 816 | mandatory = unit.times(self.multiplier.mandatory.v) 817 | 818 | # unlimited additional copies 819 | if self.multiplier.optional == INF: 820 | optional = unit.star() 821 | # accepts "(ab)*" 822 | 823 | else: 824 | optional = EPSILON | unit 825 | # accepts "(ab)?" 826 | 827 | # Implied by `!= INF`. 828 | assert self.multiplier.optional.v is not None 829 | 830 | optional = optional.times(self.multiplier.optional.v) 831 | # accepts "(ab)?(ab)?" 832 | 833 | return mandatory.concatenate(optional) 834 | 835 | def reversed(self, /) -> Mult: 836 | return Mult(self.multiplicand.reversed(), self.multiplier) 837 | 838 | 839 | NULLMULT = Mult(NULLCHARCLASS, ONE) 840 | NULLCONC = Conc(NULLMULT) 841 | EMPTYSTRING = Conc() 842 | NULLPATTERN = Pattern(NULLCONC) 843 | -------------------------------------------------------------------------------- /greenery/fsm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finite state machine library, intended to be used by `greenery` only 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | __all__ = ( 8 | "Fsm", 9 | "StateType", 10 | "EPSILON", 11 | "NULL", 12 | "Charclass", 13 | ) 14 | 15 | from dataclasses import dataclass 16 | from typing import ( 17 | Callable, 18 | ClassVar, 19 | Collection, 20 | Dict, 21 | Iterable, 22 | Iterator, 23 | List, 24 | Mapping, 25 | TypeVar, 26 | ) 27 | 28 | from .charclass import DOT, Charclass, repartition 29 | 30 | AlphaType = Charclass 31 | StateType = int 32 | M = TypeVar("M") 33 | """Meta-state type for crawl(). Can be anything.""" 34 | 35 | 36 | def unify_alphabets(fsms: Iterable[Fsm], /) -> List[Fsm]: 37 | charclasses = set() 38 | for fsm in fsms: 39 | for charclass in fsm.alphabet: 40 | charclasses.add(charclass) 41 | 42 | partition = repartition(charclasses) 43 | # maps old Charclasses to collections of new Charclasses 44 | 45 | return [fsm.replace_alphabet(partition) for fsm in fsms] 46 | 47 | 48 | # pylint: disable=too-many-public-methods,too-many-branches,fixme 49 | @dataclass(frozen=True, init=False) 50 | class Fsm: 51 | """ 52 | A Finite State Machine or FSM has an alphabet and a set of states. At 53 | any given moment, the FSM is in one state. When passed a symbol from 54 | the alphabet, the FSM jumps to another state (or possibly the same 55 | state). A map (Python dictionary) indicates where to jump. 56 | One state is nominated as a starting state. Zero or more states are 57 | nominated as final states. If, after consuming a string of symbols, 58 | the FSM is in a final state, then it is said to "accept" the string. 59 | This class also has some pretty powerful methods which allow FSMs to 60 | be concatenated, alternated between, multiplied, looped (Kleene star 61 | closure), intersected, and simplified. 62 | The majority of these methods are available using operator overloads. 
63 | """ 64 | 65 | alphabet: frozenset[AlphaType] 66 | states: frozenset[StateType] 67 | initial: StateType 68 | finals: frozenset[StateType] 69 | map: Mapping[StateType, Mapping[AlphaType, StateType]] 70 | 71 | # noinspection PyShadowingBuiltins 72 | # pylint: disable=too-many-arguments 73 | def __init__( 74 | self, 75 | /, 76 | *, 77 | alphabet: Iterable[AlphaType], 78 | states: Iterable[StateType], 79 | initial: StateType, 80 | finals: Iterable[StateType], 81 | # pylint: disable=redefined-builtin 82 | map: Mapping[StateType, Mapping[AlphaType, StateType]], 83 | ) -> None: 84 | """ 85 | `alphabet` is an iterable of symbols the FSM can be fed. 86 | `states` is the set of states for the FSM 87 | `initial` is the initial state 88 | `finals` is the set of accepting states 89 | `map` must be complete 90 | """ 91 | alphabet = frozenset(alphabet) 92 | states = frozenset(states) 93 | finals = frozenset(finals) 94 | 95 | # Validation. Thanks to immutability, this only needs to be carried out 96 | # once. 97 | if initial not in states: 98 | raise ValueError(f"Initial state {initial!r} must be one of {states!r}") 99 | if not finals.issubset(states): 100 | raise ValueError(f"Final states {finals!r} must be a subset of {states!r}") 101 | for state, state_trans in map.items(): 102 | if state not in states: 103 | raise ValueError(f"Transition from unknown state {state!r}") 104 | for symbol, dest in state_trans.items(): 105 | if symbol not in alphabet: 106 | raise ValueError( 107 | f"Invalid symbol {symbol!r}" 108 | f" in transition from {state!r}" 109 | f" to {dest!r}" 110 | ) 111 | if dest not in states: 112 | raise ValueError( 113 | f"Transition for state {state!r}" 114 | f" and symbol {symbol!r}" 115 | f" leads to {dest!r}," 116 | " which is not a state" 117 | ) 118 | for state in states: 119 | if state not in map: 120 | raise ValueError(f"State {state!r} missing from map") 121 | for charclass in alphabet: 122 | if charclass not in map[state]: 123 | raise ValueError( 124 | f"Symbol {charclass!r} missing from map[{state!r}]" 125 | ) 126 | 127 | # Check that the charclasses form a proper partition of all of Unicode 128 | unified = Charclass() 129 | for charclass in alphabet: 130 | if unified & charclass != Charclass(): 131 | raise ValueError(f"Alphabet {alphabet!r} has overlaps") 132 | unified |= charclass 133 | if unified != DOT: 134 | raise ValueError(f"Alphabet {alphabet!r} is not a proper partition") 135 | 136 | # Initialise the hard way due to immutability. 137 | object.__setattr__(self, "alphabet", alphabet) 138 | object.__setattr__(self, "states", states) 139 | object.__setattr__(self, "initial", initial) 140 | object.__setattr__(self, "finals", finals) 141 | object.__setattr__(self, "map", map) 142 | 143 | def accepts(self, string: str, /) -> bool: 144 | """ 145 | Test whether the present FSM accepts the supplied string (iterable 146 | of symbols). Equivalently, consider `self` as a possibly-infinite 147 | set of strings and test whether `string` is a member of it. This is 148 | actually mainly used for unit testing purposes. 149 | """ 150 | state = self.initial 151 | for char in string: 152 | for charclass in self.map[state]: 153 | if charclass.accepts(char): 154 | state = self.map[state][charclass] 155 | break 156 | return state in self.finals 157 | 158 | def __contains__(self, string: str, /) -> bool: 159 | """ 160 | This lets you use the syntax `"a" in fsm1` to see whether the 161 | string "a" is in the set of strings accepted by `fsm1`. 
162 | """ 163 | return self.accepts(string) 164 | 165 | def reduce(self, /) -> Fsm: 166 | """ 167 | A result by Brzozowski (1963) shows that a minimal finite state 168 | machine equivalent to the original can be obtained by reversing the 169 | original twice. 170 | """ 171 | return self.reversed().reversed() 172 | 173 | def __repr__(self, /) -> str: 174 | args = ", ".join( 175 | [ 176 | f"alphabet={self.alphabet!r}", 177 | f"states={self.states!r}", 178 | f"initial={self.initial!r}", 179 | f"finals={self.finals!r}", 180 | f"map={self.map!r}", 181 | ] 182 | ) 183 | return f"Fsm({args})" 184 | 185 | # The Python `__eq__` + `__hash__` contract requires that value-equality 186 | # implies hash-equality. `Fsm` `__eq__` implementation currently represents 187 | # equality of the set of accepted strings, independent of specific state 188 | # labels or unused members of the alphabet. This is not trivial to hash. 189 | # Regarding the type suppression, see 190 | # https://github.com/python/mypy/issues/4266 191 | __hash__: ClassVar[None] = None # type: ignore 192 | 193 | def __str__(self, /) -> str: 194 | rows = [] 195 | 196 | sorted_alphabet = sorted(self.alphabet) 197 | 198 | # top row 199 | row = ["", "name", "final?"] 200 | row.extend(str(symbol) for symbol in sorted_alphabet) 201 | rows.append(row) 202 | 203 | # other rows 204 | for state in self.states: 205 | row = [] 206 | if state == self.initial: 207 | row.append("*") 208 | else: 209 | row.append("") 210 | row.append(str(state)) 211 | if state in self.finals: 212 | row.append("True") 213 | else: 214 | row.append("False") 215 | for symbol in sorted_alphabet: 216 | row.append(str(self.map[state][symbol])) 217 | rows.append(row) 218 | 219 | # column widths 220 | colwidths = [] 221 | for x in range(len(rows[0])): 222 | colwidths.append(max(len(str(row[x])) for y, row in enumerate(rows)) + 1) 223 | 224 | # apply padding 225 | for y, row in enumerate(rows): 226 | for x, col in enumerate(row): 227 | rows[y][x] = col.ljust(colwidths[x]) 228 | 229 | # horizontal line 230 | rows.insert(1, ["-" * colwidth for colwidth in colwidths]) 231 | 232 | return "".join("".join(row) + "\n" for row in rows) 233 | 234 | def concatenate(*fsms: Fsm) -> Fsm: 235 | """ 236 | Concatenate arbitrarily many finite state machines together. 237 | """ 238 | unified_fsms = unify_alphabets(fsms) 239 | 240 | def connect_all( 241 | i: int, 242 | substate: StateType, 243 | ) -> Iterable[tuple[int, StateType]]: 244 | """ 245 | Take a state in the numbered FSM and return a set containing 246 | it, plus (if it's final) the first state from the next FSM, 247 | plus (if that's final) the first state from the next but one 248 | FSM, plus... 249 | """ 250 | result = {(i, substate)} 251 | while i < len(unified_fsms) - 1 and substate in unified_fsms[i].finals: 252 | i += 1 253 | substate = unified_fsms[i].initial 254 | result.add((i, substate)) 255 | return result 256 | 257 | # Use a superset containing states from all FSMs at once. 258 | # We start at the start of the first FSM. If this state is final in the 259 | # first FSM, then we are also at the start of the second FSM. And so 260 | # on. 
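# Illustrative example (assumed, not from the original comments): when concatenating machines for "a*" and "b", the initial metastate contains the start of the "a*" machine *and* the start of the "b" machine, because the former is already a final state.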
261 | initial = frozenset( 262 | connect_all(0, unified_fsms[0].initial) if unified_fsms else () 263 | ) 264 | 265 | def final(state: frozenset[tuple[int, StateType]]) -> bool: 266 | """If you're in a final state of the final FSM, it's final""" 267 | return any( 268 | i == len(unified_fsms) - 1 and substate in unified_fsms[i].finals 269 | for i, substate in state 270 | ) 271 | 272 | def follow( 273 | current: frozenset[tuple[int, StateType]], 274 | symbol: AlphaType, 275 | ) -> frozenset[tuple[int, StateType]]: 276 | """ 277 | Follow the collection of states through all FSMs at once, 278 | jumping to the next FSM if we reach the end of the current one 279 | """ 280 | next_metastate: set[tuple[int, StateType]] = set() 281 | for i, substate in current: 282 | next_metastate.update( 283 | connect_all(i, unified_fsms[i].map[substate][symbol]) 284 | ) 285 | 286 | return frozenset(next_metastate) 287 | 288 | alphabet = unified_fsms[0].alphabet if len(unified_fsms) > 0 else {~Charclass()} 289 | 290 | return crawl(alphabet, initial, final, follow).reduce() 291 | 292 | def __add__(self, other: Fsm, /) -> Fsm: 293 | """ 294 | Concatenate two finite state machines together. 295 | For example, if self accepts "0*" and other accepts "1+(0|1)", 296 | will return a finite state machine accepting "0*1+(0|1)". 297 | Accomplished by effectively following non-deterministically. 298 | Call using "fsm3 = fsm1 + fsm2" 299 | """ 300 | return self.concatenate(other) 301 | 302 | def star(self, /) -> Fsm: 303 | """ 304 | If the present FSM accepts X, returns an FSM accepting X* (i.e. 0 305 | or more Xes). This is NOT as simple as naively connecting the final 306 | states back to the initial state: see (b*ab)* for example. 307 | """ 308 | alphabet = self.alphabet 309 | 310 | initial: Collection[StateType] = {self.initial} 311 | 312 | def follow( 313 | state: Collection[StateType], 314 | symbol: AlphaType, 315 | ) -> Collection[StateType]: 316 | next_states = set() 317 | 318 | for substate in state: 319 | next_states.add(self.map[substate][symbol]) 320 | 321 | # If one of our substates is final, then we can also consider 322 | # transitions from the initial state of the original FSM. 323 | if substate in self.finals: 324 | next_states.add(self.map[self.initial][symbol]) 325 | 326 | return frozenset(next_states) 327 | 328 | def final(state: Collection[StateType]) -> bool: 329 | return any(substate in self.finals for substate in state) 330 | 331 | return crawl(alphabet, initial, final, follow) | EPSILON 332 | 333 | def times(self, multiplier: int, /) -> Fsm: 334 | """ 335 | Given an FSM and a multiplier, return the multiplied FSM. 
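For example (illustrative only): if the present FSM accepts just "ab", then times(2) returns an FSM accepting just "abab".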
336 | """ 337 | if multiplier < 0: 338 | raise ArithmeticError(f"Can't multiply an FSM by {multiplier!r}") 339 | 340 | alphabet = self.alphabet 341 | 342 | # metastate is a set of iterations+states 343 | initial: Collection[tuple[StateType, int]] = {(self.initial, 0)} 344 | 345 | def final(state: Collection[tuple[StateType, int]]) -> bool: 346 | """ 347 | If the initial state is final then multiplying doesn't alter 348 | that 349 | """ 350 | return any( 351 | substate == self.initial 352 | and (self.initial in self.finals or iteration == multiplier) 353 | for substate, iteration in state 354 | ) 355 | 356 | def follow( 357 | current: Collection[tuple[StateType, int]], 358 | symbol: AlphaType, 359 | ) -> Collection[tuple[StateType, int]]: 360 | next_metastate = [] 361 | for substate, iteration in current: 362 | if iteration < multiplier: 363 | next_metastate.append((self.map[substate][symbol], iteration)) 364 | # final of self? merge with initial on next iteration 365 | if self.map[substate][symbol] in self.finals: 366 | next_metastate.append((self.initial, iteration + 1)) 367 | return frozenset(next_metastate) 368 | 369 | return crawl(alphabet, initial, final, follow).reduce() 370 | 371 | def __mul__(self, multiplier: int, /) -> Fsm: 372 | """ 373 | Given an FSM and a multiplier, return the multiplied FSM. 374 | """ 375 | return self.times(multiplier) 376 | 377 | def union(*fsms: Fsm) -> Fsm: 378 | """ 379 | Treat `fsms` as a collection of arbitrary FSMs and return the union 380 | FSM. Can be used as `fsm1.union(fsm2, ...)` or 381 | `fsm.union(fsm1, ...)`. `fsms` may be empty. 382 | """ 383 | return parallel(fsms, any) 384 | 385 | def __or__(self, other: Fsm, /) -> Fsm: 386 | """ 387 | Alternation. 388 | Return a finite state machine which accepts any sequence of symbols 389 | that is accepted by either self or other. Note that the set of 390 | strings recognised by the two FSMs undergoes a set union. 391 | Call using "fsm3 = fsm1 | fsm2" 392 | """ 393 | return self.union(other) 394 | 395 | def intersection(*fsms: Fsm) -> Fsm: 396 | """ 397 | Intersection. 398 | Take FSMs and AND them together. That is, return an FSM which 399 | accepts any sequence of symbols that is accepted by both of the 400 | original FSMs. Note that the set of strings recognised by the two 401 | FSMs undergoes a set intersection operation. 402 | Call using "fsm3 = fsm1 & fsm2" 403 | """ 404 | return parallel(fsms, all) 405 | 406 | def __and__(self, other: Fsm, /) -> Fsm: 407 | """ 408 | Treat the FSMs as sets of strings and return the intersection of 409 | those sets in the form of a new FSM. 410 | """ 411 | return self.intersection(other) 412 | 413 | def symmetric_difference(*fsms: Fsm) -> Fsm: 414 | """ 415 | Treat `fsms` as a collection of sets of strings and compute the 416 | symmetric difference of them all. The python set method only allows 417 | two sets to be operated on at once, but we go the extra mile since 418 | it's not too hard. 419 | """ 420 | return parallel(fsms, lambda accepts: (accepts.count(True) % 2) == 1) 421 | 422 | def __xor__(self, other: Fsm, /) -> Fsm: 423 | """ 424 | Symmetric difference. Returns an FSM which recognises only the 425 | strings recognised by `self` or `other` but not both. 426 | """ 427 | return self.symmetric_difference(other) 428 | 429 | def everythingbut(self, /) -> Fsm: 430 | """ 431 | Return a finite state machine which will accept any string NOT 432 | accepted by self, and will not accept any string accepted by self. 
433 | """ 434 | alphabet = self.alphabet 435 | initial = self.initial 436 | 437 | def follow( 438 | current: StateType, 439 | symbol: AlphaType, 440 | ) -> StateType: 441 | return self.map[current][symbol] 442 | 443 | # state is final unless the original was 444 | def final(state: StateType) -> bool: 445 | return state not in self.finals 446 | 447 | return crawl(alphabet, initial, final, follow).reduce() 448 | 449 | def reversed(self, /) -> Fsm: 450 | """ 451 | Return a new FSM such that for every string that self accepts (e.g. 452 | "beer"), the new FSM accepts the reversed string ("reeb"). 453 | """ 454 | alphabet = self.alphabet 455 | 456 | # Start from a composite "state-set" consisting of all final states. 457 | # If there are no final states, this set is empty and we'll find that 458 | # no other states get generated. 459 | initial = frozenset(self.finals) 460 | 461 | # Find every possible way to reach the current state-set 462 | # using this symbol. 463 | def follow( 464 | current: frozenset[StateType], 465 | symbol: AlphaType, 466 | ) -> frozenset[StateType]: 467 | next_states = frozenset( 468 | [ 469 | prev 470 | for prev in self.map 471 | for state in current 472 | if self.map[prev][symbol] == state 473 | ] 474 | ) 475 | return next_states 476 | 477 | # A state-set is final if the initial state is in it. 478 | def final(state: frozenset[StateType]) -> bool: 479 | return self.initial in state 480 | 481 | # Man, crawl() is the best! 482 | return crawl(alphabet, initial, final, follow) 483 | # Do not reduce() the result, since reduce() calls us in turn 484 | 485 | def islive(self, /, state: StateType) -> bool: 486 | """A state is "live" if a final state can be reached from it.""" 487 | reachable = [state] 488 | i = 0 489 | while i < len(reachable): 490 | current = reachable[i] 491 | if current in self.finals: 492 | return True 493 | for symbol in self.map[current]: 494 | next_state = self.map[current][symbol] 495 | if next_state not in reachable: 496 | reachable.append(next_state) 497 | i += 1 498 | return False 499 | 500 | def empty(self, /) -> bool: 501 | """ 502 | An FSM is empty if it recognises no strings. An FSM may be 503 | arbitrarily complicated and have arbitrarily many final states 504 | while still recognising no strings because those final states may 505 | all be inaccessible from the initial state. Equally, an FSM may be 506 | non-empty despite having an empty alphabet if the initial state is 507 | final. 508 | """ 509 | return not self.islive(self.initial) 510 | 511 | def strings(self, otherchars: Iterable[str]) -> Iterator[str]: 512 | """ 513 | Generate strings that this FSM accepts. Note that for our purposes a 514 | string is a sequence of Unicode characters, NOT a list of Charclasses. 515 | 516 | Since 517 | there may be infinitely many of these, we use a generator instead of 518 | constructing a static list. Strings will be sorted in order of 519 | length and then lexically. This procedure uses arbitrary amounts of 520 | memory but is very fast. There may be more efficient ways to do 521 | this, that I haven't investigated yet. You can use this in list 522 | comprehensions. 523 | """ 524 | 525 | # Most FSMs have at least one "dead state". 526 | # Once you reach a dead state, you can no 527 | # longer reach a final state. Since many strings may end up here, it's 528 | # advantageous to constrain our search to live states only. 529 | livestates = set(state for state in self.states if self.islive(state)) 530 | 531 | # We store a list of tuples.
Each tuple consists of an input string and 532 | # the state that this input string leads to. This means we don't have 533 | # to run the state machine from the very beginning every time we want 534 | # to check a new string. 535 | strings: list[tuple[str, StateType]] = [] 536 | 537 | # Initial entry (or possibly not, in which case this is a short one) 538 | cstate: StateType = self.initial 539 | cstring: str = "" 540 | if cstate in livestates: 541 | if cstate in self.finals: 542 | yield cstring 543 | strings.append((cstring, cstate)) 544 | 545 | # Fixed point calculation 546 | i = 0 547 | while i < len(strings): 548 | cstring, cstate = strings[i] 549 | 550 | for charclass in sorted(self.map[cstate]): 551 | # TODO: scrap otherchars as a concept? 552 | chars = otherchars if charclass.negated else charclass.get_chars() 553 | for char in chars: 554 | nstate = self.map[cstate][charclass] 555 | nstring = cstring + char 556 | if nstate in livestates: 557 | if nstate in self.finals: 558 | yield nstring 559 | strings.append((nstring, nstate)) 560 | i += 1 561 | 562 | def __iter__(self, /) -> Iterator[str]: 563 | """ 564 | This allows you to do `for string in fsm1` as a list comprehension! 565 | """ 566 | return self.strings([]) 567 | 568 | def equivalent(self, other: Fsm, /) -> bool: 569 | """ 570 | Two FSMs are considered equivalent if they recognise the same 571 | strings. Or, to put it another way, if their symmetric difference 572 | recognises no strings. 573 | """ 574 | return (self ^ other).empty() 575 | 576 | def __eq__(self, other: object, /) -> bool: 577 | """ 578 | You can use `fsm1 == fsm2` to determine whether two FSMs recognise 579 | the same strings. 580 | """ 581 | if not isinstance(other, Fsm): 582 | return NotImplemented 583 | return self.equivalent(other) 584 | 585 | def different(self, other: Fsm, /) -> bool: 586 | """ 587 | Two FSMs are considered different if they have a non-empty 588 | symmetric difference. 589 | """ 590 | return not (self ^ other).empty() 591 | 592 | def __ne__(self, other: object, /) -> bool: 593 | """ 594 | Use `fsm1 != fsm2` to determine whether two FSMs recognise 595 | different strings. 596 | """ 597 | return not self == other 598 | 599 | def difference(*fsms: Fsm) -> Fsm: 600 | """ 601 | Difference. Returns an FSM which recognises only the strings 602 | recognised by the first FSM in the list, but none of the others. 603 | """ 604 | return parallel(fsms, lambda accepts: accepts[0] and not any(accepts[1:])) 605 | 606 | def __sub__(self, other: Fsm, /) -> Fsm: 607 | return self.difference(other) 608 | 609 | def cardinality(self, /) -> int: 610 | """ 611 | Consider the FSM as a set of strings and return the cardinality of 612 | that set, or raise an OverflowError if there are infinitely many 613 | """ 614 | num_strings: dict[StateType, int | None] = {} 615 | 616 | def get_num_strings(state: StateType) -> int: 617 | # Most FSMs have at least one oblivion state 618 | if self.islive(state): 619 | if state in num_strings: 620 | if num_strings[state] is None: # "computing..." 621 | # Recursion! There are infinitely many strings 622 | # recognised 623 | raise OverflowError(state) 624 | return num_strings[state] # type: ignore 625 | 626 | num_strings[state] = None # i.e. "computing..." 
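# Descriptive note: the loop below adds, for each outgoing charclass, one string per character when the destination state is final, plus (characters in the charclass) multiplied by the number of strings accepted from that destination.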
627 | n = 0 628 | for charclass in self.map[state]: 629 | num_transitions = charclass.num_chars() 630 | nstate = self.map[state][charclass] 631 | if nstate in self.finals: 632 | n += num_transitions 633 | n += num_transitions * get_num_strings(nstate) 634 | num_strings[state] = n 635 | 636 | else: 637 | # Dead state 638 | num_strings[state] = 0 639 | 640 | return num_strings[state] # type: ignore 641 | 642 | n = 1 if self.initial in self.finals else 0 643 | return n + get_num_strings(self.initial) 644 | 645 | def __len__(self, /) -> int: 646 | """ 647 | Consider the FSM as a set of strings and return the cardinality of 648 | that set, or raise an OverflowError if there are infinitely many 649 | """ 650 | return self.cardinality() 651 | 652 | def isdisjoint(self, other: Fsm, /) -> bool: 653 | """ 654 | Treat `self` and `other` as sets of strings and see if they are 655 | disjoint 656 | """ 657 | return (self & other).empty() 658 | 659 | def issubset(self, other: Fsm, /) -> bool: 660 | """ 661 | Treat `self` and `other` as sets of strings and see if `self` is a 662 | subset of `other`... `self` recognises no strings which `other` 663 | doesn't. 664 | """ 665 | return (self - other).empty() 666 | 667 | def __le__(self, other: Fsm, /) -> bool: 668 | """ 669 | Treat `self` and `other` as sets of strings and see if `self` is a 670 | subset of `other`... `self` recognises no strings which `other` 671 | doesn't. 672 | """ 673 | return self.issubset(other) 674 | 675 | def ispropersubset(self, other: Fsm, /) -> bool: 676 | """ 677 | Treat `self` and `other` as sets of strings and see if `self` is a 678 | proper subset of `other`. 679 | """ 680 | return self <= other and self != other 681 | 682 | def __lt__(self, other: Fsm, /) -> bool: 683 | """ 684 | Treat `self` and `other` as sets of strings and see if `self` is a 685 | strict subset of `other`. 686 | """ 687 | return self.ispropersubset(other) 688 | 689 | def issuperset(self, other: Fsm, /) -> bool: 690 | """ 691 | Treat `self` and `other` as sets of strings and see if `self` is a 692 | superset of `other`. 693 | """ 694 | return (other - self).empty() 695 | 696 | def __ge__(self, other: Fsm, /) -> bool: 697 | """ 698 | Treat `self` and `other` as sets of strings and see if `self` is a 699 | superset of `other`. 700 | """ 701 | return self.issuperset(other) 702 | 703 | def ispropersuperset(self, other: Fsm, /) -> bool: 704 | """ 705 | Treat `self` and `other` as sets of strings and see if `self` is a 706 | proper superset of `other`. 707 | """ 708 | return self >= other and self != other 709 | 710 | def __gt__(self, other: Fsm, /) -> bool: 711 | """ 712 | Treat `self` and `other` as sets of strings and see if `self` is a 713 | strict superset of `other`. 714 | """ 715 | return self.ispropersuperset(other) 716 | 717 | def copy(self, /) -> Fsm: 718 | """ 719 | For completeness only, since `set.copy()` and `frozenset.copy()` exist. 720 | FSM objects are immutable; like `frozenset`, this just returns `self`. 721 | """ 722 | return self 723 | 724 | __copy__ = copy 725 | 726 | def derive(self, string: str, /) -> Fsm: 727 | """ 728 | Compute the Brzozowski derivative of this FSM with respect to the 729 | input string. Note that the FSM uses Charclasses as symbols internally, 730 | but the input string is a sequence of Unicode characters 731 | 732 | """ 733 | # Consume the input string. 
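# Descriptive note: the constructor validates that the alphabet is a partition of all of Unicode, so exactly one charclass accepts each character and the inner loop below always finds a transition to follow.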
734 | state = self.initial 735 | for char in string: 736 | for charclass in self.map[state]: 737 | if charclass.accepts(char): 738 | state = self.map[state][charclass] 739 | break 740 | 741 | # OK so now we have consumed that string, use the new location as 742 | # the starting point. 743 | return Fsm( 744 | alphabet=self.alphabet, 745 | states=self.states, 746 | initial=state, 747 | finals=self.finals, 748 | map=self.map, 749 | ) 750 | 751 | def replace_alphabet( 752 | self, replacements: Mapping[AlphaType, Iterable[AlphaType]] 753 | ) -> Fsm: 754 | """ 755 | Returns a new FSM which uses a different alphabet. If one original 756 | symbol converts to two new symbols, there will be multiple identical 757 | transitions; if none, the transitions will be omitted. 758 | """ 759 | new_alphabet = set() 760 | for symbol in self.alphabet: 761 | for replacement in replacements[symbol]: 762 | new_alphabet.add(replacement) 763 | 764 | new_map: Dict[StateType, Dict[AlphaType, StateType]] = {} 765 | for state in self.map: 766 | new_map[state] = {} 767 | for symbol in self.alphabet: 768 | for replacement in replacements[symbol]: 769 | new_map[state][replacement] = self.map[state][symbol] 770 | 771 | return Fsm( 772 | alphabet=new_alphabet, 773 | states=self.states, 774 | initial=self.initial, 775 | finals=self.finals, 776 | map=new_map, 777 | ) 778 | 779 | 780 | NULL = Fsm( 781 | alphabet={~Charclass()}, 782 | states={0}, 783 | initial=0, 784 | finals=(), 785 | map={ 786 | 0: {~Charclass(): 0}, 787 | }, 788 | ) 789 | """ 790 | An FSM accepting nothing (not even the empty string). This 791 | demonstrates that this is possible, and is also extremely useful 792 | in some situations. 793 | """ 794 | 795 | EPSILON = Fsm( 796 | alphabet={~Charclass()}, 797 | states={0, 1}, 798 | initial=0, 799 | finals={0}, 800 | map={ 801 | 0: {~Charclass(): 1}, 802 | 1: {~Charclass(): 1}, 803 | }, 804 | ) 805 | """ 806 | An FSM matching the empty string, "", only. 807 | This is very useful in many situations. 808 | """ 809 | 810 | 811 | def parallel( 812 | fsms: tuple[Fsm, ...], 813 | test: Callable[[list[bool]], bool], 814 | /, 815 | ) -> Fsm: 816 | """ 817 | Crawl several FSMs in parallel, mapping the states of a larger 818 | meta-FSM. To determine whether a state in the larger FSM is final, pass 819 | all of the finality statuses (e.g. [True, False, False]) to `test`. 820 | """ 821 | unified_fsms = unify_alphabets(fsms) 822 | 823 | initial: Mapping[int, StateType] = { 824 | i: fsm.initial for i, fsm in enumerate(unified_fsms) 825 | } 826 | 827 | # dedicated function accepts a "superset" and returns the next "superset" 828 | # obtained by following this transition in the new FSM 829 | def follow( 830 | current: Mapping[int, StateType], 831 | symbol: AlphaType, 832 | ) -> Mapping[int, StateType]: 833 | return {i: fsm.map[current[i]][symbol] for i, fsm in enumerate(unified_fsms)} 834 | 835 | # Determine the "is final?" condition of each substate, then pass it to the 836 | # test to determine finality of the overall FSM.
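# For example, union() passes `any` as the test (the metastate is final if any component FSM is in a final state), intersection() passes `all`, and difference() / symmetric_difference() pass their own lambdas (see the methods above).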
837 | def final(state: Mapping[int, StateType]) -> bool: 838 | return test([state[i] in fsm.finals for i, fsm in enumerate(unified_fsms)]) 839 | 840 | alphabet = unified_fsms[0].alphabet if len(unified_fsms) > 0 else {~Charclass()} 841 | 842 | return crawl(alphabet, initial, final, follow).reduce() 843 | 844 | 845 | def crawl( 846 | alphabet: Iterable[AlphaType], 847 | initial: M, 848 | final: Callable[[M], bool], 849 | follow: Callable[[M, AlphaType], M], 850 | ) -> Fsm: 851 | """ 852 | Given the above conditions and instructions, crawl a new unknown FSM, 853 | mapping its states, final states and transitions. Return the new FSM. 854 | This is a pretty powerful procedure which could potentially go on 855 | forever if you supply an evil version of follow(). 856 | """ 857 | 858 | states: list[M] = [initial] 859 | finals: set[StateType] = set() 860 | transitions: dict[StateType, dict[AlphaType, StateType]] = {} 861 | 862 | # iterate over a growing list 863 | i = 0 864 | while i < len(states): 865 | state = states[i] 866 | 867 | # add to finals 868 | if final(state): 869 | finals.add(i) 870 | 871 | # compute map for this state 872 | transitions[i] = {} 873 | for symbol in sorted(alphabet): 874 | next_state = follow(state, symbol) 875 | 876 | try: 877 | j = states.index(next_state) 878 | except ValueError: 879 | j = len(states) 880 | states.append(next_state) 881 | 882 | transitions[i][symbol] = j 883 | 884 | i += 1 885 | 886 | return Fsm( 887 | alphabet=alphabet, 888 | states=set(range(len(states))), 889 | initial=0, 890 | finals=finals, 891 | map=transitions, 892 | ) 893 | 894 | 895 | def from_charclass(charclass: Charclass) -> Fsm: 896 | # 0 is initial, 1 is final, 2 is dead 897 | return Fsm( 898 | alphabet={charclass, ~charclass}, 899 | states={0, 1, 2}, 900 | initial=0, 901 | finals={1}, 902 | map={ 903 | 0: {charclass: 1, ~charclass: 2}, 904 | 1: {charclass: 2, ~charclass: 2}, 905 | 2: {charclass: 2, ~charclass: 2}, 906 | }, 907 | ) 908 | --------------------------------------------------------------------------------