├── requirements.dev.txt ├── .editorconfig ├── .isort.cfg ├── .flake8 ├── main.py ├── CHANGELOG.md ├── greenery ├── __init__.py ├── mult_test.py ├── conc_test.py ├── bound.py ├── bound_test.py ├── multiplier_test.py ├── pattern_test.py ├── multiplier.py ├── parse_test.py ├── charclass_test.py ├── parse.py ├── charclass.py ├── rxelems.py └── fsm.py ├── .gitattributes ├── mypy.ini ├── setup.py ├── LICENSE.txt ├── .github └── workflows │ └── workflow-1.yml ├── .gitignore ├── README.md └── .pylintrc /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | build 3 | flake8 4 | isort 5 | mypy 6 | pylint 7 | pytest 8 | setuptools 9 | twine 10 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | insert_final_newline = true 3 | 4 | [*.py] 5 | charset = utf-8 6 | indent_style = space 7 | indent_size = 4 8 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile = black 3 | add_imports = from __future__ import annotations 4 | remove_redundant_aliases = true 5 | combine_as_imports = true 6 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = 4 | # "black" formatter has slightly different operator spacing rules than 5 | # flake8's defaults. 6 | # whitespace before ‘,’, ‘;’, or ‘:’ 7 | E203, 8 | # line break before binary operator 9 | W503, 10 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from greenery import INF, PLUS, QM, STAR, Bound, Multiplier, parse 4 | 5 | pattern = parse("a") 6 | print(pattern) # "a" 7 | 8 | pattern = pattern * PLUS * QM * STAR * Multiplier(Bound(3), INF) 9 | print(pattern) # "((((a)+)?)*){3,}" 10 | 11 | pattern = pattern.reduce() 12 | print(pattern) # "a*" 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## 4.2.0 4 | 5 | https://github.com/qntm/greenery/pull/106 6 | 7 | ## 4.1.0 8 | 9 | https://github.com/qntm/greenery/pull/99 10 | 11 | ## 4.0.0 12 | 13 | https://github.com/qntm/greenery/pull/67 14 | 15 | ## 3.0 16 | 17 | https://github.com/qntm/greenery/commit/347760c730232b2f0c243917f34bdf596288984a 18 | 19 | ## 2.0 20 | 21 | https://github.com/qntm/greenery/pull/10 22 | 23 | ## 1.0 24 | 25 | Initial release. 
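main.py above stops at parsing, multiplication and reduce(). A minimal sketch of the matching side of the API, assuming Pattern exposes the same to_fsm()/accepts() round-trip that mult_test.py below exercises for Mult:

from greenery import parse

# Compile a parsed pattern down to a finite state machine and test membership.
# (to_fsm() on a Pattern is an assumption of this sketch; the files below only
# show it being called on Mult.)
fsm = parse("a{3,}").to_fsm()
print(fsm.accepts("aa"))    # False - fewer than three "a"s
print(fsm.accepts("aaaa"))  # True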
26 | -------------------------------------------------------------------------------- /greenery/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "Bound", 5 | "INF", 6 | "Multiplier", 7 | "PLUS", 8 | "Pattern", 9 | "QM", 10 | "STAR", 11 | "parse", 12 | "Fsm", 13 | "EPSILON", 14 | "NULL", 15 | "Charclass", 16 | ) 17 | 18 | from .bound import INF, Bound 19 | from .fsm import EPSILON, NULL, Charclass, Fsm 20 | from .multiplier import PLUS, QM, STAR, Multiplier 21 | from .parse import parse 22 | from .rxelems import Pattern 23 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | warn_return_any = True 4 | warn_unused_configs = True 5 | no_implicit_optional = True 6 | 7 | # strict typing 8 | strict_optional = True 9 | disallow_untyped_calls = True 10 | disallow_untyped_defs = True 11 | disallow_incomplete_defs = True 12 | check_untyped_defs = True 13 | disallow_untyped_decorators = True 14 | 15 | disallow_any_generics = True 16 | disallow_subclassing_any = True 17 | no_implicit_reexport = True 18 | strict_concatenate = True 19 | strict_equality = True 20 | warn_redundant_casts = True 21 | warn_unused_ignores = True 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from setuptools import setup 4 | 5 | setup( 6 | name="greenery", 7 | version="4.2.2", 8 | tests_require=["pytest"], 9 | packages=["greenery"], 10 | package_dir={"greenery": "greenery"}, 11 | author="qntm", 12 | author_email="qntm ", 13 | description="Greenery allows manipulation of regular expressions", 14 | license="MIT License", 15 | keywords=" ".join( 16 | [ 17 | "re", 18 | "regex", 19 | "regexp", 20 | "regular", 21 | "expression", 22 | "deterministic", 23 | "finite", 24 | "state", 25 | "machine", 26 | "automaton", 27 | "fsm", 28 | "dfsm", 29 | "fsa", 30 | "dfsa", 31 | "greenery", 32 | ] 33 | ), 34 | url="https://github.com/qntm/greenery", 35 | classifiers=[ 36 | "License :: OSI Approved :: MIT License", 37 | "Programming Language :: Python :: 3.8", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 qntm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in 
the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/workflow-1.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python application 5 | 6 | on: 7 | pull_request: 8 | branches: 9 | - '**' 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Set up Python 3.8 19 | uses: actions/setup-python@v2 20 | with: 21 | python-version: "3.8" 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install -r requirements.dev.txt 26 | - name: Lint with isort 27 | run: | 28 | isort --quiet --diff --check . 29 | - name: Check formatting with black 30 | run: | 31 | black --diff --check . 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 --count --statistics --show-source --select=E9,F63,F7,F82 . 36 | # exit-zero treats all errors as warnings 37 | flake8 --count --statistics --exit-zero . 38 | - name: Lint with pylint 39 | run: | 40 | pylint --recursive=true . 41 | - name: Check with mypy 42 | run: | 43 | mypy greenery 44 | - name: Test with pytest 45 | run: | 46 | pytest 47 | -------------------------------------------------------------------------------- /greenery/mult_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .bound import INF, Bound 4 | from .charclass import DIGIT, Charclass 5 | from .multiplier import ONE, PLUS, QM, STAR, Multiplier 6 | from .rxelems import Mult 7 | 8 | 9 | def test_mult_equality() -> None: 10 | a = Mult(Charclass("a"), ONE) 11 | # pylint: disable=comparison-with-itself 12 | assert a == a 13 | assert a != Mult(Charclass("b"), ONE) 14 | assert a != Mult(Charclass("a"), QM) 15 | assert a != Mult(Charclass("a"), Multiplier(Bound(1), Bound(2))) 16 | 17 | 18 | def test_mult_str() -> None: 19 | a = Charclass("a") 20 | assert str(Mult(a, ONE)) == "a" 21 | assert str(Mult(a, Multiplier(Bound(2), Bound(2)))) == "a{2}" 22 | assert str(Mult(a, Multiplier(Bound(3), Bound(3)))) == "a{3}" 23 | assert str(Mult(a, Multiplier(Bound(4), Bound(4)))) == "a{4}" 24 | assert str(Mult(a, Multiplier(Bound(5), Bound(5)))) == "a{5}" 25 | assert str(Mult(a, QM)) == "a?" 
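# (The "?", "*" and "+" spellings come from the `symbolic` lookup table in
# multiplier.py; multipliers without a symbolic form fall back to the
# "{m}" / "{m,n}" renderings asserted above and below.)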
26 | assert str(Mult(a, STAR)) == "a*" 27 | assert str(Mult(a, PLUS)) == "a+" 28 | assert str(Mult(a, Multiplier(Bound(2), Bound(5)))) == "a{2,5}" 29 | assert str(Mult(a, Multiplier(Bound(2), INF))) == "a{2,}" 30 | 31 | assert str(Mult(DIGIT, ONE)) == "\\d" 32 | assert str(Mult(DIGIT, Multiplier(Bound(2), Bound(2)))) == "\\d{2}" 33 | assert str(Mult(DIGIT, Multiplier(Bound(3), Bound(3)))) == "\\d{3}" 34 | 35 | 36 | def test_odd_bug() -> None: 37 | # pylint: disable=invalid-name 38 | 39 | # Odd bug with ([bc]*c)?[ab]* 40 | int5A = Mult( 41 | Charclass("bc"), 42 | STAR, 43 | ).to_fsm() 44 | assert int5A.accepts("") 45 | 46 | int5B = Mult( 47 | Charclass("c"), 48 | ONE, 49 | ).to_fsm() 50 | assert int5B.accepts("c") 51 | 52 | int5C = int5A.concatenate(int5B) 53 | assert int5C.accepts("c") 54 | 55 | 56 | def test_mult_common() -> None: 57 | a = Charclass("a") 58 | assert Mult(a, Multiplier(Bound(3), Bound(4))).common( 59 | Mult(a, Multiplier(Bound(2), Bound(5))) 60 | ) == Mult(a, Multiplier(Bound(2), Bound(3))) 61 | assert Mult(a, Multiplier(Bound(2), INF)).common( 62 | Mult(a, Multiplier(Bound(1), Bound(5))) 63 | ) == Mult(a, Multiplier(Bound(1), Bound(5))) 64 | assert Mult(a, Multiplier(Bound(3), INF)).common( 65 | Mult(a, Multiplier(Bound(2), INF)) 66 | ) == Mult(a, Multiplier(Bound(2), INF)) 67 | 68 | 69 | def test_mult_dock() -> None: 70 | a = Charclass("a") 71 | assert Mult(a, Multiplier(Bound(4), Bound(5))).dock( 72 | Mult(a, Multiplier(Bound(3), Bound(3))) 73 | ) == Mult(a, Multiplier(Bound(1), Bound(2))) 74 | -------------------------------------------------------------------------------- /greenery/conc_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import Bound 6 | from .charclass import Charclass 7 | from .multiplier import ONE, PLUS, QM, STAR, ZERO, Multiplier 8 | from .rxelems import Conc, Mult 9 | 10 | 11 | def test_conc_equality() -> None: 12 | a = Conc(Mult(Charclass("a"), ONE)) 13 | assert a == Conc(Mult(Charclass("a"), ONE)) 14 | assert a != Conc(Mult(Charclass("b"), ONE)) 15 | assert a != Conc(Mult(Charclass("a"), QM)) 16 | assert a != Conc(Mult(Charclass("a"), Multiplier(Bound(1), Bound(2)))) 17 | assert a != Conc() 18 | 19 | 20 | def test_conc_str() -> None: 21 | assert ( 22 | str( 23 | Conc( 24 | Mult(Charclass("a"), ONE), 25 | Mult(Charclass("b"), ONE), 26 | Mult(Charclass("c"), ONE), 27 | Mult(Charclass("d"), ONE), 28 | Mult(Charclass("e"), ONE), 29 | Mult(~Charclass("fg"), STAR), 30 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 31 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 32 | ) 33 | ) 34 | == "abcde[^fg]*h{5}[a-z]+" 35 | ) 36 | 37 | 38 | def test_conc_common() -> None: 39 | a = Mult(Charclass("A"), ONE) 40 | b = Mult(Charclass("B"), ONE) 41 | c = Mult(Charclass("C"), ONE) 42 | y = Mult(Charclass("y"), ONE) 43 | z = Mult(Charclass("Z"), ONE) 44 | zstar = Mult(Charclass("Z"), STAR) 45 | 46 | assert Conc(a, a, z, y).common(Conc(b, b, z, y), suffix=True) == Conc(z, y) 47 | assert Conc(c, z).common(Conc(c, z), suffix=True) == Conc(c, z) 48 | assert Conc(c, y).common(Conc(c, z), suffix=True) == Conc() 49 | assert Conc(a, z).common(Conc(b, z), suffix=True) == Conc(z) 50 | assert Conc(a, zstar).common(Conc(b, z), suffix=True) == Conc() 51 | assert Conc(a).common(Conc(b), suffix=True) == Conc() 52 | 53 | 54 | def test_conc_dock() -> None: 55 | a = Mult(Charclass("A"), ONE) 56 | b = Mult(Charclass("B"), ONE) 57 | x = 
Mult(Charclass("X"), ONE) 58 | x_twice = Mult(Charclass("X"), Multiplier(Bound(2), Bound(2))) 59 | yplus = Mult(Charclass("y"), PLUS) 60 | z = Mult(Charclass("Z"), ONE) 61 | 62 | assert Conc(a, z).dock(Conc(z)) == Conc(a) 63 | assert Conc(a, b, x, yplus, z).dock(Conc(x, yplus, z)) == Conc(a, b) 64 | assert Conc(a, b, x, yplus, z).behead(Conc(a, b, x, yplus)) == Conc(z) 65 | assert Conc(a).dock(Conc()) == Conc(a) 66 | 67 | with pytest.raises(ArithmeticError, match="Can't subtract"): 68 | Conc(x_twice, yplus, z).behead(Conc(x, yplus)) 69 | 70 | 71 | def test_mult_reduction_easy() -> None: 72 | assert Conc(Mult(Charclass("a"), ZERO)).reduce() == Conc() 73 | -------------------------------------------------------------------------------- /greenery/bound.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "Bound", 5 | "INF", 6 | ) 7 | 8 | from dataclasses import dataclass 9 | 10 | 11 | @dataclass(frozen=True) 12 | class Bound: 13 | """An integer but sometimes also possibly infinite (None)""" 14 | 15 | v: int | None 16 | 17 | def __post_init__(self, /) -> None: 18 | if self.v is not None and self.v < 0: 19 | raise ValueError(f"Invalid bound: {self.v!r}") 20 | 21 | def __repr__(self, /) -> str: 22 | return f"Bound({self.v!r})" 23 | 24 | def __str__(self, /) -> str: 25 | if self.v is None: 26 | # This only happens for an unlimited upper bound 27 | return "" 28 | return str(self.v) 29 | 30 | def __eq__(self, other: object, /) -> bool: 31 | if not isinstance(other, type(self)): 32 | return NotImplemented 33 | return self.v == other.v 34 | 35 | def __hash__(self, /) -> int: 36 | return hash(self.v) 37 | 38 | def __lt__(self, other: Bound, /) -> bool: 39 | if self.v is None: 40 | return False 41 | if other.v is None: 42 | return True 43 | return self.v < other.v 44 | 45 | def __ge__(self, other: Bound, /) -> bool: 46 | return not self < other 47 | 48 | def __mul__(self, other: Bound, /) -> Bound: 49 | """Multiply this bound by another""" 50 | if Bound(0) in (self, other): 51 | return Bound(0) 52 | if self.v is None or other.v is None: 53 | return INF 54 | return Bound(self.v * other.v) 55 | 56 | def __add__(self, other: Bound, /) -> Bound: 57 | """Add this bound to another""" 58 | if self.v is None or other.v is None: 59 | return INF 60 | return Bound(self.v + other.v) 61 | 62 | def __sub__(self, other: Bound, /) -> Bound: 63 | """ 64 | Subtract another bound from this one. 65 | Caution: this operation is not meaningful for all bounds. 66 | """ 67 | if other.v is None: 68 | if self.v is not None: 69 | raise ArithmeticError(f"Can't subtract {other!r} from {self!r}") 70 | 71 | # Infinity minus infinity is zero. 
This has to be true so that 72 | # we can for example subtract Multiplier(Bound(0), INF) from 73 | # Multiplier(Bound(1), INF) to get Multiplier(Bound(1), Bound(1)) 74 | return Bound(0) 75 | if self.v is None: 76 | return INF 77 | try: 78 | return Bound(self.v - other.v) 79 | except ValueError as e: 80 | raise ArithmeticError(*e.args) from e 81 | 82 | def copy(self, /) -> Bound: 83 | return Bound(self.v) 84 | 85 | 86 | # Use this for cases where no upper bound is needed 87 | INF = Bound(None) 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | [Dd]ebug/ 46 | [Rr]elease/ 47 | *_i.c 48 | *_p.c 49 | *.ilk 50 | *.meta 51 | *.obj 52 | *.pch 53 | *.pdb 54 | *.pgc 55 | *.pgd 56 | *.rsp 57 | *.sbr 58 | *.tlb 59 | *.tli 60 | *.tlh 61 | *.tmp 62 | *.vspscc 63 | .builds 64 | *.dotCover 65 | 66 | ## TODO: If you have NuGet Package Restore enabled, uncomment this 67 | #packages/ 68 | 69 | # Visual C++ cache files 70 | ipch/ 71 | *.aps 72 | *.ncb 73 | *.opensdf 74 | *.sdf 75 | 76 | # Visual Studio profiler 77 | *.psess 78 | *.vsp 79 | 80 | # ReSharper is a .NET coding add-in 81 | _ReSharper* 82 | 83 | # Installshield output folder 84 | [Ee]xpress 85 | 86 | # DocProject is a documentation generator add-in 87 | DocProject/buildhelp/ 88 | DocProject/Help/*.HxT 89 | DocProject/Help/*.HxC 90 | DocProject/Help/*.hhc 91 | DocProject/Help/*.hhk 92 | DocProject/Help/*.hhp 93 | DocProject/Help/Html2 94 | DocProject/Help/html 95 | 96 | # Click-Once directory 97 | publish 98 | 99 | # Others 100 | [Bb]in 101 | [Oo]bj 102 | sql 103 | TestResults 104 | *.Cache 105 | ClientBin 106 | stylecop.* 107 | ~$* 108 | *.dbmdl 109 | Generated_Code #added for RIA/Silverlight projects 110 | 111 | # Backup & report files from converting an old project file to a newer 112 | # Visual Studio version. 
Backup files are not needed, because we have git ;-) 113 | _UpgradeReport_Files/ 114 | Backup*/ 115 | UpgradeLog*.XML 116 | 117 | 118 | 119 | ############ 120 | ## Windows 121 | ############ 122 | 123 | # Windows image file caches 124 | Thumbs.db 125 | 126 | # Folder config file 127 | Desktop.ini 128 | 129 | 130 | ############# 131 | ## Python 132 | ############# 133 | 134 | *.py[co] 135 | 136 | # Packages 137 | *.egg 138 | *.egg-info 139 | dist 140 | build 141 | eggs 142 | parts 143 | bin 144 | var 145 | sdist 146 | develop-eggs 147 | .installed.cfg 148 | 149 | # Installer logs 150 | pip-log.txt 151 | 152 | # Unit test / coverage reports 153 | .coverage 154 | .tox 155 | 156 | #Translations 157 | *.mo 158 | 159 | #Mr Developer 160 | .mr.developer.cfg 161 | 162 | # Mac crap 163 | .DS_Store 164 | -------------------------------------------------------------------------------- /greenery/bound_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import INF, Bound 6 | 7 | 8 | def test_ctor() -> None: 9 | assert Bound(None) == INF 10 | 11 | Bound(0) 12 | Bound(1) 13 | Bound(2) 14 | 15 | with pytest.raises(ValueError): 16 | Bound(-1) 17 | 18 | 19 | def test_eq_neq() -> None: 20 | # pylint: disable=comparison-with-itself 21 | assert Bound(0) == Bound(0) 22 | assert INF == INF 23 | assert Bound(0) != Bound(1) 24 | assert Bound(0) != INF 25 | assert Bound(1) == Bound(1) 26 | assert Bound(None) == INF 27 | 28 | 29 | def test_eq_neq_heterogeneous() -> None: 30 | assert Bound(1) != "blah" 31 | 32 | 33 | def test_comparisons() -> None: 34 | # pylint: disable=comparison-with-itself 35 | # pylint: disable=unneeded-not 36 | 37 | assert Bound(0) < Bound(1) 38 | assert Bound(0) < INF 39 | assert Bound(1) < INF 40 | assert not INF < INF 41 | 42 | 43 | def test_multiplication() -> None: 44 | assert Bound(0) * Bound(0) == Bound(0) 45 | assert Bound(0) * Bound(1) == Bound(0) 46 | assert Bound(0) * Bound(2) == Bound(0) 47 | assert Bound(0) * Bound(5) == Bound(0) 48 | assert Bound(0) * INF == Bound(0) 49 | 50 | assert Bound(1) * Bound(5) == Bound(5) 51 | assert Bound(2) * Bound(5) == Bound(10) 52 | assert Bound(0) * INF == Bound(0) 53 | assert Bound(2) * INF == INF 54 | assert INF * INF == INF 55 | assert INF * Bound(0) == Bound(0) 56 | assert Bound(1) * Bound(0) == Bound(0) 57 | 58 | 59 | def test_addition() -> None: 60 | assert Bound(0) + Bound(0) == Bound(0) 61 | assert Bound(0) + Bound(1) == Bound(1) 62 | assert Bound(0) + Bound(5) == Bound(5) 63 | assert Bound(0) + INF == INF 64 | 65 | assert Bound(1) + Bound(0) == Bound(1) 66 | assert Bound(1) + Bound(1) == Bound(2) 67 | assert Bound(1) + Bound(5) == Bound(6) 68 | assert Bound(1) + INF == INF 69 | 70 | assert INF + Bound(0) == INF 71 | assert INF + Bound(1) == INF 72 | assert INF + INF == INF 73 | 74 | 75 | def test_subtraction() -> None: 76 | assert Bound(0) - Bound(0) == Bound(0) 77 | assert Bound(1) - Bound(0) == Bound(1) 78 | assert Bound(6) - Bound(4) == Bound(2) 79 | assert Bound(5) - Bound(5) == Bound(0) 80 | 81 | assert INF - Bound(0) == INF 82 | assert INF - Bound(1) == INF 83 | assert INF - Bound(1000) == INF 84 | assert INF - INF == Bound(0) 85 | 86 | with pytest.raises(ArithmeticError): 87 | _ = Bound(5) - Bound(6) 88 | 89 | with pytest.raises(ArithmeticError): 90 | _ = Bound(0) - Bound(1) 91 | 92 | with pytest.raises(ArithmeticError): 93 | _ = Bound(0) - INF 94 | 95 | with pytest.raises(ArithmeticError): 96 | _ = Bound(10) - INF 
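# Illustration (not part of the original suite) of why bound.py defines
# INF - INF == Bound(0): it is what lets Multiplier subtraction strip an
# open-ended suffix, e.g.
#
#     from .multiplier import Multiplier
#     assert Multiplier(Bound(1), INF) - Multiplier(Bound(0), INF) \
#         == Multiplier(Bound(1), Bound(1))
#
# which is exactly the example given in Bound.__sub__'s comment.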
97 | 98 | 99 | def test_copy() -> None: 100 | assert INF.copy() == INF 101 | 102 | b = Bound(6) 103 | assert b.copy() == b 104 | 105 | 106 | def test_bound_str() -> None: 107 | assert str(Bound(2)) == "2" 108 | 109 | # pylint: disable-next=compare-to-empty-string 110 | assert str(INF) == "" 111 | 112 | 113 | def test_bound() -> None: 114 | assert min(Bound(0), INF) == Bound(0) 115 | assert min(Bound(1), INF) == Bound(1) 116 | -------------------------------------------------------------------------------- /greenery/multiplier_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import INF, Bound 6 | from .multiplier import ONE, PLUS, QM, STAR, ZERO, Multiplier 7 | 8 | 9 | def test_multiplier_str() -> None: 10 | assert str(Multiplier(Bound(2), INF)) == "{2,}" 11 | assert str(Multiplier(Bound(0), Bound(0))) == "{0}" 12 | assert str(Multiplier(Bound(2), Bound(2))) == "{2}" 13 | assert str(Multiplier(Bound(2), Bound(5))) == "{2,5}" 14 | 15 | 16 | def test_bound_qm() -> None: 17 | assert QM.mandatory == Bound(0) 18 | assert QM.optional == Bound(1) 19 | 20 | 21 | def test_eq() -> None: 22 | assert ZERO == Multiplier(Bound(0), Bound(0)) 23 | assert ONE == Multiplier(Bound(1), Bound(1)) 24 | assert STAR == Multiplier(Bound(0), INF) 25 | assert Multiplier(Bound(1), Bound(2)) == Multiplier(Bound(1), Bound(2)) 26 | 27 | assert ZERO != ONE 28 | assert STAR != QM 29 | 30 | 31 | def test_eq_het() -> None: 32 | assert ZERO != "goldfish" 33 | 34 | 35 | def test_multiplier_common() -> None: 36 | assert ZERO.common(ZERO) == ZERO 37 | assert ZERO.common(QM) == ZERO 38 | assert ZERO.common(ONE) == ZERO 39 | assert ZERO.common(STAR) == ZERO 40 | assert ZERO.common(PLUS) == ZERO 41 | assert QM.common(ZERO) == ZERO 42 | assert QM.common(QM) == QM 43 | assert QM.common(ONE) == ZERO 44 | assert QM.common(STAR) == QM 45 | assert QM.common(PLUS) == QM 46 | assert ONE.common(ZERO) == ZERO 47 | assert ONE.common(QM) == ZERO 48 | assert ONE.common(ONE) == ONE 49 | assert ONE.common(STAR) == ZERO 50 | assert ONE.common(PLUS) == ONE 51 | assert STAR.common(ZERO) == ZERO 52 | assert STAR.common(QM) == QM 53 | assert STAR.common(ONE) == ZERO 54 | assert STAR.common(STAR) == STAR 55 | assert STAR.common(PLUS) == STAR 56 | assert PLUS.common(ZERO) == ZERO 57 | assert PLUS.common(QM) == QM 58 | assert PLUS.common(ONE) == ONE 59 | assert PLUS.common(STAR) == STAR 60 | assert PLUS.common(PLUS) == PLUS 61 | 62 | 63 | def test_multiplier_subtraction() -> None: 64 | # a{3,4}, a{2,5} -> a{2,3} (with a{1,1}, a{0,2} left over) 65 | assert Multiplier(Bound(3), Bound(4)).common( 66 | Multiplier(Bound(2), Bound(5)) 67 | ) == Multiplier(Bound(2), Bound(3)) 68 | assert Multiplier(Bound(3), Bound(4)) - Multiplier(Bound(2), Bound(3)) == ONE 69 | assert Multiplier(Bound(2), Bound(5)) - Multiplier( 70 | Bound(2), Bound(3) 71 | ) == Multiplier(Bound(0), Bound(2)) 72 | 73 | # a{2,}, a{1,5} -> a{1,5} (with a{1,}, a{0,0} left over) 74 | assert Multiplier(Bound(2), INF).common( 75 | Multiplier(Bound(1), Bound(5)) 76 | ) == Multiplier(Bound(1), Bound(5)) 77 | assert Multiplier(Bound(2), INF) - Multiplier(Bound(1), Bound(5)) == PLUS 78 | assert Multiplier(Bound(1), Bound(5)) - Multiplier(Bound(1), Bound(5)) == ZERO 79 | 80 | # a{3,}, a{2,} -> a{2,} (with a, epsilon left over) 81 | assert Multiplier(Bound(3), INF).common(Multiplier(Bound(2), INF)) == Multiplier( 82 | Bound(2), INF 83 | ) 84 | assert Multiplier(Bound(3), INF) - 
Multiplier(Bound(2), INF) == ONE 85 | assert Multiplier(Bound(2), INF) - Multiplier(Bound(2), INF) == ZERO 86 | 87 | # a{3,}, a{3,} -> a{3,} (with ZERO, ZERO left over) 88 | assert Multiplier(Bound(3), INF).common(Multiplier(Bound(3), INF)) == Multiplier( 89 | Bound(3), INF 90 | ) 91 | assert Multiplier(Bound(3), INF) - Multiplier(Bound(3), INF) == ZERO 92 | 93 | 94 | def test_multiplier_union() -> None: 95 | assert ZERO | ZERO == ZERO 96 | assert ZERO | QM == QM 97 | assert ZERO | ONE == QM 98 | assert ZERO | STAR == STAR 99 | assert ZERO | PLUS == STAR 100 | assert QM | ZERO == QM 101 | assert QM | QM == QM 102 | assert QM | ONE == QM 103 | assert QM | STAR == STAR 104 | assert QM | PLUS == STAR 105 | assert ONE | ZERO == QM 106 | assert ONE | QM == QM 107 | assert ONE | ONE == ONE 108 | assert ONE | STAR == STAR 109 | assert ONE | PLUS == PLUS 110 | assert STAR | ZERO == STAR 111 | assert STAR | QM == STAR 112 | assert STAR | ONE == STAR 113 | assert STAR | STAR == STAR 114 | assert STAR | PLUS == STAR 115 | assert PLUS | ZERO == STAR 116 | assert PLUS | QM == STAR 117 | assert PLUS | ONE == PLUS 118 | assert PLUS | STAR == STAR 119 | assert PLUS | PLUS == PLUS 120 | assert not ZERO.canunion(Multiplier(Bound(2), INF)) 121 | assert not ONE.canunion(Multiplier(Bound(3), Bound(4))) 122 | assert not Multiplier(Bound(8), INF).canunion(Multiplier(Bound(3), Bound(4))) 123 | 124 | with pytest.raises(ArithmeticError, match="Can't compute the union"): 125 | _ = ZERO | Multiplier(Bound(7), Bound(8)) 126 | -------------------------------------------------------------------------------- /greenery/pattern_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .charclass import Charclass 4 | from .multiplier import ONE, ZERO 5 | from .parse import parse 6 | from .rxelems import Conc, Mult, Pattern 7 | 8 | 9 | def test_pattern_equality() -> None: 10 | assert Pattern( 11 | Conc(Mult(Charclass("a"), ONE)), 12 | Conc(Mult(Charclass("b"), ONE)), 13 | ) == Pattern( 14 | Conc(Mult(Charclass("b"), ONE)), 15 | Conc(Mult(Charclass("a"), ONE)), 16 | ) 17 | assert Pattern( 18 | Conc(Mult(Charclass("a"), ONE)), 19 | Conc(Mult(Charclass("a"), ONE)), 20 | ) == Pattern( 21 | Conc(Mult(Charclass("a"), ONE)), 22 | ) 23 | 24 | 25 | def test_pattern_str() -> None: 26 | assert ( 27 | str( 28 | Pattern( 29 | Conc(Mult(Charclass("a"), ONE)), 30 | Conc(Mult(Charclass("b"), ONE)), 31 | ) 32 | ) 33 | == "a|b" 34 | ) 35 | assert ( 36 | str( 37 | Pattern( 38 | Conc(Mult(Charclass("a"), ONE)), 39 | Conc(Mult(Charclass("a"), ONE)), 40 | ) 41 | ) 42 | == "a" 43 | ) 44 | assert ( 45 | str( 46 | Pattern( 47 | Conc( 48 | Mult(Charclass("a"), ONE), 49 | Mult(Charclass("b"), ONE), 50 | Mult(Charclass("c"), ONE), 51 | ), 52 | Conc( 53 | Mult(Charclass("d"), ONE), 54 | Mult(Charclass("e"), ONE), 55 | Mult(Charclass("f"), ONE), 56 | Mult( 57 | Pattern( 58 | Conc( 59 | Mult(Charclass("g"), ONE), 60 | Mult(Charclass("h"), ONE), 61 | Mult(Charclass("i"), ONE), 62 | ), 63 | Conc( 64 | Mult(Charclass("j"), ONE), 65 | Mult(Charclass("k"), ONE), 66 | Mult(Charclass("l"), ONE), 67 | ), 68 | ), 69 | ONE, 70 | ), 71 | ), 72 | ) 73 | ) 74 | == "abc|def(ghi|jkl)" 75 | ) 76 | 77 | 78 | def test_empty() -> None: 79 | assert Pattern().empty() 80 | 81 | 82 | def test_mult_reduction_easy() -> None: 83 | assert Pattern(Conc()).reduce() == Pattern(Conc()) 84 | assert Pattern( 85 | Conc( 86 | Mult( 87 | Charclass("a"), 88 | ZERO, 89 | ) 90 | ) 91 | ).reduce() == 
Pattern(Conc()) 92 | 93 | assert str( 94 | # pylint: disable-next=compare-to-empty-string 95 | Pattern( 96 | Conc( 97 | Mult( 98 | Charclass("a"), 99 | ZERO, 100 | ) 101 | ).reduce() 102 | ) 103 | == "" 104 | ) 105 | 106 | 107 | def test_empty_pattern_reduction() -> None: 108 | assert str(Pattern().reduce()) == "[]" 109 | 110 | 111 | def test_empty_conc_suppression() -> None: 112 | assert ( 113 | str( 114 | Pattern( 115 | Conc( 116 | # this `Mult` can never actually match anything 117 | Mult(Pattern(), ONE), 118 | Mult(Charclass("0"), ONE), 119 | Mult(Charclass("0123456789"), ONE), 120 | ) # so neither can this `Conc` 121 | ).reduce() 122 | ) 123 | == "[]" 124 | ) 125 | 126 | 127 | def test_pattern_dock() -> None: 128 | a = Mult(Charclass("a"), ONE) 129 | c = Mult(Charclass("c"), ONE) 130 | f = Mult(Charclass("f"), ONE) 131 | 132 | assert parse("a|bc").dock(Conc()) == parse("a|bc") 133 | assert parse("aa|bca").dock(Conc(a)) == parse("a|bc") 134 | assert parse("xyza|abca|a").dock(Conc(a)) == parse("xyz|abc|") 135 | assert parse("f{2,3}c|fc").dock(Conc(f, c)) == parse("f{1,2}|") 136 | assert parse("aa").dock(Conc(a, a)) == parse("") 137 | 138 | 139 | def test_pattern_beheading() -> None: 140 | a = Mult(Charclass("a"), ONE) 141 | c = Mult(Charclass("c"), ONE) 142 | f = Mult(Charclass("f"), ONE) 143 | z = Mult(Charclass("Z"), ONE) 144 | 145 | assert parse("aa").behead(Conc(a)) == parse("a") 146 | assert parse("abc|aa").behead(Conc(a)) == parse("a|bc") 147 | assert parse("cf{1,2}|cf").behead(Conc(c)) == parse("f{1,2}|f") 148 | assert parse("aa|aa").behead(Conc(a, a)) == parse("") 149 | assert parse("abc|aa").behead(Conc(a)) == parse("a|bc") 150 | assert parse("a|bc").behead(Conc()) == parse("a|bc") 151 | assert parse("cf{1,2}|cf").behead(Conc(c, f)) == parse("f?|") 152 | assert parse("ZA|ZB|ZC").behead(Conc(z)) == parse("A|B|C") 153 | assert parse("Z+A|ZB|ZZC").behead(Conc(z)) == parse("Z*A|B|ZC") 154 | assert parse("a{2}b|a+c").behead(Conc(a)) == parse("ab|a*c") 155 | -------------------------------------------------------------------------------- /greenery/multiplier.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "Multiplier", 5 | "ONE", 6 | "PLUS", 7 | "QM", 8 | "STAR", 9 | "ZERO", 10 | "symbolic", 11 | ) 12 | 13 | from dataclasses import dataclass, field 14 | from typing import Mapping 15 | 16 | from .bound import INF, Bound 17 | 18 | 19 | @dataclass(frozen=True) 20 | class Multiplier: 21 | """ 22 | A min and a max. The vast majority of characters in regular expressions 23 | occur without a specific multiplier, which is implicitly equivalent to 24 | a min of 1 and a max of 1, but many more have explicit multipliers like 25 | "*" (min = 0, max = inf) and so on. 26 | 27 | Although it seems odd and can lead to some confusing edge cases, we do 28 | also permit a max of 0 (iff min is 0 too). This allows the multiplier 29 | `ZERO` to exist, which actually are quite useful in their own special 30 | way. 31 | """ 32 | 33 | min: Bound 34 | max: Bound 35 | mandatory: Bound = field(init=False) 36 | optional: Bound = field(init=False) 37 | 38 | def __post_init__(self, /) -> None: 39 | if self.min == INF: 40 | raise ValueError(f"Minimum bound of a multiplier can't be {INF!r}") 41 | if self.min > self.max: 42 | raise ValueError( 43 | f"Invalid multiplier bounds: {self.min!r} and {self.max!r}" 44 | ) 45 | 46 | # More useful than "min" and "max" in many situations 47 | # are "mandatory" and "optional". 
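# (For example, "{2,5}" has mandatory = Bound(2) and optional = Bound(3),
# while PLUS, i.e. "{1,}", has mandatory = Bound(1) and optional = INF.)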
48 | object.__setattr__(self, "mandatory", self.min) 49 | object.__setattr__(self, "optional", self.max - self.min) 50 | 51 | def __eq__(self, other: object, /) -> bool: 52 | if not isinstance(other, type(self)): 53 | return NotImplemented 54 | return self.min == other.min and self.max == other.max 55 | 56 | def __hash__(self, /) -> int: 57 | return hash((self.min, self.max)) 58 | 59 | def __repr__(self, /) -> str: 60 | return f"Multiplier({self.min!r}, {self.max!r})" 61 | 62 | def __str__(self, /) -> str: 63 | try: 64 | return symbolic[self] 65 | except LookupError: 66 | pass 67 | 68 | if self.min == self.max: 69 | return "{" + str(self.min) + "}" 70 | return "{" + str(self.min) + "," + str(self.max) + "}" 71 | 72 | def canmultiplyby(self, other: Multiplier, /) -> bool: 73 | """ 74 | Multiplication is not well-defined for all pairs of multipliers 75 | because the resulting possibilities do not necessarily form a 76 | continuous range. 77 | 78 | For example: 79 | {0,x} * {0,y} = {0,x*y} 80 | {2} * {3} = {6} 81 | {2} * {1,2} = ERROR 82 | 83 | The proof isn't simple but suffice it to say that {p,p+q} * {r,r+s} 84 | is equal to {pr, (p+q)(r+s)} only if s=0 or qr+1 >= p. If not, then 85 | at least one gap appears in the range. The first inaccessible 86 | number is (p+q)r+1. And no, multiplication is not commutative 87 | """ 88 | return ( 89 | other.optional == Bound(0) 90 | or self.optional * other.mandatory + Bound(1) >= self.mandatory 91 | ) 92 | 93 | def __mul__(self, other: Multiplier, /) -> Multiplier: 94 | """Multiply this multiplier by another""" 95 | if not self.canmultiplyby(other): 96 | raise ArithmeticError(f"Can't multiply {self!r} by {other!r}") 97 | return Multiplier(self.min * other.min, self.max * other.max) 98 | 99 | def __add__(self, other: Multiplier, /) -> Multiplier: 100 | """Add two multipliers together""" 101 | return Multiplier(self.min + other.min, self.max + other.max) 102 | 103 | def __sub__(self, other: Multiplier, /) -> Multiplier: 104 | """ 105 | Subtract another multiplier from this one. 106 | Caution: multipliers are not totally ordered. 107 | This operation is not meaningful for all pairs of multipliers. 108 | """ 109 | mandatory = self.mandatory - other.mandatory 110 | optional = self.optional - other.optional 111 | return Multiplier(mandatory, mandatory + optional) 112 | 113 | def canintersect(self, other: Multiplier, /) -> bool: 114 | """ 115 | Intersection is not well-defined for all pairs of multipliers. 116 | For example: 117 | {2,3} & {3,4} = {3} 118 | {2,} & {1,7} = {2,7} 119 | {2} & {5} = ERROR 120 | """ 121 | return not (self.max < other.min or other.max < self.min) 122 | 123 | def __and__(self, other: Multiplier, /) -> Multiplier: 124 | """ 125 | Find the intersection of two multipliers: that is, a third 126 | multiplier expressing the range covered by both of the originals. 127 | This is not defined for all multipliers since they may not overlap. 128 | """ 129 | if not self.canintersect(other): 130 | raise ArithmeticError( 131 | f"Can't compute intersection of {self!r} and {other!r}" 132 | ) 133 | a = max(self.min, other.min) 134 | b = min(self.max, other.max) 135 | return Multiplier(a, b) 136 | 137 | def canunion(self, other: Multiplier, /) -> bool: 138 | """ 139 | Union is not defined for all pairs of multipliers. 140 | E.g. 
{0,1} | {3,4} -> nope 141 | """ 142 | return not (self.max + Bound(1) < other.min or other.max + Bound(1) < self.min) 143 | 144 | def __or__(self, other: Multiplier, /) -> Multiplier: 145 | """ 146 | Find the union of two multipliers: that is, a third multiplier 147 | expressing the range covered by either of the originals. This is 148 | not defined for all multipliers since they may not intersect. 149 | """ 150 | if not self.canunion(other): 151 | raise ArithmeticError(f"Can't compute the union of {self!r} and {other!r}") 152 | a = min(self.min, other.min) 153 | b = max(self.max, other.max) 154 | return Multiplier(a, b) 155 | 156 | def common(self, other: Multiplier, /) -> Multiplier: 157 | """ 158 | Find the shared part of two multipliers. This is the largest 159 | multiplier which can be safely subtracted from both the originals. 160 | This may return the `ZERO` multiplier. 161 | """ 162 | mandatory = min(self.mandatory, other.mandatory) 163 | optional = min(self.optional, other.optional) 164 | return Multiplier(mandatory, mandatory + optional) 165 | 166 | def copy(self, /) -> Multiplier: 167 | return Multiplier(self.min.copy(), self.max.copy()) 168 | 169 | 170 | # Preset multipliers. These get used ALL THE TIME in unit tests 171 | ZERO = Multiplier(Bound(0), Bound(0)) # has some occasional uses 172 | QM = Multiplier(Bound(0), Bound(1)) 173 | ONE = Multiplier(Bound(1), Bound(1)) 174 | STAR = Multiplier(Bound(0), INF) 175 | PLUS = Multiplier(Bound(1), INF) 176 | 177 | # Symbol lookup table for preset multipliers. 178 | symbolic: Mapping[Multiplier, str] = { 179 | QM: "?", 180 | ONE: "", 181 | STAR: "*", 182 | PLUS: "+", 183 | } 184 | -------------------------------------------------------------------------------- /greenery/parse_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from .bound import INF, Bound 6 | from .charclass import DIGIT, DOT, NULLCHARCLASS, Charclass 7 | from .multiplier import ONE, PLUS, STAR, Multiplier 8 | 9 | # noinspection PyProtectedMember 10 | from .parse import NoMatch, match_charclass, match_mult, parse 11 | from .rxelems import Conc, Mult, Pattern 12 | 13 | if __name__ == "__main__": 14 | raise RuntimeError( 15 | "Test files can't be run directly. 
Use `python -m pytest greenery`" 16 | ) 17 | 18 | 19 | def test_charclass_matching() -> None: 20 | assert match_charclass("a", 0) == (Charclass("a"), 1) 21 | assert match_charclass("aa", 1) == (Charclass("a"), 2) 22 | assert match_charclass("a$", 1) == (Charclass("$"), 2) 23 | assert match_charclass(".", 0) == (DOT, 1) 24 | 25 | with pytest.raises(IndexError): 26 | match_charclass("[", 0) 27 | 28 | with pytest.raises(NoMatch): 29 | match_charclass("a", 1) 30 | 31 | assert match_charclass("[\\d]", 0) == (DIGIT, 4) 32 | 33 | 34 | def test_negatives_inside_charclasses() -> None: 35 | assert match_charclass("[\\D]", 0) == (~DIGIT, 4) 36 | assert match_charclass("[a\\D]", 0) == (~DIGIT, 5) 37 | assert match_charclass("[a1\\D]", 0) == (~Charclass("023456789"), 6) 38 | assert match_charclass("[1a\\D]", 0) == (~Charclass("023456789"), 6) 39 | assert match_charclass("[1\\D]", 0) == (~Charclass("023456789"), 5) 40 | assert match_charclass("[\\Da]", 0) == (~DIGIT, 5) 41 | assert match_charclass("[\\D1]", 0) == (~Charclass("023456789"), 5) 42 | assert match_charclass("[\\D1a]", 0) == (~Charclass("023456789"), 6) 43 | assert match_charclass("[\\D\\d]", 0) == (DOT, 6) 44 | assert match_charclass("[\\D\\D]", 0) == (~DIGIT, 6) 45 | 46 | # "Either non-whitespace or a non-digit" matches _anything_. 47 | assert match_charclass("[\\S\\D]", 0) == (DOT, 6) 48 | assert match_charclass("[\\S \\D]", 0) == (DOT, 7) 49 | 50 | 51 | def test_negated_negatives_inside_charclasses() -> None: 52 | assert match_charclass("[^\\D]", 0) == (DIGIT, 5) 53 | assert match_charclass("[^a\\D]", 0) == (DIGIT, 6) 54 | assert match_charclass("[^a1\\D]", 0) == (Charclass("023456789"), 7) 55 | assert match_charclass("[^1a\\D]", 0) == (Charclass("023456789"), 7) 56 | assert match_charclass("[^1\\D]", 0) == (Charclass("023456789"), 6) 57 | assert match_charclass("[^\\Da]", 0) == (DIGIT, 6) 58 | assert match_charclass("[^\\D1]", 0) == (Charclass("023456789"), 6) 59 | assert match_charclass("[^\\D1a]", 0) == (Charclass("023456789"), 7) 60 | assert match_charclass("[^\\D\\d]", 0) == (NULLCHARCLASS, 7) 61 | assert match_charclass("[^\\D\\D]", 0) == (DIGIT, 7) 62 | 63 | # "Anything but non-whitespace and non-digit" matches _nothing_. 
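# (By De Morgan, [^\S\D] is the complement of the union of \S and \D, i.e.
# the intersection of \s and \d; no whitespace character is a digit, so the
# class collapses to NULLCHARCLASS.)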
64 | assert match_charclass("[^\\S\\D]", 0) == (NULLCHARCLASS, 7) 65 | assert match_charclass("[^\\S \\D]", 0) == (NULLCHARCLASS, 8) 66 | 67 | 68 | def test_match_nightmare_charclass() -> None: 69 | assert match_charclass("[\t\n\r -\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]", 0) == ( 70 | Charclass( 71 | ( 72 | ("\t", "\t"), 73 | ("\n", "\n"), 74 | ("\r", "\r"), 75 | (" ", "\uD7FF"), 76 | ("\uE000", "\uFFFD"), 77 | ("\U00010000", "\U0010FFFF"), 78 | ) 79 | ), 80 | 14, 81 | ) 82 | 83 | 84 | def test_mult_matching() -> None: 85 | assert match_mult("abcde[^fg]*", 5) == (Mult(~Charclass("fg"), STAR), 11) 86 | assert match_mult("abcde[^fg]*h{5}[a-z]+", 11) == ( 87 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 88 | 15, 89 | ) 90 | assert match_mult("abcde[^fg]*h{5}[a-z]+T{1,}", 15) == ( 91 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 92 | 21, 93 | ) 94 | assert match_mult("abcde[^fg]*h{5}[a-z]+T{2,}", 21) == ( 95 | Mult(Charclass("T"), Multiplier(Bound(2), INF)), 96 | 26, 97 | ) 98 | 99 | 100 | def test_lazy_multipliers() -> None: 101 | assert match_mult("abcde[^fg]*?", 5) == (Mult(~Charclass("fg"), STAR), 12) 102 | assert match_mult("abcde[^fg]*?h{5}?[a-z]+", 12) == ( 103 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 104 | 17, 105 | ) 106 | assert match_mult("abcde[^fg]*?h{5}?[a-z]+?T{1,}", 17) == ( 107 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 108 | 24, 109 | ) 110 | assert match_mult("abcde[^fg]*?h{5}?[a-z]+?T{2,}?", 24) == ( 111 | Mult(Charclass("T"), Multiplier(Bound(2), INF)), 112 | 30, 113 | ) 114 | 115 | 116 | def test_charclass_ranges() -> None: 117 | # Should accept arbitrary ranges of characters in charclasses. No longer 118 | # limited to alphanumerics. (User beware...) 119 | assert parse("[z{|}~]") == parse("[z-~]") 120 | assert parse("[\\w:;<=>?@\\[\\\\\\]\\^`]") == parse("[0-z]") 121 | 122 | 123 | def test_hex_escapes() -> None: 124 | # Should be able to parse e.g. 
"\\x40" 125 | assert parse("\\x00") == parse("\x00") 126 | assert parse("\\x40") == parse("@") 127 | assert parse("[\\x40]") == parse("[@]") 128 | assert parse("[\\x41-\\x5a]") == parse("[A-Z]") 129 | 130 | 131 | def test_w_d_s() -> None: 132 | # Allow "\w", "\d" and "\s" in charclasses 133 | assert parse("\\w") == parse("[0-9A-Z_a-z]") 134 | assert parse("[\\w~]") == parse("[0-9A-Z_a-z~]") 135 | assert parse("[\\da]") == parse("[0123456789a]") 136 | assert parse("[\\s]") == parse("[\t\n\r\f\v ]") 137 | 138 | 139 | def test_mult_parsing() -> None: 140 | assert parse("[a-g]+") == Pattern(Conc(Mult(Charclass("abcdefg"), PLUS))) 141 | assert parse("[a-g0-8$%]+") == Pattern( 142 | Conc(Mult(Charclass("abcdefg012345678$%"), PLUS)) 143 | ) 144 | assert parse("[a-g0-8$%\\^]+") == Pattern( 145 | Conc(Mult(Charclass("abcdefg012345678$%^"), PLUS)) 146 | ) 147 | 148 | 149 | def test_lazy_mult_parsing() -> None: 150 | assert parse("[a-g]+?") == Pattern(Conc(Mult(Charclass("abcdefg"), PLUS))) 151 | 152 | 153 | def test_conc_parsing() -> None: 154 | assert parse("abcde[^fg]*h{5}[a-z]+") == Pattern( 155 | Conc( 156 | Mult(Charclass("a"), ONE), 157 | Mult(Charclass("b"), ONE), 158 | Mult(Charclass("c"), ONE), 159 | Mult(Charclass("d"), ONE), 160 | Mult(Charclass("e"), ONE), 161 | Mult(~Charclass("fg"), STAR), 162 | Mult(Charclass("h"), Multiplier(Bound(5), Bound(5))), 163 | Mult(Charclass("abcdefghijklmnopqrstuvwxyz"), PLUS), 164 | ) 165 | ) 166 | assert parse("[bc]*[ab]*") == Pattern( 167 | Conc( 168 | Mult(Charclass("bc"), STAR), 169 | Mult(Charclass("ab"), STAR), 170 | ) 171 | ) 172 | assert parse("abc...") == Pattern( 173 | Conc( 174 | Mult(Charclass("a"), ONE), 175 | Mult(Charclass("b"), ONE), 176 | Mult(Charclass("c"), ONE), 177 | Mult(DOT, ONE), 178 | Mult(DOT, ONE), 179 | Mult(DOT, ONE), 180 | ) 181 | ) 182 | assert parse("\\d{4}-\\d{2}-\\d{2}") == Pattern( 183 | Conc( 184 | Mult(DIGIT, Multiplier(Bound(4), Bound(4))), 185 | Mult(Charclass("-"), ONE), 186 | Mult(DIGIT, Multiplier(Bound(2), Bound(2))), 187 | Mult(Charclass("-"), ONE), 188 | Mult(DIGIT, Multiplier(Bound(2), Bound(2))), 189 | ) 190 | ) 191 | 192 | 193 | def test_pattern_parsing() -> None: 194 | assert parse("abc|def(ghi|jkl)") == Pattern( 195 | Conc( 196 | Mult(Charclass("a"), ONE), 197 | Mult(Charclass("b"), ONE), 198 | Mult(Charclass("c"), ONE), 199 | ), 200 | Conc( 201 | Mult(Charclass("d"), ONE), 202 | Mult(Charclass("e"), ONE), 203 | Mult(Charclass("f"), ONE), 204 | Mult( 205 | Pattern( 206 | Conc( 207 | Mult(Charclass("g"), ONE), 208 | Mult(Charclass("h"), ONE), 209 | Mult(Charclass("i"), ONE), 210 | ), 211 | Conc( 212 | Mult(Charclass("j"), ONE), 213 | Mult(Charclass("k"), ONE), 214 | Mult(Charclass("l"), ONE), 215 | ), 216 | ), 217 | ONE, 218 | ), 219 | ), 220 | ) 221 | 222 | # Accept the "non-capturing group" syntax, "(?: ... 
)" but give it no 223 | # special significance 224 | assert parse("(?:)") == parse("()") 225 | assert parse("(?:abc|def)") == parse("(abc|def)") 226 | parse("(:abc)") # should give no problems 227 | 228 | # Named groups 229 | assert parse("(?Pabc)") == parse("(abc)") 230 | 231 | 232 | def test_nightmare_pattern() -> None: 233 | assert parse("[\t\n\r -\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]*") == Pattern( 234 | Conc( 235 | Mult( 236 | Charclass( 237 | ( 238 | ("\t", "\t"), 239 | ("\n", "\n"), 240 | ("\r", "\r"), 241 | (" ", "\uD7FF"), 242 | ("\uE000", "\uFFFD"), 243 | ("\U00010000", "\U0010FFFF"), 244 | ) 245 | ), 246 | STAR, 247 | ) 248 | ) 249 | ) 250 | -------------------------------------------------------------------------------- /greenery/charclass_test.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import unicodedata 4 | 5 | from .charclass import ( 6 | DIGIT, 7 | DOT, 8 | NONDIGITCHAR, 9 | NONSPACECHAR, 10 | NONWORDCHAR, 11 | NULLCHARCLASS, 12 | SPACECHAR, 13 | WORDCHAR, 14 | Charclass, 15 | collapse_ord_ranges, 16 | repartition, 17 | ) 18 | 19 | 20 | def test_collapse_ord_ranges_0() -> None: 21 | assert collapse_ord_ranges([(1, 2)]) == [(1, 2)] 22 | 23 | 24 | def test_collapse_ord_ranges_1a() -> None: 25 | assert collapse_ord_ranges( 26 | [(1, 1), (3, 4), (10, 11), (13, 17), (7, 7)], 27 | ) == [(1, 1), (3, 4), (7, 7), (10, 11), (13, 17)] 28 | 29 | 30 | def test_collapse_ord_ranges_1b() -> None: 31 | assert collapse_ord_ranges([(5, 16), (1, 1)]) == [(1, 1), (5, 16)] 32 | assert collapse_ord_ranges([(5, 16), (1, 2)]) == [(1, 2), (5, 16)] 33 | assert collapse_ord_ranges([(5, 16), (1, 3)]) == [(1, 3), (5, 16)] 34 | assert collapse_ord_ranges([(5, 16), (1, 4)]) == [(1, 16)] 35 | assert collapse_ord_ranges([(5, 16), (1, 5)]) == [(1, 16)] 36 | assert collapse_ord_ranges([(5, 16), (1, 16)]) == [(1, 16)] 37 | assert collapse_ord_ranges([(5, 16), (1, 17)]) == [(1, 17)] 38 | assert collapse_ord_ranges([(5, 16), (1, 18)]) == [(1, 18)] 39 | assert collapse_ord_ranges([(5, 16), (4, 4)]) == [(4, 16)] 40 | assert collapse_ord_ranges([(5, 16), (5, 5)]) == [(5, 16)] 41 | assert collapse_ord_ranges([(5, 16), (5, 18)]) == [(5, 18)] 42 | assert collapse_ord_ranges([(5, 16), (7, 8)]) == [(5, 16)] 43 | assert collapse_ord_ranges([(5, 16), (10, 20)]) == [(5, 20)] 44 | assert collapse_ord_ranges([(5, 16), (16, 20)]) == [(5, 20)] 45 | assert collapse_ord_ranges([(5, 16), (17, 20)]) == [(5, 20)] 46 | assert collapse_ord_ranges([(5, 16), (18, 20)]) == [(5, 16), (18, 20)] 47 | 48 | 49 | def test_collapse_ord_ranges_2() -> None: 50 | assert collapse_ord_ranges([(1, 2), (11, 12), (5, 6)]) == [(1, 2), (5, 6), (11, 12)] 51 | assert collapse_ord_ranges([(1, 2), (11, 12), (3, 6)]) == [(1, 6), (11, 12)] 52 | assert collapse_ord_ranges([(1, 2), (11, 12), (2, 6)]) == [(1, 6), (11, 12)] 53 | assert collapse_ord_ranges([(1, 2), (11, 12), (5, 9)]) == [(1, 2), (5, 9), (11, 12)] 54 | assert collapse_ord_ranges([(1, 2), (11, 12), (5, 10)]) == [(1, 2), (5, 12)] 55 | assert collapse_ord_ranges([(1, 2), (11, 12), (-2, -1)]) == [ 56 | (-2, -1), 57 | (1, 2), 58 | (11, 12), 59 | ] 60 | assert collapse_ord_ranges([(1, 2), (11, 12), (0, 20)]) == [(0, 20)] 61 | 62 | 63 | def test_charclass_equality() -> None: 64 | assert Charclass("a") == Charclass("a") 65 | assert ~Charclass("a") == ~Charclass("a") 66 | assert ~Charclass("a") != Charclass("a") 67 | assert Charclass("ab") == Charclass("ba") 68 | 69 | 70 | def test_charclass_ctor() -> None: 71 | 
assert not Charclass("ab").negated 72 | assert not Charclass("ab", negated=False).negated 73 | assert Charclass("ab", negated=True).negated 74 | 75 | 76 | def test_repr() -> None: 77 | assert repr(~Charclass("a")) == "~Charclass((('a', 'a'),))" 78 | 79 | 80 | def test_issubset() -> None: 81 | assert Charclass("a").issubset(Charclass("a")) 82 | assert not Charclass("a").issubset(Charclass("b")) 83 | assert Charclass("a").issubset(Charclass((("a", "b"),))) 84 | assert Charclass("a").issubset(~Charclass("b")) 85 | assert not (~Charclass("a")).issubset(Charclass("b")) 86 | assert (~Charclass("a")).issubset(DOT) 87 | 88 | 89 | def test_charclass_str() -> None: 90 | assert str(WORDCHAR) == "\\w" 91 | assert str(DIGIT) == "\\d" 92 | assert str(SPACECHAR) == "\\s" 93 | assert str(Charclass("a")) == "a" 94 | assert str(Charclass("{")) == "\\{" 95 | assert str(Charclass("\t")) == "\\t" 96 | assert str(Charclass("ab")) == "[ab]" 97 | assert str(Charclass("a{")) == "[a{]" 98 | assert str(Charclass("a\t")) == "[\\ta]" 99 | assert str(Charclass("a-")) == "[\\-a]" 100 | assert str(Charclass("a[")) == "[\\[a]" 101 | assert str(Charclass("a]")) == "[\\]a]" 102 | assert str(Charclass("ab")) == "[ab]" 103 | assert str(Charclass("abc")) == "[abc]" 104 | assert str(Charclass("abcd")) == "[a-d]" 105 | assert str(Charclass("abcdfghi")) == "[a-df-i]" 106 | assert str(Charclass("^")) == "^" 107 | assert str(Charclass("\\")) == "\\\\" 108 | assert str(Charclass("a^")) == "[\\^a]" 109 | assert str(Charclass("0123456789a")) == "[0-9a]" 110 | assert str(Charclass("\t\v\r A")) == "[\\t\\v\\r A]" 111 | assert str(Charclass("\n\f A")) == "[\\n\\f A]" 112 | assert str(Charclass("\t\n\v\f\r A")) == "[\\t-\\r A]" 113 | assert ( 114 | str( 115 | Charclass( 116 | "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz|" 117 | ) 118 | ) 119 | == "[0-9A-Z_a-z|]" 120 | ) 121 | assert str(NONWORDCHAR) == "\\W" 122 | assert str(NONDIGITCHAR) == "\\D" 123 | assert str(NONSPACECHAR) == "\\S" 124 | assert str(DOT) == "." 125 | assert str(~Charclass("")) == "." 
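# (~Charclass("") negates the empty class, so it matches every character
# and prints the same way as DOT in the assertion above.)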
126 | assert str(~Charclass("a")) == "[^a]" 127 | assert str(~Charclass("{")) == "[^{]" 128 | assert str(~Charclass("\t")) == "[^\\t]" 129 | assert str(~Charclass("^")) == "[^\\^]" 130 | 131 | 132 | def test_charclass_negation() -> None: 133 | assert ~~Charclass("a") == Charclass("a") 134 | assert Charclass("a") == ~~Charclass("a") 135 | 136 | 137 | def test_charclass_union() -> None: 138 | # [ab] ∪ [bc] = [abc] 139 | assert Charclass("ab") | Charclass("bc") == Charclass("abc") 140 | # [ab] ∪ [^bc] = [^c] 141 | assert Charclass("ab") | ~Charclass("bc") == ~Charclass("c") 142 | # [^ab] ∪ [bc] = [^a] 143 | assert ~Charclass("ab") | Charclass("bc") == ~Charclass("a") 144 | # [^ab] ∪ [^bc] = [^b] 145 | assert ~Charclass("ab") | ~Charclass("bc") == ~Charclass("b") 146 | 147 | 148 | def test_charclass_intersection() -> None: 149 | # [ab] ∩ [bc] = [b] 150 | assert Charclass("ab") & Charclass("bc") == Charclass("b") 151 | # [ab] ∩ [^bc] = [a] 152 | assert Charclass("ab") & ~Charclass("bc") == Charclass("a") 153 | # [^ab] ∩ [bc] = [c] 154 | assert ~Charclass("ab") & Charclass("bc") == Charclass("c") 155 | # [^ab] ∩ [^bc] = [^abc] 156 | assert ~Charclass("ab") & ~Charclass("bc") == ~Charclass("abc") 157 | 158 | assert (Charclass("ab") & Charclass("bcd") & Charclass("abcde")) == Charclass("b") 159 | 160 | 161 | def test_empty() -> None: 162 | assert NULLCHARCLASS.empty() 163 | assert not DOT.empty() 164 | 165 | 166 | def test_repartition_elementary() -> None: 167 | assert repartition([Charclass("a")]) == { 168 | Charclass("a"): [Charclass("a")], 169 | } 170 | 171 | 172 | def test_repartition_elementary_2() -> None: 173 | assert repartition([Charclass("a"), ~Charclass("a")]) == { 174 | Charclass("a"): [Charclass("a")], 175 | ~Charclass("a"): [~Charclass("a")], 176 | } 177 | 178 | 179 | def test_repartition_basic() -> None: 180 | assert repartition([Charclass("a"), Charclass("abc")]) == { 181 | Charclass("a"): [ 182 | Charclass("a"), 183 | ], 184 | Charclass("abc"): [ 185 | Charclass("a"), 186 | Charclass("bc"), 187 | ], 188 | } 189 | 190 | 191 | def test_repartition_negation() -> None: 192 | assert repartition([Charclass("ab"), Charclass("a"), ~Charclass("ab")]) == { 193 | Charclass("ab"): [ 194 | Charclass("a"), 195 | Charclass("b"), 196 | ], 197 | Charclass("a"): [ 198 | Charclass("a"), 199 | ], 200 | ~Charclass("ab"): [ 201 | ~Charclass("ab"), 202 | ], 203 | } 204 | 205 | 206 | def test_repartition_negation_2() -> None: 207 | assert repartition([Charclass("ab"), Charclass("abc"), ~Charclass("ab")]) == { 208 | Charclass("ab"): [ 209 | Charclass("ab"), 210 | ], 211 | Charclass("abc"): [ 212 | Charclass("ab"), 213 | Charclass("c"), 214 | ], 215 | ~Charclass("ab"): [ 216 | ~Charclass("abc"), 217 | Charclass("c"), 218 | ], 219 | } 220 | assert repartition( 221 | [ 222 | ~Charclass("a"), 223 | ~Charclass("ab"), 224 | ~Charclass("abc"), 225 | ] 226 | ) == { 227 | ~Charclass("a"): [ 228 | ~Charclass("abc"), 229 | Charclass("b"), 230 | Charclass("c"), 231 | ], 232 | ~Charclass("ab"): [ 233 | ~Charclass("abc"), 234 | Charclass("c"), 235 | ], 236 | ~Charclass("abc"): [ 237 | ~Charclass("abc"), 238 | ], 239 | } 240 | 241 | 242 | def test_repartition_advanced() -> None: 243 | assert repartition( 244 | [ 245 | Charclass("a"), 246 | Charclass("bcdef"), 247 | ~Charclass("abcdef"), 248 | Charclass("abcd"), 249 | ~Charclass("abcd"), 250 | ] 251 | ) == { 252 | Charclass("a"): [Charclass("a")], 253 | Charclass("bcdef"): [ 254 | Charclass("bcd"), 255 | Charclass("ef"), 256 | ], 257 | ~Charclass("abcdef"): [ 258 | 
~Charclass("abcdef"), 259 | ], 260 | Charclass("abcd"): [ 261 | Charclass("a"), 262 | Charclass("bcd"), 263 | ], 264 | ~Charclass("abcd"): [ 265 | ~Charclass("abcdef"), 266 | Charclass("ef"), 267 | ], 268 | } 269 | 270 | 271 | def test_repartition_advanced_2() -> None: 272 | assert repartition([WORDCHAR, DIGIT, DOT, NONDIGITCHAR, NULLCHARCLASS]) == { 273 | WORDCHAR: [ 274 | DIGIT, 275 | Charclass("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"), 276 | ], 277 | DIGIT: [DIGIT], 278 | DOT: [ 279 | ~Charclass((("0", "z"),)), 280 | DIGIT, 281 | Charclass(((":", "@"), ("[", "^"), ("`", "`"))), 282 | Charclass("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"), 283 | ], 284 | NONDIGITCHAR: [ 285 | ~Charclass((("0", "z"),)), 286 | Charclass(((":", "@"), ("[", "^"), ("`", "`"))), 287 | Charclass("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"), 288 | ], 289 | NULLCHARCLASS: [ 290 | # Yup, there's nothing here! 291 | # This should be impossible or at least cause no problems in practice 292 | ], 293 | } 294 | 295 | 296 | # This should take a reasonable amount of time 297 | # It was previously taking forever 298 | def test_charclass_by_category() -> None: 299 | out = {} 300 | for i in range(0x101000): 301 | c = chr(i) 302 | cat = unicodedata.category(c) 303 | if cat not in out: 304 | out[cat] = [c] 305 | else: 306 | out[cat].append(c) 307 | for cat, cs in out.items(): 308 | Charclass("".join(cs)) 309 | -------------------------------------------------------------------------------- /greenery/parse.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __all__ = ( 4 | "parse", 5 | "NoMatch", 6 | ) 7 | 8 | from typing import Collection, Tuple, TypeVar 9 | 10 | from .bound import INF, Bound 11 | from .charclass import ( 12 | DIGIT, 13 | NONDIGITCHAR, 14 | NONSPACECHAR, 15 | NONWORDCHAR, 16 | SPACECHAR, 17 | WORDCHAR, 18 | Charclass, 19 | escapes, 20 | shorthand, 21 | ) 22 | from .multiplier import ONE, Multiplier, symbolic 23 | from .rxelems import Conc, Mult, Pattern 24 | 25 | # Currently many statements are grouped by `try/except NoMatch` in order to try 26 | # multiple matching functions in sequence. They can be refactored into smaller 27 | # functions to remove this suppression. 28 | # pylint: disable=too-many-try-statements 29 | 30 | T_co = TypeVar("T_co", covariant=True) 31 | 32 | 33 | class NoMatch(Exception): 34 | """ 35 | Thrown when parsing fails. 36 | Almost always caught and almost never fatal 37 | """ 38 | 39 | 40 | MatchResult = Tuple[T_co, int] 41 | 42 | 43 | def read_until(string: str, i: int, stop_char: str) -> MatchResult[str]: 44 | start = i 45 | while True: 46 | if i >= len(string): 47 | raise NoMatch 48 | if string[i] == stop_char: 49 | break 50 | i += 1 51 | return string[start:i], i + 1 52 | 53 | 54 | def static(haystack: str, i: int, needle: str) -> int: 55 | j = i + len(needle) 56 | if haystack[i:j] == needle: 57 | return j 58 | raise NoMatch 59 | 60 | 61 | def select_static(haystack: str, i: int, *needles: str) -> MatchResult[str]: 62 | for needle in needles: 63 | j = i + len(needle) 64 | if haystack[i:j] == needle: 65 | return needle, j 66 | raise NoMatch 67 | 68 | 69 | def unescape_hex(string: str, i: int) -> MatchResult[str]: 70 | """Turn e.g. "\\x40" into "@". Exactly two hex digits""" 71 | hex_digits = "0123456789AaBbCcDdEeFf" 72 | 73 | j = static(string, i, "\\x") 74 | 75 | hex1 = string[j] # e.g. 
"4" 76 | if hex1 not in hex_digits: 77 | raise NoMatch 78 | j += len(hex1) 79 | 80 | hex2 = string[j] # e.g. "0" 81 | if hex2 not in hex_digits: 82 | raise NoMatch 83 | j += len(hex2) 84 | 85 | codepoint = int(hex1 + hex2, 16) # e.g. 64 86 | char = chr(codepoint) # "@" 87 | return char, j 88 | 89 | 90 | def match_internal_char(string: str, i: int) -> MatchResult[str]: 91 | # e.g. if we see "\\t", return "\t" 92 | for char, escaped_mnemonic in escapes.items(): 93 | try: 94 | return char, static(string, i, escaped_mnemonic) 95 | except NoMatch: 96 | pass 97 | 98 | # special chars e.g. "\\-" returns "-" 99 | for char in Charclass.classSpecial: 100 | try: 101 | return char, static(string, i, "\\" + char) 102 | except NoMatch: 103 | pass 104 | 105 | # hex escape e.g. "\\x40" returns "@" 106 | try: 107 | return unescape_hex(string, i) 108 | except NoMatch: 109 | pass 110 | 111 | # single non-special character, not contained 112 | # inside square brackets 113 | char, j = string[i], i + 1 114 | if char in Charclass.classSpecial: 115 | raise NoMatch 116 | 117 | return char, j 118 | 119 | 120 | def match_inner_charclass( 121 | string: str, 122 | i: int, 123 | ) -> MatchResult[Charclass]: 124 | """ 125 | We have to return several ranges, because of \\\\w etc. 126 | """ 127 | # Attempt 1: shorthand 128 | inner_shorthand = { 129 | "\\w": WORDCHAR, 130 | "\\d": DIGIT, 131 | "\\s": SPACECHAR, 132 | "\\W": NONWORDCHAR, 133 | "\\D": NONDIGITCHAR, 134 | "\\S": NONSPACECHAR, 135 | # no ".": DOT, 136 | } 137 | 138 | for cc_shorthand, charclass in inner_shorthand.items(): 139 | try: 140 | return charclass, static(string, i, cc_shorthand) 141 | except NoMatch: 142 | pass 143 | 144 | # Attempt 2: a range e.g. "d-h" 145 | try: 146 | first, j = match_internal_char(string, i) # `first` is "d" 147 | k = static(string, j, "-") 148 | last, k = match_internal_char(string, k) # `last` is "h" 149 | return Charclass(((first, last),)), k 150 | except NoMatch: 151 | pass 152 | 153 | # Attempt 3: just a character on its own 154 | char, j = match_internal_char(string, i) 155 | return Charclass(((char, char),)), j 156 | 157 | 158 | def match_class_interior(string: str, i: int) -> MatchResult[Charclass]: 159 | inner_charclasses = [] 160 | try: 161 | while True: 162 | # Match an internal character, range, or other charclass predicate. 163 | inner_charclass, i = match_inner_charclass(string, i) 164 | inner_charclasses.append(inner_charclass) 165 | except NoMatch: 166 | pass 167 | 168 | # Use the existing Charclass union functionality 169 | charclass = Charclass() 170 | for inner_charclass in inner_charclasses: 171 | charclass |= inner_charclass 172 | 173 | return charclass, i 174 | 175 | 176 | def match_charclass(string: str, i: int) -> MatchResult[Charclass]: 177 | # pylint: disable=too-many-return-statements 178 | 179 | if i >= len(string): 180 | raise NoMatch 181 | 182 | # wildcard ".", "\\w", "\\d", etc. 183 | for shorthand_charclass, shorthand_abbrev in shorthand.items(): 184 | try: 185 | return shorthand_charclass, static(string, i, shorthand_abbrev) 186 | except NoMatch: 187 | pass 188 | 189 | # "[^dsgsdg]" 190 | try: 191 | j = static(string, i, "[^") 192 | result, j = match_class_interior(string, j) 193 | j = static(string, j, "]") 194 | return ~result, j 195 | except NoMatch: 196 | pass 197 | 198 | # "[sdfsf]" 199 | try: 200 | j = static(string, i, "[") 201 | result, j = match_class_interior(string, j) 202 | j = static(string, j, "]") 203 | return result, j 204 | except NoMatch: 205 | pass 206 | 207 | # e.g. 
if seeing "\\t", return "\t" 208 | for char, escaped_mnemonic in escapes.items(): 209 | try: 210 | return Charclass(((char, char),)), static(string, i, escaped_mnemonic) 211 | except NoMatch: 212 | pass 213 | 214 | # e.g. if seeing "\\{", return "{" 215 | for char in Charclass.allSpecial: 216 | try: 217 | return Charclass(((char, char),)), static(string, i, "\\" + char) 218 | except NoMatch: 219 | pass 220 | 221 | # e.g. if seeing "\\x40", return "@" 222 | try: 223 | char, j = unescape_hex(string, i) 224 | return Charclass(((char, char),)), j 225 | except NoMatch: 226 | pass 227 | 228 | # single non-special character, not contained inside square brackets 229 | char, i = string[i], i + 1 230 | if char in Charclass.allSpecial: 231 | raise NoMatch 232 | 233 | return Charclass(((char, char),)), i 234 | 235 | 236 | def match_multiplicand(string: str, i: int) -> MatchResult[Pattern | Charclass]: 237 | # explicitly non-capturing "(?:...)" syntax. No special significance 238 | try: 239 | j = static(string, i, "(?") 240 | opts, j = select_static(string, j, ":", "P<") 241 | if opts == "P<": 242 | _group_name, j = read_until(string, j, ">") 243 | pattern, j = match_pattern(string, j) 244 | j = static(string, j, ")") 245 | return pattern, j 246 | except NoMatch: 247 | pass 248 | 249 | # normal "(...)" syntax 250 | try: 251 | j = static(string, i, "(") 252 | pattern, j = match_pattern(string, j) 253 | j = static(string, j, ")") 254 | return pattern, j 255 | except NoMatch: 256 | pass 257 | 258 | # Just a `Charclass` on its own 259 | charclass, j = match_charclass(string, i) 260 | return charclass, j 261 | 262 | 263 | def match_any_of(string: str, i: int, collection: Collection[str]) -> MatchResult[str]: 264 | for char in collection: 265 | try: 266 | return char, static(string, i, char) 267 | except NoMatch: 268 | pass 269 | raise NoMatch 270 | 271 | 272 | def match_bound(string: str, i: int) -> MatchResult[Bound]: 273 | # "0" 274 | try: 275 | return Bound(0), static(string, i, "0") 276 | except NoMatch: 277 | pass 278 | 279 | # "1", etc. 
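    # A leading non-zero digit followed by any further digits, accumulated
    # left to right: e.g. the "25" in "{25,}" becomes Bound(25).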
280 | try: 281 | digit, j = match_any_of(string, i, "123456789") 282 | integer = int(digit) 283 | try: 284 | while True: 285 | digit, j = match_any_of(string, j, "0123456789") 286 | integer *= 10 287 | integer += int(digit) 288 | except NoMatch: 289 | return Bound(integer), j 290 | except NoMatch: 291 | pass 292 | 293 | # "" empty string = infinite bound as in "{4,}" 294 | return INF, i 295 | 296 | 297 | def match_nonempty_greedy_multiplier(string: str, i: int) -> MatchResult[Multiplier]: 298 | """ 299 | Any multiplier which isn't the default empty string (equivalent to `{1,1}`) 300 | """ 301 | # {2,3} or {2,} 302 | try: 303 | j = static(string, i, "{") 304 | min_, j = match_bound(string, j) 305 | j = static(string, j, ",") 306 | max_, j = match_bound(string, j) 307 | j = static(string, j, "}") 308 | return Multiplier(min_, max_), j 309 | except NoMatch: 310 | pass 311 | 312 | # {2} 313 | try: 314 | j = static(string, i, "{") 315 | min_, j = match_bound(string, j) 316 | j = static(string, j, "}") 317 | return Multiplier(min_, min_), j 318 | except NoMatch: 319 | pass 320 | 321 | # "?"/"*"/"+" 322 | for mult, symbol in symbolic.items(): 323 | if not symbol: 324 | continue 325 | try: 326 | return mult, static(string, i, symbol) 327 | except NoMatch: 328 | pass 329 | 330 | raise NoMatch 331 | 332 | 333 | def match_nonempty_multiplier(string: str, i: int) -> MatchResult[Multiplier]: 334 | multiplier, j = match_nonempty_greedy_multiplier(string, i) 335 | try: 336 | j = static(string, j, "?") 337 | except NoMatch: 338 | pass 339 | return multiplier, j 340 | 341 | 342 | def match_multiplier(string: str, i: int) -> MatchResult[Multiplier]: 343 | try: 344 | return match_nonempty_multiplier(string, i) 345 | except NoMatch: 346 | return ONE, i 347 | 348 | 349 | def match_mult(string: str, i: int) -> MatchResult[Mult]: 350 | multiplicand, j = match_multiplicand(string, i) 351 | multiplier, j = match_multiplier(string, j) 352 | return Mult(multiplicand, multiplier), j 353 | 354 | 355 | def match_conc(string: str, i: int) -> MatchResult[Conc]: 356 | mults = [] 357 | try: 358 | while True: 359 | m, i = match_mult(string, i) 360 | mults.append(m) 361 | except NoMatch: 362 | pass 363 | return Conc(*mults), i 364 | 365 | 366 | def match_pattern(string: str, i: int) -> MatchResult[Pattern]: 367 | concs = [] 368 | 369 | # first one 370 | c, i = match_conc(string, i) 371 | concs.append(c) 372 | 373 | # the rest 374 | while True: 375 | try: 376 | i = static(string, i, "|") 377 | c, i = match_conc(string, i) 378 | concs.append(c) 379 | except NoMatch: 380 | return Pattern(*concs), i 381 | 382 | 383 | def parse(string: str) -> Pattern: 384 | """ 385 | Parse a full string and return a `Pattern` object. 
Fail if
386 |     the whole string wasn't parsed
387 |     """
388 |     obj, i = match_pattern(string, 0)
389 |     if i != len(string):
390 |         raise NoMatch(f"Could not parse {string!r} beyond index {i}")
391 |     return obj
392 | 
--------------------------------------------------------------------------------
/greenery/charclass.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=fixme,too-many-locals,too-many-branches
2 | 
3 | from __future__ import annotations
4 | 
5 | __all__ = (
6 |     "Charclass",
7 |     "DIGIT",
8 |     "DOT",
9 |     "NONDIGITCHAR",
10 |     "NONSPACECHAR",
11 |     "NONWORDCHAR",
12 |     "NULLCHARCLASS",
13 |     "SPACECHAR",
14 |     "WORDCHAR",
15 |     "escapes",
16 |     "negate",
17 |     "shorthand",
18 |     "repartition",
19 | )
20 | 
21 | from dataclasses import dataclass
22 | from typing import ClassVar, Dict, Iterable, Iterator, List, Mapping, Tuple
23 | 
24 | NUM_UNICODE_CHARS = (1 << 16) + (1 << 20)
25 | 
26 | 
27 | def negate(ord_ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
28 |     u = 0
29 |     negated = []
30 |     for ord_range in ord_ranges:
31 |         if u < ord_range[0]:
32 |             negated.append((u, ord_range[0] - 1))
33 |         u = ord_range[1] + 1
34 |     if u < NUM_UNICODE_CHARS:
35 |         negated.append((u, NUM_UNICODE_CHARS - 1))
36 |     return negated
37 | 
38 | 
39 | def collapse_ord_ranges(ord_ranges: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
40 |     """
41 |     Sort the given ord ranges and merge any overlapping or adjacent ones,
42 |     so the result has no cases like [[12, 17], [2, 3]] or [[4, 6], [7, 8]].
43 |     """
44 |     collapsed: List[Tuple[int, int]] = []
45 | 
46 |     for ord_range in sorted(ord_ranges):
47 |         if not collapsed or collapsed[-1][1] + 1 < ord_range[0]:
48 |             collapsed.append(ord_range)
49 |         elif ord_range[1] > collapsed[-1][1]:
50 |             # merge into previous
51 |             collapsed[-1] = (collapsed[-1][0], ord_range[1])
52 | 
53 |     return collapsed
54 | 
55 | 
56 | @dataclass(frozen=True, init=False)
57 | class Charclass:
58 |     """
59 |     A `Charclass` is essentially a set of symbols, stored as inclusive
60 |     ranges of Unicode code points. A `Charclass` with the `negated` flag
61 |     set is assumed to contain every symbol in the full alphabet except
62 |     those explicitly listed, e.g. [^a]. This is very handy
63 |     if the full alphabet is extremely large, but also requires dedicated
64 |     combination functions.
65 |     """
66 | 
67 |     ord_ranges: List[Tuple[int, int]]
68 |     negated: bool
69 | 
70 |     def __init__(
71 |         self, ranges: str | Tuple[Tuple[str, str], ...] = "", negated: bool = False
72 |     ):
73 |         if isinstance(ranges, str):
74 |             ranges = tuple((char, char) for char in ranges)
75 |         if not isinstance(ranges, tuple):
76 |             raise TypeError(f"Bad ranges: {ranges!r}")
77 |         for r in ranges:
78 |             if len(r) != 2 or r[0] > r[1]:
79 |                 raise ValueError(f"Bad range: {r!r}")
80 |             for char in r:
81 |                 if not isinstance(char, str):
82 |                     raise TypeError(f"Can't put {char!r} in a `Charclass`", char)
83 |                 if len(char) != 1:
84 |                     raise ValueError("`Charclass` can only contain single chars", char)
85 | 
86 |         # Rebalance ranges!
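        # Convert the inclusive character ranges into inclusive ranges of
        # Unicode code points, then merge any overlapping or adjacent
        # ranges into a single sorted, canonical form.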
87 | ord_ranges = [(ord(first), ord(last)) for first, last in ranges] 88 | ord_ranges = collapse_ord_ranges(ord_ranges) 89 | 90 | object.__setattr__(self, "ord_ranges", tuple(ord_ranges)) 91 | object.__setattr__(self, "negated", negated) 92 | 93 | def __lt__(self, other: Charclass, /) -> bool: 94 | if self.negated < other.negated: 95 | return True 96 | if ( 97 | self.negated == other.negated 98 | and self.ord_ranges[0][0] < other.ord_ranges[0][0] 99 | ): 100 | return True 101 | return False 102 | 103 | def __eq__(self, other: object, /) -> bool: 104 | return ( 105 | isinstance(other, Charclass) 106 | and self.ord_ranges == other.ord_ranges 107 | and self.negated == other.negated 108 | ) 109 | 110 | def __hash__(self, /) -> int: 111 | return hash((self.ord_ranges, self.negated)) 112 | 113 | # These are the characters carrying special meanings when they appear 114 | # "outdoors" within a regular expression. To be interpreted literally, they 115 | # must be escaped with a backslash. 116 | allSpecial: ClassVar[frozenset[str]] = frozenset("\\[]|().?*+{}") 117 | 118 | # These are the characters carrying special meanings when they appear 119 | # INSIDE a character class (delimited by square brackets) within a regular 120 | # expression. To be interpreted literally, they must be escaped with a 121 | # backslash. Notice how much smaller this class is than the one above; note 122 | # also that the hyphen and caret do NOT appear above. 123 | classSpecial: ClassVar[frozenset[str]] = frozenset("\\[]^-") 124 | 125 | def __str__(self, /) -> str: 126 | # pylint: disable=too-many-return-statements 127 | 128 | # e.g. \w 129 | if self in shorthand: 130 | return shorthand[self] 131 | 132 | # e.g. [^a] 133 | if self.negated: 134 | return f"[^{self.escape()}]" 135 | 136 | # single character, not contained inside square brackets. 137 | if len(self.ord_ranges) == 1 and self.ord_ranges[0][0] == self.ord_ranges[0][1]: 138 | u = self.ord_ranges[0][0] 139 | char = chr(u) 140 | 141 | # e.g. if char is "\t", return "\\t" 142 | if char in escapes: 143 | return escapes[char] 144 | 145 | if char in Charclass.allSpecial: 146 | return f"\\{char}" 147 | 148 | # If char is an ASCII control character, don't print it directly, 149 | # return a hex escape sequence e.g. "\\x00". Note that this 150 | # includes tab and other characters already handled above 151 | if 0 <= u <= 0x1F or u == 0x7F: 152 | return f"\\x{u:02x}" 153 | 154 | return char 155 | 156 | # multiple characters (or possibly 0 characters) 157 | return f"[{self.escape()}]" 158 | 159 | def escape(self, /) -> str: 160 | def escape_char(char: str, /) -> str: 161 | if char in Charclass.classSpecial: 162 | return f"\\{char}" 163 | if char in escapes: 164 | return escapes[char] 165 | 166 | # If char is an ASCII control character, don't print it directly, 167 | # return a hex escape sequence e.g. "\\x00". Note that this 168 | # includes tab and other characters already handled above 169 | if 0 <= ord(char) <= 0x1F or ord(char) == 0x7F: 170 | return f"\\x{ord(char):02x}" 171 | 172 | return char 173 | 174 | output = "" 175 | 176 | for first_u, last_u in self.ord_ranges: 177 | # there's no point in putting a range when the whole thing is 178 | # 3 characters or fewer. 
"abc" -> "abc" but "abcd" -> "a-d" 179 | if last_u <= first_u + 2: 180 | # "a" or "ab" or "abc" or "abcd" 181 | for u in range(first_u, last_u + 1): 182 | output += escape_char(chr(u)) 183 | else: 184 | # "a-b" or "a-c" or "a-d" 185 | output += escape_char(chr(first_u)) + "-" + escape_char(chr(last_u)) 186 | 187 | return output 188 | 189 | def __repr__(self, /) -> str: 190 | sign = "~" if self.negated else "" 191 | ranges = tuple( 192 | (chr(first_u), chr(last_u)) for (first_u, last_u) in self.ord_ranges 193 | ) 194 | return f"{sign}Charclass({ranges!r})" 195 | 196 | def reduce(self, /) -> Charclass: 197 | # `Charclass`es cannot be reduced. 198 | return self 199 | 200 | def empty(self, /) -> bool: 201 | return not self.ord_ranges and not self.negated 202 | 203 | # set operations 204 | def negate(self, /) -> Charclass: 205 | """ 206 | Negate the current `Charclass`. e.g. [ab] becomes [^ab]. Call 207 | using "charclass2 = ~charclass1" 208 | """ 209 | ranges = tuple( 210 | (chr(first_u), chr(last_u)) for (first_u, last_u) in self.ord_ranges 211 | ) 212 | return Charclass(ranges, negated=not self.negated) 213 | 214 | def __invert__(self, /) -> Charclass: 215 | return self.negate() 216 | 217 | def get_chars(self, /) -> Iterator[str]: 218 | """ 219 | Use this with caution, it can iterate over 1,000,000+ characters 220 | """ 221 | for first_u, last_u in self.ord_ranges: 222 | for u in range(first_u, last_u + 1): 223 | yield chr(u) 224 | 225 | def num_chars(self, /) -> int: 226 | num = 0 227 | for first_u, last_u in self.ord_ranges: 228 | num += last_u + 1 - first_u 229 | return NUM_UNICODE_CHARS - num if self.negated else num 230 | 231 | def accepts(self, char: str, /) -> bool: 232 | u = ord(char) 233 | for first_u, last_u in self.ord_ranges: 234 | if first_u <= u <= last_u: 235 | return not self.negated 236 | return self.negated 237 | 238 | def reversed(self, /) -> Charclass: 239 | return self 240 | 241 | def union(self, other: Charclass, /) -> Charclass: 242 | # TODO: make this able to efficiently unite many Charclasses at once, 243 | # again 244 | self_ord_ranges = list(self.ord_ranges) 245 | if self.negated: 246 | self_ord_ranges = negate(self_ord_ranges) 247 | 248 | other_ord_ranges = list(other.ord_ranges) 249 | if other.negated: 250 | other_ord_ranges = negate(other_ord_ranges) 251 | 252 | new_ord_ranges = [] 253 | new_ord_ranges.extend(self_ord_ranges) 254 | new_ord_ranges.extend(other_ord_ranges) 255 | new_ord_ranges = collapse_ord_ranges(new_ord_ranges) 256 | 257 | new_negated = self.negated or other.negated 258 | if new_negated: 259 | new_ord_ranges = negate(new_ord_ranges) 260 | new_ranges = tuple( 261 | (chr(first_u), chr(last_u)) for (first_u, last_u) in new_ord_ranges 262 | ) 263 | return Charclass(new_ranges, new_negated) 264 | 265 | __or__ = union 266 | 267 | def issubset(self, other: Charclass, /) -> bool: 268 | return self | other == other 269 | 270 | def intersection(self, other: Charclass, /) -> Charclass: 271 | # TODO: is this actually efficient? 272 | # TODO: make this able to efficiently intersect many Charclasses at once, 273 | # again 274 | return ~(~self | ~other) 275 | 276 | __and__ = intersection 277 | 278 | 279 | # Standard character classes 280 | WORDCHAR = Charclass("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz") 281 | DIGIT = Charclass("0123456789") 282 | SPACECHAR = Charclass("\t\n\v\f\r ") 283 | 284 | # This `Charclass` expresses "no possibilities at all" 285 | # and can never match anything. 
286 | NULLCHARCLASS = Charclass() 287 | 288 | NONWORDCHAR = ~WORDCHAR 289 | NONDIGITCHAR = ~DIGIT 290 | NONSPACECHAR = ~SPACECHAR 291 | DOT = ~NULLCHARCLASS 292 | 293 | # Textual representations of standard character classes 294 | shorthand: Mapping[Charclass, str] = { 295 | WORDCHAR: "\\w", 296 | DIGIT: "\\d", 297 | SPACECHAR: "\\s", 298 | NONWORDCHAR: "\\W", 299 | NONDIGITCHAR: "\\D", 300 | NONSPACECHAR: "\\S", 301 | DOT: ".", 302 | } 303 | 304 | # Characters which users may escape in a regex instead of inserting them 305 | # literally. In ASCII order: 306 | escapes: Mapping[str, str] = { 307 | "\t": "\\t", # tab 308 | "\n": "\\n", # line feed 309 | "\v": "\\v", # vertical tab 310 | "\f": "\\f", # form feed 311 | "\r": "\\r", # carriage return 312 | } 313 | 314 | 315 | def repartition( 316 | charclasses: Iterable[Charclass], 317 | ) -> Mapping[Charclass, Iterable[Charclass]]: 318 | """ 319 | Accept an iterable of `Charclass`es which may overlap somewhat. 320 | Construct a minimal collection of `Charclass`es which partition the space 321 | of all possible characters and can be combined to create all of the 322 | originals. 323 | Return a map from each original `Charclass` to its constituent pieces. 324 | """ 325 | ord_range_boundaries = set() 326 | for charclass in charclasses: 327 | for first_u, last_u in charclass.ord_ranges: 328 | ord_range_boundaries.add(first_u) 329 | ord_range_boundaries.add(last_u + 1) 330 | ord_range_boundaries_2 = sorted(ord_range_boundaries) 331 | 332 | ord_ranges = [] 333 | for i, ord_range_boundary in enumerate(ord_range_boundaries_2): 334 | if i + 1 < len(ord_range_boundaries_2): 335 | ord_ranges.append((ord_range_boundary, ord_range_boundaries_2[i + 1] - 1)) 336 | 337 | # Group all of the possible ranges by "signature". 338 | # A signature is a tuple of Booleans telling us which character classes 339 | # a particular range is mentioned in. 340 | # (Whether it's *accepted* is actually not relevant.) 341 | signatures: Dict[Tuple[bool, ...], List[Tuple[int, int]]] = {} 342 | for ord_range in ord_ranges: 343 | signature = [] 344 | for charclass in charclasses: 345 | ord_range_in_charclass = False 346 | for x in charclass.ord_ranges: 347 | if x[0] <= ord_range[0] and ord_range[1] <= x[1]: 348 | ord_range_in_charclass = True 349 | break 350 | signature.append(ord_range_in_charclass) 351 | signature2 = tuple(signature) 352 | if signature2 not in signatures: 353 | signatures[signature2] = [] 354 | signatures[signature2].append(ord_range) 355 | 356 | # From the signatures we can gather the new Charclasses 357 | newcharclasses = [] 358 | newcharclasses.append( 359 | ~Charclass( 360 | tuple((chr(first_u), chr(last_u)) for (first_u, last_u) in ord_ranges) 361 | ) 362 | ) 363 | for ord_ranges2 in signatures.values(): 364 | newcharclasses.append( 365 | Charclass( 366 | tuple((chr(first_u), chr(last_u)) for (first_u, last_u) in ord_ranges2) 367 | ) 368 | ) 369 | 370 | # Now compute the breakdowns 371 | partition: Dict[Charclass, List[Charclass]] = {} 372 | for charclass in charclasses: 373 | partition[charclass] = [] 374 | for newcharclass in newcharclasses: 375 | if newcharclass.issubset(charclass): 376 | partition[charclass].append(newcharclass) 377 | 378 | return partition 379 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # greenery 2 | 3 | Tools for parsing and manipulating regular expressions. 
Note that this is a very different concept from that of simply *creating and using* those regular expressions, functionality which is present in basically every programming language in the world, [Python included](http://docs.python.org/library/re.html). 4 | 5 | This project was undertaken because I wanted to be able to **compute the intersection between two regular expressions**. The "intersection" is the set of strings which both regular expressions will accept, represented as a third regular expression. 6 | 7 | ## Installation 8 | 9 | ```sh 10 | pip install greenery 11 | ``` 12 | 13 | ## Example 14 | 15 | ```python 16 | from greenery import parse 17 | 18 | print(parse("abc...") & parse("...def")) 19 | # "abcdef" 20 | 21 | print(parse("\d{4}-\d{2}-\d{2}") & parse("19.*")) 22 | # "19\d{2}-\d{2}-\d{2}" 23 | 24 | print(parse("\W*") & parse("[a-g0-8$%\^]+") & parse("[^d]{2,8}")) 25 | # "[$%\^]{2,8}" 26 | 27 | print(parse("[bc]*[ab]*") & parse("[ab]*[bc]*")) 28 | # "([ab]*a|[bc]*c)?b*" 29 | 30 | print(parse("a*") & parse("b*")) 31 | # "" 32 | 33 | print(parse("a") & parse("b")) 34 | # "[]" 35 | ``` 36 | 37 | In the penultimate example, the empty string is returned, because only the empty string is in both of the regular languages `a*` and `b*`. In the final example, an empty character class has been returned. An empty character class can never match anything, which means `greenery` can use this to represent a regular expression which matches no strings at all. Note that this is different from only matching the empty string. 38 | 39 | Internally, `greenery` works by converting regular expressions to finite state machines, computing the intersection of the two FSMs as a third FSM, and using the Brzozowski algebraic method (*q.v.*) to convert the third FSM back to a regular expression. 40 | 41 | ## API 42 | 43 | ### parse(string) 44 | 45 | This function takes a regular expression (_i.e._ a string) as input and returns a `Pattern` object (see below) representing that regular expression. 46 | 47 | The following metacharacters and formations have their usual meanings: `.`, `*`, `+`, `?`, `{m}`, `{m,}`, `{m,n}`, `()`, `|`, `[]`, `^` within `[]` character ranges only, `-` within `[]` character ranges only, and `\` to escape any of the preceding characters or itself. 48 | 49 | These character escapes are possible: `\t`, `\r`, `\n`, `\f`, `\v`. 50 | 51 | These predefined character sets also have their usual meanings: `\w`, `\d`, `\s` and their negations `\W`, `\D`, `\S`. `.` matches any character, including new line characters and carriage returns. 52 | 53 | An empty charclass `[]` is legal and matches no characters: when used in a regular expression, the regular expression may match no strings. 54 | 55 | #### Unsupported constructs 56 | 57 | * This method is intentionally rigorously simple, and tolerates no ambiguity. For example, a hyphen must be escaped in a character class even if it appears first or last. `[-abc]` is a syntax error, write `[\-abc]`. Escaping something which doesn't need it is a syntax error too: `[\ab]` resolves to neither `[\\ab]` nor `[ab]`. 58 | 59 | * The `^` and `$` metacharacters are not supported. By default, `greenery` assumes that all regexes are anchored at the start and end of any input string. Carets and dollar signs will be parsed as themselves. If you want to *not* anchor at the start or end of the string, put `.*` at the start or end of your regex respectively. 
60 | 61 | This is because computing the intersection between `.*a.*` and `.*b.*` (1) is largely pointless and (2) usually results in gibberish coming out of the program. 62 | 63 | * The non-greedy operators `*?`, `+?`, `??` and `{m,n}?` are permitted but do nothing. This is because they do not alter the regular language. For example, `abc{0,5}def` and `abc{0,5}?def` represent precisely the same set of strings. 64 | 65 | * Parentheses are used to alternate between multiple possibilities e.g. `(a|bc)` only, not for capture grouping. Here's why: 66 | 67 | ```python 68 | print(parse("(ab)c") & parse("a(bc)")) 69 | # "abc" 70 | ``` 71 | 72 | * The `(?:...)` syntax for non-capturing groups is permitted, but does nothing. 73 | 74 | * Other `(?...)` constructs are not supported (and most are not [regular in the computer science sense](http://en.wikipedia.org/wiki/Regular_language)). 75 | 76 | * Back-references, such as `([aeiou])\1`, are not regular. 77 | 78 | ### Pattern 79 | 80 | A `Pattern` represents a regular expression and exposes various methods for manipulating it and combining it with other regular expressions. `Pattern`s are immutable. 81 | 82 | A regular language is a possibly-infinite set of strings. With this in mind, `Pattern` implements numerous [methods like those on `frozenset`](https://docs.python.org/3/library/stdtypes.html#frozenset), as well as many regular expression-specific methods. 83 | 84 | It's not intended that you construct new `Pattern` instances directly; use `parse(string)`, above. 85 | 86 | Method | Behaviour 87 | ---|--- 88 | `pattern.matches("a")`
`"a" in pattern` | Returns `True` if the regular expression matches the string or `False` if not. 89 | `pattern.strings()`
`for string in pattern` | Returns a generator of all the strings that this regular expression matches. 90 | `pattern.empty()` | Returns `True` if this regular expression matches no strings, otherwise `False`. 91 | `pattern.cardinality()`
`len(pattern)` | Returns the number of strings which the regular expression matches. Throws an `OverflowError` if this number is infinite. 92 | `pattern1.equivalent(pattern2)` | Returns `True` if the two regular expressions match exactly the same strings, otherwise `False`. 93 | `pattern.copy()` | Returns a shallow copy of `pattern`. 94 | `pattern.everythingbut()` | Returns a regular expression which matches every string not matched by the original. `pattern.everythingbut().everythingbut()` matches the same strings as `pattern`, but is not necessarily identical in structure. 95 | `pattern.reversed()`
`reversed(pattern)` | Returns a reversed regular expression. For each string that `pattern` matched, `reversed(pattern)` will match the reversed string. `reversed(reversed(pattern))` matches the same strings as `pattern`, but is not necessarily identical. 96 | `pattern.times(star)`
`pattern * star` | Returns the input regular expression multiplied by any `Multiplier` (see below). 97 | `pattern1.concatenate(pattern2, ...)`
`pattern1 + pattern2 + ...` | Returns a regular expression which matches any string of the form *a·b·...* where *a* is a string matched by `pattern1`, *b* is a string matched by `pattern2` and so on. 98 | `pattern1.union(pattern2, ...)`
`pattern1 \| pattern2 \| ...` | Returns a regular expression matching any string matched by any of the input regular expressions. This is also called *alternation*. 99 | `pattern1.intersection(pattern2, ...)`
`pattern1 & pattern2 & ...` | Returns a regular expression matching any string matched by all input regular expressions. The successful implementation of this method was the ultimate goal of this entire project. 100 | `pattern1.difference(pattern2, ...)`
`pattern1 - pattern2 - ...` | Subtract the set of strings matched by `pattern2` onwards from those matched by `pattern1` and return the resulting regular expression. 101 | `pattern1.symmetric_difference(pattern2, ...)`
`pattern1 ^ pattern2 ^ ...` | Returns a regular expression matching any string accepted by `pattern1` or `pattern2` but not both.
102 | `pattern.derive("a")` | Return the [Brzozowski derivative](https://en.wikipedia.org/wiki/Brzozowski_derivative) of the input regular expression with respect to "a".
103 | `pattern.reduce()` | Returns a regular expression which is equivalent to `pattern` (*i.e.* matches exactly the same strings) but is simplified as far as possible. See dedicated section below.
104 | 
105 | #### pattern.reduce()
106 | 
107 | Call this method to try to simplify the regular expression object. The following simplification heuristics are supported:
108 | 
109 | * `(ab|cd|ef|)g` to `(ab|cd|ef)?g`
110 | * `([ab])*` to `[ab]*`
111 | * `ab?b?c` to `ab{0,2}c`
112 | * `aa` to `a{2}`
113 | * `a(d(ab|a*c))` to `ad(ab|a*c)`
114 | * `0|[2-9]` to `[02-9]`
115 | * `abc|ade` to `a(bc|de)`
116 | * `xyz|stz` to `(xy|st)z`
117 | * `abc()def` to `abcdef`
118 | * `a{1,2}|a{3,4}` to `a{1,4}`
119 | 
120 | The value returned is a new `Pattern` object.
121 | 
122 | Note that in a few cases this does *not* result in a shorter regular expression.
123 | 
124 | ### Multiplier
125 | 
126 | A combination of a finite lower `Bound` (see below) and a possibly-infinite upper `Bound`.
127 | 
128 | ```python
129 | from greenery import parse, Bound, INF, Multiplier
130 | 
131 | print(parse("a") * Multiplier(Bound(3), INF))  # "a{3,}"
132 | ```
133 | 
134 | ### STAR
135 | 
136 | Special `Multiplier`, equal to `Multiplier(Bound(0), INF)`. When it appears in a regular expression, this is `{0,}` or the [Kleene star](https://en.wikipedia.org/wiki/Kleene_star) `*`.
137 | 
138 | ### QM
139 | 
140 | Special `Multiplier`, equal to `Multiplier(Bound(0), Bound(1))`. When it appears in a regular expression, this is `{0,1}` or `?`.
141 | 
142 | ### PLUS
143 | 
144 | Special `Multiplier`, equal to `Multiplier(Bound(1), INF)`. When it appears in a regular expression, this is `{1,}` or `+`.
145 | 
146 | ### Bound
147 | 
148 | Represents a non-negative integer or infinity.
149 | 
150 | ### INF
151 | 
152 | Special `Bound` representing no limit. Can be used as an upper bound only.
153 | 
154 | ### Charclass
155 | 
156 | This class represents a _character class_ such as `a`, `\w`, `.`, `[A-Za-z0-9_]`, and so on. `Charclass`es must be constructed longhand, either using a string containing all the desired characters, or a tuple of ranges, where each range is a pair of characters to be used as the range's inclusive endpoints. Use `~` to negate a `Charclass`.
157 | 
158 | * `a` = `Charclass("a")`
159 | * `[abyz]` = `Charclass("abyz")`
160 | * `[a-z]` = `Charclass("abcdefghijklmnopqrstuvwxyz")` or `Charclass((("a", "z"),))`
161 | * `\w` = `Charclass((("a", "z"), ("A", "Z"), ("0", "9"), ("_", "_")))`
162 | * `[^x]` = `~Charclass("x")`
163 | * `\D` = `~Charclass("0123456789")`
164 | * `.` = `~Charclass(())`
165 | 
166 | ### Fsm
167 | 
168 | An `Fsm` is a finite state machine which accepts strings (or more generally iterables of Unicode characters) as input. This is used internally by `Pattern` for most regular expression manipulation operations.
169 | 
170 | In theory, accepting strings as input means that every `Fsm`'s alphabet is the same: the set of all 1,114,112 possible Unicode characters which can make up a string. But this is a very large alphabet and would result in extremely large transition maps and very poor performance. So, in practice, `Fsm` uses not single characters but `Charclass`es (see above) for its alphabet and its map transitions.
171 | 172 | ```python 173 | # FSM accepting only the string "a" 174 | a = Fsm( 175 | alphabet={Charclass("a"), ~Charclass("a")}, 176 | states={0, 1, 2}, 177 | initial=0, 178 | finals={1}, 179 | map={ 180 | 0: {Charclass("a"): 1, ~Charclass("a"): 2}, 181 | 1: {Charclass("a"): 2, ~Charclass("a"): 2}, 182 | 2: {Charclass("a"): 2, ~Charclass("a"): 2}, 183 | }, 184 | ) 185 | ``` 186 | 187 | Notes: 188 | 189 | * The `Charclass`es which make up the alphabet must _partition_ the space of all Unicode characters - every Unicode character must be a member of exactly one `Charclass` in the alphabet. 190 | * States must be integers. 191 | * The map must be complete. Omitting transition symbols or states is not permitted. 192 | 193 | A regular language is a possibly-infinite set of strings. With this in mind, `Fsm` implements several [methods like those on `frozenset`](https://docs.python.org/3/library/stdtypes.html#frozenset). 194 | 195 | Method | Behaviour 196 | ---|--- 197 | `fsm.accepts("a")` | Returns `True` if the FSM accepts string or `False` if not. 198 | `fsm.strings()` | Returns a generator of all the strings which this FSM accepts. 199 | `fsm.empty()` | Returns `True` if this FSM accepts no strings, otherwise `False`. 200 | `fsm.cardinality()` | Returns the number of strings which the FSM accepts. Throws an `OverflowError` if this number is infinite. 201 | `fsm1.equivalent(fsm2)` | Returns `True` if the two FSMs accept exactly the same strings, otherwise `False`. 202 | `fsm.copy()` | Returns a shallow copy of `fsm`. 203 | `fsm.everythingbut()` | Returns an FSM which accepts every string not matched by the original. `fsm.everythingbut().everythingbut()` matches the same strings as `fsm`. 204 | `fsm1.concatenate(fsm2, ...)` | Returns an FSM which accepts any string of the form *a·b·...* where *a* is a string accepted by `fsm1`, *b* is a string accepted by `fsm2` and so on. 205 | `fsm.times(multiplier)` | Returns the input FSM concatenated with itself `multiplier` times. `multiplier` must be a non-negative integer. 206 | `fsm.star()` | Returns an FSM which is the Kleene star closure of the original. 207 | `fsm1.union(fsm2, ...)` | Returns an FSM accepting any string matched by any of the input FSMs. This is also called *alternation*. 208 | `fsm1.intersection(fsm2, ...)` | Returns an FSM accepting any string matched by all input FSMs. 209 | `fsm1.difference(fsm2, ...)` | Subtract the set of strings matched by `fsm2` onwards from those matched by `fsm1` and return the resulting FSM. 210 | `fsm1.symmetric_difference(fsm2, ...)` | Returns an FSM matching any string accepted by `fsm1` or `fsm2` but not both. 211 | `fsm.derive(string)` | Return the [Brzozowski derivative](https://en.wikipedia.org/wiki/Brzozowski_derivative) of the input FSM with respect to the input string. 212 | `fsm.reduce()` | Returns an FSM which is equivalent to `fsm` (*i.e.* accepts exactly the same strings) but has a minimal number of states. 213 | 214 | Note that methods combining FSMs usually output new FSMs with modified alphabets. For example, concatenating an FSM with alphabet `{Charclass("a"), ~Charclass("a")}` and another FSM with alphabet `{Charclass("abc"), ~Charclass("abc")}` usually results in a third FSM with a _repartitioned_ alphabet of `{Charclass("a"), Charclass("bc"), ~Charclass("abc")}`. Notice how all three alphabets partition the space of all Unicode characters. 215 | 216 | Several other methods on `Fsm` instances are available - these should not be used, they're subject to change. 
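As a rough, illustrative sketch of how the set-like methods above combine `Fsm`s, the snippet below reuses the `a` FSM constructed earlier in this section and builds an analogous FSM accepting only the string `"b"`. The names `b` and `either` are just illustrative, and the repartitioned alphabet mentioned in the final comment is an example of the behaviour described above rather than a guaranteed output.

```python
from greenery import Charclass, Fsm

# An FSM accepting only the string "b", built the same way as `a` above.
b = Fsm(
    alphabet={Charclass("b"), ~Charclass("b")},
    states={0, 1, 2},
    initial=0,
    finals={1},
    map={
        0: {Charclass("b"): 1, ~Charclass("b"): 2},
        1: {Charclass("b"): 2, ~Charclass("b"): 2},
        2: {Charclass("b"): 2, ~Charclass("b"): 2},
    },
)

either = a.union(b)               # accepts "a" and "b" and nothing else
print(either.accepts("a"))        # True
print(either.accepts("b"))        # True
print(either.accepts("ab"))       # False
print(a.intersection(b).empty())  # True: no string is accepted by both

# The combined FSM's alphabet is repartitioned along the lines described
# above, e.g. {Charclass("a"), Charclass("b"), ~Charclass("ab")}.
```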
217 | 218 | ### EPSILON 219 | 220 | Special `Fsm` which accepts only the empty string. 221 | 222 | ### NULL 223 | 224 | Special `Fsm` which accepts no strings. 225 | 226 | ## Development 227 | 228 | ### Running tests 229 | 230 | ```sh 231 | pip install -r requirements.dev.txt 232 | isort . 233 | black . 234 | mypy greenery 235 | flake8 --count --statistics --show-source --select=E9,F63,F7,F82 . 236 | flake8 --count --statistics --exit-zero --max-complexity=10 . 237 | pylint --recursive=true . 238 | pytest 239 | ``` 240 | 241 | ### Building and publishing new versions 242 | 243 | * Update the version in `./setup.py` 244 | * Trash `./dist` 245 | * `python -m build` - creates a `./dist` directory with some stuff in it 246 | * `python -m twine upload dist/*` 247 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-allow-list= 7 | math, 8 | 9 | # A comma-separated list of package or module names from where C extensions may 10 | # be loaded. Extensions are loading into the active Python interpreter and may 11 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 12 | # for backward compatibility.) 13 | extension-pkg-whitelist= 14 | 15 | # Return non-zero exit code if any of these messages/categories are detected, 16 | # even if score is above --fail-under value. Syntax same as enable. Messages 17 | # specified are enabled, while categories only check already-enabled messages. 18 | fail-on= 19 | 20 | # Specify a score threshold to be exceeded before program exits with error. 21 | fail-under=10.0 22 | 23 | # Files or directories to be skipped. They should be base names, not paths. 24 | ignore= 25 | dist, 26 | env, 27 | venv, 28 | 29 | # Add files or directories matching the regex patterns to the ignore-list. The 30 | # regex matches against paths. 31 | ignore-paths= 32 | 33 | # Files or directories matching the regex patterns are skipped. The regex 34 | # matches against base names, not paths. 35 | ignore-patterns= 36 | # Anything beginning with a dot. 37 | ^\..+$, 38 | # Anything ending in `.venv` (e.g. dev.venv) 39 | \.venv$, 40 | # Anything ending in `.egg-info` 41 | \.egg-info$, 42 | # Anything like __pycache__ or __mypycache__ or whatever. 43 | ^__.*cache.*__$, 44 | 45 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 46 | # number of processors available to use. 47 | jobs=0 48 | 49 | # Control the amount of potential inferred values when inferring a single 50 | # object. This can help the performance when dealing with large functions or 51 | # complex, nested conditions. 52 | limit-inference-results=100 53 | 54 | # List of plugins (as comma separated values of python module names) to load, 55 | # usually to register additional checkers. 
56 | load-plugins= 57 | pylint.extensions.bad_builtin, 58 | pylint.extensions.broad_try_clause, 59 | pylint.extensions.check_elif, 60 | pylint.extensions.code_style, 61 | pylint.extensions.comparison_placement, 62 | pylint.extensions.confusing_elif, 63 | pylint.extensions.consider_ternary_expression, 64 | pylint.extensions.docparams, 65 | pylint.extensions.docstyle, 66 | pylint.extensions.empty_comment, 67 | pylint.extensions.eq_without_hash, 68 | pylint.extensions.for_any_all, 69 | pylint.extensions.mccabe, 70 | pylint.extensions.no_self_use, 71 | pylint.extensions.overlapping_exceptions, 72 | pylint.extensions.private_import, 73 | pylint.extensions.redefined_loop_name, 74 | pylint.extensions.redefined_variable_type, 75 | pylint.extensions.set_membership, 76 | pylint.extensions.typing, 77 | pylint.extensions.while_used, 78 | 79 | # Pickle collected data for later comparisons. 80 | persistent=yes 81 | 82 | # Min Python version to use for version dependend checks. Will default to the 83 | # version used to run pylint. 84 | py-version=3.8 85 | 86 | # When enabled, pylint would attempt to guess common misconfiguration and emit 87 | # user-friendly hints instead of false-positive error messages. 88 | suggestion-mode=yes 89 | 90 | # Allow loading of arbitrary C extensions. Extensions are imported into the 91 | # active Python interpreter and may run arbitrary code. 92 | unsafe-load-any-extension=no 93 | 94 | 95 | [MESSAGES CONTROL] 96 | 97 | # Only show warnings with the listed confidence levels. Leave empty to show 98 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 99 | confidence= 100 | 101 | # Disable the message, report, category or checker with the given id(s). You 102 | # can either give multiple identifiers separated by comma (,) or put this 103 | # option multiple times (only on the command line, not in the configuration 104 | # file where it should appear only once). You can also use "--disable=all" to 105 | # disable everything first and then reenable specific checks. For example, if 106 | # you want to run only the similarities checker, you can use "--disable=all 107 | # --enable=similarities". If you want to run only the classes checker, but have 108 | # no Warning level messages displayed, use "--disable=all --enable=classes 109 | # --disable=W". 110 | disable= 111 | consider-using-assignment-expr, 112 | consider-using-augmented-assign, 113 | docstring-first-line-empty, 114 | duplicate-code, 115 | file-ignored, 116 | locally-disabled, 117 | missing-function-docstring, 118 | missing-module-docstring, 119 | no-method-argument, 120 | no-self-argument, 121 | similarities, 122 | suppressed-message, 123 | too-complex, 124 | too-few-public-methods, 125 | while-used, 126 | 127 | # Enable the message, report, category or checker with the given id(s). You can 128 | # either give multiple identifier separated by comma (,) or put this option 129 | # multiple time (only on the command line, not in the configuration file where 130 | # it should appear only once). See also the "--disable" option for examples. 131 | enable= 132 | bad-inline-option, 133 | c-extension-no-member, 134 | deprecated-pragma, 135 | raw-checker-failed, 136 | use-symbolic-message-instead, 137 | useless-suppression, 138 | use-implicit-booleaness-not-comparison-to-zero, 139 | use-implicit-booleaness-not-comparison-to-string, 140 | 141 | 142 | [REPORTS] 143 | 144 | # Python expression which should return a score less than or equal to 10. 
You 145 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 146 | # which contain the number of messages in each category, as well as 'statement' 147 | # which is the total number of statements analyzed. This score is used by the 148 | # global evaluation report (RP0004). 149 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 150 | 151 | # Template used to display messages. This is a python new-style format string 152 | # used to format the message information. See doc for all details. 153 | #msg-template= 154 | 155 | # Set the output format. Available formats are text, parseable, colorized, json 156 | # and msvs (visual studio). You can also give a reporter class, e.g. 157 | # mypackage.mymodule.MyReporterClass. 158 | output-format=text 159 | 160 | # Tells whether to display a full report or only the messages. 161 | reports=no 162 | 163 | # Activate the evaluation score. 164 | score=no 165 | 166 | 167 | [REFACTORING] 168 | 169 | # Maximum number of nested blocks for function / method body 170 | max-nested-blocks=5 171 | 172 | # Complete name of functions that never returns. When checking for 173 | # inconsistent-return-statements if a never returning function is called then 174 | # it will be considered as an explicit return statement and no message will be 175 | # printed. 176 | never-returning-functions=sys.exit,argparse.parse_error 177 | 178 | 179 | [LOGGING] 180 | 181 | # The type of string formatting that logging methods do. `old` means using % 182 | # formatting, `new` is for `{}` formatting. 183 | logging-format-style=old 184 | 185 | # Logging modules to check that the string format arguments are in logging 186 | # function parameter format. 187 | logging-modules=logging 188 | 189 | 190 | [SPELLING] 191 | 192 | # Limits count of emitted suggestions for spelling mistakes. 193 | max-spelling-suggestions=4 194 | 195 | # Spelling dictionary name. Available dictionaries: none. To make it work, 196 | # install the 'python-enchant' package. 197 | spelling-dict= 198 | 199 | # List of comma separated words that should be considered directives if they 200 | # appear and the beginning of a comment and should not be checked. 201 | spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: 202 | 203 | # List of comma separated words that should not be checked. 204 | spelling-ignore-words= 205 | 206 | # A path to a file that contains the private dictionary; one word per line. 207 | spelling-private-dict-file= 208 | 209 | # Tells whether to store unknown words to the private dictionary (see the 210 | # --spelling-private-dict-file option) instead of raising a message. 211 | spelling-store-unknown-words=no 212 | 213 | 214 | [MISCELLANEOUS] 215 | 216 | # List of note tags to take in consideration, separated by a comma. 217 | notes=FIXME, 218 | XXX, 219 | TODO 220 | 221 | # Regular expression of note tags to take in consideration. 222 | #notes-rgx= 223 | 224 | 225 | [TYPECHECK] 226 | 227 | # List of decorators that produce context managers, such as 228 | # contextlib.contextmanager. Add to this list to register other decorators that 229 | # produce valid context managers. 230 | contextmanager-decorators=contextlib.contextmanager 231 | 232 | # List of members which are set dynamically and missed by pylint inference 233 | # system, and so shouldn't trigger E1101 when accessed. Python regular 234 | # expressions are accepted. 
235 | generated-members= 236 | 237 | # Tells whether missing members accessed in mixin class should be ignored. A 238 | # mixin class is detected if its name ends with "mixin" (case insensitive). 239 | ignore-mixin-members=yes 240 | 241 | # Tells whether to warn about missing members when the owner of the attribute 242 | # is inferred to be None. 243 | ignore-none=yes 244 | 245 | # This flag controls whether pylint should warn about no-member and similar 246 | # checks whenever an opaque object is returned when inferring. The inference 247 | # can return multiple potential results while evaluating a Python object, but 248 | # some branches might not be evaluated, which results in partial inference. In 249 | # that case, it might be useful to still emit no-member and other checks for 250 | # the rest of the inferred objects. 251 | ignore-on-opaque-inference=yes 252 | 253 | # List of class names for which member attributes should not be checked (useful 254 | # for classes with dynamically set attributes). This supports the use of 255 | # qualified names. 256 | ignored-classes=optparse.Values,thread._local,_thread._local 257 | 258 | # List of module names for which member attributes should not be checked 259 | # (useful for modules/projects where namespaces are manipulated during runtime 260 | # and thus existing member attributes cannot be deduced by static analysis). It 261 | # supports qualified module names, as well as Unix pattern matching. 262 | ignored-modules= 263 | 264 | # Show a hint with possible names when a member name was not found. The aspect 265 | # of finding the hint is based on edit distance. 266 | missing-member-hint=yes 267 | 268 | # The minimum edit distance a name should have in order to be considered a 269 | # similar match for a missing member name. 270 | missing-member-hint-distance=1 271 | 272 | # The total number of similar names that should be taken in consideration when 273 | # showing a hint for a missing member. 274 | missing-member-max-choices=1 275 | 276 | # List of decorators that change the signature of a decorated function. 277 | signature-mutators= 278 | 279 | 280 | [VARIABLES] 281 | 282 | # List of additional names supposed to be defined in builtins. Remember that 283 | # you should avoid defining new builtins when possible. 284 | additional-builtins= 285 | 286 | # Tells whether unused global variables should be treated as a violation. 287 | allow-global-unused-variables=yes 288 | 289 | # List of names allowed to shadow builtins 290 | allowed-redefined-builtins= 291 | 292 | # List of strings which can identify a callback function by name. A callback 293 | # name must start or end with one of those strings. 294 | callbacks=cb_, 295 | _cb 296 | 297 | # A regular expression matching the name of dummy variables (i.e. expected to 298 | # not be used). 299 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 300 | 301 | # Argument names that match this expression will be ignored. Default to name 302 | # with leading underscore. 303 | ignored-argument-names=_.*|^ignored_|^unused_ 304 | 305 | # Tells whether we should check for unused import in __init__ files. 306 | init-import=no 307 | 308 | # List of qualified module names which can have objects that can redefine 309 | # builtins. 310 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 311 | 312 | 313 | [FORMAT] 314 | 315 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 
316 | expected-line-ending-format= 317 | 318 | # Regexp for a line that is allowed to be longer than the limit. 319 | ignore-long-lines=^\s*(# )??$ 320 | 321 | # Number of spaces of indent required inside a hanging or continued line. 322 | indent-after-paren=4 323 | 324 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 325 | # tab). 326 | indent-string=' ' 327 | 328 | # Maximum number of characters on a single line. 329 | max-line-length=88 330 | 331 | # Maximum number of lines in a module. 332 | max-module-lines=1000 333 | 334 | # Allow the body of a class to be on the same line as the declaration if body 335 | # contains single statement. 336 | single-line-class-stmt=no 337 | 338 | # Allow the body of an if to be on the same line as the test if there is no 339 | # else. 340 | single-line-if-stmt=no 341 | 342 | 343 | [SIMILARITIES] 344 | 345 | # Comments are removed from the similarity computation 346 | ignore-comments=yes 347 | 348 | # Docstrings are removed from the similarity computation 349 | ignore-docstrings=yes 350 | 351 | # Imports are removed from the similarity computation 352 | ignore-imports=no 353 | 354 | # Signatures are removed from the similarity computation 355 | ignore-signatures=no 356 | 357 | # Minimum lines number of a similarity. 358 | min-similarity-lines=4 359 | 360 | 361 | [BASIC] 362 | 363 | # Naming style matching correct argument names. 364 | argument-naming-style=snake_case 365 | 366 | # Regular expression matching correct argument names. Overrides argument- 367 | # naming-style. 368 | #argument-rgx= 369 | 370 | # Naming style matching correct attribute names. 371 | attr-naming-style=snake_case 372 | 373 | # Regular expression matching correct attribute names. Overrides attr-naming- 374 | # style. 375 | #attr-rgx= 376 | 377 | # Bad variable names which should always be refused, separated by a comma. 378 | bad-names=foo, 379 | bar, 380 | baz, 381 | toto, 382 | tutu, 383 | tata 384 | 385 | # Bad variable names regexes, separated by a comma. If names match any regex, 386 | # they will always be refused 387 | bad-names-rgxs= 388 | 389 | # Naming style matching correct class attribute names. 390 | class-attribute-naming-style=any 391 | 392 | # Regular expression matching correct class attribute names. Overrides class- 393 | # attribute-naming-style. 394 | #class-attribute-rgx= 395 | 396 | # Naming style matching correct class constant names. 397 | class-const-naming-style=UPPER_CASE 398 | 399 | # Regular expression matching correct class constant names. Overrides class- 400 | # const-naming-style. 401 | #class-const-rgx= 402 | 403 | # Naming style matching correct class names. 404 | class-naming-style=PascalCase 405 | 406 | # Regular expression matching correct class names. Overrides class-naming- 407 | # style. 408 | #class-rgx= 409 | 410 | # Naming style matching correct constant names. 411 | const-naming-style=UPPER_CASE 412 | 413 | # Regular expression matching correct constant names. Overrides const-naming- 414 | # style. 415 | #const-rgx= 416 | 417 | # Minimum line length for functions/classes that require docstrings, shorter 418 | # ones are exempt. 419 | docstring-min-length=-1 420 | 421 | # Naming style matching correct function names. 422 | function-naming-style=snake_case 423 | 424 | # Regular expression matching correct function names. Overrides function- 425 | # naming-style. 426 | #function-rgx= 427 | 428 | # Good variable names which should always be accepted, separated by a comma. 
429 | good-names= 430 | # Exception 431 | ex, 432 | # "Element" 433 | el, 434 | # "Function" 435 | fn, 436 | # "it" as a trivial lambda argument; or an iterator. 437 | it, 438 | # "Operation" 439 | op, 440 | Run, 441 | _, 442 | 443 | # Good variable names regexes, separated by a comma. If names match any regex, 444 | # they will always be accepted 445 | good-names-rgxs= 446 | # Any lowercase single letter except "l" and "o". 447 | # May be pluralized. 448 | ^[abcdefghijkmnpqrstuvwxyz]s?$, 449 | 450 | # Type variables have slightly different conventions. 451 | typevar-rgx=_?[A-Z]\d*(_co|_contra)? 452 | 453 | # Include a hint for the correct naming format with invalid-name. 454 | include-naming-hint=no 455 | 456 | # Naming style matching correct inline iteration names. 457 | inlinevar-naming-style=any 458 | 459 | # Regular expression matching correct inline iteration names. Overrides 460 | # inlinevar-naming-style. 461 | #inlinevar-rgx= 462 | 463 | # Naming style matching correct method names. 464 | method-naming-style=snake_case 465 | 466 | # Regular expression matching correct method names. Overrides method-naming- 467 | # style. 468 | #method-rgx= 469 | 470 | # Naming style matching correct module names. 471 | module-naming-style=snake_case 472 | 473 | # Regular expression matching correct module names. Overrides module-naming- 474 | # style. 475 | #module-rgx= 476 | 477 | # Colon-delimited sets of names that determine each other's naming style when 478 | # the name regexes allow several styles. 479 | name-group= 480 | 481 | # Regular expression which should only match function or class names that do 482 | # not require a docstring. 483 | no-docstring-rgx=^_ 484 | 485 | # List of decorators that produce properties, such as abc.abstractproperty. Add 486 | # to this list to register other decorators that produce valid properties. 487 | # These decorators are taken in consideration only for invalid-name. 488 | property-classes=abc.abstractproperty 489 | 490 | # Naming style matching correct variable names. 491 | variable-naming-style=snake_case 492 | 493 | # Regular expression matching correct variable names. Overrides variable- 494 | # naming-style. 495 | #variable-rgx= 496 | 497 | 498 | [STRING] 499 | 500 | # This flag controls whether inconsistent-quotes generates a warning when the 501 | # character used as a quote delimiter is used inconsistently within a module. 502 | check-quote-consistency=yes 503 | 504 | # This flag controls whether the implicit-str-concat should generate a warning 505 | # on implicit string concatenation in sequences defined over several lines. 506 | check-str-concat-over-line-jumps=yes 507 | 508 | 509 | [IMPORTS] 510 | 511 | # List of modules that can be imported at any level, not just the top level 512 | # one. 513 | allow-any-import-level= 514 | 515 | # Allow wildcard imports from modules that define __all__. 516 | allow-wildcard-with-all=no 517 | 518 | # Analyse import fallback blocks. This can be used to support both Python 2 and 519 | # 3 compatible code, which means that the block might have code that exists 520 | # only in one or another interpreter, leading to false positives when analysed. 521 | analyse-fallback-blocks=no 522 | 523 | # Deprecated modules which should not be used, separated by a comma. 524 | deprecated-modules= 525 | 526 | # Output a graph (.gv or any supported image format) of external dependencies 527 | # to the given file (report RP0402 must not be disabled). 
528 | ext-import-graph= 529 | 530 | # Output a graph (.gv or any supported image format) of all (i.e. internal and 531 | # external) dependencies to the given file (report RP0402 must not be 532 | # disabled). 533 | import-graph= 534 | 535 | # Output a graph (.gv or any supported image format) of internal dependencies 536 | # to the given file (report RP0402 must not be disabled). 537 | int-import-graph= 538 | 539 | # Force import order to recognize a module as part of the standard 540 | # compatibility libraries. 541 | known-standard-library= 542 | 543 | # Force import order to recognize a module as part of a third party library. 544 | known-third-party=enchant 545 | 546 | # Couples of modules and preferred modules, separated by a comma. 547 | preferred-modules= 548 | 549 | 550 | [CLASSES] 551 | 552 | # Warn about protected attribute access inside special methods 553 | check-protected-access-in-special-methods=no 554 | 555 | # List of method names used to declare (i.e. assign) instance attributes. 556 | defining-attr-methods=__init__, 557 | __new__, 558 | setUp, 559 | __post_init__ 560 | 561 | # List of member names, which should be excluded from the protected access 562 | # warning. 563 | exclude-protected=_asdict, 564 | _fields, 565 | _replace, 566 | _source, 567 | _make 568 | 569 | # List of valid names for the first argument in a class method. 570 | valid-classmethod-first-arg=cls 571 | 572 | # List of valid names for the first argument in a metaclass class method. 573 | valid-metaclass-classmethod-first-arg=cls 574 | 575 | 576 | [DESIGN] 577 | 578 | # List of qualified class names to ignore when counting class parents (see 579 | # R0901) 580 | ignored-parents= 581 | 582 | # Maximum number of arguments for function / method. 583 | max-args=5 584 | 585 | # Maximum number of attributes for a class (see R0902). 586 | max-attributes=7 587 | 588 | # Maximum number of boolean expressions in an if statement (see R0916). 589 | max-bool-expr=5 590 | 591 | # Maximum number of branch for function / method body. 592 | max-branches=12 593 | 594 | # Maximum number of locals for function / method body. 595 | max-locals=15 596 | 597 | # Maximum number of parents for a class (see R0901). 598 | max-parents=7 599 | 600 | # Maximum number of public methods for a class (see R0904). 601 | max-public-methods=20 602 | 603 | # Maximum number of return / yield for function / method body. 604 | max-returns=6 605 | 606 | # Maximum number of statements in function / method body. 607 | max-statements=50 608 | 609 | # Minimum number of public methods for a class (see R0903). 610 | min-public-methods=2 611 | 612 | 613 | [EXCEPTIONS] 614 | 615 | # Exceptions that will emit a warning when being caught. Defaults to 616 | # "BaseException, Exception". 617 | overgeneral-exceptions= 618 | builtins.BaseException, 619 | builtins.Exception, 620 | -------------------------------------------------------------------------------- /greenery/rxelems.py: -------------------------------------------------------------------------------- 1 | """ 2 | Because of the circularity between `Pattern`, `Conc` and `Mult`, all three 3 | need to be in the same source file? 
4 | """ 5 | 6 | from __future__ import annotations 7 | 8 | __all__ = ( 9 | "Conc", 10 | "Mult", 11 | "Pattern", 12 | "from_fsm", 13 | ) 14 | 15 | from dataclasses import dataclass 16 | from enum import Enum, auto 17 | from functools import reduce 18 | from typing import Iterator 19 | 20 | from .bound import INF, Bound 21 | from .charclass import NULLCHARCLASS, Charclass 22 | from .fsm import EPSILON, NULL, Fsm, StateType, from_charclass 23 | from .multiplier import ONE, QM, STAR, ZERO, Multiplier 24 | 25 | 26 | @dataclass(frozen=True) 27 | class Conc: 28 | """ 29 | A `Conc` (short for "concatenation") is a tuple of `Mult`s i.e. an 30 | unbroken string of mults occurring one after the other. 31 | e.g. abcde[^fg]*h{4}[a-z]+(subpattern)(subpattern2) 32 | To express the empty string, use an empty `Conc`, Conc(). 33 | """ 34 | 35 | mults: tuple[Mult, ...] 36 | 37 | def __init__(self, /, *mults: Mult): 38 | object.__setattr__(self, "mults", tuple(mults)) 39 | 40 | def __eq__(self, other: object, /) -> bool: 41 | if not isinstance(other, type(self)): 42 | return NotImplemented 43 | return self.mults == other.mults 44 | 45 | def __hash__(self, /) -> int: 46 | return hash(self.mults) 47 | 48 | def __repr__(self, /) -> str: 49 | args = ", ".join(repr(mult) for mult in self.mults) 50 | return f"Conc({args})" 51 | 52 | def reduce(self) -> Conc: 53 | # pylint: disable=too-many-branches 54 | # pylint: disable=too-many-return-statements 55 | 56 | if self == NULLCONC: 57 | return self 58 | 59 | if self.empty(): 60 | return NULLCONC 61 | 62 | # Try recursively reducing our mults 63 | reduced = tuple(mult.reduce() for mult in self.mults) 64 | if reduced != self.mults: 65 | return Conc(*reduced).reduce() 66 | 67 | # strip out mults which can only match the empty string 68 | for i, mult in enumerate(self.mults): 69 | if ( 70 | # Conc contains "()" (i.e. a `Mult` containing only a `Pattern` 71 | # containing the empty string)? That can be removed 72 | # e.g. "a()b" -> "ab" 73 | mult.multiplicand == Pattern(EMPTYSTRING) 74 | # If a `Mult` has an empty multiplicand, we can only match it 75 | # zero times => empty string => remove it entirely 76 | # e.g. "a[]{0,3}b" -> "ab" 77 | or (mult.multiplicand.empty() and mult.multiplier.min == Bound(0)) 78 | # Failing that, we have a positive multiplicand which we 79 | # intend to match zero times. In this case the only possible 80 | # match is the empty string => remove it 81 | # e.g. "a[XYZ]{0}b" -> "ab" 82 | or mult.multiplier == ZERO 83 | ): 84 | new = self.mults[:i] + self.mults[i + 1 :] 85 | return Conc(*new).reduce() 86 | 87 | # We might be able to combine some mults together or at least simplify 88 | # the multiplier on one of them. 89 | if len(self.mults) > 1: 90 | for i in range(len(self.mults) - 1): 91 | r = self.mults[i] 92 | s = self.mults[i + 1] 93 | 94 | def to_pattern(multiplicand: Pattern | Charclass, /) -> Pattern: 95 | if isinstance(multiplicand, Pattern): 96 | return multiplicand 97 | return Pattern(Conc(Mult(multiplicand, ONE))) 98 | 99 | # so we can do intersection 100 | rm_pattern = to_pattern(r.multiplicand) 101 | sm_pattern = to_pattern(s.multiplicand) 102 | rm_sm_intersection = None 103 | 104 | # If R = S, then we can squish the multipliers together 105 | # e.g. 
ab?b?c -> ab{0,2}c 106 | if rm_pattern == sm_pattern: 107 | squished = Mult(rm_pattern, r.multiplier + s.multiplier) 108 | new = self.mults[:i] + (squished,) + self.mults[i + 2 :] 109 | return Conc(*new).reduce() 110 | 111 | # If R's language is a subset of S's, then R{a,b}S{c,} reduces 112 | # to R{a}S{c,}... 113 | # e.g. \d+\w+ -> \d\w+ 114 | # Do the cheapest checks first 115 | if r.multiplier.min < r.multiplier.max and s.multiplier.max == INF: 116 | rm_sm_intersection = rm_pattern & sm_pattern 117 | if rm_sm_intersection.equivalent(rm_pattern): 118 | trimmed = Mult( 119 | rm_pattern, 120 | Multiplier(r.multiplier.min, r.multiplier.min), 121 | ) 122 | new = self.mults[:i] + (trimmed, s) + self.mults[i + 2 :] 123 | return Conc(*new).reduce() 124 | 125 | # Conversely, if R is superset of S, then R{c,}S{a,b} reduces 126 | # to R{c,}S{a}. 127 | # e.g. [ab]+a? -> [ab]+ 128 | # Do the cheapest checks first 129 | if r.multiplier.max == INF and s.multiplier.min < s.multiplier.max: 130 | if rm_sm_intersection is None: 131 | rm_sm_intersection = rm_pattern & sm_pattern 132 | if rm_sm_intersection.equivalent(sm_pattern): 133 | trimmed = Mult( 134 | sm_pattern, 135 | Multiplier(s.multiplier.min, s.multiplier.min), 136 | ) 137 | new = self.mults[:i] + (r, trimmed) + self.mults[i + 2 :] 138 | return Conc(*new).reduce() 139 | 140 | # Conc contains (among other things) a *singleton* `Mult` containing 141 | # `Pattern` with only one internal `Conc`? Flatten out. 142 | # e.g. "a(d(ab|a*c))" -> "ad(ab|a*c)" 143 | # BUT NOT "a(d(ab|a*c)){2,}" 144 | # AND NOT "a(d(ab|a*c)|y)" 145 | for i, mult in enumerate(self.mults): 146 | if ( 147 | mult.multiplier == ONE 148 | and isinstance(mult.multiplicand, Pattern) 149 | and len(mult.multiplicand.concs) == 1 150 | ): 151 | (conc,) = mult.multiplicand.concs 152 | new = self.mults[:i] + conc.mults + self.mults[i + 1 :] 153 | return Conc(*new).reduce() 154 | 155 | return self 156 | 157 | def to_fsm(self, /) -> Fsm: 158 | return Fsm.concatenate(EPSILON, *(mult.to_fsm() for mult in self.mults)) 159 | 160 | def empty(self, /) -> bool: 161 | return any(mult.empty() for mult in self.mults) 162 | 163 | def __str__(self, /) -> str: 164 | return "".join(str(m) for m in self.mults) 165 | 166 | def common(self, other: Conc, /, suffix: bool = False) -> Conc: 167 | """ 168 | Return the common prefix of these two `Conc`s; that is, the largest 169 | `Conc` which can be safely beheaded() from the front of both. The 170 | result could be `EMPTYSTRING`. 171 | "ZYAA, ZYBB" -> "ZY" 172 | "CZ, CZ" -> "CZ" 173 | "YC, ZC" -> "" 174 | 175 | With the "suffix" flag set, works from the end. E.g.: 176 | "AAZY, BBZY" -> "ZY" 177 | "CZ, CZ" -> "CZ" 178 | "CY, CZ" -> "" 179 | """ 180 | mults = [] 181 | 182 | indices = list(range(min(len(self.mults), len(other.mults)))) 183 | # e.g. [0, 1, 2, 3] 184 | 185 | # Work backwards from the end of both `Conc`s instead. 186 | if suffix: 187 | indices = [-i - 1 for i in indices] # e.g. [-1, -2, -3, -4] 188 | 189 | for i in indices: 190 | x = self.mults[i] 191 | y = other.mults[i] 192 | common = x.common(y) 193 | 194 | # Happens when multiplicands disagree (e.g. "A.common(B)") or if 195 | # the multiplicand is shared but the common multiplier is `ZERO` 196 | # (e.g. "ABZ*.common(CZ)".) 197 | if common.multiplier == ZERO: 198 | break 199 | 200 | mults.append(common) 201 | 202 | # If we did not remove the entirety of both mults, we cannot 203 | # continue. 
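# Illustrative example (assumed, not from the original comments, using literal charclasses A, B, C): for "A{2}B".common("A{3}C"), the first pair of mults contributes A{2}; since that is not the whole of A{3}, the check below breaks out of the loop before B is ever compared with C.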
204 | if common != x or common != y: 205 | break 206 | 207 | if suffix: 208 | mults = mults[::-1] 209 | 210 | return Conc(*mults) 211 | 212 | def dock(self, other: Conc, /) -> Conc: 213 | """ 214 | Subtract another `Conc` from this one. 215 | This is the opposite of concatenation. 216 | For example, if ABC + DEF = ABCDEF, 217 | then logically ABCDEF - DEF = ABC. 218 | """ 219 | 220 | # e.g. self has mults at indices [0, 1, 2, 3, 4, 5, 6] len=7 221 | # e.g. other has mults at indices [0, 1, 2] len=3 222 | new = list(self.mults) 223 | for i in reversed(range(len(other.mults))): # [2, 1, 0] 224 | # e.g. i = 1, j = 7 - 3 + 1 = 5 225 | j = len(self.mults) - len(other.mults) + i 226 | new[j] = new[j].dock(other.mults[i]) 227 | 228 | if new[j].multiplier == ZERO: 229 | # omit that `Mult` entirely since it has been factored out 230 | del new[j] 231 | 232 | # If the subtraction is incomplete but there is more to 233 | # other.mults, then we have a problem. For example, "ABC{2} - BC" 234 | # subtracts the C successfully but leaves something behind, 235 | # then tries to subtract the B too, which isn't possible 236 | elif i: 237 | raise ArithmeticError(f"Can't subtract {other!r} from {self!r}") 238 | 239 | return Conc(*new) 240 | 241 | def behead(self, other: Conc, /) -> Conc: 242 | """ 243 | As with dock() but the other way around. For example, if 244 | ABC + DEF = ABCDEF, then ABCDEF.behead(AB) = CDEF. 245 | """ 246 | # Observe that FEDCBA - BA = FEDC. 247 | return self.reversed().dock(other.reversed()).reversed() 248 | 249 | def reversed(self, /) -> Conc: 250 | return Conc(*[mult.reversed() for mult in reversed(self.mults)]) 251 | 252 | 253 | # We need a new state not already used. 254 | class _Outside(Enum): 255 | """Marker state for use in `from_fsm`.""" 256 | 257 | TOKEN = auto() 258 | 259 | 260 | def from_fsm(f: Fsm) -> Pattern: 261 | """ 262 | Turn the supplied finite state machine into a `Pattern`. This is 263 | accomplished using the Brzozowski algebraic method. 264 | """ 265 | # pylint: disable=too-many-branches 266 | 267 | outside = _Outside.TOKEN 268 | 269 | # The set of strings that would be accepted by this FSM if you started 270 | # at state i is represented by the regex R_i. 271 | # If state i has a sole transition "a" to state j, then we know 272 | # R_i = a R_j. 273 | # If state i is final, then the empty string is also accepted by this 274 | # regex. 275 | # And so on... 276 | 277 | # From this we can build a set of simultaneous equations in len(f.states) 278 | # variables. This system is easily solved for all variables, but we only 279 | # need one: R_a, where a is the starting state. 280 | 281 | # The first thing we need to do is organise the states into order of depth, 282 | # so that when we perform our back-substitutions, we can start with the 283 | # last (deepest) state and therefore finish with R_a. 
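# Illustrative sketch (an assumed two-state machine, ignoring the dead-state terms): if state 0 is initial, state 1 is the only final state, 0 goes to 1 on "a" and 1 loops to itself on "b", the equations are roughly R_0 = a R_1 and R_1 = b R_1 | (empty string). Resolving the self-loop gives R_1 = b*, and back-substituting gives R_0 = ab*, which is the pattern returned for the machine.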
284 | states = [f.initial] 285 | i = 0 286 | while i < len(states): 287 | current = states[i] 288 | if current in f.map: 289 | for symbol in sorted(f.map[current]): 290 | next_state = f.map[current][symbol] 291 | if next_state not in states: 292 | states.append(next_state) 293 | i += 1 294 | 295 | # Our system of equations is represented like so: 296 | brz: dict[StateType, dict[StateType | _Outside, Pattern]] = {} 297 | 298 | for a in f.states: 299 | brz[a] = {} 300 | for b in f.states: 301 | brz[a][b] = NULLPATTERN 302 | 303 | if a in f.finals: 304 | brz[a][outside] = Pattern(EMPTYSTRING) 305 | else: 306 | brz[a][outside] = NULLPATTERN 307 | 308 | # Populate it with some initial data. 309 | for a in f.map: 310 | for charclass in f.map[a]: 311 | b = f.map[a][charclass] 312 | brz[a][b] = Pattern(*brz[a][b].concs, Conc(Mult(charclass, ONE))).reduce() 313 | 314 | # Now perform our back-substitution 315 | for i in reversed(range(len(states))): 316 | a = states[i] 317 | 318 | # Before the equation for R_a can be substituted into the other 319 | # equations, we need to resolve the self-transition (if any). 320 | # e.g. R_a = 0 R_a | 1 R_b | 2 R_c 321 | # becomes R_a = 0*1 R_b | 0*2 R_c 322 | loop = Mult(brz[a][a], STAR) # i.e. "0*" 323 | del brz[a][a] 324 | 325 | for right in brz[a]: 326 | brz[a][right] = Pattern(Conc(loop, Mult(brz[a][right], ONE))).reduce() 327 | 328 | # Note: even if we're down to our final equation, the above step still 329 | # needs to be performed before anything is returned. 330 | 331 | # Now we can substitute this equation into all of the previous ones. 332 | for j in range(i): 333 | b = states[j] 334 | 335 | # e.g. substituting R_a = 0*1 R_b | 0*2 R_c 336 | # into R_b = 3 R_a | 4 R_c | 5 R_d 337 | # yields R_b = 30*1 R_b | (30*2|4) R_c | 5 R_d 338 | univ = brz[b][a] # i.e. "3" 339 | del brz[b][a] 340 | 341 | for right in brz[a]: 342 | brz[b][right] = Pattern( 343 | *brz[b][right].concs, 344 | Conc(Mult(univ, ONE), Mult(brz[a][right], ONE)), 345 | ).reduce() 346 | 347 | return brz[f.initial][outside].reduce() 348 | 349 | 350 | @dataclass(frozen=True) 351 | class Pattern: 352 | """ 353 | A `Pattern` (also known as an "alt", short for "alternation") is a 354 | set of `Conc`s. A `Pattern` expresses multiple alternate possibilities. 355 | When written out as a regex, these would be separated by pipes. A 356 | `Pattern` containing no possibilities is possible and represents a 357 | regular expression matching no strings whatsoever (there is no 358 | conventional string form for this). 359 | 360 | e.g. "abc|def(ghi|jkl)" is an alt containing two `Conc`s: "abc" and 361 | "def(ghi|jkl)". The latter is a `Conc` containing four `Mult`s: "d", 362 | "e", "f" and "(ghi|jkl)". The latter in turn is a `Mult` consisting of 363 | an upper bound 1, a lower bound 1, and a multiplicand which is a new 364 | subpattern, "ghi|jkl". This new subpattern again consists of two 365 | `Conc`s: "ghi" and "jkl".
366 | """ 367 | 368 | concs: frozenset[Conc] 369 | 370 | def __init__(self, /, *concs: Conc): 371 | object.__setattr__(self, "concs", frozenset(concs)) 372 | 373 | def __eq__(self, other: object, /) -> bool: 374 | if not isinstance(other, type(self)): 375 | return NotImplemented 376 | return self.concs == other.concs 377 | 378 | def __hash__(self, /) -> int: 379 | return hash(self.concs) 380 | 381 | def __repr__(self, /) -> str: 382 | args = ", ".join(repr(conc) for conc in self.concs) 383 | return f"Pattern({args})" 384 | 385 | def empty(self, /) -> bool: 386 | return all(conc.empty() for conc in self.concs) 387 | 388 | def intersection(self, other: Pattern, /) -> Pattern: 389 | combined = self.to_fsm() & other.to_fsm() 390 | return from_fsm(combined) 391 | 392 | def __and__(self, other: Pattern, /) -> Pattern: 393 | return self.intersection(other) 394 | 395 | def difference(*elems: Pattern) -> Pattern: 396 | """ 397 | Return a regular expression which matches any string which `self` 398 | matches but none of the strings which `other` matches. 399 | """ 400 | return from_fsm(Fsm.difference(*(elem.to_fsm() for elem in elems))) 401 | 402 | def __sub__(self, other: Pattern, /) -> Pattern: 403 | return self.difference(other) 404 | 405 | def union(self, other: Pattern, /) -> Pattern: 406 | return Pattern(*(self.concs | other.concs)) 407 | 408 | def __or__(self, other: Pattern, /) -> Pattern: 409 | return self.union(other) 410 | 411 | def __str__(self, /) -> str: 412 | if not self.concs: 413 | raise ValueError(f"Can't serialise {self!r}") 414 | return "|".join(sorted(str(conc) for conc in self.concs)) 415 | 416 | def reduce(self, /) -> Pattern: 417 | # pylint: disable=too-many-branches 418 | # pylint: disable=too-many-locals 419 | # pylint: disable=too-many-return-statements 420 | 421 | if self == NULLPATTERN: 422 | return self 423 | 424 | if self.empty(): 425 | return NULLPATTERN 426 | 427 | # Try recursively reducing our internal `Conc`s. 428 | reduced = frozenset(c.reduce() for c in self.concs) 429 | if reduced != self.concs: 430 | return Pattern(*reduced).reduce() 431 | 432 | # If one of our internal concs is empty, remove it 433 | for conc in self.concs: 434 | if conc.empty(): 435 | new = self.concs - {conc} 436 | return Pattern(*new).reduce() 437 | 438 | # If we have just one `Conc` with just one `Mult` with a multiplier of 439 | # 1, and the multiplicand is a `Pattern`, pull that up 440 | if len(self.concs) == 1: 441 | (conc,) = self.concs 442 | if ( 443 | len(conc.mults) == 1 444 | and conc.mults[0].multiplier == ONE 445 | and isinstance(conc.mults[0].multiplicand, Pattern) 446 | ): 447 | return conc.mults[0].multiplicand.reduce() 448 | 449 | # If this `Pattern` contains several `Conc`s each containing just 1 450 | # `Mult` and their multiplicands agree, we may be able to merge the 451 | # multipliers. 452 | # e.g. 
"a{1,2}|a{3,4}|bc" -> "a{1,4}|bc" 453 | oldconcs = list(self.concs) # so we can index the things 454 | for i, conc1 in enumerate(oldconcs): 455 | if len(conc1.mults) != 1: 456 | continue 457 | multiplicand1 = conc1.mults[0].multiplicand 458 | for j in range(i + 1, len(oldconcs)): 459 | conc2 = oldconcs[j] 460 | if len(conc2.mults) != 1: 461 | continue 462 | multiplicand2 = conc2.mults[0].multiplicand 463 | if multiplicand2 != multiplicand1: 464 | continue 465 | multiplicand = multiplicand1 466 | multiplier1 = conc1.mults[0].multiplier 467 | multiplier2 = conc2.mults[0].multiplier 468 | if not multiplier1.canunion(multiplier2): 469 | continue 470 | multiplier = multiplier1 | multiplier2 471 | newconcs = ( 472 | oldconcs[:i] 473 | + oldconcs[i + 1 : j] 474 | + oldconcs[j + 1 :] 475 | + [Conc(Mult(multiplicand, multiplier))] 476 | ) 477 | return Pattern(*newconcs).reduce() 478 | 479 | # If this `Pattern` contains several `Conc`s each containing just 1 480 | # `Mult` each containing just a `Charclass`, with a multiplier of 1, 481 | # then we can merge those `Charclass`es together. 482 | # e.g. "0|[1-9]|ab" -> "[0-9]|ab" 483 | merged_charclass = NULLCHARCLASS 484 | num_merged = 0 485 | rest = [] 486 | for conc in self.concs: 487 | if ( 488 | len(conc.mults) == 1 489 | and conc.mults[0].multiplier == ONE 490 | and isinstance(conc.mults[0].multiplicand, Charclass) 491 | ): 492 | merged_charclass |= conc.mults[0].multiplicand 493 | num_merged += 1 494 | else: 495 | rest.append(conc) 496 | if num_merged >= 2: 497 | rest.append(Conc(Mult(merged_charclass, ONE))) 498 | return Pattern(*rest).reduce() 499 | 500 | # If one of the present `Pattern`'s `Conc`s is the empty string... 501 | if EMPTYSTRING in self.concs: 502 | for conc in self.concs: 503 | # ...and there is another `Conc` 504 | # with a single `Mult` whose lower bound is 0... 505 | if len(conc.mults) == 1 and conc.mults[0].multiplier.min == Bound(0): 506 | # Then we can omit the empty string. 507 | # E.g. "|(ab)*|def" => "(ab)*|def". 508 | return Pattern(*(self.concs - {EMPTYSTRING})).reduce() 509 | 510 | for conc in self.concs: 511 | # ...and there is another `Conc` 512 | # with a single `Mult` whose lower bound is 1... 513 | if len(conc.mults) == 1 and conc.mults[0].multiplier.min == Bound(1): 514 | # Then we can merge the empty string into that. 515 | # E.g. "|(ab)+|def" => "(ab)*|def". 516 | merged_conc = Conc( 517 | Mult(conc.mults[0].multiplicand, conc.mults[0].multiplier * QM) 518 | ) 519 | return Pattern( 520 | *(self.concs - {EMPTYSTRING, conc} | {merged_conc}) 521 | ).reduce() 522 | 523 | # If the present `Pattern`'s `Conc`s all have a common prefix, split 524 | # that out. This increases the depth of the object 525 | # but it is still arguably simpler/ripe for further reduction 526 | # e.g. "abc|ade" -> a(bc|de)" 527 | if len(self.concs) > 1: 528 | prefix = self._commonconc() 529 | if prefix != EMPTYSTRING: 530 | leftovers = self.behead(prefix) 531 | mults = prefix.mults + (Mult(leftovers, ONE),) 532 | return Pattern(Conc(*mults)).reduce() 533 | 534 | # Same but for suffixes. 535 | # e.g. "xyz|stz -> (xy|st)z" 536 | suffix = self._commonconc(suffix=True) 537 | if suffix != EMPTYSTRING: 538 | leftovers = self.dock(suffix) 539 | mults = (Mult(leftovers, ONE),) + suffix.mults 540 | return Pattern(Conc(*mults)).reduce() 541 | 542 | return self 543 | 544 | def symmetric_difference(*elems: Pattern) -> Pattern: 545 | """ 546 | Return a regular expression matching only the strings recognised by 547 | `self` or `other` but not both. 
548 | """ 549 | return from_fsm(Fsm.symmetric_difference(*(elem.to_fsm() for elem in elems))) 550 | 551 | def __xor__(self, other: Pattern, /) -> Pattern: 552 | return self.symmetric_difference(other) 553 | 554 | def dock(self, other: Conc, /) -> Pattern: 555 | """ 556 | The opposite of concatenation. Remove a common suffix from the 557 | present `Pattern`; that is, from each of its constituent concs. 558 | 559 | AYZ|BYZ|CYZ - YZ -> A|B|C. 560 | """ 561 | return Pattern(*[conc.dock(other) for conc in self.concs]) 562 | 563 | def behead(self, other: Conc, /) -> Pattern: 564 | """ 565 | Like dock() but the other way around. Remove a common prefix from 566 | the present `Pattern`; that is, from each of its constituent concs. 567 | 568 | ZA|ZB|ZC.behead(Z) -> A|B|C 569 | """ 570 | return Pattern(*[conc.behead(other) for conc in self.concs]) 571 | 572 | def _commonconc(self, /, suffix: bool = False) -> Conc: 573 | """ 574 | Find the longest `Conc` which acts as prefix to every `Conc` in 575 | this `Pattern`. This could be `EMPTYSTRING`. Return the common 576 | prefix along with all the leftovers after truncating that common 577 | prefix from each `Conc`. 578 | 579 | "ZA|ZB|ZC" -> "Z", "(A|B|C)" 580 | "ZA|ZB|ZC|Z" -> "Z", "(A|B|C|)" 581 | "CZ|CZ" -> "CZ", "()" 582 | 583 | If "suffix" is True, the same result but for suffixes. 584 | """ 585 | if not self.concs: 586 | raise ValueError(f"Can't call _commonconc on {self!r}") 587 | 588 | return reduce(lambda x, y: x.common(y, suffix=suffix), self.concs) 589 | 590 | def to_fsm(self, /) -> Fsm: 591 | return Fsm.union(NULL, *(conc.to_fsm() for conc in self.concs)) 592 | 593 | def reversed(self, /) -> Pattern: 594 | return Pattern(*(c.reversed() for c in self.concs)) 595 | 596 | def copy(self, /) -> Pattern: 597 | """ 598 | For completeness only, since `set.copy()` also exists. `Pattern`s 599 | are immutable, so I can see only very odd reasons to need this 600 | """ 601 | return Pattern(*self.concs) 602 | 603 | def equivalent(self, other: Pattern, /) -> bool: 604 | """ 605 | Two `Pattern`s are equivalent if they recognise the same strings. 606 | Note that in the general case this is actually quite an intensive 607 | calculation, but far from unsolvable, as we demonstrate here: 608 | """ 609 | return self.to_fsm().equivalent(other.to_fsm()) 610 | 611 | def times(self, multiplier: Multiplier, /) -> Pattern: 612 | """ 613 | Equivalent to repeated concatenation. Multiplier consists of a 614 | minimum and a maximum; maximum may be infinite (for Kleene star 615 | closure). Call using "a = b * qm" 616 | """ 617 | return Pattern(Conc(Mult(self, multiplier))) 618 | 619 | def __mul__(self, multiplier: Multiplier, /) -> Pattern: 620 | return self.times(multiplier) 621 | 622 | def everythingbut(self, /) -> Pattern: 623 | """ 624 | Return a `Pattern` which will match any string not matched by 625 | `self`, and which will not match any string matched by `self`. 626 | Another task which is very difficult in general (and typically 627 | returns utter garbage when actually printed), but becomes trivial 628 | to code thanks to FSM routines. 
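For example (illustrative only): the complement of "a" matches the empty string, every other single character, and every string of length two or more (including "aa").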
629 | """ 630 | return from_fsm(self.to_fsm().everythingbut()) 631 | 632 | def derive(self, string: str, /) -> Pattern: 633 | return from_fsm(self.to_fsm().derive(string)) 634 | 635 | def isdisjoint(self, other: Pattern, /) -> bool: 636 | """ 637 | Treat `self` and `other` as sets of strings and see if they are 638 | disjoint 639 | """ 640 | return self.to_fsm().isdisjoint(other.to_fsm()) 641 | 642 | def matches(self, string: str, /) -> bool: 643 | return self.to_fsm().accepts(string) 644 | 645 | def __contains__(self, string: str, /) -> bool: 646 | """ 647 | This lets you use the syntax `"a" in pattern` to see whether the 648 | string "a" is in the set of strings matched by `pattern`. 649 | """ 650 | return self.matches(string) 651 | 652 | # pylint: disable=fixme 653 | # TODO: this is a misuse of __reversed__ 654 | # and should be removed next major version 655 | def __reversed__(self, /) -> Pattern: 656 | return self.reversed() 657 | 658 | def cardinality(self, /) -> int: 659 | """ 660 | Consider the regular expression as a set of strings and return the 661 | cardinality of that set, or raise an OverflowError if there are 662 | infinitely many. 663 | """ 664 | # There is no way to do this other than converting to an FSM, because 665 | # the `Pattern` may allow duplicate routes, such as "a|a". 666 | return self.to_fsm().cardinality() 667 | 668 | def __len__(self, /) -> int: 669 | return self.cardinality() 670 | 671 | def strings(self, /, *, otherchar: str | None = None) -> Iterator[str]: 672 | """ 673 | Each time next() is called on this iterator, a new string is 674 | returned which this `Pattern` can match. `StopIteration` 675 | is raised once all such strings have been returned, although a 676 | regex with a * in may match infinitely many strings. 677 | """ 678 | otherchars = [] if otherchar is None else [otherchar] 679 | return self.to_fsm().strings(otherchars) 680 | 681 | def __iter__(self, /) -> Iterator[str]: 682 | """ 683 | This allows you to do `for string in pattern` as a list 684 | comprehension! 685 | """ 686 | return self.strings() 687 | 688 | 689 | @dataclass(frozen=True) 690 | class Mult: 691 | """ 692 | A `Mult` is a combination of a multiplicand with a multiplier (a min 693 | and a max). The vast majority of characters in regular expressions 694 | occur without a specific multiplier, which is implicitly equivalent to 695 | a min of 1 and a max of 1, but many more have explicit multipliers like 696 | "*" (min = 0, max = INF) and so on. 697 | 698 | e.g. a, b{2}, c?, d*, [efg]{2,5}, f{2,}, (anysubpattern)+, .*, ... 699 | """ 700 | 701 | multiplicand: Charclass | Pattern 702 | multiplier: Multiplier 703 | 704 | def __eq__(self, other: object, /) -> bool: 705 | if not isinstance(other, type(self)): 706 | return NotImplemented 707 | return ( 708 | self.multiplicand == other.multiplicand 709 | and self.multiplier == other.multiplier 710 | ) 711 | 712 | def __hash__(self, /) -> int: 713 | return hash((self.multiplicand, self.multiplier)) 714 | 715 | def __repr__(self, /) -> str: 716 | return f"Mult({self.multiplicand!r}, {self.multiplier!r})" 717 | 718 | def dock(self, other: Mult, /) -> Mult: 719 | """ 720 | "Dock" another `Mult` from this one (i.e. remove part of the tail) 721 | and return the result. The reverse of concatenation. This is a lot 722 | trickier. 723 | e.g. 
a{4,5} - a{3} = a{1,2} 724 | """ 725 | if other.multiplicand != self.multiplicand: 726 | raise ArithmeticError(f"Can't subtract {other!r} from {self!r}") 727 | return Mult(self.multiplicand, self.multiplier - other.multiplier) 728 | 729 | def common(self, other: Mult, /) -> Mult: 730 | """ 731 | Return the common part of these two mults. This is the largest 732 | `Mult` which can be safely subtracted from both the originals. The 733 | multiplier on this `Mult` could be `ZERO`: this is the case if, for 734 | example, the multiplicands disagree. 735 | """ 736 | if self.multiplicand == other.multiplicand: 737 | return Mult(self.multiplicand, self.multiplier.common(other.multiplier)) 738 | 739 | # Multiplicands disagree, no common part at all. 740 | return Mult(NULLCHARCLASS, ZERO) 741 | 742 | def empty(self, /) -> bool: 743 | return self.multiplicand.empty() and self.multiplier.min > Bound(0) 744 | 745 | def reduce(self, /) -> Mult: 746 | if self == NULLMULT: 747 | return self 748 | 749 | # Can't match anything: reduce to empty `Mult` 750 | if self.empty(): 751 | return NULLMULT 752 | 753 | # Try recursively reducing our multiplicand 754 | reduced = self.multiplicand.reduce() 755 | if reduced != self.multiplicand: 756 | return Mult(reduced, self.multiplier).reduce() 757 | 758 | # If our multiplicand is a `Pattern` containing an empty `Conc` 759 | # we can pull that "optional" bit out into our own multiplier 760 | # instead. 761 | # e.g. (A|B|C|) -> (A|B|C)? 762 | # e.g. (A|B|C|){2} -> (A|B|C){0,2} 763 | if ( 764 | isinstance(self.multiplicand, Pattern) 765 | and EMPTYSTRING in self.multiplicand.concs 766 | and self.multiplier.canmultiplyby(QM) 767 | ): 768 | return Mult( 769 | Pattern(*(conc for conc in self.multiplicand.concs if conc.mults)), 770 | self.multiplier * QM, 771 | ).reduce() 772 | 773 | # If our multiplicand is a `Pattern` containing a single `Conc` 774 | # containing a single `Mult`, we can scrap the `Pattern` in favour of 775 | # that `Mult`'s multiplicand 776 | # e.g. ([ab])* -> [ab]* 777 | # e.g. ((a))* -> (a)* -> a* 778 | # NOTE: this logic lives here at the `Mult` level, NOT in 779 | # `Pattern.reduce` because we want to return another `Mult` (same type) 780 | if isinstance(self.multiplicand, Pattern) and len(self.multiplicand.concs) == 1: 781 | (conc,) = self.multiplicand.concs 782 | if len(conc.mults) == 1 and conc.mults[0].multiplier.canmultiplyby( 783 | self.multiplier 784 | ): 785 | return Mult( 786 | conc.mults[0].multiplicand, 787 | conc.mults[0].multiplier * self.multiplier, 788 | ).reduce() 789 | 790 | # no reduction possible 791 | return self 792 | 793 | def __str__(self, /) -> str: 794 | if isinstance(self.multiplicand, Pattern): 795 | return f"({self.multiplicand}){self.multiplier}" 796 | if isinstance(self.multiplicand, Charclass): 797 | return f"{self.multiplicand}{self.multiplier}" 798 | raise TypeError(f"Unknown type {type(self.multiplicand)}") 799 | 800 | def to_fsm(self, /) -> Fsm: 801 | # worked example: (min, max) = (5, 7) or (5, INF) 802 | # (mandatory, optional) = (5, 2) or (5, INF) 803 | 804 | unit = ( 805 | from_charclass(self.multiplicand) 806 | if isinstance(self.multiplicand, Charclass) 807 | else self.multiplicand.to_fsm() 808 | ) 809 | # accepts e.g. "ab" 810 | 811 | # Yuck. `mandatory` cannot be infinite: it's just a natural number. 812 | # However, it uses `Bound`, which describes co-naturals. 
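# (Purely illustrative, continuing the worked example above: with a unit accepting "ab" and a multiplier of {5,7}, `mandatory` below accepts exactly five copies of "ab" and `optional` accepts zero, one or two further copies.)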
813 | assert self.multiplier.mandatory.v is not None 814 | 815 | # accepts "ababababab" 816 | mandatory = unit.times(self.multiplier.mandatory.v) 817 | 818 | # unlimited additional copies 819 | if self.multiplier.optional == INF: 820 | optional = unit.star() 821 | # accepts "(ab)*" 822 | 823 | else: 824 | optional = EPSILON | unit 825 | # accepts "(ab)?" 826 | 827 | # Implied by `!= INF`. 828 | assert self.multiplier.optional.v is not None 829 | 830 | optional = optional.times(self.multiplier.optional.v) 831 | # accepts "(ab)?(ab)?" 832 | 833 | return mandatory.concatenate(optional) 834 | 835 | def reversed(self, /) -> Mult: 836 | return Mult(self.multiplicand.reversed(), self.multiplier) 837 | 838 | 839 | NULLMULT = Mult(NULLCHARCLASS, ONE) 840 | NULLCONC = Conc(NULLMULT) 841 | EMPTYSTRING = Conc() 842 | NULLPATTERN = Pattern(NULLCONC) 843 | -------------------------------------------------------------------------------- /greenery/fsm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Finite state machine library, intended to be used by `greenery` only 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | __all__ = ( 8 | "Fsm", 9 | "StateType", 10 | "EPSILON", 11 | "NULL", 12 | "Charclass", 13 | ) 14 | 15 | from dataclasses import dataclass 16 | from typing import ( 17 | Callable, 18 | ClassVar, 19 | Collection, 20 | Dict, 21 | Iterable, 22 | Iterator, 23 | List, 24 | Mapping, 25 | TypeVar, 26 | ) 27 | 28 | from .charclass import DOT, Charclass, repartition 29 | 30 | AlphaType = Charclass 31 | StateType = int 32 | M = TypeVar("M") 33 | """Meta-state type for crawl(). Can be anything.""" 34 | 35 | 36 | def unify_alphabets(fsms: Iterable[Fsm], /) -> List[Fsm]: 37 | charclasses = set() 38 | for fsm in fsms: 39 | for charclass in fsm.alphabet: 40 | charclasses.add(charclass) 41 | 42 | partition = repartition(charclasses) 43 | # maps old Charclasses to collections of new Charclasses 44 | 45 | return [fsm.replace_alphabet(partition) for fsm in fsms] 46 | 47 | 48 | # pylint: disable=too-many-public-methods,too-many-branches,fixme 49 | @dataclass(frozen=True, init=False) 50 | class Fsm: 51 | """ 52 | A Finite State Machine or FSM has an alphabet and a set of states. At 53 | any given moment, the FSM is in one state. When passed a symbol from 54 | the alphabet, the FSM jumps to another state (or possibly the same 55 | state). A map (Python dictionary) indicates where to jump. 56 | One state is nominated as a starting state. Zero or more states are 57 | nominated as final states. If, after consuming a string of symbols, 58 | the FSM is in a final state, then it is said to "accept" the string. 59 | This class also has some pretty powerful methods which allow FSMs to 60 | be concatenated, alternated between, multiplied, looped (Kleene star 61 | closure), intersected, and simplified. 62 | The majority of these methods are available using operator overloads. 
63 | """ 64 | 65 | alphabet: frozenset[AlphaType] 66 | states: frozenset[StateType] 67 | initial: StateType 68 | finals: frozenset[StateType] 69 | map: Mapping[StateType, Mapping[AlphaType, StateType]] 70 | 71 | # noinspection PyShadowingBuiltins 72 | # pylint: disable=too-many-arguments 73 | def __init__( 74 | self, 75 | /, 76 | *, 77 | alphabet: Iterable[AlphaType], 78 | states: Iterable[StateType], 79 | initial: StateType, 80 | finals: Iterable[StateType], 81 | # pylint: disable=redefined-builtin 82 | map: Mapping[StateType, Mapping[AlphaType, StateType]], 83 | ) -> None: 84 | """ 85 | `alphabet` is an iterable of symbols the FSM can be fed. 86 | `states` is the set of states for the FSM 87 | `initial` is the initial state 88 | `finals` is the set of accepting states 89 | `map` must be complete 90 | """ 91 | alphabet = frozenset(alphabet) 92 | states = frozenset(states) 93 | finals = frozenset(finals) 94 | 95 | # Validation. Thanks to immutability, this only needs to be carried out 96 | # once. 97 | if initial not in states: 98 | raise ValueError(f"Initial state {initial!r} must be one of {states!r}") 99 | if not finals.issubset(states): 100 | raise ValueError(f"Final states {finals!r} must be a subset of {states!r}") 101 | for state, state_trans in map.items(): 102 | if state not in states: 103 | raise ValueError(f"Transition from unknown state {state!r}") 104 | for symbol, dest in state_trans.items(): 105 | if symbol not in alphabet: 106 | raise ValueError( 107 | f"Invalid symbol {symbol!r}" 108 | f" in transition from {state!r}" 109 | f" to {dest!r}" 110 | ) 111 | if dest not in states: 112 | raise ValueError( 113 | f"Transition for state {state!r}" 114 | f" and symbol {symbol!r}" 115 | f" leads to {dest!r}," 116 | " which is not a state" 117 | ) 118 | for state in states: 119 | if state not in map: 120 | raise ValueError(f"State {state!r} missing from map") 121 | for charclass in alphabet: 122 | if charclass not in map[state]: 123 | raise ValueError( 124 | f"Symbol {charclass!r} missing from map[{state!r}]" 125 | ) 126 | 127 | # Check that the charclasses form a proper partition of all of Unicode 128 | unified = Charclass() 129 | for charclass in alphabet: 130 | if unified & charclass != Charclass(): 131 | raise ValueError(f"Alphabet {alphabet!r} has overlaps") 132 | unified |= charclass 133 | if unified != DOT: 134 | raise ValueError(f"Alphabet {alphabet!r} is not a proper partition") 135 | 136 | # Initialise the hard way due to immutability. 137 | object.__setattr__(self, "alphabet", alphabet) 138 | object.__setattr__(self, "states", states) 139 | object.__setattr__(self, "initial", initial) 140 | object.__setattr__(self, "finals", finals) 141 | object.__setattr__(self, "map", map) 142 | 143 | def accepts(self, string: str, /) -> bool: 144 | """ 145 | Test whether the present FSM accepts the supplied string (iterable 146 | of symbols). Equivalently, consider `self` as a possibly-infinite 147 | set of strings and test whether `string` is a member of it. This is 148 | actually mainly used for unit testing purposes. 149 | """ 150 | state = self.initial 151 | for char in string: 152 | for charclass in self.map[state]: 153 | if charclass.accepts(char): 154 | state = self.map[state][charclass] 155 | break 156 | return state in self.finals 157 | 158 | def __contains__(self, string: str, /) -> bool: 159 | """ 160 | This lets you use the syntax `"a" in fsm1` to see whether the 161 | string "a" is in the set of strings accepted by `fsm1`. 
162 | """ 163 | return self.accepts(string) 164 | 165 | def reduce(self, /) -> Fsm: 166 | """ 167 | A result by Brzozowski (1963) shows that a minimal finite state 168 | machine equivalent to the original can be obtained by reversing the 169 | original twice. 170 | """ 171 | return self.reversed().reversed() 172 | 173 | def __repr__(self, /) -> str: 174 | args = ", ".join( 175 | [ 176 | f"alphabet={self.alphabet!r}", 177 | f"states={self.states!r}", 178 | f"initial={self.initial!r}", 179 | f"finals={self.finals!r}", 180 | f"map={self.map!r}", 181 | ] 182 | ) 183 | return f"Fsm({args})" 184 | 185 | # The Python `__eq__` + `__hash__` contract requires that value-equality 186 | # implies hash-equality. `Fsm` `__eq__` implementation currently represents 187 | # equality of the set of accepted strings, independent of specific state 188 | # labels or unused members of the alphabet. This is not trivial to hash. 189 | # Regarding the type suppression, see 190 | # https://github.com/python/mypy/issues/4266 191 | __hash__: ClassVar[None] = None # type: ignore 192 | 193 | def __str__(self, /) -> str: 194 | rows = [] 195 | 196 | sorted_alphabet = sorted(self.alphabet) 197 | 198 | # top row 199 | row = ["", "name", "final?"] 200 | row.extend(str(symbol) for symbol in sorted_alphabet) 201 | rows.append(row) 202 | 203 | # other rows 204 | for state in self.states: 205 | row = [] 206 | if state == self.initial: 207 | row.append("*") 208 | else: 209 | row.append("") 210 | row.append(str(state)) 211 | if state in self.finals: 212 | row.append("True") 213 | else: 214 | row.append("False") 215 | for symbol in sorted_alphabet: 216 | row.append(str(self.map[state][symbol])) 217 | rows.append(row) 218 | 219 | # column widths 220 | colwidths = [] 221 | for x in range(len(rows[0])): 222 | colwidths.append(max(len(str(row[x])) for y, row in enumerate(rows)) + 1) 223 | 224 | # apply padding 225 | for y, row in enumerate(rows): 226 | for x, col in enumerate(row): 227 | rows[y][x] = col.ljust(colwidths[x]) 228 | 229 | # horizontal line 230 | rows.insert(1, ["-" * colwidth for colwidth in colwidths]) 231 | 232 | return "".join("".join(row) + "\n" for row in rows) 233 | 234 | def concatenate(*fsms: Fsm) -> Fsm: 235 | """ 236 | Concatenate arbitrarily many finite state machines together. 237 | """ 238 | unified_fsms = unify_alphabets(fsms) 239 | 240 | def connect_all( 241 | i: int, 242 | substate: StateType, 243 | ) -> Iterable[tuple[int, StateType]]: 244 | """ 245 | Take a state in the numbered FSM and return a set containing 246 | it, plus (if it's final) the first state from the next FSM, 247 | plus (if that's final) the first state from the next but one 248 | FSM, plus... 249 | """ 250 | result = {(i, substate)} 251 | while i < len(unified_fsms) - 1 and substate in unified_fsms[i].finals: 252 | i += 1 253 | substate = unified_fsms[i].initial 254 | result.add((i, substate)) 255 | return result 256 | 257 | # Use a superset containing states from all FSMs at once. 258 | # We start at the start of the first FSM. If this state is final in the 259 | # first FSM, then we are also at the start of the second FSM. And so 260 | # on. 
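# Illustrative example (assumed, not from the original comments): when concatenating machines for "a*" and "b", the initial metastate contains the start of the "a*" machine *and* the start of the "b" machine, because the former is already a final state.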
261 | initial = frozenset( 262 | connect_all(0, unified_fsms[0].initial) if unified_fsms else () 263 | ) 264 | 265 | def final(state: frozenset[tuple[int, StateType]]) -> bool: 266 | """If you're in a final state of the final FSM, it's final""" 267 | return any( 268 | i == len(unified_fsms) - 1 and substate in unified_fsms[i].finals 269 | for i, substate in state 270 | ) 271 | 272 | def follow( 273 | current: frozenset[tuple[int, StateType]], 274 | symbol: AlphaType, 275 | ) -> frozenset[tuple[int, StateType]]: 276 | """ 277 | Follow the collection of states through all FSMs at once, 278 | jumping to the next FSM if we reach the end of the current one 279 | """ 280 | next_metastate: set[tuple[int, StateType]] = set() 281 | for i, substate in current: 282 | next_metastate.update( 283 | connect_all(i, unified_fsms[i].map[substate][symbol]) 284 | ) 285 | 286 | return frozenset(next_metastate) 287 | 288 | alphabet = unified_fsms[0].alphabet if len(unified_fsms) > 0 else {~Charclass()} 289 | 290 | return crawl(alphabet, initial, final, follow).reduce() 291 | 292 | def __add__(self, other: Fsm, /) -> Fsm: 293 | """ 294 | Concatenate two finite state machines together. 295 | For example, if self accepts "0*" and other accepts "1+(0|1)", 296 | will return a finite state machine accepting "0*1+(0|1)". 297 | Accomplished by effectively following non-deterministically. 298 | Call using "fsm3 = fsm1 + fsm2" 299 | """ 300 | return self.concatenate(other) 301 | 302 | def star(self, /) -> Fsm: 303 | """ 304 | If the present FSM accepts X, returns an FSM accepting X* (i.e. 0 305 | or more Xes). This is NOT as simple as naively connecting the final 306 | states back to the initial state: see (b*ab)* for example. 307 | """ 308 | alphabet = self.alphabet 309 | 310 | initial: Collection[StateType] = {self.initial} 311 | 312 | def follow( 313 | state: Collection[StateType], 314 | symbol: AlphaType, 315 | ) -> Collection[StateType]: 316 | next_states = set() 317 | 318 | for substate in state: 319 | next_states.add(self.map[substate][symbol]) 320 | 321 | # If one of our substates is final, then we can also consider 322 | # transitions from the initial state of the original FSM. 323 | if substate in self.finals: 324 | next_states.add(self.map[self.initial][symbol]) 325 | 326 | return frozenset(next_states) 327 | 328 | def final(state: Collection[StateType]) -> bool: 329 | return any(substate in self.finals for substate in state) 330 | 331 | return crawl(alphabet, initial, final, follow) | EPSILON 332 | 333 | def times(self, multiplier: int, /) -> Fsm: 334 | """ 335 | Given an FSM and a multiplier, return the multiplied FSM. 
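For example (illustrative only): if the present FSM accepts just "ab", then times(2) returns an FSM accepting just "abab".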
336 | """ 337 | if multiplier < 0: 338 | raise ArithmeticError(f"Can't multiply an FSM by {multiplier!r}") 339 | 340 | alphabet = self.alphabet 341 | 342 | # metastate is a set of iterations+states 343 | initial: Collection[tuple[StateType, int]] = {(self.initial, 0)} 344 | 345 | def final(state: Collection[tuple[StateType, int]]) -> bool: 346 | """ 347 | If the initial state is final then multiplying doesn't alter 348 | that 349 | """ 350 | return any( 351 | substate == self.initial 352 | and (self.initial in self.finals or iteration == multiplier) 353 | for substate, iteration in state 354 | ) 355 | 356 | def follow( 357 | current: Collection[tuple[StateType, int]], 358 | symbol: AlphaType, 359 | ) -> Collection[tuple[StateType, int]]: 360 | next_metastate = [] 361 | for substate, iteration in current: 362 | if iteration < multiplier: 363 | next_metastate.append((self.map[substate][symbol], iteration)) 364 | # final of self? merge with initial on next iteration 365 | if self.map[substate][symbol] in self.finals: 366 | next_metastate.append((self.initial, iteration + 1)) 367 | return frozenset(next_metastate) 368 | 369 | return crawl(alphabet, initial, final, follow).reduce() 370 | 371 | def __mul__(self, multiplier: int, /) -> Fsm: 372 | """ 373 | Given an FSM and a multiplier, return the multiplied FSM. 374 | """ 375 | return self.times(multiplier) 376 | 377 | def union(*fsms: Fsm) -> Fsm: 378 | """ 379 | Treat `fsms` as a collection of arbitrary FSMs and return the union 380 | FSM. Can be used as `fsm1.union(fsm2, ...)` or 381 | `fsm.union(fsm1, ...)`. `fsms` may be empty. 382 | """ 383 | return parallel(fsms, any) 384 | 385 | def __or__(self, other: Fsm, /) -> Fsm: 386 | """ 387 | Alternation. 388 | Return a finite state machine which accepts any sequence of symbols 389 | that is accepted by either self or other. Note that the set of 390 | strings recognised by the two FSMs undergoes a set union. 391 | Call using "fsm3 = fsm1 | fsm2" 392 | """ 393 | return self.union(other) 394 | 395 | def intersection(*fsms: Fsm) -> Fsm: 396 | """ 397 | Intersection. 398 | Take FSMs and AND them together. That is, return an FSM which 399 | accepts any sequence of symbols that is accepted by both of the 400 | original FSMs. Note that the set of strings recognised by the two 401 | FSMs undergoes a set intersection operation. 402 | Call using "fsm3 = fsm1 & fsm2" 403 | """ 404 | return parallel(fsms, all) 405 | 406 | def __and__(self, other: Fsm, /) -> Fsm: 407 | """ 408 | Treat the FSMs as sets of strings and return the intersection of 409 | those sets in the form of a new FSM. 410 | """ 411 | return self.intersection(other) 412 | 413 | def symmetric_difference(*fsms: Fsm) -> Fsm: 414 | """ 415 | Treat `fsms` as a collection of sets of strings and compute the 416 | symmetric difference of them all. The python set method only allows 417 | two sets to be operated on at once, but we go the extra mile since 418 | it's not too hard. 419 | """ 420 | return parallel(fsms, lambda accepts: (accepts.count(True) % 2) == 1) 421 | 422 | def __xor__(self, other: Fsm, /) -> Fsm: 423 | """ 424 | Symmetric difference. Returns an FSM which recognises only the 425 | strings recognised by `self` or `other` but not both. 426 | """ 427 | return self.symmetric_difference(other) 428 | 429 | def everythingbut(self, /) -> Fsm: 430 | """ 431 | Return a finite state machine which will accept any string NOT 432 | accepted by self, and will not accept any string accepted by self. 
433 | """ 434 | alphabet = self.alphabet 435 | initial = self.initial 436 | 437 | def follow( 438 | current: StateType, 439 | symbol: AlphaType, 440 | ) -> StateType: 441 | return self.map[current][symbol] 442 | 443 | # state is final unless the original was 444 | def final(state: StateType) -> bool: 445 | return state not in self.finals 446 | 447 | return crawl(alphabet, initial, final, follow).reduce() 448 | 449 | def reversed(self, /) -> Fsm: 450 | """ 451 | Return a new FSM such that for every string that self accepts (e.g. 452 | "beer"), the new FSM accepts the reversed string ("reeb"). 453 | """ 454 | alphabet = self.alphabet 455 | 456 | # Start from a composite "state-set" consisting of all final states. 457 | # If there are no final states, this set is empty and we'll find that 458 | # no other states get generated. 459 | initial = frozenset(self.finals) 460 | 461 | # Find every possible way to reach the current state-set 462 | # using this symbol. 463 | def follow( 464 | current: frozenset[StateType], 465 | symbol: AlphaType, 466 | ) -> frozenset[StateType]: 467 | next_states = frozenset( 468 | [ 469 | prev 470 | for prev in self.map 471 | for state in current 472 | if self.map[prev][symbol] == state 473 | ] 474 | ) 475 | return next_states 476 | 477 | # A state-set is final if the initial state is in it. 478 | def final(state: frozenset[StateType]) -> bool: 479 | return self.initial in state 480 | 481 | # Man, crawl() is the best! 482 | return crawl(alphabet, initial, final, follow) 483 | # Do not reduce() the result, since reduce() calls us in turn 484 | 485 | def islive(self, /, state: StateType) -> bool: 486 | """A state is "live" if a final state can be reached from it.""" 487 | reachable = [state] 488 | i = 0 489 | while i < len(reachable): 490 | current = reachable[i] 491 | if current in self.finals: 492 | return True 493 | for symbol in self.map[current]: 494 | next_state = self.map[current][symbol] 495 | if next_state not in reachable: 496 | reachable.append(next_state) 497 | i += 1 498 | return False 499 | 500 | def empty(self, /) -> bool: 501 | """ 502 | An FSM is empty if it recognises no strings. An FSM may be 503 | arbitrarily complicated and have arbitrarily many final states 504 | while still recognising no strings because those final states may 505 | all be inaccessible from the initial state. Equally, an FSM may be 506 | non-empty despite having an empty alphabet if the initial state is 507 | final. 508 | """ 509 | return not self.islive(self.initial) 510 | 511 | def strings(self, otherchars: Iterable[str]) -> Iterator[str]: 512 | """ 513 | Generate strings that this FSM accepts. Note that for our purposes a 514 | string is a sequence of Unicode characters, NOT a list of Charclasses. 515 | 516 | Since 517 | there may be infinitely many of these, we use a generator instead of 518 | constructing a static list. Strings will be sorted in order of 519 | length and then lexically. This procedure uses arbitrary amounts of 520 | memory but is very fast. There may be more efficient ways to do 521 | this, that I haven't investigated yet. You can use this in list 522 | comprehensions. 523 | """ 524 | 525 | # Most FSMs have at least one "dead state". 526 | # Once you reach a dead state, you can no 527 | # longer reach a final state. Since many strings may end up here, it's 528 | # advantageous to constrain our search to live states only. 529 | livestates = set(state for state in self.states if self.islive(state)) 530 | 531 | # We store a list of tuples.
Each tuple consists of an input string and 532 | # the state that this input string leads to. This means we don't have 533 | # to run the state machine from the very beginning every time we want 534 | # to check a new string. 535 | strings: list[tuple[str, StateType]] = [] 536 | 537 | # Initial entry (or possibly not, in which case this is a short one) 538 | cstate: StateType = self.initial 539 | cstring: str = "" 540 | if cstate in livestates: 541 | if cstate in self.finals: 542 | yield cstring 543 | strings.append((cstring, cstate)) 544 | 545 | # Fixed point calculation 546 | i = 0 547 | while i < len(strings): 548 | cstring, cstate = strings[i] 549 | 550 | for charclass in sorted(self.map[cstate]): 551 | # TODO: scrap otherchars as a concept? 552 | chars = otherchars if charclass.negated else charclass.get_chars() 553 | for char in chars: 554 | nstate = self.map[cstate][charclass] 555 | nstring = cstring + char 556 | if nstate in livestates: 557 | if nstate in self.finals: 558 | yield nstring 559 | strings.append((nstring, nstate)) 560 | i += 1 561 | 562 | def __iter__(self, /) -> Iterator[str]: 563 | """ 564 | This allows you to do `for string in fsm1` as a list comprehension! 565 | """ 566 | return self.strings([]) 567 | 568 | def equivalent(self, other: Fsm, /) -> bool: 569 | """ 570 | Two FSMs are considered equivalent if they recognise the same 571 | strings. Or, to put it another way, if their symmetric difference 572 | recognises no strings. 573 | """ 574 | return (self ^ other).empty() 575 | 576 | def __eq__(self, other: object, /) -> bool: 577 | """ 578 | You can use `fsm1 == fsm2` to determine whether two FSMs recognise 579 | the same strings. 580 | """ 581 | if not isinstance(other, Fsm): 582 | return NotImplemented 583 | return self.equivalent(other) 584 | 585 | def different(self, other: Fsm, /) -> bool: 586 | """ 587 | Two FSMs are considered different if they have a non-empty 588 | symmetric difference. 589 | """ 590 | return not (self ^ other).empty() 591 | 592 | def __ne__(self, other: object, /) -> bool: 593 | """ 594 | Use `fsm1 != fsm2` to determine whether two FSMs recognise 595 | different strings. 596 | """ 597 | return not self == other 598 | 599 | def difference(*fsms: Fsm) -> Fsm: 600 | """ 601 | Difference. Returns an FSM which recognises only the strings 602 | recognised by the first FSM in the list, but none of the others. 603 | """ 604 | return parallel(fsms, lambda accepts: accepts[0] and not any(accepts[1:])) 605 | 606 | def __sub__(self, other: Fsm, /) -> Fsm: 607 | return self.difference(other) 608 | 609 | def cardinality(self, /) -> int: 610 | """ 611 | Consider the FSM as a set of strings and return the cardinality of 612 | that set, or raise an OverflowError if there are infinitely many 613 | """ 614 | num_strings: dict[StateType, int | None] = {} 615 | 616 | def get_num_strings(state: StateType) -> int: 617 | # Most FSMs have at least one oblivion state 618 | if self.islive(state): 619 | if state in num_strings: 620 | if num_strings[state] is None: # "computing..." 621 | # Recursion! There are infinitely many strings 622 | # recognised 623 | raise OverflowError(state) 624 | return num_strings[state] # type: ignore 625 | 626 | num_strings[state] = None # i.e. "computing..." 
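# Descriptive note: the loop below adds, for each outgoing charclass, one string per character when the destination state is final, plus (characters in the charclass) multiplied by the number of strings accepted from that destination.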
627 | n = 0 628 | for charclass in self.map[state]: 629 | num_transitions = charclass.num_chars() 630 | nstate = self.map[state][charclass] 631 | if nstate in self.finals: 632 | n += num_transitions 633 | n += num_transitions * get_num_strings(nstate) 634 | num_strings[state] = n 635 | 636 | else: 637 | # Dead state 638 | num_strings[state] = 0 639 | 640 | return num_strings[state] # type: ignore 641 | 642 | n = 1 if self.initial in self.finals else 0 643 | return n + get_num_strings(self.initial) 644 | 645 | def __len__(self, /) -> int: 646 | """ 647 | Consider the FSM as a set of strings and return the cardinality of 648 | that set, or raise an OverflowError if there are infinitely many 649 | """ 650 | return self.cardinality() 651 | 652 | def isdisjoint(self, other: Fsm, /) -> bool: 653 | """ 654 | Treat `self` and `other` as sets of strings and see if they are 655 | disjoint 656 | """ 657 | return (self & other).empty() 658 | 659 | def issubset(self, other: Fsm, /) -> bool: 660 | """ 661 | Treat `self` and `other` as sets of strings and see if `self` is a 662 | subset of `other`... `self` recognises no strings which `other` 663 | doesn't. 664 | """ 665 | return (self - other).empty() 666 | 667 | def __le__(self, other: Fsm, /) -> bool: 668 | """ 669 | Treat `self` and `other` as sets of strings and see if `self` is a 670 | subset of `other`... `self` recognises no strings which `other` 671 | doesn't. 672 | """ 673 | return self.issubset(other) 674 | 675 | def ispropersubset(self, other: Fsm, /) -> bool: 676 | """ 677 | Treat `self` and `other` as sets of strings and see if `self` is a 678 | proper subset of `other`. 679 | """ 680 | return self <= other and self != other 681 | 682 | def __lt__(self, other: Fsm, /) -> bool: 683 | """ 684 | Treat `self` and `other` as sets of strings and see if `self` is a 685 | strict subset of `other`. 686 | """ 687 | return self.ispropersubset(other) 688 | 689 | def issuperset(self, other: Fsm, /) -> bool: 690 | """ 691 | Treat `self` and `other` as sets of strings and see if `self` is a 692 | superset of `other`. 693 | """ 694 | return (other - self).empty() 695 | 696 | def __ge__(self, other: Fsm, /) -> bool: 697 | """ 698 | Treat `self` and `other` as sets of strings and see if `self` is a 699 | superset of `other`. 700 | """ 701 | return self.issuperset(other) 702 | 703 | def ispropersuperset(self, other: Fsm, /) -> bool: 704 | """ 705 | Treat `self` and `other` as sets of strings and see if `self` is a 706 | proper superset of `other`. 707 | """ 708 | return self >= other and self != other 709 | 710 | def __gt__(self, other: Fsm, /) -> bool: 711 | """ 712 | Treat `self` and `other` as sets of strings and see if `self` is a 713 | strict superset of `other`. 714 | """ 715 | return self.ispropersuperset(other) 716 | 717 | def copy(self, /) -> Fsm: 718 | """ 719 | For completeness only, since `set.copy()` and `frozenset.copy()` exist. 720 | FSM objects are immutable; like `frozenset`, this just returns `self`. 721 | """ 722 | return self 723 | 724 | __copy__ = copy 725 | 726 | def derive(self, string: str, /) -> Fsm: 727 | """ 728 | Compute the Brzozowski derivative of this FSM with respect to the 729 | input string. Note that the FSM uses Charclasses as symbols internally, 730 | but the input string is a sequence of Unicode characters 731 | 732 | """ 733 | # Consume the input string. 
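# Descriptive note: the constructor validates that the alphabet is a partition of all of Unicode, so exactly one charclass accepts each character and the inner loop below always finds a transition to follow.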
734 | state = self.initial 735 | for char in string: 736 | for charclass in self.map[state]: 737 | if charclass.accepts(char): 738 | state = self.map[state][charclass] 739 | break 740 | 741 | # OK so now we have consumed that string, use the new location as 742 | # the starting point. 743 | return Fsm( 744 | alphabet=self.alphabet, 745 | states=self.states, 746 | initial=state, 747 | finals=self.finals, 748 | map=self.map, 749 | ) 750 | 751 | def replace_alphabet( 752 | self, replacements: Mapping[AlphaType, Iterable[AlphaType]] 753 | ) -> Fsm: 754 | """ 755 | Returns a new FSM which uses a different alphabet. If one original 756 | symbol converts to two new symbols, there will be multiple identical 757 | transitions; if none, the transitions will be omitted. 758 | """ 759 | new_alphabet = set() 760 | for symbol in self.alphabet: 761 | for replacement in replacements[symbol]: 762 | new_alphabet.add(replacement) 763 | 764 | new_map: Dict[StateType, Dict[AlphaType, StateType]] = {} 765 | for state in self.map: 766 | new_map[state] = {} 767 | for symbol in self.alphabet: 768 | for replacement in replacements[symbol]: 769 | new_map[state][replacement] = self.map[state][symbol] 770 | 771 | return Fsm( 772 | alphabet=new_alphabet, 773 | states=self.states, 774 | initial=self.initial, 775 | finals=self.finals, 776 | map=new_map, 777 | ) 778 | 779 | 780 | NULL = Fsm( 781 | alphabet={~Charclass()}, 782 | states={0}, 783 | initial=0, 784 | finals=(), 785 | map={ 786 | 0: {~Charclass(): 0}, 787 | }, 788 | ) 789 | """ 790 | An FSM accepting nothing (not even the empty string). This 791 | demonstrates that this is possible, and is also extremely useful 792 | in some situations. 793 | """ 794 | 795 | EPSILON = Fsm( 796 | alphabet={~Charclass()}, 797 | states={0, 1}, 798 | initial=0, 799 | finals={0}, 800 | map={ 801 | 0: {~Charclass(): 1}, 802 | 1: {~Charclass(): 1}, 803 | }, 804 | ) 805 | """ 806 | An FSM matching the empty string, "", only. 807 | This is very useful in many situations. 808 | """ 809 | 810 | 811 | def parallel( 812 | fsms: tuple[Fsm, ...], 813 | test: Callable[[list[bool]], bool], 814 | /, 815 | ) -> Fsm: 816 | """ 817 | Crawl several FSMs in parallel, mapping the states of a larger 818 | meta-FSM. To determine whether a state in the larger FSM is final, pass 819 | all of the finality statuses (e.g. [True, False, False]) to `test`. 820 | """ 821 | unified_fsms = unify_alphabets(fsms) 822 | 823 | initial: Mapping[int, StateType] = { 824 | i: fsm.initial for i, fsm in enumerate(unified_fsms) 825 | } 826 | 827 | # dedicated function accepts a "superset" and returns the next "superset" 828 | # obtained by following this transition in the new FSM 829 | def follow( 830 | current: Mapping[int, StateType], 831 | symbol: AlphaType, 832 | ) -> Mapping[int, StateType]: 833 | return {i: fsm.map[current[i]][symbol] for i, fsm in enumerate(unified_fsms)} 834 | 835 | # Determine the "is final?" condition of each substate, then pass it to the 836 | # test to determine finality of the overall FSM.
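# For example, union() passes `any` as the test (the metastate is final if any component FSM is in a final state), intersection() passes `all`, and difference() / symmetric_difference() pass their own lambdas (see the methods above).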
837 | def final(state: Mapping[int, StateType]) -> bool: 838 | return test([state[i] in fsm.finals for i, fsm in enumerate(unified_fsms)]) 839 | 840 | alphabet = unified_fsms[0].alphabet if len(unified_fsms) > 0 else {~Charclass()} 841 | 842 | return crawl(alphabet, initial, final, follow).reduce() 843 | 844 | 845 | def crawl( 846 | alphabet: Iterable[AlphaType], 847 | initial: M, 848 | final: Callable[[M], bool], 849 | follow: Callable[[M, AlphaType], M], 850 | ) -> Fsm: 851 | """ 852 | Given the above conditions and instructions, crawl a new unknown FSM, 853 | mapping its states, final states and transitions. Return the new FSM. 854 | This is a pretty powerful procedure which could potentially go on 855 | forever if you supply an evil version of follow(). 856 | """ 857 | 858 | states: list[M] = [initial] 859 | finals: set[StateType] = set() 860 | transitions: dict[StateType, dict[AlphaType, StateType]] = {} 861 | 862 | # iterate over a growing list 863 | i = 0 864 | while i < len(states): 865 | state = states[i] 866 | 867 | # add to finals 868 | if final(state): 869 | finals.add(i) 870 | 871 | # compute map for this state 872 | transitions[i] = {} 873 | for symbol in sorted(alphabet): 874 | next_state = follow(state, symbol) 875 | 876 | try: 877 | j = states.index(next_state) 878 | except ValueError: 879 | j = len(states) 880 | states.append(next_state) 881 | 882 | transitions[i][symbol] = j 883 | 884 | i += 1 885 | 886 | return Fsm( 887 | alphabet=alphabet, 888 | states=set(range(len(states))), 889 | initial=0, 890 | finals=finals, 891 | map=transitions, 892 | ) 893 | 894 | 895 | def from_charclass(charclass: Charclass) -> Fsm: 896 | # 0 is initial, 1 is final, 2 is dead 897 | return Fsm( 898 | alphabet={charclass, ~charclass}, 899 | states={0, 1, 2}, 900 | initial=0, 901 | finals={1}, 902 | map={ 903 | 0: {charclass: 1, ~charclass: 2}, 904 | 1: {charclass: 2, ~charclass: 2}, 905 | 2: {charclass: 2, ~charclass: 2}, 906 | }, 907 | ) 908 | --------------------------------------------------------------------------------