├── .coveragerc
├── rgx
    ├── __init__.py
    ├── unicode_meta.py
    ├── meta.py
    └── entities.py
├── pyproject.toml
├── test
    ├── test_groups.py
    ├── test_creation.py
    ├── test_url.py
    └── test_operations.py
├── .github
    └── workflows
    │   ├── build.yml
    │   └── test.yml
├── LICENSE
├── .gitignore
└── README.md


/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | exclude_lines = 
3 |     pragma: not covered
4 |     @overload


--------------------------------------------------------------------------------
/rgx/__init__.py:
--------------------------------------------------------------------------------
1 | from .entities import (
2 |     pattern,
3 |     NamedPattern as named,
4 |     group_reference as reference,
5 |     ConditionalPattern as conditional,
6 |     char_range as char_range,
7 |     Context as Context,
8 | )
9 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "rgx"
 3 | version = "2.0.0"
 4 | description = "Typed, simple and readable regexp generation"
 5 | authors = ["Dmitry Gritsenko <rgx@evtn.ru>"]
 6 | license = "MIT"
 7 | readme = "README.md"
 8 | repository = "https://github.com/evtn/rgx"
 9 | homepage = "https://github.com/evtn/rgx"
10 | keywords = ["regex", "regexp", "regular expressions"]
11 | 
12 | [tool.poetry.dependencies]
13 | python = "^3.7"
14 | wordstreamer = "^0.1.3"
15 | 
16 | [tool.poetry.dev-dependencies]
17 | 
18 | [tool.poetry.group.dev.dependencies]
19 | pytest = "^7.4.0"
20 | mypy = "^1.4.1"
21 | coverage = "^7.2.7"
22 | 
23 | [build-system]
24 | requires = ["poetry-core>=1.0.0"]
25 | build-backend = "poetry.core.masonry.api"


--------------------------------------------------------------------------------
/rgx/unicode_meta.py:
--------------------------------------------------------------------------------
 1 | from rgx.entities import UnescapedLiteral
 2 | 
 3 | 
 4 | def NAMED_PROPERTY(name: str, value: str) -> UnescapedLiteral:
 5 |     return UnescapedLiteral(fr"\P{{{name}={value}}}")
 6 | 
 7 | 
 8 | def NAMED_PROPERTY_INVERSE(name: str, value: str) -> UnescapedLiteral:
 9 |     return UnescapedLiteral(fr"\p{{{name}={value}}}")
10 | 
11 | 
12 | def PROPERTY(value: str) -> UnescapedLiteral:
13 |     return UnescapedLiteral(fr"\p{{{value}}}")
14 | 
15 | 
16 | def PROPERTY_INVERSE(value: str) -> UnescapedLiteral:
17 |     return UnescapedLiteral(fr"\P{{{value}}}")
18 | 
19 | 
20 | LETTER = PROPERTY("L")
21 | NON_LETTER = PROPERTY_INVERSE("L")
22 | 
23 | WHITESPACE = PROPERTY("Z")
24 | NON_WHITESPACE = PROPERTY_INVERSE("Z")
25 | 
26 | DIGIT = PROPERTY("Nd")
27 | NON_DIGIT = PROPERTY("Nd")


--------------------------------------------------------------------------------
/test/test_groups.py:
--------------------------------------------------------------------------------
 1 | from rgx import pattern, conditional, named
 2 | 
 3 | a = pattern("a")  # single-character fixtures shared by every test below
 4 | b = pattern("b")
 5 | 
 6 | 
 7 | class TestClass:  # rendering of lookarounds, comments, conditionals and groups
 8 |     def test_look_x(self):  # positive, then negative, lookahead/lookbehind
 9 |         assert a.before(b).render_str() == "a(?=b)"
10 |         assert a.after(b).render_str() == "(?<=b)a"
11 | 
12 |         assert a.not_before(b).render_str() == "a(?!b)"
13 |         assert a.not_after(b).render_str() == "(?<!b)a"
14 | 
15 |     def test_comment(self):  # inline (?#...) comment is appended after the pattern
16 |         assert a.comment(" that's a!").render_str() == "a(?# that's a!)"
17 | 
18 |     def test_conditional(self):  # branch on whether group 1 participated
19 |         assert conditional(1, a, b).render_str() == "(?(1)a|b)"
20 | 
21 |     def test_group(self):  # plain capturing group
22 |         assert a.capture().render_str() == "(a)"
23 | 
24 |     def test_named(self):  # named capturing group: name first, pattern second
25 |         assert named("a", b).render_str() == "(?P<a>b)"
26 | 


--------------------------------------------------------------------------------
/rgx/meta.py:
--------------------------------------------------------------------------------
 1 | from rgx.entities import UnescapedLiteral
 2 | 
 3 | 
 4 | WORD_CHAR = UnescapedLiteral(r"\w")  # shorthand classes: lowercase matches, uppercase is the complement
 5 | NON_WORD_CHAR = UnescapedLiteral(r"\W")
 6 | DIGIT = UnescapedLiteral(r"\d")
 7 | NON_DIGIT = UnescapedLiteral(r"\D")
 8 | WHITESPACE = UnescapedLiteral(r"\s")
 9 | NON_WHITESPACE = UnescapedLiteral(r"\S")
10 | WORD_BOUNDARY = UnescapedLiteral(r"\b")  # zero-width assertion, consumes no characters
11 | NON_WORD_BOUNDARY = UnescapedLiteral(r"\B")  # zero-width assertion, consumes no characters
12 | ANY = UnescapedLiteral(".")  # left unescaped on purpose: the "any character" wildcard
13 | NEWLINE = UnescapedLiteral(r"\n")  # raw strings: the regex engine, not Python, interprets these escapes
14 | CARRIAGE_RETURN = UnescapedLiteral(r"\r")
15 | TAB = UnescapedLiteral(r"\t")
16 | NULL_CHAR = UnescapedLiteral(r"\0")
17 | STRING_START = UnescapedLiteral("^")  # anchors; in Python's `re` they also match at line boundaries under MULTILINE
18 | STRING_END = UnescapedLiteral("$")
19 | 
20 | def CHAR_ESCAPE(char_number: int):
21 |     try:
22 |         chr(char_number)
23 |     except ValueError:
24 |         raise ValueError(f"Invalid character: {char_number}")
25 |     prefix = ["x", "u", "U"][(char_number > 255) + (char_number > 65535)]
26 |     length = {"x": 2, "u": 4, "U": 8}[prefix]
27 |     return UnescapedLiteral(f"\\{prefix}{char_number:0{length}x}")


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Publish on PyPI
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches:
 7 |       - lord
 8 |     paths:
 9 |       - pyproject.toml
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 |     name: Build and publish
15 | 
16 |     steps:
17 |       - name: git-checkout
18 |         uses: actions/checkout@v3
19 | 
20 |       - name: Set up Python
21 |         uses: actions/setup-python@v4
22 |         with:
23 |           python-version: 3.9
24 | 
25 |       - name: Build and Publish
26 |         env:
27 |           PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
28 |         run: |
29 |           python -m pip install poetry
30 |           poetry install
31 |           poetry build
32 |           poetry config pypi-token.pypi $PYPI_TOKEN
33 |           poetry publish
34 | 
35 |       - run: echo "VERSION=$(poetry version -s)" >> $GITHUB_ENV
36 |         id: version-check
37 | 
38 |       - name: Release on GitHub
39 |         uses: softprops/action-gh-release@v1
40 |         with:
41 |           files: dist/*
42 |           tag_name: ${{ env.VERSION }}
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Dmitry Gritsenko
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches:
 7 |       - lord
 8 |     paths:
 9 |       - "**.py"
10 |       - "**.yml"
11 |   pull_request:
12 |     paths:
13 |       - "**.py"
14 |       - "**.yml"
15 | jobs:
16 |   check_types:
17 |     runs-on: ubuntu-latest
18 |     name: Check Types
19 |     steps:
20 |       - name: git-checkout
21 |         uses: actions/checkout@v3
22 | 
23 |       - name: Set up Python
24 |         uses: actions/setup-python@v4
25 |         with:
26 |           python-version: "3.11"
27 | 
28 |       - name: Install Poetry
29 |         uses: abatilo/actions-poetry@v2
30 | 
31 |       - run: poetry install --with dev
32 |       - run: poetry run mypy rgx/*.py --disallow-any-expr
33 | 
34 |   run-tests:
35 |     runs-on: ubuntu-latest
36 |     strategy:
37 |       matrix:
38 |         python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
39 | 
40 |     name: Run Tests
41 |     steps:
42 |       - name: git-checkout
43 |         uses: actions/checkout@v3
44 | 
45 |       - name: Set up Python
46 |         uses: actions/setup-python@v4
47 |         with:
48 |           python-version: ${{ matrix.python-version }}
49 | 
50 |       - name: Install Poetry
51 |         uses: abatilo/actions-poetry@v2
52 | 
53 |       - run: poetry install --with dev
54 | 
55 |       - name: Test
56 |         run: poetry run coverage run --include "rgx/*" -m pytest test/
57 | 
58 |       - name: Coveralls Update
59 |         uses: coverallsapp/github-action@v2
60 |         with:
61 |           github-token: ${{ secrets.GITHUB_TOKEN }}
62 |           flag-name: ${{ matrix.python-version }}
63 | 


--------------------------------------------------------------------------------
/test/test_creation.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from rgx import pattern, char_range, reference, named
 3 | from rgx.entities import CharType, RegexPattern
 4 | import pytest
 5 | 
 6 | 
 7 | class TestClass:  # construction of patterns from Python literals
 8 |     def test_literals(self):  # str -> literal, tuple -> group, list -> character class
 9 |         assert pattern("x").render_str() == "x"
10 |         assert pattern(".").render_str() == "\\."
11 |         assert pattern(".", escape=False).render_str() == "."
12 |         assert pattern(("x",)).render_str() == "x"
13 |         assert pattern(("x", "y")).render_str() == "(?:xy)"
14 |         assert pattern(["x", "y"]).render_str() == "[xy]"
15 | 
16 |     def test_char_classes(self):  # ranges, unions and reversal of character classes
17 |         onetwo_list: List[CharType] = ["1", "2"]
18 |         onetwo_chars = pattern(onetwo_list)
19 | 
20 |         az_char_range = pattern("a").to("z")
21 | 
22 |         assert az_char_range.render_str() == "[a-z]"
23 |         assert az_char_range.reverse().render_str() == "[^a-z]"
24 |         assert (onetwo_chars | az_char_range).render_str() == "[12a-z]"
25 |         assert (onetwo_list | az_char_range).render_str() == "[12a-z]"  # plain list on the left side
26 |         assert onetwo_chars.render_str() == "[12]"
27 | 
28 |         assert (pattern("1") | pattern("2")).render_str() == "[12]"
29 | 
30 |         assert pattern("1").to("2").render_str() == "[12]"  # ranges of up to three characters are spelled out
31 |         assert pattern("1").to("3").render_str() == "[123]"
32 |         assert pattern("1").to("4").render_str() == "[1-4]"
33 | 
34 |         assert (pattern("1").to("9") | "0").render_str() == "[0-9]"  # adjacent character merges into the range
35 | 
36 |         assert char_range("a").render_str() == "[a-]"  # only one bound given
37 |         assert char_range(None, "z").render_str() == "[-z]"
38 | 
39 |         assert pattern(["-"]).render_str() == "\\-"  # not a range actually
40 | 
41 |         a = pattern("a")
42 |         assert repr(a) == a.render_str()
43 | 
44 |         with pytest.raises(ValueError):  # a range needs at least one bound
45 |             char_range()
46 | 
47 |     def test_references(self):  # numeric and named backreferences
48 |         assert reference(1).render_str() == "\\1"
49 |         assert named("x").render_str() == "(?P=x)"  # named() without a pattern is a reference
50 | 
51 |     def test_flags(self):  # flags passed to render_str() become a global prefix
52 |         assert pattern("x").render_str("i") == "(?i)x"
53 | 
54 |     def test_that_render_on_regex_pattern_is_not_implemented_i_know_this_is_stupid_but_still(
55 |         self,
56 |     ):  # the base class render() returns NotImplemented rather than raising
57 |         assert RegexPattern().render(RegexPattern.default_context) == NotImplemented
58 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | poetry.lock


--------------------------------------------------------------------------------
/test/test_url.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from rgx.entities import UnescapedLiteral
  4 | from rgx import pattern, char_range
  5 | 
  6 | 
  7 | letter = char_range("a", "z") | char_range("A", "Z")  # building blocks of the URL grammar
  8 | nonzero = char_range("1", "9")
  9 | digit = char_range("0", "9")
 10 | mark = pattern(["-", "_", ".", "!", "~", "*", "'", "(", ")"])
 11 | reserved = pattern([";", "/", "?", ":", "@", "&", "=", "+", "$", ","])  # not referenced again in this module
 12 | unreserved = letter | digit | mark
 13 | 
 14 | hex_char = digit | char_range("a", "f") | char_range("A", "F")
 15 | escaped = "%" + hex_char.x_times(2)  # percent-encoded byte: "%" plus exactly two hex digits
 16 | 
 17 | identifier = letter + (letter | digit | pattern(["+", "-", "."])).some()  # .some() is "zero or more"
 18 | scheme = identifier.named("scheme")
 19 | 
 20 | userinfo = (
 21 |     (unreserved | escaped | pattern([";", ":", "&", "=", "+", "$", ","]))
 22 |     .many()
 23 |     .named("userinfo")
 24 | )
 25 | 
 26 | domain = (letter | digit) + (
 27 |     (letter | digit | pattern(["-"])).some() + (letter | digit)
 28 | ).maybe()
 29 | 
 30 | top_domain = (
 31 |     letter + ((letter | digit | pattern(["-"])).some() + (letter | digit)).maybe()
 32 | )
 33 | 
 34 | hostname = (domain + ".").some() + top_domain + pattern(".").maybe()  # optional trailing dot
 35 | 
 36 | ip_number = (pattern("1").maybe() + nonzero.maybe() + digit) | (
 37 |     "2" + (char_range("0", "4") + digit | "5" + char_range("0", "5"))
 38 | )
 39 | 
 40 | ip4_address = (ip_number + ".").x_times(3) + ip_number
 41 | host = (hostname | ip4_address).named("host")
 42 | 
 43 | port = digit.some().named("port")  # zero or more digits, so the group can capture ""
 44 | 
 45 | authority = ((userinfo + "@").maybe() + host + (":" + port).maybe()).named("authority")
 46 | 
 47 | pchar = unreserved | escaped | pattern([":", "@", "&", "=", "+", "$", ",", ";"])
 48 | 
 49 | param = pchar.some()
 50 | path_segment = param + (";" + param).some()
 51 | path_segment_nonempty = pchar.many() | param + (";" + param).many()
 52 | path_segments = path_segment.maybe() + ("/" + path_segment.maybe()).some()
 53 | no_authority_path = (path_segment_nonempty + "/" + path_segments).maybe()
 54 | 
 55 | path = ("/" + path_segments).named("path")
 56 | 
 57 | autority_with_path = "//" + authority + path.maybe()  # NOTE(review): "autority" is a typo; kept because `url` below uses this name
 58 | no_authority_with_path = no_authority_path.named("path_noauthority")
 59 | 
 60 | qfchars = (pchar | pattern(["?", "/"])).some()  # shared character set for query and fragment
 61 | 
 62 | query = qfchars.named("query")
 63 | fragment = qfchars.named("fragment")
 64 | 
 65 | url = (
 66 |     scheme
 67 |     + ":"
 68 |     + (autority_with_path | no_authority_with_path)
 69 |     + ("?" + query).maybe()
 70 |     + (UnescapedLiteral("#") + fragment).maybe()
 71 | )
 72 | 
 73 | import re
 74 | 
 75 | url_regex = re.compile(str(url))  # compiled once at import time
 76 | 
 77 | 
 78 | test_suites: dict[str, dict] = {  # URL -> expected groupdict() of a full match
 79 |     "https://datatracker.ietf.org/doc/html/rfc3986?asd=213#section-3.4": {
 80 |         "scheme": "https",
 81 |         "authority": "datatracker.ietf.org",
 82 |         "userinfo": None,
 83 |         "host": "datatracker.ietf.org",
 84 |         "port": None,
 85 |         "path": "/doc/html/rfc3986",
 86 |         "path_noauthority": None,
 87 |         "query": "asd=213",
 88 |         "fragment": "section-3.4",
 89 |     },
 90 |     "http://http://http://@http://http://?http://#http://": {  # stress case: "http://" in every component
 91 |         "scheme": "http",
 92 |         "authority": "http:",
 93 |         "userinfo": None,
 94 |         "host": "http",
 95 |         "port": "",  # ":" is present but no digits follow it
 96 |         "path": "//http://@http://http://",
 97 |         "path_noauthority": None,
 98 |         "query": "http://",
 99 |         "fragment": "http://",
100 |     },
101 |     "https://mail.python.org/archives/list/typing-sig@python.org/thread/66RITIHDQHVTUMJHH2ORSNWZ6DOPM367/#QYOBBLTWVSEWMFRRHBA2OPR5QQ4IMWOL": {
102 |         "scheme": "https",
103 |         "authority": "mail.python.org",
104 |         "userinfo": None,
105 |         "host": "mail.python.org",
106 |         "port": None,
107 |         "path": "/archives/list/typing-sig@python.org/thread/66RITIHDQHVTUMJHH2ORSNWZ6DOPM367/",
108 |         "path_noauthority": None,
109 |         "query": None,
110 |         "fragment": "QYOBBLTWVSEWMFRRHBA2OPR5QQ4IMWOL",
111 |     },
112 | }
113 | 
114 | 
115 | class TestClass:  # end-to-end check of the URL regex built above
116 |     def test_url(self):  # every sample must match in full and yield the expected groups
117 |         for test_url, expected_result in test_suites.items():
118 |             match = url_regex.fullmatch(test_url)
119 | 
120 |             assert match and match.groupdict() == expected_result  # `match and` guards against a None match
121 | 


--------------------------------------------------------------------------------
/test/test_operations.py:
--------------------------------------------------------------------------------
  1 | from rgx import pattern
  2 | from rgx.entities import Option
  3 | 
  4 | a = pattern("a")  # single-character fixtures shared by every test below
  5 | b = pattern("b")
  6 | 
  7 | 
  8 | class TestClass:  # rendering of operators, quantifiers and flags
  9 |     def test_concat(self):  # "+" with a literal on either side, and the .concat() alias
 10 |         assert (a + "b").render_str() == "ab"
 11 |         assert ("b" + a).render_str() == "ba"
 12 |         assert (a + b).render_str() == "ab"
 13 |         assert (a + a + a).render_str() == "aaa"
 14 |         assert a.concat(b).concat(a).render_str() == (a + b + a).render_str()
 15 | 
 16 |     def test_option(self):  # "|" between multi-character patterns renders an alternation
 17 |         # those are needed because one-char string produces Chars instance, thus making result render differently
 18 |         ab = pattern("ab")
 19 |         ac = pattern("ac")
 20 | 
 21 |         assert (ab | ac).render_str() == "ab|ac"
 22 |         assert (ab | "b").render_str() == "ab|b"
 23 |         assert ("a" | ac).render_str() == "a|ac"
 24 |         assert (ab | ac | ab).render_str() == "ab|ac|ab"
 25 |         assert ("a" | (ab | ac)).render_str() == "a|ab|ac"
 26 |         assert ab.option(ac).render_str() == (ab | ac).render_str()
 27 | 
 28 |     def test_char_option(self):  # "|" between single characters collapses into a character class
 29 |         assert (a | b).render_str() == "[ab]"
 30 |         assert (a | "b").render_str() == "[ab]"
 31 |         assert ("a" | b).render_str() == "[ab]"
 32 |         assert (a | b | a).render_str() == "[ab]"  # the duplicate "a" is not repeated
 33 | 
 34 |     def test_quantifiers(self):  # many "+", some "*", maybe "?", each with a lazy variant
 35 |         assert a.many().render_str() == "a+"
 36 |         assert a.many(True).render_str() == "a+?"
 37 | 
 38 |         assert a.some().render_str() == "a*"
 39 |         assert a.some(True).render_str() == "a*?"
 40 | 
 41 |         assert a.maybe().render_str() == "a?"
 42 |         assert a.maybe(lazy=True).render_str() == "a??"
 43 | 
 44 |         assert (  # repeated .maybe() collapses to a single "?"
 45 |             a.maybe()
 46 |             .maybe()
 47 |             .maybe()
 48 |             .maybe()
 49 |             .maybe()
 50 |             .maybe()
 51 |             .maybe()
 52 |             .maybe()
 53 |             .render_str()
 54 |             == "a?"
 55 |         )
 56 | 
 57 |         assert a.many().many().render_str() == "a+"
 58 | 
 59 |     def test_range_quantifier(self):  # {m,n} ranges and their shorthand forms
 60 |         assert a.repeat(5).or_less().render_str() == "a{,5}"
 61 |         assert a.x_or_less_times(5).render_str() == "a{,5}"
 62 | 
 63 |         assert a.repeat(5, lazy=True).or_less().render_str() == "a{,5}?"
 64 |         assert a.x_or_less_times(5, lazy=True).render_str() == "a{,5}?"
 65 | 
 66 |         assert a.repeat(5).or_more().render_str() == "a{5,}"
 67 |         assert a.x_or_more_times(5).render_str() == "a{5,}"
 68 | 
 69 |         assert a.repeat(5, lazy=True).or_more().render_str() == "a{5,}?"
 70 |         assert a.x_or_more_times(5, lazy=True).render_str() == "a{5,}?"
 71 | 
 72 |         assert a.repeat(5).render_str() == "a{5}"
 73 |         assert a.x_times(5).render_str() == "a{5}"
 74 | 
 75 |         assert a.repeat(5, lazy=True).render_str() == "a{5}"  # no "?" suffix on an exact count
 76 |         assert a.x_times(5, lazy=True).render_str() == "a{5}"
 77 | 
 78 |         assert a.repeat(4).to(5).render_str() == "a{4,5}"
 79 |         assert a.repeat_from(4).to(5).render_str() == "a{4,5}"
 80 |         assert a.between_x_y_times(4, 5).render_str() == "a{4,5}"
 81 | 
 82 |         assert a.repeat(5).many().render_str() == "(?:a{5})+"
 83 |         assert a.many().repeat(10).render_str() == "a{10,}"
 84 |         assert a.repeat(5).to(10).repeat(20).render_str() == "(?:a{5,10}){20}"
 85 | 
 86 |         # check Range.repeat() for explanation
 87 |         assert a.repeat(5).repeat(10).render_str() == "(?:a{5}){10}"
 88 | 
 89 |         # specific cases
 90 |         assert a.repeat(1).or_less().render_str() == "a?"
 91 |         assert a.repeat(1, True).or_less().render_str() == "a??"
 92 | 
 93 |         assert a.repeat(1).or_more().render_str() == "a+"
 94 |         assert a.repeat(1, lazy=True).or_more().render_str() == "a+?"
 95 | 
 96 |         assert a.repeat(0).or_more().render_str() == "a*"
 97 |         assert a.repeat(0, lazy=True).or_more().render_str() == "a*?"
 98 | 
 99 |         assert a.repeat(1).render_str() == "a"
100 |         assert a.repeat(1, lazy=True).render_str() == "a"
101 | 
102 |         assert a.repeat(5).to(4).render_str() == "a{4,5}"  # bounds given in reverse order are swapped
103 | 
104 |         assert a.repeat(0).render_str() == ""  # zero repetitions render as nothing
105 |         assert a.repeat(1).render_str() == a.render_str()
106 | 
107 |     def test_priority(self):  # non-capturing groups appear only where precedence needs them
108 |         ab = pattern("ab")
109 |         ac = pattern("ac")
110 | 
111 |         assert ((ab | ac) + b).render_str() == "(?:ab|ac)b"
112 |         assert ((a + b) | b).render_str() == "ab|b"
113 |         assert (a + b).many().render_str() == "(?:ab)+"
114 | 
115 |     def test_flags(self):  # set_flags() renders a local flag group
116 |         assert a.set_flags("i").render_str() == "(?i:a)"
117 | 
118 |     def test_empty_option(self):  # an Option with no alternatives renders as an empty string
119 |         assert Option().render_str() == ""
120 | 
121 |     def test_flag_merging(self):  # the flag group wraps the alternation with no extra (?:...)
122 |         c = (pattern("one") | "two").case_insensitive()
123 |         assert c.render_str() == "(?i:one|two)"
124 | 
125 |     def test_case_insensitive_chars(self):  # a single character under the "i" flag renders as a two-case class
126 |         c = (a + "test").case_insensitive()
127 | 
128 |         assert c.render_str() == "(?i:[Aa]test)"
129 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![wordstreamer badge](https://img.shields.io/badge/renderable-what?label=wordstreamer&color=%2333bb33)](https://github.com/evtn/wordstreamer)
  2 | 
  3 | Many people complain about unreadable and complex syntax of regular expressions.  
  4 | Many others complain about how they can't remember all constructs and features.
  5 | 
  6 | `rgx` solves those problems: it is a straightforward regexp builder. It also places non-capturing groups where needed to respect intended operator priority.  
  7 | It can produce a regular expression string to use in `re.compile` or any other regex library of your choice.
  8 | 
  9 | In other words, with `rgx` you can build a regular expression from parts, using straightforward and simple expressions.
 10 | 
 11 | ## Installation
 12 | 
 13 | `pip install rgx`
 14 | 
 15 | That's it.
 16 | 
 17 | ## Basic usage
 18 | 
 19 | ### Hello, regex world
 20 | 
 21 | ```python
 22 | from rgx import pattern, meta
 23 | import re
 24 | 
 25 | separator = meta.WHITESPACE.some() + (meta.WHITESPACE | ",") + meta.WHITESPACE.some()
 26 | 
 27 | # matches "hello world", "hello, world", "hello            world", "hello,world", "hello ,  world"
 28 | hello_world = pattern((
 29 |     "hello",
 30 |     separator,
 31 |     "world"
 32 | )) # (?:hello(?:\s)*(?:\s|,)(?:\s)*world)
 33 | 
 34 | re.compile(
 35 |     hello_world.render_str("i") # global flag (case-insensitive)
 36 | )
 37 | 
 38 | ```
 39 | 
 40 | ### Match some integers
 41 | 
 42 | this regex will match valid Python integer literals:
 43 | 
 44 | ```python
 45 | from rgx import pattern
 46 | import re
 47 | 
 48 | nonzero = pattern("1").to("9") # [1-9]
 49 | zero = "0"
 50 | digit = zero | nonzero # [0-9]
 51 | integer = zero | (nonzero + digit.some()) # 0|[1-9][0-9]*
 52 | 
 53 | int_regex = re.compile(str(integer))
 54 | 
 55 | ```
 56 | 
 57 | ...or this one:
 58 | 
 59 | ```python
 60 | from rgx import pattern, meta
 61 | import re
 62 | 
 63 | nonzero = pattern("1").to("9") # [1-9]
 64 | digit = meta.DIGIT # \d
 65 | integer = digit | (nonzero + digit.some()) # \d|[1-9]\d*
 66 | 
 67 | int_regex = re.compile(str(integer))
 68 | 
 69 | ```
 70 | 
 71 | ## Quickstart
 72 | 
 73 | _In this readme, `x` means some pattern object. Occasionally, `y` is introduced to mean some other pattern object (or literal)._
 74 | 
 75 | ### Literals and pattern objects
 76 | 
 77 | `rgx` operates mostly on so-called "pattern objects" — `rgx.entities.RegexPattern` instances.  
 78 | Your starting point would be `rgx.pattern` — it creates pattern objects from literals (and from pattern objects, which doesn't make a lot of sense).
 79 | 
 80 | -   `rgx.pattern(str, escape: bool = True)` creates a literal pattern — one that exactly matches given string. If you want to disable escaping, pass `escape=False`
 81 | -   `rgx.pattern(tuple[AnyRegexPattern])` creates a non-capturing group of patterns (nested literals will be converted too)
 82 | -   `rgx.pattern(list[str])` creates a character class (for example, `rgx.pattern(["a", "b", "c"])` creates pattern `[abc]`, that matches any character of those in brackets)
 83 |     -   Same can be achieved by `rgx.pattern("a").to("c")` or `rgx.pattern("a") | "b" | "c"`
 84 | 
 85 | Most operations with pattern objects support using Python literals on one side, for example: `rgx.pattern("a") | "b"` would produce `[ab]` pattern object (specifically, `rgx.entities.Chars`)
 86 | 
 87 | ### Rendering patterns
 88 | 
 89 | ```python
 90 | 
 91 | from rgx import pattern
 92 | 
 93 | x = pattern("one")
 94 | y = pattern("two")
 95 | p = x | y
 96 | 
 97 | rendered_with_str = str(p) # "one|two"
 98 | rendered_with_method = p.render_str() # "one|two"
 99 | rendered_with_method_flags = p.render_str("im") # (?im)one|two
100 | ```
101 | 
102 | ### Capturing Groups
103 | 
104 | ```python
105 | from rgx import pattern, reference, named
106 | 
107 | x = pattern("x")
108 | 
109 | print(x.capture()) # (x)
110 | 
111 | print(reference(1)) # \1
112 | 
113 | 
114 | named_x = x.named("some_x") # x.named(name: str)
115 | 
116 | print(named_x) # (?P<some_x>x)
117 | 
118 | named_x_reference = named("some_x")
119 | 
120 | print(named_x_reference) # (?P=some_x)
121 | 
122 | ```
123 | 
124 | To create a capturing group, use `x.capture()`, or `rgx.reference(group: int)` for a reference.  
125 | To create a named capturing group, use `rgx.named(name: str, x)`, or `rgx.named(name: str)` for a named reference.
126 | 
127 | ### Character classes
128 | 
129 | ```python
130 | from rgx import pattern, meta
131 | 
132 | 
133 | az = pattern("a").to("z") # rgx.Chars.to(other: str | Literal | Chars)
134 | print(az) # [a-z]
135 | 
136 | digits_or_space = pattern(["1", "2", "3", meta.WHITESPACE])
137 | print(digits_or_space) # [123\s]
138 | 
139 | print(az | digits_or_space) # [a-z123\s]
140 | 
141 | 
142 | print( # rgx.Chars.reverse(self)
143 |     (az | digits_or_space).reverse() # [^a-z123\s]
144 | )
145 | 
146 | ```
147 | 
148 | #### Excluding characters
149 | 
150 | If you have two instances of Chars (or compatible literals), you can exclude one from another:
151 | 
152 | ```python
153 | from rgx import pattern
154 | 
155 | letters = pattern("a").to("z") | pattern("A").to("Z") # [A-Za-z]
156 | vowels = pattern(list("aAeEiIoOuU")) # [AEIOUaeiou]
157 | consonants = letters.exclude(vowels) # [BCDFGHJ-NP-TV-Zbcdfghj-np-tv-z]
158 | ```
159 | 
160 | ### Conditional pattern
161 | 
162 | ```python
163 | from rgx import pattern, conditional
164 | 
165 | x = pattern("x")
166 | y = pattern("y")
167 | z = pattern("z")
168 | 
169 | capture = x.capture()
170 | 
171 | # (x)(?(1)y|z)
172 | print(
173 |     capture + conditional(1, y, z)
174 | )
175 | ```
176 | 
177 | ### Repeating patterns
178 | 
179 | If you need to match a repeating pattern, you can use `pattern.repeat(count, lazy)`:
180 | 
181 | ```python
182 | a = pattern("a")
183 | 
184 | a.repeat(5)                      # a{5}
185 | # or
186 | a * 5                            # a{5}, multiplication is an alias for .repeat
187 | 
188 | a.repeat(5).or_more()            # a{5,}
189 | a.repeat(5).or_less()            # a{,5}
190 | 
191 | a.repeat_from(4).to(5)           # a{4,5}, .repeat_from is just an alias for .repeat
192 | # or
193 | a.repeat(4) >> 5                 # a{4,5}
194 | 
195 | a.repeat(1).or_less()            # a?
196 | # or
197 | -a.repeat(1)                     # a?
198 | # or
199 | a.maybe()                        # a?
200 | 
201 | a.repeat(1).or_more()            # a+
202 | # or
203 | +a.repeat(1)                     # a+
204 | # or
205 | +a                               # a+
206 | # or
207 | a.many()                         # a+
208 | 
209 | a.repeat(0).or_more()            # a*
210 | # or
211 | +a.repeat(0)                     # a*
212 | # or
213 | a.some()                         # a*
214 | # or (what)
215 | +-(a * 38)                       # a*
216 | ```
217 | 
218 | Here's what's going on:  
219 | `pattern.repeat(count, lazy)` returns a `{count, count}` `Range` object  
220 | `pattern * count` is the same as `pattern.repeat(count, False)`
221 | 
222 | `Range` implements `or_more`, `or_less` and `to` methods:
223 | 
224 | -   `Range.or_more()` [or `+Range`] moves (on a copy) upper bound of range to infinity (actually `None`)
225 | -   `Range.or_less()` [or `-Range`] moves (on a copy) lower bound of range to 0
226 | -   `Range.to(count)` [or `Range >> count` (right shift)] replaces upper bound with given number
227 | 
228 | Also, RegexPattern implements unary plus (`+pattern`) as an alias for `pattern.many()`
229 | 
230 | ## Docs
231 | 
232 | ### Pattern methods
233 | 
234 | #### `pattern.render_str(flags: str = '') -> str`
235 | 
236 | Renders given pattern into a string with specified global flags.
237 | 
238 | ---
239 | 
240 | #### `pattern.set_flags(flags: str) -> LocalFlags`
241 | 
242 | This method adds local flags to given pattern
243 | 
244 | ```python
245 | x.set_flags("y") # "(?y:x)"
246 | ```
247 | 
248 | ---
249 | 
250 | #### `pattern.concat(other: AnyRegexPattern) -> Concat`
251 | 
252 | Use to match one pattern and then another.
253 | 
254 | `A.concat(B)` is equivalent to `A + B` (works if either A or B is a RegexPattern object, not a Python literal)
255 | 
256 | ```python
257 | x.concat(y) # "xy"
258 | x + y # "xy"
259 | ```
260 | 
261 | ---
262 | 
263 | #### `pattern.option(other: AnyRegexPattern) -> Chars | ReversedChars | Option`
264 | 
265 | Use to match either one pattern or another.
266 | 
267 | `A.option(B)` is equivalent to `A | B` (if either A or B is a RegexPattern object, not a Python literal)
268 | 
269 | ```python
270 | x.option(y) # "x|y"
271 | x | y # "x|y"
272 | ```
273 | 
274 | ---
275 | 
276 | #### `pattern.many(lazy: bool = False) -> Range`
277 | 
278 | Use this for repeating patterns (one or more times)
279 | 
280 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
281 | 
282 | ```python
283 | x.many() # "x+"
284 | x.many(True) # "x+?"
285 | ```
286 | 
287 | ---
288 | 
289 | #### `pattern.some(lazy: bool = False) -> Range`
290 | 
291 | Use this for repeating optional patterns (zero or more times)
292 | 
293 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
294 | 
295 | ```python
296 | x.some() # "x*"
297 | x.some(True) # "x*?"
298 | ```
299 | 
300 | ---
301 | 
302 | #### `pattern.maybe(lazy: bool = False) -> Range`
303 | 
304 | Use this for optional patterns (zero or one times)
305 | 
306 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
307 | 
308 | ```python
309 | x.maybe() # "x?"
310 | x.maybe(True) # "x??"
311 | ```
312 | 
313 | ---
314 | 
315 | #### `pattern.x_or_less_times(count: int, lazy: bool = False) -> Range`
316 | 
317 | Use this to match pattern x or less times (hence the name).
318 | 
319 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
320 | 
321 | ```python
322 | x.x_or_less_times(5) # "x{,5}"
323 | x.x_or_less_times(5, True) # "x{,5}?"
324 | ```
325 | 
326 | ---
327 | 
328 | #### `pattern.x_or_more_times(count: int, lazy: bool = False) -> Range`
329 | 
330 | Use this to match pattern x or more times (hence the name).
331 | 
332 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
333 | 
334 | ```python
335 | x.x_or_more_times(5) # "x{5,}"
336 | x.x_or_more_times(5, True) # "x{5,}?"
337 | ```
338 | 
339 | ---
340 | 
341 | #### `pattern.x_times(count: int, lazy: bool = False) -> Range`
342 | 
343 | Use this to match pattern exactly x times (hence the name).
344 | 
345 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
346 | 
347 | ```python
348 | x.x_times(5) # "x{5}"
349 | x.x_times(5, True) # "x{5}?"
350 | x.repeat(5) # x{5}
351 | ```
352 | 
353 | ---
354 | 
355 | #### `pattern.between_x_y_times(min_count: int, max_count: int, lazy: bool = False) -> Range`
356 | 
357 | Use this to match pattern between x and y times, inclusive (hence the name).
358 | 
359 | When not lazy, matches as many times as possible, otherwise matches as few times as possible.
360 | 
361 | ```python
362 | x.between_x_y_times(5, 6) # "x{5,6}"
363 | x.between_x_y_times(5, 6, True) # "x{5,6}?"
364 | ```
365 | 
366 | ---
367 | 
368 | #### `pattern.lookahead(other: RegexPattern) -> Concat`
369 | 
370 | Use this to indicate that given pattern occurs before some other pattern (lookahead).
371 | 
372 | In other words, `x.lookahead(y)` matches a pattern `x` only if there is `y` after it
373 | 
374 | Lookahead pattern won't be captured.
375 | 
376 | ```python
377 | x.lookahead(y) # x(?=y)
378 | x.before(y) # x(?=y)
379 | ```
380 | 
381 | ---
382 | 
383 | #### `pattern.negative_lookahead(other) -> Concat`
384 | 
385 | Use this to indicate that given pattern doesn't occur before some other pattern (negative lookahead).
386 | 
387 | In other words, `x.negative_lookahead(y)` matches a pattern `x` only if there is no `y` after it
388 | 
389 | Lookahead pattern won't be captured.
390 | 
391 | ```python
392 | x.negative_lookahead(y) # x(?!y)
393 | x.not_before(y) # x(?!y)
394 | ```
395 | 
396 | ---
397 | 
398 | #### `pattern.lookbehind(other: RegexPattern) -> Concat`
399 | 
400 | Use this to indicate that given pattern occurs after some other pattern (lookbehind).
401 | 
402 | In other words, `x.lookbehind(y)` matches a pattern `x` only if there is `y` before it
403 | 
404 | Lookbehind pattern won't be captured.
405 | 
406 | ```python
407 | x.lookbehind(y) # (?<=y)x
408 | x.after(y) # (?<=y)x
409 | ```
410 | 
411 | ---
412 | 
413 | #### `pattern.negative_lookbehind(other) -> Concat`
414 | 
415 | Use this to indicate that given pattern doesn't occur after some other pattern (negative lookbehind).
416 | 
417 | In other words, `x.negative_lookbehind(y)` matches a pattern `x` only if there is NO `y` before it
418 | 
419 | Lookbehind pattern won't be captured.
420 | 
421 | ```python
422 | x.negative_lookbehind(y) # (?<!y)x
423 | x.not_after(y) # (?<!y)x
424 | ```
425 | 
426 | ---
427 | 
428 | #### `pattern.capture() -> Group`
429 | 
430 | Use this to make a capturing group out of pattern.
431 | 
432 | ```python
433 | x.capture() # (x)
434 | ```
435 | 
436 | ### Meta
437 | 
438 | `rgx.meta` is a collection of different meta-sequences and anchors:
439 | 
440 | ```python
441 | meta.WORD_CHAR = UnescapedLiteral(r"\w")
442 | meta.NON_WORD_CHAR = UnescapedLiteral(r"\W")
443 | meta.DIGIT = UnescapedLiteral(r"\d")
444 | meta.NON_DIGIT = UnescapedLiteral(r"\D")
445 | meta.WHITESPACE = UnescapedLiteral(r"\s")
446 | meta.NON_WHITESPACE = UnescapedLiteral(r"\S")
447 | meta.WORD_BOUNDARY = UnescapedLiteral(r"\b")
448 | meta.NON_WORD_BOUNDARY = UnescapedLiteral(r"\B")
449 | meta.ANY = UnescapedLiteral(".")
450 | meta.NEWLINE = UnescapedLiteral(r"\n")
451 | meta.CARRIAGE_RETURN = UnescapedLiteral(r"\r")
452 | meta.TAB = UnescapedLiteral(r"\t")
453 | meta.NULL_CHAR = UnescapedLiteral(r"\0")
454 | meta.STRING_START = UnescapedLiteral("^")
455 | meta.STRING_END = UnescapedLiteral("$")
456 | ```
457 | 
458 | Also `rgx.meta.CHAR_ESCAPE(char_number: int)` is available:
459 | 
460 | ```python
461 | from rgx import meta
462 | 
463 | print(meta.CHAR_ESCAPE(32)) # \x20
464 | print(meta.CHAR_ESCAPE(320)) # \u0140
465 | print(meta.CHAR_ESCAPE(320000)) # \U0004e200
466 | 
467 | ```
468 | 
469 | ### Unicode meta
470 | 
471 | `rgx.unicode_meta` is a collection of functions and constants, mostly for `\p` and `\P` usage:
472 | 
473 | Functions:
474 | 
475 | ```python
476 | unicode_meta.PROPERTY(value: str) # renders into `\p{value}` (any character with property specified by value, e.g. `PROPERTY("Ll") -> \p{Ll}`)
477 | unicode_meta.PROPERTY_INVERSE(value: str) # matches all characters *not* matched by corresponding `PROPERTY` (`\P{value}`)
478 | 
479 | unicode_meta.NAMED_PROPERTY(name: str, value: str) # renders into `\p{name=value}` and matches any character whose property `name` equals `value`
480 | unicode_meta.NAMED_PROPERTY_INVERSE(name: str, value: str) # same, but inverted (`\P{name=value}`)
481 | ```
482 | 
483 | Constants:
484 | 
485 | ```python
486 | unicode_meta.LETTER = PROPERTY("L")
487 | unicode_meta.NON_LETTER = PROPERTY_INVERSE("L")
488 | 
489 | unicode_meta.WHITESPACE = PROPERTY("Z")
490 | unicode_meta.NON_WHITESPACE = PROPERTY_INVERSE("Z")
491 | 
492 | unicode_meta.DIGIT = PROPERTY("Nd")
493 | unicode_meta.NON_DIGIT = PROPERTY_INVERSE("Nd")
494 | ```
495 | 
496 | ## Extending
497 | 
498 | You can extend generation by subclassing one of the classes of `rgx.entities` module.  
499 | The one necessary method to provide is `.render(self, context: rgx.Context)`. It should return an iterable of strings (e.g. `["something"]`).  
500 | Built-in components (and this section) are using generators for that purpose, but you're free to choose whatever works for you.
501 | For example, if you want to render a PCRE accept control verb - `(*ACCEPT)`, you can do it like this:
502 | 
503 | ```python
504 | from rgx.entities import RegexPattern, Concat
505 | from rgx import pattern, Context
506 | from typing import Iterable
507 | 
508 | 
509 | class Accept(RegexPattern):
510 |     def render(self, context: Context) -> Iterable[str]:
511 |         yield "(*ACCEPT)"
512 | 
513 | 
514 | def accept(self) -> Concat:
515 |     return self + Accept()
516 | 
517 | 
518 | RegexPattern.accept = accept
519 | 
520 | x = pattern("something").accept()
521 | print(x) # something(*ACCEPT)
522 | ```
523 | 
524 | Or like this:
525 | 
526 | ```python
527 | from rgx.entities import RegexPattern, Concat
528 | from rgx import pattern, Context
529 | from typing import Iterable
530 | 
531 | 
532 | class Accept(RegexPattern):
533 |     def __init__(self, accepted_pattern: RegexPattern):
534 |         self.accepted_pattern = accepted_pattern
535 | 
536 |     def render(self, context: Context) -> Iterable[str]:
537 |         yield from self.accepted_pattern.render(context)
538 |         yield "(*ACCEPT)"
539 | 
540 | 
541 | def accept(self) -> Accept:
542 |     return Accept(self)
543 | 
544 | RegexPattern.accept = accept
545 | 
546 | x = pattern("something").accept() # something(*ACCEPT)
547 | ```
548 | 
549 | ### Priority
550 | 
551 | If your extension has to rely on some priority, you can use `respect_priority` function.  
552 | Let's say you want to add a `x/y` operation, which does something (wow) and has priority between `a|b` and `ab` — so `a|b/cd` is the same as `a|(?:b/(?:cd))`.
553 | 
554 | ```python
555 | from rgx.entities import RegexPattern, Concat, Option, AnyRegexPattern, respect_priority, pattern, Context
556 | from typing import Iterable
557 | 
558 | class MagicSlash(RegexPattern):
559 |     priority = (Concat.priority + Option.priority) // 2 # let's take something in the middle
560 | 
561 |     def __init__(self, left: RegexPattern, right: RegexPattern):
562 |         self.left = respect_priority(left, self.priority) # you need to wrap all parts of your expression in respect_priority()
563 |         self.right = respect_priority(right, self.priority) # ...and pass your expression priority as a second argument
564 | 
565 |     def render(self, context: Context) -> Iterable[str]:
566 |         yield from self.left.render(context)
567 |         yield "/"
568 |         yield from self.right.render(context)
569 | 
570 | 
571 | def slash(self, other: AnyRegexPattern) -> MagicSlash: # AnyRegexPattern is either a RegexPattern instance or a Python literal
572 |     return MagicSlash(self, other) # respect_priority already takes literals in consideration, so no extra actions needed
573 | 
574 | def rslash(self, other: AnyRegexPattern) -> MagicSlash: # other/self
575 |     other = pattern(other)
576 |     return other / self
577 | 
578 | 
579 | RegexPattern.slash = slash
580 | RegexPattern.__truediv__ = slash # / operator
581 | RegexPattern.__rtruediv__ = rslash
582 | 
583 | 
584 | a = pattern("a")
585 | b = pattern("b")
586 | c = pattern("c")
587 | d = pattern("d")
588 | 
589 | print(
590 |     (a | b) / (c + d) # [ab]/cd
591 | )
592 | 
593 | print(
594 |     ((a | b) / c) + d # (?:[ab]/c)d
595 | )
596 | 
597 | print(
598 |     a | (b / c) + d   # a|(?:b/c)d
599 | )
600 | 
601 | ```
602 | 
603 | ## Common questions
604 | 
605 | ### Difference between `(x, y)` and `x + y`
606 | 
607 | Previous examples used `()` and `+`, and the difference might not be so obvious.
608 | 
609 | -   `x + y` creates a concatenation of patterns (`rgx.entities.Concat`), with no extra characters apart from those of patterns
610 | -   `x + y` can be used only if at least one of the operands is a pattern object (that is, created with one of `rgx` functions or is one of `rgx` constants)
611 | -   `x + y` produces a pattern object itself, so you won't need to call `pattern` on it to call pattern methods
612 | 
613 | -   `pattern((x, y))` creates a non-capturing group (`rgx.entities.NonCapturingGroup`): `pattern((x, y)).render_str()` -> `(?:xy)`
614 | -   `(x, y)` can be used with any pattern-like literals or pattern objects
615 | -   `(x, y)` is a tuple literal, so you can't use pattern methods on it directly or convert it into a complete expression (you need to use `rgx.pattern` on it first)
616 | 


--------------------------------------------------------------------------------
/rgx/entities.py:
--------------------------------------------------------------------------------
   1 | from __future__ import annotations
   2 | from typing import (
   3 |     Callable,
   4 |     NoReturn,
   5 |     Optional,
   6 |     Tuple,
   7 |     List,
   8 |     Union,
   9 |     cast,
  10 |     overload,
  11 |     Sequence,
  12 |     TYPE_CHECKING,
  13 | )
  14 | 
  15 | from wordstreamer import Context, Renderable as BaseRenderable, Renderer, TokenStream
  16 | from wordstreamer.internal_types import Comparator, Payload
  17 | 
  18 | if TYPE_CHECKING:
  19 |     from typing import Literal as LiteralType, Self
  20 | 
  21 | 
  22 | import re
  23 | 
# A single member of a character class: a plain string, a CharRange, or a Literal.
CharType = Union[str, "CharRange", "Literal"]
# Python literals accepted in place of a pattern object: a tuple of patterns,
# a list of character-class members, or a string.
LiteralPart = Union[Tuple["AnyRegexPattern", ...], List[CharType], str]
# Anything `pattern()` accepts: a literal (see above) or an existing RegexPattern.
AnyRegexPattern = Union[LiteralPart, "RegexPattern"]
# A function mapping one pattern to another; this is what `RegexPattern.apply` takes.
Processor = Callable[["RegexPattern"], "RegexPattern"]

# Declared return type of `|` / `.option()` on patterns.
OrResult = Union["Option", "Chars", "ReversedChars"]

# Unit in which pattern priorities are expressed (RegexPattern.priority is 100 * priority_step).
priority_step = 1000
  32 | 
  33 | 
  34 | @overload
  35 | def pattern(literal: str, escape: LiteralType[False]) -> UnescapedLiteral:
  36 |     ...
  37 | 
  38 | 
  39 | @overload
  40 | def pattern(literal: str, escape: bool = True) -> Literal | Chars:
  41 |     ...
  42 | 
  43 | 
  44 | @overload
  45 | def pattern(
  46 |     literal: tuple[AnyRegexPattern, ...], escape: bool = True
  47 | ) -> RegexPattern | NonCapturingGroup:
  48 |     ...
  49 | 
  50 | 
  51 | @overload
  52 | def pattern(literal: list[CharType], escape: bool = True) -> Chars:
  53 |     ...
  54 | 
  55 | 
  56 | @overload
  57 | def pattern(literal: AnyRegexPattern, escape: bool = True) -> RegexPattern:
  58 |     ...
  59 | 
  60 | 
  61 | def pattern(literal: AnyRegexPattern, escape: bool = True) -> RegexPattern:
  62 |     """
  63 | 
  64 |     A universal pattern constructor.
  65 | 
  66 |     - With a string, returns a literan pattern. with `escape=False` returns an unescaped pattern.
  67 |     - With a tuple, returns a non-capturing group of patterns (or just one pattern if tuple has one element)
  68 |     - With a list, returns a character group (`[...]`). List must consist of strings and CharRange
  69 | 
  70 |     """
  71 |     if isinstance(literal, RegexPattern):
  72 |         return literal
  73 | 
  74 |     if isinstance(literal, str):
  75 |         if not escape:
  76 |             return UnescapedLiteral(literal)
  77 |         if len(literal) == 1:
  78 |             return Chars([literal])
  79 |         return Literal(literal)
  80 | 
  81 |     if isinstance(literal, tuple):
  82 |         if len(literal) == 1:
  83 |             return pattern(literal[0])
  84 |         return NonCapturingGroup(Concat(*literal))
  85 | 
  86 |     if isinstance(literal, list):
  87 |         return Chars(literal)
  88 | 
  89 | 
  90 | def respect_priority(contents: AnyRegexPattern, other_priority: int) -> RegexPattern:
  91 |     return cast(
  92 |         RegexPattern,
  93 |         pattern(contents).respect_priority(
  94 |             _PriorityShell(other_priority),
  95 |         ),
  96 |     )
  97 | 
  98 | 
class RegexPattern(BaseRenderable):
    """
    Base class for the pattern objects of this module.

    Provides the operator overloads (`+`, `|`, `*`, unary `+`) and the fluent
    helpers (repetition, lookarounds, groups, flags, comments) that combine
    patterns into larger ones, plus `render_str` to produce the final string.
    """

    # Default priority of a pattern, expressed in units of `priority_step`.
    priority: int = 100 * priority_step
    # Set to True by `optimize()` on the pattern it returns.
    optimized = False
    default_context: Context = Context(Renderer())

    def wrap(self) -> NonCapturingGroup:
        """Wraps this pattern into a non-capturing group."""
        return NonCapturingGroup(self)

    def render(self, context: Context) -> TokenStream:
        """
        Internal method

        Returns a generator, that can be joined to get a pattern string representation

        The base implementation returns `NotImplemented`; subclasses provide the
        actual rendering.
        """
        return NotImplemented

    def stream(self, context: Context) -> TokenStream:
        """Delegates to `render` with the same context."""
        return self.render(context)

    def case_insensitive(self) -> RegexPattern:
        """Returns this pattern with the local `i` flag set (via `set_flags`)."""
        return self.set_flags("i")

    def merge_flags(self) -> RegexPattern:
        """Hook called by `optimize`; the base implementation returns the pattern unchanged."""
        return self

    def optimize(self) -> RegexPattern:
        """
        Runs `optimize` on the inner patterns through `apply`, then `merge_flags`,
        marks the resulting pattern as optimized and returns it.
        """
        self = self.apply(lambda x: x.optimize())
        self = self.merge_flags()

        self.optimized = True
        return self

    def apply(self, fn: Processor) -> Self:
        """
        Applies `fn` to the inner patterns.

        The base implementation ignores `fn` and returns the pattern itself.
        """
        return self

    @staticmethod
    def merge_flags_abstract(
        parts: Sequence[RegexPattern],
    ) -> tuple[Sequence[RegexPattern], set[str]]:
        """
        Extracts the flags shared by every pattern in `parts`.

        Returns a `(parts, common_flags)` tuple. If any part is not a `FlagLike`,
        or no flag is shared by all parts, the parts are returned untouched with
        an empty set. Otherwise each part is returned with the common flags removed.
        """
        common_flags: set[str] | None = None

        for part in parts:
            if not isinstance(part, FlagLike):
                return parts, set()

            flags = set(part.flags)

            # intersect the flags of all parts seen so far
            if common_flags is None:
                common_flags = flags
            else:
                common_flags &= flags

        # also covers an empty `parts` sequence (common_flags is still None)
        if not common_flags:
            return parts, set()

        new_parts: list[RegexPattern] = []

        for alt in parts:
            assert isinstance(alt, FlagLike)
            new_flags = "".join(f for f in alt.flags if f not in common_flags)

            if not new_flags:
                # every flag of this part is common: keep only the inner pattern
                new_parts.append(alt.inner)
            elif new_flags != alt.flags:
                # some flags remain: re-wrap the inner pattern with them
                new_parts.append(LocalFlags(alt.inner, new_flags))
            else:
                new_parts.append(alt)

        return new_parts, common_flags

    def render_str(self, flags: str = "", payload: Payload | None = None) -> str:
        """

        Renders given pattern into a string with specified global flags.

        The pattern is optimized (see `optimize`) before rendering; `payload` is
        passed to the `Renderer` used for rendering.

        """

        renderer = Renderer(payload)

        parts: list[BaseRenderable] = []

        # global flags, when given, are rendered before the pattern itself
        if flags:
            parts.append(GlobalFlags(flags))

        parts.append(self.optimize())

        return "".join(map(renderer.render_string, parts))

    def __repr__(self) -> str:
        """Returns the rendered pattern string (same as `render_str()` without flags)."""
        return self.render_str()

    def set_flags(self, flags: str) -> LocalFlags:
        """
        This method adds local flags to given pattern

        ```python
        x.set_flags("y") # "(?y:x)"
        ```
        """
        return LocalFlags(self, flags)

    def __add__(self, other: AnyRegexPattern) -> Concat:
        """`self + other`: concatenation of this pattern and `other`."""
        return Concat(self, other)

    def __radd__(self, other: AnyRegexPattern) -> Concat:
        """`other + self`: concatenation with `other` on the left."""
        return Concat(other, self)

    def concat(self, other: AnyRegexPattern) -> Concat:
        """
        Use to match one pattern and then another.

        `A.concat(B)` is equivalent to `A + B` (if either A or B is a RegexPattern object, not a Python literal)

        ```python
        x.concat(y) # "xy"
        x + y # "xy"
        ```
        """
        return self + other

    def __or__(self, other: AnyRegexPattern) -> OrResult:
        """`self | other`: an `Option` of this pattern and `other`."""
        return Option(self, other)

    def __ror__(self, other: AnyRegexPattern) -> OrResult:
        """`other | self`: converts `other` with `respect_priority` (at `Option` priority), then applies `|`."""
        return respect_priority(other, Option.priority) | self

    def option(self, other: AnyRegexPattern) -> OrResult:
        """
        Use to match either one pattern or another.

        `A.option(B)` is equivalent to `A | B` (if either A or B is a RegexPattern object, not a Python literal)

        ```python
        x.option(y) # "x|y"
        x | y # "x|y"
        ```
        """
        return self | other

    def repeat(self, count: int, lazy: bool = False) -> Range:
        """Returns a `Range` over this pattern with both bounds set to `count`."""
        return Range(self, min_count=count, max_count=count, lazy=lazy)

    def __mul__(self, other: int) -> Range:
        """`self * count`: same as `self.repeat(count)`."""
        return self.repeat(other)

    # alias for .repeat
    repeat_from = repeat

    def many(self, lazy: bool = False) -> Range:
        """
        Use this for repeating patterns (one or more times)

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.many() # "x+"
        x.many(True) # "x+?"
        ```
        """
        result: Range = self.repeat(1, lazy).or_more()
        return result

    def plus(self, lazy: bool = False) -> Range:
        """alias for .many"""
        return self.many(lazy)

    def __pos__(self) -> Range:
        """`+self`: same as `self.many()`."""
        return self.many()

    def some(self, lazy: bool = False) -> Range:
        """
        Use this for repeating optional patterns (zero or more times)

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.some() # "x*"
        x.some(True) # "x*?"
        ```
        """

        return self.repeat(0, lazy).or_more()

    def star(self, lazy: bool = False) -> Range:
        """alias for .some"""
        return self.some(lazy)

    def maybe(self, lazy: bool = False) -> Range:
        """
        Use this for optional patterns (zero or one times)

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.maybe() # "x?"
        x.maybe(True) # "x??"
        ```
        """
        return self.repeat(1, lazy).or_less()

    def optional(self, lazy: bool = False) -> Range:
        """alias for .maybe"""
        return self.maybe(lazy)

    def x_or_less_times(self, count: int, lazy: bool = False) -> Range:
        """

        Use this to match pattern x or less times (hence the name).

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.x_or_less_times(5) # "x{,5}"
        x.x_or_less_times(5, True) # "x{,5}?"
        ```
        """
        return self.repeat(count, lazy).or_less()

    def x_or_more_times(self, count: int, lazy: bool = False) -> Range:
        """

        Use this to match pattern x or more times (hence the name).

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.x_or_more_times(5) # "x{5,}"
        x.x_or_more_times(5, True) # "x{5,}?"
        ```
        """
        return self.repeat(count, lazy).or_more()

    def x_times(self, count: int, lazy: bool = False) -> Range:
        """

        Use this to match pattern exactly x times (hence the name).

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.x_times(5) # "x{5}"
        x.x_times(5, True) # "x{5}?"
        ```
        """
        return self.repeat(count, lazy)

    def between_x_y_times(
        self, min_count: int, max_count: int, lazy: bool = False
    ) -> Range:
        """

        Use this to match pattern between x and y times, inclusive (hence the name).

        When not lazy, matches as many times as possible, otherwise matches as few times as possible.

        ```python
        x.between_x_y_times(5, 6) # "x{5,6}"
        x.between_x_y_times(5, 6, True) # "x{5,6}?"
        ```
        """
        return self.repeat(min_count, lazy).to(max_count)

    def lookahead(self, other: AnyRegexPattern) -> Concat:
        """
        Use this to indicate that given pattern occurs before some other pattern (lookahead).

        In other words, `x.lookahead(y)` matches a pattern `x` only if there is `y` after it

        Lookahead pattern won't be captured.

        ```python
        x.lookahead(y) # x(?=y)
        x.before(y) # x(?=y)
        ```
        """
        return Concat(self, Lookahead(other))

    def before(self, other: AnyRegexPattern) -> Concat:
        """alias for .lookahead"""
        return self.lookahead(other)

    def negative_lookahead(self, other: AnyRegexPattern) -> Concat:
        """
        Use this to indicate that given pattern doesn't occur before some other pattern (negative lookahead).

        In other words, `x.negative_lookahead(y)` matches a pattern `x` only if there is no `y` after it

        Lookahead pattern won't be captured.

        ```python
        x.negative_lookahead(y) # x(?!y)
        x.not_before(y) # x(?!y)
        ```
        """
        return Concat(self, NegativeLookahead(other))

    def not_before(self, other: AnyRegexPattern) -> Concat:
        """alias for .negative_lookahead"""
        return self.negative_lookahead(other)

    def lookbehind(self, other: AnyRegexPattern) -> Concat:
        """
        Use this to indicate that given pattern occurs after some other pattern (lookbehind).

        In other words, `x.lookbehind(y)` matches a pattern `x` only if there is `y` before it

        Lookbehind pattern won't be captured.

        ```python
        x.lookbehind(y) # (?<=y)x
        x.after(y) # (?<=y)x
        ```
        """
        return Concat(Lookbehind(other), self)

    def after(self, other: AnyRegexPattern) -> Concat:
        """alias for .lookbehind"""
        return self.lookbehind(other)

    def negative_lookbehind(self, other: AnyRegexPattern) -> Concat:
        """
        Use this to indicate that given pattern doesn't occur after some other pattern (negative lookbehind).

        In other words, `x.negative_lookbehind(y)` matches a pattern `x` only if there is NO `y` before it

        Lookbehind pattern won't be captured.

        ```python
        x.negative_lookbehind(y) # (?<!y)x
        x.not_after(y) # (?<!y)x
        ```
        """
        return Concat(NegativeLookbehind(other), self)

    def not_after(self, other: AnyRegexPattern) -> Concat:
        """alias for .negative_lookbehind"""
        return self.negative_lookbehind(other)

    def comment(self, text: str) -> Concat:
        """leaves a comment in expression (if needed for whatever reason)"""
        # ")" is backslash-escaped in the comment text before it is wrapped
        return Concat(self, Comment(UnescapedLiteral(text.replace(")", "\\)"))))

    def capture(self) -> Group:
        """

        Use this to make a capturing group out of pattern.

        ```python
        x.capture() # (x)
        ```
        """
        return Group(self)

    def named(self, name: str) -> NamedPattern:
        """Wraps this pattern into a `NamedPattern` with the given name."""
        return NamedPattern(name, self)
 453 | 
 454 | 
class _PriorityShell(RegexPattern):
    """
    Pattern that only carries a priority.

    The module-level `respect_priority()` passes an instance of it as the
    operation a pattern should compare its own priority against.
    """

    def __init__(self, priority: int) -> None:
        # instance attribute shadows the class-level `RegexPattern.priority`
        self.priority = priority
 458 | 
 459 | 
 460 | class GroupBase(RegexPattern):
 461 |     contents: RegexPattern
 462 |     prefix: str
 463 | 
 464 |     def __init__(self, *contents: AnyRegexPattern):
 465 |         self.contents = pattern(contents)
 466 | 
 467 |     def render_prefix(self) -> TokenStream:
 468 |         yield self.prefix
 469 | 
 470 |     def case_insensitive(self):
 471 |         return self.apply(lambda x: x.case_insensitive())
 472 | 
 473 |     def render(self, context: Context) -> TokenStream:
 474 |         yield "("
 475 |         yield from self.render_prefix()
 476 |         yield from self.contents.render(context)
 477 |         yield ")"
 478 | 
 479 |     def apply(self, fn: Processor) -> Self:
 480 |         return self.__class__(fn(self.contents))
 481 | 
 482 | 
class Group(GroupBase):
    """Capturing group: with an empty prefix, `GroupBase.render` produces `(...)`."""

    prefix = ""
 485 | 
 486 | 
 487 | class NonCapturingGroup(GroupBase):
 488 |     prefix = "?:"
 489 | 
 490 |     def optimize(self) -> RegexPattern:
 491 |         if isinstance(self.contents, NonCapturingGroup):
 492 |             return self.contents.optimize()
 493 |         return super().optimize()
 494 | 
 495 |     def respect_priority(
 496 |         self,
 497 |         operation: BaseRenderable,
 498 |         comparator: Comparator | None = None,
 499 |         side: str = "none",
 500 |     ) -> BaseRenderable:
 501 |         return self.contents.respect_priority(operation, comparator, side)
 502 | 
 503 | 
class Lookahead(GroupBase):
    """Positive lookahead: `GroupBase.render` produces `(?=...)`."""

    prefix = "?="
 506 | 
 507 | 
class NegativeLookahead(GroupBase):
    """Negative lookahead: `GroupBase.render` produces `(?!...)`."""

    prefix = "?!"
 510 | 
 511 | 
class Lookbehind(GroupBase):
    """Positive lookbehind: `GroupBase.render` produces `(?<=...)`."""

    prefix = "?<="
 514 | 
 515 | 
class NegativeLookbehind(GroupBase):
    """Negative lookbehind: `GroupBase.render` produces `(?<!...)`."""

    prefix = "?<!"
 518 | 
 519 | 
class Comment(GroupBase):
    """Inline comment: `GroupBase.render` produces `(?#...)`. Built by `RegexPattern.comment`."""

    prefix = "?#"
 522 | 
 523 | 
 524 | def sort_chartype(seq: Sequence[CharRange]) -> Sequence[CharRange]:
 525 |     def sorting_func(char: CharRange) -> tuple[int, int]:
 526 |         return char.start, char.stop
 527 | 
 528 |     return sorted(seq, key=sorting_func)
 529 | 
 530 | 
 531 | def make_range(part: CharType) -> CharRange:
 532 |     if isinstance(part, str):
 533 |         return CharRange(part, part)
 534 |     if isinstance(part, Literal):
 535 |         return CharRange(part.contents, part.contents)
 536 |     return part
 537 | 
 538 | 
 539 | def merge_chars(contents: Sequence[CharType]) -> Sequence[CharRange]:
 540 |     result: list[CharRange] = []
 541 |     contents = sort_chartype([make_range(part) for part in contents])
 542 | 
 543 |     def merge_parts(last_part: CharRange, next_part: CharRange) -> Sequence[CharRange]:
 544 |         if last_part.stop + 1 >= next_part.start:
 545 |             if next_part.stop > last_part.stop:
 546 |                 return [CharRange(last_part.start, next_part.stop)]
 547 |             return [last_part]
 548 | 
 549 |         return [last_part, next_part]
 550 | 
 551 |     for part in contents:
 552 |         if len(result):
 553 |             result[-1:] = merge_parts(result[-1], part)
 554 |         else:
 555 |             result.append(part)
 556 | 
 557 |     return result
 558 | 
 559 | 
# Pair of integer code points (first, last); CharRange.exclude_bounds treats both ends as inclusive.
Bounds = Tuple[int, int]
 561 | 
 562 | 
class FlagLike(RegexPattern):
    """
    Pattern that exposes a `flags` string together with an `inner` pattern.
    """

    # Flag letters as a string: CharBase computes "i" or "", LocalFlags stores the string it was given.
    flags: str
    # Pattern the flags relate to: CharBase points at itself, LocalFlags at its wrapped contents.
    inner: RegexPattern
 566 | 
 567 | 
 568 | class CharBase(FlagLike):
 569 |     def __init__(self, contents: Sequence[CharType]):
 570 |         self.contents = list(merge_chars(contents))
 571 |         self.inner = self
 572 | 
 573 |     @property
 574 |     def flags(self):
 575 |         ci = self.case_insensitive()
 576 |         if ci == self:
 577 |             return "i"
 578 |         return ""
 579 | 
 580 |     def __eq__(self, other: object):
 581 |         if not isinstance(other, self.__class__):
 582 |             return False
 583 | 
 584 |         if len(self.contents) != len(other.contents):
 585 |             return False
 586 | 
 587 |         for i, r in enumerate(self.contents):
 588 |             if r != other.contents[i]:
 589 |                 return False
 590 | 
 591 |         return True
 592 | 
 593 |     def case_insensitive(self) -> Self:
 594 |         contents: list[CharRange] = []
 595 | 
 596 |         for part in self.contents:
 597 |             start_char = chr(part.start)
 598 |             stop_char = chr(part.stop)
 599 | 
 600 |             is_lower = start_char.islower() and stop_char.islower()
 601 |             is_upper = start_char.isupper() and stop_char.isupper()
 602 | 
 603 |             if is_lower:
 604 |                 upper_chars = map(ord, map(str.upper, (start_char, stop_char)))
 605 |                 contents.append(CharRange(*upper_chars))
 606 | 
 607 |             elif is_upper:
 608 |                 lower_chars = map(ord, map(str.lower, (start_char, stop_char)))
 609 |                 contents.append(CharRange(*lower_chars))
 610 | 
 611 |             contents.append(part)
 612 | 
 613 |         return self.__class__(contents)
 614 | 
 615 | 
 616 | class Chars(CharBase):
 617 |     non_special = {".", "[", "|", "~", "*", "(", ")", "+", "$", "&", "?", "#"}
 618 | 
 619 |     def accepts(self, char: str) -> bool:
 620 |         for chrange in self.contents:
 621 |             if chrange.accepts(char):
 622 |                 return True
 623 |         return False
 624 | 
 625 |     def render(self, context: Context) -> TokenStream:
 626 |         if len(self.contents) == 1:
 627 |             contents = self.contents[0]
 628 |             if contents.is_single_char():
 629 |                 yield from contents.render_literal(context)
 630 |                 return
 631 |         yield "["
 632 | 
 633 |         for char in self.contents:
 634 |             yield from char.render(context)
 635 | 
 636 |         yield "]"
 637 | 
 638 |     def to(self, other: str | Literal | Chars) -> Chars:
 639 |         if isinstance(other, str):
 640 |             end = pattern(other)
 641 |         elif isinstance(other, Chars):
 642 |             end = other
 643 |         else:
 644 |             end = other
 645 | 
 646 |         start: int = self.contents[0].start
 647 | 
 648 |         stop_base = end.contents[0]
 649 |         stop: int
 650 |         if isinstance(stop_base, str):
 651 |             stop = ord(stop_base)
 652 |         else:
 653 |             stop = stop_base.stop
 654 | 
 655 |         return char_range(start, stop)
 656 | 
 657 |     def reverse(self) -> ReversedChars:
 658 |         return ReversedChars(self.contents)
 659 | 
 660 |     @overload
 661 |     def __or__(self, other: Chars | list[CharType]) -> Chars:
 662 |         ...
 663 | 
 664 |     @overload
 665 |     def __or__(self, other: AnyRegexPattern) -> Option | Chars:
 666 |         ...
 667 | 
 668 |     def __or__(self, other: AnyRegexPattern) -> Union[Option, Chars]:
 669 |         other = respect_priority(other, Option.priority)
 670 |         if isinstance(other, Chars):
 671 |             return Chars([*self.contents, *other.contents])
 672 |         return Option(self, other)
 673 | 
 674 |     def exclude(self, chars: AnyRegexPattern) -> Chars:
 675 |         chars = pattern(chars)
 676 |         if not isinstance(chars, Chars):
 677 |             raise ValueError(
 678 |                 "Can't exclude non-Chars pattern, don't really know how..."
 679 |             )
 680 |         result = []
 681 |         for part in self.contents:
 682 |             result.extend(part.exclude(chars))
 683 |         return Chars(result)
 684 | 
 685 | 
 686 | class ReversedChars(CharBase):
 687 |     def render(self, context: Context) -> TokenStream:
 688 |         yield "["
 689 |         yield "^"
 690 |         for char in self.contents:
 691 |             if isinstance(char, (Literal, CharRange)):
 692 |                 yield from char.render(context)
 693 |             elif char in Chars.non_special:
 694 |                 yield char
 695 |             else:
 696 |                 yield re.escape(char)
 697 |         yield "]"
 698 | 
 699 |     def reverse(self) -> Chars:
 700 |         return Chars(self.contents)
 701 | 
 702 |     @overload
 703 |     def __or__(self, other: ReversedChars) -> ReversedChars:
 704 |         ...
 705 | 
 706 |     @overload
 707 |     def __or__(self, other: AnyRegexPattern) -> Option | ReversedChars:
 708 |         ...
 709 | 
 710 |     def __or__(self, other: AnyRegexPattern) -> Union[Option, ReversedChars]:
 711 |         other = respect_priority(other, Option.priority)
 712 |         if isinstance(other, ReversedChars):
 713 |             return ReversedChars([*self.contents, *other.contents])
 714 |         return Option(self, other)
 715 | 
 716 | 
 717 | class CharRange(BaseRenderable):
 718 |     min_char = 0
 719 |     max_char = 0x10FFFF
 720 | 
 721 |     def __init__(self, start: Optional[str | int], stop: Optional[str | int]):
 722 |         meta = None
 723 |         if isinstance(start, str):
 724 |             if len(start) > 1:
 725 |                 meta = start
 726 |                 start = -1
 727 |             else:
 728 |                 start = ord(start)
 729 |         if isinstance(stop, str):
 730 |             if len(stop) > 1:
 731 |                 stop = -1
 732 |             else:
 733 |                 stop = ord(stop)
 734 | 
 735 |         self.start = start or CharRange.min_char
 736 |         self.stop = stop or CharRange.max_char
 737 |         self.meta = meta
 738 | 
 739 |         if not (start or stop):
 740 |             raise ValueError(
 741 |                 "Cannot create a character range with no data. Use rgx.meta.ANY instead"
 742 |             )
 743 | 
 744 |     def accepts(self, char: str) -> bool:
 745 |         return ord(char) in range(self.start, self.stop + 1)
 746 | 
 747 |     @staticmethod
 748 |     def render_char(char: int) -> str:
 749 |         return re.escape(chr(char))
 750 | 
 751 |     def stream(self, context: Context) -> TokenStream:
 752 |         if self.meta:
 753 |             yield self.meta
 754 |             return
 755 | 
 756 |         diff = self.stop - self.start
 757 | 
 758 |         if self.start:
 759 |             yield self.render_char(self.start)
 760 | 
 761 |         if not diff:
 762 |             return  # one char
 763 | 
 764 |         if diff == 2:
 765 |             yield chr(self.stop - 1)  # render 012 instead of 0-2
 766 | 
 767 |         if diff > 2:
 768 |             yield "-"
 769 | 
 770 |         if self.stop != CharRange.max_char:
 771 |             yield self.render_char(self.stop)
 772 | 
 773 |     def render(self, context: Context) -> TokenStream:
 774 |         return self.stream(context)
 775 | 
 776 |     def render_literal(self, context: Context) -> TokenStream:
 777 |         if self.meta:
 778 |             yield self.meta
 779 |             return
 780 |         yield from Literal(chr(self.start)).render(context)
 781 | 
 782 |     @staticmethod
 783 |     def exclude_bounds(bounds: Bounds, exclude: Bounds) -> list[Bounds]:
 784 |         result: list[Bounds] = []
 785 |         self_range = range(bounds[0], bounds[1] + 1)
 786 | 
 787 |         if exclude[0] - 1 in self_range:
 788 |             result.append((bounds[0], exclude[0] - 1))
 789 |         if exclude[1] + 1 in self_range:
 790 |             result.append((exclude[1] + 1, bounds[1]))
 791 |         return result
 792 | 
 793 |     def exclude(self, chars: Chars) -> list[CharRange]:
 794 |         if self.meta:
 795 |             raise ValueError(
 796 |                 f"Cannot exclude chars '{chars}' from meta-sequence '{self.meta}'"
 797 |             )
 798 | 
 799 |         result: list[Bounds] = [(self.start, self.stop)]
 800 |         temp_result: list[Bounds] = []
 801 |         cut_start = 0
 802 |         last_cut_start = 0
 803 | 
 804 |         for char_part in chars.contents:
 805 |             if char_part.meta:
 806 |                 raise ValueError(
 807 |                     f"Cannot exclude meta-sequence '{self.meta}' from chars '[{self}]'"
 808 |                 )
 809 |             exclude = (char_part.start, char_part.stop)
 810 |             for i, bounds in enumerate(result[cut_start:], start=cut_start):
 811 |                 if exclude[1] < bounds[0]:
 812 |                     temp_result.extend(result[i:])
 813 |                     break
 814 | 
 815 |                 if exclude[0] > bounds[1]:
 816 |                     temp_result.append(result[i])
 817 |                     continue
 818 | 
 819 |                 temp_result.extend(self.exclude_bounds(bounds, exclude))
 820 | 
 821 |             last_cut_start = cut_start
 822 |             cut_start = len(temp_result) - 1
 823 | 
 824 |             result[last_cut_start:] = temp_result
 825 |             temp_result = []
 826 | 
 827 |         return [CharRange(*x) for x in result]
 828 | 
 829 |     def is_single_char(self) -> bool:
 830 |         return self.start == self.stop
 831 | 
 832 |     def __repr__(self):
 833 |         return Renderer().render_string(self)
 834 | 
 835 |     def __eq__(self, other: object):
 836 |         if not isinstance(other, CharRange):
 837 |             return False
 838 | 
 839 |         return (
 840 |             self.start == other.start
 841 |             and self.stop == other.stop
 842 |             and self.meta == other.meta
 843 |         )
 844 | 
 845 | 
 846 | @overload
 847 | def char_range(start: Optional[str | int], stop: str | int) -> Chars:
 848 |     ...
 849 | 
 850 | 
 851 | @overload
 852 | def char_range(start: str | int, stop: None = None) -> Chars:
 853 |     ...
 854 | 
 855 | 
 856 | @overload
 857 | def char_range(start: None = None, stop: None = None) -> NoReturn:
 858 |     ...
 859 | 
 860 | 
 861 | @overload
 862 | def char_range(start: Optional[str | int], stop: Optional[str | int]) -> Chars:
 863 |     ...
 864 | 
 865 | 
 866 | def char_range(
 867 |     start: Optional[str | int] = None, stop: Optional[str | int] = None
 868 | ) -> Chars:
 869 |     """
 870 | 
 871 |     Use this for character ranges (e.g. `[a-z]`)
 872 | 
 873 |     Can be combined with other Chars istances (or lists) using |
 874 | 
 875 |     `start` and `stop` are inclusive
 876 | 
 877 |     """
 878 | 
 879 |     return Chars([CharRange(start, stop)])
 880 | 
 881 | 
 882 | class Concat(RegexPattern):
 883 |     priority = 2 * priority_step
 884 | 
 885 |     def __init__(self, *contents: AnyRegexPattern) -> None:
 886 |         if len(contents) >= 3:
 887 |             contents = (contents[0], Concat(*contents[1:]))
 888 | 
 889 |         self.contents = [respect_priority(part, self.priority) for part in contents]
 890 | 
 891 |     def __add__(self, other: AnyRegexPattern) -> Concat:
 892 |         return Concat(*self.contents, other)
 893 | 
 894 |     def case_insensitive(self) -> RegexPattern:
 895 |         return self.apply(lambda x: x.case_insensitive())
 896 | 
 897 |     def render(self, context: Context) -> TokenStream:
 898 |         for part in self.contents:
 899 |             yield from part.render(context)
 900 | 
 901 |     def merge_flags(self) -> LocalFlags | Concat:
 902 |         processed, common_flags = self.merge_flags_abstract(self.contents)
 903 | 
 904 |         new = Concat(*processed)
 905 | 
 906 |         if not common_flags:
 907 |             return new
 908 | 
 909 |         return LocalFlags(new, "".join(common_flags))
 910 | 
 911 |     def apply(self, fn: Processor) -> Self:
 912 |         return self.__class__(*map(fn, self.contents))
 913 | 
 914 | 
 915 | class Option(RegexPattern):
 916 |     priority = 0 * priority_step
 917 | 
 918 |     def __init__(self, *alternatives: AnyRegexPattern):
 919 |         if len(alternatives) >= 3:
 920 |             alternatives = (alternatives[0], Option(*alternatives[1:]))
 921 | 
 922 |         self.alternatives = [
 923 |             respect_priority(alternative, self.priority) for alternative in alternatives
 924 |         ]
 925 | 
 926 |     def case_insensitive(self) -> RegexPattern:
 927 |         return self.apply(lambda x: x.case_insensitive())
 928 | 
 929 |     def merge_flags(self) -> LocalFlags | Option:
 930 |         processed, common_flags = self.merge_flags_abstract(self.alternatives)
 931 | 
 932 |         new = Option(*processed)
 933 | 
 934 |         if not common_flags:
 935 |             return new
 936 | 
 937 |         return LocalFlags(new, "".join(common_flags))
 938 | 
 939 |     def render(self, context: Context) -> TokenStream:
 940 |         if not self.alternatives:
 941 |             return
 942 |         yield from self.alternatives[0].render(context)
 943 |         for alternative in self.alternatives[1:]:
 944 |             yield "|"
 945 |             yield from alternative.render(context)
 946 | 
 947 |     def __or__(self, other: AnyRegexPattern) -> Option:
 948 |         return Option(*self.alternatives, other)
 949 | 
 950 |     def __ror__(self, other: AnyRegexPattern) -> Option:
 951 |         return Option(other, *self.alternatives)
 952 | 
 953 |     def apply(self, fn: Processor) -> Self:
 954 |         return self.__class__(*map(fn, self.alternatives))
 955 | 
 956 | 
 957 | class LocalFlags(FlagLike):
 958 |     def __init__(self, contents: AnyRegexPattern, flags: str):
 959 |         self.contents = pattern(contents)
 960 |         self.inner = self.contents
 961 |         self.flags = flags
 962 | 
 963 |     def case_insensitive(self) -> RegexPattern:
 964 |         return self.apply(lambda x: x.case_insensitive())
 965 | 
 966 |     def render(self, context: Context) -> TokenStream:
 967 |         yield "(?"
 968 |         yield self.flags
 969 |         yield ":"
 970 |         yield from self.contents.render(context)
 971 |         yield ")"
 972 | 
 973 |     def apply(self, fn: Processor) -> Self:
 974 |         return self.__class__(fn(self.contents), self.flags)
 975 | 
 976 | 
class GlobalFlags(GroupBase):
    """
    Group with the `?` prefix wrapped around a string of flag letters
    (presumably rendering as e.g. `(?i)` -- confirm against GroupBase).
    """

    prefix = "?"

    def __init__(self, contents: str):
        # The flag string is stored as a Literal, so rendering goes through Literal.render (re.escape).
        self.contents = Literal(contents)
 982 | 
 983 | 
 984 | class Range(RegexPattern):
 985 |     priority: int = 3 * priority_step
 986 | 
 987 |     def __init__(
 988 |         self,
 989 |         *contents: AnyRegexPattern,
 990 |         min_count: int = 0,
 991 |         max_count: Optional[int] = None,
 992 |         lazy: bool = False,
 993 |     ) -> None:
 994 |         if min_count == max_count == 1:
 995 |             self.contents = pattern(contents)
 996 |         else:
 997 |             self.contents = respect_priority(contents, self.priority + 1)
 998 | 
 999 |         if max_count is not None and min_count > max_count:
1000 |             min_count, max_count = max_count, min_count
1001 | 
1002 |         if min_count < 0:
1003 |             raise ValueError("Quantifier lower bound cannot be less than 0")
1004 | 
1005 |         if max_count is not None and max_count < 0:
1006 |             raise ValueError("Quantifier upper bound cannot be less than 0")
1007 | 
1008 |         self.min_count = min_count
1009 |         self.max_count = max_count
1010 |         self.lazy = lazy
1011 | 
1012 |     def case_insensitive(self) -> RegexPattern:
1013 |         return self.apply(lambda x: x.case_insensitive())
1014 | 
1015 |     def repeat(self, count: int, lazy: bool = False) -> Range:
1016 |         """
1017 | 
1018 |         The logic here should be carefully thought through.
1019 |         If we multiply a fixed-size pattern a{X} by Y, we generally DO NOT get a{X*Y}
1020 |         If we multiply a .or_less() pattern a{,X} by Y, we get a{,X*Y}
1021 |         If we multiply a pattern a{1,X} (X!=1) by Y, we get a{Y,X*Y}
1022 | 
1023 |         Above logic doesn't scale up with patterns a{X,N} * Y, if X is not in {0, 1}, so we should fallback to (?:a{X,N}){Y}
1024 | 
1025 |         While it is easy to say a{X} * Y == a{X*Y} (i.e. a{5} * 10 == a{50}),
1026 |         ...this doesn't work well with .many() and other quantifiers: (a{5} * 10).many() != a{50,}
1027 |         ...but rather (?:a{5}){10,}
1028 | 
1029 |         """
1030 | 
1031 |         if self.min_count not in {0, 1}:
1032 |             return super().repeat(count, lazy)
1033 | 
1034 |         max_count = self.max_count * count if self.max_count else None
1035 |         return Range(
1036 |             self.contents,
1037 |             min_count=self.min_count * count,
1038 |             max_count=max_count,
1039 |             lazy=lazy,
1040 |         )
1041 | 
1042 |     def or_more(self) -> Range:
1043 |         return Range(self.contents, min_count=self.min_count, lazy=self.lazy)
1044 | 
1045 |     def __pos__(self) -> Range:
1046 |         return self.or_more()
1047 | 
1048 |     def or_less(self) -> Range:
1049 |         return Range(
1050 |             self.contents, min_count=0, max_count=self.max_count, lazy=self.lazy
1051 |         )
1052 | 
1053 |     def __neg__(self) -> Range:
1054 |         return self.or_less()
1055 | 
1056 |     def to(self, count: int) -> Range:
1057 |         return Range(
1058 |             self.contents, min_count=self.min_count, max_count=count, lazy=self.lazy
1059 |         )
1060 | 
1061 |     def __rshift__(self, count: int) -> Range:
1062 |         return self.to(count)
1063 | 
1064 |     def render_quantifier(self) -> TokenStream:
1065 |         if self.max_count is None:
1066 |             if not self.min_count:
1067 |                 yield "*"
1068 |                 return
1069 |             elif self.min_count == 1:
1070 |                 yield "+"
1071 |                 return
1072 | 
1073 |         elif self.max_count == 1:
1074 |             if not self.min_count:
1075 |                 yield "?"
1076 |                 return
1077 |             elif self.min_count == 1:
1078 |                 return
1079 | 
1080 |         yield "{"
1081 | 
1082 |         if self.min_count:
1083 |             yield str(self.min_count)
1084 | 
1085 |         if self.min_count == self.max_count:
1086 |             yield "}"
1087 |             return
1088 | 
1089 |         yield ","
1090 | 
1091 |         if self.max_count:
1092 |             yield str(self.max_count)
1093 | 
1094 |         yield "}"
1095 | 
1096 |     def render(self, context: Context) -> TokenStream:
1097 |         if self.max_count == 0:
1098 |             return
1099 | 
1100 |         yield from self.contents.render(context)
1101 | 
1102 |         if self.min_count == self.max_count == 1:
1103 |             return
1104 | 
1105 |         yield from self.render_quantifier()
1106 | 
1107 |         if self.lazy and self.min_count != self.max_count:
1108 |             yield "?"
1109 | 
1110 |     def merge_flags(self) -> LocalFlags | Range:
1111 |         processed, common_flags = self.merge_flags_abstract([self.contents])
1112 | 
1113 |         if not common_flags:
1114 |             return self
1115 | 
1116 |         return LocalFlags(
1117 |             self.apply(lambda _: processed[0]),
1118 |             "".join(common_flags),
1119 |         )
1120 | 
1121 |     def apply(self, fn: Processor) -> Self:
1122 |         return self.__class__(
1123 |             fn(self.contents),
1124 |             min_count=self.min_count,
1125 |             max_count=self.max_count,
1126 |             lazy=self.lazy,
1127 |         )
1128 | 
1129 | 
class NamedPattern(RegexPattern):
    """

    Named capturing group.

    If `contents` are omitted, generates a reference, otherwise a named group definition.

    ```python
    pattern.named("x", y) # (?P<x>y)
    pattern.named("x") # (?P=x)
    ```
    """

    def __init__(self, name: str, contents: Optional[AnyRegexPattern] = None):
        self.name = name
        self.contents = pattern(contents) if contents is not None else None

    def case_insensitive(self) -> RegexPattern:
        """
        Return a named pattern with the same name over the case-insensitive contents.

        A reference (no contents) stays a reference.
        """
        # `is not None`, as in merge_flags/apply: "no contents" is decided by
        # None alone, never by the truthiness of the pattern object
        contents = (
            self.contents.case_insensitive() if self.contents is not None else None
        )
        return NamedPattern(self.name, contents)

    def render(self, context: Context) -> TokenStream:
        """
        Yield `(?P<name>...)` for a definition or `(?P=name)` for a reference.
        """
        yield "(?P"
        if self.contents is not None:
            yield "<"
            yield self.name
            yield ">"
            yield from self.contents.render(context)
        else:
            yield "="
            yield self.name
        yield ")"

    def merge_flags(self) -> LocalFlags | NamedPattern:
        """
        When `merge_flags_abstract` reports common flags for the contents,
        return `LocalFlags` carrying them around a copy of this group built
        on the processed contents; otherwise (and for references) return the
        group itself.
        """
        if self.contents is None:
            return self

        processed, common_flags = self.merge_flags_abstract([self.contents])

        if not common_flags:
            return self

        return LocalFlags(
            self.apply(lambda _: processed[0]),
            "".join(common_flags),
        )

    def apply(self, fn: Processor) -> Self:
        """
        Return a new instance of the same class with the same name around
        `fn(self.contents)`; a reference is returned as it is.
        """
        if self.contents is None:
            return self

        return self.__class__(
            self.name,
            fn(self.contents),
        )
1185 | 
1186 | 
class ConditionalPattern(RegexPattern):
    """
    Matches one of two patterns depending on whether a given group has matched.

    The following two snippets give effectively the same result:

    ```python
    from rgx import pattern

    hello = pattern("hello").capture()
    world = pattern("world")
    where = pattern("where")

    x = (hello + world) | where
    ```

    ```python
    from rgx import pattern, conditional

    hello = pattern("hello").capture()
    world = pattern("world")
    where = pattern("where")

    x = hello.maybe() + conditional(1, world, where)
    ```
    """

    def __init__(
        self, group: int, true_option: AnyRegexPattern, false_option: AnyRegexPattern
    ) -> None:
        # both branches are prepared against a priority just above Option's
        branch_priority = Option.priority + 1
        self.group = group
        self.true_option = respect_priority(true_option, branch_priority)
        self.false_option = respect_priority(false_option, branch_priority)

    def render(self, context: Context) -> TokenStream:
        """
        Yield `(?(<group>)<true_option>|<false_option>)`.
        """
        yield from ("(?(", str(self.group), ")")
        yield from self.true_option.render(context)
        yield "|"
        yield from self.false_option.render(context)
        yield ")"

    def apply(self, fn: Processor) -> Self:
        """
        Return a new instance of the same class for the same group with `fn` applied to both branches.
        """
        on_match = fn(self.true_option)
        on_miss = fn(self.false_option)
        return self.__class__(self.group, on_match, on_miss)

    def case_insensitive(self) -> RegexPattern:
        """
        Apply `case_insensitive()` to both branches.
        """

        def make_insensitive(branch):
            return branch.case_insensitive()

        return self.apply(make_insensitive)
1239 | 
1240 | 
class Literal(RegexPattern):
    """
    Plain text that is escaped with `re.escape` when rendered.
    """

    def __init__(self, contents: str) -> None:
        self.contents: str = contents
        is_single_char = len(contents) == 1
        if not is_single_char:
            # any other length gets this instance-level priority
            self.priority = 2 * priority_step

    def to(self, other: str | Literal | Chars) -> Chars:
        """
        Build a character range from this literal up to `other` (see `Chars.to`).
        """
        as_chars = Chars([self])
        return as_chars.to(other)

    def render(self, context: Context) -> TokenStream:
        """
        Yield the escaped text as a single token.
        """
        escaped = re.escape(self.contents)
        yield escaped

    def apply(self, fn: Processor) -> Self:
        """
        Return the literal itself; `fn` is not called.
        """
        return self
1255 | 
1256 | 
class UnescapedLiteral(Literal):
    """

    Literal that is emitted verbatim.

    Unlike `Literal.render`, which passes the text through `re.escape`,
    this class yields `str(self.contents)` unchanged.

    """

    def render(self, context: Context) -> TokenStream:
        # no escaping here: the stored text goes into the output as it is
        yield str(self.contents)
1266 | 
1267 | 
def group_reference(group: int) -> UnescapedLiteral:
    """

    Build a backreference to a group.
    For example, if Group #1 is `(x|y)` and it matched "x", `reference(1)` matches exactly "x", but not "y"

    ```python
    rgx.reference(1) # \\1
    ```

    """
    backreference = f"\\{group}"
    return UnescapedLiteral(backreference)
1280 | 


--------------------------------------------------------------------------------