├── .coveragerc ├── rgx ├── __init__.py ├── unicode_meta.py ├── meta.py └── entities.py ├── pyproject.toml ├── test ├── test_groups.py ├── test_creation.py ├── test_url.py └── test_operations.py ├── .github └── workflows │ ├── build.yml │ └── test.yml ├── LICENSE ├── .gitignore └── README.md /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: not covered 4 | @overload -------------------------------------------------------------------------------- /rgx/__init__.py: -------------------------------------------------------------------------------- 1 | from .entities import ( 2 | pattern, 3 | NamedPattern as named, 4 | group_reference as reference, 5 | ConditionalPattern as conditional, 6 | char_range as char_range, 7 | Context as Context, 8 | ) 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "rgx" 3 | version = "2.0.0" 4 | description = "Typed, simple and readable regexp generation" 5 | authors = ["Dmitry Gritsenko "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/evtn/rgx" 9 | homepage = "https://github.com/evtn/rgx" 10 | keywords = ["regex", "regexp", "regular expressions"] 11 | 12 | [tool.poetry.dependencies] 13 | python = "^3.7" 14 | wordstreamer = "^0.1.3" 15 | 16 | [tool.poetry.dev-dependencies] 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | pytest = "^7.4.0" 20 | mypy = "^1.4.1" 21 | coverage = "^7.2.7" 22 | 23 | [build-system] 24 | requires = ["poetry-core>=1.0.0"] 25 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /rgx/unicode_meta.py: -------------------------------------------------------------------------------- 1 | from rgx.entities import UnescapedLiteral 2 | 3 | 4 | def NAMED_PROPERTY(name: str, value: str) -> UnescapedLiteral: 5 | return UnescapedLiteral(fr"\P{{{name}={value}}}") 6 | 7 | 8 | def NAMED_PROPERTY_INVERSE(name: str, value: str) -> UnescapedLiteral: 9 | return UnescapedLiteral(fr"\p{{{name}={value}}}") 10 | 11 | 12 | def PROPERTY(value: str) -> UnescapedLiteral: 13 | return UnescapedLiteral(fr"\p{{{value}}}") 14 | 15 | 16 | def PROPERTY_INVERSE(value: str) -> UnescapedLiteral: 17 | return UnescapedLiteral(fr"\P{{{value}}}") 18 | 19 | 20 | LETTER = PROPERTY("L") 21 | NON_LETTER = PROPERTY_INVERSE("L") 22 | 23 | WHITESPACE = PROPERTY("Z") 24 | NON_WHITESPACE = PROPERTY_INVERSE("Z") 25 | 26 | DIGIT = PROPERTY("Nd") 27 | NON_DIGIT = PROPERTY("Nd") -------------------------------------------------------------------------------- /test/test_groups.py: -------------------------------------------------------------------------------- 1 | from rgx import pattern, conditional, named 2 | 3 | a = pattern("a") 4 | b = pattern("b") 5 | 6 | 7 | class TestClass: 8 | def test_look_x(self): 9 | assert a.before(b).render_str() == "a(?=b)" 10 | assert a.after(b).render_str() == "(?<=b)a" 11 | 12 | assert a.not_before(b).render_str() == "a(?!b)" 13 | assert a.not_after(b).render_str() == "(?b)" 26 | -------------------------------------------------------------------------------- /rgx/meta.py: -------------------------------------------------------------------------------- 1 | from rgx.entities import UnescapedLiteral 2 | 3 | 4 | WORD_CHAR = UnescapedLiteral(r"\w") 5 | NON_WORD_CHAR = UnescapedLiteral(r"\W") 6 | DIGIT = UnescapedLiteral(r"\d") 7 | NON_DIGIT = UnescapedLiteral(r"\D") 8 | WHITESPACE = UnescapedLiteral(r"\s") 9 | NON_WHITESPACE = UnescapedLiteral(r"\S") 10 | WORD_BOUNDARY = UnescapedLiteral(r"\b") 11 | NON_WORD_BOUNDARY = UnescapedLiteral(r"\B") 12 | ANY = UnescapedLiteral(".") 13 | NEWLINE = UnescapedLiteral(r"\n") 14 | CARRIAGE_RETURN = UnescapedLiteral(r"\r") 15 | TAB = UnescapedLiteral(r"\t") 16 | NULL_CHAR = UnescapedLiteral(r"\0") 17 | STRING_START = UnescapedLiteral("^") 18 | STRING_END = UnescapedLiteral("$") 19 | 20 | def CHAR_ESCAPE(char_number: int): 21 | try: 22 | chr(char_number) 23 | except ValueError: 24 | raise ValueError(f"Invalid character: {char_number}") 25 | prefix = ["x", "u", "U"][(char_number > 255) + (char_number > 65535)] 26 | length = {"x": 2, "u": 4, "U": 8}[prefix] 27 | return UnescapedLiteral(f"\\{prefix}{char_number:0{length}x}") -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish on PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - lord 8 | paths: 9 | - pyproject.toml 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | name: Build and publish 15 | 16 | steps: 17 | - name: git-checkout 18 | uses: actions/checkout@v3 19 | 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: 3.9 24 | 25 | - name: Build and Publish 26 | env: 27 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} 28 | run: | 29 | python -m pip install poetry 30 | poetry install 31 | poetry build 32 | poetry config pypi-token.pypi $PYPI_TOKEN 33 | poetry publish 34 | 35 | - run: echo "VERSION=$(poetry version -s)" >> $GITHUB_ENV 36 | id: version-check 37 | 38 | - name: Release on GitHub 39 | uses: softprops/action-gh-release@v1 40 | with: 41 | files: dist/* 42 | tag_name: ${{ env.VERSION }} 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Dmitry Gritsenko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - lord 8 | paths: 9 | - "**.py" 10 | - "**.yml" 11 | pull_request: 12 | paths: 13 | - "**.py" 14 | - "**.yml" 15 | jobs: 16 | check_types: 17 | runs-on: ubuntu-latest 18 | name: Check Types 19 | steps: 20 | - name: git-checkout 21 | uses: actions/checkout@v3 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: "3.11" 27 | 28 | - name: Install Poetry 29 | uses: abatilo/actions-poetry@v2 30 | 31 | - run: poetry install --with dev 32 | - run: poetry run mypy rgx/*.py --disallow-any-expr 33 | 34 | run-tests: 35 | runs-on: ubuntu-latest 36 | strategy: 37 | matrix: 38 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 39 | 40 | name: Run Tests 41 | steps: 42 | - name: git-checkout 43 | uses: actions/checkout@v3 44 | 45 | - name: Set up Python 46 | uses: actions/setup-python@v4 47 | with: 48 | python-version: ${{ matrix.python-version }} 49 | 50 | - name: Install Poetry 51 | uses: abatilo/actions-poetry@v2 52 | 53 | - run: poetry install --with dev 54 | 55 | - name: Test 56 | run: poetry run coverage run --include "rgx/*" -m pytest test/ 57 | 58 | - name: Coveralls Update 59 | uses: coverallsapp/github-action@v2 60 | with: 61 | github-token: ${{ secrets.GITHUB_TOKEN }} 62 | flag-name: ${{ matrix.python-version }} 63 | -------------------------------------------------------------------------------- /test/test_creation.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from rgx import pattern, char_range, reference, named 3 | from rgx.entities import CharType, RegexPattern 4 | import pytest 5 | 6 | 7 | class TestClass: 8 | def test_literals(self): 9 | assert pattern("x").render_str() == "x" 10 | assert pattern(".").render_str() == "\\." 11 | assert pattern(".", escape=False).render_str() == "." 12 | assert pattern(("x",)).render_str() == "x" 13 | assert pattern(("x", "y")).render_str() == "(?:xy)" 14 | assert pattern(["x", "y"]).render_str() == "[xy]" 15 | 16 | def test_char_classes(self): 17 | onetwo_list: List[CharType] = ["1", "2"] 18 | onetwo_chars = pattern(onetwo_list) 19 | 20 | az_char_range = pattern("a").to("z") 21 | 22 | assert az_char_range.render_str() == "[a-z]" 23 | assert az_char_range.reverse().render_str() == "[^a-z]" 24 | assert (onetwo_chars | az_char_range).render_str() == "[12a-z]" 25 | assert (onetwo_list | az_char_range).render_str() == "[12a-z]" 26 | assert onetwo_chars.render_str() == "[12]" 27 | 28 | assert (pattern("1") | pattern("2")).render_str() == "[12]" 29 | 30 | assert pattern("1").to("2").render_str() == "[12]" 31 | assert pattern("1").to("3").render_str() == "[123]" 32 | assert pattern("1").to("4").render_str() == "[1-4]" 33 | 34 | assert (pattern("1").to("9") | "0").render_str() == "[0-9]" 35 | 36 | assert char_range("a").render_str() == "[a-]" 37 | assert char_range(None, "z").render_str() == "[-z]" 38 | 39 | assert pattern(["-"]).render_str() == "\\-" # not a range actually 40 | 41 | a = pattern("a") 42 | assert repr(a) == a.render_str() 43 | 44 | with pytest.raises(ValueError): 45 | char_range() 46 | 47 | def test_references(self): 48 | assert reference(1).render_str() == "\\1" 49 | assert named("x").render_str() == "(?P=x)" 50 | 51 | def test_flags(self): 52 | assert pattern("x").render_str("i") == "(?i)x" 53 | 54 | def test_that_render_on_regex_pattern_is_not_implemented_i_know_this_is_stupid_but_still( 55 | self, 56 | ): 57 | assert RegexPattern().render(RegexPattern.default_context) == NotImplemented 58 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | poetry.lock -------------------------------------------------------------------------------- /test/test_url.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from rgx.entities import UnescapedLiteral 4 | from rgx import pattern, char_range 5 | 6 | 7 | letter = char_range("a", "z") | char_range("A", "Z") 8 | nonzero = char_range("1", "9") 9 | digit = char_range("0", "9") 10 | mark = pattern(["-", "_", ".", "!", "~", "*", "'", "(", ")"]) 11 | reserved = pattern([";", "/", "?", ":", "@", "&", "=", "+", "$", ","]) 12 | unreserved = letter | digit | mark 13 | 14 | hex_char = digit | char_range("a", "f") | char_range("A", "F") 15 | escaped = "%" + hex_char.x_times(2) 16 | 17 | identifier = letter + (letter | digit | pattern(["+", "-", "."])).some() 18 | scheme = identifier.named("scheme") 19 | 20 | userinfo = ( 21 | (unreserved | escaped | pattern([";", ":", "&", "=", "+", "$", ","])) 22 | .many() 23 | .named("userinfo") 24 | ) 25 | 26 | domain = (letter | digit) + ( 27 | (letter | digit | pattern(["-"])).some() + (letter | digit) 28 | ).maybe() 29 | 30 | top_domain = ( 31 | letter + ((letter | digit | pattern(["-"])).some() + (letter | digit)).maybe() 32 | ) 33 | 34 | hostname = (domain + ".").some() + top_domain + pattern(".").maybe() 35 | 36 | ip_number = (pattern("1").maybe() + nonzero.maybe() + digit) | ( 37 | "2" + (char_range("0", "4") + digit | "5" + char_range("0", "5")) 38 | ) 39 | 40 | ip4_address = (ip_number + ".").x_times(3) + ip_number 41 | host = (hostname | ip4_address).named("host") 42 | 43 | port = digit.some().named("port") 44 | 45 | authority = ((userinfo + "@").maybe() + host + (":" + port).maybe()).named("authority") 46 | 47 | pchar = unreserved | escaped | pattern([":", "@", "&", "=", "+", "$", ",", ";"]) 48 | 49 | param = pchar.some() 50 | path_segment = param + (";" + param).some() 51 | path_segment_nonempty = pchar.many() | param + (";" + param).many() 52 | path_segments = path_segment.maybe() + ("/" + path_segment.maybe()).some() 53 | no_authority_path = (path_segment_nonempty + "/" + path_segments).maybe() 54 | 55 | path = ("/" + path_segments).named("path") 56 | 57 | autority_with_path = "//" + authority + path.maybe() 58 | no_authority_with_path = no_authority_path.named("path_noauthority") 59 | 60 | qfchars = (pchar | pattern(["?", "/"])).some() 61 | 62 | query = qfchars.named("query") 63 | fragment = qfchars.named("fragment") 64 | 65 | url = ( 66 | scheme 67 | + ":" 68 | + (autority_with_path | no_authority_with_path) 69 | + ("?" + query).maybe() 70 | + (UnescapedLiteral("#") + fragment).maybe() 71 | ) 72 | 73 | import re 74 | 75 | url_regex = re.compile(str(url)) 76 | 77 | 78 | test_suites: dict[str, dict] = { 79 | "https://datatracker.ietf.org/doc/html/rfc3986?asd=213#section-3.4": { 80 | "scheme": "https", 81 | "authority": "datatracker.ietf.org", 82 | "userinfo": None, 83 | "host": "datatracker.ietf.org", 84 | "port": None, 85 | "path": "/doc/html/rfc3986", 86 | "path_noauthority": None, 87 | "query": "asd=213", 88 | "fragment": "section-3.4", 89 | }, 90 | "http://http://http://@http://http://?http://#http://": { 91 | "scheme": "http", 92 | "authority": "http:", 93 | "userinfo": None, 94 | "host": "http", 95 | "port": "", 96 | "path": "//http://@http://http://", 97 | "path_noauthority": None, 98 | "query": "http://", 99 | "fragment": "http://", 100 | }, 101 | "https://mail.python.org/archives/list/typing-sig@python.org/thread/66RITIHDQHVTUMJHH2ORSNWZ6DOPM367/#QYOBBLTWVSEWMFRRHBA2OPR5QQ4IMWOL": { 102 | "scheme": "https", 103 | "authority": "mail.python.org", 104 | "userinfo": None, 105 | "host": "mail.python.org", 106 | "port": None, 107 | "path": "/archives/list/typing-sig@python.org/thread/66RITIHDQHVTUMJHH2ORSNWZ6DOPM367/", 108 | "path_noauthority": None, 109 | "query": None, 110 | "fragment": "QYOBBLTWVSEWMFRRHBA2OPR5QQ4IMWOL", 111 | }, 112 | } 113 | 114 | 115 | class TestClass: 116 | def test_url(self): 117 | for test_url, expected_result in test_suites.items(): 118 | match = url_regex.fullmatch(test_url) 119 | 120 | assert match and match.groupdict() == expected_result 121 | -------------------------------------------------------------------------------- /test/test_operations.py: -------------------------------------------------------------------------------- 1 | from rgx import pattern 2 | from rgx.entities import Option 3 | 4 | a = pattern("a") 5 | b = pattern("b") 6 | 7 | 8 | class TestClass: 9 | def test_concat(self): 10 | assert (a + "b").render_str() == "ab" 11 | assert ("b" + a).render_str() == "ba" 12 | assert (a + b).render_str() == "ab" 13 | assert (a + a + a).render_str() == "aaa" 14 | assert a.concat(b).concat(a).render_str() == (a + b + a).render_str() 15 | 16 | def test_option(self): 17 | # those are needed because one-char string produces Chars instance, thus making result render differently 18 | ab = pattern("ab") 19 | ac = pattern("ac") 20 | 21 | assert (ab | ac).render_str() == "ab|ac" 22 | assert (ab | "b").render_str() == "ab|b" 23 | assert ("a" | ac).render_str() == "a|ac" 24 | assert (ab | ac | ab).render_str() == "ab|ac|ab" 25 | assert ("a" | (ab | ac)).render_str() == "a|ab|ac" 26 | assert ab.option(ac).render_str() == (ab | ac).render_str() 27 | 28 | def test_char_option(self): 29 | assert (a | b).render_str() == "[ab]" 30 | assert (a | "b").render_str() == "[ab]" 31 | assert ("a" | b).render_str() == "[ab]" 32 | assert (a | b | a).render_str() == "[ab]" 33 | 34 | def test_quantifiers(self): 35 | assert a.many().render_str() == "a+" 36 | assert a.many(True).render_str() == "a+?" 37 | 38 | assert a.some().render_str() == "a*" 39 | assert a.some(True).render_str() == "a*?" 40 | 41 | assert a.maybe().render_str() == "a?" 42 | assert a.maybe(lazy=True).render_str() == "a??" 43 | 44 | assert ( 45 | a.maybe() 46 | .maybe() 47 | .maybe() 48 | .maybe() 49 | .maybe() 50 | .maybe() 51 | .maybe() 52 | .maybe() 53 | .render_str() 54 | == "a?" 55 | ) 56 | 57 | assert a.many().many().render_str() == "a+" 58 | 59 | def test_range_quantifier(self): 60 | assert a.repeat(5).or_less().render_str() == "a{,5}" 61 | assert a.x_or_less_times(5).render_str() == "a{,5}" 62 | 63 | assert a.repeat(5, lazy=True).or_less().render_str() == "a{,5}?" 64 | assert a.x_or_less_times(5, lazy=True).render_str() == "a{,5}?" 65 | 66 | assert a.repeat(5).or_more().render_str() == "a{5,}" 67 | assert a.x_or_more_times(5).render_str() == "a{5,}" 68 | 69 | assert a.repeat(5, lazy=True).or_more().render_str() == "a{5,}?" 70 | assert a.x_or_more_times(5, lazy=True).render_str() == "a{5,}?" 71 | 72 | assert a.repeat(5).render_str() == "a{5}" 73 | assert a.x_times(5).render_str() == "a{5}" 74 | 75 | assert a.repeat(5, lazy=True).render_str() == "a{5}" 76 | assert a.x_times(5, lazy=True).render_str() == "a{5}" 77 | 78 | assert a.repeat(4).to(5).render_str() == "a{4,5}" 79 | assert a.repeat_from(4).to(5).render_str() == "a{4,5}" 80 | assert a.between_x_y_times(4, 5).render_str() == "a{4,5}" 81 | 82 | assert a.repeat(5).many().render_str() == "(?:a{5})+" 83 | assert a.many().repeat(10).render_str() == "a{10,}" 84 | assert a.repeat(5).to(10).repeat(20).render_str() == "(?:a{5,10}){20}" 85 | 86 | # check Range.repeat() for explanation 87 | assert a.repeat(5).repeat(10).render_str() == "(?:a{5}){10}" 88 | 89 | # specific cases 90 | assert a.repeat(1).or_less().render_str() == "a?" 91 | assert a.repeat(1, True).or_less().render_str() == "a??" 92 | 93 | assert a.repeat(1).or_more().render_str() == "a+" 94 | assert a.repeat(1, lazy=True).or_more().render_str() == "a+?" 95 | 96 | assert a.repeat(0).or_more().render_str() == "a*" 97 | assert a.repeat(0, lazy=True).or_more().render_str() == "a*?" 98 | 99 | assert a.repeat(1).render_str() == "a" 100 | assert a.repeat(1, lazy=True).render_str() == "a" 101 | 102 | assert a.repeat(5).to(4).render_str() == "a{4,5}" 103 | 104 | assert a.repeat(0).render_str() == "" 105 | assert a.repeat(1).render_str() == a.render_str() 106 | 107 | def test_priority(self): 108 | ab = pattern("ab") 109 | ac = pattern("ac") 110 | 111 | assert ((ab | ac) + b).render_str() == "(?:ab|ac)b" 112 | assert ((a + b) | b).render_str() == "ab|b" 113 | assert (a + b).many().render_str() == "(?:ab)+" 114 | 115 | def test_flags(self): 116 | assert a.set_flags("i").render_str() == "(?i:a)" 117 | 118 | def test_empty_option(self): 119 | assert Option().render_str() == "" 120 | 121 | def test_flag_merging(self): 122 | c = (pattern("one") | "two").case_insensitive() 123 | assert c.render_str() == "(?i:one|two)" 124 | 125 | def test_case_insensitive_chars(self): 126 | c = (a + "test").case_insensitive() 127 | 128 | assert c.render_str() == "(?i:[Aa]test)" 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![wordstreamer badge](https://img.shields.io/badge/renderable-what?label=wordstreamer&color=%2333bb33)](https://github.com/evtn/wordstreamer) 2 | 3 | Many people complain about unreadable and complex syntax of regular expressions. 4 | Many others complain about how they can't remember all constructs and features. 5 | 6 | `rgx` solves those problems: it is a straightforward regexp builder. It also places non-capturing groups where needed to respect intended operator priority. 7 | It can produce a regular expression string to use in `re.compile` or any other regex library of your choice. 8 | 9 | In other words, with `rgx` you can build a regular expression from parts, using straightforward and simple expressions. 10 | 11 | ## Installation 12 | 13 | `pip install rgx` 14 | 15 | That's it. 16 | 17 | ## Basic usage 18 | 19 | ### Hello, regex world 20 | 21 | ```python 22 | from rgx import pattern, meta 23 | import re 24 | 25 | separator = meta.WHITESPACE.some() + (meta.WHITESPACE | ",") + meta.WHITESPACE.some() 26 | 27 | # matches "hello world", "hello, world", "hello world", "hello,world", "hello , world" 28 | hello_world = pattern(( 29 | "hello", 30 | separator, 31 | "world" 32 | )) # (?:hello(?:\s)*(?:\s|,)(?:\s)*world) 33 | 34 | re.compile( 35 | hello_world.render_str("i") # global flag (case-insensitive) 36 | ) 37 | 38 | ``` 39 | 40 | ### Match some integers 41 | 42 | this regex will match valid Python integer literals: 43 | 44 | ```python 45 | from rgx import pattern 46 | import re 47 | 48 | nonzero = pattern("1").to("9") # [1-9] 49 | zero = "0" 50 | digit = zero | nonzero # [0-9] 51 | integer = zero | (nonzero + digit.some()) # 0|[1-9][0-9]* 52 | 53 | int_regex = re.compile(str(integer)) 54 | 55 | ``` 56 | 57 | ...or this one: 58 | 59 | ```python 60 | from rgx import pattern, meta 61 | import re 62 | 63 | nonzero = pattern("1").to("9") # [1-9] 64 | digit = meta.DIGIT # \d 65 | integer = digit | (nonzero + digit.some()) # \d|[1-9]\d* 66 | 67 | int_regex = re.compile(str(integer)) 68 | 69 | ``` 70 | 71 | ## Quickstart 72 | 73 | _in this readme, `x` means some pattern object. Occasionaly, `y` is introduced to mean some other pattern object (or literal)_ 74 | 75 | ### Literals and pattern objects 76 | 77 | `rgx` operates mostly on so-called "pattern objects" — `rgx.entities.RegexPattern` istances. 78 | Your starting point would be `rgx.pattern` — it creates pattern objects from literals (and from pattern objects, which doesn't make a lot of sense). 79 | 80 | - `rgx.pattern(str, escape: bool = True)` creates a literal pattern — one that exactly matches given string. If you want to disable escaping, pass `escape=False` 81 | - `rgx.pattern(tuple[AnyRegexPattern])` creates a non-capturing group of patterns (nested literals will be converted too) 82 | - `rgx.pattern(list[str])` creates a character class (for example, `rgx.pattern(["a", "b", "c"])` creates pattern `[abc]`, that matches any character of those in brackets) 83 | - Same can be achieved by `rgx.pattern("a").to("c")` or `rgx.pattern("a") | "b" | "c"` 84 | 85 | Most operations with pattern objects support using Python literals on one side, for example: `rgx.pattern("a") | b` would produce `[ab]` pattern object (specifically, `rgx.entities.Chars`) 86 | 87 | ### Rendering patterns 88 | 89 | ```python 90 | 91 | from rgx import pattern 92 | 93 | x = pattern("one") 94 | y = pattern("two") 95 | p = x | y 96 | 97 | rendered_with_str = str(p) # "one|two" 98 | rendered_with_method = p.render_str() # "one|two" 99 | rendered_with_method_flags = p.render_str("im") # (?im)one|two 100 | ``` 101 | 102 | ### Capturing Groups 103 | 104 | ```python 105 | from rgx import pattern, reference, named 106 | 107 | x = pattern("x") 108 | 109 | print(x.capture()) # (x) 110 | 111 | print(reference(1)) # \1 112 | 113 | 114 | named_x = x.named("some_x") # x.named(name: str) 115 | 116 | print(named_x) # (?Px) 117 | 118 | named_x_reference = named("some_x") 119 | 120 | print(named_x_reference) # (?P=x) 121 | 122 | ``` 123 | 124 | To create a capturing group, use `x.capture()`, or `rgx.reference(group: int)` for a reference. 125 | To create a named capturing group, use `rgx.named(name: str, x)`, or `rgx.named(name: str)` for a named reference. 126 | 127 | ### Character classes 128 | 129 | ```python 130 | from rgx import pattern, meta 131 | 132 | 133 | az = pattern("a").to("z") # rgx.Chars.to(other: str | Literal | Chars) 134 | print(az) # [a-z] 135 | 136 | digits_or_space = pattern(["1", "2", "3", meta.WHITESPACE]) 137 | print(digits_or_space) # [123\s] 138 | 139 | print(az | digits_or_space) # [a-z123\s] 140 | 141 | 142 | print( # rgx.Chars.reverse(self) 143 | (az | digits_or_space).reverse() # [^a-z123\s] 144 | ) 145 | 146 | ``` 147 | 148 | #### Excluding characters 149 | 150 | If you have two instances of Chars (or compatible literals), you can exclude one from another: 151 | 152 | ```python 153 | from rgx import pattern 154 | 155 | letters = pattern("a").to("z") | pattern("A").to("Z") # [A-Za-z] 156 | vowels = pattern(list("aAeEiIoOuU")) # [AEIOUaeiou] 157 | consonants = letters.exclude(vowels) # [BCDFGHJ-NP-TV-Zbcdfghj-np-tv-z] 158 | ``` 159 | 160 | ### Conditional pattern 161 | 162 | ```python 163 | from rgx import pattern, conditional 164 | 165 | x = pattern("x") 166 | y = pattern("y") 167 | z = pattern("z") 168 | 169 | capture = x.capture() 170 | 171 | # (x)(?(1)y|z) 172 | print( 173 | capture + conditional(1, y, z) 174 | ) 175 | ``` 176 | 177 | ### Repeating patterns 178 | 179 | If you need to match a repeating pattern, you can use `pattern.repeat(count, lazy)`: 180 | 181 | ```python 182 | a = pattern("a") 183 | 184 | a.repeat(5) # a{5} 185 | # or 186 | a * 5 # a{5}, multiplication is an alias for .repeat 187 | 188 | a.repeat(5).or_more() # a{5,} 189 | a.repeat(5).or_less() # a{,5} 190 | 191 | a.repeat_from(4).to(5) # a{4, 5}, .repeat_from is just an alias for .repeat 192 | # or 193 | a.repeat(4) >> 5 # a{4, 5} 194 | 195 | a.repeat(1).or_less() # a? 196 | # or 197 | -a.repeat(1) # a? 198 | # or 199 | a.maybe() # a? 200 | 201 | a.repeat(1).or_more() # a+ 202 | # or 203 | +a.repeat(1) # a+ 204 | # or 205 | +a # a+ 206 | # or 207 | a.many() # a+ 208 | 209 | a.repeat(0).or_more() # a* 210 | # or 211 | +a.repeat(0) # a* 212 | # or 213 | a.some() # a* 214 | # or (what) 215 | +-(a * 38) # a* 216 | ``` 217 | 218 | Here's what's going on: 219 | `pattern.repeat(count, lazy)` returns a `{count, count}` `Range` object 220 | `pattern * count` is the same as `pattern.repeat(count, False)` 221 | 222 | `Range` implements `or_more`, `or_less` and `to` methods: 223 | 224 | - `Range.or_more()` [or `+Range`] moves (on a copy) upper bound of range to infinity (actually `None`) 225 | - `Range.or_less()` [or `-Range`] moves (on a copy) lower bound of range to 0 226 | - `Range.to(count)` [or `Range >> count` (right shift)] replaces upper bound with given number 227 | 228 | Also, RegexPattern implements unary plus (`+pattern`) as an alias for `pattern.many()` 229 | 230 | ## Docs 231 | 232 | ### Pattern methods 233 | 234 | #### `pattern.render_str(flags: str = '') -> str` 235 | 236 | Renders given pattern into a string with specified global flags. 237 | 238 | --- 239 | 240 | #### `pattern.set_flags(flags: str) -> LocalFlags` 241 | 242 | This method adds local flags to given pattern 243 | 244 | ```python 245 | x.flags("y") # "(?y:x)" 246 | ``` 247 | 248 | --- 249 | 250 | #### `pattern.concat(other: AnyRegexPattern) -> Concat` 251 | 252 | Use to match one pattern and then another. 253 | 254 | `A.concat(B)` is equivalent to `A + B` (works if either A or B is a RegexPart object, not a Python literal) 255 | 256 | ```python 257 | x.concat(y) # "xy" 258 | x + y # "xy" 259 | ``` 260 | 261 | --- 262 | 263 | #### `pattern.option(other: AnyRegexPattern) -> Chars | ReversedChars | Option` 264 | 265 | Use to match either one pattern or another. 266 | 267 | `A.option(B)` is equivalent to `A | B` (if either A or B is a RegexPart object, not a Python literal) 268 | 269 | ```python 270 | x.option(y) # "x|y" 271 | x | y # "x|y" 272 | ``` 273 | 274 | --- 275 | 276 | #### `pattern.many(lazy: bool = False) -> Range` 277 | 278 | Use this for repeating patterns (one or more times) 279 | 280 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 281 | 282 | ```python 283 | x.many() # "x+" 284 | x.many(True) # "x+?" 285 | ``` 286 | 287 | --- 288 | 289 | #### `pattern.some(lazy: bool = False) -> Range` 290 | 291 | Use this for repeating optional patterns (zero or more times) 292 | 293 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 294 | 295 | ```python 296 | x.some() # "x*" 297 | x.some(True) # "x*?" 298 | ``` 299 | 300 | --- 301 | 302 | #### `pattern.maybe(lazy: bool = False) -> Range` 303 | 304 | Use this for optional patterns (zero or one times) 305 | 306 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 307 | 308 | ```python 309 | x.maybe() # "x?" 310 | x.maybe(True) # "x??" 311 | ``` 312 | 313 | --- 314 | 315 | #### `pattern.x_or_less_times(count: int, lazy: bool = False) -> Range` 316 | 317 | Use this to match pattern x or less times (hence the name). 318 | 319 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 320 | 321 | ```python 322 | x.x_or_less_times(5) # "x{,5}" 323 | x.x_or_less_times(5, True) # "x{,5}?" 324 | ``` 325 | 326 | --- 327 | 328 | #### `pattern.x_or_more_times(count: int, lazy: bool = False) -> Range` 329 | 330 | Use this to match pattern x or more times (hence the name). 331 | 332 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 333 | 334 | ```python 335 | x.x_or_more_times(5) # "x{5,}" 336 | x.x_or_more_times(5, True) # "x{5,}?" 337 | ``` 338 | 339 | --- 340 | 341 | #### `pattern.x_times(count: int, lazy: bool = False) -> Range` 342 | 343 | Use this to match pattern exactly x times (hence the name). 344 | 345 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 346 | 347 | ```python 348 | x.x_times(5) # "x{5}" 349 | x.x_times(5, True) # "x{5}?" 350 | x.repeat(5) # x{5} 351 | ``` 352 | 353 | --- 354 | 355 | #### `pattern.between_x_y_times(min_count: int, max_count: int, lazy: bool = False) -> Range` 356 | 357 | Use this to match pattern between x and y times, inclusive (hence the name). 358 | 359 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 360 | 361 | ```python 362 | x.between_x_y_times(5, 6) # "x{5,6}" 363 | x.between_x_y_times(5, 6, True) # "x{5,6}?" 364 | ``` 365 | 366 | --- 367 | 368 | #### `pattern.lookahead(other: RegexPattern) -> Concat` 369 | 370 | Use this to indicate that given pattern occurs before some another pattern (lookahead). 371 | 372 | In other words, `x.lookahead(y)` matches a pattern `x` only if there is `y` after it 373 | 374 | Lookahead pattern won't be captured. 375 | 376 | ```python 377 | x.lookahead(y) # x(?=y) 378 | x.before(y) # x(?=y) 379 | ``` 380 | 381 | --- 382 | 383 | #### `pattern.negative_lookahead(other) -> Concat` 384 | 385 | Use this to indicate that given pattern doesn't occur before some another pattern (negative lookahead). 386 | 387 | In other words, `x.negative_lookahead(y)` matches a pattern `x` only if there is no `y` after it 388 | 389 | Lookahead pattern won't be captured. 390 | 391 | ```python 392 | x.negative_lookahead(y) # x(?!y) 393 | x.not_before(y) # x(?!y) 394 | ``` 395 | 396 | --- 397 | 398 | #### `pattern.lookbehind(other: RegexPattern) -> Concat` 399 | 400 | Use this to indicate that given pattern occurs after some another pattern (lookbehind). 401 | 402 | In other words, `x.lookbehind(y)` matches a pattern `x` only if there is `y` before it 403 | 404 | Lookbehind pattern won't be captured. 405 | 406 | ```python 407 | x.lookbehind(y) # (?<=y)x 408 | x.after(y) # (?<=y)x 409 | ``` 410 | 411 | --- 412 | 413 | #### `pattern.negative_lookbehind(other) -> Concat` 414 | 415 | Use this to indicate that given pattern goes before some another pattern (negative lookbehind). 416 | 417 | In other words, `x.negative_lookbehind(y)` matches a pattern `x` only if there is NO `y` before it 418 | 419 | Lookbehind pattern won't be captured. 420 | 421 | ```python 422 | x.negative_lookbehind(y) # (? Group` 429 | 430 | Use this to make a capturing group out of pattern. 431 | 432 | ```python 433 | x.capture() # (x) 434 | ``` 435 | 436 | ### Meta 437 | 438 | `rgx.meta` is a collection of different meta-sequences and anchors: 439 | 440 | ```python 441 | meta.WORD_CHAR = UnescapedLiteral(r"\w") 442 | meta.NON_WORD_CHAR = UnescapedLiteral(r"\W") 443 | meta.DIGIT = UnescapedLiteral(r"\d") 444 | meta.NON_DIGIT = UnescapedLiteral(r"\D") 445 | meta.WHITESPACE = UnescapedLiteral(r"\s") 446 | meta.NON_WHITESPACE = UnescapedLiteral(r"\S") 447 | meta.WORD_BOUNDARY = UnescapedLiteral(r"\b") 448 | meta.NON_WORD_BOUNDARY = UnescapedLiteral(r"\B") 449 | meta.ANY = UnescapedLiteral(".") 450 | meta.NEWLINE = UnescapedLiteral(r"\n") 451 | meta.CARRIAGE_RETURN = UnescapedLiteral(r"\r") 452 | meta.TAB = UnescapedLiteral(r"\t") 453 | meta.NULL_CHAR = UnescapedLiteral(r"\0") 454 | meta.STRING_START = UnescapedLiteral("^") 455 | meta.STRING_END = UnescapedLiteral("$") 456 | ``` 457 | 458 | Also `rgx.meta.CHAR_ESCAPE(char_number: int)` is available: 459 | 460 | ```python 461 | from rgx import meta 462 | 463 | print(meta.CHAR_ESCAPE(32)) # \x20 464 | print(meta.CHAR_ESCAPE(320)) # \u0140 465 | print(meta.CHAR_ESCAPE(320000)) # \U0004e200 466 | 467 | ``` 468 | 469 | ### Unicode meta 470 | 471 | `rgx.unicode_meta` is a collection of functions and constants, mostly for `\p` and `\P` usage: 472 | 473 | Functions: 474 | 475 | ```python 476 | unicode_meta.PROPERTY(value: str) # renders into `\p{value}` (any character with property specified by value, e.g. `PROPERTY("Ll") -> \p{Ll}`) 477 | unicode_meta.PROPERTY_INVERSE(value: str) # matches all characters *not* matched by corresponding `PROPERTY` (`\P{value}`) 478 | 479 | unicode_meta.NAMED_PROPERTY(name: str, value: str) # renders into `\p{name=value}` and matches any character which property `name` equals `value` 480 | unicode_meta.NAMED_PROPERTY_INVERSE(name: str, value: str) # same, but inverted (`\P{name=value}`) 481 | ``` 482 | 483 | Constants: 484 | 485 | ```python 486 | unicode_meta.LETTER = PROPERTY("L") 487 | unicode_meta.NON_LETTER = PROPERTY_INVERSE("L") 488 | 489 | unicode_meta.WHITESPACE = PROPERTY("Z") 490 | unicode_meta.NON_WHITESPACE = PROPERTY_INVERSE("Z") 491 | 492 | unicode_meta.DIGIT = PROPERTY("Nd") 493 | unicode_meta.NON_DIGIT = PROPERTY("Nd") 494 | ``` 495 | 496 | ## Extending 497 | 498 | You can extend generation by subclassing one of the classes of `rgx.entities` module. 499 | The one neccessary method to provide is `.render(self, context: rgx.Context)`. It should return an iterable of strings (e.g. `["something"]`). 500 | Built-in components (and this section) are using generators for that purpose, but you're free to choose whatever works for you. 501 | For example, if you want to render a PCRE accept control verb - `(*ACCEPT)`, you can do it like this: 502 | 503 | ```python 504 | from rgx.entities import RegexPattern, Concat 505 | from rgx import pattern, Context 506 | from typing import Iterable 507 | 508 | 509 | class Accept(RegexPattern): 510 | def render(self, context: Context) -> Iterable[str]: 511 | yield "(*ACCEPT)" 512 | 513 | 514 | def accept(self) -> Concat: 515 | return self + Accept() 516 | 517 | 518 | RegexPattern.accept = accept 519 | 520 | x = pattern("something").accept() 521 | print(x) # something(*ACCEPT) 522 | ``` 523 | 524 | Or like this: 525 | 526 | ```python 527 | from rgx.entities import RegexPattern, Concat 528 | from rgx import pattern, Context 529 | from typing import Iterable 530 | 531 | 532 | class Accept(RegexPattern): 533 | def __init__(self, accepted_pattern: RegexPattern): 534 | self.accepted_pattern = accepted_pattern 535 | 536 | def render(self, context: Context) -> Iterable[str]: 537 | yield from accepted_pattern.render(context) 538 | yield "(*ACCEPT)" 539 | 540 | 541 | def accept(self) -> Accept: 542 | return Accept(self) 543 | 544 | RegexPattern.accept = accept 545 | 546 | x = pattern("something").accept() # something(*ACCEPT) 547 | ``` 548 | 549 | ### Priority 550 | 551 | If your extension has to rely on some priority, you can use `respect_priority` function. 552 | Let's say you want to add a `x/y` operation, which does something (wow) and has prority between `a|b` and `ab` — so `a|b/cd` is the same as `a|(?:b/(?:cd))`. 553 | 554 | ```python 555 | from rgx.entities import RegexPattern, Concat, Option, AnyRegexPattern, respect_priority, pattern, Context 556 | from typing import Iterable 557 | 558 | class MagicSlash(RegexPattern): 559 | priority = (Concat.priority + Option.priority) // 2 # let's take something in the middle 560 | 561 | def __init__(self, left: RegexPattern, right: RegexPattern): 562 | self.left = respect_priority(left, self.priority) # you need to wrap all parts of your expression in respect_priority() 563 | self.right = respect_priority(right, self.priority) # ...and pass your expression priority as a second argument 564 | 565 | def render(self, context: Context) -> Iterable[str]: 566 | yield from self.left.render(context) 567 | yield "/" 568 | yield from self.right.render(context) 569 | 570 | 571 | def slash(self, other: AnyRegexPattern) -> MagicSlash: # AnyRegexPattern is either a RegexPattern instance or a Python literal 572 | return MagicSlash(self, other) # respect_priority already takes literals in consideration, so no extra actions needed 573 | 574 | def rslash(self, other: AnyRegexPattern) -> MagicSlash: # other/self 575 | other = pattern(other) 576 | return other / self 577 | 578 | 579 | RegexPattern.slash = slash 580 | RegexPattern.__truediv__ = slash # / operator 581 | RegexPattern.__rtruediv__ = rslash 582 | 583 | 584 | a = pattern("a") 585 | b = pattern("b") 586 | c = pattern("c") 587 | d = pattern("d") 588 | 589 | print( 590 | (a | b) / (c + d) # [ab]/cd 591 | ) 592 | 593 | print( 594 | ((a | b) / c) + d # (?:[ab]/c)d 595 | ) 596 | 597 | print( 598 | a | (b / c) + d # a|(?:b/c)d 599 | ) 600 | 601 | ``` 602 | 603 | ## Common questions 604 | 605 | ### Difference between `(x, y)` and `x + y` 606 | 607 | Previous examples used `()` and `+`, and the difference might not be so obvious. 608 | 609 | - `x + y` creates a concatenation of patterns (`rgx.entities.Concat`), with no extra characters apart from those of patterns 610 | - `x + y` can be used only if at least one of the operands is a pattern object (that is, created with one of `rgx` functions or is one of `rgx` constants) 611 | - `x + y` produces a pattern object itself, so you won't need to call `pattern` on it to call pattern methods 612 | 613 | - `pattern((x, y))` creates a non-capturing group (`rgx.entities.NonCapturingGroup`): `pattern((x, y)).render_str()` -> `(?:xy)` 614 | - `(x, y)` can be used with any pattern-like literals or pattern objects 615 | - `(x, y)` is a tuple literal, so you can't use pattern methods on it directly or convert it into a complete expression (you need to use `rgx.pattern` on it first) 616 | -------------------------------------------------------------------------------- /rgx/entities.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import ( 3 | Callable, 4 | NoReturn, 5 | Optional, 6 | Tuple, 7 | List, 8 | Union, 9 | cast, 10 | overload, 11 | Sequence, 12 | TYPE_CHECKING, 13 | ) 14 | 15 | from wordstreamer import Context, Renderable as BaseRenderable, Renderer, TokenStream 16 | from wordstreamer.internal_types import Comparator, Payload 17 | 18 | if TYPE_CHECKING: 19 | from typing import Literal as LiteralType, Self 20 | 21 | 22 | import re 23 | 24 | CharType = Union[str, "CharRange", "Literal"] 25 | LiteralPart = Union[Tuple["AnyRegexPattern", ...], List[CharType], str] 26 | AnyRegexPattern = Union[LiteralPart, "RegexPattern"] 27 | Processor = Callable[["RegexPattern"], "RegexPattern"] 28 | 29 | OrResult = Union["Option", "Chars", "ReversedChars"] 30 | 31 | priority_step = 1000 32 | 33 | 34 | @overload 35 | def pattern(literal: str, escape: LiteralType[False]) -> UnescapedLiteral: 36 | ... 37 | 38 | 39 | @overload 40 | def pattern(literal: str, escape: bool = True) -> Literal | Chars: 41 | ... 42 | 43 | 44 | @overload 45 | def pattern( 46 | literal: tuple[AnyRegexPattern, ...], escape: bool = True 47 | ) -> RegexPattern | NonCapturingGroup: 48 | ... 49 | 50 | 51 | @overload 52 | def pattern(literal: list[CharType], escape: bool = True) -> Chars: 53 | ... 54 | 55 | 56 | @overload 57 | def pattern(literal: AnyRegexPattern, escape: bool = True) -> RegexPattern: 58 | ... 59 | 60 | 61 | def pattern(literal: AnyRegexPattern, escape: bool = True) -> RegexPattern: 62 | """ 63 | 64 | A universal pattern constructor. 65 | 66 | - With a string, returns a literan pattern. with `escape=False` returns an unescaped pattern. 67 | - With a tuple, returns a non-capturing group of patterns (or just one pattern if tuple has one element) 68 | - With a list, returns a character group (`[...]`). List must consist of strings and CharRange 69 | 70 | """ 71 | if isinstance(literal, RegexPattern): 72 | return literal 73 | 74 | if isinstance(literal, str): 75 | if not escape: 76 | return UnescapedLiteral(literal) 77 | if len(literal) == 1: 78 | return Chars([literal]) 79 | return Literal(literal) 80 | 81 | if isinstance(literal, tuple): 82 | if len(literal) == 1: 83 | return pattern(literal[0]) 84 | return NonCapturingGroup(Concat(*literal)) 85 | 86 | if isinstance(literal, list): 87 | return Chars(literal) 88 | 89 | 90 | def respect_priority(contents: AnyRegexPattern, other_priority: int) -> RegexPattern: 91 | return cast( 92 | RegexPattern, 93 | pattern(contents).respect_priority( 94 | _PriorityShell(other_priority), 95 | ), 96 | ) 97 | 98 | 99 | class RegexPattern(BaseRenderable): 100 | priority: int = 100 * priority_step 101 | optimized = False 102 | default_context: Context = Context(Renderer()) 103 | 104 | def wrap(self): 105 | return NonCapturingGroup(self) 106 | 107 | def render(self, context: Context) -> TokenStream: 108 | """ 109 | Internal method 110 | 111 | Returns a generator, that can be joined to get a pattern string representation 112 | """ 113 | return NotImplemented 114 | 115 | def stream(self, context: Context) -> TokenStream: 116 | return self.render(context) 117 | 118 | def case_insensitive(self) -> RegexPattern: 119 | return self.set_flags("i") 120 | 121 | def merge_flags(self) -> RegexPattern: 122 | return self 123 | 124 | def optimize(self) -> RegexPattern: 125 | self = self.apply(lambda x: x.optimize()) 126 | self = self.merge_flags() 127 | 128 | self.optimized = True 129 | return self 130 | 131 | def apply(self, fn: Processor) -> Self: 132 | return self 133 | 134 | @staticmethod 135 | def merge_flags_abstract( 136 | parts: Sequence[RegexPattern], 137 | ) -> tuple[Sequence[RegexPattern], set[str]]: 138 | common_flags: set[str] | None = None 139 | 140 | for part in parts: 141 | if not isinstance(part, FlagLike): 142 | return parts, set() 143 | 144 | flags = set(part.flags) 145 | 146 | if common_flags is None: 147 | common_flags = flags 148 | else: 149 | common_flags &= flags 150 | 151 | if not common_flags: 152 | return parts, set() 153 | 154 | new_parts: list[RegexPattern] = [] 155 | 156 | for alt in parts: 157 | assert isinstance(alt, FlagLike) 158 | new_flags = "".join(f for f in alt.flags if f not in common_flags) 159 | 160 | if not new_flags: 161 | new_parts.append(alt.inner) 162 | elif new_flags != alt.flags: 163 | new_parts.append(LocalFlags(alt.inner, new_flags)) 164 | else: 165 | new_parts.append(alt) 166 | 167 | return new_parts, common_flags 168 | 169 | def render_str(self, flags: str = "", payload: Payload | None = None) -> str: 170 | """ 171 | 172 | Renders given pattern into a string with specified global flags. 173 | 174 | """ 175 | 176 | renderer = Renderer(payload) 177 | 178 | parts: list[BaseRenderable] = [] 179 | 180 | if flags: 181 | parts.append(GlobalFlags(flags)) 182 | 183 | parts.append(self.optimize()) 184 | 185 | return "".join(map(renderer.render_string, parts)) 186 | 187 | def __repr__(self) -> str: 188 | return self.render_str() 189 | 190 | def set_flags(self, flags: str) -> LocalFlags: 191 | """ 192 | This method adds local flags to given pattern 193 | 194 | ```python 195 | x.flags("y") # "(?y:x)" 196 | ``` 197 | """ 198 | return LocalFlags(self, flags) 199 | 200 | def __add__(self, other: AnyRegexPattern) -> Concat: 201 | return Concat(self, other) 202 | 203 | def __radd__(self, other: AnyRegexPattern) -> Concat: 204 | return Concat(other, self) 205 | 206 | def concat(self, other: AnyRegexPattern) -> Concat: 207 | """ 208 | Use to match one pattern and then another. 209 | 210 | `A.concat(B)` is equivalent to `A + B` (if either A or B is a RegexPart object, not a Python literal) 211 | 212 | ```python 213 | x.concat(y) # "xy" 214 | x + y # "xy" 215 | ``` 216 | """ 217 | return self + other 218 | 219 | def __or__(self, other: AnyRegexPattern) -> OrResult: 220 | return Option(self, other) 221 | 222 | def __ror__(self, other: AnyRegexPattern) -> OrResult: 223 | return respect_priority(other, Option.priority) | self 224 | 225 | def option(self, other: AnyRegexPattern) -> OrResult: 226 | """ 227 | Use to match either one pattern or another. 228 | 229 | `A.option(B)` is equivalent to `A | B` (if either A or B is a RegexPart object, not a Python literal) 230 | 231 | ```python 232 | x.option(y) # "x|y" 233 | x | y # "x|y" 234 | ``` 235 | """ 236 | return self | other 237 | 238 | def repeat(self, count: int, lazy: bool = False) -> Range: 239 | return Range(self, min_count=count, max_count=count, lazy=lazy) 240 | 241 | def __mul__(self, other: int) -> Range: 242 | return self.repeat(other) 243 | 244 | repeat_from = repeat 245 | 246 | def many(self, lazy: bool = False) -> Range: 247 | """ 248 | Use this for repeating patterns (one or more times) 249 | 250 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 251 | 252 | ```python 253 | x.many() # "x+" 254 | x.many(True) # "x+?" 255 | ``` 256 | """ 257 | result: Range = self.repeat(1, lazy).or_more() 258 | return result 259 | 260 | def plus(self, lazy: bool = False): 261 | """alias for .many""" 262 | return self.many(lazy) 263 | 264 | def __pos__(self): 265 | return self.many() 266 | 267 | def some(self, lazy: bool = False) -> Range: 268 | """ 269 | Use this for repeating optional patterns (zero or more times) 270 | 271 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 272 | 273 | ```python 274 | x.some() # "x*" 275 | x.some(True) # "x*?" 276 | ``` 277 | """ 278 | 279 | return self.repeat(0, lazy).or_more() 280 | 281 | def star(self, lazy: bool = False): 282 | """alias for .some""" 283 | return self.some(lazy) 284 | 285 | def maybe(self, lazy: bool = False) -> Range: 286 | """ 287 | Use this for optional patterns (zero or one times) 288 | 289 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 290 | 291 | ```python 292 | x.maybe() # "x?" 293 | x.maybe(True) # "x??" 294 | ``` 295 | """ 296 | return self.repeat(1, lazy).or_less() 297 | 298 | def optional(self, lazy: bool = False): 299 | """alias for .maybe""" 300 | return self.maybe(lazy) 301 | 302 | def x_or_less_times(self, count: int, lazy: bool = False) -> Range: 303 | """ 304 | 305 | Use this to match pattern x or less times (hence the name). 306 | 307 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 308 | 309 | ```python 310 | x.x_or_less_times(5) # "x{,5}" 311 | x.x_or_less_times(5, True) # "x{,5}?" 312 | ``` 313 | """ 314 | return self.repeat(count, lazy).or_less() 315 | 316 | def x_or_more_times(self, count: int, lazy: bool = False) -> Range: 317 | """ 318 | 319 | Use this to match pattern x or more times (hence the name). 320 | 321 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 322 | 323 | ```python 324 | x.x_or_more_times(5) # "x{5,}" 325 | x.x_or_more_times(5, True) # "x{5,}?" 326 | ``` 327 | """ 328 | return self.repeat(count, lazy).or_more() 329 | 330 | def x_times(self, count: int, lazy: bool = False) -> Range: 331 | """ 332 | 333 | Use this to match pattern exactly x times (hence the name). 334 | 335 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 336 | 337 | ```python 338 | x.x_times(5) # "x{5}" 339 | x.x_times(5, True) # "x{5}?" 340 | ``` 341 | """ 342 | return self.repeat(count, lazy) 343 | 344 | def between_x_y_times( 345 | self, min_count: int, max_count: int, lazy: bool = False 346 | ) -> Range: 347 | """ 348 | 349 | Use this to match pattern between x and y times, inclusive (hence the name). 350 | 351 | When not lazy, matches as many times as possible, otherwise matches as few times as possible. 352 | 353 | ```python 354 | x.between_x_y_times(5, 6) # "x{5,6}" 355 | x.between_x_y_times(5, 6, True) # "x{5,6}?" 356 | ``` 357 | """ 358 | return self.repeat(min_count, lazy).to(max_count) 359 | 360 | def lookahead(self, other: AnyRegexPattern) -> Concat: 361 | """ 362 | Use this to indicate that given pattern occurs before some another pattern (lookahead). 363 | 364 | In other words, `x.lookahead(y)` matches a pattern `x` only if there is `y` after it 365 | 366 | Lookahead pattern won't be captured. 367 | 368 | ```python 369 | x.lookahead(y) # x(?=y) 370 | x.before(y) # x(?=y) 371 | ``` 372 | """ 373 | return Concat(self, Lookahead(other)) 374 | 375 | def before(self, other: AnyRegexPattern) -> Concat: 376 | """alias for .lookahead""" 377 | return self.lookahead(other) 378 | 379 | def negative_lookahead(self, other: AnyRegexPattern) -> Concat: 380 | """ 381 | Use this to indicate that given pattern doesn't occur before some another pattern (negative lookahead). 382 | 383 | In other words, `x.negative_lookahead(y)` matches a pattern `x` only if there is no `y` after it 384 | 385 | Lookahead pattern won't be captured. 386 | 387 | ```python 388 | x.negative_lookahead(y) # x(?!y) 389 | x.not_before(y) # x(?!y) 390 | ``` 391 | """ 392 | return Concat(self, NegativeLookahead(other)) 393 | 394 | def not_before(self, other: AnyRegexPattern) -> Concat: 395 | """alias for .negative_lookahead""" 396 | return self.negative_lookahead(other) 397 | 398 | def lookbehind(self, other: AnyRegexPattern) -> Concat: 399 | """ 400 | Use this to indicate that given pattern occurs after some another pattern (lookbehind). 401 | 402 | In other words, `x.lookbehind(y)` matches a pattern `x` only if there is `y` before it 403 | 404 | Lookbehind pattern won't be captured. 405 | 406 | ```python 407 | x.lookbehind(y) # (?<=y)x 408 | x.after(y) # (?<=y)x 409 | ``` 410 | """ 411 | return Concat(Lookbehind(other), self) 412 | 413 | def after(self, other: AnyRegexPattern) -> Concat: 414 | """alias for .lookbehind""" 415 | return self.lookbehind(other) 416 | 417 | def negative_lookbehind(self, other: AnyRegexPattern) -> Concat: 418 | """ 419 | Use this to indicate that given pattern goes before some another pattern (negative lookbehind). 420 | 421 | In other words, `x.negative_lookbehind(y)` matches a pattern `x` only if there is NO `y` before it 422 | 423 | Lookbehind pattern won't be captured. 424 | 425 | ```python 426 | x.negative_lookbehind(y) # (? Concat: 433 | """alias for .negative_lookbehind""" 434 | return self.negative_lookbehind(other) 435 | 436 | def comment(self, text: str) -> Concat: 437 | """leaves a comment in expression (if needed for whatever reason)""" 438 | return Concat(self, Comment(UnescapedLiteral(text.replace(")", "\\)")))) 439 | 440 | def capture(self) -> Group: 441 | """ 442 | 443 | Use this to make a capturing group out of pattern. 444 | 445 | ```python 446 | x.capture() # (x) 447 | ``` 448 | """ 449 | return Group(self) 450 | 451 | def named(self, name: str) -> NamedPattern: 452 | return NamedPattern(name, self) 453 | 454 | 455 | class _PriorityShell(RegexPattern): 456 | def __init__(self, priority: int) -> None: 457 | self.priority = priority 458 | 459 | 460 | class GroupBase(RegexPattern): 461 | contents: RegexPattern 462 | prefix: str 463 | 464 | def __init__(self, *contents: AnyRegexPattern): 465 | self.contents = pattern(contents) 466 | 467 | def render_prefix(self) -> TokenStream: 468 | yield self.prefix 469 | 470 | def case_insensitive(self): 471 | return self.apply(lambda x: x.case_insensitive()) 472 | 473 | def render(self, context: Context) -> TokenStream: 474 | yield "(" 475 | yield from self.render_prefix() 476 | yield from self.contents.render(context) 477 | yield ")" 478 | 479 | def apply(self, fn: Processor) -> Self: 480 | return self.__class__(fn(self.contents)) 481 | 482 | 483 | class Group(GroupBase): 484 | prefix = "" 485 | 486 | 487 | class NonCapturingGroup(GroupBase): 488 | prefix = "?:" 489 | 490 | def optimize(self) -> RegexPattern: 491 | if isinstance(self.contents, NonCapturingGroup): 492 | return self.contents.optimize() 493 | return super().optimize() 494 | 495 | def respect_priority( 496 | self, 497 | operation: BaseRenderable, 498 | comparator: Comparator | None = None, 499 | side: str = "none", 500 | ) -> BaseRenderable: 501 | return self.contents.respect_priority(operation, comparator, side) 502 | 503 | 504 | class Lookahead(GroupBase): 505 | prefix = "?=" 506 | 507 | 508 | class NegativeLookahead(GroupBase): 509 | prefix = "?!" 510 | 511 | 512 | class Lookbehind(GroupBase): 513 | prefix = "?<=" 514 | 515 | 516 | class NegativeLookbehind(GroupBase): 517 | prefix = "? Sequence[CharRange]: 525 | def sorting_func(char: CharRange) -> tuple[int, int]: 526 | return char.start, char.stop 527 | 528 | return sorted(seq, key=sorting_func) 529 | 530 | 531 | def make_range(part: CharType) -> CharRange: 532 | if isinstance(part, str): 533 | return CharRange(part, part) 534 | if isinstance(part, Literal): 535 | return CharRange(part.contents, part.contents) 536 | return part 537 | 538 | 539 | def merge_chars(contents: Sequence[CharType]) -> Sequence[CharRange]: 540 | result: list[CharRange] = [] 541 | contents = sort_chartype([make_range(part) for part in contents]) 542 | 543 | def merge_parts(last_part: CharRange, next_part: CharRange) -> Sequence[CharRange]: 544 | if last_part.stop + 1 >= next_part.start: 545 | if next_part.stop > last_part.stop: 546 | return [CharRange(last_part.start, next_part.stop)] 547 | return [last_part] 548 | 549 | return [last_part, next_part] 550 | 551 | for part in contents: 552 | if len(result): 553 | result[-1:] = merge_parts(result[-1], part) 554 | else: 555 | result.append(part) 556 | 557 | return result 558 | 559 | 560 | Bounds = Tuple[int, int] 561 | 562 | 563 | class FlagLike(RegexPattern): 564 | flags: str 565 | inner: RegexPattern 566 | 567 | 568 | class CharBase(FlagLike): 569 | def __init__(self, contents: Sequence[CharType]): 570 | self.contents = list(merge_chars(contents)) 571 | self.inner = self 572 | 573 | @property 574 | def flags(self): 575 | ci = self.case_insensitive() 576 | if ci == self: 577 | return "i" 578 | return "" 579 | 580 | def __eq__(self, other: object): 581 | if not isinstance(other, self.__class__): 582 | return False 583 | 584 | if len(self.contents) != len(other.contents): 585 | return False 586 | 587 | for i, r in enumerate(self.contents): 588 | if r != other.contents[i]: 589 | return False 590 | 591 | return True 592 | 593 | def case_insensitive(self) -> Self: 594 | contents: list[CharRange] = [] 595 | 596 | for part in self.contents: 597 | start_char = chr(part.start) 598 | stop_char = chr(part.stop) 599 | 600 | is_lower = start_char.islower() and stop_char.islower() 601 | is_upper = start_char.isupper() and stop_char.isupper() 602 | 603 | if is_lower: 604 | upper_chars = map(ord, map(str.upper, (start_char, stop_char))) 605 | contents.append(CharRange(*upper_chars)) 606 | 607 | elif is_upper: 608 | lower_chars = map(ord, map(str.lower, (start_char, stop_char))) 609 | contents.append(CharRange(*lower_chars)) 610 | 611 | contents.append(part) 612 | 613 | return self.__class__(contents) 614 | 615 | 616 | class Chars(CharBase): 617 | non_special = {".", "[", "|", "~", "*", "(", ")", "+", "$", "&", "?", "#"} 618 | 619 | def accepts(self, char: str) -> bool: 620 | for chrange in self.contents: 621 | if chrange.accepts(char): 622 | return True 623 | return False 624 | 625 | def render(self, context: Context) -> TokenStream: 626 | if len(self.contents) == 1: 627 | contents = self.contents[0] 628 | if contents.is_single_char(): 629 | yield from contents.render_literal(context) 630 | return 631 | yield "[" 632 | 633 | for char in self.contents: 634 | yield from char.render(context) 635 | 636 | yield "]" 637 | 638 | def to(self, other: str | Literal | Chars) -> Chars: 639 | if isinstance(other, str): 640 | end = pattern(other) 641 | elif isinstance(other, Chars): 642 | end = other 643 | else: 644 | end = other 645 | 646 | start: int = self.contents[0].start 647 | 648 | stop_base = end.contents[0] 649 | stop: int 650 | if isinstance(stop_base, str): 651 | stop = ord(stop_base) 652 | else: 653 | stop = stop_base.stop 654 | 655 | return char_range(start, stop) 656 | 657 | def reverse(self) -> ReversedChars: 658 | return ReversedChars(self.contents) 659 | 660 | @overload 661 | def __or__(self, other: Chars | list[CharType]) -> Chars: 662 | ... 663 | 664 | @overload 665 | def __or__(self, other: AnyRegexPattern) -> Option | Chars: 666 | ... 667 | 668 | def __or__(self, other: AnyRegexPattern) -> Union[Option, Chars]: 669 | other = respect_priority(other, Option.priority) 670 | if isinstance(other, Chars): 671 | return Chars([*self.contents, *other.contents]) 672 | return Option(self, other) 673 | 674 | def exclude(self, chars: AnyRegexPattern) -> Chars: 675 | chars = pattern(chars) 676 | if not isinstance(chars, Chars): 677 | raise ValueError( 678 | "Can't exclude non-Chars pattern, don't really know how..." 679 | ) 680 | result = [] 681 | for part in self.contents: 682 | result.extend(part.exclude(chars)) 683 | return Chars(result) 684 | 685 | 686 | class ReversedChars(CharBase): 687 | def render(self, context: Context) -> TokenStream: 688 | yield "[" 689 | yield "^" 690 | for char in self.contents: 691 | if isinstance(char, (Literal, CharRange)): 692 | yield from char.render(context) 693 | elif char in Chars.non_special: 694 | yield char 695 | else: 696 | yield re.escape(char) 697 | yield "]" 698 | 699 | def reverse(self) -> Chars: 700 | return Chars(self.contents) 701 | 702 | @overload 703 | def __or__(self, other: ReversedChars) -> ReversedChars: 704 | ... 705 | 706 | @overload 707 | def __or__(self, other: AnyRegexPattern) -> Option | ReversedChars: 708 | ... 709 | 710 | def __or__(self, other: AnyRegexPattern) -> Union[Option, ReversedChars]: 711 | other = respect_priority(other, Option.priority) 712 | if isinstance(other, ReversedChars): 713 | return ReversedChars([*self.contents, *other.contents]) 714 | return Option(self, other) 715 | 716 | 717 | class CharRange(BaseRenderable): 718 | min_char = 0 719 | max_char = 0x10FFFF 720 | 721 | def __init__(self, start: Optional[str | int], stop: Optional[str | int]): 722 | meta = None 723 | if isinstance(start, str): 724 | if len(start) > 1: 725 | meta = start 726 | start = -1 727 | else: 728 | start = ord(start) 729 | if isinstance(stop, str): 730 | if len(stop) > 1: 731 | stop = -1 732 | else: 733 | stop = ord(stop) 734 | 735 | self.start = start or CharRange.min_char 736 | self.stop = stop or CharRange.max_char 737 | self.meta = meta 738 | 739 | if not (start or stop): 740 | raise ValueError( 741 | "Cannot create a character range with no data. Use rgx.meta.ANY instead" 742 | ) 743 | 744 | def accepts(self, char: str) -> bool: 745 | return ord(char) in range(self.start, self.stop + 1) 746 | 747 | @staticmethod 748 | def render_char(char: int) -> str: 749 | return re.escape(chr(char)) 750 | 751 | def stream(self, context: Context) -> TokenStream: 752 | if self.meta: 753 | yield self.meta 754 | return 755 | 756 | diff = self.stop - self.start 757 | 758 | if self.start: 759 | yield self.render_char(self.start) 760 | 761 | if not diff: 762 | return # one char 763 | 764 | if diff == 2: 765 | yield chr(self.stop - 1) # render 012 instead of 0-2 766 | 767 | if diff > 2: 768 | yield "-" 769 | 770 | if self.stop != CharRange.max_char: 771 | yield self.render_char(self.stop) 772 | 773 | def render(self, context: Context) -> TokenStream: 774 | return self.stream(context) 775 | 776 | def render_literal(self, context: Context) -> TokenStream: 777 | if self.meta: 778 | yield self.meta 779 | return 780 | yield from Literal(chr(self.start)).render(context) 781 | 782 | @staticmethod 783 | def exclude_bounds(bounds: Bounds, exclude: Bounds) -> list[Bounds]: 784 | result: list[Bounds] = [] 785 | self_range = range(bounds[0], bounds[1] + 1) 786 | 787 | if exclude[0] - 1 in self_range: 788 | result.append((bounds[0], exclude[0] - 1)) 789 | if exclude[1] + 1 in self_range: 790 | result.append((exclude[1] + 1, bounds[1])) 791 | return result 792 | 793 | def exclude(self, chars: Chars) -> list[CharRange]: 794 | if self.meta: 795 | raise ValueError( 796 | f"Cannot exclude chars '{chars}' from meta-sequence '{self.meta}'" 797 | ) 798 | 799 | result: list[Bounds] = [(self.start, self.stop)] 800 | temp_result: list[Bounds] = [] 801 | cut_start = 0 802 | last_cut_start = 0 803 | 804 | for char_part in chars.contents: 805 | if char_part.meta: 806 | raise ValueError( 807 | f"Cannot exclude meta-sequence '{self.meta}' from chars '[{self}]'" 808 | ) 809 | exclude = (char_part.start, char_part.stop) 810 | for i, bounds in enumerate(result[cut_start:], start=cut_start): 811 | if exclude[1] < bounds[0]: 812 | temp_result.extend(result[i:]) 813 | break 814 | 815 | if exclude[0] > bounds[1]: 816 | temp_result.append(result[i]) 817 | continue 818 | 819 | temp_result.extend(self.exclude_bounds(bounds, exclude)) 820 | 821 | last_cut_start = cut_start 822 | cut_start = len(temp_result) - 1 823 | 824 | result[last_cut_start:] = temp_result 825 | temp_result = [] 826 | 827 | return [CharRange(*x) for x in result] 828 | 829 | def is_single_char(self) -> bool: 830 | return self.start == self.stop 831 | 832 | def __repr__(self): 833 | return Renderer().render_string(self) 834 | 835 | def __eq__(self, other: object): 836 | if not isinstance(other, CharRange): 837 | return False 838 | 839 | return ( 840 | self.start == other.start 841 | and self.stop == other.stop 842 | and self.meta == other.meta 843 | ) 844 | 845 | 846 | @overload 847 | def char_range(start: Optional[str | int], stop: str | int) -> Chars: 848 | ... 849 | 850 | 851 | @overload 852 | def char_range(start: str | int, stop: None = None) -> Chars: 853 | ... 854 | 855 | 856 | @overload 857 | def char_range(start: None = None, stop: None = None) -> NoReturn: 858 | ... 859 | 860 | 861 | @overload 862 | def char_range(start: Optional[str | int], stop: Optional[str | int]) -> Chars: 863 | ... 864 | 865 | 866 | def char_range( 867 | start: Optional[str | int] = None, stop: Optional[str | int] = None 868 | ) -> Chars: 869 | """ 870 | 871 | Use this for character ranges (e.g. `[a-z]`) 872 | 873 | Can be combined with other Chars istances (or lists) using | 874 | 875 | `start` and `stop` are inclusive 876 | 877 | """ 878 | 879 | return Chars([CharRange(start, stop)]) 880 | 881 | 882 | class Concat(RegexPattern): 883 | priority = 2 * priority_step 884 | 885 | def __init__(self, *contents: AnyRegexPattern) -> None: 886 | if len(contents) >= 3: 887 | contents = (contents[0], Concat(*contents[1:])) 888 | 889 | self.contents = [respect_priority(part, self.priority) for part in contents] 890 | 891 | def __add__(self, other: AnyRegexPattern) -> Concat: 892 | return Concat(*self.contents, other) 893 | 894 | def case_insensitive(self) -> RegexPattern: 895 | return self.apply(lambda x: x.case_insensitive()) 896 | 897 | def render(self, context: Context) -> TokenStream: 898 | for part in self.contents: 899 | yield from part.render(context) 900 | 901 | def merge_flags(self) -> LocalFlags | Concat: 902 | processed, common_flags = self.merge_flags_abstract(self.contents) 903 | 904 | new = Concat(*processed) 905 | 906 | if not common_flags: 907 | return new 908 | 909 | return LocalFlags(new, "".join(common_flags)) 910 | 911 | def apply(self, fn: Processor) -> Self: 912 | return self.__class__(*map(fn, self.contents)) 913 | 914 | 915 | class Option(RegexPattern): 916 | priority = 0 * priority_step 917 | 918 | def __init__(self, *alternatives: AnyRegexPattern): 919 | if len(alternatives) >= 3: 920 | alternatives = (alternatives[0], Option(*alternatives[1:])) 921 | 922 | self.alternatives = [ 923 | respect_priority(alternative, self.priority) for alternative in alternatives 924 | ] 925 | 926 | def case_insensitive(self) -> RegexPattern: 927 | return self.apply(lambda x: x.case_insensitive()) 928 | 929 | def merge_flags(self) -> LocalFlags | Option: 930 | processed, common_flags = self.merge_flags_abstract(self.alternatives) 931 | 932 | new = Option(*processed) 933 | 934 | if not common_flags: 935 | return new 936 | 937 | return LocalFlags(new, "".join(common_flags)) 938 | 939 | def render(self, context: Context) -> TokenStream: 940 | if not self.alternatives: 941 | return 942 | yield from self.alternatives[0].render(context) 943 | for alternative in self.alternatives[1:]: 944 | yield "|" 945 | yield from alternative.render(context) 946 | 947 | def __or__(self, other: AnyRegexPattern) -> Option: 948 | return Option(*self.alternatives, other) 949 | 950 | def __ror__(self, other: AnyRegexPattern) -> Option: 951 | return Option(other, *self.alternatives) 952 | 953 | def apply(self, fn: Processor) -> Self: 954 | return self.__class__(*map(fn, self.alternatives)) 955 | 956 | 957 | class LocalFlags(FlagLike): 958 | def __init__(self, contents: AnyRegexPattern, flags: str): 959 | self.contents = pattern(contents) 960 | self.inner = self.contents 961 | self.flags = flags 962 | 963 | def case_insensitive(self) -> RegexPattern: 964 | return self.apply(lambda x: x.case_insensitive()) 965 | 966 | def render(self, context: Context) -> TokenStream: 967 | yield "(?" 968 | yield self.flags 969 | yield ":" 970 | yield from self.contents.render(context) 971 | yield ")" 972 | 973 | def apply(self, fn: Processor) -> Self: 974 | return self.__class__(fn(self.contents), self.flags) 975 | 976 | 977 | class GlobalFlags(GroupBase): 978 | prefix = "?" 979 | 980 | def __init__(self, contents: str): 981 | self.contents = Literal(contents) 982 | 983 | 984 | class Range(RegexPattern): 985 | priority: int = 3 * priority_step 986 | 987 | def __init__( 988 | self, 989 | *contents: AnyRegexPattern, 990 | min_count: int = 0, 991 | max_count: Optional[int] = None, 992 | lazy: bool = False, 993 | ) -> None: 994 | if min_count == max_count == 1: 995 | self.contents = pattern(contents) 996 | else: 997 | self.contents = respect_priority(contents, self.priority + 1) 998 | 999 | if max_count is not None and min_count > max_count: 1000 | min_count, max_count = max_count, min_count 1001 | 1002 | if min_count < 0: 1003 | raise ValueError("Quantifier lower bound cannot be less than 0") 1004 | 1005 | if max_count is not None and max_count < 0: 1006 | raise ValueError("Quantifier upper bound cannot be less than 0") 1007 | 1008 | self.min_count = min_count 1009 | self.max_count = max_count 1010 | self.lazy = lazy 1011 | 1012 | def case_insensitive(self) -> RegexPattern: 1013 | return self.apply(lambda x: x.case_insensitive()) 1014 | 1015 | def repeat(self, count: int, lazy: bool = False) -> Range: 1016 | """ 1017 | 1018 | The logic here should be carefully thought through. 1019 | If we multiply a fixed-size pattern a{X} by Y, we generally DO NOT get a{X*Y} 1020 | If we multiply a .or_less() pattern a{,X} by Y, we get a{,X*Y} 1021 | If we multiply a pattern a{1,X} (X!=1) by Y, we get a{Y,X*Y} 1022 | 1023 | Above logic doesn't scale up with patterns a{X,N} * Y, if X is not in {0, 1}, so we should fallback to (?:a{X,N}){Y} 1024 | 1025 | While it is easy to say a{X} * Y == a{X*Y} (i.e. a{5} * 10 == a{50}), 1026 | ...this doesn't work well with .many() and other quantifiers: (a{5} * 10).many() != a{50,} 1027 | ...but rather (?:a{5}){10,} 1028 | 1029 | """ 1030 | 1031 | if self.min_count not in {0, 1}: 1032 | return super().repeat(count, lazy) 1033 | 1034 | max_count = self.max_count * count if self.max_count else None 1035 | return Range( 1036 | self.contents, 1037 | min_count=self.min_count * count, 1038 | max_count=max_count, 1039 | lazy=lazy, 1040 | ) 1041 | 1042 | def or_more(self) -> Range: 1043 | return Range(self.contents, min_count=self.min_count, lazy=self.lazy) 1044 | 1045 | def __pos__(self) -> Range: 1046 | return self.or_more() 1047 | 1048 | def or_less(self) -> Range: 1049 | return Range( 1050 | self.contents, min_count=0, max_count=self.max_count, lazy=self.lazy 1051 | ) 1052 | 1053 | def __neg__(self) -> Range: 1054 | return self.or_less() 1055 | 1056 | def to(self, count: int) -> Range: 1057 | return Range( 1058 | self.contents, min_count=self.min_count, max_count=count, lazy=self.lazy 1059 | ) 1060 | 1061 | def __rshift__(self, count: int) -> Range: 1062 | return self.to(count) 1063 | 1064 | def render_quantifier(self) -> TokenStream: 1065 | if self.max_count is None: 1066 | if not self.min_count: 1067 | yield "*" 1068 | return 1069 | elif self.min_count == 1: 1070 | yield "+" 1071 | return 1072 | 1073 | elif self.max_count == 1: 1074 | if not self.min_count: 1075 | yield "?" 1076 | return 1077 | elif self.min_count == 1: 1078 | return 1079 | 1080 | yield "{" 1081 | 1082 | if self.min_count: 1083 | yield str(self.min_count) 1084 | 1085 | if self.min_count == self.max_count: 1086 | yield "}" 1087 | return 1088 | 1089 | yield "," 1090 | 1091 | if self.max_count: 1092 | yield str(self.max_count) 1093 | 1094 | yield "}" 1095 | 1096 | def render(self, context: Context) -> TokenStream: 1097 | if self.max_count == 0: 1098 | return 1099 | 1100 | yield from self.contents.render(context) 1101 | 1102 | if self.min_count == self.max_count == 1: 1103 | return 1104 | 1105 | yield from self.render_quantifier() 1106 | 1107 | if self.lazy and self.min_count != self.max_count: 1108 | yield "?" 1109 | 1110 | def merge_flags(self) -> LocalFlags | Range: 1111 | processed, common_flags = self.merge_flags_abstract([self.contents]) 1112 | 1113 | if not common_flags: 1114 | return self 1115 | 1116 | return LocalFlags( 1117 | self.apply(lambda _: processed[0]), 1118 | "".join(common_flags), 1119 | ) 1120 | 1121 | def apply(self, fn: Processor) -> Self: 1122 | return self.__class__( 1123 | fn(self.contents), 1124 | min_count=self.min_count, 1125 | max_count=self.max_count, 1126 | lazy=self.lazy, 1127 | ) 1128 | 1129 | 1130 | class NamedPattern(RegexPattern): 1131 | """ 1132 | 1133 | Named capturing group. 1134 | 1135 | If `contents` are omitted, generates a reference, otherwise a named group definition. 1136 | 1137 | ```python 1138 | pattern.named("x", y) # (?Py) 1139 | pattern.named("x") # (?P=x) 1140 | ``` 1141 | """ 1142 | 1143 | def __init__(self, name: str, contents: Optional[AnyRegexPattern] = None): 1144 | self.name = name 1145 | self.contents = pattern(contents) if contents is not None else None 1146 | 1147 | def case_insensitive(self) -> RegexPattern: 1148 | contents = self.contents.case_insensitive() if self.contents else None 1149 | return NamedPattern(self.name, contents) 1150 | 1151 | def render(self, context: Context) -> TokenStream: 1152 | yield "(?P" 1153 | if self.contents: 1154 | yield "<" 1155 | yield self.name 1156 | yield ">" 1157 | yield from self.contents.render(context) 1158 | else: 1159 | yield "=" 1160 | yield self.name 1161 | yield ")" 1162 | 1163 | def merge_flags(self) -> LocalFlags | NamedPattern: 1164 | if self.contents is None: 1165 | return self 1166 | 1167 | processed, common_flags = self.merge_flags_abstract([self.contents]) 1168 | 1169 | if not common_flags: 1170 | return self 1171 | 1172 | return LocalFlags( 1173 | self.apply(lambda _: processed[0]), 1174 | "".join(common_flags), 1175 | ) 1176 | 1177 | def apply(self, fn: Processor) -> Self: 1178 | if self.contents is None: 1179 | return self 1180 | 1181 | return self.__class__( 1182 | self.name, 1183 | fn(self.contents), 1184 | ) 1185 | 1186 | 1187 | class ConditionalPattern(RegexPattern): 1188 | """ 1189 | Use to match different patterns depending on whether another group matched or not. 1190 | 1191 | Next two snippets produce effectively the same result: 1192 | 1193 | ```python 1194 | from rgx import pattern 1195 | 1196 | hello = pattern("hello").capture() 1197 | world = pattern("world") 1198 | where = pattern("where") 1199 | 1200 | x = (hello + world) | where 1201 | ``` 1202 | 1203 | ```python 1204 | from rgx import pattern, conditional 1205 | 1206 | hello = pattern("hello").capture() 1207 | world = pattern("world") 1208 | where = pattern("where") 1209 | 1210 | x = hello.maybe() + conditional(1, world, where) 1211 | ``` 1212 | """ 1213 | 1214 | def __init__( 1215 | self, group: int, true_option: AnyRegexPattern, false_option: AnyRegexPattern 1216 | ) -> None: 1217 | self.group = group 1218 | self.true_option = respect_priority(true_option, Option.priority + 1) 1219 | self.false_option = respect_priority(false_option, Option.priority + 1) 1220 | 1221 | def render(self, context: Context) -> TokenStream: 1222 | yield "(?(" 1223 | yield str(self.group) 1224 | yield ")" 1225 | yield from self.true_option.render(context) 1226 | yield "|" 1227 | yield from self.false_option.render(context) 1228 | yield ")" 1229 | 1230 | def apply(self, fn: Processor) -> Self: 1231 | return self.__class__( 1232 | self.group, 1233 | fn(self.true_option), 1234 | fn(self.false_option), 1235 | ) 1236 | 1237 | def case_insensitive(self) -> RegexPattern: 1238 | return self.apply(lambda x: x.case_insensitive()) 1239 | 1240 | 1241 | class Literal(RegexPattern): 1242 | def __init__(self, contents: str) -> None: 1243 | self.contents: str = contents 1244 | if len(self.contents) != 1: 1245 | self.priority = 2 * priority_step 1246 | 1247 | def to(self, other: str | Literal | Chars) -> Chars: 1248 | return Chars([self]).to(other) 1249 | 1250 | def render(self, context: Context) -> TokenStream: 1251 | yield re.escape(self.contents) 1252 | 1253 | def apply(self, fn: Processor) -> Self: 1254 | return self 1255 | 1256 | 1257 | class UnescapedLiteral(Literal): 1258 | """ 1259 | 1260 | Unescaped literal. Renders into whatever is passed (as long as it is a string) 1261 | 1262 | """ 1263 | 1264 | def render(self, context: Context) -> TokenStream: 1265 | yield str(self.contents) 1266 | 1267 | 1268 | def group_reference(group: int) -> UnescapedLiteral: 1269 | """ 1270 | 1271 | Renders into a group reference (backreference) 1272 | E.g. if Group #1 is `(x|y)` and it has matched "x", `reference(1)` would match exactly "x", but not "y" 1273 | 1274 | ```python 1275 | rgx.reference(1) # \\1 1276 | ``` 1277 | 1278 | """ 1279 | return UnescapedLiteral(f"\\{group}") 1280 | --------------------------------------------------------------------------------