├── .github ├── FUNDING.yml └── workflows │ ├── .pypi_upload.yml │ └── ci.yml ├── .gitignore ├── .readthedocs.yml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ ├── documentation │ ├── best-practices.rst │ ├── covering-the-basics.rst │ ├── modules │ │ ├── core │ │ │ ├── assertions.rst │ │ │ ├── classes.rst │ │ │ ├── groups.rst │ │ │ ├── operators.rst │ │ │ ├── pre.rst │ │ │ ├── quantifiers.rst │ │ │ └── tokens.rst │ │ └── meta │ │ │ └── essentials.rst │ └── subpackages.rst │ ├── index.rst │ ├── introduction.rst │ ├── logo.png │ └── requirements.txt ├── pyproject.toml ├── src └── pregex │ ├── __init__.py │ ├── core │ ├── __init__.py │ ├── assertions.py │ ├── classes.py │ ├── exceptions.py │ ├── groups.py │ ├── operators.py │ ├── pre.py │ ├── quantifiers.py │ └── tokens.py │ └── meta │ ├── __init__.py │ └── essentials.py └── tests ├── test_core_assertions.py ├── test_core_classes.py ├── test_core_groups.py ├── test_core_operators.py ├── test_core_pre.py ├── test_core_quantifiers.py ├── test_core_tokens.py └── test_meta_essentials.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [manoss96] 2 | -------------------------------------------------------------------------------- /.github/workflows/.pypi_upload.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | publish: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.9' 20 | 21 | - name: Install dependencies 22 | run: | 23 | pip install build 24 | 25 | - name: Build dist 26 | run: | 27 | python -m build --outdir dist/ 28 | 29 | - name: Publish to PyPI 30 | uses: pypa/gh-action-pypi-publish@release/v1 31 | with: 32 | password: ${{ 
secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Main CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | pull_request: 8 | branches: 9 | - '**' 10 | 11 | jobs: 12 | test-and-coverage: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v3 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Set PYTHONPATH 28 | run: | 29 | echo "PYTHONPATH=${GITHUB_WORKSPACE}/src" >> $GITHUB_ENV 30 | 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | python -m pip install coverage 35 | 36 | - name: Run tests 37 | run: | 38 | cd tests 39 | python -m coverage run -m unittest 40 | python -m coverage lcov 41 | 42 | - name: Python ${{ matrix.python-version }} Coveralls 43 | uses: coverallsapp/github-action@master 44 | with: 45 | github-token: ${{ secrets.GITHUB_TOKEN }} 46 | path-to-lcov: tests/coverage.lcov 47 | flag-name: python-${{ matrix.python-version }}-run 48 | parallel: true 49 | 50 | 51 | finish: 52 | 53 | needs: test-and-coverage 54 | runs-on: ubuntu-latest 55 | 56 | steps: 57 | - name: Update Coveralls 58 | uses: coverallsapp/github-action@master 59 | with: 60 | github-token: ${{ secrets.github_token }} 61 | parallel-finished: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | 
.eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Required 2 | version: 2 3 | 4 | # Set the version of Python and other tools you might need 5 | build: 6 | os: ubuntu-20.04 7 | tools: 8 | python: "3.9" 9 | 10 | # Build documentation in the docs/ directory with Sphinx 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | python: 15 | install: 16 | - requirements: docs/source/requirements.txt -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | Contributing to pregex 3 | ============================ 4 | 5 | There are two main ways to contribute to pregex: 6 | 7 | 1. **Bug Hunting**: It is more probable than not that there are currently 8 | a number of bugs silently waiting to be discovered! If you happen to stumble 9 | upon one of them while using pregex, please raise an issue labeled as **bug**, 10 | in which you report your findings as well as explain how one can reproduce 11 | the bug. Furthermore, if you're up for a challenge you can even create a 12 | new branch just for the issue and try to tackle the problem yourself! 13 | 14 | 2. **Propose an addition/modification**: Everything good can be even better! 
15 | If you have an idea that you think might improve pregex, you can raise an 16 | issue labeled as **enhancement**, in which you discuss your idea. 17 | 18 | You can raise an issue by visiting the [Issues Page][issues-page]. 19 | 20 | Setting up a development environment 21 | ------------------------------------- 22 | Regardless of whether you want to work on fixing a bug or implementing a new feature, 23 | you should be able to set up a separate development environment just for pregex. The 24 | fastest way to do this would be the following: 25 | 26 | 1. Either clone or download the "pregex" repository to your local machine. 27 | 2. Add the path pointing to the project's "src" directory on your local machine to the "PYTHONPATH" environmental variable. 28 | - Make sure that "PYTHONPATH" is included in "PATH" as well. 29 | 3. Create and activate a new Python 3.9 environment that you will use solely for development purposes regarding pregex. 30 | - Make sure that you don't pip install pregex on this environment. 31 | 32 | After doing the above, you should be good to go! 33 | 34 | Running the tests 35 | ------------------------------------- 36 | For a pull request to be merged, it is important that it passes all tests defined 37 | within the project. In order to ensure that, you can run the tests yourself by 38 | simply going into the project's "tests" directory and executing the following 39 | command: 40 | ``` 41 | python3 -m unittest 42 | ``` 43 | Make sure that you've set up your development environment as explained in the 44 | corresponding section or else it is very likely that the above command will fail. 45 | 46 | 47 | Code of Conduct 48 | --------------- 49 | 50 | Please be nice to each other and abide by the principles of the [Python Software Foundation][psf-coc]. 
51 | 52 | 53 | [issues-page]: https://github.com/manoss96/pregex/issues 54 | [psf-coc]: https://www.python.org/psf/codeofconduct/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Emmanouil Stoumpos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Python Version][python-shield]][python-url] 3 | [![MIT License][license-shield]][license-url] 4 | [![Coverage][coverage-shield]][coverage-url] 5 | 6 | ![PRegEx Logo](docs/source/logo.png) 7 | 8 | 9 | ## What is PRegEx? 
10 | 11 | Let's face it, although RegEx is without a doubt an extremely useful tool, its syntax has been repeatedly proven to be quite hard for people to read and to memorize. This is mainly due to RegEx's declarative nature, which many programmers are not familiar with, as well as its extensive use of symbols that do not inherently relate to their functionality within a RegEx pattern, thus rendering them easy to forget. To make matters even worse, RegEx patterns are more often than not tightly packed with large amounts of information, which our brains just seem to be struggling to break down in order to analyze effectively. For these reasons, building even a simple RegEx pattern for matching URLs can prove to be quite a painful task. 12 | 13 | This is where PRegEx comes in! PRegEx, which stands for Programmable Regular Expressions, is a Python package that can be used in order to construct Regular Expression patterns in a more human-friendly way. Through the use of PRegEx, one is able to fully utilize the powerful tool that is RegEx without having to deal with any of its nuisances that seem to drive people crazy! PRegEx achieves that by offering the following: 14 | 15 | 1. An easy-to-remember syntax that resembles the good ol' imperative way of programming! 16 | 2. No longer having to group patterns or escape meta characters, as both are handled internally by PRegEx! 17 | 3. Modularity to building RegEx patterns, as one can easily break down a complex pattern into multiple simpler ones which can then be combined together. 18 | 4. A higher-level API on top of Python's built-in "re" module, providing access to its core functionality and more, while saving you the trouble of having to deal with "re.Match" instances. 19 | 20 | And remember, no matter how complex the abstraction, it's always just a pure RegEx pattern that sits underneath which you can fetch and use any way you like! 
21 | 22 | 23 | 24 | ## Installation 25 | 26 | You can start using PRegEx by installing it via pip. Note that "pregex" requires Python >= 3.9. 27 | 28 | ```sh 29 | pip install pregex 30 | ``` 31 | 32 | 33 | 34 | ## Usage Example 35 | 36 | In PRegEx, everything is a Programmable Regular Expression, or "Pregex" for short. This makes it easy for simple Pregex instances to be combined into more complex ones! Within the code snippet below, we construct a Pregex instance that will match any URL that ends with either ".com" or ".org" as well as any IP address for which a 4-digit port number is specified. Furthermore, in the case of a URL, we would like for its domain name to be separately captured as well. 37 | 38 | ```python 39 | from pregex.core.classes import AnyLetter, AnyDigit, AnyFrom 40 | from pregex.core.quantifiers import Optional, AtLeastAtMost 41 | from pregex.core.operators import Either 42 | from pregex.core.groups import Capture 43 | from pregex.core.pre import Pregex 44 | 45 | # Define main sub-patterns. 46 | http_protocol = Optional('http' + Optional('s') + '://') 47 | 48 | www = Optional('www.') 49 | 50 | alphanum = AnyLetter() | AnyDigit() 51 | 52 | domain_name = \ 53 | alphanum + \ 54 | AtLeastAtMost(alphanum | AnyFrom('-', '.'), n=1, m=61) + \ 55 | alphanum 56 | 57 | tld = '.' + Either('com', 'org') 58 | 59 | ip_octet = AnyDigit().at_least_at_most(n=1, m=3) 60 | 61 | port_number = (AnyDigit() - '0') + 3 * AnyDigit() 62 | 63 | # Combine sub-patterns together. 64 | pre: Pregex = \ 65 | http_protocol + \ 66 | Either( 67 | www + Capture(domain_name) + tld, 68 | 3 * (ip_octet + '.') + ip_octet + ':' + port_number 69 | ) 70 | ``` 71 | 72 | We can then easily fetch the resulting Pregex instance's underlying RegEx pattern. 73 | ```python 74 | regex = pre.get_pattern() 75 | ``` 76 | 77 | This is the pattern that we just built. Yikes! 
78 | ``` 79 | (?:https?:\/\/)?(?:(?:www\.)?([A-Za-z\d][A-Za-z\d\-.]{1,61}[A-Za-z\d])\.(?:com|org)|(?:\d{1,3}\.){3}\d{1,3}:[1-9]\d{3}) 80 | ``` 81 | 82 | Besides having access to its underlying pattern, we can use a Pregex instance to find matches within a piece of text. Consider for example the following string: 83 | ```python 84 | text = "text--192.168.1.1:8000--text--http://www.wikipedia.org--text--https://youtube.com--text" 85 | ``` 86 | By invoking the instance's "get_matches" method, we are able to scan the above string for any possible matches: 87 | ```python 88 | matches = pre.get_matches(text) 89 | ``` 90 | 91 | Looks like there were three matches: 92 | ```python 93 | ['192.168.1.1:8000', 'http://www.wikipedia.org', 'https://youtube.com'] 94 | ``` 95 | 96 | Likewise, we can invoke the instance's "get_captures" method to get any captured groups. 97 | ```python 98 | groups = pre.get_captures(text) 99 | ``` 100 | As expected, there were only two captured groups since the first match is not a URL and therefore it does not contain a domain name to be captured. 101 | ```python 102 | [(None,), ('wikipedia',), ('youtube',)] 103 | ``` 104 | 105 | Finally, you might have noticed that we built our pattern by utilizing 106 | various classes that were imported from modules under *pregex.core*. These 107 | modules contain classes through which the RegEx syntax is essentially replaced. 108 | However, PRegEx also includes another set of modules, namely those under 109 | subpackage *pregex.meta*, whose classes build upon those in *pregex.core* so 110 | as to provide numerous pre-built patterns that you can just import and use 111 | right away! 
112 | 113 | ```python 114 | 115 | from pregex.core.pre import Pregex 116 | from pregex.core.classes import AnyDigit 117 | from pregex.core.operators import Either 118 | from pregex.meta.essentials import HttpUrl, IPv4 119 | 120 | port_number = (AnyDigit() - '0') + 3 * AnyDigit() 121 | 122 | pre: Pregex = Either( 123 | HttpUrl(capture_domain=True, is_extensible=True), 124 | IPv4(is_extensible=True) + ':' + port_number 125 | ) 126 | ``` 127 | 128 | By using classes found within the *pregex.meta* subpackage, we were able to 129 | construct more or less the same pattern as before only much more easily! 130 | 131 | ## Solving Wordle with PRegEx 132 | 133 | We are now going to see another example that better exhibits the *programmable* nature of PRegEx. 134 | More specifically, we will be creating a Wordle solver function that, given all currently known 135 | information as well as access to a 5-letter word dictionary, utilizes PRegEx in order to return 136 | a list of candidate words to choose from as a possible solution to the problem. 137 | 138 | ### Formulating what is known 139 | 140 | First things first, we must think of a way to represent what is known so far regarding the 141 | word that we're trying to guess. This information can be encapsulated into three distinct 142 | sets of letters: 143 | 144 | 1. **Green letters**: Letters that are included in the word, whose position within it is known. 145 | 2. **Yellow letters**: Letters that are included in the word, and while their exact position is 146 | unknown, there is one or more positions which we can rule out. 147 | 3. **Gray letters**: Letters that are not included in the word. 148 | 149 | Green letters can be represented by using a dictionary that maps integers (positions) to strings (letters). 150 | For example, ``{4 : 'T'}`` indicates that the word we are looking for contains the letter ``T`` in its 151 | fourth position. 
Yellow letters can also be represented as a dictionary with integer keys, whose values 152 | however are going to be lists of strings instead of regular strings, as a position might have been ruled 153 | out for more than a single letter. For example, ``{1 : ['A', 'R'], 3 : ['P']}`` indicates that even though 154 | the word contains letters ``A``, ``R`` and ``P``, it cannot start with either an ``A`` or an ``R`` as 155 | well as it cannot have the letter ``P`` occupying its third position. Finally, gray letters can be simply 156 | stored in a list. 157 | 158 | In order to have a concrete example to work with, we will be assuming that our current 159 | information about the problem is expressed by the following three data structures: 160 | 161 | ```python 162 | green: dict[int, str] = {4 : 'T'} 163 | yellow: dict[int, list[str]] = {1 : ['A', 'R'], 3 : ['P']} 164 | gray: list[str] = ['C', 'D', 'L', 'M', 'N', 'Q', 'U'] 165 | ``` 166 | 167 | ### Initializing a Pregex class instance 168 | 169 | Having come up with a way of programmatically formulating the problem, the first step towards 170 | actually solving it would be to create a ``Pregex`` class instance: 171 | ```python 172 | wordle = Pregex() 173 | ``` 174 | 175 | Since we aren't providing a ``pattern`` parameter to the class's constructor, it automatically 176 | defaults to the empty string ``''``. Thus, through this instance we now have access to all methods 177 | of the ``Pregex`` class, though we are not really able to match anything with it yet. 178 | 179 | ### Yellow letter assertions 180 | 181 | Before we go on to dictate what the valid letters for each position within the word 182 | are, we are first going to deal with yellow letters, that is, letters which we know are 183 | included in the word that we are looking for, though their position is still uncertain. 
184 | Since we know for a fact that the sought out word contains these letters, we have to 185 | somehow make sure that any candidate word includes them as well. This can easily be 186 | done by using what is known in RegEx lingo as a *positive lookahead assertion*, 187 | represented in PRegEx by the less intimidating *FollowedBy*! Assertions are used in 188 | order to *assert* something about a pattern without really having to *match* any additional 189 | characters. A positive lookahead assertion, in particular, dictates that the pattern to which 190 | it is applied must be followed by some other pattern in order for the former to constitute 191 | a valid match. 192 | 193 | In PRegEx, one is able to create a ``Pregex`` instance out of applying a positive 194 | lookahead assertion to some pattern ``p1`` by doing the following: 195 | 196 | ```python 197 | from pregex.core.assertions import FollowedBy 198 | 199 | pre = FollowedBy(p1, p2) 200 | ``` 201 | 202 | where both ``p1`` and ``p2`` are either strings or ``Pregex`` instances. Furthermore, in the 203 | case that ``p1`` already is a ``Pregex`` class instance, one can achieve the same result with: 204 | 205 | ```python 206 | pre = p1.followed_by(p2) 207 | ``` 208 | 209 | Having initialized ``wordle`` as a ``Pregex`` instance, we can simply do 210 | ``wordle.followed_by(some_pattern)`` so as to indicate that any potential match 211 | with ``wordle`` must be followed by ``some_pattern``. Recall that ``wordle`` merely 212 | represents the empty string, so we are not really matching anything at this point. 213 | Applying an assertion to the empty string pattern is just a neat little trick one 214 | can use in order to validate something about their pattern before they even begin 215 | to build it. 216 | 217 | Now it's just a matter of figuring out what the value of ``some_pattern`` is. 
218 | Surely we can't just do ``wordle = wordle.followed_by(letter)``, as this results 219 | in ``letter`` always having to be at the beginning of the word. Here's however what 220 | we can do: It follows from the rules of Wordle that all words must be comprised of five 221 | letters, any of which is potentially a yellow letter. Thus, every yellow letter is certain 222 | to be preceded by up to four other letters, but no more than that. Therefore, we need a 223 | pattern that represents just that, namely *four letters at most*. By applying quantifier 224 | ``at_most(n=4)`` to an instance of ``AnyUppercaseLetter()``, we are able to create such 225 | a pattern. Add a yellow letter to its right and we have our ``some_pattern``. Since there 226 | may be more than one yellow letters, we make sure that we iterate them all one by one so 227 | as to enforce a separate assertion for each: 228 | 229 | ```python 230 | from pregex.core.classes import AnyUppercaseLetter 231 | 232 | yellow_letters_list: list[str] = [l for letter_list in yellow.values() for l in letter_list] 233 | 234 | at_most_four_letters = AnyUppercaseLetter().at_most(n=4) 235 | 236 | for letter in yellow_letters_list: 237 | wordle = wordle.followed_by(at_most_four_letters + letter) 238 | ``` 239 | 240 | By executing the above code snippet we get a ``Pregex`` instance which 241 | represents the following RegEx pattern: 242 | 243 | ``` 244 | (?=[A-Z]{,4}A)(?=[A-Z]{,4}R)(?=[A-Z]{,4}P) 245 | ``` 246 | 247 | ### Building valid character classes 248 | 249 | After we have made sure that our pattern will reject any words that do not contain 250 | all the yellow letters, we can finally start building the part of the pattern that 251 | will handle the actual matching. 
This can easily be achieved by performing five 252 | iterations, one for each letter of the word, where at each iteration ``i`` we 253 | construct a new character class, which is then appended to our pattern based 254 | on the following logic: 255 | 256 | * If the letter that corresponds to the word's i-th position is known, then 257 | make it so that the pattern only matches that letter at that position. 258 | 259 | * If the letter that corresponds to the word's i-th position is not known, 260 | then make it so that the pattern matches any letter except for gray letters, 261 | green letters, as well as any yellow letters that may have been ruled out for 262 | that exact position. 263 | 264 | The following code snippet does just that: 265 | 266 | ```python 267 | from pregex.core.classes import AnyFrom 268 | 269 | for i in range(1, 6): 270 | if i in green: 271 | wordle += green[i] 272 | else: 273 | invalid_chars_at_pos_i = gray + list(green.values()) 274 | if i in yellow: 275 | invalid_chars_at_pos_i += yellow[i] 276 | wordle += AnyUppercaseLetter() - AnyFrom(*invalid_chars_at_pos_i) 277 | ``` 278 | 279 | After executing the above code, ``wordle`` will contain the following 280 | RegEx pattern: 281 | 282 | ``` 283 | (?=[A-Z]{,4}A)(?=[A-Z]{,4}R)(?=[A-Z]{,4}P)[BE-KOPSV-Z][ABE-KOPRSV-Z][ABE-KORSV-Z]T[ABE-KOPRSV-Z] 284 | ``` 285 | 286 | ### Matching from a dictionary 287 | 288 | Having built our pattern, the only thing left to do is to actually use it to 289 | match candidate words. Provided that we have access to a text file containing 290 | all possible Wordle words, we are able to invoke our ``Pregex`` instance's 291 | ``get_matches`` method in order to scan said text file for any potential matches. 
292 | 293 | ```python 294 | words = wordle.get_matches('word_dictionary.txt', is_path=True) 295 | ``` 296 | 297 | ### Putting it all together 298 | 299 | Finally, we combine together everything we discussed into a single function that 300 | spews out a list of words which satisfy all necessary conditions so that they 301 | constitute possible solutions to the problem. 302 | 303 | ```python 304 | def wordle_solver(green: dict[int, str], yellow: dict[int, list[str]], gray: list[str]) -> list[str]: 305 | 306 | from pregex.core.pre import Pregex 307 | from pregex.core.classes import AnyUppercaseLetter, AnyFrom 308 | 309 | # Initialize pattern as the empty string pattern. 310 | wordle = Pregex() 311 | 312 | # This part ensures that yellow letters 313 | # will appear at least once within the word. 314 | yellow_letters_list = [l for letter_list in yellow.values() for l in letter_list] 315 | at_most_four_letters = AnyUppercaseLetter().at_most(n=4) 316 | for letter in yellow_letters_list: 317 | wordle = wordle.followed_by(at_most_four_letters + letter) 318 | 319 | # This part actually dictates the set of valid letters 320 | # for each position within the word. 321 | for i in range(1, 6): 322 | if i in green: 323 | wordle += green[i] 324 | else: 325 | invalid_chars_at_pos_i = gray + list(green.values()) 326 | if i in yellow: 327 | invalid_chars_at_pos_i += yellow[i] 328 | wordle += AnyUppercaseLetter() - AnyFrom(*invalid_chars_at_pos_i) 329 | 330 | # Match candidate words from dictionary and return them in a list. 331 | return wordle.get_matches('word_dictionary.txt', is_path=True) 332 | ``` 333 | 334 | By invoking the above function we get the following list of words: 335 | 336 | ```python 337 | word_candidates = wordle_solver(green, yellow, gray) 338 | 339 | print(word_candidates) # This prints ['PARTY'] 340 | ``` 341 | 342 | Looks like there is only one candidate word, which means that we 343 | can consider our problem solved! 
344 | 345 | You can learn more about PRegEx by visiting the [PRegEx Documentation Page][docs-url]. 346 | 347 | 348 | 349 | [python-shield]: https://img.shields.io/badge/python-3.9+-blue 350 | [python-url]: https://www.python.org/downloads/release/python-390/ 351 | [license-shield]: https://img.shields.io/badge/license-MIT-red 352 | [license-url]: https://github.com/manoss96/pregex/blob/main/LICENSE 353 | [coverage-shield]: https://coveralls.io/repos/github/manoss96/pregex/badge.svg?branch=main&service=github 354 | [coverage-url]: https://coveralls.io/github/manoss96/pregex?branch=main 355 | [docs-url]: https://pregex.readthedocs.io/en/latest/ -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../src/')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'pregex' 21 | copyright = '2022, Manos Stoumpos' 22 | author = 'Manos Stoumpos' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '2.3.3' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = ['sphinx.ext.todo', 'sphinx.ext.viewcode', 'sphinx.ext.autodoc'] 34 | 35 | # Supported file suffixes. 36 | source_suffix = ['.rst'] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 44 | exclude_patterns = [] 45 | 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 51 | html_theme = 'sphinx_rtd_theme' 52 | 53 | # Add any paths that contain custom static files (such as style sheets) here, 54 | # relative to this directory. They are copied after the builtin static files, 55 | # so a file named "default.css" will overwrite the builtin "default.css". 
56 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/source/documentation/best-practices.rst: -------------------------------------------------------------------------------- 1 | ############### 2 | Best Practices 3 | ############### 4 | 5 | This page discusses the best practices that one should 6 | consider following when it comes to using pregex. 7 | 8 | Importing 9 | ========== 10 | 11 | Due to the relatively large number of modules contained within pregex, 12 | having to import each class individually can quickly become extremely annoying. 13 | For this reason, it is suggested that one handles their imports by 14 | including the following statements at the top of their Python script: 15 | 16 | * ``from pregex.core import *`` - Imports all core modules by using short aliases. 17 | More specifically: 18 | 19 | * Module :py:mod:`pregex.core.assertions` is imported as ``asr`` 20 | * Module :py:mod:`pregex.core.classes` is imported as ``cl`` 21 | * Module :py:mod:`pregex.core.groups` is imported as ``gr`` 22 | * Module :py:mod:`pregex.core.operators` is imported as ``op`` 23 | * Module :py:mod:`pregex.core.quantifiers` is imported as ``qu`` 24 | * Module :py:mod:`pregex.core.tokens` is imported as ``tk`` 25 | * Class :class:`pregex.core.pre.Pregex` is imported as is. 26 | 27 | Take a look at the example below to better understand how this works: 28 | 29 | .. code-block:: python 30 | 31 | from pregex.core import * 32 | 33 | pre = op.Either("Hello", "Bye") + " World" + qu.Optional("!") 34 | 35 | pre.print_pattern() # This prints "(?:Hello|Bye) World!?" 36 | 37 | It is recommended that you follow this practice as besides the fact that 38 | it saves you the trouble of having to import from each module separately, 39 | it also ensures that you are aware of the module that each class belongs in, 40 | which in turn reveals a lot in regards to the class's functionality and how 41 | it can be used. 
42 | 43 | * ``from pregex.meta import *`` - Directly imports every class defined within any 44 | one of the *meta* modules. 45 | 46 | 47 | Finally, one is also able to replace both of the above import statements 48 | with a single statement, namely ``from pregex import *``. 49 | 50 | 51 | Maintaining readability 52 | ========================= 53 | 54 | One of the primary benefits of using PRegEx is being able to construct patterns 55 | that are readable, and therefore easier to maintain and update than their 56 | raw RegEx counterparts. This is mainly made possible through PRegEx's human-friendly 57 | syntax. Nevertheless, there exist certain cases where the syntax on its own is not 58 | enough to achieve readability, especially when it comes to building 59 | complex patterns. Consider for example the following Pregex instance: 60 | 61 | .. code-block:: python 62 | 63 | from pregex.core import * 64 | 65 | pre: Pregex = \ 66 | op.Enclose( 67 | op.Either( 68 | asr.FollowedBy( 69 | asr.PrecededBy( 70 | qu.OneOrMore(op.Either('+', '-')), 71 | 2 * (cl.AnyLetter() | (cl.AnyPunctuation() - cl.AnyFrom('+', '-'))) 72 | ), 73 | 2 * (cl.AnyLetter() | (cl.AnyPunctuation() - cl.AnyFrom('+', '-'))) 74 | ), 75 | asr.NotPrecededBy( 76 | asr.FollowedBy( 77 | qu.OneOrMore(op.Either('+', '-')), 78 | 2 * cl.AnyDigit() 79 | ), 80 | op.Either(cl.Any() + cl.AnyDigit(), cl.AnyDigit() + cl.Any()) 81 | ), 82 | asr.NotFollowedBy( 83 | asr.PrecededBy( 84 | qu.OneOrMore(op.Either('+', '-')), 85 | 2 * cl.AnyDigit() 86 | ), 87 | op.Either(cl.Any() + cl.AnyDigit(), cl.AnyDigit() + cl.Any()) 88 | ) 89 | ), 90 | 2 * (cl.AnyDigit() | cl.AnyLetter() | (cl.AnyPunctuation() - cl.AnyFrom('+', '-'))) 91 | ) 92 | 93 | And this is the RegEx pattern to which the above Pregex instance compiles: 94 | 95 | ..
code-block:: 96 | 97 | [,.-~!-*]{2}(?:(?<=[.-\/,!-*:-~]{2})(?:\+|-)+(?=[.-\/,!-*:-~]{2})|(?`_ we saw an alternative way 169 | of building patterns, which in certain cases is to be preferred over the standard API, 170 | and it just so happens that lookarounds constitute one of these cases. Here's what our pattern 171 | looks like when we apply the pattern chaining technique in order to impose any lookaround 172 | assertions: 173 | 174 | 175 | .. code-block:: python 176 | 177 | from pregex.core import * 178 | 179 | one_or_more_signs = qu.OneOrMore(op.Either('+', '-')) 180 | 181 | any_punct_but_signs = cl.AnyPunctuation() - cl.AnyFrom('+', '-') 182 | 183 | any_two_letters_or_punct_but_signs = 2 * (cl.AnyLetter() | any_punct_but_signs) 184 | 185 | any_two_digits = 2 * cl.AnyDigit() 186 | 187 | any_two_char_sequence_containing_digits = op.Either(cl.Any() + cl.AnyDigit(), cl.AnyDigit() + cl.Any()) 188 | 189 | any_two_alphanums_or_punct_but_signs = 2 * (cl.AnyDigit() | cl.AnyLetter() | any_punct_but_signs) 190 | 191 | 192 | pre: Pregex = \ 193 | any_two_alphanums_or_punct_but_signs + \ 194 | op.Either( 195 | one_or_more_signs \ 196 | .preceded_by(any_two_letters_or_punct_but_signs) \ 197 | .followed_by(any_two_letters_or_punct_but_signs), 198 | one_or_more_signs \ 199 | .followed_by(any_two_digits) \ 200 | .not_preceded_by(any_two_char_sequence_containing_digits), 201 | one_or_more_signs \ 202 | .preceded_by(any_two_digits) \ 203 | .not_followed_by(any_two_char_sequence_containing_digits) 204 | ) + \ 205 | any_two_alphanums_or_punct_but_signs 206 | 207 | Having tinkered with the pattern-building process by incorporating what was discussed, 208 | it is now a lot clearer what this pattern is trying to match, which is any sequence 209 | of signs ``+`` and ``-`` that is both preceded and followed by any two-character sequence 210 | of letters, digits and punctuation marks except for ``+`` and ``-``, as long as any digits 211 | that appear within a possible match are: 212 |
213 | 1. Found exclusively either to the left or to the right of the sign sequence. 214 | 2. Occupy the whole two-character sequence. 215 | 216 | To give a concrete example, this pattern will match strings like ``a!+#c``, ``a!--12`` 217 | and ``12+-+a#``, but it won't work for strings like ``a!#$f``, ``a!+#3`` and ``1!-a#``. 218 | 219 | Having read all the above, try adopting these practices yourself when building 220 | patterns with PRegEx so you make the most out of it! 221 | -------------------------------------------------------------------------------- /docs/source/documentation/covering-the-basics.rst: -------------------------------------------------------------------------------- 1 | ################### 2 | Covering the Basics 3 | ################### 4 | 5 | In this section you will be learning about the :class:`~pregex.core.pre.Pregex` 6 | class, and how instances of this class can be effectively combined together in 7 | order to construct complex RegEx patterns. 8 | 9 | The Pregex class 10 | ============================================ 11 | 12 | The basic idea behind PRegEx is to provide higher-level abstractions 13 | of RegEx patterns that are easier to read and to work with. 14 | 15 | .. code-block:: python 16 | 17 | from pregex.core.quantifiers import Optional 18 | from pregex.core.groups import Capture 19 | from pregex.core.operators import Either 20 | 21 | Optional('a') # Stands for quantifier 'a?' 22 | Capture('a') # Stands for capturing group '(a)' 23 | Either('a', 'b') # Stands for alternation 'a|b' 24 | 25 | Besides representing RegEx patterns, these abstractions must also be able to 26 | serve as individual units that can be built upon. This is made possible by 27 | having a single base class, namely :class:`~pregex.core.pre.Pregex`, from which 28 | all other classes inherit. 29 | 30 | .. 
code-block:: python 31 | 32 | from pregex.core.pre import Pregex 33 | from pregex.core.classes import AnyDigit 34 | from pregex.core.operators import Either 35 | from pregex.core.assertions import FollowedBy 36 | 37 | # These are both Pregex instances. 38 | digit: Pregex = AnyDigit() 39 | either_a_or_b: Pregex = Either('a', 'b') 40 | 41 | # This is a Pregex instance as well! 42 | digit_followed_by_either_a_or_b: Pregex = FollowedBy(digit, either_a_or_b) 43 | 44 | Being wrapped within instances of the same type allows for these Pregex 45 | patterns to be easily combined together into even more complex patterns. 46 | Consider for example the code snippet below where we construct a Pregex 47 | pattern that will match either any word that starts with "ST" or "st", 48 | or any three-digit integer: 49 | 50 | .. code-block:: python 51 | 52 | from pregex.core.operators import Either 53 | from pregex.core.quantifiers import OneOrMore 54 | from pregex.core.assertions import WordBoundary 55 | from pregex.core.classes import AnyLetter, AnyDigit 56 | 57 | starts_with_st = Either('ST', 'st') + OneOrMore(AnyLetter()) 58 | 59 | three_digit_integer = (AnyDigit() - '0') + (2 * AnyDigit()) 60 | 61 | pre = WordBoundary() + Either(starts_with_st, three_digit_integer) + WordBoundary() 62 | 63 | By both using PRegEx's human-friendly syntax and breaking down the pattern into simpler 64 | subpatterns, it is not hard to follow this pattern's construction process, as well as 65 | what its purpose is. Furthermore, the resulting pattern is a Pregex instance itself, 66 | and as such, it has access to all of the class's methods: 67 | 68 | .. 
code-block:: python 69 | 70 | pre.print_pattern() # This prints '\b(?:(?:ST|st)[A-Za-z]+|[1-9]\d{2})\b' 71 | print(pre.get_matches('STACK station pastry must 012 446 3462')) # This prints "['STACK', 'station', '446']" 72 | 73 | 74 | Converting a string into a Pregex instance 75 | ============================================ 76 | In general, one can wrap any string within a Pregex instance by passing it as a 77 | parameter to the class's constructor. By doing this, any characters of the provided 78 | string that require escaping are automatically escaped. 79 | 80 | .. code-block:: python 81 | 82 | from pregex.core.pre import Pregex 83 | 84 | pre = Pregex('Hello.') 85 | 86 | pre.print_pattern() # This prints 'Hello\.' 87 | 88 | Nevertheless, you probably won't need to do this often since any string that interacts 89 | with a Pregex instance in any way is automatically converted into a Pregex instance itself: 90 | 91 | .. code-block:: python 92 | 93 | from pregex.core.pre import Pregex 94 | from pregex.core.quantifiers import Optional 95 | 96 | # These two statements are equivalent. 97 | pre1 = Optional(Pregex('Hello.')) 98 | pre2 = Optional('Hello.') 99 | 100 | Manually wrapping strings within Pregex instances can however be of use when one wishes 101 | to explicitly define their own RegEx pattern. In that case, one must also not forget 102 | to set the class's constructor ``escape`` parameter to ``False``, in order to disable 103 | character-escaping: 104 | 105 | .. code-block:: python 106 | 107 | from pregex.core.pre import Pregex 108 | 109 | pre = Pregex('[a-z].?', escape=False) 110 | 111 | pre.print_pattern() # This prints '[a-z].?' 112 | 113 | Concatenating patterns with "+" 114 | ============================================ 115 | There exists a separate :class:`~pregex.core.operators.Concat` class, 116 | which is specifically used to concatenate two or more patterns together. 
117 | However, one can also achieve the same result by making use of Pregex's 118 | overloaded addition operator ``+``. 119 | 120 | .. code-block:: python 121 | 122 | from pregex.core.pre import Pregex 123 | from pregex.core.quantifiers import Optional 124 | 125 | pre = Pregex('a') + Pregex('b') + Optional('c') 126 | 127 | pre.print_pattern() # This prints 'abc?' 128 | 129 | This of course works with simple strings as well, as long as there 130 | is at least one Pregex instance involved in the operation: 131 | 132 | .. code-block:: python 133 | 134 | from pregex.core.quantifiers import Optional 135 | 136 | pre = 'a' + 'b' + Optional('c') 137 | 138 | pre.print_pattern() # This prints 'abc?' 139 | 140 | Concatenating patterns this way is encouraged as it leads to much more 141 | easy-to-read code. 142 | 143 | Repeating patterns with "*" 144 | ============================================ 145 | :class:`Pregex` has one more overloaded operator, namely the multiplication operator 146 | ``*``, which essentially replaces class :class:`~pregex.core.quantifiers.Exactly`. 147 | By using this operator on a Pregex instance, one indicates that a pattern is to be 148 | repeated an exact number of times: 149 | 150 | .. code-block:: python 151 | 152 | from pregex.core.pre import Pregex 153 | 154 | pre = 3 * Pregex('a') 155 | 156 | pre.print_pattern() # This prints 'a{3}' 157 | 158 | As it is the case with the addition operator ``+``, it is recommended 159 | that one also makes use of the multiplication operator ``*`` whenever 160 | possible. 161 | 162 | 163 | The "empty string" pattern 164 | ================================ 165 | 166 | Invoking the ``Pregex`` class's constructor without supplying it with a 167 | value for parameter ``pattern``, causes said parameter to take its default 168 | value, that is, the empty string ``''``. This is a good starting point 169 | to begin constructing your pattern: 170 | 171 | .. 
code-block:: python 172 | 173 | from pregex.core.pre import Pregex 174 | 175 | # Initialize your pattern as the empty string pattern. 176 | pre = Pregex() 177 | 178 | # Start building your pattern... 179 | for subpattern in subpatterns: 180 | if '!' in subpattern.get_pattern(): 181 | pre = pre.concat(subpattern + '?') 182 | else: 183 | pre = pre.concat(subpattern + '!') 184 | 185 | On top of that, any ``Pregex`` instance whose underlying pattern 186 | is the empty string pattern has the following properties: 187 | 188 | 1. Applying a quantifier to the empty string pattern results in itself: 189 | 190 | .. code-block:: python 191 | 192 | from pregex.core.pre import Pregex 193 | from pregex.core.quantifiers import OneOrMore 194 | 195 | pre = OneOrMore(Pregex()) 196 | pre.print_pattern() # This prints '' 197 | 198 | 2. Creating a group out of the empty string pattern results in itself: 199 | 200 | .. code-block:: python 201 | 202 | from pregex.core.pre import Pregex 203 | from pregex.core.groups import Group 204 | 205 | pre = Group(Pregex()) 206 | pre.print_pattern() # This prints '' 207 | 208 | 3. Applying the alternation operation between the empty string 209 | pattern and an ordinary pattern results in the latter: 210 | 211 | .. code-block:: python 212 | 213 | from pregex.core.pre import Pregex 214 | from pregex.core.operators import Either 215 | 216 | pre = Either(Pregex(), 'a') 217 | pre.print_pattern() # This prints 'a' 218 | 219 | 4. Applying a positive lookahead assertion based on the empty 220 | string pattern to any pattern results in that pattern: 221 | 222 | .. code-block:: python 223 | 224 | from pregex.core.pre import Pregex 225 | from pregex.core.assertions import FollowedBy 226 | 227 | pre = FollowedBy('a', Pregex()) 228 | pre.print_pattern() # This prints 'a' 229 | 230 | The above properties make it easy to write concise code 231 | like the following, without compromising your pattern: 232 | 233 | ..
code-block:: python 234 | 235 | from pregex.core.pre import Pregex 236 | from pregex.core.groups import Capture 237 | from pregex.core.operators import Either 238 | from pregex.core.quantifiers import OneOrMore 239 | 240 | pre = Either( 241 | 'a', 242 | 'b' if i > 5 else Pregex(), 243 | OneOrMore('c' if i > 10 else Pregex()) 244 | ) + Capture('d' if i > 15 else Pregex()) 245 | 246 | This is the underlying pattern of instance ``pre`` when 247 | executing the above code snippet for various values of ``i``: 248 | 249 | * For ``i`` equal to ``1`` the resulting pattern is ``a`` 250 | * For ``i`` equal to ``6`` the resulting pattern is ``a|b`` 251 | * For ``i`` equal to ``11`` the resulting pattern is ``a|b|c+`` 252 | * For ``i`` equal to ``16`` the resulting pattern is ``(?:a|b|c+)(d)`` 253 | 254 | 255 | Pattern chaining 256 | ================== 257 | Apart from PRegEx's standard pattern-building API which involves 258 | wrapping strings and/or Pregex instances within other Pregex instances, 259 | there also exists a more functional-like approach to constructing patterns. 260 | More specifically, every Pregex instance has access to a number of methods 261 | that can be used so as to apply basic RegEx operators to its underlying 262 | pattern, through which process a brand new Pregex instance is generated. 263 | 264 | .. code-block:: python 265 | 266 | from pregex.core.classes import AnyLetter 267 | from pregex.core.quantifiers import Optional 268 | 269 | letter = AnyLetter() 270 | 271 | # Both statements are equivalent. 272 | optional_letter_1 = Optional(letter) 273 | optional_letter_2 = letter.optional() 274 | 275 | By chaining many of these methods together, it is also possible 276 | to construct more complex patterns. This technique is called 277 | *pattern chaining*: 278 | 279 | ..
code-block:: python 280 | 281 | from pregex.core.pre import Pregex 282 | 283 | pre = Pregex() \ 284 | .concat('a') \ 285 | .either('b') \ 286 | .one_or_more() \ 287 | .concat('c') \ 288 | .optional() \ 289 | .concat('d') \ 290 | .match_at_line_start() \ 291 | .match_at_line_end() 292 | 293 | pre.print_pattern() # This prints '^(?:(?:a|b)+c)?d$' 294 | 295 | It is generally recommended that you use the standard API when dealing 296 | with larger patterns, as it provides a way of building patterns that is 297 | usually easier to read. Be that as it may, there do exist several cases 298 | where pattern chaining is the better choice of the two. In the end, it's 299 | just a matter of choice! 300 | 301 | Check out :class:`~pregex.core.pre.Pregex` to learn what other methods this class 302 | has to offer. -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/assertions.rst: -------------------------------------------------------------------------------- 1 | *********************** 2 | pregex.core.assertions 3 | *********************** 4 | .. automodule:: pregex.core.assertions 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/classes.rst: -------------------------------------------------------------------------------- 1 | 2 | ********************* 3 | pregex.core.classes 4 | ********************* 5 | .. automodule:: pregex.core.classes 6 | :members: 7 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/groups.rst: -------------------------------------------------------------------------------- 1 | ********************* 2 | pregex.core.groups 3 | ********************* 4 | ..
automodule:: pregex.core.groups 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/operators.rst: -------------------------------------------------------------------------------- 1 | ********************* 2 | pregex.core.operators 3 | ********************* 4 | .. automodule:: pregex.core.operators 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/pre.rst: -------------------------------------------------------------------------------- 1 | ********************* 2 | pregex.core.pre 3 | ********************* 4 | .. automodule:: pregex.core.pre 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/quantifiers.rst: -------------------------------------------------------------------------------- 1 | ************************* 2 | pregex.core.quantifiers 3 | ************************* 4 | .. automodule:: pregex.core.quantifiers 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/core/tokens.rst: -------------------------------------------------------------------------------- 1 | ********************* 2 | pregex.core.tokens 3 | ********************* 4 | .. automodule:: pregex.core.tokens 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/modules/meta/essentials.rst: -------------------------------------------------------------------------------- 1 | *********************** 2 | pregex.meta.essentials 3 | *********************** 4 | .. 
automodule:: pregex.meta.essentials 5 | :members: 6 | :undoc-members: -------------------------------------------------------------------------------- /docs/source/documentation/subpackages.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | Subpackages 3 | ############# 4 | 5 | PRegEx's modules are divided into two subpackages, namely ``pregex.core`` and 6 | ``pregex.meta``, the former of which predominantly contains modules whose classes 7 | represent some fundamental RegEx operator, whereas the latter acts as a collection 8 | of various classes that build upon those within the core modules in order to provide 9 | ready-made patterns that can be used "straight out of the box". 10 | 11 | pregex.core 12 | ================= 13 | 14 | In order to better understand *core* modules, consider for example 15 | :py:mod:`pregex.core.quantifiers`, all classes of which correspond 16 | to a unique RegEx quantifier: 17 | 18 | .. code-block:: python 19 | 20 | from pregex.core.quantifiers import * 21 | 22 | Optional # Represents quantifier '?' 23 | Indefinite # Represents quantifier '*' 24 | OneOrMore # Represents quantifier '+' 25 | Exactly # Represents quantifier '{n}' 26 | AtLeast # Represents quantifier '{n,}' 27 | AtMost # Represents quantifier '{,n}' 28 | AtLeastAtMost # Represents quantifier '{n,m}' 29 | 30 | However, not all core modules contain classes that represent some specific 31 | RegEx operator. There is the :py:mod:`pregex.core.tokens` module, whose 32 | classes act as wrappers for various single-character patterns. That is, either 33 | to protect you from any character-escape-related issues that may arise due 34 | to using raw strings containing backslashes, or to save you the trouble of looking 35 | for a specific symbol's Unicode code point, provided of course that there is a 36 | corresponding *Token* class for that symbol. 37 | 38 | .. 
code-block:: python 39 | 40 | from pregex.core.tokens import Newline, Copyright 41 | 42 | # Both of these statements are 'True'. 43 | Newline().is_exact_match('\n') 44 | Copyright().is_exact_match('©') 45 | 46 | 47 | Lastly, there is module :py:mod:`pregex.core.classes` which does not only 48 | offer a number of commonly used RegEx character classes, but a complete 49 | framework for working on these classes as if they were regular sets. 50 | 51 | .. code-block:: python 52 | 53 | from pregex.core.classes import AnyLetter, AnyDigit 54 | 55 | letter = AnyLetter() # Represents '[A-Za-z]' 56 | digit_but_five = AnyDigit() - '5' # Represents '[0-46-9]' 57 | letter_or_digit_but_five = letter | digit_but_five # Represents '[A-Za-z0-46-9]' 58 | any_but_letter_or_digit_but_five = ~ letter_or_digit_but_five # Represents '[^A-Za-z0-46-9]' 59 | 60 | Click on any one of pregex's *core* modules below to check out its classes: 61 | 62 | .. toctree:: 63 | :maxdepth: 1 64 | 65 | modules/core/assertions 66 | modules/core/classes 67 | modules/core/groups 68 | modules/core/operators 69 | modules/core/pre 70 | modules/core/quantifiers 71 | modules/core/tokens 72 | 73 | pregex.meta 74 | ================= 75 | 76 | Unlike *core* modules, whose classes are all independent from each other, 77 | *meta* modules contain classes that effectively combine various 78 | :class:`~pregex.core.pre.Pregex` instances together in order to form 79 | complex patterns that you can then use. Consider for example 80 | :class:`~pregex.meta.essentials.Integer` which enables you to 81 | match any integer within a specified range. 82 | 83 | .. 
code-block:: python 84 | 85 | from pregex.meta.essentials import Integer 86 | 87 | text = "1 5 11 23 77 117 512 789 1011" 88 | 89 | pre = Integer(start=50, end=1000) 90 | 91 | print(pre.get_matches(text)) # This prints "['77', '117', '512', '789']" 92 | 93 | Classes in *meta* modules therefore offer various patterns that can be useful, 94 | but at the same time hard to build. And remember, no matter the complexity of 95 | a pattern, it remains a Pregex instance, and as such, it can always be 96 | extended even further! 97 | 98 | .. code-block:: python 99 | 100 | from pregex.core.classes import AnyLetter 101 | from pregex.meta.essentials import Integer 102 | 103 | pre = AnyLetter() + Integer(start=50, end=1000, is_extensible=True) 104 | text = "a1 b5 c11 d23 e77 f117 g512 h789 i1011" 105 | 106 | print(pre.get_matches(text)) # This prints "['e77', 'f117', 'g512', 'h789']" 107 | 108 | Just don't forget to set parameter ``is_extensible`` to ``True``, as 109 | this prevents some additional assertions from being applied to the 110 | pattern. Even though these assertions are essential in order for it to be able 111 | to match what it is supposed to, they might at the same time introduce 112 | certain complications when it comes to the pattern serving as a building 113 | block of a larger pattern. 114 | 115 | Click on any one of pregex's *meta* modules below to check out its classes: 116 | 117 | .. toctree:: 118 | :maxdepth: 1 119 | 120 | modules/meta/essentials -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ****************************************** 2 | PRegEx - Programmable Regular Expressions 3 | ****************************************** 4 | 5 | Welcome to pregex's documentation page!
You can start by going through the 6 | `Introduction <introduction.html>`_ section in order to get a first look at the 7 | pregex package, as well as receive instructions on how to install it. After that, 8 | you are free to explore the `Documentation <documentation/covering-the-basics.html>`_ 9 | section so that you learn more about building RegEx patterns with pregex, 10 | or you can even check out the source code itself by visiting pregex on 11 | `GitHub <https://github.com/manoss96/pregex>`_. 12 | 13 | ================================== 14 | 15 | .. toctree:: 16 | :maxdepth: 1 17 | :caption: Introduction 18 | 19 | introduction 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | :caption: Documentation 24 | 25 | documentation/covering-the-basics 26 | documentation/subpackages 27 | documentation/best-practices -------------------------------------------------------------------------------- /docs/source/introduction.rst: -------------------------------------------------------------------------------- 1 | .. _introduction: 2 | 3 | ******************* 4 | What is PRegEx? 5 | ******************* 6 | 7 | Let's face it, although RegEx is without a doubt an extremely useful tool, its syntax has been repeatedly proven to be quite hard for people to read and to memorize. This is mainly due to RegEx's declarative nature, which many programmers are not familiar with, as well as its extensive use of symbols that do not inherently relate to their functionality within a RegEx pattern, thus rendering them easy to forget. To make matters even worse, RegEx patterns are more often than not tightly packed with large amounts of information, which our brains just seem to be struggling to break down in order to analyze effectively. For these reasons, building even a simple RegEx pattern for matching URLs can prove to be quite a painful task. 8 | 9 | This is where PRegEx comes in! PRegEx, which stands for Programmable Regular Expressions, is a Python package that can be used in order to construct Regular Expression patterns in a more human-friendly way.
Through the use of PRegEx, one is able to fully utilize the powerful tool that is RegEx without having to deal with any of its nuisances that seem to drive people crazy! PRegEx achieves that by offering the following: 10 | 11 | 1. An easy-to-remember syntax that resembles the good ol' imperative way of programming! 12 | 2. No longer having to group patterns or escape meta characters, as both are handled internally by PRegEx! 13 | 3. Modularity to building RegEx patterns, as one can easily break down a complex pattern into multiple simpler ones which can then be combined together. 14 | 4. A higher-level API on top of Python's built-in "re" module, providing access to its core functionality and more, while saving you the trouble of having to deal with "re.Match" instances. 15 | 16 | And remember, no matter how complex the abstraction, it's always just a pure 17 | RegEx pattern that sits underneath which you can fetch and use any way you like! 18 | 19 | ******************* 20 | Installation 21 | ******************* 22 | 23 | You can start using PRegEx by installing it via pip. Note that "pregex" requires Python >= 3.9. 24 | 25 | .. code-block:: 26 | 27 | pip install pregex 28 | 29 | ******************* 30 | Usage Example 31 | ******************* 32 | 33 | In PRegEx, everything is a Programmable Regular Expression, or `Pregex` for short. This makes it easy for simple :class:`~pregex.core.pre.Pregex` instances to be combined into more complex ones! Within the code snippet below, we construct a Pregex instance that will match any URL that ends with either ".com" or ".org" as well as any IP address for which a 4-digit port number is specified. Furthermore, in the case of a URL, we would like for its domain name to be separately captured as well. 34 | 35 | .. 
code-block:: python 36 | 37 | from pregex.core.classes import AnyLetter, AnyDigit, AnyFrom 38 | from pregex.core.quantifiers import Optional, AtLeastAtMost 39 | from pregex.core.operators import Either 40 | from pregex.core.groups import Capture 41 | from pregex.core.pre import Pregex 42 | 43 | http_protocol = Optional('http' + Optional('s') + '://') 44 | 45 | www = Optional('www.') 46 | 47 | alphanum = AnyLetter() | AnyDigit() 48 | 49 | domain_name = \ 50 | alphanum + \ 51 | AtLeastAtMost(alphanum | AnyFrom('-', '.'), n=1, m=61) + \ 52 | alphanum 53 | 54 | tld = '.' + Either('com', 'org') 55 | 56 | ip_octet = AnyDigit().at_least_at_most(n=1, m=3) 57 | 58 | port_number = (AnyDigit() - '0') + 3 * AnyDigit() 59 | 60 | # Combine sub-patterns together. 61 | pre: Pregex = \ 62 | http_protocol + \ 63 | Either( 64 | www + Capture(domain_name) + tld, 65 | 3 * (ip_octet + '.') + ip_octet + ':' + port_number 66 | ) 67 | 68 | We can then easily fetch the resulting Pregex instance's underlying RegEx pattern. 69 | 70 | .. code-block:: python 71 | 72 | regex = pre.get_pattern() 73 | 74 | 75 | This is the pattern that we just built. Yikes! 76 | 77 | .. code-block:: 78 | 79 | (?:https?:\/\/)?(?:(?:www\.)?([A-Za-z\d][A-Za-z\d\-.]{1,61}[A-Za-z\d])\.(?:com|org)|(?:\d{1,3}\.){3}\d{1,3}:[1-9]\d{3}) 80 | 81 | 82 | Besides having access to its underlying pattern, we can use a Pregex instance to find matches within a piece of text. Consider for example the following string: 83 | 84 | .. code-block:: python 85 | 86 | text = "text--192.168.1.1:8000--text--http://www.wikipedia.org--text--https://youtube.com--text" 87 | 88 | By invoking the instance's :py:meth:`~pregex.core.pre.Pregex.get_matches` method, we are able to scan the above string for any possible matches: 89 | 90 | .. code-block:: python 91 | 92 | matches = pre.get_matches(text) 93 | 94 | 95 | Looks like there were three matches: 96 | 97 | ..
code-block:: python 98 | 99 | ['192.168.1.1:8000', 'http://www.wikipedia.org', 'https://youtube.com'] 100 | 101 | 102 | Likewise, we can invoke the instance's :py:meth:`~pregex.core.pre.Pregex.get_captures` method to get any captured groups. 103 | 104 | .. code-block:: python 105 | 106 | groups = pre.get_captures(text) 107 | 108 | As expected, there were only two captured groups since the first match is not a URL and therefore it does not 109 | contain a domain name to be captured. 110 | 111 | .. code-block:: python 112 | 113 | [(None,), ('wikipedia',), ('youtube',)] 114 | 115 | Finally, you might have noticed that we built our pattern by utilizing 116 | various classes that were imported from modules under *pregex.core*. These 117 | modules contain classes through which the RegEx syntax is essentially replaced. 118 | However, PRegEx also includes another set of modules, namely those under 119 | subpackage *pregex.meta*, whose classes build upon those in *pregex.core* so 120 | as to provide numerous pre-built patterns that you can just import and use 121 | right away! 122 | 123 | .. code-block:: python 124 | 125 | from pregex.core.pre import Pregex 126 | from pregex.core.classes import AnyDigit 127 | from pregex.core.operators import Either 128 | from pregex.meta.essentials import HttpUrl, IPv4 129 | 130 | port_number = (AnyDigit() - '0') + 3 * AnyDigit() 131 | 132 | pre: Pregex = Either( 133 | HttpUrl(capture_domain=True, is_extensible=True), 134 | IPv4(is_extensible=True) + ':' + port_number 135 | ) 136 | 137 | By using classes found within the *pregex.meta* subpackage, we were able to 138 | construct more or less the same pattern as before only much more easily! 139 | 140 | 141 | *************************** 142 | Solving Wordle with PRegEx 143 | *************************** 144 | 145 | We are now going to see another example that better exhibits the *programmable* nature of PRegEx.
146 | More specifically, we will be creating a Wordle solver function that, given all currently known 147 | information as well as access to a 5-letter word dictionary, utilizes PRegEx in order to return 148 | a list of candidate words to choose from as a possible solution to the problem. 149 | 150 | Formulating what is known 151 | ------------------------------ 152 | 153 | First things first, we must think of a way to represent what is known so far regarding the 154 | word that we're trying to guess. This information can be encapsulated into three distinct 155 | sets of letters: 156 | 157 | 1. **Green letters**: Letters that are included in the word, whose position within it is known. 158 | 2. **Yellow letters**: Letters that are included in the word, and while their exact position is 159 | unknown, there is one or more positions which we can rule out. 160 | 3. **Gray letters**: Letters that are not included in the word. 161 | 162 | Green letters can be represented by using a dictionary that maps integers (positions) to strings (letters). 163 | For example, ``{4 : 'T'}`` indicates that the word we are looking for contains the letter ``T`` in its 164 | fourth position. Yellow letters can also be represented as a dictionary with integer keys, whose values 165 | however are going to be lists of strings instead of regular strings, as a position might have been ruled 166 | out for more than a single letter. For example, ``{1 : ['A', 'R'], 3 : ['P']}`` indicates that even though 167 | the word contains letters ``A``, ``R`` and ``P``, it cannot start with either an ``A`` or an ``R`` as 168 | well as it cannot have the letter ``P`` occupying its third position. Finally, gray letters can be simply 169 | stored in a list. 170 | 171 | In order to have a concrete example to work with, we will be assuming that our current 172 | information about the problem is expressed by the following three data structures: 173 | 174 | .. 
code-block:: python 175 | 176 | green: dict[int, str] = {4 : 'T'} 177 | yellow: dict[int, list[str]] = {1 : ['A', 'R'], 3 : ['P']} 178 | gray: list[str] = ['C', 'D', 'L', 'M', 'N', 'Q', 'U'] 179 | 180 | 181 | Initializing a Pregex class instance 182 | ---------------------------------------- 183 | 184 | Having come up with a way of programmatically formulating the problem, the first step towards 185 | actually solving it would be to create a ``Pregex`` class instance: 186 | 187 | .. code-block:: python 188 | 189 | wordle = Pregex() 190 | 191 | Since we aren't providing a ``pattern`` parameter to the class's constructor, it automatically 192 | defaults to the empty string ``''``. Thus, through this instance we now have access to all methods 193 | of the ``Pregex`` class, though we are not really able to match anything with it yet. 194 | 195 | 196 | Yellow letter assertions 197 | ---------------------------------------- 198 | 199 | Before we go on to dictate what the valid letters for each position within the word 200 | are, we are first going to deal with yellow letters, that is, letters which we know are 201 | included in the word that we are looking for, though their position is still uncertain. 202 | Since we know for a fact that the sought out word contains these letters, we have to 203 | somehow make sure that any candidate word includes them as well. This can easily be 204 | done by using what is known in RegEx lingo as a *positive lookahead assertion*, 205 | represented in PRegEx by the less intimidating *FollowedBy*! Assertions are used in 206 | order to *assert* something about a pattern without really having to *match* any additional 207 | characters. A positive lookahead assertion, in particular, dictates that the pattern to which 208 | it is applied must be followed by some other pattern in order for the former to constitute 209 | a valid match. 
210 | 211 | In PRegEx, one is able to create a ``Pregex`` instance out of applying a positive 212 | lookahead assertion to some pattern ``p1`` by doing the following: 213 | 214 | .. code-block:: python 215 | 216 | from pregex.core.assertions import FollowedBy 217 | 218 | pre = FollowedBy(p1, p2) 219 | 220 | where both ``p1`` and ``p2`` are either strings or ``Pregex`` instances. Furthermore, in the 221 | case that ``p1`` already is a ``Pregex`` class instance, one can achieve the same result with: 222 | 223 | .. code-block:: python 224 | 225 | pre = p1.followed_by(p2) 226 | 227 | 228 | Having initialized ``wordle`` as a ``Pregex`` instance, we can simply do 229 | ``wordle.followed_by(some_pattern)`` so as to indicate that any potential match 230 | with ``wordle`` must be followed by ``some_pattern``. Recall that ``wordle`` merely 231 | represents the empty string, so we are not really matching anything at this point. 232 | Applying an assertion to the empty string pattern is just a neat little trick one 233 | can use in order to validate something about their pattern before they even begin 234 | to build it. 235 | 236 | Now it's just a matter of figuring out what the value of ``some_pattern`` is. 237 | Surely we can't just do ``wordle = wordle.followed_by(letter)``, as this results 238 | in ``letter`` always having to be at the beginning of the word. Here's however what 239 | we can do: It follows from the rules of Wordle that all words must be comprised of five 240 | letters, any of which is potentially a yellow letter. Thus, every yellow letter is certain 241 | to be preceded by up to four other letters, but no more than that. Therefore, we need a 242 | pattern that represents just that, namely *four letters at most*. By applying quantifier 243 | ``at_most(n=4)`` to an instance of ``AnyUppercaseLetter()``, we are able to create such 244 | a pattern. Add a yellow letter to its right and we have our ``some_pattern``. 
Since there 245 | may be more than one yellow letter, we make sure that we iterate over them all one by one so 246 | as to enforce a separate assertion for each: 247 | 248 | .. code-block:: python 249 | 250 | from pregex.core.classes import AnyUppercaseLetter 251 | 252 | yellow_letters_list: list[str] = [l for letter_list in yellow.values() for l in letter_list] 253 | 254 | at_most_four_letters = AnyUppercaseLetter().at_most(n=4) 255 | 256 | for letter in yellow_letters_list: 257 | wordle = wordle.followed_by(at_most_four_letters + letter) 258 | 259 | By executing the above code snippet we get a ``Pregex`` instance which 260 | represents the following RegEx pattern: 261 | 262 | .. code-block:: 263 | 264 | (?=[A-Z]{,4}A)(?=[A-Z]{,4}R)(?=[A-Z]{,4}P) 265 | 266 | Building valid character classes 267 | ---------------------------------------- 268 | 269 | After we have made sure that our pattern will reject any words that do not contain 270 | all the yellow letters, we can finally start building the part of the pattern that 271 | will handle the actual matching. This can easily be achieved by performing five 272 | iterations, one for each letter of the word, where at each iteration ``i`` we 273 | construct a new character class, which is then appended to our pattern based 274 | on the following logic: 275 | 276 | * If the letter that corresponds to the word's i-th position is known, then 277 | make it so that the pattern only matches that letter at that position. 278 | 279 | * If the letter that corresponds to the word's i-th position is not known, 280 | then make it so that the pattern matches any letter except for gray letters, 281 | green letters, as well as any yellow letters that may have been ruled out for 282 | that exact position. 283 | 284 | The following code snippet does just that: 285 | 286 | .. 
code-block:: python 287 | 288 | from pregex.core.classes import AnyFrom 289 | 290 | for i in range(1, 6): 291 | if i in green: 292 | wordle += green[i] 293 | else: 294 | invalid_chars_at_pos_i = gray + list(green.values()) 295 | if i in yellow: 296 | invalid_chars_at_pos_i += yellow[i] 297 | wordle += AnyUppercaseLetter() - AnyFrom(*invalid_chars_at_pos_i) 298 | 299 | After executing the above code, ``wordle`` will contain the following 300 | RegEx pattern: 301 | 302 | .. code-block:: 303 | 304 | (?=[A-Z]{,4}A)(?=[A-Z]{,4}R)(?=[A-Z]{,4}P)[BE-KOPSV-Z][ABE-KOPRSV-Z][ABE-KORSV-Z]T[ABE-KOPRSV-Z] 305 | 306 | Matching from a dictionary 307 | --------------------------- 308 | 309 | Having built our pattern, the only thing left to do is to actually use it to 310 | match candidate words. Provided that we have access to a text file containing 311 | all possible Wordle words, we are able to invoke our ``Pregex`` instance's 312 | ``get_matches`` method in order to scan said text file for any potential matches. 313 | 314 | .. code-block:: python 315 | 316 | words = wordle.get_matches('word_dictionary.txt', is_path=True) 317 | 318 | Putting it all together 319 | ---------------------------------------- 320 | 321 | Finally, we combine together everything we discussed into a single function that 322 | spews out a list of words which satisfy all necessary conditions so that they 323 | constitute possible solutions to the problem. 324 | 325 | .. code-block:: python 326 | 327 | def wordle_solver(green: dict[int, str], yellow: dict[int, list[str]], gray: list[str]) -> list[str]: 328 | 329 | from pregex.core.pre import Pregex 330 | from pregex.core.classes import AnyUppercaseLetter, AnyFrom 331 | 332 | # Initialize pattern as the empty string pattern. 333 | wordle = Pregex() 334 | 335 | # This part ensures that yellow letters 336 | # will appear at least once within the word. 
337 | yellow_letters_list = [l for letter_list in yellow.values() for l in letter_list] 338 | at_most_four_letters = AnyUppercaseLetter().at_most(n=4) 339 | for letter in yellow_letters_list: 340 | wordle = wordle.followed_by(at_most_four_letters + letter) 341 | 342 | # This part actually dictates the set of valid letters 343 | # for each position within the word. 344 | for i in range(1, 6): 345 | if i in green: 346 | wordle += green[i] 347 | else: 348 | invalid_chars_at_pos_i = gray + list(green.values()) 349 | if i in yellow: 350 | invalid_chars_at_pos_i += yellow[i] 351 | wordle += AnyUppercaseLetter() - AnyFrom(*invalid_chars_at_pos_i) 352 | 353 | # Match candidate words from dictionary and return them in a list. 354 | return wordle.get_matches('word_dictionary.txt', is_path=True) 355 | 356 | 357 | By invoking the above function we get the following list of words: 358 | 359 | .. code-block:: python 360 | 361 | word_candidates = wordle_solver(green, yellow, gray) 362 | 363 | print(word_candidates) # This prints ['PARTY'] 364 | 365 | Looks like there is only one candidate word, which means that we 366 | can consider our problem solved! 367 | 368 | You can learn more about PRegEx by going through the 369 | `Documentation <https://pregex.rtfd.io>`_ 370 | section or by directly visiting PRegEx on 371 | `Github <https://github.com/manoss96/pregex>`_ 372 | in order to check out the source code itself. 
-------------------------------------------------------------------------------- /docs/source/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/manoss96/pregex/33861d8e141116c670d4765ed2fdbc0eaf077114/docs/source/logo.png -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==5.3.0 2 | sphinx-rtd-theme==1.1.1 3 | readthedocs-sphinx-ext==2.2.0 4 | sphinxcontrib-applehelp==1.0.2 5 | sphinxcontrib-devhelp==1.0.2 6 | sphinxcontrib-htmlhelp==2.0.0 7 | sphinxcontrib-jsmath==1.0.1 8 | sphinxcontrib-qthelp==1.0.3 9 | sphinxcontrib-serializinghtml==1.1.5 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pregex" 3 | version = "2.3.3" 4 | authors = [ 5 | {email = "manosstoumpos@gmail.com"}, 6 | {name = "Manos Stoumpos"} 7 | ] 8 | description = "PRegEx - Programmable Regular Expressions" 9 | keywords = ["regex"] 10 | readme = "README.md" 11 | license = {text = "MIT"} 12 | requires-python = ">=3.9" 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | ] 18 | 19 | [project.urls] 20 | "Homepage" = "https://github.com/manoss96/pregex" 21 | "Bug Tracker" = "https://github.com/manoss96/pregex/issues" 22 | "Documentation" = "https://pregex.rtfd.io" -------------------------------------------------------------------------------- /src/pregex/__init__.py: -------------------------------------------------------------------------------- 1 | from pregex.core import * 2 | from pregex.meta import * -------------------------------------------------------------------------------- /src/pregex/core/__init__.py: 
-------------------------------------------------------------------------------- 1 | from pregex.core import assertions as asr 2 | from pregex.core import classes as cl 3 | from pregex.core import groups as gr 4 | from pregex.core import operators as op 5 | from pregex.core import quantifiers as qu 6 | from pregex.core import tokens as tk 7 | from pregex.core.pre import Pregex -------------------------------------------------------------------------------- /src/pregex/core/assertions.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | All classes within this module "assert" something about the provided pattern 3 | without having to match any additional characters. For example, :class:`MatchAtStart` 4 | ensures that the provided pattern matches only when it is found at the start of the string, 5 | while :class:`NotFollowedBy` asserts that a match must not be followed by one or more 6 | specified patterns. Another thing you should keep in mind is that many of these assertions 7 | cannot be repeated, as attempting that will cause a ``CannotBeRepeatedException`` exception 8 | to be thrown. 9 | 10 | Classes & methods 11 | ------------------------------------------- 12 | 13 | Below are listed all classes within :py:mod:`pregex.core.assertions` 14 | along with any possible methods they may possess. 15 | """ 16 | 17 | 18 | import pregex.core.pre as _pre 19 | import pregex.core.exceptions as _ex 20 | from typing import Union as _Union 21 | 22 | 23 | class __Assertion(_pre.Pregex): 24 | ''' 25 | Constitutes the base class for `__Anchor` and `__Lookaround` classes. 26 | 27 | :param str pattern: The RegEx pattern which represents the assertion. 28 | ''' 29 | def __init__(self, pattern: str): 30 | ''' 31 | Constitutes the base class for `__Anchor` and `__Lookaround` classes. 32 | 33 | :param str pattern: The RegEx pattern which represents the assertion. 
34 | ''' 35 | super().__init__(pattern, escape=False) 36 | 37 | 38 | class __Anchor(__Assertion): 39 | ''' 40 | Constitutes the base class for all `anchor` classes that are part of this module. 41 | 42 | :param Pregex | str pre: A Pregex instance or string representing the `anchor` pattern. 43 | :param (Pregex => str) transform: A `transform` function for the provided pattern. 44 | 45 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a ``Pregex`` instance \ 46 | nor a string. 47 | ''' 48 | def __init__(self, pre: _Union[_pre.Pregex, str], transform): 49 | ''' 50 | Constitutes the base class for all `anchor` classes that are part of this module. 51 | 52 | :param Pregex | str pre: A Pregex instance or string representing the `anchor` pattern. 53 | :param (Pregex => str) transform: A `transform` function for the provided pattern. 54 | 55 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a ``Pregex`` instance \ 56 | nor a string. 57 | ''' 58 | super().__init__(str(transform(__class__._to_pregex(pre)))) 59 | 60 | 61 | class __Lookaround(__Assertion): 62 | ''' 63 | Constitutes the base class for all "Lookaround" classes. 64 | 65 | :param Pregex | str pres: Two or more Pregex instances, the first of which always \ 66 | represents the `match` pattern, while the rest constitute `assertion` patterns. 67 | :param (tuple[Pregex | str] => str) transform: A `transform` function for the provided patterns. 68 | 69 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 70 | :raises EmptyNegativeAssertionException: The empty string is provided \ 71 | as one of the assertion patterns. 72 | ''' 73 | def __init__(self, pres: tuple[_Union[_pre.Pregex, str]], transform) -> _pre.Pregex: 74 | ''' 75 | Constitutes the base class for all "Lookaround" classes. 
76 | 77 | :param Pregex | str pres: Two or more Pregex instances, the first of which always \ 78 | represents the `match` pattern, while the rest constitute `assertion` patterns. 79 | :param (tuple[Pregex | str] => str) transform: A `transform` function for the provided patterns. 80 | 81 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 82 | :raises EmptyNegativeAssertionException: The empty string is provided \ 83 | as one of the assertion patterns. 84 | ''' 85 | if len(pres) < 2: 86 | message = "At least one assertion pattern is required." 87 | raise _ex.NotEnoughArgumentsException(message) 88 | result = __class__._to_pregex(pres[0]) 89 | for pre in pres[1:]: 90 | result = transform(result, pre) 91 | super().__init__(str(result)) 92 | 93 | 94 | class MatchAtStart(__Anchor): 95 | ''' 96 | Matches the provided pattern only if it is at the start of the string. 97 | 98 | :param Pregex | str pre: The pattern that is to be matched. 99 | 100 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 101 | ``Pregex`` instance nor a string. 102 | 103 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 104 | ''' 105 | 106 | def __init__(self, pre: _Union[_pre.Pregex, str]): 107 | ''' 108 | Matches the provided pattern only if it is at the start of the string. 109 | 110 | :param Pregex | str pre: The pattern that is to be matched. 111 | 112 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 113 | ``Pregex`` instance nor a string. 114 | 115 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 116 | ''' 117 | super().__init__(pre, lambda pre: pre.match_at_start()) 118 | 119 | 120 | class MatchAtEnd(__Anchor): 121 | ''' 122 | Matches the provided pattern only if it is at the end of the string. 123 | 124 | :param Pregex | str pre: The pattern that is to be matched. 
125 | 126 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 127 | ``Pregex`` instance nor a string. 128 | 129 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 130 | ''' 131 | 132 | def __init__(self, pre: _Union[_pre.Pregex, str]): 133 | ''' 134 | Matches the provided pattern only if it is at the end of the string. 135 | 136 | :param Pregex | str pre: The pattern that is to be matched. 137 | 138 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 139 | ``Pregex`` instance nor a string. 140 | 141 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 142 | ''' 143 | super().__init__(pre, lambda pre: pre.match_at_end()) 144 | 145 | 146 | class MatchAtLineStart(__Anchor): 147 | ''' 148 | Matches the provided pattern only if it is at the start of a line. 149 | 150 | :param Pregex | str pre: The pattern that is to be matched. 151 | 152 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 153 | ``Pregex`` instance nor a string. 154 | 155 | :note: 156 | - The resulting pattern cannot have a repeating quantifier applied to it. 157 | - Uses meta character ``^`` since the `MULTILINE` flag is considered on. 158 | ''' 159 | 160 | def __init__(self, pre: _Union[_pre.Pregex, str]): 161 | ''' 162 | Matches the provided pattern only if it is at the start of a line. 163 | 164 | :param Pregex | str pre: The pattern that is to be matched. 165 | 166 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 167 | ``Pregex`` instance nor a string. 168 | 169 | :note: 170 | - The resulting pattern cannot have a repeating quantifier applied to it. 171 | - Uses meta character ``^`` since the `MULTILINE` flag is considered on. 172 | ''' 173 | super().__init__(pre, lambda pre: pre.match_at_line_start()) 174 | 175 | 176 | class MatchAtLineEnd(__Anchor): 177 | ''' 178 | Matches the provided pattern only if it is at the end of a line. 
179 | 180 | :param Pregex | str pre: The pattern that is to be matched. 181 | 182 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 183 | ``Pregex`` instance nor a string. 184 | 185 | :note: 186 | - The resulting pattern cannot have a repeating quantifier applied to it. 187 | - Uses meta character ``$`` since the `MULTILINE` flag is considered on. 188 | ''' 189 | 190 | def __init__(self, pre: _Union[_pre.Pregex, str]): 191 | ''' 192 | Matches the provided pattern only if it is at the end of a line. 193 | 194 | :param Pregex | str pre: The pattern that is to be matched. 195 | 196 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 197 | ``Pregex`` instance nor a string. 198 | 199 | :note: 200 | - The resulting pattern cannot have a repeating quantifier applied to it. 201 | - Uses meta character ``$`` since the `MULTILINE` flag is considered on. 202 | ''' 203 | super().__init__(pre, lambda pre: pre.match_at_line_end()) 204 | 205 | 206 | class WordBoundary(__Anchor): 207 | ''' 208 | Asserts that the position, at which an instance of this class is placed, \ 209 | must constitute a word boundary. 210 | ''' 211 | 212 | def __init__(self): 213 | ''' 214 | Asserts that the position, at which an instance of this class is placed, \ 215 | must constitute a word boundary. 216 | ''' 217 | super().__init__(_pre.Pregex(), lambda pre: pre.concat(_pre.Pregex("\\b", escape=False))) 218 | 219 | 220 | class NonWordBoundary(__Anchor): 221 | ''' 222 | Asserts that the position, at which an instance of this class is placed, \ 223 | must not constitute a word boundary. 224 | ''' 225 | 226 | def __init__(self): 227 | ''' 228 | Asserts that the position, at which an instance of this class is placed, \ 229 | must not constitute a word boundary. 
230 | ''' 231 | super().__init__(_pre.Pregex(), lambda pre: pre.concat(_pre.Pregex("\\B", escape=False))) 232 | 233 | 234 | class FollowedBy(__Lookaround): 235 | ''' 236 | Matches pattern ``match`` only if it is directly followed \ 237 | by all of the provided ``assertion`` patterns. 238 | 239 | :param Pregex | str match: A Pregex instance or string \ 240 | representing the `match` pattern. 241 | :param Pregex | str \*assertions: One or more patterns, all of which must \ 242 | come right after pattern ``match`` in order for it to be considered a match. 243 | 244 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 245 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 246 | is neither a ``Pregex`` instance nor a string. 247 | 248 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 249 | ''' 250 | 251 | def __init__(self, match: _Union[_pre.Pregex, str], *assertions: _Union[_pre.Pregex, str]): 252 | ''' 253 | Matches pattern ``match`` only if it is directly followed \ 254 | by all of the provided ``assertion`` patterns. 255 | 256 | :param Pregex | str match: A Pregex instance or string \ 257 | representing the `match` pattern. 258 | :param Pregex | str \*assertions: One or more patterns, all of which must \ 259 | come right after pattern ``match`` in order for it to be considered a match. 260 | 261 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 262 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 263 | is neither a ``Pregex`` instance nor a string. 264 | 265 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 266 | ''' 267 | super().__init__((match, *assertions), lambda pre1, pre2: pre1.followed_by(pre2)) 268 | 269 | 270 | class PrecededBy(__Lookaround): 271 | ''' 272 | Matches pattern ``match`` only if it is directly preceded \ 273 | by all of the provided ``assertion`` patterns. 
274 | 275 | :param Pregex | str match: A Pregex instance or string \ 276 | representing the `match` pattern. 277 | :param Pregex | str \*assertions: One or more patterns, all of which must \ 278 | come right before pattern ``match`` in order for it to be considered a match. 279 | 280 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 281 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 282 | is neither a ``Pregex`` instance nor a string. 283 | :raises NonFixedWidthPatternException: Parameter ``assertion`` \ 284 | corresponds to a pattern that does not have a fixed width. 285 | 286 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 287 | ''' 288 | 289 | def __init__(self, match: _Union[_pre.Pregex, str], *assertions: _Union[_pre.Pregex, str]): 290 | ''' 291 | Matches pattern ``match`` only if it is directly preceded \ 292 | by all of the provided ``assertion`` patterns. 293 | 294 | :param Pregex | str match: A Pregex instance or string \ 295 | representing the `match` pattern. 296 | :param Pregex | str \*assertions: One or more patterns, all of which must \ 297 | come right before pattern ``match`` in order for it to be considered a match. 298 | 299 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 300 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 301 | is neither a ``Pregex`` instance nor a string. 302 | :raises NonFixedWidthPatternException: Parameter ``assertion`` \ 303 | corresponds to a pattern that does not have a fixed width. 304 | 305 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 306 | ''' 307 | super().__init__((match, *assertions), lambda pre1, pre2: pre1.preceded_by(pre2)) 308 | 309 | 310 | class EnclosedBy(__Lookaround): 311 | ''' 312 | Matches pattern ``match`` only if it is both directly preceded \ 313 | and followed by all of the provided ``assertion`` patterns. 
314 | 315 | :param Pregex | str match: A Pregex instance or string \ 316 | representing the `match` pattern. 317 | :param Pregex | str \*assertions: One or more patterns, all of which must \ 318 | come both right before and right after pattern ``match`` in order for \ 319 | it to be considered a match. 320 | 321 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 322 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 323 | is neither a ``Pregex`` instance nor a string. 324 | :raises NonFixedWidthPatternException: Parameter ``assertion`` \ 325 | corresponds to a pattern that does not have a fixed width. 326 | 327 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 328 | ''' 329 | 330 | def __init__(self, match: _Union[_pre.Pregex, str], *assertions: _Union[_pre.Pregex, str]): 331 | ''' 332 | Matches pattern ``match`` only if it is both directly preceded \ 333 | and followed by all of the provided ``assertion`` patterns. 334 | 335 | :param Pregex | str match: A Pregex instance or string \ 336 | representing the `match` pattern. 337 | :param Pregex | str \*assertions: One or more patterns, all of which must \ 338 | come both right before and right after pattern ``match`` in order for \ 339 | it to be considered a match. 340 | 341 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 342 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 343 | is neither a ``Pregex`` instance nor a string. 344 | :raises NonFixedWidthPatternException: Parameter ``assertion`` \ 345 | corresponds to a pattern that does not have a fixed width. 346 | 347 | :note: The resulting pattern cannot have a repeating quantifier applied to it. 
348 | ''' 349 | super().__init__((match, *assertions), lambda pre1, pre2: pre1.enclosed_by(pre2)) 350 | 351 | 352 | class NotFollowedBy(__Lookaround): 353 | ''' 354 | Matches pattern ``match`` only if it is not directly followed by \ 355 | any one of the provided ``assertions`` patterns. 356 | 357 | :param Pregex | str match: The pattern that is to be matched. 358 | :param Pregex | str \*assertions: One or more patterns, none of which must \ 359 | come right after pattern ``match`` in order for it to be considered a match. 360 | 361 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 362 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 363 | is neither a ``Pregex`` instance nor a string. 364 | :raises EmptyNegativeAssertionException: At least one of the provided assertion \ 365 | patterns is the empty-string pattern. 366 | ''' 367 | 368 | def __init__(self, match: _Union[_pre.Pregex, str], *assertions: _Union[_pre.Pregex, str]): 369 | ''' 370 | Matches pattern ``match`` only if it is not directly followed by \ 371 | any one of the provided ``assertions`` patterns. 372 | 373 | :param Pregex | str match: The pattern that is to be matched. 374 | :param Pregex | str \*assertions: One or more patterns, none of which must \ 375 | come right after pattern ``match`` in order for it to be considered a match. 376 | 377 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 378 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 379 | is neither a ``Pregex`` instance nor a string. 380 | :raises EmptyNegativeAssertionException: At least one of the provided assertion \ 381 | patterns is the empty-string pattern. 
382 | ''' 383 | super().__init__((match, *assertions), 384 | lambda pre1, pre2: pre1.not_followed_by(pre2)) 385 | 386 | 387 | class NotPrecededBy(__Lookaround): 388 | ''' 389 | Matches pattern ``match`` only if it is not directly preceded by \ 390 | any one of the provided ``assertions`` patterns. 391 | 392 | :param Pregex | str match: The pattern that is to be matched. 393 | :param Pregex | str \*assertions: One or more patterns, none of which must \ 394 | come right before pattern ``match`` in order for it to be considered a match. 395 | 396 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 397 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 398 | is neither a ``Pregex`` instance nor a string. 399 | :raises EmptyNegativeAssertionException: At least one of the provided assertion \ 400 | patterns is the empty-string pattern. 401 | :raises NonFixedWidthPatternException: At least one of the provided assertion \ 402 | patterns does not have a fixed width. 403 | ''' 404 | 405 | def __init__(self, match: _Union[_pre.Pregex, str], *assertions: _Union[_pre.Pregex, str]): 406 | ''' 407 | Matches pattern ``match`` only if it is not directly preceded by \ 408 | any one of the provided ``assertions`` patterns. 409 | 410 | :param Pregex | str match: The pattern that is to be matched. 411 | :param Pregex | str \*assertions: One or more patterns, none of which must \ 412 | come right before pattern ``match`` in order for it to be considered a match. 413 | 414 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 415 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 416 | is neither a ``Pregex`` instance nor a string. 417 | :raises EmptyNegativeAssertionException: At least one of the provided assertion \ 418 | patterns is the empty-string pattern. 
419 | :raises NonFixedWidthPatternException: At least one of the provided assertion \ 420 | patterns does not have a fixed width. 421 | ''' 422 | super().__init__((match, *assertions), 423 | lambda pre1, pre2: pre1.not_preceded_by(pre2)) 424 | 425 | 426 | class NotEnclosedBy(__Lookaround): 427 | ''' 428 | Matches pattern ``match`` only if it is neither directly preceded \ 429 | nor followed by any one of the provided ``assertions`` patterns. 430 | 431 | :param Pregex | str match: The pattern that is to be matched. 432 | :param Pregex | str \*assertions: One or more patterns, none of which must \ 433 | come either right before or right after pattern ``match`` in order for \ 434 | it to be considered a match. 435 | 436 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 437 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 438 | is neither a ``Pregex`` instance nor a string. 439 | :raises EmptyNegativeAssertionException: At least one of the provided assertion \ 440 | patterns is the empty-string pattern. 441 | :raises NonFixedWidthPatternException: At least one of the provided assertion \ 442 | patterns does not have a fixed width. 443 | ''' 444 | 445 | def __init__(self, match: _Union[_pre.Pregex, str], *assertions: _Union[_pre.Pregex, str]): 446 | ''' 447 | Matches pattern ``match`` only if it is neither directly preceded \ 448 | nor followed by any one of the provided ``assertions`` patterns. 449 | 450 | :param Pregex | str match: The pattern that is to be matched. 451 | :param Pregex | str \*assertions: One or more patterns, none of which must \ 452 | come either right before or right after pattern ``match`` in order for \ 453 | it to be considered a match. 454 | 455 | :raises NotEnoughArgumentsException: No assertion patterns were provided. 456 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 457 | is neither a ``Pregex`` instance nor a string. 
458 | :raises EmptyNegativeAssertionException: At least one of the provided assertion \ 459 | patterns is the empty-string pattern. 460 | :raises NonFixedWidthPatternException: At least one of the provided assertion \ 461 | patterns does not have a fixed width. 462 | ''' 463 | super().__init__((match, *assertions), 464 | lambda pre1, pre2: pre1.not_enclosed_by(pre2)) -------------------------------------------------------------------------------- /src/pregex/core/exceptions.py: -------------------------------------------------------------------------------- 1 | class InvalidArgumentValueException(Exception): 2 | ''' 3 | This exception is thrown whenever an argument of invalid value is provided. 4 | 5 | :param str message: The message that is to be displayed \ 6 | along with the exception. 7 | ''' 8 | 9 | def __init__(self, message: str): 10 | ''' 11 | This exception is thrown whenever an argument of invalid value is provided. 12 | 13 | :param str message: The message that is to be displayed \ 14 | along with the exception. 15 | ''' 16 | super().__init__(message) 17 | 18 | 19 | class InvalidArgumentTypeException(Exception): 20 | ''' 21 | This exception is thrown whenever an argument of invalid type is provided. 22 | 23 | :param str message: The message that is to be displayed \ 24 | along with the exception. 25 | ''' 26 | 27 | def __init__(self, message: str): 28 | ''' 29 | This exception is thrown whenever an argument of invalid type is provided. 30 | 31 | :param str message: The message that is to be displayed \ 32 | along with the exception. 33 | ''' 34 | super().__init__(message) 35 | 36 | 37 | class NotEnoughArgumentsException(Exception): 38 | ''' 39 | This exception is thrown whenever an insufficient amount \ 40 | of arguments is provided. 41 | 42 | :param str message: The message that is to be displayed \ 43 | along with the exception. 
44 | ''' 45 | 46 | def __init__(self, message: str): 47 | ''' 48 | This exception is thrown whenever an insufficient amount \ 49 | of arguments is provided. 50 | 51 | :param str message: The message that is to be displayed \ 52 | along with the exception. 53 | ''' 54 | super().__init__(message) 55 | 56 | 57 | class InvalidCapturingGroupNameException(Exception): 58 | ''' 59 | This exception is thrown whenever an invalid name \ 60 | for a capturing group was provided. 61 | 62 | :param str name: The string type argument because of which this exception was thrown. 63 | ''' 64 | 65 | def __init__(self, name: str): 66 | ''' 67 | This exception is thrown whenever an invalid name \ 68 | for a capturing group was provided. 69 | 70 | :param str name: The string type argument because of which this exception was thrown. 71 | ''' 72 | super().__init__(f"Name \"{name}\" is not valid. A capturing group's " + 73 | "name must be an alphanumeric sequence that starts with a non-digit.") 74 | 75 | 76 | class CannotBeNegatedException(Exception): 77 | ''' 78 | This exception is thrown whenever one tries to negate class ``Any``. 79 | ''' 80 | def __init__(self): 81 | ''' 82 | This exception is thrown whenever one tries to negate class ``Any``. 83 | ''' 84 | super().__init__(f"Class \"Any\" cannot be negated.") 85 | 86 | 87 | class CannotBeUnionedException(Exception): 88 | ''' 89 | This exception is thrown whenever one tries to union a class (or negated class) \ 90 | either with a negated class (or regular class) or an object of different type. 91 | 92 | :param Pregex pre: The ``Pregex`` instance because of which this exception was thrown. 93 | :param bool are_both_classes: Indicates whether both ``Pregex`` instances are of \ 94 | type ``__Class``. 
95 | ''' 96 | 97 | def __init__(self, pre, are_both_classes: bool): 98 | ''' 99 | This exception is thrown whenever one tries to union a class (or negated class) \ 100 | either with a negated class (or regular class) or an object of different type. 101 | 102 | :param Pregex pre: The ``Pregex`` instance because of which this exception was thrown. 103 | :param bool are_both_classes: Indicates whether both ``Pregex`` instances are of \ 104 | type ``__Class``. 105 | ''' 106 | m = f"Classes and negated classes cannot be unioned together." if are_both_classes \ 107 | else f"Instance of type \"{type(pre).__name__}\" cannot be unioned with a class." 108 | super().__init__(m) 109 | 110 | 111 | class CannotBeSubtractedException(Exception): 112 | ''' 113 | This exception is thrown whenever one tries to subtract a class (or negated class) \ 114 | either from a negated class (or regular class) or an object of different type. 115 | 116 | :param Pregex pre: The ``Pregex`` instance because of which this exception was thrown. 117 | :param bool are_both_classes: Indicates whether both ``Pregex`` instances are of type ``__Class``. 118 | ''' 119 | 120 | def __init__(self, pre, are_both_classes: bool): 121 | ''' 122 | This exception is thrown whenever one tries to subtract a class (or negated class) \ 123 | either from a negated class (or regular class) or an object of different type. 124 | 125 | :param Pregex pre: The ``Pregex`` instance because of which this exception was thrown. 126 | :param bool are_both_classes: Indicates whether both ``Pregex`` instances are of type ``__Class``. 127 | ''' 128 | m = f"Classes and negated classes cannot be subtracted from one another." if are_both_classes \ 129 | else f"Instance of type \"{type(pre).__name__}\" cannot be subtracted from a class." 
130 | super().__init__(m) 131 | 132 | 133 | class GlobalWordCharSubtractionException(Exception): 134 | ''' 135 | This exception is thrown whenever one tries to subtract from an instance of \ 136 | either one of ``AnyWordChar`` or ``AnyButWordChar`` classes, for which parameter \ 137 | "is_global" has been set to ``True``. 138 | 139 | :param AnyWordChar | AnyButWordChar pre: An instance of either one of the two classes. 140 | ''' 141 | 142 | def __init__(self, pre): 143 | ''' 144 | This exception is thrown whenever one tries to subtract from an instance of \ 145 | either one of ``AnyWordChar`` or ``AnyButWordChar`` classes, for which parameter \ 146 | "is_global" has been set to ``True``. 147 | 148 | :param AnyWordChar | AnyButWordChar pre: An instance of either one of the two classes. 149 | ''' 150 | m = f"Cannot subtract from an instance of class \"{type(pre).__name__}\"" + \ 151 | " for which parameter \"is_global\" has been set to \"True\"." 152 | super().__init__(m) 153 | 154 | 155 | class EmptyClassException(Exception): 156 | ''' 157 | This exception is thrown whenever one tries to subtract a class (or negated class) \ 158 | from a class (or negated class) which results in an empty class. 159 | 160 | :param Pregex pre1: The ``Pregex`` instance because of which this exception was thrown. 161 | :param Pregex pre2: The ``Pregex`` instance because of which this exception was thrown. 162 | ''' 163 | 164 | def __init__(self, pre1, pre2): 165 | ''' 166 | This exception is thrown whenever one tries to subtract a class (or negated class) \ 167 | from a class (or negated class) which results in an empty class. 168 | 169 | :param Pregex pre1: The ``Pregex`` instance because of which this exception was thrown. 170 | :param Pregex pre2: The ``Pregex`` instance because of which this exception was thrown. 171 | ''' 172 | m = f"Cannot subtract class \"{pre2}\" from class \"{pre1}\"" \ 173 | " as this results into an empty class." 
174 | super().__init__(m) 175 | 176 | 177 | class InvalidRangeException(Exception): 178 | ''' 179 | This exception is thrown whenever there was provided a pair \ 180 | of values ``start`` and ``end``, where ``start`` comes after ``end``. 181 | 182 | :param int start: The integer because of which this exception was thrown. 183 | :param int end: The integer because of which this exception was thrown. 184 | ''' 185 | 186 | def __init__(self, start: int, end: int): 187 | ''' 188 | This exception is thrown whenever there was provided a pair \ 189 | of values ``start`` and ``end``, where ``start`` comes after ``end``. 190 | 191 | :param int start: The integer because of which this exception was thrown. 192 | :param int end: The integer because of which this exception was thrown. 193 | ''' 194 | super().__init__(f"\"[{start}-{end}]\" is not a valid range.") 195 | 196 | 197 | class CannotBeRepeatedException(Exception): 198 | ''' 199 | This exception is thrown whenever an instance of a class \ 200 | that is part of the ``assertions`` module is being quantified. 201 | 202 | :param __Assertion pre: The ``__Assertion`` instance because of which this exception was thrown. 203 | ''' 204 | 205 | def __init__(self, pre): 206 | ''' 207 | This exception is thrown whenever there is an attempt to \ 208 | repeat a non-repeatable pattern. 209 | 210 | :param __Assertion pre: The ``__Assertion`` instance because of which this exception was thrown. 211 | ''' 212 | m = f"Pattern \"{pre.get_pattern()}\" is non-repeatable." 213 | super().__init__(m) 214 | 215 | 216 | class NonFixedWidthPatternException(Exception): 217 | ''' 218 | This exception is thrown whenever a non-fixed-width pattern is being 219 | provided as lookbehind-pattern to either ``PrecededBy`` or ``NotPrecededBy``. 220 | 221 | :param __Lookaround lookbehind: The ``__Lookaround`` instance because of which this exception was thrown. 222 | :param Pregex pre: The ``Pregex`` instance because of which this exception was thrown. 
223 | ''' 224 | 225 | def __init__(self, lookbehind): 226 | ''' 227 | This exception is thrown whenever a non-fixed-width pattern is being 228 | provided as lookbehind-pattern to either ``PrecededBy`` or ``NotPrecededBy``. 229 | 230 | :param __Lookaround lookbehind: The ``__Lookaround`` instance because of which this exception was thrown. 231 | ''' 232 | m = f"Pattern '{lookbehind.get_pattern()}' cannot be used as a lookbehind" 233 | m += f" assertion pattern due to its variable length." 234 | super().__init__(m) 235 | 236 | 237 | class EmptyNegativeAssertionException(Exception): 238 | ''' 239 | This exception is thrown whenever the ``Empty`` pattern is provided 240 | as a negative assertion. 241 | ''' 242 | 243 | def __init__(self): 244 | ''' 245 | This exception is thrown whenever the ``Empty`` pattern is provided 246 | as a negative assertion. 247 | ''' 248 | message = "The empty string can't be provided as a negative lookaround assertion pattern." 249 | super().__init__(message) -------------------------------------------------------------------------------- /src/pregex/core/groups.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains all necessary classes that are used to construct both 3 | capturing and non-capturing groups, as well as any other classes which relate 4 | to concepts that are based on groups, such as backreferences and conditionals. 5 | 6 | Pattern grouping 7 | ------------------------------------------- 8 | In general, one should not have to concern themselves with pattern grouping, 9 | as patterns are automatically wrapped within non-capturing groups whenever this is 10 | deemed necessary. Consider for instance the following code snippet: 11 | 12 | .. code-block:: python 13 | 14 | from pregex.core.quantifiers import Optional 15 | 16 | Optional('a').print_pattern() # This prints "a?" 17 | Optional('aa').print_pattern() # This prints "(?:aa)?" 
18 | 19 | In the first case, quantifier :class:`~pregex.core.quantifiers.Optional` is applied to 20 | the pattern directly, whereas in the second case the pattern is placed into a non-capturing 21 | group so that "aa" is quantified as a whole. Be that as it may, there exists a separate class, 22 | namely :class:`Group`, through which one is able to explicitly wrap any pattern within 23 | a non-capturing group if they wish to do so: 24 | 25 | .. code-block:: python 26 | 27 | from pregex.core.groups import Group 28 | from pregex.core.quantifiers import Optional 29 | 30 | pre = Group(Optional('a')) 31 | 32 | pre.print_pattern() # This prints "(?:a)?" 33 | 34 | This class can also be used so as to apply various RegEx flags, also known \ 35 | as *modifiers*, to a pattern. As of yet, the only flag that is supported is 36 | the case-insensitive flag ``i``: 37 | 38 | .. code-block:: python 39 | 40 | from pregex.core.groups import Group 41 | 42 | pre = Group('pregex', is_case_insensitive=True) 43 | 44 | # This statement is "True" 45 | pre.is_exact_match('PRegEx') 46 | 47 | 48 | Capturing patterns 49 | ------------------------------------------- 50 | 51 | You'll find however that :class:`Capture` is probably the most important class 52 | of this module, as it is used to create a capturing group out of a pattern, 53 | so that said pattern is captured separately whenever a match occurs. 54 | 55 | .. code-block:: python 56 | 57 | from pregex.core.groups import Capture 58 | from pregex.core.classes import AnyLetter 59 | 60 | pre = AnyLetter() + Capture(2 * AnyLetter()) 61 | 62 | text = "abc def" 63 | print(pre.get_matches(text)) # This prints "['abc', 'def']" 64 | print(pre.get_captures(text)) # This prints "[('bc'), ('ef')]" 65 | 66 | As you can see, capturing is a very useful tool for whenever you are 67 | interested in isolating some specific part of a pattern. 
68 | 69 | Classes & methods 70 | ------------------------------------------- 71 | 72 | Below are listed all classes within :py:mod:`pregex.core.groups` 73 | along with any possible methods they may possess. 74 | """ 75 | 76 | 77 | import re as _re 78 | import pregex.core.pre as _pre 79 | import pregex.core.exceptions as _ex 80 | from typing import Union as _Union 81 | from typing import Optional as _Optional 82 | 83 | 84 | class __Group(_pre.Pregex): 85 | ''' 86 | Constitutes the base class for all classes that are part of this module. 87 | 88 | :param Pregex | str pre: A Pregex instance or string representing the pattern \ 89 | that is to be grouped. 90 | :param (Pregex => str) transform: A `transform` function for the provided pattern. 91 | 92 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 93 | ``Pregex`` instance nor a string. 94 | ''' 95 | def __init__(self, pre: _Union[_pre.Pregex, str], transform) -> _pre.Pregex: 96 | ''' 97 | Constitutes the base class for all classes that are part of this module. 98 | 99 | :param Pregex | str pre: A Pregex instance or string representing the pattern \ 100 | that is to be grouped. 101 | :param (Pregex => str) transform: A `transform` function for the provided pattern. 102 | 103 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 104 | ``Pregex`` instance nor a string. 105 | ''' 106 | pattern = transform(__class__._to_pregex(pre)) 107 | super().__init__(str(pattern), escape=False) 108 | 109 | 110 | class Capture(__Group): 111 | ''' 112 | Creates a capturing group out of the provided pattern. 113 | 114 | :param Pregex | str pre: The pattern out of which the capturing group is created. 115 | :param str name: The name that is assigned to the captured group \ 116 | for backreference purposes. A value of ``None`` indicates that no name \ 117 | is to be assigned to the group. Defaults to ``None``.
118 | 119 | :raises InvalidArgumentTypeException: 120 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 121 | - Parameter ``name`` is neither a string nor ``None``. 122 | :raises InvalidCapturingGroupNameException: Parameter ``name`` is not a valid \ 123 | capturing group name. Such name must contain word characters only and start \ 124 | with a non-digit character. 125 | 126 | :note: 127 | - Creating a capturing group out of a capturing group does nothing to it. 128 | - Creating a capturing group out of a non-capturing group converts it \ 129 | into a capturing group, except if any flags have been applied to it, \ 130 | in which case, the non-capturing group is wrapped within a capturing \ 131 | group as a whole. 132 | - Creating a named capturing group out of an unnamed capturing group, \ 133 | assigns a name to it. 134 | - Creating a named capturing group out of a named capturing group, \ 135 | changes the group's name. 136 | ''' 137 | 138 | def __init__(self, pre: _Union[_pre.Pregex, str], name: _Optional[str] = None): 139 | ''' 140 | Creates a capturing group out of the provided pattern. 141 | 142 | :param Pregex | str pre: The pattern that is to be wrapped \ 143 | within a capturing group. 144 | :param str name: The name that is assigned to the captured group \ 145 | for backreference purposes. A value of ``None`` indicates that no name \ 146 | is to be assigned to the group. Defaults to ``None``. 147 | 148 | :raises InvalidArgumentTypeException: 149 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 150 | - Parameter ``name`` is neither a string nor ``None``. 151 | :raises InvalidCapturingGroupNameException: Parameter ``name`` is not a valid \ 152 | capturing group name. Such name must contain word characters only and start \ 153 | with a non-digit character. 154 | 155 | :note: 156 | - Creating a capturing group out of a capturing group does nothing to it.
157 | - Creating a capturing group out of a non-capturing group converts it \ 158 | into a capturing group, except if any flags have been applied to it, \ 159 | in which case, the non-capturing group is wrapped within a capturing \ 160 | group as a whole. 161 | - Creating a named capturing group out of an unnamed capturing group, \ 162 | assigns a name to it. 163 | - Creating a named capturing group out of a named capturing group, \ 164 | changes the group's name. 165 | ''' 166 | super().__init__(pre, lambda pre: pre.capture(name)) 167 | 168 | 169 | class Group(__Group): 170 | ''' 171 | Creates a non-capturing group out of the provided pattern. 172 | 173 | :param Pregex | str pre: The pattern that is to be wrapped \ 174 | within a non-capturing group. 175 | :param bool is_case_insensitive: If ``True``, then the "case insensitive" \ 176 | flag is applied to the group so that the pattern within it ignores case \ 177 | when it comes to matching. Defaults to ``False``. 178 | 179 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither \ 180 | a ``Pregex`` instance nor a string. 181 | 182 | :note: 183 | - Creating a non-capturing group out of a non-capturing group does nothing, \ 184 | except for removing its flags if it has any. 185 | - Creating a non-capturing group out of a capturing group converts it into \ 186 | a non-capturing group. 187 | ''' 188 | 189 | def __init__(self, pre: _Union[_pre.Pregex, str], is_case_insensitive: bool = False): 190 | ''' 191 | Creates a non-capturing group out of the provided pattern. 192 | 193 | :param Pregex | str pre: The pattern that is to be wrapped \ 194 | within a non-capturing group. 195 | :param bool is_case_insensitive: If ``True``, then the "case insensitive" \ 196 | flag is applied to the group so that the pattern within it ignores case \ 197 | when it comes to matching. Defaults to ``False``. 198 | 199 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither \ 200 | a ``Pregex`` instance nor a string.
201 | 202 | :note: 203 | - Creating a non-capturing group out of a non-capturing group does nothing, \ 204 | except for removing its flags if it has any. 205 | - Creating a non-capturing group out of a capturing group converts it into \ 206 | a non-capturing group. 207 | ''' 208 | super().__init__(pre, lambda pre: pre.group(is_case_insensitive)) 209 | 210 | 211 | class Backreference(__Group): 212 | ''' 213 | Creates a backreference to some previously declared capturing group. 214 | 215 | :param int | str ref: A reference to some previously declared capturing group. \ 216 | This parameter can either be an integer, in which case the capturing group \ 217 | is referenced by order, or a string, in which case the capturing group is \ 218 | referenced by name. 219 | 220 | :raises InvalidArgumentTypeException: Parameter ``ref`` is neither an integer \ 221 | nor a string. 222 | :raises InvalidArgumentValueException: Parameter ``ref`` is an integer but \ 223 | has a value of either less than ``1`` or greater than ``99``. 224 | :raises InvalidCapturingGroupNameException: Parameter ``ref`` is a string but \ 225 | not a valid capturing group name. Such name must contain word characters \ 226 | only and start with a non-digit character. 227 | ''' 228 | 229 | def __init__(self, ref: _Union[int, str]): 230 | ''' 231 | Creates a backreference to some previously declared capturing group. 232 | 233 | :param int | str ref: A reference to some previously declared capturing group. \ 234 | This parameter can either be an integer, in which case the capturing group \ 235 | is referenced by order, or a string, in which case the capturing group is \ 236 | referenced by name. 237 | 238 | :raises InvalidArgumentTypeException: Parameter ``ref`` is neither an integer \ 239 | nor a string. 240 | :raises InvalidArgumentValueException: Parameter ``ref`` is an integer but \ 241 | has a value of either less than ``1`` or greater than ``99``.
242 | :raises InvalidCapturingGroupNameException: Parameter ``ref`` is a string but \ 243 | not a valid capturing group name. Such name must contain word characters \ 244 | only and start with a non-digit character. 245 | ''' 246 | if isinstance(ref, int): 247 | if isinstance(ref, bool): 248 | message = "Parameter \"ref\" is neither an integer nor a string." 249 | raise _ex.InvalidArgumentTypeException(message) 250 | if ref < 1 or ref > 99: 251 | message = "Parameter \"ref\" cannot be less than 1 or greater than 99." 252 | raise _ex.InvalidArgumentValueException(message) 253 | transform = lambda s : f"\\{s}" 254 | elif isinstance(ref, str): 255 | if _re.fullmatch("[A-Za-z_][A-Za-z_0-9]*", ref) is None: 256 | raise _ex.InvalidCapturingGroupNameException(ref) 257 | transform = lambda s : f"(?P={s})" 258 | else: 259 | message = "Parameter \"ref\" is neither an integer nor a string." 260 | raise _ex.InvalidArgumentTypeException(message) 261 | super().__init__(str(ref), transform) 262 | 263 | 264 | class Conditional(__Group): 265 | ''' 266 | Given the name of a capturing group, matches ``pre1`` only if said capturing group has \ 267 | been previously matched. Furthermore, if a second pattern ``pre2`` is provided, then \ 268 | this pattern is matched in case the referenced capturing group was not, though one \ 269 | should be aware that for this to be possible, the referenced capturing group must \ 270 | be optional. 271 | 272 | :param str name: The name of the referenced capturing group. 273 | :param Pregex | str pre1: The pattern that is to be matched in case condition is true. 274 | :param Pregex | str pre2: The pattern that is to be matched in case condition \ 275 | is false. Defaults to ``None``. 276 | 277 | :raises InvalidArgumentTypeException: 278 | - Parameter ``name`` is not a string. 279 | - Parameter ``pre1`` is neither a ``Pregex`` instance nor a string. 280 | - Parameter ``pre2`` is neither a ``Pregex`` instance nor a string nor ``None``. 
281 | :raises InvalidCapturingGroupNameException: Parameter ``name`` is not a valid \ 282 | capturing group name. Such name must contain word characters only and start \ 283 | with a non-digit character. 284 | ''' 285 | 286 | def __init__(self, name: str, pre1: _Union[_pre.Pregex, str], pre2: _Optional[_Union[_pre.Pregex, str]] = None): 287 | ''' 288 | Given the name of a capturing group, matches ``pre1`` only if said capturing group has \ 289 | been previously matched. Furthermore, if a second pattern ``pre2`` is provided, then \ 290 | this pattern is matched in case the referenced capturing group was not, though one \ 291 | should be aware that for this to be possible, the referenced capturing group must \ 292 | be optional. 293 | 294 | :param str name: The name of the referenced capturing group. 295 | :param Pregex | str pre1: The pattern that is to be matched in case condition is true. 296 | :param Pregex | str pre2: The pattern that is to be matched in case condition \ 297 | is false. Defaults to ``None``. 298 | 299 | :raises InvalidArgumentTypeException: 300 | - Parameter ``name`` is not a string. 301 | - Parameter ``pre1`` is neither a ``Pregex`` instance nor a string. 302 | - Parameter ``pre2`` is neither a ``Pregex`` instance nor a string nor ``None``. 303 | :raises InvalidCapturingGroupNameException: Parameter ``name`` is not a valid \ 304 | capturing group name. Such name must contain word characters only and start \ 305 | with a non-digit character. 306 | ''' 307 | if not isinstance(name, str): 308 | message = "Provided argument \"name\" is not a string." 
309 | raise _ex.InvalidArgumentTypeException(message) 310 | if _re.fullmatch("[A-Za-z_][\w]*", name) is None: 311 | raise _ex.InvalidCapturingGroupNameException(name) 312 | super().__init__(name, lambda s: f"(?({s}){pre1}{'|' + str(pre2) if pre2 != None else ''})") -------------------------------------------------------------------------------- /src/pregex/core/operators.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains various classes representing operators \ 3 | that are typically applied between two or more patterns. 4 | 5 | Classes & methods 6 | ------------------------------------------- 7 | 8 | Below are listed all classes within :py:mod:`pregex.core.operators` 9 | along with any possible methods they may possess. 10 | """ 11 | 12 | 13 | import pregex.core.pre as _pre 14 | import pregex.core.exceptions as _ex 15 | from typing import Union as _Union 16 | 17 | 18 | class __Operator(_pre.Pregex): 19 | ''' 20 | Constitutes the base class for all classes that are part of this module. 21 | 22 | :param tuple[Pregex | str] pres: A tuple of strings or Pregex instances representing \ 23 | the patterns to which the operator is to be applied. 24 | :param (tuple[Pregex | str] => str) transform: A `transform` function for the provided pattern. 25 | 26 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 27 | through ``pres`` is neither a ``Pregex`` instance nor a string. 28 | 29 | :note: If no arguments are provided, then the resulting ``Pregex`` instance \ 30 | corresponds to the "empty string" pattern, whereas if a single argument is \ 31 | provided, it is simply returned wrapped within a ``Pregex`` instance. 32 | ''' 33 | def __init__(self, pres: tuple[_Union[_pre.Pregex, str]], transform) -> _pre.Pregex: 34 | ''' 35 | Constitutes the base class for all classes that are part of this module. 
36 | 37 | :param tuple[Pregex | str] pres: A tuple of strings or Pregex instances representing \ 38 | the patterns to which the operator is to be applied. 39 | :param (tuple[Pregex | str] => str) transform: A `transform` function for the provided pattern. 40 | 41 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 42 | through ``pres`` is neither a ``Pregex`` instance nor a string. 43 | 44 | :note: If no arguments are provided, then the resulting ``Pregex`` instance \ 45 | corresponds to the "empty string" pattern, whereas if a single argument is \ 46 | provided, it is simply returned wrapped within a ``Pregex`` instance. 47 | ''' 48 | if len(pres) == 0: 49 | result = '' 50 | else: 51 | result = __class__._to_pregex(pres[0]) 52 | if len(pres) > 1: 53 | for pre in pres[1:]: 54 | result = transform(result, pre) 55 | super().__init__(str(result), escape=False) 56 | 57 | 58 | class Concat(__Operator): 59 | ''' 60 | Matches the concatenation of the provided patterns. 61 | 62 | :param Pregex | str \*pres: Two or more patterns that are to be concatenated. 63 | 64 | :raises NotEnoughArgumentsException: Less than two arguments are provided. 65 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 66 | is neither a ``Pregex`` instance nor a string. 67 | 68 | :note: If no arguments are provided, then the resulting ``Pregex`` instance \ 69 | corresponds to the "empty string" pattern, whereas if a single argument is \ 70 | provided, it is simply returned wrapped within a ``Pregex`` instance. 71 | ''' 72 | 73 | def __init__(self, *pres: _Union[_pre.Pregex, str]) -> _pre.Pregex: 74 | ''' 75 | Matches the concatenation of the provided patterns. 76 | 77 | :param Pregex | str \*pres: Two or more patterns that are to be concatenated. 78 | 79 | :raises NotEnoughArgumentsException: Less than two arguments are provided. 
80 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 81 | is neither a ``Pregex`` instance nor a string. 82 | 83 | :note: If no arguments are provided, then the resulting ``Pregex`` instance \ 84 | corresponds to the "empty string" pattern, whereas if a single argument is \ 85 | provided, it is simply returned wrapped within a ``Pregex`` instance. 86 | ''' 87 | super().__init__(pres, lambda pre1, pre2: pre1.concat(pre2)) 88 | 89 | 90 | class Either(__Operator): 91 | ''' 92 | Matches either one of the provided patterns. 93 | 94 | :param Pregex | str \*pres: Two or more patterns that constitute the \ 95 | operator's alternatives. 96 | 97 | :raises NotEnoughArgumentsException: Less than two arguments are provided. 98 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 99 | is neither a ``Pregex`` instance nor a string. 100 | 101 | :note: 102 | - If no arguments are provided, then the resulting ``Pregex`` instance \ 103 | corresponds to the "empty string" pattern, whereas if a single argument is \ 104 | provided, it is simply returned wrapped within a ``Pregex`` instance. 105 | - One should be aware that ``Either`` is eager, meaning that the regex engine will \ 106 | stop the moment it matches either one of the alternatives, starting from \ 107 | the left-most pattern and continuing on to the right until a match occurs. 108 | ''' 109 | 110 | def __init__(self, *pres: _Union[_pre.Pregex, str]): 111 | ''' 112 | Matches either one of the provided patterns. 113 | 114 | :param Pregex | str \*pres: Two or more patterns that constitute the \ 115 | operator's alternatives. 116 | 117 | :raises NotEnoughArgumentsException: Less than two arguments are provided. 118 | :raises InvalidArgumentTypeException: At least one of the provided arguments \ 119 | is neither a ``Pregex`` instance nor a string. 
120 | 121 | :note: 122 | - If no arguments are provided, then the resulting ``Pregex`` instance \ 123 | corresponds to the "empty string" pattern, whereas if a single argument is \ 124 | provided, it is simply returned wrapped within a ``Pregex`` instance. 125 | - One should be aware that ``Either`` is eager, meaning that the regex engine will \ 126 | stop the moment it matches either one of the alternatives, starting from \ 127 | the left-most pattern and continuing on to the right until a match occurs. 128 | ''' 129 | super().__init__(pres, lambda pre1, pre2: pre1.either(pre2)) 130 | 131 | 132 | class Enclose(__Operator): 133 | ''' 134 | Matches the pattern that results from concatenating the ``enclosing`` \ 135 | pattern(s) to both sides of pattern ``pre``. 136 | 137 | :param Pregex | str pre: The pattern that is to be at the center \ 138 | of the concatenation. 139 | :param Pregex | str enclosing: One or more patterns that are to *enclose* \ 140 | pattern ``pre`` one by one. 141 | 142 | :raises NotEnoughArgumentsException: Less than two arguments are provided. 143 | :raises InvalidArgumentTypeException: Either ``pre`` or at least one of the \ 144 | ``enclosing`` patterns is neither a ``Pregex`` instance nor a string. 145 | ''' 146 | 147 | def __init__(self, pre: _Union[_pre.Pregex, str], *enclosing:_Union[_pre.Pregex, str]) -> _pre.Pregex: 148 | ''' 149 | Matches the pattern that results from concatenating the ``enclosing`` \ 150 | pattern(s) to both sides of pattern ``pre``. 151 | 152 | :param Pregex | str pre: The pattern that is to be at the center \ 153 | of the concatenation. 154 | :param Pregex | str enclosing: One or more patterns that are to *enclose* \ 155 | pattern ``pre`` one by one. 156 | 157 | :raises NotEnoughArgumentsException: Less than two arguments are provided. 158 | :raises InvalidArgumentTypeException: Either ``pre`` or at least one of the \ 159 | ``enclosing`` patterns is neither a ``Pregex`` instance nor a string. 
160 | ''' 161 | super().__init__((pre, *enclosing), lambda pre1, pre2: pre1.enclose(pre2)) -------------------------------------------------------------------------------- /src/pregex/core/quantifiers.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Every class within this module is used to declare that a pattern is to be 3 | matched a number of times, with each class representing a slightly different 4 | pattern-repetition rule. 5 | 6 | Classes & methods 7 | ------------------------------------------- 8 | 9 | Below are listed all classes within :py:mod:`pregex.core.quantifiers` 10 | along with any possible methods they may possess. 11 | """ 12 | 13 | 14 | import pregex.core.pre as _pre 15 | from typing import Union as _Union 16 | from typing import Optional as _Optional 17 | 18 | 19 | class __Quantifier(_pre.Pregex): 20 | ''' 21 | Constitutes the base class for all classes that are part of this module. 22 | 23 | :param Pregex | str pre: A Pregex instance or string representing the pattern \ 24 | that is to be quantified. 25 | :param (Pregex => str) transform: A `transform` function for the provided pattern. 26 | 27 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 28 | ``Pregex`` instance nor a string. 29 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 30 | pattern. Whether this exception is thrown also depends on certain parameter values. 31 | 32 | ''' 33 | def __init__(self, pre: _Union[_pre.Pregex, str], is_greedy: bool, transform) -> '__Quantifier': 34 | ''' 35 | Constitutes the base class for all classes that are part of this module. 36 | 37 | :param Pregex | str pre: A Pregex instance or string representing the pattern \ 38 | that is to be quantified. 39 | :param (Pregex => str) transform: A `transform` function for the provided pattern. 
40 | 41 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 42 | ``Pregex`` instance nor a string. 43 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 44 | pattern. Whether this exception is thrown also depends on certain parameter values. 45 | ''' 46 | pattern = transform(__class__._to_pregex(pre), is_greedy) 47 | super().__init__(str(pattern), escape=False) 48 | 49 | 50 | class Optional(__Quantifier): 51 | ''' 52 | Matches the provided pattern once or not at all. 53 | 54 | :param Pregex | str pre: The pattern that is to be quantified. 55 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 56 | When declared as such, the regex engine will try to match \ 57 | the expression as many times as possible. Defaults to ``True``. 58 | 59 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 60 | ``Pregex`` instance nor a string. 61 | ''' 62 | 63 | def __init__(self, pre: _Union[_pre.Pregex, str], is_greedy: bool = True) -> _pre.Pregex: 64 | ''' 65 | Matches the provided pattern once or not at all. 66 | 67 | :param Pregex | str pre: The pattern that is to be quantified. 68 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 69 | When declared as such, the regex engine will try to match \ 70 | the expression as many times as possible. Defaults to ``True``. 71 | 72 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 73 | ``Pregex`` instance nor a string. 74 | ''' 75 | super().__init__(pre, is_greedy, lambda pre, is_greedy: pre.optional(is_greedy)) 76 | 77 | 78 | class Indefinite(__Quantifier): 79 | ''' 80 | Matches the provided pattern zero or more times. 81 | 82 | :param Pregex | str pre: The pattern that is to be quantified. 83 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. 
\ 84 | When declared as such, the regex engine will try to match \ 85 | the expression as many times as possible. Defaults to ``True``. 86 | 87 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 88 | ``Pregex`` instance nor a string. 89 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable pattern. 90 | ''' 91 | 92 | def __init__(self, pre: _Union[_pre.Pregex, str], is_greedy: bool = True) -> _pre.Pregex: 93 | ''' 94 | Matches the provided pattern zero or more times. 95 | 96 | :param Pregex | str pre: The pattern that is to be quantified. 97 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 98 | When declared as such, the regex engine will try to match \ 99 | the expression as many times as possible. Defaults to ``True``. 100 | 101 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable pattern. 102 | ''' 103 | super().__init__(pre, is_greedy, lambda pre, is_greedy: pre.indefinite(is_greedy)) 104 | 105 | 106 | class OneOrMore(__Quantifier): 107 | ''' 108 | Matches the provided pattern one or more times. 109 | 110 | :param Pregex | str pre: The pattern that is to be quantified. 111 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 112 | When declared as such, the regex engine will try to match \ 113 | the expression as many times as possible. Defaults to ``True``. 114 | 115 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 116 | ``Pregex`` instance nor a string. 117 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable pattern. 118 | ''' 119 | 120 | def __init__(self, pre: _Union[_pre.Pregex, str], is_greedy: bool = True) -> _pre.Pregex: 121 | ''' 122 | Matches the provided pattern one or more times. 123 | 124 | :param Pregex | str pre: The pattern that is to be quantified. 125 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. 
\ 126 | When declared as such, the regex engine will try to match \ 127 | the expression as many times as possible. Defaults to ``True``. 128 | 129 | :raises InvalidArgumentTypeException: Parameter ``pre`` is neither a \ 130 | ``Pregex`` instance nor a string. 131 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable pattern. 132 | ''' 133 | super().__init__(pre, is_greedy, lambda pre, is_greedy: pre.one_or_more(is_greedy)) 134 | 135 | 136 | class Exactly(__Quantifier): 137 | ''' 138 | Matches the provided pattern an exact number of times. 139 | 140 | :param Pregex | str pre: The pattern that is to be quantified. 141 | :param int n: The exact number of times that the provided pattern is to be matched. 142 | 143 | :raises InvalidArgumentTypeException: 144 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 145 | - Parameter ``n`` is not an integer. 146 | :raises InvalidArgumentValueException: Parameter ``n`` has a value of less than zero. 147 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 148 | pattern while parameter ``n`` has been set to a value of greater than ``1``. 149 | ''' 150 | 151 | def __init__(self, pre: _Union[_pre.Pregex, str], n: int) -> _pre.Pregex: 152 | ''' 153 | Matches the provided pattern an exact number of times. 154 | 155 | :param Pregex | str pre: The pattern that is to be quantified. 156 | :param int n: The exact number of times that the provided pattern is to be matched. 157 | 158 | :raises InvalidArgumentTypeException: 159 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 160 | - Parameter ``n`` is not an integer. 161 | :raises InvalidArgumentValueException: Parameter ``n`` has a value of less than zero. 162 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 163 | pattern while parameter ``n`` has been set to a value of greater than ``1``. 
164 | ''' 165 | super().__init__(pre, False, lambda pre, _: pre.exactly(n)) 166 | 167 | 168 | class AtLeast(__Quantifier): 169 | ''' 170 | Matches the provided pattern a minimum number of times. 171 | 172 | :param Pregex | str pre: The pattern that is to be quantified. 173 | :param int n: The minimum number of times that the provided pattern is to be matched. 174 | :param bool is_greedy: Determines whether to declare this quantifier as greedy. \ 175 | When declared as such, the regex engine will try to match \ 176 | the expression as many times as possible. Defaults to ``True``. 177 | 178 | :raises InvalidArgumentTypeException: 179 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 180 | - Parameter ``n`` is not an integer. 181 | :raises InvalidArgumentValueException: Parameter ``n`` has a value of less than zero. 182 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable pattern. 183 | ''' 184 | 185 | def __init__(self, pre: _Union[_pre.Pregex, str], n: int, is_greedy: bool = True) -> _pre.Pregex: 186 | ''' 187 | Matches the provided pattern a minimum number of times. 188 | 189 | :param Pregex | str pre: The pattern that is to be quantified. 190 | :param int n: The minimum number of times that the provided pattern is to be matched. 191 | :param bool is_greedy: Determines whether to declare this quantifier as greedy. \ 192 | When declared as such, the regex engine will try to match \ 193 | the expression as many times as possible. Defaults to ``True``. 194 | 195 | :raises InvalidArgumentTypeException: 196 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 197 | - Parameter ``n`` is not an integer. 198 | :raises InvalidArgumentValueException: Parameter ``n`` has a value of less than zero. 199 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable pattern. 
200 | ''' 201 | super().__init__(pre, is_greedy, lambda pre, is_greedy: pre.at_least(n, is_greedy)) 202 | 203 | 204 | class AtMost(__Quantifier): 205 | ''' 206 | Matches the provided pattern up to a maximum number of times. 207 | 208 | :param Pregex | str pre: The pattern that is to be quantified. 209 | :param int n: The maximum number of times that the provided pattern is to be matched. 210 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 211 | When declared as such, the regex engine will try to match \ 212 | the expression as many times as possible. Defaults to ``True``. 213 | 214 | :raises InvalidArgumentTypeException: 215 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 216 | - Parameter ``n`` is neither an integer nor ``None``. 217 | :raises InvalidArgumentValueException: Parameter ``n`` has a value of less than zero. 218 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 219 | pattern while parameter ``n`` has been set to a value of greater than ``1``. 220 | 221 | :note: Setting ``n`` equal to ``None`` indicates that there is no upper limit to the number of \ 222 | times the pattern is to be repeated. 223 | ''' 224 | 225 | def __init__(self, pre: _Union[_pre.Pregex, str], n: _Optional[int], is_greedy: bool = True) -> _pre.Pregex: 226 | ''' 227 | Matches the provided pattern up to a maximum number of times. 228 | 229 | :param Pregex | str pre: The pattern that is to be quantified. 230 | :param int n: The maximum number of times that the provided pattern is to be matched. 231 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 232 | When declared as such, the regex engine will try to match \ 233 | the expression as many times as possible. Defaults to ``True``. 234 | 235 | :raises InvalidArgumentTypeException: 236 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 237 | - Parameter ``n`` is neither an integer nor ``None``. 
238 | :raises InvalidArgumentValueException: Parameter ``n`` has a value of less than zero. 239 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 240 | pattern while parameter ``n`` has been set to a value of greater than ``1``. 241 | 242 | :note: Setting ``n`` equal to ``None`` indicates that there is no upper limit to the number of \ 243 | times the pattern is to be repeated. 244 | ''' 245 | super().__init__(pre, is_greedy, lambda pre, is_greedy: pre.at_most(n, is_greedy)) 246 | 247 | 248 | class AtLeastAtMost(__Quantifier): 249 | ''' 250 | Matches the provided expression between a minimum and a maximum number of times. 251 | 252 | :param Pregex | str pre: The pattern that is to be quantified. 253 | :param int n: The minimum number of times that the provided pattern is to be matched. 254 | :param int m: The maximum number of times that the provided pattern is to be matched. 255 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 256 | When declared as such, the regex engine will try to match \ 257 | the expression as many times as possible. Defaults to ``True``. 258 | 259 | :raises InvalidArgumentTypeException: 260 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 261 | - Parameter ``n`` is not an integer. 262 | - Parameter ``m`` is neither an integer nor ``None``. 263 | :raises InvalidArgumentValueException: 264 | - Either parameter ``n`` or ``m`` has a value of less than zero. 265 | - Parameter ``n`` has a greater value than that of parameter ``m``. 266 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 267 | pattern while parameter ``m`` has been set to a value of greater than ``1``. 268 | 269 | :note: 270 | - Parameter ``is_greedy`` has no effect in the case that ``n`` equals ``m``. 271 | - Setting ``m`` equal to ``None`` indicates that there is no upper limit to the \ 272 | number of times the pattern is to be repeated. 
273 | ''' 274 | 275 | def __init__(self, pre: _Union[_pre.Pregex, str], n: int, m: _Optional[int], is_greedy: bool = True) -> _pre.Pregex: 276 | ''' 277 | Matches the provided expression between a minimum and a maximum number of times. 278 | 279 | :param Pregex | str pre: The pattern that is to be quantified. 280 | :param int n: The minimum number of times that the provided pattern is to be matched. 281 | :param int m: The maximum number of times that the provided pattern is to be matched. 282 | :param bool is_greedy: Indicates whether to declare this quantifier as greedy. \ 283 | When declared as such, the regex engine will try to match \ 284 | the expression as many times as possible. Defaults to ``True``. 285 | 286 | :raises InvalidArgumentTypeException: 287 | - Parameter ``pre`` is neither a ``Pregex`` instance nor a string. 288 | - Parameter ``n`` is not an integer. 289 | - Parameter ``m`` is neither an integer nor ``None``. 290 | :raises InvalidArgumentValueException: 291 | - Either parameter ``n`` or ``m`` has a value of less than zero. 292 | - Parameter ``n`` has a greater value than that of parameter ``m``. 293 | :raises CannotBeRepeatedException: Parameter ``pre`` represents a non-repeatable \ 294 | pattern while parameter ``m`` has been set to a value of greater than ``1``. 295 | 296 | :note: 297 | - Parameter ``is_greedy`` has no effect in the case that ``n`` equals ``m``. 298 | - Setting ``m`` equal to ``None`` indicates that there is no upper limit to the \ 299 | number of times the pattern is to be repeated. 300 | ''' 301 | super().__init__(pre, is_greedy, lambda pre, is_greedy: pre.at_least_at_most(n, m, is_greedy)) -------------------------------------------------------------------------------- /src/pregex/core/tokens.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains a number of classes that represent special characters. 
3 | Each token represents one and only one character. It is recommended that you 4 | make use of these classes instead of providing their corresponding characters 5 | as strings on your own in order to prevent any errors that relate to character 6 | escaping from happening. 7 | 8 | Classes & methods 9 | ------------------------------------------- 10 | 11 | Below are listed all classes within :py:mod:`pregex.core.tokens` 12 | along with any possible methods they may possess. 13 | """ 14 | 15 | 16 | import pregex.core.pre as _pre 17 | 18 | 19 | class __Token(_pre.Pregex): 20 | ''' 21 | Constitutes the base class for all classes that are part of this module. 22 | 23 | :param str pattern: The pattern representing the token. 24 | ''' 25 | 26 | def __init__(self, pattern: str) -> '__Token': 27 | ''' 28 | Constitutes the base class for all classes that are part of this module. 29 | 30 | :param str pattern: The pattern representing the token. 31 | ''' 32 | super().__init__(pattern, escape=False) 33 | 34 | 35 | class Backslash(__Token): 36 | ''' 37 | Matches a single backslash character. 38 | ''' 39 | 40 | def __init__(self) -> 'Backslash': 41 | ''' 42 | Matches a single backslash character. 43 | ''' 44 | super().__init__(r"\\") 45 | 46 | 47 | class Bullet(__Token): 48 | ''' 49 | Matches the bullet symbol "•". 50 | ''' 51 | 52 | def __init__(self) -> 'Bullet': 53 | ''' 54 | Matches the bullet symbol "•". 55 | ''' 56 | super().__init__("\u2022") 57 | 58 | 59 | class CarriageReturn(__Token): 60 | ''' 61 | Matches a single carriage return character. 62 | ''' 63 | 64 | def __init__(self) -> 'CarriageReturn': 65 | ''' 66 | Matches a single carriage return character. 67 | ''' 68 | super().__init__("\r") 69 | 70 | 71 | class Copyright(__Token): 72 | ''' 73 | Matches the copyright symbol "©". 74 | ''' 75 | 76 | def __init__(self) -> 'Copyright': 77 | ''' 78 | Matches the copyright symbol "©". 
79 | ''' 80 | super().__init__("\u00A9") 81 | 82 | 83 | class Division(__Token): 84 | ''' 85 | Matches the division sign "÷". 86 | ''' 87 | 88 | def __init__(self) -> 'Division': 89 | ''' 90 | Matches the division sign "÷". 91 | ''' 92 | super().__init__("\u00f7") 93 | 94 | 95 | class Dollar(__Token): 96 | ''' 97 | Matches the dollar sign "$". 98 | ''' 99 | 100 | def __init__(self) -> 'Dollar': 101 | ''' 102 | Matches the dollar sign "$". 103 | ''' 104 | super().__init__("\\\u0024") 105 | 106 | 107 | class Euro(__Token): 108 | ''' 109 | Matches the euro sign "€". 110 | ''' 111 | 112 | def __init__(self) -> 'Euro': 113 | ''' 114 | Matches the euro sign "€". 115 | ''' 116 | super().__init__("\u20ac") 117 | 118 | 119 | class FormFeed(__Token): 120 | ''' 121 | Matches a single form feed character. 122 | ''' 123 | 124 | def __init__(self) -> 'FormFeed': 125 | ''' 126 | Matches a single form feed character. 127 | ''' 128 | super().__init__("\f") 129 | 130 | 131 | class Infinity(__Token): 132 | ''' 133 | Matches the infinity symbol "∞". 134 | ''' 135 | 136 | def __init__(self) -> 'Infinity': 137 | ''' 138 | Matches the infinity symbol "∞". 139 | ''' 140 | super().__init__("\u221e") 141 | 142 | 143 | class Multiplication(__Token): 144 | ''' 145 | Matches the multiplication sign "×". 146 | ''' 147 | 148 | def __init__(self) -> 'Multiplication': 149 | ''' 150 | Matches the multiplication sign "×". 151 | ''' 152 | super().__init__("\u00d7") 153 | 154 | 155 | class Newline(__Token): 156 | ''' 157 | Matches a single newline character. 158 | ''' 159 | 160 | def __init__(self) -> 'Newline': 161 | ''' 162 | Matches a single newline character. 163 | ''' 164 | super().__init__("\n") 165 | 166 | 167 | class Pound(__Token): 168 | ''' 169 | Matches the English pound sign "£". 170 | ''' 171 | 172 | def __init__(self) -> 'Pound': 173 | ''' 174 | Matches the English pound sign "£". 
175 | ''' 176 | super().__init__("\u00a3") 177 | 178 | 179 | class Registered(__Token): 180 | ''' 181 | Matches the registered trademark symbol "®". 182 | ''' 183 | 184 | def __init__(self) -> 'Registered': 185 | ''' 186 | Matches the registered trademark symbol "®". 187 | ''' 188 | super().__init__("\u00ae") 189 | 190 | 191 | class Rupee(__Token): 192 | ''' 193 | Matches the Indian rupee sign "₹". 194 | ''' 195 | 196 | def __init__(self) -> 'Rupee': 197 | ''' 198 | Matches the Indian rupee sign "₹". 199 | ''' 200 | super().__init__("\u20b9") 201 | 202 | 203 | class Space(__Token): 204 | ''' 205 | Matches a single space character. 206 | ''' 207 | 208 | def __init__(self) -> 'Space': 209 | ''' 210 | Matches a single space character. 211 | ''' 212 | super().__init__(" ") 213 | 214 | 215 | class Tab(__Token): 216 | ''' 217 | Matches a single tab character. 218 | ''' 219 | 220 | def __init__(self) -> 'Tab': 221 | ''' 222 | Matches a single tab character. 223 | ''' 224 | super().__init__("\t") 225 | 226 | 227 | class Trademark(__Token): 228 | ''' 229 | Matches the unregistered trademark symbol "™". 230 | ''' 231 | 232 | def __init__(self) -> 'Trademark': 233 | ''' 234 | Matches the unregistered trademark symbol "™". 235 | ''' 236 | super().__init__("\u2122") 237 | 238 | 239 | class VerticalTab(__Token): 240 | ''' 241 | Matches a single vertical tab character. 242 | ''' 243 | 244 | def __init__(self) -> 'VerticalTab': 245 | ''' 246 | Matches a single vertical tab character. 247 | ''' 248 | super().__init__("\v") 249 | 250 | 251 | class WhiteBullet(__Token): 252 | ''' 253 | Matches the white bullet symbol "◦". 254 | ''' 255 | 256 | def __init__(self) -> 'WhiteBullet': 257 | ''' 258 | Matches the white bullet symbol "◦". 259 | ''' 260 | super().__init__("\u25e6") 261 | 262 | 263 | class Yen(__Token): 264 | ''' 265 | Matches the Japanese yen sign "¥". 266 | ''' 267 | 268 | def __init__(self) -> 'Yen': 269 | ''' 270 | Matches the Japanese yen sign "¥". 
271 | ''' 272 | super().__init__("\u00a5") -------------------------------------------------------------------------------- /src/pregex/meta/__init__.py: -------------------------------------------------------------------------------- 1 | from pregex.meta.essentials import * -------------------------------------------------------------------------------- /tests/test_core_assertions.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pregex.core.assertions import * 4 | from pregex.core.pre import Pregex, _Type 5 | from pregex.core.quantifiers import Exactly, Optional 6 | from pregex.core.exceptions import NonFixedWidthPatternException, \ 7 | NotEnoughArgumentsException, EmptyNegativeAssertionException 8 | 9 | 10 | TEST_STR = "test" 11 | pre1 = Pregex("PRE1") 12 | pre2 = Pregex("PRE2") 13 | pre3 = Pregex("PRE3") 14 | 15 | 16 | class TestMatchAtStart(unittest.TestCase): 17 | 18 | def test_match_at_start(self): 19 | self.assertEqual(str(MatchAtStart(TEST_STR)), f"\A{TEST_STR}") 20 | 21 | def test_match_at_start_on_type(self): 22 | self.assertEqual(MatchAtStart("a")._get_type(), _Type.Assertion) 23 | self.assertEqual(MatchAtStart("abc")._get_type(), _Type.Assertion) 24 | 25 | def test_match_at_start_on_quantifiability(self): 26 | self.assertEqual(MatchAtStart("a")._is_repeatable(), False) 27 | 28 | 29 | class TestMatchAtEnd(unittest.TestCase): 30 | 31 | def test_match_at_end(self): 32 | self.assertEqual(str(MatchAtEnd(TEST_STR)), f"{TEST_STR}\Z") 33 | 34 | def test_match_at_end_on_type(self): 35 | self.assertEqual(MatchAtEnd("a")._get_type(), _Type.Assertion) 36 | 37 | def test_match_at_end_on_quantifiability(self): 38 | self.assertEqual(MatchAtEnd("a")._is_repeatable(), False) 39 | 40 | 41 | class TestMatchAtLineStart(unittest.TestCase): 42 | 43 | def test_match_at_line_start(self): 44 | self.assertEqual(str(MatchAtLineStart(TEST_STR)), f"^{TEST_STR}") 45 | 46 | def 
test_match_at_line_start_on_type(self): 47 | self.assertEqual(MatchAtLineStart("a")._get_type(), _Type.Assertion) 48 | 49 | def test_match_at_line_start_on_quantifiability(self): 50 | self.assertEqual(MatchAtLineStart("a")._is_repeatable(), False) 51 | 52 | 53 | class TestMatchAtLineEnd(unittest.TestCase): 54 | 55 | def test_match_at_line_end(self): 56 | self.assertEqual(str(MatchAtLineEnd(TEST_STR)), f"{TEST_STR}$") 57 | 58 | def test_match_at_line_end_on_type(self): 59 | self.assertEqual(MatchAtLineEnd("a")._get_type(), _Type.Assertion) 60 | 61 | 62 | def test_match_at_line_end_on_quantifiability(self): 63 | self.assertEqual(MatchAtLineEnd("a")._is_repeatable(), False) 64 | 65 | 66 | class TestWordBoundary(unittest.TestCase): 67 | 68 | left_word_boundary = WordBoundary() + TEST_STR 69 | right_word_boundary = TEST_STR + WordBoundary() 70 | left_and_right_word_boundary = WordBoundary() + TEST_STR + WordBoundary() 71 | 72 | def test_word_boundary_on_pattern(self): 73 | self.assertEqual(str(WordBoundary()), "\\b") 74 | 75 | def test_word_boundary_on_matches(self): 76 | self.assertEqual((WordBoundary() + "a").get_matches("a ba -a"), ["a", "a"]) 77 | 78 | def test_word_boundary_on_type(self): 79 | self.assertEqual(WordBoundary()._get_type(), _Type.Assertion) 80 | 81 | def test_word_boundary_on_quantifiability(self): 82 | self.assertEqual(WordBoundary()._is_repeatable(), True) 83 | 84 | def test_left_word_boundary(self): 85 | self.assertEqual(str(self.left_word_boundary), f"\\b{TEST_STR}") 86 | 87 | def test_left_word_boundary_on_type(self): 88 | self.assertEqual(self.left_word_boundary._get_type(), _Type.Assertion) 89 | 90 | def test_left_word_boundary_on_quantifiability(self): 91 | self.assertEqual(self.left_word_boundary._is_repeatable(), True) 92 | 93 | def test_right_word_boundary(self): 94 | self.assertEqual(str(self.right_word_boundary), f"{TEST_STR}\\b") 95 | 96 | def test_right_word_boundary_on_type(self): 97 | 
self.assertEqual(self.right_word_boundary._get_type(), _Type.Assertion) 98 | 99 | def test_right_word_boundary_on_quantifiability(self): 100 | self.assertEqual(self.right_word_boundary._is_repeatable(), True) 101 | 102 | def test_left_and_right_word_boundary(self): 103 | self.assertEqual(str(self.left_and_right_word_boundary), f"\\b{TEST_STR}\\b") 104 | 105 | def test_left_and_right_word_boundary_on_type(self): 106 | self.assertEqual(self.left_and_right_word_boundary._get_type(), _Type.Assertion) 107 | 108 | def test_left_and_right_word_boundary_on_quantifiability(self): 109 | self.assertEqual(self.left_and_right_word_boundary._is_repeatable(), True) 110 | 111 | 112 | class TestNonWordBoundary(unittest.TestCase): 113 | 114 | left_non_word_boundary = NonWordBoundary() + TEST_STR 115 | right_non_word_boundary = TEST_STR + NonWordBoundary() 116 | left_and_right_non_word_boundary = NonWordBoundary() + TEST_STR + NonWordBoundary() 117 | 118 | def test_non_word_boundary_on_pattern(self): 119 | self.assertEqual(str(NonWordBoundary()), "\\B") 120 | 121 | def test_non_word_boundary_on_matches(self): 122 | self.assertEqual((NonWordBoundary() + "a").get_matches("a ba a"), ["a"]) 123 | 124 | def test_non_word_boundary_on_type(self): 125 | self.assertEqual(NonWordBoundary()._get_type(), _Type.Assertion) 126 | 127 | def test_non_word_boundary_on_quantifiability(self): 128 | self.assertEqual(NonWordBoundary()._is_repeatable(), True) 129 | 130 | def test_left_non_word_boundary(self): 131 | self.assertEqual(str(self.left_non_word_boundary), f"\\B{TEST_STR}") 132 | 133 | def test_left_non_word_boundary_on_type(self): 134 | self.assertEqual(self.left_non_word_boundary._get_type(), _Type.Assertion) 135 | 136 | def test_left_non_word_boundary_on_quantifiability(self): 137 | self.assertEqual(self.left_non_word_boundary._is_repeatable(), True) 138 | 139 | def test_right_non_word_boundary(self): 140 | self.assertEqual(str(self.right_non_word_boundary), f"{TEST_STR}\\B") 141 | 142 | def 
test_right_non_word_boundary_on_type(self): 143 | self.assertEqual(self.right_non_word_boundary._get_type(), _Type.Assertion) 144 | 145 | def test_right_non_word_boundary_on_quantifiability(self): 146 | self.assertEqual(self.right_non_word_boundary._is_repeatable(), True) 147 | 148 | def test_left_and_right_non_word_boundary(self): 149 | self.assertEqual(str(self.left_and_right_non_word_boundary), f"\\B{TEST_STR}\\B") 150 | 151 | def test_left_and_right_non_word_boundary_on_type(self): 152 | self.assertEqual(self.left_and_right_non_word_boundary._get_type(), _Type.Assertion) 153 | 154 | def test_left_and_right_non_word_boundary_on_quantifiability(self): 155 | self.assertEqual(self.left_and_right_non_word_boundary._is_repeatable(), True) 156 | 157 | 158 | class TestFollowedBy(unittest.TestCase): 159 | 160 | def test_followed_by(self): 161 | self.assertEqual(str(FollowedBy(pre1, pre2)), f"{pre1}(?={pre2})") 162 | 163 | def test_followed_by_on_multiple_patterns(self): 164 | self.assertEqual(str(FollowedBy(pre1, pre2, pre3)), f"{pre1}(?={pre2})(?={pre3})") 165 | 166 | def test_followed_by_on_type(self): 167 | self.assertEqual(FollowedBy("a", "b")._get_type(), _Type.Assertion) 168 | 169 | def test_followed_by_on_quantifiability(self): 170 | self.assertEqual(FollowedBy("a", "b")._is_repeatable(), False) 171 | 172 | def test_followed_by_on_empty_string_as_assertion_pattern(self): 173 | self.assertEqual(str(FollowedBy(pre1, Pregex())), f"{pre1}") 174 | 175 | def test_followed_by_on_not_enough_arguments_exception(self): 176 | self.assertRaises(NotEnoughArgumentsException, FollowedBy, pre1) 177 | 178 | 179 | class TestNotFollowedBy(unittest.TestCase): 180 | 181 | def test_not_followed_by(self): 182 | self.assertEqual(str(NotFollowedBy(pre1, pre2)), f"{pre1}(?!{pre2})") 183 | 184 | def test_not_followed_by_on_multiple_patterns(self): 185 | self.assertEqual(str(NotFollowedBy(pre1, pre2, pre3)), f"{pre1}(?!{pre2})(?!{pre3})") 186 | 187 | def test_not_followed_by_on_type(self): 
188 | self.assertEqual(NotFollowedBy("a", "b")._get_type(), _Type.Assertion) 189 | 190 | def test_not_followed_by_on_quantifiability(self): 191 | self.assertEqual(NotFollowedBy("a", "b")._is_repeatable(), True) 192 | 193 | def test_not_followed_by_on_not_enough_arguments_exception(self): 194 | self.assertRaises(NotEnoughArgumentsException, NotFollowedBy, pre1) 195 | 196 | def test_not_followed_by_on_empty_string_negative_assertion_exception(self): 197 | self.assertRaises(EmptyNegativeAssertionException, NotFollowedBy, pre1, Pregex()) 198 | 199 | def test_not_followed_by_on_multiple_patterns_empty_string_negative_assertion_exception(self): 200 | self.assertRaises(EmptyNegativeAssertionException, NotFollowedBy, pre1, pre2, Pregex()) 201 | 202 | 203 | class TestPrecededBy(unittest.TestCase): 204 | 205 | def test_preceded_by(self): 206 | self.assertEqual(str(PrecededBy(pre1, pre2)), f"(?<={pre2}){pre1}") 207 | 208 | def test_preceded_by_on_multiple_patterns(self): 209 | self.assertEqual(str(PrecededBy(pre1, pre2, pre3)), f"(?<={pre3})(?<={pre2}){pre1}") 210 | 211 | def test_preceded_by_on_type(self): 212 | self.assertEqual(PrecededBy("a", "b")._get_type(), _Type.Assertion) 213 | 214 | def test_preceded_by_on_quantifiability(self): 215 | self.assertEqual(PrecededBy("a", "b")._is_repeatable(), False) 216 | 217 | def test_preceded_by_on_quantifier(self): 218 | exactly = Exactly(pre2, 3) 219 | self.assertEqual(str(PrecededBy(pre1, exactly)), f"(?<={exactly}){pre1}") 220 | self.assertRaises(NonFixedWidthPatternException, PrecededBy, pre1, Optional(pre2)) 221 | 222 | def test_preceded_by_on_empty_string_as_assertion_pattern(self): 223 | self.assertEqual(str(PrecededBy(pre1, Pregex())), f"{pre1}") 224 | 225 | def test_preceded_by_on_not_enough_arguments_exception(self): 226 | self.assertRaises(NotEnoughArgumentsException, PrecededBy, pre1) 227 | 228 | 229 | class TestNotPrecededBy(unittest.TestCase): 230 | 231 | def test_not_preceded_by(self): 232 | 
self.assertEqual(str(NotPrecededBy(pre1, pre2)), f"(?{TEST_STR})") 70 | 71 | def test_named_capturing_group_on_literal(self): 72 | literal = Pregex(TEST_STR) 73 | self.assertEqual(str(Capture(literal, self.name)), f"(?P<{self.name}>{literal})") 74 | 75 | def test_named_capturing_group_on_capturing_group(self): 76 | ''' Name-grouping a capturing group without a name, names the group. ''' 77 | group = Capture(TEST_STR) 78 | self.assertEqual(str(Capture(group, self.name)), f"(?P<{self.name}>{str(group)[1:-1]})") 79 | 80 | def test_named_capturing_group_on_named_capturing_group(self): 81 | ''' Name-grouping a capturing group with name, changes the group's name. ''' 82 | group = Capture(TEST_STR, self.name) 83 | new_name = "NEW_NAME" 84 | self.assertEqual(str(Capture(group, new_name)), str(group).replace(self.name, new_name)) 85 | 86 | def test_named_capturing_group_on_non_capturing_group(self): 87 | ''' Name-Grouping a non-capturing group converts it to a named capturing group. ''' 88 | group = Group(TEST_STR) 89 | self.assertEqual(str(Capture(group, self.name)), f"(?P<{self.name}>{str(group)[:-1].replace('(?:', '', 1)})") 90 | 91 | def test_named_capturing_group_on_invalid_argument_type_exception(self): 92 | invalid_type_names = [1, 1.5, True, Pregex("z")] 93 | for name in invalid_type_names: 94 | self.assertRaises(InvalidArgumentTypeException, Capture, "test", name) 95 | 96 | def test_named_capturing_group_on_invalid_name_exception(self): 97 | invalid_names = ["11zzz", "ald!!", "@%^Fl", "!flflf123", "dld-"] 98 | for name in invalid_names: 99 | self.assertRaises(InvalidCapturingGroupNameException, Capture, "test", name) 100 | 101 | 102 | class TestGroup(unittest.TestCase): 103 | 104 | def test_group_on_str(self): 105 | self.assertEqual(str(Group(TEST_STR)), f"(?:{TEST_STR})") 106 | 107 | def test_group_on_type(self): 108 | self.assertEqual(Group("a")._get_type(), _Type.Group) 109 | self.assertNotEqual((Group("a") + Group("b"))._get_type(), _Type.Group) 110 | 111 | def 
test_group_on_pregex(self): 112 | pregex = Pregex(TEST_STR) 113 | self.assertEqual(str(Group(pregex)), f"(?:{pregex})") 114 | 115 | def test_group_on_is_case_insensitive(self): 116 | self.assertEqual(str(Group(TEST_STR, is_case_insensitive=True)), f"(?i:{TEST_STR})") 117 | 118 | def test_group_on_capturing_group(self): 119 | group = Capture(TEST_STR) 120 | self.assertEqual(str(Group(group)), f"(?:{TEST_STR})") 121 | 122 | def test_group_on_flag_reset(self): 123 | flag_group = Group(TEST_STR, is_case_insensitive=True) 124 | self.assertEqual(str(Group(flag_group)), f"(?:{TEST_STR})") 125 | 126 | def test_group_on_concat_of_capturing_groups(self): 127 | pre = Capture('a') + 'b' + Capture('c') 128 | self.assertEqual(str(Group(pre)), f"(?:{pre})") 129 | 130 | def test_group_on_backslash_group(self): 131 | group = Capture(Backslash()) 132 | self.assertEqual(str(Group(group)),f"{str(group).replace('(', '(?:')}") 133 | 134 | def test_group_on_concat_of_capturing_groups_starting_with_backslash_group(self): 135 | pre = Capture(Backslash()) + "b" + Capture("c") 136 | self.assertEqual(str(Group(pre)), f"(?:{pre})") 137 | 138 | def test_group_on_concat_of_capturing_groups_ending_with_backslash_group(self): 139 | pre = Capture("a") + "b" + Capture(Backslash()) 140 | self.assertEqual(str(Group(pre)), f"(?:{pre})") 141 | 142 | def test_group_on_capturing_group_of_concat_of_capturing_groups(self): 143 | group = Capture(Capture("a") + "b" + Capture("c")) 144 | self.assertEqual(str(Group(group)), f"{str(group).replace('(', '(?:', 1)}") 145 | 146 | def test_group_on_non_capturing_group(self): 147 | ''' Applying 'Group' on a non-capturing group does nothing. 
''' 148 | group = Group(TEST_STR) 149 | self.assertEqual(str(Group(group)), f"{group}") 150 | 151 | def test_group_on_concat_of_non_capturing_groups(self): 152 | pre = Group("a") + "b" + Group("c") 153 | self.assertEqual(str(Group(pre)), f"(?:{pre})") 154 | 155 | def test_group_on_non_capturing_group_of_concat_of_non_capturing_groups(self): 156 | group = Group(Group("a") + "b" + Group("c")) 157 | self.assertEqual(str(Group(group)), f"{group}") 158 | 159 | def test_group_on_named_capturing_group(self): 160 | ''' Applying 'Group' on a named capturing group converts it into a non-capturing group. ''' 161 | name = "NAME" 162 | group = Capture(TEST_STR, name) 163 | self.assertEqual(str(Group(group)), f"(?:{TEST_STR})") 164 | 165 | 166 | class TestBackreference(unittest.TestCase): 167 | 168 | def test_backreference_int(self): 169 | ref = 1 170 | self.assertEqual(str(Backreference(ref)), f"\\{ref}") 171 | 172 | def test_backreference_str(self): 173 | ref = "name" 174 | self.assertEqual(str(Backreference(ref)), f"(?P={ref})") 175 | 176 | def test_backreference_on_type(self): 177 | self.assertEqual(Backreference("a")._get_type(), _Type.Group) 178 | 179 | def test_backreference_on_invalid_argument_type_exception(self): 180 | invalid_type_names = [1.5, True, Pregex("z")] 181 | for name in invalid_type_names: 182 | self.assertRaises(InvalidArgumentTypeException, Backreference, name) 183 | 184 | def test_backreference_on_invalid_argument_value_exception(self): 185 | ref1, ref2 = 0, 100 186 | self.assertRaises(InvalidArgumentValueException, Backreference, ref1) 187 | self.assertRaises(InvalidArgumentValueException, Backreference, ref2) 188 | 189 | def test_backreference_on_invalid_name_exception(self): 190 | invalid_names = ["11zzz", "ald!!", "@%^Fl", "!flflf123", "dld-"] 191 | for name in invalid_names: 192 | with self.assertRaises(InvalidCapturingGroupNameException): 193 | _ = Backreference(name) 194 | 195 | def test_backreference_pattern(self): 196 | name = "name" 197 | pre:
Pregex = Pregex(f"(?P<{name}>a|b)", escape=False) + Backreference(name) 198 | self.assertTrue(pre.is_exact_match("aa")) 199 | self.assertTrue(pre.is_exact_match("bb")) 200 | self.assertFalse(pre.is_exact_match("ab")) 201 | 202 | 203 | class TestConditional(unittest.TestCase): 204 | 205 | name = "name" 206 | then_pre = Pregex("then") 207 | else_pre = Pregex("else") 208 | 209 | def test_conditional(self): 210 | self.assertEqual(str(Conditional(self.name, self.then_pre)), f"(?({self.name}){self.then_pre})") 211 | 212 | def test_conditional_on_type(self): 213 | self.assertEqual(Conditional("a", "b")._get_type(), _Type.Group) 214 | 215 | def test_conditional_with_else_pre(self): 216 | self.assertEqual(str(Conditional(self.name, self.then_pre, self.else_pre)), 217 | f"(?({self.name}){self.then_pre}|{self.else_pre})") 218 | 219 | def test_conditional_on_invalid_argument_type_exception(self): 220 | invalid_type_names = [1, 1.5, True, Pregex("z")] 221 | for name in invalid_type_names: 222 | self.assertRaises(InvalidArgumentTypeException, Conditional, name, self.then_pre) 223 | 224 | def test_conditional_on_invalid_name_exception(self): 225 | invalid_names = ["11zzz", "ald!!", "@%^Fl", "!flflf123", "dld-"] 226 | for name in invalid_names: 227 | with self.assertRaises(InvalidCapturingGroupNameException): 228 | _ = Conditional(name, self.then_pre) 229 | 230 | def test_conditional_pattern(self): 231 | pre: Pregex = Pregex(f"(?P<{self.name}>A)", escape=False) + Conditional(self.name, "B") 232 | self.assertTrue(pre.is_exact_match("AB")) 233 | 234 | def test_conditional_pattern_with_else(self): 235 | pre: Pregex = Pregex(f"(?P<{self.name}>A)?", escape=False) + Conditional(self.name, "B", "C") 236 | self.assertTrue(pre.is_exact_match("AB")) 237 | self.assertTrue(pre.is_exact_match("C")) 238 | 239 | 240 | if __name__=="__main__": 241 | unittest.main() -------------------------------------------------------------------------------- /tests/test_core_operators.py: 
-------------------------------------------------------------------------------- 1 | import unittest 2 | from pregex.core.operators import * 3 | from pregex.core.quantifiers import Exactly 4 | from pregex.core.pre import Pregex, _Type 5 | from pregex.core.classes import AnyLowercaseLetter 6 | from pregex.core.assertions import FollowedBy, MatchAtStart 7 | from pregex.core.exceptions import NotEnoughArgumentsException 8 | 9 | 10 | TEST_STR_1 = "test1" 11 | TEST_STR_2 = "test2" 12 | TEST_STR_3 = "test3" 13 | 14 | 15 | class TestConcat(unittest.TestCase): 16 | 17 | def test_concat_class_type(self): 18 | self.assertEqual(Concat("a", "b")._get_type(), _Type.Other) 19 | 20 | def test_concat_on_pattern(self): 21 | self.assertEqual(str(Concat(TEST_STR_1, TEST_STR_2)), f"{TEST_STR_1}{TEST_STR_2}") 22 | self.assertEqual(str(Concat(Pregex(TEST_STR_1), Pregex(TEST_STR_2))), f"{TEST_STR_1}{TEST_STR_2}") 23 | 24 | def test_concat_on_multiple_pattern(self): 25 | self.assertEqual(str(Concat(TEST_STR_1, TEST_STR_2, TEST_STR_3)), 26 | f"{TEST_STR_1}{TEST_STR_2}{TEST_STR_3}") 27 | 28 | def test_concat_on_quantifier(self): 29 | quantifier = Exactly(TEST_STR_1, 2) 30 | self.assertEqual(str(Concat(quantifier, TEST_STR_2)), f"{quantifier}{TEST_STR_2}") 31 | 32 | def test_concat_on_concat(self): 33 | concat = Concat(TEST_STR_1, TEST_STR_2) 34 | self.assertEqual(str(Concat(concat, TEST_STR_3)), f"{concat}{TEST_STR_3}") 35 | 36 | def test_concat_on_either(self): 37 | either = Either(TEST_STR_1, TEST_STR_2) 38 | self.assertEqual(str(Concat(either, TEST_STR_3)), f"(?:{either}){TEST_STR_3}") 39 | 40 | def test_concat_on_class(self): 41 | any_ll = AnyLowercaseLetter() 42 | self.assertEqual(str(Concat(any_ll, TEST_STR_3)), f"{any_ll}{TEST_STR_3}") 43 | 44 | def test_concat_on_anchor_assertion(self): 45 | mat = MatchAtStart("a") 46 | self.assertEqual(str(Concat(mat, TEST_STR_1)), f"{mat}{TEST_STR_1}") 47 | 48 | def test_concat_on_lookaround_assertion(self): 49 | followed_by = FollowedBy("a", "b") 
50 | self.assertEqual(str(Concat(followed_by, TEST_STR_1)), f"{followed_by}{TEST_STR_1}") 51 | 52 | def test_concat_on_a_single_pattern(self): 53 | self.assertEqual(str(Concat(TEST_STR_1)), f"{TEST_STR_1}") 54 | 55 | def test_concat_on_no_patterns(self): 56 | self.assertEqual(str(Concat()), '') 57 | 58 | def test_concat_on_empty_string(self): 59 | self.assertEqual(str(Concat(TEST_STR_1, Pregex())), TEST_STR_1) 60 | 61 | 62 | class TestEither(unittest.TestCase): 63 | 64 | def test_either_class_type(self): 65 | self.assertEqual(Either("a", "b")._get_type(), _Type.Alternation) 66 | self.assertEqual(Either("a", "|", "b")._get_type(), _Type.Alternation) 67 | self.assertNotEqual(("a" + Either("a", "b"))._get_type(), _Type.Alternation) 68 | self.assertNotEqual(("a|" + Either("a", "b"))._get_type(), _Type.Alternation) 69 | self.assertNotEqual((Either("a", "b") + "b")._get_type(), _Type.Alternation) 70 | self.assertNotEqual((Either("a", "b") + "|b")._get_type(), _Type.Alternation) 71 | self.assertNotEqual(("a" + Either("a", "b") + "b")._get_type(), _Type.Alternation) 72 | self.assertNotEqual(("a|" + Either("a", "b") + "|b")._get_type(), _Type.Alternation) 73 | 74 | def test_either_on_pattern(self): 75 | self.assertEqual(str(Either(TEST_STR_1, TEST_STR_2)), f"{TEST_STR_1}|{TEST_STR_2}") 76 | self.assertEqual(str(Either(Pregex(TEST_STR_1), Pregex(TEST_STR_2))), f"{TEST_STR_1}|{TEST_STR_2}") 77 | 78 | def test_either_on_multiple_pattern(self): 79 | self.assertEqual(str(Either(TEST_STR_1, TEST_STR_2, TEST_STR_3)), 80 | f"{TEST_STR_1}|{TEST_STR_2}|{TEST_STR_3}") 81 | 82 | def test_either_on_quantifier(self): 83 | quantifier = Exactly(TEST_STR_1, 2) 84 | self.assertEqual(str(Either(quantifier, TEST_STR_2)), f"{quantifier}|{TEST_STR_2}") 85 | 86 | def test_either_for_concat(self): 87 | concat = Concat(TEST_STR_1, TEST_STR_2) 88 | self.assertEqual(str(Either(concat, TEST_STR_3)), f"{concat}|{TEST_STR_3}") 89 | 90 | def test_either_on_either(self): 91 | either = Either(TEST_STR_1, 
TEST_STR_2) 92 | self.assertEqual(str(Either(either, TEST_STR_3)), f"{either}|{TEST_STR_3}") 93 | 94 | def test_either_on_class(self): 95 | any_ll = AnyLowercaseLetter() 96 | self.assertEqual(str(Either(any_ll, TEST_STR_3)), f"{any_ll}|{TEST_STR_3}") 97 | 98 | def test_either_on_a_single_pattern(self): 99 | self.assertEqual(str(Either(TEST_STR_1)), f"{TEST_STR_1}") 100 | 101 | def test_either_on_no_patterns(self): 102 | self.assertEqual(str(Either()), '') 103 | 104 | def test_either_on_empty_string(self): 105 | self.assertEqual(str(Either(TEST_STR_1, Pregex(), TEST_STR_2)), f"{TEST_STR_1}|{TEST_STR_2}") 106 | 107 | 108 | class TestEnclose(unittest.TestCase): 109 | 110 | def test_enclose_class_type(self): 111 | self.assertEqual(Enclose("a", "b")._get_type(), _Type.Other) 112 | 113 | def test_enclose_on_pattern(self): 114 | self.assertEqual(str(Enclose(TEST_STR_1, TEST_STR_2)), f"{TEST_STR_2}{TEST_STR_1}{TEST_STR_2}") 115 | self.assertEqual(str(Enclose(Pregex(TEST_STR_1), Pregex(TEST_STR_2))), f"{TEST_STR_2}{TEST_STR_1}{TEST_STR_2}") 116 | 117 | def test_enclose_on_multiple_patterns(self): 118 | self.assertEqual(str(Enclose(TEST_STR_1, TEST_STR_2, TEST_STR_3)), 119 | f"{TEST_STR_3}{TEST_STR_2}{TEST_STR_1}{TEST_STR_2}{TEST_STR_3}") 120 | 121 | def test_enclose_on_no_enclosing_patterns(self): 122 | self.assertEqual(str(Enclose(TEST_STR_1)), f"{TEST_STR_1}") 123 | 124 | def test_enclose_on_empty_string(self): 125 | self.assertEqual(str(Enclose(TEST_STR_1, Pregex())), f"{TEST_STR_1}") 126 | 127 | 128 | if __name__=="__main__": 129 | unittest.main() -------------------------------------------------------------------------------- /tests/test_core_quantifiers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pregex.core.quantifiers import * 3 | from pregex.core.pre import Pregex, _Type 4 | from pregex.core.assertions import MatchAtStart 5 | from pregex.core.operators import Concat, Either 6 | from 
pregex.core.classes import AnyLowercaseLetter 7 | from pregex.core.exceptions import InvalidArgumentTypeException, \ 8 | InvalidArgumentValueException, CannotBeRepeatedException 9 | 10 | 11 | TEST_STR_LEN_1 = "t" 12 | TEST_STR_LEN_N = "test" 13 | TEST_LITERAL_LEN_1 = Pregex(TEST_STR_LEN_1) 14 | TEST_LITERAL_LEN_N = Pregex(TEST_STR_LEN_N) 15 | 16 | 17 | class Test__Quantifier(unittest.TestCase): 18 | 19 | def test_quantifier_on_str(self): 20 | self.assertEqual(str(Optional(TEST_STR_LEN_N)), f"(?:{TEST_STR_LEN_N})?") 21 | 22 | def test_quantifier_on_literal(self): 23 | self.assertEqual(str(Optional(TEST_LITERAL_LEN_N)), f"(?:{TEST_LITERAL_LEN_N})?") 24 | 25 | def test_quantifier_on_concat(self): 26 | concat = Concat(TEST_STR_LEN_1, TEST_STR_LEN_N) 27 | self.assertEqual(str(Optional(concat)), f"(?:{concat})?") 28 | 29 | def test_quantifier_on_either(self): 30 | either = Either(TEST_STR_LEN_1, TEST_STR_LEN_N) 31 | self.assertEqual(str(Optional(either)), f"(?:{either})?") 32 | 33 | def test_quantifier_on_class(self): 34 | any_ll = AnyLowercaseLetter() 35 | self.assertEqual(str(Optional(any_ll)), f"{any_ll}?") 36 | 37 | def test_quantifier_on_quantifier(self): 38 | optional = Optional(TEST_STR_LEN_N) 39 | self.assertEqual(str(Optional(optional)), f"(?:{optional})?") 40 | 41 | 42 | class TestOptional(unittest.TestCase): 43 | 44 | def test_optional_on_len_1_str(self): 45 | self.assertEqual(str(Optional(TEST_STR_LEN_1)), f"{TEST_STR_LEN_1}?") 46 | 47 | def test_optional_on_len_n_str(self): 48 | self.assertEqual(str(Optional(TEST_STR_LEN_N)), f"(?:{TEST_STR_LEN_N})?") 49 | 50 | def test_optional_on_len_1_literal(self): 51 | self.assertEqual(str(Optional(TEST_LITERAL_LEN_1)), f"{TEST_STR_LEN_1}?") 52 | 53 | def test_optional_on_len_n_literal(self): 54 | self.assertEqual(str(Optional(TEST_LITERAL_LEN_N)), f"(?:{TEST_STR_LEN_N})?") 55 | 56 | def test_optional_on_laziness(self): 57 | self.assertEqual(str(Optional(TEST_LITERAL_LEN_N, is_greedy=False)), f"(?:{TEST_STR_LEN_N})??") 
58 | 59 | def test_optional_on_type(self): 60 | self.assertEqual(Optional("a")._get_type(), _Type.Quantifier) 61 | self.assertEqual(Optional("abc")._get_type(), _Type.Quantifier) 62 | self.assertNotEqual(Pregex("abc?", escape=False)._get_type(), _Type.Quantifier) 63 | 64 | def test_optional_on_match(self): 65 | self.assertTrue(("a" + Optional("a") + "a").get_matches("aaa") == ["aaa"]) 66 | self.assertTrue(("a" + Optional("a") + "a").get_matches("aa") == ["aa"]) 67 | 68 | def test_optional_on_lazy_match(self): 69 | self.assertTrue(("a" + Optional("a", is_greedy=False) + "a").get_matches("aaa") == ["aa"]) 70 | 71 | def test_optional_on_non_repeatable_pattern(self): 72 | self.assertEqual(str(Optional(MatchAtStart("a"))), "(?:\\Aa)?") 73 | 74 | 75 | class TestIndefinite(unittest.TestCase): 76 | 77 | def test_indefinite_on_len_1_str(self): 78 | self.assertEqual(str(Indefinite(TEST_STR_LEN_1)), f"{TEST_STR_LEN_1}*") 79 | 80 | def test_indefinite_on_len_n_str(self): 81 | self.assertEqual(str(Indefinite(TEST_STR_LEN_N)), f"(?:{TEST_STR_LEN_N})*") 82 | 83 | def test_indefinite_on_len_1_literal(self): 84 | self.assertEqual(str(Indefinite(TEST_LITERAL_LEN_1)), f"{TEST_STR_LEN_1}*") 85 | 86 | def test_indefinite_on_len_n_literal(self): 87 | self.assertEqual(str(Indefinite(TEST_LITERAL_LEN_N)), f"(?:{TEST_STR_LEN_N})*") 88 | 89 | def test_indefinite_on_laziness(self): 90 | self.assertEqual(str(Indefinite(TEST_LITERAL_LEN_N, is_greedy=False)), f"(?:{TEST_STR_LEN_N})*?") 91 | 92 | def test_indefinite_on_type(self): 93 | self.assertEqual(Indefinite("a")._get_type(), _Type.Quantifier) 94 | self.assertEqual(Indefinite("abc")._get_type(), _Type.Quantifier) 95 | self.assertNotEqual(Pregex("abc*", escape=False)._get_type(), _Type.Quantifier) 96 | 97 | def test_indefinite_on_non_repeatable_pattern(self): 98 | mat = MatchAtStart("a") 99 | self.assertRaises(CannotBeRepeatedException, Indefinite, mat) 100 | 101 | 102 | class TestOneOrMore(unittest.TestCase): 103 | 104 | def 
test_one_or_more_on_len_1_str(self): 105 | self.assertEqual(str(OneOrMore(TEST_STR_LEN_1)), f"{TEST_STR_LEN_1}+") 106 | 107 | def test_one_or_more_on_len_n_str(self): 108 | self.assertEqual(str(OneOrMore(TEST_STR_LEN_N)), f"(?:{TEST_STR_LEN_N})+") 109 | 110 | def test_one_or_more_on_len_1_literal(self): 111 | self.assertEqual(str(OneOrMore(TEST_LITERAL_LEN_1)), f"{TEST_STR_LEN_1}+") 112 | 113 | def test_one_or_more_on_len_n_literal(self): 114 | self.assertEqual(str(OneOrMore(TEST_LITERAL_LEN_N)), f"(?:{TEST_STR_LEN_N})+") 115 | 116 | def test_one_or_more_on_laziness(self): 117 | self.assertEqual(str(OneOrMore(TEST_LITERAL_LEN_N, is_greedy=False)), f"(?:{TEST_STR_LEN_N})+?") 118 | 119 | def test_one_or_more_on_type(self): 120 | self.assertEqual(OneOrMore("a")._get_type(), _Type.Quantifier) 121 | self.assertEqual(OneOrMore("abc")._get_type(), _Type.Quantifier) 122 | self.assertNotEqual(Pregex("abc+", escape=False)._get_type(), _Type.Quantifier) 123 | 124 | def test_one_or_more_on_non_repeatable_pattern(self): 125 | mat = MatchAtStart("a") 126 | self.assertRaises(CannotBeRepeatedException, OneOrMore, mat) 127 | 128 | 129 | class TestExactly(unittest.TestCase): 130 | 131 | VALID_VALUES = [2, 10] 132 | 133 | def test_exactly_on_len_1_str(self): 134 | for val in self.VALID_VALUES: 135 | self.assertEqual(str(Exactly(TEST_STR_LEN_1, val)), f"{TEST_STR_LEN_1}{{{val}}}") 136 | 137 | def test_exactly_on_len_n_str(self): 138 | for val in self.VALID_VALUES: 139 | self.assertEqual(str(Exactly(TEST_STR_LEN_N, val)), f"(?:{TEST_STR_LEN_N}){{{val}}}") 140 | 141 | def test_exactly_on_len_1_literal(self): 142 | for val in self.VALID_VALUES: 143 | self.assertEqual(str(Exactly(TEST_LITERAL_LEN_1, val)), f"{TEST_LITERAL_LEN_1}{{{val}}}") 144 | 145 | def test_exactly_on_len_n_literal(self): 146 | for val in self.VALID_VALUES: 147 | self.assertEqual(str(Exactly(TEST_LITERAL_LEN_N, val)), f"(?:{TEST_LITERAL_LEN_N}){{{val}}}") 148 | 149 | def test_exactly_on_value_1(self): 150 | 
self.assertEqual(str(Exactly(TEST_LITERAL_LEN_N, 1)), f"{TEST_LITERAL_LEN_N}") 151 | 152 | def test_exactly_on_value_0(self): 153 | self.assertEqual(str(Exactly(TEST_LITERAL_LEN_N, 0)), "") 154 | 155 | def test_exactly_on_type(self): 156 | self.assertEqual(Exactly("a", n=2)._get_type(), _Type.Quantifier) 157 | self.assertEqual(Exactly("abc", n=2)._get_type(), _Type.Quantifier) 158 | self.assertNotEqual(Pregex("abc{2}", escape=False)._get_type(), _Type.Quantifier) 159 | 160 | def test_exactly_on_invalid_argument_type_exception(self): 161 | for val in ["s", 1.1, True]: 162 | self.assertRaises(InvalidArgumentTypeException, Exactly, TEST_STR_LEN_1, val) 163 | 164 | def test_exactly_on_invalid_argument_value_exception(self): 165 | for val in [-10, -1]: 166 | self.assertRaises(InvalidArgumentValueException, Exactly, TEST_STR_LEN_1, val) 167 | 168 | def test_exactly_on_non_repeatable_pattern(self): 169 | mat = MatchAtStart("a") 170 | self.assertRaises(CannotBeRepeatedException, Exactly, mat, n=2) 171 | self.assertEqual(str(Exactly(mat, 1)), str(mat)) 172 | 173 | 174 | class TestAtLeast(unittest.TestCase): 175 | 176 | VALID_VALUES = [2, 10] 177 | 178 | def test_at_least_on_len_1_str(self): 179 | for val in self.VALID_VALUES: 180 | self.assertEqual(str(AtLeast(TEST_STR_LEN_1, val)), f"{TEST_STR_LEN_1}{{{val},}}") 181 | 182 | def test_at_least_on_len_n_str(self): 183 | for val in self.VALID_VALUES: 184 | self.assertEqual(str(AtLeast(TEST_STR_LEN_N, val)), f"(?:{TEST_STR_LEN_N}){{{val},}}") 185 | 186 | def test_at_least_on_len_1_literal(self): 187 | for val in self.VALID_VALUES: 188 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_1, val)), f"{TEST_LITERAL_LEN_1}{{{val},}}") 189 | 190 | def test_at_least_on_len_n_literal(self): 191 | for val in self.VALID_VALUES: 192 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_N, val)), f"(?:{TEST_LITERAL_LEN_N}){{{val},}}") 193 | 194 | def test_at_least_on_value_0(self): 195 | val = 0 196 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_N, 
val)), f"(?:{TEST_LITERAL_LEN_N})*") 197 | 198 | def test_at_least_on_value_1(self): 199 | val = 1 200 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_N, val)), f"(?:{TEST_LITERAL_LEN_N})+") 201 | 202 | def test_at_least_on_laziness(self): 203 | val = 3 204 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_N, val, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N}){{{val},}}?") 205 | 206 | def test_at_least_on_lazy_value_0(self): 207 | val = 0 208 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_N, val, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N})*?") 209 | 210 | def test_at_least_on_lazy_value_1(self): 211 | val = 1 212 | self.assertEqual(str(AtLeast(TEST_LITERAL_LEN_N, val, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N})+?") 213 | 214 | def test_at_least_on_type(self): 215 | self.assertEqual(AtLeast("a", n=2)._get_type(), _Type.Quantifier) 216 | self.assertEqual(AtLeast("abc", n=2)._get_type(), _Type.Quantifier) 217 | self.assertNotEqual(Pregex("abc{2,}", escape=False)._get_type(), _Type.Quantifier) 218 | 219 | def test_at_least_on_invalid_argument_type_exception(self): 220 | for val in ["s", 1.1, True]: 221 | self.assertRaises(InvalidArgumentTypeException, AtLeast, TEST_STR_LEN_1, val) 222 | 223 | def test_at_least_on_invalid_argument_value_exception(self): 224 | for val in [-10, -1]: 225 | self.assertRaises(InvalidArgumentValueException, AtLeast, TEST_STR_LEN_1, val) 226 | 227 | def test_at_least_at_on_non_repeatable_pattern(self): 228 | mat = MatchAtStart("a") 229 | self.assertRaises(CannotBeRepeatedException, AtLeast, mat, n=5) 230 | 231 | 232 | class TestAtMost(unittest.TestCase): 233 | 234 | VALID_VALUES = [2, 10] 235 | 236 | def test_at_most_on_len_1_str(self): 237 | for val in self.VALID_VALUES: 238 | self.assertEqual(str(AtMost(TEST_STR_LEN_1, val)), f"{TEST_STR_LEN_1}{{,{val}}}") 239 | 240 | def test_at_most_on_len_n_str(self): 241 | for val in self.VALID_VALUES: 242 | self.assertEqual(str(AtMost(TEST_STR_LEN_N, val)), f"(?:{TEST_STR_LEN_N}){{,{val}}}") 243 | 244 
| def test_at_most_on_len_1_literal(self): 245 | for val in self.VALID_VALUES: 246 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_1, val)), f"{TEST_LITERAL_LEN_1}{{,{val}}}") 247 | 248 | def test_at_most_on_len_n_literal(self): 249 | for val in self.VALID_VALUES: 250 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val)), f"(?:{TEST_LITERAL_LEN_N}){{,{val}}}") 251 | 252 | def test_at_most_on_value_0(self): 253 | val = 0 254 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val)), "") 255 | 256 | def test_at_most_on_value_1(self): 257 | val = 1 258 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val)), f"(?:{TEST_LITERAL_LEN_N})?") 259 | 260 | def test_at_most_on_value_None(self): 261 | val = None 262 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val)), f"(?:{TEST_LITERAL_LEN_N})*") 263 | 264 | def test_at_most_on_laziness(self): 265 | val = 3 266 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N}){{,{val}}}?") 267 | 268 | def test_at_most_on_lazy_value_1(self): 269 | val = 1 270 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N})??") 271 | 272 | def test_at_most_on_lazy_value_None(self): 273 | val = None 274 | self.assertEqual(str(AtMost(TEST_LITERAL_LEN_N, val, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N})*?") 275 | 276 | def test_at_most_on_type(self): 277 | self.assertEqual(AtMost("a", n=2)._get_type(), _Type.Quantifier) 278 | self.assertEqual(AtMost("abc", n=2)._get_type(), _Type.Quantifier) 279 | self.assertNotEqual(Pregex("abc{,2}", escape=False)._get_type(), _Type.Quantifier) 280 | 281 | def test_at_most_on_invalid_argument_type_exception(self): 282 | for val in ["s", 1.1, True]: 283 | self.assertRaises(InvalidArgumentTypeException, AtMost, TEST_STR_LEN_1, val) 284 | 285 | def test_at_most_on_invalid_argument_value_exception(self): 286 | for val in [-10, -1]: 287 | self.assertRaises(InvalidArgumentValueException, AtMost, TEST_STR_LEN_1, val) 288 | 289 | 
def test_at_most_on_non_repeatable_pattern(self): 290 | mat = MatchAtStart("a") 291 | self.assertRaises(CannotBeRepeatedException, AtMost, mat, n=2) 292 | self.assertEqual(str(AtMost(mat, 1)), f"(?:{mat})?") 293 | 294 | 295 | class TestAtLeastAtMost(unittest.TestCase): 296 | 297 | VALID_VALUES = [(2, 3), (10, 20)] 298 | 299 | def test_at_least_at_most_on_len_1_str(self): 300 | for min, max in self.VALID_VALUES: 301 | self.assertEqual(str(AtLeastAtMost(TEST_STR_LEN_1, min, max)), f"{TEST_STR_LEN_1}{{{min},{max}}}") 302 | 303 | def test_at_least_at_most_on_len_n_str(self): 304 | for min, max in self.VALID_VALUES: 305 | self.assertEqual(str(AtLeastAtMost(TEST_STR_LEN_N, min, max)), f"(?:{TEST_STR_LEN_N}){{{min},{max}}}") 306 | 307 | def test_at_least_at_most_on_len_1_literal(self): 308 | for min, max in self.VALID_VALUES: 309 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_1, min, max)), f"{TEST_LITERAL_LEN_1}{{{min},{max}}}") 310 | 311 | def test_at_least_at_most_on_len_n_literal(self): 312 | for min, max in self.VALID_VALUES: 313 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N}){{{min},{max}}}") 314 | 315 | def test_at_least_at_most_on_min_equal_to_max_equal_to_zero(self): 316 | min, max = 0, 0 317 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), "") 318 | 319 | def test_at_least_at_most_on_min_equal_to_zero_max_equal_to_1(self): 320 | min, max = 0, 1 321 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N})?") 322 | 323 | def test_at_least_at_most_on_min_equal_to_zero_max_greater_than_1(self): 324 | min, max = 0, 2 325 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N}){{,{max}}}") 326 | 327 | def test_at_least_at_most_on_min_equal_to_zero_max_equal_to_None(self): 328 | min, max = 0, None 329 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N})*") 330 | 331 | def 
test_at_least_at_most_on_min_equal_to_max_equal_to_one(self): 332 | min, max = 1, 1 333 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), str(TEST_LITERAL_LEN_N)) 334 | 335 | def test_at_least_at_most_on_min_equal_to_one_max_equal_to_None(self): 336 | min, max = 1, None 337 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N})+") 338 | 339 | def test_at_least_at_most_on_min_equal_to_max(self): 340 | min, max = 2, 2 341 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N}){{{min}}}") 342 | 343 | def test_at_least_at_most_on_min_equal_to_two_max_equal_to_None(self): 344 | min, max = 2, None 345 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max)), f"(?:{TEST_LITERAL_LEN_N}){{{min},}}") 346 | 347 | def test_at_least_at_most_on_laziness(self): 348 | min, max = 3, 5 349 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), 350 | f"(?:{TEST_LITERAL_LEN_N}){{{min},{max}}}?") 351 | 352 | def test_at_least_at_most_on_lazy_min_equal_to_zero_max_equal_to_1(self): 353 | min, max = 0, 1 354 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N})??") 355 | 356 | def test_at_least_at_most_on_lazy_min_equal_to_zero_max_greater_than_1(self): 357 | min, max = 0, 2 358 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N}){{,{max}}}?") 359 | 360 | def test_at_least_at_most_on_lazy_min_equal_to_zero_max_equal_to_None(self): 361 | min, max = 0, None 362 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), 363 | f"(?:{TEST_LITERAL_LEN_N})*?") 364 | 365 | def test_at_least_at_most_on_lazy_min_equal_to_one_max_equal_to_None(self): 366 | min, max = 1, None 367 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), 368 | f"(?:{TEST_LITERAL_LEN_N})+?") 369 | 370 | def 
test_at_least_at_most_on_lazy_min_equal_to_two_max_equal_to_None(self): 371 | min, max = 2, None 372 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), 373 | f"(?:{TEST_LITERAL_LEN_N}){{{min},}}?") 374 | 375 | def test_at_least_at_most_on_lazy_min_equal_to_max(self): 376 | min, max = 2, 2 377 | self.assertEqual(str(AtLeastAtMost(TEST_LITERAL_LEN_N, min, max, is_greedy=False)), f"(?:{TEST_LITERAL_LEN_N}){{{min}}}") 378 | 379 | def test_at_least_at_most_on_type(self): 380 | self.assertEqual(AtLeastAtMost("a", n=1, m=2)._get_type(), _Type.Quantifier) 381 | self.assertEqual(AtLeastAtMost("abc", n=1, m=2)._get_type(), _Type.Quantifier) 382 | self.assertNotEqual(Pregex("abc{1,2}", escape=False)._get_type(), _Type.Quantifier) 383 | 384 | def test_at_least_at_most_on_invalid_argument_type_exception(self): 385 | for val in ["s", 1.1, True]: 386 | self.assertRaises(InvalidArgumentTypeException, AtLeastAtMost, TEST_STR_LEN_1, n=val, m=10) 387 | self.assertRaises(InvalidArgumentTypeException, AtLeastAtMost, TEST_STR_LEN_1, n=2, m=val) 388 | 389 | def test_at_least_at_most_on_invalid_argument_value_exception(self): 390 | self.assertRaises(InvalidArgumentValueException, AtLeastAtMost, TEST_STR_LEN_1, n=-1, m=1) 391 | self.assertRaises(InvalidArgumentValueException, AtLeastAtMost, TEST_STR_LEN_1, n=1, m=-1) 392 | self.assertRaises(InvalidArgumentValueException, AtLeastAtMost, TEST_STR_LEN_1, n=5, m=3) 393 | 394 | def test_at_least_at_most_on_non_repeatable_pattern(self): 395 | mat = MatchAtStart("a") 396 | self.assertRaises(CannotBeRepeatedException, AtLeastAtMost, mat, n=2, m=3) 397 | self.assertEqual(str(AtLeastAtMost(mat, n=0, m=1)), f"(?:{mat})?") 398 | self.assertEqual(str(AtLeastAtMost(mat, n=1, m=1)), str(mat)) 399 | 400 | 401 | if __name__=="__main__": 402 | unittest.main() -------------------------------------------------------------------------------- /tests/test_core_tokens.py: 
"""Unit tests for the token classes exposed by ``pregex.core.tokens``.

Each token gets its own ``TestCase`` holding the same three checks:

* ``test_<token>``          -- ``str(Token())`` yields the expected pattern.
* ``test_<token>_on_type``  -- ``_get_type()`` reports ``_Type.Token``.
* ``test_<token>_on_match`` -- ``get_matches`` finds exactly the one
  occurrence of the token inside a short sample string.
"""

import unittest
from pregex.core.tokens import *
from pregex.core.pre import _Type


class TestBackslash(unittest.TestCase):
    """Checks for the ``Backslash`` token."""

    def test_backslash(self):
        # The expected pattern is two backslash characters (raw literal).
        self.assertEqual(str(Backslash()), r"\\")

    def test_backslash_on_type(self):
        self.assertEqual(Backslash()._get_type(), _Type.Token)

    def test_backslash_on_match(self):
        # Raw input: contains a literal backslash followed by "t", not a tab.
        self.assertEqual(Backslash().get_matches(r"text\ttext"), ["\\"])


class TestBullet(unittest.TestCase):
    """Checks for the ``Bullet`` token (U+2022)."""

    def test_bullet(self):
        self.assertEqual(str(Bullet()), "\u2022")

    def test_bullet_on_type(self):
        self.assertEqual(Bullet()._get_type(), _Type.Token)

    def test_bullet_on_match(self):
        self.assertEqual(Bullet().get_matches("text•text"), ["•"])


class TestCarriageReturn(unittest.TestCase):
    """Checks for the ``CarriageReturn`` token."""

    def test_carriage_return(self):
        self.assertEqual(str(CarriageReturn()), "\r")

    def test_carriage_return_on_type(self):
        self.assertEqual(CarriageReturn()._get_type(), _Type.Token)

    def test_carriage_return_on_match(self):
        self.assertEqual(CarriageReturn().get_matches("text\rtext"), ["\r"])


class TestCopyright(unittest.TestCase):
    """Checks for the ``Copyright`` token (U+00A9)."""

    def test_copyright(self):
        self.assertEqual(str(Copyright()), "\u00A9")

    def test_copyright_on_type(self):
        self.assertEqual(Copyright()._get_type(), _Type.Token)

    def test_copyright_on_match(self):
        self.assertEqual(Copyright().get_matches("text©text"), ["©"])


class TestDivision(unittest.TestCase):
    """Checks for the ``Division`` token (U+00F7)."""

    def test_division(self):
        self.assertEqual(str(Division()), "\u00f7")

    def test_division_on_type(self):
        self.assertEqual(Division()._get_type(), _Type.Token)

    def test_division_on_match(self):
        self.assertEqual(Division().get_matches("text÷text"), ["÷"])


class TestDollar(unittest.TestCase):
    """Checks for the ``Dollar`` token."""

    def test_dollar(self):
        # Expected pattern is a backslash followed by the dollar sign.
        self.assertEqual(str(Dollar()), "\\\u0024")

    def test_dollar_on_type(self):
        self.assertEqual(Dollar()._get_type(), _Type.Token)

    def test_dollar_on_match(self):
        self.assertEqual(Dollar().get_matches("text$text"), ["$"])


class TestEuro(unittest.TestCase):
    """Checks for the ``Euro`` token (U+20AC)."""

    def test_euro(self):
        self.assertEqual(str(Euro()), "\u20ac")

    def test_euro_on_type(self):
        self.assertEqual(Euro()._get_type(), _Type.Token)

    def test_euro_on_match(self):
        self.assertEqual(Euro().get_matches("text€text"), ["€"])


class TestFormFeed(unittest.TestCase):
    """Checks for the ``FormFeed`` token."""

    def test_form_feed(self):
        self.assertEqual(str(FormFeed()), "\f")

    def test_form_feed_on_type(self):
        self.assertEqual(FormFeed()._get_type(), _Type.Token)

    def test_form_feed_on_match(self):
        self.assertEqual(FormFeed().get_matches("text\ftext"), ["\f"])


class TestInfinity(unittest.TestCase):
    """Checks for the ``Infinity`` token (U+221E)."""

    def test_infinity(self):
        self.assertEqual(str(Infinity()), "\u221e")

    def test_infinity_on_type(self):
        self.assertEqual(Infinity()._get_type(), _Type.Token)

    def test_infinity_on_match(self):
        self.assertEqual(Infinity().get_matches("text∞text"), ["∞"])


class TestMultiplication(unittest.TestCase):
    """Checks for the ``Multiplication`` token (U+00D7)."""

    def test_multiplication(self):
        self.assertEqual(str(Multiplication()), "\u00d7")

    def test_multiplication_on_type(self):
        self.assertEqual(Multiplication()._get_type(), _Type.Token)

    def test_multiplication_on_match(self):
        self.assertEqual(Multiplication().get_matches("text×text"), ["×"])


class TestNewline(unittest.TestCase):
    """Checks for the ``Newline`` token."""

    def test_newline(self):
        self.assertEqual(str(Newline()), "\n")

    def test_newline_on_type(self):
        self.assertEqual(Newline()._get_type(), _Type.Token)

    def test_newline_on_match(self):
        self.assertEqual(Newline().get_matches("text\ntext"), ["\n"])


class TestPound(unittest.TestCase):
    """Checks for the ``Pound`` token (U+00A3)."""

    def test_pound(self):
        self.assertEqual(str(Pound()), "\u00a3")

    def test_pound_on_type(self):
        self.assertEqual(Pound()._get_type(), _Type.Token)

    def test_pound_on_match(self):
        self.assertEqual(Pound().get_matches("text£text"), ["£"])


class TestRegistered(unittest.TestCase):
    """Checks for the ``Registered`` token (U+00AE)."""

    def test_registered(self):
        self.assertEqual(str(Registered()), "\u00ae")

    def test_registered_on_type(self):
        self.assertEqual(Registered()._get_type(), _Type.Token)

    def test_registered_on_match(self):
        self.assertEqual(Registered().get_matches("text®text"), ["®"])


class TestRupee(unittest.TestCase):
    """Checks for the ``Rupee`` token (U+20B9)."""

    def test_rupee(self):
        self.assertEqual(str(Rupee()), "\u20b9")

    def test_rupee_on_type(self):
        self.assertEqual(Rupee()._get_type(), _Type.Token)

    def test_rupee_on_match(self):
        self.assertEqual(Rupee().get_matches("text₹text"), ["₹"])


class TestSpace(unittest.TestCase):
    """Checks for the ``Space`` token."""

    def test_space(self):
        self.assertEqual(str(Space()), r" ")

    def test_space_on_type(self):
        self.assertEqual(Space()._get_type(), _Type.Token)

    def test_space_on_match(self):
        self.assertEqual(Space().get_matches(r"text ext"), [" "])


class TestTab(unittest.TestCase):
    """Checks for the ``Tab`` token."""

    def test_tab(self):
        self.assertEqual(str(Tab()), "\t")

    def test_tab_on_type(self):
        self.assertEqual(Tab()._get_type(), _Type.Token)

    def test_tab_on_match(self):
        self.assertEqual(Tab().get_matches("text\ttext"), ["\t"])


class TestTrademark(unittest.TestCase):
    """Checks for the ``Trademark`` token (U+2122)."""

    def test_trademark(self):
        self.assertEqual(str(Trademark()), "\u2122")

    def test_trademark_on_type(self):
        self.assertEqual(Trademark()._get_type(), _Type.Token)

    def test_trademark_on_match(self):
        self.assertEqual(Trademark().get_matches("text™text"), ["™"])


class TestVerticalTab(unittest.TestCase):
    """Checks for the ``VerticalTab`` token."""

    def test_vertical_tab(self):
        self.assertEqual(str(VerticalTab()), "\v")

    def test_vertical_tab_on_type(self):
        self.assertEqual(VerticalTab()._get_type(), _Type.Token)

    def test_vertical_tab_on_match(self):
        self.assertEqual(VerticalTab().get_matches("text\vtext"), ["\v"])


class TestWhiteBullet(unittest.TestCase):
    """Checks for the ``WhiteBullet`` token (U+25E6)."""

    def test_white_bullet(self):
        self.assertEqual(str(WhiteBullet()), "\u25e6")

    def test_white_bullet_on_type(self):
        self.assertEqual(WhiteBullet()._get_type(), _Type.Token)

    def test_white_bullet_on_match(self):
        self.assertEqual(WhiteBullet().get_matches("text◦text"), ["◦"])


class TestYen(unittest.TestCase):
    """Checks for the ``Yen`` token (U+00A5)."""

    def test_yen(self):
        self.assertEqual(str(Yen()), "\u00a5")

    def test_yen_on_type(self):
        self.assertEqual(Yen()._get_type(), _Type.Token)

    def test_yen_on_match(self):
        self.assertEqual(Yen().get_matches("text¥text"), ["¥"])


if __name__ == "__main__":
    unittest.main()