├── setup.cfg ├── regularize ├── exceptions.py ├── __init__.py ├── replace.py ├── flag.py ├── find.py └── expression.py ├── tests ├── test_flag.py ├── test_pattern.py └── test_examples.py ├── setup.py ├── LICENSE ├── .github └── workflows │ ├── python-app.yml │ └── pypi-publish.yml ├── .gitignore └── README.md /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Override the default of 80 characters 3 | max-line-length = 90 4 | -------------------------------------------------------------------------------- /regularize/exceptions.py: -------------------------------------------------------------------------------- 1 | class SampleNotMatchedError(Exception): 2 | pass 3 | 4 | 5 | class InvalidRangeError(Exception): 6 | pass 7 | -------------------------------------------------------------------------------- /regularize/__init__.py: -------------------------------------------------------------------------------- 1 | from regularize.expression import Pattern # noqa: F401 2 | from regularize.find import finder # noqa: F401 3 | from regularize.replace import substitution # noqa: F401 4 | 5 | pattern = Pattern 6 | -------------------------------------------------------------------------------- /tests/test_flag.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unittest 3 | 4 | from regularize.flag import FlagSet 5 | from regularize.expression import Pattern 6 | 7 | 8 | class TestFlagSet(unittest.TestCase): 9 | def test_case_insensitive(self): 10 | pass 11 | 12 | def test_equality_without_pattern(self): 13 | flags = FlagSet() 14 | other_flags = FlagSet() 15 | self.assertEqual(flags, other_flags) 16 | 17 | def test_compile(self): 18 | flags = FlagSet() 19 | flags.case_insensitive() 20 | flags.multiline() 21 | self.assertEqual(flags.compile(), re.I | re.M) 22 | 23 | def test_copy(self): 24 | flags = FlagSet() 25 | flags.case_insensitive() 26 | new_flags = 
flags.copy() 27 | self.assertEqual(flags.compile(), new_flags.compile()) 28 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('README.md') as f: 4 | readme = f.read() 5 | 6 | setup( 7 | name="regularize", 8 | version="0.0.7", 9 | author="George Psarakis", 10 | description="Regular Expression Builder", 11 | long_description=readme, 12 | long_description_content_type="text/markdown", 13 | package_dir={"": "."}, 14 | packages=find_packages(), 15 | zip_safe=False, 16 | install_requires=[], 17 | extras_require={ 18 | "dev": [ 19 | "pytest==6.1.0", 20 | "flake8==3.8.4", 21 | "pytest-cov==2.10.1" 22 | ] 23 | }, 24 | url="https://github.com/georgepsarakis/regularize", 25 | license="MIT", 26 | include_package_data=True, 27 | classifiers=[ 28 | "Intended Audience :: Developers", 29 | "Intended Audience :: System Administrators", 30 | "Operating System :: POSIX :: Linux", 31 | "Development Status :: 2 - Pre-Alpha", 32 | "Programming Language :: Python :: 3", 33 | "License :: OSI Approved :: MIT License", 34 | ], 35 | ) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 George Psarakis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 
class Substitution:
    """Build a replacement template and apply it through a compiled pattern.

    The template is assembled from literal fragments (``add``) and group
    backreferences (``backreference``). Both builder methods return ``self``
    so calls can be chained.
    """

    def __init__(self, pattern: 'Pattern'):
        # Ordered fragments of the replacement template.
        self._stack = []
        # Object exposing ``compile()``; compiled lazily by ``pattern``.
        self._pattern = pattern
        self._compiled_pattern = None

    @property
    def pattern(self):
        """Return the compiled pattern, compiling it on first access."""
        if self._compiled_pattern is None:
            self._compiled_pattern = self._pattern.compile()
        return self._compiled_pattern

    @property
    def stack(self):
        """Return the (mutable) list of template fragments."""
        return self._stack

    def _build(self):
        """Join the fragments into the replacement string passed to ``sub``."""
        return ''.join(self.stack)

    def add(self, string):
        """Append a template fragment as-is and return ``self``."""
        self.stack.append(string)
        return self

    def backreference(self, name_or_number):
        """Append a ``\\g<...>`` reference to a named or numbered group."""
        self.stack.append(f'\\g<{name_or_number}>')
        return self

    @staticmethod
    @lru_cache(maxsize=1_000)
    def _substitute(compiled, replacement, string, count):
        """Run ``compiled.sub`` behind a cache shared by all instances.

        The key is (compiled regex, template, string, count) rather than the
        ``Substitution`` instance, so a template change never returns a stale
        result and instances are not kept alive by the cache.
        """
        return compiled.sub(replacement, string, count=count)

    def replace(self, string, count=0):
        """Return ``string`` with matches replaced by the current template.

        :param string: text to process.
        :param count: maximum number of replacements; ``0`` replaces all.
        """
        return self._substitute(self.pattern, self._build(), string, count)

    # Keep the hooks ``lru_cache`` previously exposed directly on ``replace``.
    replace.cache_clear = _substitute.__func__.cache_clear
    replace.cache_info = _substitute.__func__.cache_info

    __call__ = replace

    @classmethod
    def cache_clear(cls):
        """Empty the shared substitution cache."""
        return cls._substitute.cache_clear()


substitution = Substitution
dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python Application 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 3.8 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.8 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -e .[dev] 25 | - name: Lint with flake8 26 | run: | 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | pytest -v --cov=regularize tests 34 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Package Publisher 5 | 6 | on: 7 | push: 8 | tags: [ '*' ] 9 | 10 | jobs: 11 | publish: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.8 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.8 19 | - name: Install pypa/build 20 | run: python -m pip install build --user 21 | - name: Build a binary wheel and a source tarball 22 | run: python -m build --sdist --wheel --outdir dist/ . 
class FlagSet:
    """Mutable collection of ``re`` flags that compiles to a single bitmask."""

    def __init__(self):
        self._options = set()

    def copy(self):
        """Return a new instance of the same class holding the same flags."""
        new = self.__class__()
        new.options.update(self.options)
        return new

    @property
    def options(self):
        """Return the underlying (mutable) set of enabled flags."""
        return self._options

    def _add_option(self, flag):
        self.options.add(flag)

    def _remove_option(self, flag):
        # ``discard`` rather than ``remove``: disabling a flag that was never
        # enabled must be a no-op, not a ``KeyError``.
        self.options.discard(flag)

    def _update_option(self, flag, enabled):
        """Enable ``flag`` when ``enabled`` is truthy, otherwise disable it."""
        if enabled:
            return self._add_option(flag)
        return self._remove_option(flag)

    def case_insensitive(self, enabled=True):
        """Toggle ``re.IGNORECASE``."""
        return self._update_option(re.IGNORECASE, enabled=enabled)

    def ascii_only(self, enabled=True):
        """Toggle ``re.ASCII``."""
        return self._update_option(re.ASCII, enabled=enabled)

    def multiline(self, enabled=True):
        """Toggle ``re.MULTILINE``."""
        return self._update_option(re.MULTILINE, enabled=enabled)

    def dot_matches_newline(self, enabled=True):
        """Toggle ``re.DOTALL``."""
        return self._update_option(re.DOTALL, enabled=enabled)

    def compile(self):
        """Return the bitwise OR of all enabled flags (``0`` when empty)."""
        # The initial value makes the empty set reduce to 0 on its own.
        return reduce(or_, self._options, 0)

    def __eq__(self, other):
        # Let Python try the reflected comparison for foreign types instead
        # of failing on a missing ``options`` attribute.
        if not isinstance(other, FlagSet):
            return NotImplemented
        return self.options == other.options

    def __str__(self):
        if self._options:
            return repr(self.options)
        return ''

    def __repr__(self):
        return f'{self.__class__.__name__}: {repr(self.options)}'
\ 56 | any_number() 57 | 58 | domain_pattern = \ 59 | ascii_alpha_numeric.close_bracket() + \ 60 | ascii_alpha_numeric.literal('-').quantify(1, 61) 61 | 62 | # At least one alphanumeric character before the dot and after the dash 63 | domain_pattern += ascii_alpha_numeric.close_bracket() 64 | # Add TLD 65 | domain_pattern = domain_pattern.literal('.').\ 66 | lowercase_ascii_letters(closed=False).\ 67 | uppercase_ascii_letters().\ 68 | quantify(minimum=2) 69 | 70 | self.assertEqual(expected, domain_pattern.compile()) 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Pycharm 132 | .idea 133 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from regularize import Pattern, pattern 4 | 5 | 6 | class TestExamples(unittest.TestCase): 7 | def setUp(self): 8 | self.pattern = pattern() 9 | self.apache_webserver_combined_log = ( 10 | '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] ' 11 | '"GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" ' 12 | '"Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:84.0) ' 13 | 'Gecko/20100101 Firefox/84.0"' 14 | ) 15 | 16 | def tearDown(self) -> None: 17 | self.pattern.extensions.clear() 18 | 19 | def test_html_tag_extension(self): 20 | class HTMLTag(Pattern): 21 | def __call__(self, opening=True): 22 | if opening: 23 | new = self.literal('<') 24 | else: 25 | new = self.literal('') 28 | self.pattern.ext.registry['html_tag'] = HTMLTag 29 | self.assertEqual(self.pattern.ext.html_tag().build(), '<[a-z]+>') 30 | 31 | def test_apache_combined_log_parsing(self): 32 | ip = pattern().any_of('.', Pattern.ANY_NUMBER).quantify(minimum=7).group('ip') 33 | identd_client_id = pattern().literal('-') 34 | http_auth_user = pattern().any_of(Pattern.ANY_ASCII_CHARACTER, '_', '.').\ 35 | at_least_one().group('http_auth_user') 36 | time = 
class Cache:
    """Bounded key/value store with least-recently-used eviction and counters.

    Recency is tracked through ``dict`` insertion order: the first key is the
    least recently used one and is the one evicted when the limit is exceeded.
    """

    # Sentinel returned by ``get`` on a miss, so ``None`` can be a cached value.
    NOT_FOUND = object()
    DEFAULT_MAXIMUM_SIZE = 1_000

    def __init__(self, maxsize=DEFAULT_MAXIMUM_SIZE):
        self._cache = dict()
        self._maxsize = maxsize
        self._reset_stats()

    def _increment_metric(self, name):
        self._stats[name] += 1

    def _reset_stats(self):
        self._stats = {
            'maxsize_reached': 0,
            'hits': 0,
            'misses': 0
        }

    @property
    def stats(self):
        """Return a copy of the hit/miss/eviction counters."""
        return self._stats.copy()

    @property
    def cache(self):
        """Return a shallow copy of the stored entries, oldest first."""
        return self._cache.copy()

    @property
    def current_size(self):
        """Return the number of stored entries."""
        return len(self._cache)

    def clear(self):
        """Drop every entry and reset the counters."""
        self._reset_stats()
        self._cache.clear()

    def get(self, key):
        """Look up ``key`` and return ``(entry, found)``.

        On a miss ``entry`` is ``NOT_FOUND`` and ``found`` is ``False``.
        A hit moves the key to the most-recently-used position.
        """
        entry = self._cache.pop(key, self.NOT_FOUND)
        if entry is self.NOT_FOUND:
            self._increment_metric('misses')
            return entry, False

        self._increment_metric('hits')
        # Re-inserting after the pop moves the key to the end of the dict,
        # i.e. marks it as most recently used.
        self._cache[key] = entry
        return entry, True

    def add(self, key, entry):
        """Store ``entry`` under ``key`` as the most recently used item.

        An existing key is overwritten. When the size limit is exceeded the
        least recently used key is evicted.
        """
        # Assigning to an existing dict key keeps its old position, so pop it
        # first to refresh both the value and the recency.
        self._cache.pop(key, None)
        self._cache[key] = entry
        if len(self._cache) > self._maxsize:
            self._increment_metric('maxsize_reached')
            # The first key in insertion order is the least recently used.
            del self._cache[next(iter(self._cache))]


def enable_dict_cache(maxsize):
    """Return a decorator caching ``func(cls, pattern, string)`` results.

    Results are keyed on ``(pattern, string)``. The backing ``Cache`` is
    exposed on the wrapper as ``_cache``.
    """
    cache = Cache(maxsize=maxsize)

    def cached(func):
        @wraps(func)
        def cached_wrapper(cls, pattern, string):
            key = (pattern, string)
            entry, found = cache.get(key)
            if found:
                return entry
            entry = func(cls, pattern, string)
            cache.add(key, entry)
            return entry
        cached_wrapper._cache = cache
        return cached_wrapper
    return cached


class Finder:
    """Match and search strings with a ``Pattern`` or a compiled regex."""

    def __init__(self, pattern: Union['Pattern', re.Pattern]):
        self._pattern = pattern
        self._compiled_pattern = None
        # Already-compiled regexes are used directly, never re-compiled.
        self._is_builtin_pattern = isinstance(pattern, re.Pattern)

    @property
    def pattern(self):
        """Return the pattern object given to the constructor."""
        return self._pattern

    @property
    def compiled_pattern(self):
        """Return the compiled regex, compiling a builder on first access."""
        if self._is_builtin_pattern:
            return self.pattern

        if self._compiled_pattern is None:
            self._compiled_pattern = self.pattern.compile()

        return self._compiled_pattern

    def match(self, string):
        """Return the cached result of matching at the start of ``string``."""
        return self.__class__._match(self.compiled_pattern, string)

    def find(self, string, iterator=True):
        """Return all matches: ``finditer`` by default, else ``findall``."""
        if iterator:
            return self.compiled_pattern.finditer(string)
        return self.compiled_pattern.findall(string)

    @classmethod
    @enable_dict_cache(maxsize=1_000)
    def _match(cls, regex, string):
        return regex.match(string)

    @classmethod
    def cache_clear(cls):
        """Empty the shared match cache and reset its counters."""
        return cls._match._cache.clear()


finder = Finder
version](https://badge.fury.io/py/regularize.svg)](https://badge.fury.io/py/regularize) [![Downloads](https://pepy.tech/badge/regularize)](https://pepy.tech/project/regularize) 4 | 5 | ## Motivation 6 | 7 | Writing complex regular expressions can prove to be difficult and error-prone. This library aims to provide a simple interface for constructing patterns, finding matches and performing substitutions. 8 | 9 | ### Key Features 10 | 11 | - **Pattern Builder:** a clean and robust API to build complex regular expressions. 12 | - **Flag Interface:** easily add and remove flags using a friendly interface. 13 | - **Immutable Pattern Objects:** in order to increase composability and reusability, `Pattern` instances do not modify internal state, but instead return copies with the modifications. 14 | - **Find/Replace with LRU cache:** using a shared cache, different pattern instances that compile to the same regular expression can benefit from the same cache entries. 15 | 16 | ## Examples 17 | 18 | ### Match compressed / uncompressed log filenames 19 | 20 | ```python 21 | from regularize import pattern, finder 22 | 23 | # Start a new pattern 24 | logfile_pattern = pattern() 25 | 26 | # Create a base pattern for the logfile names 27 | logfile_pattern = logfile_pattern.\ 28 | literal('application.').\ 29 | any_number().\ 30 | quantify(minimum=1).case_insensitive() 31 | 32 | uncompressed_logfile = logfile_pattern.literal('.log').end_anchor() 33 | compressed_logfile = logfile_pattern.literal('.log.gz').end_anchor() 34 | 35 | print(uncompressed_logfile) 36 | # Expression: /application\.[0-9]+\.log$/ 37 | 38 | print(compressed_logfile) 39 | # Expression: /application\.[0-9]+\.log\.gz$/ 40 | 41 | print(finder(uncompressed_logfile).match('application.1.log')) 42 | # 43 | print(finder(compressed_logfile).match('application.1.log.gz')) 44 | # 45 | ``` 46 | 47 | ### Match and extract URL components 48 | 49 | ```python 50 | from urllib.parse import urlparse 51 | 52 | from regularize 
import pattern 53 | 54 | # Valid characters for DNS names 55 | ascii_alphanumeric = pattern().lowercase_ascii_letters(). \ 56 | uppercase_ascii_letters().any_number() 57 | 58 | domain_pattern = \ 59 | ascii_alphanumeric.close_bracket() + \ 60 | ascii_alphanumeric.literal('-').quantify(1, 61) 61 | 62 | # At least one alphanumeric character before the dot and after the dash 63 | domain_pattern += ascii_alphanumeric.close_bracket() 64 | 65 | # Add TLD 66 | tld_pattern = pattern().lowercase_ascii_letters(closed=False). \ 67 | uppercase_ascii_letters(). \ 68 | quantify(minimum=2) 69 | 70 | # Add optional subdomain group 71 | subdomain_pattern = domain_pattern.\ 72 | group(name='subdomain', optional=True).\ 73 | literal('.').\ 74 | group(optional=True) 75 | 76 | # Full domain pattern 77 | domain_pattern = subdomain_pattern + domain_pattern.literal('.') + tld_pattern 78 | 79 | # Match HTTP or HTTPS scheme 80 | scheme_pattern = pattern().literal('http').any_of('s').\ 81 | quantify(minimum=0, maximum=1).\ 82 | group('scheme').\ 83 | literal('://') 84 | 85 | # Match the URL path (if any exists) 86 | path_pattern = pattern().literal('/').any_number().\ 87 | lowercase_ascii_letters().literal('%-_').\ 88 | quantify(minimum=1).match_all() 89 | 90 | # Compose the complete pattern 91 | url_pattern = (scheme_pattern + domain_pattern.group('domain') + 92 | path_pattern.group(name='path', optional=True)).case_insensitive() 93 | 94 | url = 'https://www.example.com/p/1' 95 | 96 | compiled_url_pattern = url_pattern.compile() 97 | url_regex_matches = compiled_url_pattern.match(url).groupdict() 98 | 99 | parsed_url = urlparse(url) 100 | 101 | print(url_regex_matches) 102 | # {'scheme': 'https', 'domain': 'www.example.com', 'subdomain': 'www', 'path': '/p/1'} 103 | print(parsed_url) 104 | # ParseResult(scheme='https', netloc='www.example.com', path='/p/1', params='', query='', fragment='') 105 | assert parsed_url.scheme == url_regex_matches['scheme'] 106 | assert parsed_url.hostname == 
url_regex_matches['domain'] 107 | assert parsed_url.path == url_regex_matches['path'] 108 | assert url_regex_matches['subdomain'] == 'www' 109 | ``` 110 | 111 | ### Parse HTTP Logs 112 | 113 | The following example is taken from the common format sample of the [Apache web server combined log](https://httpd.apache.org/docs/current/logs.html#combined). 114 | 115 | ```python 116 | from regularize.expression import Pattern, pattern 117 | 118 | apache_webserver_combined_log = ( 119 | '127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] ' 120 | '"GET /apache_pb.gif HTTP/1.0" 200 2326 "http://www.example.com/start.html" ' 121 | '"Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:84.0) Gecko/20100101 Firefox/84.0"' 122 | ) 123 | 124 | ip = pattern().any_of('.', Pattern.ANY_NUMBER).quantify(minimum=7).group('ip') 125 | identd_client_id = pattern().literal('-') 126 | http_auth_user = pattern().any_of(Pattern.ANY_ASCII_CHARACTER, '_', '.').\ 127 | at_least_one().group('http_auth_user') 128 | time = pattern().literal('[').none_of(']').quantify(minimum=26).literal(']') 129 | http_verb = pattern().literal('"').group('http_verb', 130 | pattern=pattern().uppercase_ascii_letters().at_least_one()) 131 | url = pattern().group(name='url', 132 | pattern=pattern().none_of(Pattern.ANY_WHITESPACE).at_least_one()) 133 | http_version = pattern().literal('HTTP/').any_of('1', '2').literal('.').\ 134 | any_of('0', '1').group('http_version').literal('"') 135 | http_status_code = pattern().group(name='http_status_code', 136 | pattern=pattern().any_of(Pattern.ANY_NUMBER).exactly(3)) 137 | response_bytes = pattern().group(name='response_bytes_without_headers', 138 | pattern=pattern().any_of(Pattern.ANY_NUMBER).at_least_one()) 139 | # Note the repetition here. 
For multiple groups using the same expression, 140 | # we can create a lambda, e.g: 141 | # lambda name: pattern().literal('"').group(name=name, pattern=pattern().none_of('"').at_least_one()).literal('"') 142 | referer = pattern().literal('"').\ 143 | group(name='referer', pattern=pattern().none_of('"').at_least_one()).literal('"') 144 | user_agent = pattern().literal('"').\ 145 | group(name='user_agent', pattern=pattern().none_of('"').at_least_one()) 146 | 147 | p = Pattern.join( 148 | pattern().whitespace(), 149 | [ip, identd_client_id, http_auth_user, time, 150 | http_verb, url, http_version, http_status_code, 151 | response_bytes, referer, user_agent] 152 | ) 153 | assert {'ip': '127.0.0.1', 'http_auth_user': 'frank', 'http_verb': 'GET', 'url': '/apache_pb.gif', 154 | 'http_version': 'HTTP/1.0', 'http_status_code': '200', 'response_bytes_without_headers': '2326', 155 | 'user_agent': 'http://www.example.com/start.html'} == \ 156 | p.compile().match(apache_webserver_combined_log).groupdict() 157 | ``` 158 | 159 | ### Strip HTML tags 160 | 161 | ```python 162 | from regularize import pattern 163 | from regularize.replace import substitution 164 | 165 | html = '''

<h1>Article Title</h1>
166 | <p>This is a blog post</p>

''' 167 | p = pattern().literal('<').any_of('/').quantify(minimum=0).ascii_letters().any_number().at_least_one().literal('>') 168 | s = substitution(p) 169 | text = s.replace(html) 170 | print(text) 171 | ''' 172 | Article Title 173 | This is a blog post 174 | ''' 175 | ``` 176 | 177 | ## API 178 | 179 | ### Pattern Builder 180 | 181 | ### Finder 182 | 183 | ### Substitution (Replace) 184 | 185 | ## Extending 186 | 187 | ### Writing Extensions 188 | 189 | Commonly used patterns can be easily added either by creating a sub-class of the `Pattern` class, 190 | or by using the extension registry. 191 | 192 | #### Using a Pattern sub-class 193 | 194 | There are two prerequisites for new pattern builder methods: 195 | - The return value should be a `Pattern` instance. 196 | - Internal state is not modified, but instead all changes are applied to an instance clone. 197 | 198 | ```python 199 | from regularize.expression import Pattern 200 | 201 | class MyPattern(Pattern): 202 | def html_tag(self, opening=True): 203 | if opening: 204 | new = self.literal('<') 205 | else: 206 | new = self.literal('') 208 | ``` 209 | 210 | #### Registering an extension 211 | 212 | ```python 213 | from regularize.expression import Pattern 214 | 215 | class HTMLTag(Pattern): 216 | def __call__(self, opening=True): 217 | if opening: 218 | new = self.literal('<') 219 | else: 220 | new = self.literal('') 223 | 224 | 225 | p = Pattern() 226 | # The registry is attached to the Pattern class: 227 | Pattern.registry.add('html_tag', HTMLTag) 228 | # But is also accessible through the instance for convenience: 229 | p.extensions.registry.add('html_tag', HTMLTag) 230 | # We can now call the pattern wrapper by its given alias, through the `ext` object: 231 | p = p.ext.html_tag() 232 | 233 | print(p.build()) 234 | # <[a-z]+> 235 | ``` 236 | -------------------------------------------------------------------------------- /regularize/expression.py: 
class Metacharacter:
    """Base class for tokens rendered through ``str()`` when building."""

    def __copy__(self):
        return self.__class__()

    def __repr__(self):
        return f'\'{self.__class__.__name__} -> {str(self)}\''


class OpeningBracket(Metacharacter):
    """Token that opens a character set."""

    def __str__(self):
        return '['


class ClosingBracket(Metacharacter):
    """Token that closes a character set."""

    def __str__(self):
        return ']'


class Or(Metacharacter):
    """Alternation token."""

    def __str__(self):
        return '|'

    def combine(self, *expressions):
        """Join expression strings with ``|``.

        Accepts either several strings (``combine('a', 'b')``) or a single
        iterable of strings (``combine(['a', 'b'])``).
        """
        # A lone non-string argument is the sequence itself; joining the
        # one-element args tuple would raise ``TypeError`` on the list.
        if len(expressions) == 1 and not isinstance(expressions[0], str):
            expressions = tuple(expressions[0])
        return str(self).join(expressions)


class Expression:
    """Ordered stack of regex tokens; modifications return updated clones."""

    def __init__(self, parent: 'Expression' = None):
        self._token_stack = deque()
        # Opening brackets that have not been closed yet.
        self._bracket_stack = []
        if parent is not None:
            self._copy_state(parent)

    def _copy_state(self, other, clear=True):
        """Copy ``other``'s stacks into this one, replacing or extending."""
        if clear:
            self.bracket_stack.clear()
            self.token_stack.clear()
        self.bracket_stack.extend(other.bracket_stack)
        self.token_stack.extend(other.token_stack)

    @property
    def token_stack(self) -> deque:
        return self._token_stack

    @property
    def bracket_stack(self) -> list:
        return self._bracket_stack

    def has_open_bracket(self):
        """Return ``True`` when the most recent bracket is still open."""
        if not self.bracket_stack:
            return False
        return isinstance(self.bracket_stack[-1], OpeningBracket)

    def close_bracket(self):
        """Return a clone with the open bracket closed, or ``self`` if none."""
        if not self.has_open_bracket():
            return self
        # ``has_open_bracket`` already guarantees the top of the bracket stack
        # is an ``OpeningBracket``, so no further validation is needed.
        return self.clone_with_updates(append=ClosingBracket())

    def _prepare_for_build(self):
        return self.close_bracket()

    def build(self):
        """Render the expression as a string, closing any open bracket."""
        return ''.join(map(str, self._prepare_for_build().token_stack))

    def __repr__(self):
        return f"{self.__class__.__name__}<{hex(id(self))}>[{self.token_stack}]"

    def __str__(self):
        return f'Expression: /{self.build()}/'

    def __add__(self, other):
        """Return a new expression: this one's tokens followed by ``other``'s."""
        new = self.__class__(parent=self)
        new._copy_state(other, clear=False)
        return new

    def clone(self) -> 'Expression':
        """Return a copy of this expression (see ``_on_after_clone``)."""
        new = self.__class__(parent=self)
        self._on_after_clone(new)
        return new

    def _on_after_clone(self, new):
        """Hook for subclasses to copy extra state onto ``new``."""
        pass

    def clone_with_updates(self, append=None, prepend=None) -> 'Expression':
        """Return a clone with tokens added at either end.

        :param append: token, or list/tuple of tokens, added at the end.
        :param prepend: token, or list/tuple of tokens, added at the start.
        """
        if append is not None and not isinstance(append, (list, tuple)):
            append = (append,)

        if prepend is not None and not isinstance(prepend, (list, tuple)):
            prepend = (prepend,)

        clone = self.clone()
        # ``extendleft`` inserts one by one, so reverse to keep caller order.
        clone.token_stack.extendleft(reversed(prepend or []))
        clone.token_stack.extend(append or [])

        # Only appended tokens update the bracket bookkeeping.
        for item in append or ():
            if isinstance(item, ClosingBracket):
                if clone.has_open_bracket():
                    clone.bracket_stack.pop()
            elif isinstance(item, OpeningBracket):
                clone.bracket_stack.append(item)

        return clone
None: 144 | self._flags = FlagSet() 145 | return self._flags 146 | 147 | def _on_after_clone(self, new): 148 | new._flags = self.flags.copy() 149 | new._extensions = self.extensions.clone() 150 | 151 | def group(self, name=None, optional=False, wrapped=None): 152 | if wrapped is None: 153 | wrapped_pattern = self 154 | else: 155 | wrapped_pattern = wrapped 156 | 157 | new_group = Group(wrapped_pattern)(name=name, optional=optional) 158 | 159 | if wrapped is None: 160 | return new_group 161 | else: 162 | return self + new_group 163 | 164 | def match_any(self, *subexpressions, **kwargs): 165 | expression_list = [ 166 | subexpression.build() 167 | for subexpression in map(self._ensure_pattern, subexpressions) 168 | ] 169 | new = self.__class__().raw(Or().combine(expression_list)).group(**kwargs) 170 | return self.clone_with_updates(new.build()) 171 | 172 | @staticmethod 173 | def _ensure_pattern(obj): 174 | if isinstance(obj, str): 175 | return Literal()(obj) 176 | elif isinstance(obj, Pattern): 177 | return obj 178 | else: 179 | raise TypeError(f'Cannot handle type {obj.__class__.__name__} automatically') 180 | 181 | def __or__(self, other): 182 | return (self.clone_with_updates(append=Or()) + other).group() 183 | 184 | def whitespace(self, match=True) -> 'Pattern': 185 | return Whitespace(self)(match) 186 | 187 | def lowercase_ascii_letters(self, **kwargs): 188 | return AsciiLetterCharacter(self)(lowercase=True, **kwargs) 189 | 190 | # Alias due to high frequency use (along with case-insensitive flag) 191 | ascii_letters = lowercase_ascii_letters 192 | 193 | def uppercase_ascii_letters(self, **kwargs) -> 'Pattern': 194 | return AsciiLetterCharacter(self)(lowercase=False, **kwargs) 195 | 196 | def any_number_between(self, **kwargs): 197 | return Number(self)(**kwargs) 198 | 199 | any_number = any_number_between 200 | 201 | def quantify(self, minimum=0, maximum=math.inf): 202 | addition = None 203 | if minimum == 0 and math.isinf(maximum): 204 | addition = '*' 205 | 
elif minimum == 0 and maximum == 1: 206 | addition = '?' 207 | elif minimum == 1 and math.isinf(maximum): 208 | addition = '+' 209 | elif minimum == maximum: 210 | addition = f'{{{minimum}}}' 211 | elif minimum > 1 and math.isinf(maximum): 212 | addition = f'{{{minimum},}}' 213 | elif not math.isinf(maximum): 214 | addition = f'{{{minimum},{maximum}}}' 215 | return self.close_bracket().clone_with_updates(append=addition) 216 | 217 | at_least_one = partialmethod(quantify, minimum=1, maximum=math.inf) 218 | 219 | def exactly(self, times): 220 | return self.quantify(minimum=times, maximum=times) 221 | 222 | def wildcard(self, one_or_more=False): 223 | if one_or_more: 224 | add = '.+' 225 | else: 226 | add = '.' 227 | return self.clone_with_updates(add) 228 | 229 | match_all = partialmethod(wildcard, one_or_more=True) 230 | 231 | def literal(self, string): 232 | return Literal(self)(string) 233 | 234 | def any_of(self, *members, close=True): 235 | clone = self.clone_with_updates(append=OpeningBracket()) 236 | if members: 237 | expression = ''.join( 238 | map( 239 | str, 240 | map(BracketExpressionPartial.ensure, members) 241 | ) 242 | ) 243 | clone = clone.clone_with_updates(expression) 244 | if close: 245 | clone = clone.close_bracket() 246 | return clone 247 | 248 | def none_of(self, *members): 249 | clone = self.clone_with_updates(append=OpeningBracket()) 250 | if members: 251 | expression = ''.join( 252 | map( 253 | str, 254 | map(BracketExpressionPartial.ensure, members) 255 | ) 256 | ) 257 | clone = clone.clone_with_updates( 258 | f"^{expression}" 259 | ) 260 | return clone 261 | 262 | def raw(self, string): 263 | return self.clone_with_updates(string) 264 | 265 | def start_anchor(self): 266 | return self.clone_with_updates(append='^') 267 | 268 | def end_anchor(self): 269 | return self.clone_with_updates(append='$') 270 | 271 | def compile(self): 272 | try: 273 | return re.compile(self.build(), self.flags.compile()) 274 | except re.error as e: 275 | 
print(f'Unable to build regular expression: {self}') 276 | raise e 277 | 278 | def test(self, sample): 279 | regex = self.compile() 280 | match = regex.match(sample) 281 | if not match: 282 | raise SampleNotMatchedError(f'{regex} tested with "{sample}"') 283 | return match 284 | 285 | def case_insensitive(self, enabled=True) -> 'Pattern': 286 | clone = self.clone() 287 | clone.flags.case_insensitive(enabled=enabled) 288 | return clone 289 | 290 | def multiline(self, enabled=True): 291 | clone = self.clone() 292 | clone.flags.multiline(enabled=enabled) 293 | return clone 294 | 295 | def dot_matches_newline(self, enabled=True): 296 | clone = self.clone() 297 | clone.flags.dot_matches_newline(enabled=enabled) 298 | return clone 299 | 300 | def ascii_only(self, enabled=True): 301 | clone = self.clone() 302 | clone.flags.ascii_only(enabled=enabled) 303 | return clone 304 | 305 | def __str__(self): 306 | initial = super(Pattern, self).__str__() 307 | return f'{initial}{self.flags}' 308 | 309 | @property 310 | def ext(self) -> 'ExtensionRegistry': 311 | if self._extensions is None: 312 | self._extensions = ExtensionRegistry(self) 313 | return self._extensions 314 | 315 | extensions = ext 316 | 317 | @classmethod 318 | def join(cls, delimiter, subpatterns): 319 | composite_pattern = cls() 320 | for subpattern in subpatterns[:-2]: 321 | composite_pattern = composite_pattern + subpattern + delimiter 322 | composite_pattern = composite_pattern + subpatterns[-1] 323 | return composite_pattern 324 | 325 | 326 | class Group(Pattern): 327 | def __call__(self, name=None, optional=False) -> 'Pattern': 328 | add_right = [')'] 329 | if optional: 330 | add_right.append('?') 331 | add_left = ['('] 332 | if name is not None: 333 | add_left.append(f'?P<{name}>') 334 | return self.close_bracket().clone_with_updates( 335 | prepend=add_left, 336 | append=add_right 337 | ) 338 | 339 | class Literal(Pattern): 340 | def __call__(self, string): 341 | return 
self.clone_with_updates(re.escape(string)) 342 | 343 | 344 | class Whitespace(Pattern): 345 | def __call__(self, match): 346 | return self.clone_with_updates('\\s' if match else '\\S') 347 | 348 | 349 | class Range(Pattern): 350 | def __call__(self, start, end, closed=False, negated=False, skip_brackets=False): 351 | if negated: 352 | start = f'^{start}' 353 | 354 | additions = [] 355 | if not self.has_open_bracket() and not skip_brackets: 356 | additions.append(OpeningBracket()) 357 | additions.append(f'{start}-{end}') 358 | if closed and not skip_brackets: 359 | additions.append(ClosingBracket()) 360 | return self.clone_with_updates(append=additions) 361 | 362 | 363 | class AsciiLetterCharacter(Range): 364 | def __call__(self, lowercase=True, **kwargs): 365 | start = 'a' if lowercase else 'A' 366 | end = 'z' if lowercase else 'Z' 367 | return super(AsciiLetterCharacter, self).__call__(start, end, **kwargs) 368 | 369 | 370 | class Number(Range): 371 | def __call__(self, minimum=0, maximum=9, **kwargs) -> Pattern: 372 | if minimum >= maximum or minimum < 0 or maximum > 9: 373 | raise InvalidRangeError( 374 | f'Cannot build range between {minimum} and {maximum}' 375 | ) 376 | return super(Number, self).__call__(minimum, maximum, **kwargs) 377 | 378 | 379 | class BracketExpressionPartial: 380 | def __init__(self, expression: str): 381 | self._expression = expression 382 | 383 | def __str__(self): 384 | return self._expression 385 | 386 | def __repr__(self): 387 | return f'{self.__class__.__name__}: {repr(self._expression)}' 388 | 389 | @property 390 | def expression(self): 391 | return self._expression 392 | 393 | @classmethod 394 | def ensure(cls, obj): 395 | if not isinstance(obj, cls): 396 | return cls(Literal()(obj).build()) 397 | else: 398 | return obj 399 | 400 | 401 | class ExtensionRegistry(MutableMapping): 402 | def __init__(self, pattern: Pattern): 403 | self._registry = {} 404 | self._pattern = pattern 405 | self._callbacks_initialized = False 406 | 
self._callbacks = {} 407 | 408 | def __setitem__(self, key, value): 409 | self._registry[key] = value 410 | 411 | def __delitem__(self, key): 412 | del self._registry[key] 413 | 414 | def __len__(self): 415 | return len(self._registry) 416 | 417 | def __getitem__(self, item): 418 | return self._registry[item] 419 | 420 | def __iter__(self): 421 | return iter(self._registry) 422 | 423 | def __repr__(self): 424 | return repr(self._registry) 425 | 426 | @property 427 | def registry(self): 428 | return self._registry 429 | 430 | def clone(self): 431 | new = self.__class__(self._pattern) 432 | new._callbacks_initialized = False 433 | return new 434 | 435 | def _initialize_callbacks(self): 436 | if self._callbacks_initialized: 437 | return 438 | 439 | for name, klass in self.registry.items(): 440 | self._callbacks[name] = klass(self._pattern) 441 | self._callbacks_initialized = True 442 | 443 | def _ensure_clone(self, fn): 444 | @wraps(fn) 445 | def wrapper(*args, **kwargs): 446 | result = fn(*args, **kwargs) 447 | if not isinstance(result, Pattern): 448 | raise ValueError(type(result)) 449 | if result is self._pattern: 450 | raise ValueError('pattern instance clone required') 451 | return result 452 | return wrapper 453 | 454 | def __getattr__(self, item): 455 | if item in self.registry: 456 | self._initialize_callbacks() 457 | return self._ensure_clone(self._callbacks[item]) 458 | else: 459 | raise AttributeError(item) 460 | 461 | 462 | Pattern.ANY_NUMBER = BracketExpressionPartial(Number()(skip_brackets=True).build()) 463 | Pattern.ANY_ASCII_CHARACTER = BracketExpressionPartial( 464 | AsciiLetterCharacter()(skip_brackets=True).build() 465 | ) 466 | Pattern.NO_WHITESPACE = BracketExpressionPartial(Whitespace()(match=False).build()) 467 | Pattern.ANY_WHITESPACE = BracketExpressionPartial(Whitespace()(match=True).build()) 468 | --------------------------------------------------------------------------------