├── .travis.yml ├── LICENSE ├── README.rst ├── setup.py ├── test_triegex.py └── triegex └── __init__.py /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: false 3 | 4 | python: 5 | - 3.5 6 | - 3.6 7 | - 3.7-dev 8 | - nightly 9 | 10 | install: 11 | - python setup.py install 12 | - pip install pytest 13 | 14 | script: py.test --doctest-module --ignore=setup.py --doctest-glob='*.rst' 15 | 16 | deploy: 17 | provider: pypi 18 | user: dijkstra 19 | password: 20 | secure: cMY94zBzgfI04B5j/XKhxg7qRGCBUprZmQSiK1G90z+hodf8Ye4XbJY8ekdm+eBnvI5RPDy0GDmzCjzCa3mPOlcsYUmpPSeGSrtlH/FKoMCxZKry0MqwkzFUbgein3yB5AckAmigKt7qsPWE5lgwWlHxXh8Ng8gmmmU6SqSLghS3gNW4ow6hFkTKGzhj5OHwiQiC3WUbxjGWBLdaalipaC7izGJZOA+zAQ18xWA/KyMkki2QWdOdmVwiBdwJCxbn8JCtPnIlcUdtWGYxx9ZIQCsjWhfR/hzHTLdthEjGJketZ+cYI5KXFZEV5QTlXPJWt3mxNKaMfYjgOTdosel0D09NC2b32vTq8bG7eoYDEi4ZS4CAq4y4THBeLlkuQ659heOCq0X9eEN5Po9252GBshIdu+VmBe8zhv2TT2ARrrjKt9N4KCE3VYNzM46WODMo59qS0glHgGA4Tozab75lvgOSmth02m3sEPlJK10sTDlqFOxUrHwTGIyBcLAWuPpgbg03ExR0ktO4KCz4zXQNcFP4yQSD7Az6mys6bkxtztxwCyI6NmbBDf/8pUHOYAoI0Ruko9+4o2XkPXZLjz1UqIUeDy9NcwmVh+StJ52goOyESJ1Hpdw32ZTamUCPCn6Y1JA5zpXW7+NK8uq8OeE5aZ4CvjPEYjo64zvT1T/i074= 21 | on: 22 | tags: true 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Alexander Zhukov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this 
permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ******* 2 | triegex 3 | ******* 4 | .. image:: https://travis-ci.org/ZhukovAlexander/triegex.svg?branch=master 5 | :target: https://travis-ci.org/ZhukovAlexander/triegex 6 | About 7 | ###### 8 | 9 | 10 | **triegex** is a library that builds compact trie-structured regular expressions from a list of words. 11 | 12 | Installation 13 | ############ 14 | 15 | .. code-block:: bash 16 | 17 | pip install triegex 18 | 19 | Alternatively, you can install the latest release directly from git: 20 | 21 | .. code-block:: bash 22 | 23 | pip install git+https://github.com/ZhukovAlexander/triegex.git@0.0.4 24 | 25 | 26 | Example usage 27 | ############# 28 | 29 | .. code-block:: python 30 | 31 | >>> import triegex 32 | >>> 33 | >>> t = triegex.Triegex('foo', 'bar', 'baz') 34 | >>> 35 | >>> t.to_regex() # build regular expression 36 | '(?:ba(?:r\\b|z\\b)|foo\\b|~^(?#match nothing))' 37 | >>> 38 | >>> t.add('spam') 39 | >>> 40 | >>> 'spam' in t # check whether the word is in the set 41 | True 42 | >>> 43 | >>> import re 44 | >>> re.findall(t.to_regex(), 'spam & eggs') # ['spam'] 45 | ['spam'] 46 | 47 | Why? 
48 | #### 49 | The library was inspired by a need to match a list of valid IANA top-level domain names (`which is pretty big `_). 50 | 51 | Also it's fun 52 | 53 | **triegex** was influenced by these projects: `frak `_, `regex-trie `_ and `Regexp-Trie `_ 54 | 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import codecs 3 | 4 | with codecs.open('README.rst', 'r', 'utf-8') as f: 5 | readme = f.read() 6 | 7 | setup( 8 | name='triegex', 9 | url='https://github.com/ZhukovAlexander/triegex', 10 | author='Alexander Zhukov', 11 | author_email='zhukovaa90@gmail.com', 12 | description='Trie-ized regular expressions in python', 13 | long_description=readme, 14 | keywords='python regular expressions trie', 15 | use_scm_version=True, 16 | setup_requires=['setuptools_scm'], 17 | packages=['triegex'], 18 | zip_safe=True, 19 | classifiers=[ 20 | 'Development Status :: 3 - Alpha', 21 | 'Intended Audience :: Developers', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Operating System :: OS Independent', 24 | 'Programming Language :: Python :: 3', 25 | 'Topic :: Software Development :: Libraries :: Python Modules', 26 | ], 27 | ) 28 | -------------------------------------------------------------------------------- /test_triegex.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import re 3 | 4 | import triegex 5 | 6 | 7 | class TriegexTest(TestCase): 8 | 9 | def findall(self, triegex, string): 10 | return re.findall(triegex.to_regex(), string) 11 | 12 | def test_basic(self): 13 | t = triegex.Triegex('Jon') 14 | self.assertListEqual(self.findall(t, 'Jon Snow'), ['Jon']) 15 | 16 | def test_empty_triegex_matches_nothing(self): 17 | t = triegex.Triegex() 18 | self.assertListEqual(self.findall(t, 'foo'), [], 'Should match nothing: 
{}'.format(t.to_regex())) 19 | 20 | def test_multiple_words(self): 21 | t = triegex.Triegex('Jon', 'Tyrion', 'Sam', 'Bran') 22 | self.assertListEqual(self.findall(t, 'Jon & Sam'), ['Jon', 'Sam']) 23 | 24 | def test_word_boundary_is_handled(self): 25 | t = triegex.Triegex('Sam') 26 | self.assertListEqual([], self.findall(t, 'Samwell')) 27 | self.assertListEqual(['Sam'], self.findall(t, 'Sam` Tarly')) 28 | 29 | def test_optimized(self): 30 | t = triegex.Triegex('Jon', 'Jorah') 31 | self.assertEqual(r'(?:Jo(?:n\b|rah\b)|~^(?#match nothing))', t.to_regex()) 32 | 33 | 34 | class TriegexMutableSetInterfaceTest(TestCase): 35 | def test_iter(self): 36 | self.assertListEqual(list(triegex.Triegex('foo')), ['foo']) 37 | 38 | def test_contains(self): 39 | self.assertIn('Jaime', triegex.Triegex('Jaime', 'Lannister')) 40 | self.assertNotIn('Stannis', triegex.Triegex('Kings Landing')) 41 | 42 | def test_len(self): 43 | t = triegex.Triegex() 44 | self.assertEqual(len(t), 0) 45 | t.add('Sansa') 46 | self.assertEqual(len(t), 1) 47 | 48 | def test_discart(self): 49 | t = triegex.Triegex() 50 | t.add('Hound') 51 | self.assertIn('Hound', t) 52 | t.discard('Hound') 53 | self.assertNotIn('Hound', t) 54 | -------------------------------------------------------------------------------- /triegex/__init__.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | __all__ = ('Triegex',) 4 | 5 | OR = r'|' 6 | 7 | # regex below matches nothing https://stackoverflow.com/a/940840/2183102. 
# We use '~' to ensure it comes last when lexicographically sorted:
# max(string.printable) is '~'
NOTHING = r'~^(?#match nothing)'
GROUP = r'(?:{0})'
WORD_BOUNDARY = r'\b'

# Imported here so that this section is self-contained.  ``collections.abc``
# is the supported home of the container ABCs; the old
# ``collections.MutableSet`` alias was removed in Python 3.10.
import re
from collections.abc import MutableSet


class TriegexNode:
    """A single trie node holding one character.

    Attributes:
        char: the character stored in this node ('' when ``None`` is passed,
            which is how the root node is created).
        end: ``True`` when a word ends at this node.
        children: mapping of child character -> child ``TriegexNode``.
    """

    def __init__(self, char: str, end: bool, *children):
        self.char = char if char is not None else ''
        self.end = end
        self.children = {child.char: child for child in children}

    def __iter__(self):
        """Iterate over the child nodes, ordered by their character."""
        return iter(sorted(self.children.values(), key=lambda x: x.char))

    def __len__(self):
        """Return the number of direct children.

        NOTE: because ``__len__`` is defined, a node without children is
        falsy; compare against ``None`` explicitly instead of relying on
        truthiness.
        """
        return len(self.children)

    def __repr__(self):
        return "<TriegexNode: '{0.char}' end={0.end}>".format(self)

    def __contains__(self, key):
        """Return ``True`` if there is a direct child for character *key*."""
        return key in self.children

    def __getitem__(self, key):
        """Return the direct child for character *key* (``KeyError`` if absent)."""
        return self.children[key]

    def __delitem__(self, key):
        """Remove the direct child for character *key* (``KeyError`` if absent)."""
        del self.children[key]

    def to_regex(self):
        """Build the regular expression for the subtree rooted at this node.

        The regex of a node is its (escaped) character followed by the
        alternation of its children's regexes, plus ``WORD_BOUNDARY`` when a
        word ends at the node.  The node the method is called on additionally
        gets the ``NOTHING`` alternative, so an empty trie yields a pattern
        that matches nothing.

        The computation is iterative (no recursion): nodes are first listed
        in breadth-first order, then regexes are assembled from the last
        node back to the first so that children are always ready before
        their parent.

        Returns:
            The regular expression as a string.
        """
        # Breadth-first listing: order[0] is this node, and the children of
        # order[i] occupy the contiguous slice starting at first_child[i].
        order = [self]
        first_child = []
        index = 0
        while index < len(order):
            first_child.append(len(order))
            order.extend(order[index])  # children, sorted by character
            index += 1

        regexes = [None] * len(order)
        for index in range(len(order) - 1, -1, -1):
            node = order[index]
            start = first_child[index]
            suffixes = regexes[start:start + len(node)]
            if node.end:
                # a word ends here, so the match may stop at a word boundary
                suffixes.append(WORD_BOUNDARY)
            if index == 0:
                # top node: always offer the never-matching alternative
                suffixes.append(NOTHING)
            tail = '|'.join(suffixes)
            if len(suffixes) > 1:
                tail = GROUP.format(tail)
            # Escape the character so that words containing regex
            # metacharacters ('.', '+', '(' ...) are matched literally.
            regexes[index] = re.escape(node.char) + tail
        return regexes[0]


class Triegex(MutableSet):
    """A mutable set of words that can be rendered as one trie-shaped regex."""

    def __init__(self, *words):
        """
        Trigex constructor.

        Args:
            *words: the initial words to store.
        """
        self._root = TriegexNode(None, False)

        for word in words:
            self.add(word)

    @classmethod
    def _from_iterable(cls, iterable):
        """Build a new instance from an iterable of words.

        The ``Set`` mixin operators (``|``, ``&``, ``-``, ``^``) construct
        their result through this hook; it is overridden because the
        constructor takes words as positional arguments rather than a single
        iterable.
        """
        return cls(*iterable)

    def add(self, word: str):
        """Add *word* to the set.

        An empty word is ignored: it has no characters to store in the trie.
        """
        if not word:
            return
        node = self._root
        for letter in word:
            child = node.children.get(letter)
            if child is None:
                child = node.children[letter] = TriegexNode(letter, False)
            node = child
        # this will ensure that we correctly match the word boundary
        node.end = True

    def to_regex(self):
        r"""
        Produce regular expression that will match each word in the
        internal trie.

        >>> t = Triegex('foo', 'bar', 'baz')
        >>> t.to_regex()
        '(?:ba(?:r\\b|z\\b)|foo\\b|~^(?#match nothing))'
        """
        return self._root.to_regex()

    def _traverse(self):
        """Yield every node of the trie exactly once, root first (depth-first)."""
        stack = [self._root]
        while stack:
            current = stack.pop()
            yield current
            stack.extend(current.children.values())

    def __iter__(self):
        """Yield the stored words.

        Children are visited in ascending character order, so a word is
        yielded before any longer word that it prefixes.
        """
        # Each stack entry carries the prefix leading to the node, so nodes
        # that share a character on different branches cannot be confused.
        stack = [(self._root, '')]
        while stack:
            node, prefix = stack.pop()
            word = prefix + node.char
            if node.end:
                yield word
            # pushed in reverse so the smallest character is popped first
            stack.extend((child, word) for child in reversed(list(node)))

    def __len__(self):
        """Return the number of stored words (walks the whole trie)."""
        return sum(1 for _ in self)

    def __contains__(self, word):
        """Return ``True`` if *word* was added and not discarded since."""
        current = self._root
        for char in word:
            if char not in current:
                return False
            current = current[char]
        return current.end  # word has to end with the last char

    def discard(self, word):
        """Remove *word* from the set if present; do nothing otherwise.

        Other stored words are left intact, including words that are a
        prefix of *word* and words that *word* is a prefix of.
        """
        path = [self._root]
        for char in word:
            node = path[-1]
            if char not in node:
                return
            path.append(node[char])
        if not path[-1].end:
            return
        path[-1].end = False
        # Prune nodes that no longer lead to any word, walking back towards
        # the root; stop at the first node that is still needed.
        while len(path) > 1:
            node = path.pop()
            if node.end or len(node) > 0:
                break
            del path[-1][node.char]