├── .gitignore ├── README.md ├── examples ├── calc.py └── wiki.py ├── LICENSE └── re_scan.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python-regex-scanner 2 | 3 | A basic implementation of a class that helps tokenize strings 4 | with Python regular expressions. 5 | 6 | For more information see [this blog post about regex gems](http://lucumr.pocoo.org/2015/11/18/pythons-hidden-re-gems/). 7 | -------------------------------------------------------------------------------- /examples/calc.py: -------------------------------------------------------------------------------- 1 | from re_scan import Scanner 2 | 3 | scanner = Scanner([ 4 | ('whitespace', r'\s+'), 5 | ('plus', r'\+'), 6 | ('minus', r'\-'), 7 | ('mult', r'\*'), 8 | ('div', r'/'), 9 | ('num', r'\d+'), 10 | ('paren_open', r'\('), 11 | ('paren_close', r'\)'), 12 | ]) 13 | 14 | for token, match in scanner.scan('(1 + 2) * 3'): 15 | print (token, match.group()) 16 | -------------------------------------------------------------------------------- /examples/wiki.py: -------------------------------------------------------------------------------- 1 | from re_scan import Scanner 2 | 3 | 4 | scanner = Scanner([ 5 | ('bold', '\*\*'), 6 | ('link_special', '\[\[(?P<target>.*?)\|(?P<text>.*?)\]\]'), 7 | ('link', '\[\[(.*?)\]\]'), 8 | ('underline', '_'), 9 | ]) 10 | 11 | input_text = 'Hello **World**! [[Stuff|extra]] _[[Stuff]]_.' 
12 | 13 | for token, match in scanner.scan_with_holes(input_text): 14 | if token is None: 15 | print 'hole', match 16 | else: 17 | print 'token', (token, match.groups(), 18 | match.groupdict(), match.group()) 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 by Armin Ronacher. 2 | 3 | Some rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | 17 | * The names of the contributors may not be used to endorse or 18 | promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"""re_scan.py -- tokenize strings with one combined regular expression.

Every rule handed to :class:`Scanner` becomes one branch of a single big
alternation.  Each branch is wrapped in its own capturing group so that
``match.lastindex`` identifies the rule that matched.  Groups used inside
a rule are renumbered and renamed behind the scenes; :class:`_ScanMatch`
translates them back so callers can address them relative to the rule.

The module relies on the private regex parser/compiler that ships with
CPython and works on Python 2.7 as well as Python 3.6+.
"""
import sys

try:
    # Python 3.11+: the parser/compiler live inside the ``re`` package and
    # the old ``sre_*`` module names are deprecated.
    from re import _compiler as _sre_compile_mod
    from re import _constants as _sre_constants
    from re import _parser as _sre_parse
except ImportError:
    import sre_compile as _sre_compile_mod
    import sre_constants as _sre_constants
    import sre_parse as _sre_parse

# Module-level names kept for backward compatibility with the original
# ``from sre_parse import Pattern, SubPattern, parse`` style imports.
# ``Pattern`` was renamed to ``State`` in Python 3.8.
Pattern = getattr(_sre_parse, 'State', None) or _sre_parse.Pattern
SubPattern = _sre_parse.SubPattern
parse = _sre_parse.parse
sre_compile = _sre_compile_mod.compile
BRANCH = _sre_constants.BRANCH
SUBPATTERN = _sre_constants.SUBPATTERN

try:
    _string_types = (basestring,)  # Python 2
except NameError:
    _string_types = (str,)

# Since Python 3.6 a SUBPATTERN node is ``(group, add_flags, del_flags, p)``
# instead of ``(group, p)``.
_SUBPATTERN_HAS_FLAGS = sys.version_info >= (3, 6)


class _ScanMatch(object):
    """Match proxy that exposes only the groups of the rule that matched.

    ``start`` and ``end`` delimit the slice of ``match.groups()`` owned by
    the rule: rule-relative group ``n`` is real group ``start + n`` and the
    last valid real group is ``end``.  Named groups are stored on the real
    match as ``<rule> NUL <name>``.  Attributes not defined here are
    forwarded to the wrapped match object.
    """

    def __init__(self, match, rule, start, end):
        self._match = match
        self._start = start
        self._end = end
        self._rule = rule

    def __getattr__(self, name):
        # Guard against infinite recursion when ``_match`` itself is not
        # set yet (for instance on an instance created without __init__).
        if name == '_match':
            raise AttributeError(name)
        return getattr(self._match, name)

    def __group_proc(self, method, group):
        """Call *method* of the real match with *group* translated.

        Raises ``IndexError`` when a numeric group lies outside the rule.
        """
        if group == 0:
            return method()
        if isinstance(group, _string_types):
            return method(self._rule + '\x00' + group)
        real_group = self._start + group
        # The lower bound matters as well: a negative index would otherwise
        # silently address a group that belongs to a different rule.
        if group < 0 or real_group > self._end:
            raise IndexError('no such group')
        return method(real_group)

    def group(self, *groups):
        """Return one group (a string) or several groups (a tuple)."""
        if len(groups) <= 1:
            wanted = groups[0] if groups and groups[0] else 0
            return self.__group_proc(self._match.group, wanted)
        return tuple(self.__group_proc(self._match.group, group)
                     for group in groups)

    def groupdict(self, default=None):
        """Return the named groups of this rule with their plain names."""
        prefix = self._rule + '\x00'
        return dict(
            (key[len(prefix):], value)
            for key, value in self._match.groupdict(default).items()
            if key.startswith(prefix)
        )

    def span(self, group=0):
        """Return ``(start, end)`` of *group* (whole match by default)."""
        return self.__group_proc(self._match.span, group)

    def groups(self, default=None):
        """Return the groups of this rule; unmatched ones are *default*."""
        return self._match.groups(default)[self._start:self._end]

    def start(self, group=0):
        """Return the start offset of *group* (whole match by default)."""
        return self.__group_proc(self._match.start, group)

    def end(self, group=0):
        """Return the end offset of *group* (whole match by default)."""
        return self.__group_proc(self._match.end, group)

    def expand(self, template):
        """Not available: group references in templates are ambiguous."""
        raise RuntimeError('Unsupported on scan matches')


class ScanEnd(Exception):
    """Raised by :meth:`Scanner.scan` when no rule matches at ``pos``."""

    def __init__(self, pos):
        Exception.__init__(self, pos)
        self.pos = pos


class Scanner(object):
    """Tokenizer built from ``(rule_name, regex)`` pairs.

    :param rules: iterable of ``(name, regex)`` pairs; earlier rules win.
    :param flags: regex flags applied to every rule.
    """

    def __init__(self, rules, flags=0):
        rules = list(rules)
        flags = int(flags)  # accept ``re.RegexFlag`` members as well

        state = Pattern()
        state.flags = flags
        # Reserve group 0 plus one wrapper group per rule, so that groups
        # opened while parsing the rules are numbered after the wrappers.
        reserved = len(rules) + 1
        if hasattr(state, 'groupwidths'):
            # Python 3.6+: ``groups`` is a read-only property derived from
            # the length of ``groupwidths``.
            state.groupwidths = [None] * reserved
        else:
            state.groups = reserved

        original_opengroup = state.opengroup

        def opengroup(group_name=None):
            # ``name`` is the rule currently being parsed (the loop variable
            # below).  Prefixing keeps equal group names used by different
            # rules from clashing inside the combined pattern.
            if group_name:
                group_name = '%s\x00%s' % (name, group_name)
            return original_opengroup(group_name)

        state.opengroup = opengroup

        self.rules = []
        branches = []
        for group, (name, regex) in enumerate(rules, 1):
            first = state.groups - 1
            parsed = parse(regex, flags, state)
            if _SUBPATTERN_HAS_FLAGS:
                node = (SUBPATTERN, (group, 0, 0, parsed))
            else:
                node = (SUBPATTERN, (group, parsed))
            branches.append(SubPattern(state, [node]))
            # (rule name, offset of the rule's groups, last real group)
            self.rules.append((name, first, state.groups - 1))

        self._scanner = sre_compile(
            SubPattern(state, [(BRANCH, (None, branches))])).scanner

    def scan(self, string, skip=False):
        """Yield ``(rule_name, match)`` pairs for *string*.

        With ``skip=False`` tokens must follow each other directly and
        :class:`ScanEnd` is raised at the first position no rule matches.
        With ``skip=True`` unmatched text between tokens is skipped.
        """
        sc = self._scanner(string)

        match = None
        for match in iter(sc.search if skip else sc.match, None):
            rule, start, end = self.rules[match.lastindex - 1]
            yield rule, _ScanMatch(match, rule, start, end)

        if not skip:
            end = match.end() if match is not None else 0
            if end < len(string):
                raise ScanEnd(end)

    def scan_with_holes(self, string):
        """Like ``scan(skip=True)`` but also yield ``(None, text)`` holes.

        A hole is the non-empty text between two tokens, before the first
        token or after the last one.
        """
        pos = 0
        for rule, match in self.scan(string, skip=True):
            hole = string[pos:match.start()]
            if hole:
                yield None, hole
            yield rule, match
            pos = match.end()
        hole = string[pos:]
        if hole:
            yield None, hole