├── README ├── tests ├── __init__.py ├── wikipages.xml.gz ├── pattern.txt ├── split.txt ├── issue4.txt ├── sub.txt ├── re2_test.py ├── match_expand.txt ├── finditer.txt ├── search.txt ├── findall.txt ├── namedgroups.txt ├── unicode.txt ├── performance.py ├── re_tests.py └── test_re.py ├── src ├── #clib.pxd# ├── _re2macros.h ├── _re2.pxd └── re2.pyx ├── .gitignore ├── Makefile ├── AUTHORS ├── MANIFEST.in ├── CHANGELIST ├── LICENSE ├── setup.py └── README.rst /README: -------------------------------------------------------------------------------- 1 | README.rst -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/#clib.pxd#: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/wikipages.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axiak/pyre2/HEAD/tests/wikipages.xml.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | /build 3 | /dist 4 | src/re2.html 5 | src/re2.so 6 | re2.so 7 | *~ 8 | *.pyc 9 | *.swp 10 | *.egg-info 11 | -------------------------------------------------------------------------------- /tests/pattern.txt: -------------------------------------------------------------------------------- 1 | pattern tests 2 | ============= 3 | 4 | >>> import re2 5 | 6 | We should be able to get back what we put in. 
7 | 8 | >>> re2.compile("(foo|b[a]r?)").pattern 9 | '(foo|b[a]r?)' 10 | -------------------------------------------------------------------------------- /tests/split.txt: -------------------------------------------------------------------------------- 1 | Split tests 2 | =========== 3 | 4 | This one tests to make sure that utf8 data is parsed correctly. 5 | 6 | >>> import re2 as re 7 | >>> a = '我很好, 你呢?'.decode('utf8') 8 | >>> print re.split(' ', a) 9 | [u'\u6211\u5f88\u597d,', u'\u4f60\u5462?'] 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | rm -rf build &>/dev/null 3 | rm -rf src/*.so &>/dev/null 4 | rm -rf re2.so &>/dev/null 5 | rm -rf src/re2.cpp &>/dev/null 6 | python setup.py --cython build_ext --inplace 7 | 8 | test: all 9 | cp -v re2.so tests 10 | (cd tests && python re2_test.py) 11 | (cd tests && python test_re.py) 12 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | All contributors own the copyright to their own contributions, but agree 2 | to release each of their contributions under the BSD license included 3 | in this software. 
4 | 5 | Michael Axiak 6 | 7 | Contributors 8 | ============ 9 | 10 | Alec Berryman 11 | Israel Tsadok 12 | Alex Willmer 13 | -------------------------------------------------------------------------------- /tests/issue4.txt: -------------------------------------------------------------------------------- 1 | issue #4 2 | ======== 3 | 4 | >>> import re2 5 | >>> TERM_SPEC2 = re2.compile('([\W\d_]*)(([^\W\d_]*[-\.]*)*[^\W\d_])([\W\d_]*[^\W\d_]*)', re2.UNICODE) 6 | >>> TERM_SPEC2.search("a").groups() 7 | ('', 'a', '', '') 8 | 9 | 10 | Still broken because of unicode: 11 | >>> TERM_SPEC2.search(u"Hello").groups() 12 | (u'', u'Hello', u'Hell', u',') 13 | -------------------------------------------------------------------------------- /tests/sub.txt: -------------------------------------------------------------------------------- 1 | Tests of substitution 2 | ===================== 3 | 4 | This first test is just looking to replace things between parentheses 5 | with an empty string. 6 | 7 | 8 | >>> import re2 as re 9 | >>> import hashlib 10 | >>> import gzip 11 | >>> data = gzip.open('wikipages.xml.gz').read() 12 | >>> print hashlib.md5(re.sub('\(.*?\)', '', data)).hexdigest() 13 | b7a469f55ab76cd5887c81dbb0cfe6d3 14 | -------------------------------------------------------------------------------- /tests/re2_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import glob 5 | import doctest 6 | 7 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) 8 | 9 | os.chdir(os.path.dirname(__file__) or '.') 10 | 11 | def testall(): 12 | for file in glob.glob(os.path.join(os.path.dirname(__file__), "*.txt")): 13 | print "Testing %s..." 
% file 14 | doctest.testfile(os.path.join(".", os.path.basename(file))) 15 | 16 | if __name__ == "__main__": 17 | testall() 18 | -------------------------------------------------------------------------------- /tests/match_expand.txt: -------------------------------------------------------------------------------- 1 | Match Expand Tests 2 | ================== 3 | 4 | Match objects have an .expand() method which allows them to 5 | expand templates as if the .sub() method was called on the pattern. 6 | 7 | >>> import re2 as re 8 | >>> m = re.match(r"(\w+) (\w+)\W+(?P\w+)", "Isaac Newton, physicist") 9 | >>> m.expand(r"\2, \1") 10 | 'Newton, Isaac' 11 | >>> m.expand(r"\1 \g<title>") 12 | 'Isaac physicist' 13 | >>> m.expand(r"\0 \1 \2") 14 | '\x00 Isaac Newton' 15 | >>> m.expand(r"\3") 16 | 'physicist' 17 | 18 | -------------------------------------------------------------------------------- /src/_re2macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __RE2MACROS_H 2 | #define __RE2MACROS_H 3 | 4 | #include <stdio.h> 5 | #include "re2/stringpiece.h" 6 | 7 | static inline re2::StringPiece * new_StringPiece_array(int n) 8 | { 9 | re2::StringPiece * sp = new re2::StringPiece[n]; 10 | return sp; 11 | } 12 | static inline void delete_StringPiece_array(re2::StringPiece* ptr) 13 | { 14 | delete[] ptr; 15 | } 16 | 17 | #define addressof(A) (&A) 18 | #define addressofs(A) (&A) 19 | 20 | #define as_char(A) (char *)(A) 21 | #define pattern_Replace(A, B, C) re2::RE2::Replace((A), (B), (C)) 22 | #define pattern_GlobalReplace(A, B, C) re2::RE2::GlobalReplace((A), (B), (C)) 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELIST 2 | include Makefile 3 | include LICENSE 4 | include README 5 | include tests/cnn_homepage.dat 6 | include tests/performance.py 7 | include 
tests/search.txt 8 | include tests/finditer.txt 9 | include tests/wikipages.xml.gz 10 | include tests/__init__.py 11 | include tests/match_expand.txt 12 | include tests/test.py 13 | include tests/pattern.txt 14 | include tests/sub.txt 15 | include tests/unicode.txt 16 | include tests/findall.txt 17 | include tests/split.txt 18 | include AUTHORS 19 | include README.rst 20 | include src/_re2macros.h 21 | include src/_re2.pxd 22 | include src/re2.cpp 23 | include src/re2.pyx 24 | include MANIFEST 25 | include setup.py 26 | -------------------------------------------------------------------------------- /tests/finditer.txt: -------------------------------------------------------------------------------- 1 | Simple tests for the ``finditer`` function. 2 | =========================================== 3 | 4 | >>> import re2 as re 5 | 6 | >>> len(list(re.finditer(r'\w+', open("cnn_homepage.dat").read()))) 7 | 14230 8 | 9 | >>> [m.group(1) for m in re.finditer(r'\n#hdr-editions(.*?)\n', open("cnn_homepage.dat").read())] 10 | [' a { text-decoration:none; }', ' li { padding:0 10px; }', ' ul li.no-pad-left span { font-size:12px; }'] 11 | 12 | >>> [m.group(1) for m in re.finditer(r'^#hdr-editions(.*?)$', open("cnn_homepage.dat").read(), re.M)] 13 | [' a { text-decoration:none; }', ' li { padding:0 10px; }', ' ul li.no-pad-left span { font-size:12px; }'] 14 | 15 | -------------------------------------------------------------------------------- /tests/search.txt: -------------------------------------------------------------------------------- 1 | These are simple tests of the ``search`` function 2 | ================================================= 3 | 4 | >>> import re2 as re 5 | >>> re.search("((?:[01]?\d{1,2}|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d{1,2}|2[0-4]\d|25[0-5])", "hello 28.224.2.1 test").group() 6 | '28.224.2.1' 7 | 8 | >>> re.search("(\d{3})\D?(\d{3})\D?(\d{4})", "800-555-1212").groups() 9 | ('800', '555', '1212') 10 | 11 | >>> input = 'a' * 999 12 | >>> 
len(re.search('(?:a{1000})?a{999}', input).group()) 13 | 999 14 | 15 | >>> re.search(r'\n#hdr-editions(.*?)\n', open("cnn_homepage.dat").read()).groups() 16 | (' a { text-decoration:none; }',) 17 | 18 | Verify some sanity checks 19 | 20 | >>> re.compile(r'x').search('x', 2000) 21 | >>> re.compile(r'x').search('x', 1, -300) 22 | 23 | -------------------------------------------------------------------------------- /tests/findall.txt: -------------------------------------------------------------------------------- 1 | findall tests 2 | ============= 3 | 4 | >>> import re2 5 | 6 | This one is from http://docs.python.org/library/re.html?#finding-all-adverbs: 7 | 8 | >>> re2.findall(r"\w+ly", "He was carefully disguised but captured quickly by police.") 9 | ['carefully', 'quickly'] 10 | 11 | This one makes sure all groups are found: 12 | 13 | >>> re2.findall(r"(\w+)=(\d+)", "foo=1,foo=2") 14 | [('foo', '1'), ('foo', '2')] 15 | 16 | When there's only one matched group, it should not be returned in a tuple: 17 | 18 | >>> re2.findall(r"(\w)\w", "fx") 19 | ['f'] 20 | 21 | Zero matches is an empty list: 22 | 23 | >>> re2.findall("(f)", "gggg") 24 | [] 25 | 26 | If pattern matches an empty string, do it only once at the end: 27 | 28 | >>> re2.findall(".*", "foo") 29 | ['foo', ''] 30 | 31 | >>> re2.findall("", "foo") 32 | ['', '', '', ''] 33 | -------------------------------------------------------------------------------- /tests/namedgroups.txt: -------------------------------------------------------------------------------- 1 | Testing some aspects of named groups 2 | ================================================= 3 | 4 | >>> import re2 as re 5 | 6 | >>> m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds") 7 | >>> m.start("first_name") 8 | 0 9 | >>> m.start("last_name") 10 | 8 11 | 12 | >>> m.span("last_name") 13 | (8, 16) 14 | >>> m.regs 15 | ((0, 16), (0, 7), (8, 16)) 16 | 17 | Make sure positions are converted properly for unicode 18 | 19 | >>> m 
= re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", u'\u05d9\u05e9\u05e8\u05d0\u05dc \u05e6\u05d3\u05d5\u05e7', re.UNICODE) 20 | >>> m.start("first_name") 21 | 0 22 | >>> m.start("last_name") 23 | 6 24 | >>> m.end("last_name") 25 | 10 26 | >>> m.regs 27 | ((0, 10), (0, 5), (6, 10)) 28 | >>> m.span(2) 29 | (6, 10) 30 | >>> m.span("last_name") 31 | (6, 10) 32 | 33 | -------------------------------------------------------------------------------- /CHANGELIST: -------------------------------------------------------------------------------- 1 | 0.2.12) 2 | - Fixed pyre2 to work with latest version of re2 (axiak) (issue #3) 3 | 4 | 0.2.10) 2010-12-08 5 | - Added .flags to pattern to make that transparent (axiak) 6 | - Added Python re unit tests (itsadok) 7 | - Fixed error compatibility (axiak) 8 | - Fixed group spans to be translated to their decoded positions (itsadok) 9 | - Fixed test_bug_1140 in unit test (itsadok) 10 | - Handle \n in replace manually (itsadok) 11 | - Return an interator from finditer (itsadok) 12 | - Have re.compile() accept SRE objects (moreati, itsadok) 13 | - Fixed findall to use group(1) if available (itsadok) 14 | - Fixed a mistaken use of verbose (itsadok) 15 | - Fixed a memory leak in replacement (itsadok) 16 | - Match delete[] to new[] calls to fix more memory leaks (itsadok) 17 | - Change split to handle empty matches to be more compatible with sre.c (itsadok) 18 | - Added group property to match re (itsadok) 19 | - Added the ability to fallback to old re in case of back references (itsadok) 20 | - Allow multiple arguments to group() (itsadok) 21 | - Fixed infinite loop in pathological case of findall(".*", "foo") 22 | 23 | 0.2.8) 2010-07-27 24 | - Added .expand() to group objects (axiak) 25 | - Input patterns are now kept for Python compatibility (alec) 26 | - Fixed 64-bit support (alec) 27 | - Fixed findall to support python symantics (alec) 28 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2010, Michael Axiak <mike@axiak.net> 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
10 | -------------------------------------------------------------------------------- /tests/unicode.txt: -------------------------------------------------------------------------------- 1 | Here are some tests to make sure that utf-8 works 2 | ================================================= 3 | 4 | >>> import re2 as re 5 | >>> a = u'\u6211\u5f88\u597d' 6 | >>> c = re.compile(a[0]) 7 | >>> c.search(a).group() 8 | u'\u6211' 9 | 10 | Test unicode stickyness 11 | 12 | >>> re.sub(r'x', u'y', 'x') 13 | u'y' 14 | >>> re.sub(r'x', 'y', u'x') 15 | u'y' 16 | >>> re.sub(ur'x', 'y', 'x') 17 | 'y' 18 | >>> re.findall(ur'.', 'x') 19 | ['x'] 20 | >>> re.findall(ur'.', u'x') 21 | [u'x'] 22 | >>> re.split(ur',', '1,2,3') 23 | ['1', '2', '3'] 24 | >>> re.split(ur',', u'1,2,3') 25 | [u'1', u'2', u'3'] 26 | >>> re.search(ur'(\d)', '1').group(1) 27 | '1' 28 | >>> re.search(ur'(\d)', u'1').group(1) 29 | u'1' 30 | 31 | Test unicode character groups 32 | 33 | >>> re.search(r'\d', u'\u0661', re.UNICODE).group(0) 34 | u'\u0661' 35 | >>> int(re.search(r'\d', u'\u0661', re.UNICODE).group(0)) 36 | 1 37 | >>> re.search(r'\w', u'\u0401') 38 | >>> re.search(r'\w', u'\u0401', re.UNICODE).group(0) 39 | u'\u0401' 40 | >>> re.search(r'\s', u'\u1680', re.UNICODE).group(0) 41 | u'\u1680' 42 | >>> re.findall(r'[\s\d\w]', 'hey 123', re.UNICODE) 43 | ['h', 'e', 'y', ' ', '1', '2', '3'] 44 | >>> re.search(r'\D', u'\u0661x', re.UNICODE).group(0) 45 | u'x' 46 | >>> re.search(r'\W', u'\u0401!', re.UNICODE).group(0) 47 | u'!' 48 | >>> re.search(r'\S', u'\u1680x', re.UNICODE).group(0) 49 | u'x' 50 | >>> re.search(r'[\D]', u'\u0661x', re.UNICODE).group(0) 51 | u'x' 52 | >>> re.search(r'[\W]', u'\u0401!', re.UNICODE).group(0) 53 | u'!' 
54 | >>> re.search(r'[\S]', u'\u1680x', re.UNICODE).group(0) 55 | u'x' 56 | 57 | 58 | Group positions need to be fixed with unicode 59 | 60 | >>> re.search(r' (.)', u'\U0001d200xxx\u1234 x').span(1) 61 | (6, 7) 62 | >>> re.search(r' (.)', u'\U0001d200xxx\u1234 x'.encode('utf-8')).span(1) 63 | (11, 12) 64 | 65 | Pos and endpos also need to be corrected 66 | 67 | >>> re.compile(r'x').findall(u'\u1234x', 1, 2) 68 | [u'x'] 69 | 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import re 5 | from distutils.core import setup, Extension, Command 6 | 7 | MINIMUM_CYTHON_VERSION = '0.13' 8 | 9 | 10 | def cmp(a, b): 11 | return (a > b) - (a < b) 12 | 13 | 14 | class TestCommand(Command): 15 | description = 'Run packaged tests' 16 | user_options = [] 17 | def initialize_options(self): 18 | pass 19 | 20 | def finalize_options(self): 21 | pass 22 | 23 | def run(self): 24 | from tests import re2_test 25 | re2_test.testall() 26 | 27 | 28 | def version_compare(version1, version2): 29 | def normalize(v): 30 | return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] 31 | return cmp(normalize(version1), normalize(version2)) 32 | 33 | cmdclass = {'test': TestCommand} 34 | 35 | ext_files = [] 36 | if '--cython' in sys.argv[1:]: 37 | # Using Cython 38 | sys.argv.remove('--cython') 39 | from Cython.Compiler.Main import Version 40 | if version_compare(MINIMUM_CYTHON_VERSION, Version.version) > 0: 41 | raise ValueError("Cython is version %s, but needs to be at least %s." 
% 42 | (Version.version, MINIMUM_CYTHON_VERSION)) 43 | from Cython.Distutils import build_ext 44 | cmdclass['build_ext'] = build_ext 45 | ext_files.append("src/re2.pyx") 46 | else: 47 | # Building from C 48 | ext_files.append("src/re2.cpp") 49 | 50 | 51 | # Locate the re2 module 52 | _re2_prefixes = [ 53 | '/usr', 54 | '/usr/local', 55 | '/opt/', 56 | ] 57 | 58 | for re2_prefix in _re2_prefixes: 59 | if os.path.exists(os.path.join(re2_prefix, "include", "re2")): 60 | break 61 | else: 62 | re2_prefix = "" 63 | 64 | BASE_DIR = os.path.dirname(__file__) 65 | 66 | def get_long_description(): 67 | readme_f = open(os.path.join(BASE_DIR, "README.rst")) 68 | readme = readme_f.read() 69 | readme_f.close() 70 | return readme 71 | 72 | def get_authors(): 73 | author_re = re.compile(r'^\s*(.*?)\s+<.*?\@.*?>', re.M) 74 | authors_f = open(os.path.join(BASE_DIR, "AUTHORS")) 75 | authors = [match.group(1) for match in author_re.finditer(authors_f.read())] 76 | authors_f.close() 77 | return ', '.join(authors) 78 | 79 | def main(): 80 | setup( 81 | name="re2", 82 | version="0.2.23", 83 | description="Python wrapper for Google's RE2 using Cython", 84 | long_description=get_long_description(), 85 | author=get_authors(), 86 | license="New BSD License", 87 | author_email = "mike@axiak.net", 88 | url = "http://github.com/axiak/pyre2/", 89 | ext_modules = [ 90 | Extension( 91 | "re2", 92 | ext_files, 93 | language="c++", 94 | include_dirs=[os.path.join(re2_prefix, "include")] if re2_prefix else [], 95 | libraries=["re2"], 96 | extra_compile_args=['-std=c++11'], 97 | library_dirs=[os.path.join(re2_prefix, "lib")] if re2_prefix else [], 98 | runtime_library_dirs=[os.path.join(re2_prefix, "lib")] if re2_prefix else [], 99 | ) 100 | ], 101 | cmdclass=cmdclass, 102 | classifiers = [ 103 | 'License :: OSI Approved :: BSD License', 104 | 'Programming Language :: Cython', 105 | 'Programming Language :: Python :: 2.5', 106 | 'Programming Language :: Python :: 2.6', 107 | 'Intended Audience :: 
Developers', 108 | 'Topic :: Software Development :: Libraries :: Python Modules', 109 | ], 110 | ) 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /src/_re2.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from *: 2 | ctypedef char* const_char_ptr "const char*" 3 | 4 | cdef extern from "<string>" namespace "std": 5 | cdef cppclass string: 6 | string(char *) 7 | string(char *, size_t n) 8 | const_char_ptr c_str() 9 | int length() 10 | void push_back(char c) 11 | 12 | ctypedef string cpp_string "std::string" 13 | ctypedef string const_string "const std::string" 14 | 15 | 16 | 17 | cdef extern from "<map>" namespace "std": 18 | cdef cppclass stringintmapiterator "std::map<std::string, int>::const_iterator": 19 | cpp_string first 20 | int second 21 | stringintmapiterator operator++() 22 | bint operator==(stringintmapiterator) 23 | stringintmapiterator& operator*(stringintmapiterator) 24 | bint operator!=(stringintmapiterator) 25 | 26 | cdef cppclass const_stringintmap "const std::map<std::string, int>": 27 | stringintmapiterator begin() 28 | stringintmapiterator end() 29 | int operator[](cpp_string) 30 | 31 | 32 | cdef extern from "Python.h": 33 | int PyObject_AsCharBuffer(object, const_char_ptr *, Py_ssize_t *) 34 | char * PyString_AS_STRING(object) 35 | 36 | cdef extern from "re2/stringpiece.h" namespace "re2": 37 | cdef cppclass StringPiece: 38 | StringPiece() 39 | StringPiece(const_char_ptr) 40 | StringPiece(const_char_ptr, int) 41 | const_char_ptr data() 42 | int copy(char * buf, size_t n, size_t pos) 43 | int length() 44 | 45 | ctypedef StringPiece const_StringPiece "const StringPiece" 46 | 47 | cdef extern from "re2/re2.h" namespace "re2": 48 | cdef enum Anchor: 49 | UNANCHORED "RE2::UNANCHORED" 50 | ANCHOR_START "RE2::ANCHOR_START" 51 | ANCHOR_BOTH "RE2::ANCHOR_BOTH" 52 | 53 | ctypedef Anchor re2_Anchor "RE2::Anchor" 
54 | 55 | cdef enum ErrorCode: 56 | NoError "RE2::NoError" 57 | ErrorInternal "RE2::ErrorInternal" 58 | # Parse errors 59 | ErrorBadEscape "RE2::ErrorBadEscape" # bad escape sequence 60 | ErrorBadCharClass "RE2::ErrorBadCharClass" # bad character class 61 | ErrorBadCharRange "RE2::ErrorBadCharRange" # bad character class range 62 | ErrorMissingBracket "RE2::ErrorMissingBracket" # missing closing ] 63 | ErrorMissingParen "RE2::ErrorMissingParen" # missing closing ) 64 | ErrorTrailingBackslash "RE2::ErrorTrailingBackslash" # trailing \ at end of regexp 65 | ErrorRepeatArgument "RE2::ErrorRepeatArgument" # repeat argument missing, e.g. "*" 66 | ErrorRepeatSize "RE2::ErrorRepeatSize" # bad repetition argument 67 | ErrorRepeatOp "RE2::ErrorRepeatOp" # bad repetition operator 68 | ErrorBadPerlOp "RE2::ErrorBadPerlOp" # bad perl operator 69 | ErrorBadUTF8 "RE2::ErrorBadUTF8" # invalid UTF-8 in regexp 70 | ErrorBadNamedCapture "RE2::ErrorBadNamedCapture" # bad named capture group 71 | ErrorPatternTooLarge "RE2::ErrorPatternTooLarge" # pattern too large (compile failed) 72 | 73 | cdef enum Encoding: 74 | EncodingUTF8 "RE2::Options::EncodingUTF8" 75 | EncodingLatin1 "RE2::Options::EncodingLatin1" 76 | 77 | ctypedef Encoding re2_Encoding "RE2::Options::Encoding" 78 | 79 | cdef cppclass Options "RE2::Options": 80 | Options() 81 | void set_posix_syntax(int b) 82 | void set_longest_match(int b) 83 | void set_log_errors(int b) 84 | void set_max_mem(int m) 85 | void set_literal(int b) 86 | void set_never_nl(int b) 87 | void set_case_sensitive(int b) 88 | void set_perl_classes(int b) 89 | void set_word_boundary(int b) 90 | void set_one_line(int b) 91 | int case_sensitive() 92 | void set_encoding(re2_Encoding encoding) 93 | 94 | ctypedef Options const_Options "const RE2::Options" 95 | 96 | cdef cppclass RE2: 97 | RE2(const_StringPiece pattern, Options option) nogil 98 | RE2(const_StringPiece pattern) nogil 99 | int Match(const_StringPiece text, int startpos, int endpos, 100 | Anchor 
anchor, StringPiece * match, int nmatch) nogil 101 | int NumberOfCapturingGroups() 102 | int ok() 103 | const_string pattern() 104 | cpp_string error() 105 | ErrorCode error_code() 106 | const_stringintmap& NamedCapturingGroups() 107 | 108 | ctypedef RE2 const_RE2 "const RE2" 109 | 110 | 111 | # This header is used for ways to hack^Wbypass the cython 112 | # issues. 113 | cdef extern from "_re2macros.h": 114 | StringPiece * new_StringPiece_array(int) nogil 115 | void delete_StringPiece_array(StringPiece* ptr) 116 | 117 | # This fixes the bug Cython #548 whereby reference returns 118 | # cannot be addressed, due to it not being an l-value 119 | const_stringintmap * addressof(const_stringintmap&) 120 | cpp_string * addressofs(cpp_string&) 121 | char * as_char(const_char_ptr) 122 | 123 | # This fixes the bug whereby namespaces are causing 124 | # cython to just break for Cpp arguments. 125 | int pattern_Replace(cpp_string *str, 126 | const_RE2 pattern, 127 | const_StringPiece rewrite) 128 | int pattern_GlobalReplace(cpp_string *str, 129 | const_RE2 pattern, 130 | const_StringPiece rewrite) 131 | -------------------------------------------------------------------------------- /tests/performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This module runs the performance tests to compare the ``re`` module with the 4 | ``re2`` module. You can just run it from the command line, assuming you have re2 5 | installed, and it will output a table in ReST format comparing everything. 6 | 7 | To add a test, you can add a function to the bottom of this page that uses the 8 | @register_test() decorator. Alternatively, you can create a module that uses it and 9 | import it. 
10 | """ 11 | from timeit import Timer 12 | import simplejson 13 | 14 | import re2 15 | import re 16 | try: 17 | import regex 18 | except ImportError: 19 | regex = None 20 | 21 | import os 22 | import gzip 23 | 24 | re2.set_fallback_notification(re2.FALLBACK_EXCEPTION) 25 | 26 | os.chdir(os.path.dirname(__file__) or '.') 27 | 28 | tests = {} 29 | 30 | setup_code = """\ 31 | import re2 32 | import re 33 | from __main__ import tests, current_re 34 | test = tests[%r] 35 | """ 36 | 37 | current_re = [None] 38 | 39 | 40 | 41 | 42 | def main(): 43 | benchmarks = {} 44 | # Run all of the performance comparisons. 45 | for testname, method in tests.items(): 46 | benchmarks[testname] = {} 47 | if regex is not None: 48 | modules = (re, re2, regex) 49 | else: 50 | modules = (re, re2) 51 | results = [None for module in modules] 52 | for i, module in enumerate(modules): 53 | # We pre-compile the pattern, because that's 54 | # what people do. 55 | current_re[0] = module.compile(method.pattern) 56 | 57 | results[i] = method(current_re[0], **method.data) 58 | 59 | # Run a test. 60 | t = Timer("test(current_re[0],**test.data)", 61 | setup_code % testname) 62 | benchmarks[testname][module.__name__] = (t.timeit(method.num_runs), 63 | method.__doc__.strip(), 64 | method.pattern, 65 | method.num_runs) 66 | for i in range(len(results) - 1): 67 | if results[i] != results[i + 1]: 68 | raise ValueError("re2 output is not the same as re output: %s" % testname) 69 | 70 | benchmarks_to_ReST(benchmarks) 71 | 72 | 73 | def benchmarks_to_ReST(benchmarks): 74 | """ 75 | Convert dictionary to a nice table for ReST. 
76 | """ 77 | if regex is not None: 78 | headers = ('Test', 'Description', '# total runs', '``re`` time(s)', '``re2`` time(s)', '% ``re`` time', '``regex`` time(s)', '% ``regex`` time') 79 | else: 80 | headers = ('Test', 'Description', '# total runs', '``re`` time(s)', '``re2`` time(s)', '% ``regex`` time') 81 | table = [headers] 82 | f = lambda x: "%0.3f" % x 83 | p = lambda x: "%0.2f%%" % (x * 100) 84 | 85 | for test, data in benchmarks.items(): 86 | row = [test, data["re"][1], str(data["re"][3]), f(data["re"][0]), f(data["re2"][0])] 87 | 88 | row.append(p(data["re2"][0] / data["re"][0])) 89 | if regex is not None: 90 | row.extend((f(data["regex"][0]), p(data["re2"][0] / data["regex"][0]))) 91 | table.append(row) 92 | col_sizes = [0] * len(table[0]) 93 | for col in range(len(table[0])): 94 | col_sizes[col] = max(len(row[col]) for row in table) 95 | 96 | def print_divider(symbol='-'): 97 | print '+' + '+'.join(symbol*col_size for col_size in col_sizes) + '+' 98 | def print_row(row): 99 | print '|' + '|'.join(item.ljust(col_sizes[i]) for i, item in enumerate(row)) + '|' 100 | 101 | print_divider() 102 | print_row(table[0]) 103 | print_divider('=') 104 | for row in table[1:]: 105 | print_row(row) 106 | print_divider() 107 | 108 | 109 | 110 | 111 | 112 | ############################################### 113 | # Tests for performance 114 | ############################################### 115 | 116 | 117 | # Convenient decorator for registering a new test. 118 | def register_test(name, pattern, num_runs = 100, **data): 119 | def decorator(method): 120 | tests[name] = method 121 | method.pattern = pattern 122 | method.num_runs = num_runs 123 | method.data = data 124 | 125 | return method 126 | return decorator 127 | 128 | 129 | # This is the only function to get data right now, 130 | # but I could imagine other functions as well. 
131 | _wikidata = None 132 | def getwikidata(): 133 | global _wikidata 134 | if _wikidata is None: 135 | _wikidata = gzip.open('wikipages.xml.gz').read() 136 | return _wikidata 137 | 138 | 139 | 140 | #register_test("Findall URI|Email", 141 | # r'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)', 142 | # 2, 143 | # data=getwikidata()) 144 | def findall_uriemail(pattern, data): 145 | """ 146 | Find list of '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)' 147 | """ 148 | return len(pattern.findall(data)) 149 | 150 | 151 | 152 | #register_test("Replace WikiLinks", 153 | # r'(\[\[(^\|)+.*?\]\])', 154 | # data=getwikidata()) 155 | def replace_wikilinks(pattern, data): 156 | """ 157 | This test replaces links of the form [[Obama|Barack_Obama]] to Obama. 158 | """ 159 | return len(pattern.sub(r'\1', data)) 160 | 161 | 162 | 163 | #register_test("Remove WikiLinks", 164 | # r'(\[\[(^\|)+.*?\]\])', 165 | # data=getwikidata()) 166 | def remove_wikilinks(pattern, data): 167 | """ 168 | This test replaces links of the form [[Obama|Barack_Obama]] to the empty string 169 | """ 170 | return len(pattern.sub(r'', data)) 171 | 172 | 173 | 174 | 175 | 176 | #register_test("Remove WikiLinks", 177 | # r'(<page[^>]*>)', 178 | # data=getwikidata()) 179 | def split_pages(pattern, data): 180 | """ 181 | This test splits the data by the <page> tag. 182 | """ 183 | return len(pattern.split(data)) 184 | 185 | 186 | def getweblogdata(): 187 | return open(os.path.join(os.path.dirname(__file__), 'access.log')) 188 | 189 | @register_test("weblog scan", 190 | #r'^(\S+) (\S+) (\S+) \[(\d{1,2})/(\w{3})/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -(\d{4})\] "(\S+) (\S+) (\S+)" (\d+) (\d+|-) "([^"]+)" "([^"]+)"\n', 191 | # '(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ? (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (".*?"|-) (\S+) (\S+) (\S+) (\S+)', 192 | '(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ? 
(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+)', 193 | data=getweblogdata()) 194 | def weblog_matches(pattern, data): 195 | """ 196 | Match weblog data line by line. 197 | """ 198 | total=0 199 | for line in data.read()[:20000].splitlines(): 200 | p = pattern.search(line) 201 | #for p in pattern.finditer(data.read()[:20000]): 202 | if p: 203 | total += len(p.groups()) 204 | data.seek(0) 205 | 206 | return 0 207 | 208 | if __name__ == '__main__': 209 | main() 210 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | pyre2 3 | ===== 4 | 5 | .. contents:: 6 | 7 | Summary 8 | ======= 9 | 10 | pyre2 is a Python extension that wraps 11 | `Google's RE2 regular expression library 12 | <http://code.google.com/p/re2/>`_. 13 | 14 | This version of pyre2 is similar to the one you'd 15 | find at `facebook's github repository <http://github.com/facebook/pyre2/>`_ 16 | except that the stated goal of this version is to be a *drop-in replacement* for 17 | the ``re`` module. 18 | 19 | Backwards Compatibility 20 | ======================= 21 | 22 | The stated goal of this module is to be a drop-in replacement for ``re``. 23 | My hope is that some will be able to go to the top of their module and put:: 24 | 25 | try: 26 | import re2 as re 27 | except ImportError: 28 | import re 29 | 30 | That being said, there are features of the ``re`` module that this module may 31 | never have. For example, ``RE2`` does not handle lookahead assertions (``(?=...)``). 32 | For this reason, the module will automatically fall back to the original ``re`` module 33 | if there is a regex that it cannot handle. 34 | 35 | However, there are times when you may want to be notified of a failover. For this reason, 36 | I'm adding the single function ``set_fallback_notification`` to the module. 
37 | Thus, you can write:: 38 | 39 | try: 40 | import re2 as re 41 | except ImportError: 42 | import re 43 | else: 44 | re.set_fallback_notification(re.FALLBACK_WARNING) 45 | 46 | And in the above example, ``set_fallback_notification`` can handle 3 values: 47 | ``re.FALLBACK_QUIETLY`` (default), ``re.FALLBACK_WARNING`` (raises a warning), and 48 | ``re.FALLBACK_EXCEPTION`` (which raises an exception). 49 | 50 | **Note**: The re2 module treats byte strings as UTF-8. This is fully backwards compatible with 7-bit ascii. 51 | However, bytes containing values larger than 0x7f are going to be treated very differently in re2 than in re. 52 | The RE library quietly ignores invalid utf8 in input strings, and throws an exception on invalid utf8 in patterns. 53 | For example: 54 | 55 | >>> re.findall(r'.', '\x80\x81\x82') 56 | ['\x80', '\x81', '\x82'] 57 | >>> re2.findall(r'.', '\x80\x81\x82') 58 | [] 59 | 60 | If you require the use of regular expressions over an arbitrary stream of bytes, then this library might not be for you. 61 | 62 | Installation 63 | ============ 64 | 65 | To install, you must first install the prerequisites: 66 | 67 | * The `re2 library from Google <http://code.google.com/p/re2/>`_ 68 | * The Python development headers (e.g. *sudo apt-get install python-dev*) 69 | * A build environment with ``g++`` (e.g. *sudo apt-get install build-essential*) 70 | 71 | After the prerequisites are installed, you can try installing using ``easy_install``:: 72 | 73 | $ sudo easy_install re2 74 | 75 | if you have setuptools installed (or use ``pip``). 76 | 77 | If you don't want to use ``setuptools``, you can alternatively download the tarball from `pypi <http://pypi.python.org/pypi/re2/>`_. 78 | 79 | Alternative to those, you can clone this repository and try installing it from there. 
To do this, run:: 80 | 81 | $ git clone git://github.com/axiak/pyre2.git 82 | $ cd pyre2.git 83 | $ sudo python setup.py install 84 | 85 | If you want to make changes to the bindings, you must have Cython >=0.13. 86 | 87 | Unicode Support 88 | =============== 89 | 90 | One current issue is Unicode support. As you may know, ``RE2`` supports UTF8, 91 | which is certainly distinct from unicode. Right now the module will automatically 92 | encode any unicode string into utf8 for you, which is *slow* (it also has to 93 | decode utf8 strings back into unicode objects on every substitution or split). 94 | Therefore, you are better off using bytestrings in utf8 while working with RE2 95 | and doing any unicode encoding or decoding only after all the work is finished. 96 | 97 | Performance 98 | =========== 99 | 100 | Performance is of course the point of this module, so it better perform well. 101 | Regular expressions vary widely in complexity, and the salient feature of ``RE2`` is 102 | that it behaves well asymptotically. This being said, for very simple substitutions, 103 | I've found that occasionally python's regular ``re`` module is actually slightly faster. 104 | However, when the ``re`` module gets slow, it gets *really* slow, while this module 105 | buzzes along. 106 | 107 | In the below example, I'm running the data against 8MB of text from the colossal Wikipedia 108 | XML file. I'm running them multiple times, being careful to use the ``timeit`` module. 109 | To see more details, please see the `performance script <http://github.com/axiak/pyre2/tree/master/tests/performance.py>`_.
110 | 111 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 112 | |Test |Description |# total runs|``re`` time(s)|``re2`` time(s)|% ``re`` time|``regex`` time(s)|% ``regex`` time| 113 | +=================+===========================================================================+============+==============+===============+=============+=================+================+ 114 | |Findall URI|Email|Find list of '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)'|2 |19.961 |0.336 |1.68% |11.463 |2.93% | 115 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 116 | |Replace WikiLinks|This test replaces links of the form [[Obama|Barack_Obama]] to Obama. |100 |16.032 |2.622 |16.35% |2.895 |90.54% | 117 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 118 | |Remove WikiLinks |This test splits the data by the <page> tag. |100 |15.983 |1.406 |8.80% |2.252 |62.43% | 119 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 120 | 121 | Feel free to add more speed tests to the bottom of the script and send a pull request my way! 122 | 123 | Current Status 124 | ============== 125 | 126 | pyre2 has only received basic testing. Please use it 127 | and let me know if you run into any issues! 128 | 129 | Contact 130 | ======= 131 | 132 | You can file bug reports on GitHub, or contact the author: 133 | `Mike Axiak contact page <http://mike.axiak.net/contact>`_. 
134 | 135 | Tests 136 | ===== 137 | 138 | If you would like to help, one thing that would be very useful 139 | is writing comprehensive tests for this. It's actually really easy: 140 | 141 | * Come up with regular expression problems using the regular python 're' module. 142 | * Write a session in python traceback format `Example <http://github.com/axiak/pyre2/blob/master/tests/search.txt>`_. 143 | * Replace your ``import re`` with ``import re2 as re``. 144 | * Save it as a .txt file in the tests directory. You can comment on it however you like and indent the code with 4 spaces. 145 | 146 | Missing Features 147 | ================ 148 | 149 | Currently the features missing are: 150 | 151 | * If you use substitution methods without a callback, a non 0/1 maxsplit argument is not supported. 152 | 153 | 154 | Credits 155 | ======= 156 | 157 | Though I ripped out the code, I'd like to thank David Reiss 158 | and Facebook for the initial inspiration. Plus, I got to 159 | gut this readme file! 160 | 161 | Moreover, this library would of course not be possible if not for 162 | the immense work of the team at RE2 and the few people who work 163 | on Cython. 164 | -------------------------------------------------------------------------------- /tests/re_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python -*- 3 | 4 | # Re test suite and benchmark suite v1.5 5 | 6 | # The 3 possible outcomes for each pattern 7 | [SUCCEED, FAIL, SYNTAX_ERROR] = range(3) 8 | 9 | # Benchmark suite (needs expansion) 10 | # 11 | # The benchmark suite does not test correctness, just speed. The 12 | # first element of each tuple is the regex pattern; the second is a 13 | # string to match it against. The benchmarking code will embed the 14 | # second string inside several sizes of padding, to test how regex 15 | # matching performs on large strings. 
benchmarks = [

    # Alternations sharing a common prefix, bare and grouped.
    ('Python|Perl', 'Perl'),                   # Alternation
    ('(Python|Perl)', 'Perl'),                 # Grouped alternation

    ('Python|Perl|Tcl', 'Perl'),               # Alternation
    ('(Python|Perl|Tcl)', 'Perl'),             # Grouped alternation

    ('(Python)\\1', 'PythonPython'),           # Backreference
    ('([0a-z][a-z0-9]*,)+', 'a5,b7,c9,'),      # Disable the fastmap optimization
    ('([a-z][a-z0-9]*,)+', 'a5,b7,c9,'),       # A few sets

    # Literal-text patterns of increasing pathology.
    ('Python', 'Python'),                      # Simple text literal
    ('.*Python', 'Python'),                    # Bad text literal
    ('.*Python.*', 'Python'),                  # Worse text literal
    ('.*(Python)', 'Python'),                  # Bad text literal with grouping

]

# Test suite (for verifying correctness)
#
# The test suite is a list of 5- or 3-tuples.  The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
#         1: the string to match against the pattern
#         2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR)
#         3: a string that will be eval()'ed to produce a test string.
#            This is an arbitrary Python expression; the available
#            variables are "found" (the whole match), and "g1", "g2", ...
#            up to "g99" contain the contents of each group, or the
#            string 'None' if the group wasn't given a value, or the
#            string 'Error' if the group index was out of range;
#            also "groups", the return value of m.group() (a tuple).
#         4: The expected result of evaluating the expression.
#            If the two don't match, an error is reported.
#
# If the regex isn't expected to work, the latter two elements can be omitted.
55 | 56 | tests = [ 57 | # Test ?P< and ?P= extensions 58 | ('(?P<foo_123', '', SYNTAX_ERROR), # Unterminated group identifier 59 | ('(?P<1>a)', '', SYNTAX_ERROR), # Begins with a digit 60 | ('(?P<!>a)', '', SYNTAX_ERROR), # Begins with an illegal char 61 | ('(?P<foo!>a)', '', SYNTAX_ERROR), # Begins with an illegal char 62 | 63 | # Same tests, for the ?P= form 64 | ('(?P<foo_123>a)(?P=foo_123', 'aa', SYNTAX_ERROR), 65 | ('(?P<foo_123>a)(?P=1)', 'aa', SYNTAX_ERROR), 66 | ('(?P<foo_123>a)(?P=!)', 'aa', SYNTAX_ERROR), 67 | ('(?P<foo_123>a)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group 68 | 69 | ('(?P<foo_123>a)', 'a', SUCCEED, 'g1', 'a'), 70 | ('(?P<foo_123>a)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'), 71 | 72 | # Test octal escapes 73 | ('\\1', 'a', SYNTAX_ERROR), # Backreference 74 | ('[\\1]', '\1', SUCCEED, 'found', '\1'), # Character 75 | ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), 76 | ('\\141', 'a', SUCCEED, 'found', 'a'), 77 | ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), 78 | 79 | # Test \0 is handled everywhere 80 | (r'\0', '\0', SUCCEED, 'found', '\0'), 81 | (r'[\0a]', '\0', SUCCEED, 'found', '\0'), 82 | (r'[a\0]', '\0', SUCCEED, 'found', '\0'), 83 | (r'[^a\0]', '\0', FAIL), 84 | 85 | # Test various letter escapes 86 | (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), 87 | (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), 88 | # NOTE: not an error under PCRE/PRE: 89 | # (r'\u', '', SYNTAX_ERROR), # A Perl escape 90 | (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), 91 | (r'\xff', '\377', SUCCEED, 'found', chr(255)), 92 | # new \x semantics 93 | (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), 94 | (r'\x00f', '\017', FAIL, 'found', chr(15)), 95 | (r'\x00fe', '\376', FAIL, 'found', chr(254)), 96 | # (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)), 
97 | # (r'\x00f', '\017', SUCCEED, 'found', chr(15)), 98 | # (r'\x00fe', '\376', SUCCEED, 'found', chr(254)), 99 | 100 | (r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", 101 | SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"), 102 | 103 | # Test that . only matches \n in DOTALL mode 104 | ('a.b', 'acb', SUCCEED, 'found', 'acb'), 105 | ('a.b', 'a\nb', FAIL), 106 | ('a.*b', 'acc\nccb', FAIL), 107 | ('a.{4,5}b', 'acc\nccb', FAIL), 108 | ('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'), 109 | ('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'), 110 | ('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), 111 | ('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), 112 | ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), 113 | 114 | (')', '', SYNTAX_ERROR), # Unmatched right bracket 115 | ('', '', SUCCEED, 'found', ''), # Empty pattern 116 | ('abc', 'abc', SUCCEED, 'found', 'abc'), 117 | ('abc', 'xbc', FAIL), 118 | ('abc', 'axc', FAIL), 119 | ('abc', 'abx', FAIL), 120 | ('abc', 'xabcy', SUCCEED, 'found', 'abc'), 121 | ('abc', 'ababc', SUCCEED, 'found', 'abc'), 122 | ('ab*c', 'abc', SUCCEED, 'found', 'abc'), 123 | ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), 124 | ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), 125 | ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 126 | ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), 127 | ('ab+bc', 'abc', FAIL), 128 | ('ab+bc', 'abq', FAIL), 129 | ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 130 | ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), 131 | ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), 132 | ('ab?bc', 'abbbbc', FAIL), 133 | ('ab?c', 'abc', SUCCEED, 'found', 'abc'), 134 | ('^abc$', 'abc', SUCCEED, 'found', 'abc'), 135 | ('^abc$', 'abcc', FAIL), 136 | ('^abc', 'abcc', SUCCEED, 'found', 'abc'), 137 | ('^abc$', 'aabc', FAIL), 138 | ('abc$', 'aabc', SUCCEED, 'found', 'abc'), 139 | ('^', 'abc', SUCCEED, 'found+"-"', '-'), 140 | ('$', 'abc', SUCCEED, 'found+"-"', '-'), 141 | ('a.c', 'abc', SUCCEED, 'found', 
'abc'), 142 | ('a.c', 'axc', SUCCEED, 'found', 'axc'), 143 | ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), 144 | ('a.*c', 'axyzd', FAIL), 145 | ('a[bc]d', 'abc', FAIL), 146 | ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), 147 | ('a[b-d]e', 'abd', FAIL), 148 | ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), 149 | ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), 150 | ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), 151 | ('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'), 152 | # NOTE: not an error under PCRE/PRE: 153 | # ('a[b-]', 'a-', SYNTAX_ERROR), 154 | ('a[]b', '-', SYNTAX_ERROR), 155 | ('a[', '-', SYNTAX_ERROR), 156 | ('a\\', '-', SYNTAX_ERROR), 157 | ('abc)', '-', SYNTAX_ERROR), 158 | ('(abc', '-', SYNTAX_ERROR), 159 | ('a]', 'a]', SUCCEED, 'found', 'a]'), 160 | ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), 161 | ('a[\]]b', 'a]b', SUCCEED, 'found', 'a]b'), 162 | ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), 163 | ('a[^bc]d', 'abd', FAIL), 164 | ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), 165 | ('a[^-b]c', 'a-c', FAIL), 166 | ('a[^]b]c', 'a]c', FAIL), 167 | ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), 168 | ('\\ba\\b', 'a-', SUCCEED, '"-"', '-'), 169 | ('\\ba\\b', '-a', SUCCEED, '"-"', '-'), 170 | ('\\ba\\b', '-a-', SUCCEED, '"-"', '-'), 171 | ('\\by\\b', 'xy', FAIL), 172 | ('\\by\\b', 'yz', FAIL), 173 | ('\\by\\b', 'xyz', FAIL), 174 | ('x\\b', 'xyz', FAIL), 175 | ('x\\B', 'xyz', SUCCEED, '"-"', '-'), 176 | ('\\Bz', 'xyz', SUCCEED, '"-"', '-'), 177 | ('z\\B', 'xyz', FAIL), 178 | ('\\Bx', 'xyz', FAIL), 179 | ('\\Ba\\B', 'a-', FAIL, '"-"', '-'), 180 | ('\\Ba\\B', '-a', FAIL, '"-"', '-'), 181 | ('\\Ba\\B', '-a-', FAIL, '"-"', '-'), 182 | ('\\By\\B', 'xy', FAIL), 183 | ('\\By\\B', 'yz', FAIL), 184 | ('\\By\\b', 'xy', SUCCEED, '"-"', '-'), 185 | ('\\by\\B', 'yz', SUCCEED, '"-"', '-'), 186 | ('\\By\\B', 'xyz', SUCCEED, '"-"', '-'), 187 | ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), 188 | ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), 189 | ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), 190 | ('$b', 
'b', FAIL), 191 | ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), 192 | ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), 193 | ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), 194 | ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), 195 | ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), 196 | ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), 197 | ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), 198 | ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 199 | ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 200 | ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), 201 | (')(', '-', SYNTAX_ERROR), 202 | ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), 203 | ('abc', '', FAIL), 204 | ('a*', '', SUCCEED, 'found', ''), 205 | ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), 206 | ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), 207 | ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), 208 | ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), 209 | ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), 210 | ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), 211 | ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), 212 | ('^(ab|cd)e', 'abcde', FAIL, 'xg1y', 'xy'), 213 | ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), 214 | ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), 215 | ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), 216 | ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), 217 | ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), 218 | ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), 219 | ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), 220 | ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), 221 | ('a[bcd]+dcdcde', 'adcdcde', FAIL), 222 | ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), 223 | ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), 224 | ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), 225 | 
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), 226 | ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 227 | ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), 228 | ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), 229 | ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), 230 | ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 231 | ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), 232 | ('multiple words of text', 'uh-uh', FAIL), 233 | ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), 234 | ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), 235 | ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), 236 | ('[k]', 'ab', FAIL), 237 | ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), 238 | ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 239 | ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 240 | ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), 241 | ('(a+).\\1$', 'aaaaa', SUCCEED, 'found+"-"+g1', 'aaaaa-aa'), 242 | ('^(a+).\\1$', 'aaaa', FAIL), 243 | ('(abc)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), 244 | ('([a-c]+)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), 245 | ('(a)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), 246 | ('(a+)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), 247 | ('(a+)+\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), 248 | ('(a).+\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), 249 | ('(a)ba*\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), 250 | ('(aa|a)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), 251 | ('(a|aa)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), 252 | ('(a+)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), 253 | ('([abc]*)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), 254 | ('(a)(b)c|ab', 'ab', SUCCEED, 'found+"-"+g1+"-"+g2', 'ab-None-None'), 255 | ('(a)+x', 'aaax', SUCCEED, 'found+"-"+g1', 'aaax-a'), 256 | ('([ac])+x', 'aacx', SUCCEED, 'found+"-"+g1', 
'aacx-c'), 257 | ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'), 258 | ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', SUCCEED, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'), 259 | ('([^N]*N)+', 'abNNxyzN', SUCCEED, 'found+"-"+g1', 'abNNxyzN-xyzN'), 260 | ('([^N]*N)+', 'abNNxyz', SUCCEED, 'found+"-"+g1', 'abNN-N'), 261 | ('([abc]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'abcx-abc'), 262 | ('([abc]*)x', 'abc', FAIL), 263 | ('([xyz]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'x-'), 264 | ('(a)+b|aac', 'aac', SUCCEED, 'found+"-"+g1', 'aac-None'), 265 | 266 | # Test symbolic groups 267 | 268 | ('(?P<i d>aaa)a', 'aaaa', SYNTAX_ERROR), 269 | ('(?P<id>aaa)a', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aaa'), 270 | ('(?P<id>aa)(?P=id)', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aa'), 271 | ('(?P<id>aa)(?P=xd)', 'aaaa', SYNTAX_ERROR), 272 | 273 | # Test octal escapes/memory references 274 | 275 | ('\\1', 'a', SYNTAX_ERROR), 276 | ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), 277 | ('\\141', 'a', SUCCEED, 'found', 'a'), 278 | ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), 279 | 280 | # All tests from Perl 281 | 282 | ('abc', 'abc', SUCCEED, 'found', 'abc'), 283 | ('abc', 'xbc', FAIL), 284 | ('abc', 'axc', FAIL), 285 | ('abc', 'abx', FAIL), 286 | ('abc', 'xabcy', SUCCEED, 'found', 'abc'), 287 | ('abc', 'ababc', SUCCEED, 'found', 'abc'), 288 | ('ab*c', 'abc', SUCCEED, 'found', 'abc'), 289 | ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), 290 | ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), 291 | ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 292 | ('ab{0,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 293 | ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), 294 | ('ab+bc', 'abc', FAIL), 295 | ('ab+bc', 'abq', FAIL), 296 | ('ab{1,}bc', 'abq', FAIL), 297 | ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 298 | 
('ab{1,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 299 | ('ab{1,3}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 300 | ('ab{3,4}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 301 | ('ab{4,5}bc', 'abbbbc', FAIL), 302 | ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), 303 | ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), 304 | ('ab{0,1}bc', 'abc', SUCCEED, 'found', 'abc'), 305 | ('ab?bc', 'abbbbc', FAIL), 306 | ('ab?c', 'abc', SUCCEED, 'found', 'abc'), 307 | ('ab{0,1}c', 'abc', SUCCEED, 'found', 'abc'), 308 | ('^abc$', 'abc', SUCCEED, 'found', 'abc'), 309 | ('^abc$', 'abcc', FAIL), 310 | ('^abc', 'abcc', SUCCEED, 'found', 'abc'), 311 | ('^abc$', 'aabc', FAIL), 312 | ('abc$', 'aabc', SUCCEED, 'found', 'abc'), 313 | ('^', 'abc', SUCCEED, 'found', ''), 314 | ('$', 'abc', SUCCEED, 'found', ''), 315 | ('a.c', 'abc', SUCCEED, 'found', 'abc'), 316 | ('a.c', 'axc', SUCCEED, 'found', 'axc'), 317 | ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), 318 | ('a.*c', 'axyzd', FAIL), 319 | ('a[bc]d', 'abc', FAIL), 320 | ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), 321 | ('a[b-d]e', 'abd', FAIL), 322 | ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), 323 | ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), 324 | ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), 325 | ('a[b-]', 'a-', SUCCEED, 'found', 'a-'), 326 | ('a[b-a]', '-', SYNTAX_ERROR), 327 | ('a[]b', '-', SYNTAX_ERROR), 328 | ('a[', '-', SYNTAX_ERROR), 329 | ('a]', 'a]', SUCCEED, 'found', 'a]'), 330 | ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), 331 | ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), 332 | ('a[^bc]d', 'abd', FAIL), 333 | ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), 334 | ('a[^-b]c', 'a-c', FAIL), 335 | ('a[^]b]c', 'a]c', FAIL), 336 | ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), 337 | ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), 338 | ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), 339 | ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), 340 | ('*a', '-', SYNTAX_ERROR), 341 | ('(*)b', '-', SYNTAX_ERROR), 342 | ('$b', 'b', FAIL), 343 | ('a\\', '-', SYNTAX_ERROR), 344 
| ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), 345 | ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), 346 | ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), 347 | ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), 348 | ('abc)', '-', SYNTAX_ERROR), 349 | ('(abc', '-', SYNTAX_ERROR), 350 | ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), 351 | ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), 352 | ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), 353 | ('a{1,}b{1,}c', 'aabbabc', SUCCEED, 'found', 'abc'), 354 | ('a**', '-', SYNTAX_ERROR), 355 | ('a.+?c', 'abcabc', SUCCEED, 'found', 'abc'), 356 | ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 357 | ('(a+|b){0,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 358 | ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 359 | ('(a+|b){1,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 360 | ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), 361 | ('(a+|b){0,1}', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), 362 | (')(', '-', SYNTAX_ERROR), 363 | ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), 364 | ('abc', '', FAIL), 365 | ('a*', '', SUCCEED, 'found', ''), 366 | ('([abc])*d', 'abbbcd', SUCCEED, 'found+"-"+g1', 'abbbcd-c'), 367 | ('([abc])*bcd', 'abcd', SUCCEED, 'found+"-"+g1', 'abcd-a'), 368 | ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), 369 | ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), 370 | ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), 371 | ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), 372 | ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), 373 | ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), 374 | ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), 375 | ('^(ab|cd)e', 'abcde', FAIL), 376 | ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), 377 | ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), 378 | ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), 379 | ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), 380 | ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 
'abcd-bc-d'), 381 | ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), 382 | ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), 383 | ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), 384 | ('a[bcd]+dcdcde', 'adcdcde', FAIL), 385 | ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), 386 | ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), 387 | ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), 388 | ('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), 389 | ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 390 | ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), 391 | ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), 392 | ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), 393 | ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 394 | ('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'), 395 | ('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'), 396 | # Python does not have the same rules for \\41 so this is a syntax error 397 | # ('((((((((((a))))))))))\\41', 'aa', FAIL), 398 | # ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'), 399 | ('((((((((((a))))))))))\\41', '', SYNTAX_ERROR), 400 | ('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR), 401 | ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), 402 | ('multiple words of text', 'uh-uh', FAIL), 403 | ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), 404 | ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), 405 | ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), 406 | ('[k]', 'ab', FAIL), 407 | ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), 408 | ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 409 | ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 410 | ('(?i)abc', 'ABC', SUCCEED, 'found', 'ABC'), 411 | ('(?i)abc', 'XBC', FAIL), 412 | ('(?i)abc', 
'AXC', FAIL), 413 | ('(?i)abc', 'ABX', FAIL), 414 | ('(?i)abc', 'XABCY', SUCCEED, 'found', 'ABC'), 415 | ('(?i)abc', 'ABABC', SUCCEED, 'found', 'ABC'), 416 | ('(?i)ab*c', 'ABC', SUCCEED, 'found', 'ABC'), 417 | ('(?i)ab*bc', 'ABC', SUCCEED, 'found', 'ABC'), 418 | ('(?i)ab*bc', 'ABBC', SUCCEED, 'found', 'ABBC'), 419 | ('(?i)ab*?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 420 | ('(?i)ab{0,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 421 | ('(?i)ab+?bc', 'ABBC', SUCCEED, 'found', 'ABBC'), 422 | ('(?i)ab+bc', 'ABC', FAIL), 423 | ('(?i)ab+bc', 'ABQ', FAIL), 424 | ('(?i)ab{1,}bc', 'ABQ', FAIL), 425 | ('(?i)ab+bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 426 | ('(?i)ab{1,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 427 | ('(?i)ab{1,3}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 428 | ('(?i)ab{3,4}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 429 | ('(?i)ab{4,5}?bc', 'ABBBBC', FAIL), 430 | ('(?i)ab??bc', 'ABBC', SUCCEED, 'found', 'ABBC'), 431 | ('(?i)ab??bc', 'ABC', SUCCEED, 'found', 'ABC'), 432 | ('(?i)ab{0,1}?bc', 'ABC', SUCCEED, 'found', 'ABC'), 433 | ('(?i)ab??bc', 'ABBBBC', FAIL), 434 | ('(?i)ab??c', 'ABC', SUCCEED, 'found', 'ABC'), 435 | ('(?i)ab{0,1}?c', 'ABC', SUCCEED, 'found', 'ABC'), 436 | ('(?i)^abc$', 'ABC', SUCCEED, 'found', 'ABC'), 437 | ('(?i)^abc$', 'ABCC', FAIL), 438 | ('(?i)^abc', 'ABCC', SUCCEED, 'found', 'ABC'), 439 | ('(?i)^abc$', 'AABC', FAIL), 440 | ('(?i)abc$', 'AABC', SUCCEED, 'found', 'ABC'), 441 | ('(?i)^', 'ABC', SUCCEED, 'found', ''), 442 | ('(?i)$', 'ABC', SUCCEED, 'found', ''), 443 | ('(?i)a.c', 'ABC', SUCCEED, 'found', 'ABC'), 444 | ('(?i)a.c', 'AXC', SUCCEED, 'found', 'AXC'), 445 | ('(?i)a.*?c', 'AXYZC', SUCCEED, 'found', 'AXYZC'), 446 | ('(?i)a.*c', 'AXYZD', FAIL), 447 | ('(?i)a[bc]d', 'ABC', FAIL), 448 | ('(?i)a[bc]d', 'ABD', SUCCEED, 'found', 'ABD'), 449 | ('(?i)a[b-d]e', 'ABD', FAIL), 450 | ('(?i)a[b-d]e', 'ACE', SUCCEED, 'found', 'ACE'), 451 | ('(?i)a[b-d]', 'AAC', SUCCEED, 'found', 'AC'), 452 | ('(?i)a[-b]', 'A-', SUCCEED, 'found', 
'A-'), 453 | ('(?i)a[b-]', 'A-', SUCCEED, 'found', 'A-'), 454 | ('(?i)a[b-a]', '-', SYNTAX_ERROR), 455 | ('(?i)a[]b', '-', SYNTAX_ERROR), 456 | ('(?i)a[', '-', SYNTAX_ERROR), 457 | ('(?i)a]', 'A]', SUCCEED, 'found', 'A]'), 458 | ('(?i)a[]]b', 'A]B', SUCCEED, 'found', 'A]B'), 459 | ('(?i)a[^bc]d', 'AED', SUCCEED, 'found', 'AED'), 460 | ('(?i)a[^bc]d', 'ABD', FAIL), 461 | ('(?i)a[^-b]c', 'ADC', SUCCEED, 'found', 'ADC'), 462 | ('(?i)a[^-b]c', 'A-C', FAIL), 463 | ('(?i)a[^]b]c', 'A]C', FAIL), 464 | ('(?i)a[^]b]c', 'ADC', SUCCEED, 'found', 'ADC'), 465 | ('(?i)ab|cd', 'ABC', SUCCEED, 'found', 'AB'), 466 | ('(?i)ab|cd', 'ABCD', SUCCEED, 'found', 'AB'), 467 | ('(?i)()ef', 'DEF', SUCCEED, 'found+"-"+g1', 'EF-'), 468 | ('(?i)*a', '-', SYNTAX_ERROR), 469 | ('(?i)(*)b', '-', SYNTAX_ERROR), 470 | ('(?i)$b', 'B', FAIL), 471 | ('(?i)a\\', '-', SYNTAX_ERROR), 472 | ('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'), 473 | ('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'), 474 | ('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'), 475 | ('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'), 476 | ('(?i)abc)', '-', SYNTAX_ERROR), 477 | ('(?i)(abc', '-', SYNTAX_ERROR), 478 | ('(?i)((a))', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'A-A-A'), 479 | ('(?i)(a)b(c)', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABC-A-C'), 480 | ('(?i)a+b+c', 'AABBABC', SUCCEED, 'found', 'ABC'), 481 | ('(?i)a{1,}b{1,}c', 'AABBABC', SUCCEED, 'found', 'ABC'), 482 | ('(?i)a**', '-', SYNTAX_ERROR), 483 | ('(?i)a.+?c', 'ABCABC', SUCCEED, 'found', 'ABC'), 484 | ('(?i)a.*?c', 'ABCABC', SUCCEED, 'found', 'ABC'), 485 | ('(?i)a.{0,5}?c', 'ABCABC', SUCCEED, 'found', 'ABC'), 486 | ('(?i)(a+|b)*', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 487 | ('(?i)(a+|b){0,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 488 | ('(?i)(a+|b)+', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 489 | ('(?i)(a+|b){1,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 490 | ('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'), 491 | ('(?i)(a+|b){0,1}', 'AB', SUCCEED, 
'found+"-"+g1', 'A-A'), 492 | ('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'), 493 | ('(?i))(', '-', SYNTAX_ERROR), 494 | ('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'), 495 | ('(?i)abc', '', FAIL), 496 | ('(?i)a*', '', SUCCEED, 'found', ''), 497 | ('(?i)([abc])*d', 'ABBBCD', SUCCEED, 'found+"-"+g1', 'ABBBCD-C'), 498 | ('(?i)([abc])*bcd', 'ABCD', SUCCEED, 'found+"-"+g1', 'ABCD-A'), 499 | ('(?i)a|b|c|d|e', 'E', SUCCEED, 'found', 'E'), 500 | ('(?i)(a|b|c|d|e)f', 'EF', SUCCEED, 'found+"-"+g1', 'EF-E'), 501 | ('(?i)abcd*efg', 'ABCDEFG', SUCCEED, 'found', 'ABCDEFG'), 502 | ('(?i)ab*', 'XABYABBBZ', SUCCEED, 'found', 'AB'), 503 | ('(?i)ab*', 'XAYABBBZ', SUCCEED, 'found', 'A'), 504 | ('(?i)(ab|cd)e', 'ABCDE', SUCCEED, 'found+"-"+g1', 'CDE-CD'), 505 | ('(?i)[abhgefdc]ij', 'HIJ', SUCCEED, 'found', 'HIJ'), 506 | ('(?i)^(ab|cd)e', 'ABCDE', FAIL), 507 | ('(?i)(abc|)ef', 'ABCDEF', SUCCEED, 'found+"-"+g1', 'EF-'), 508 | ('(?i)(a|b)c*d', 'ABCD', SUCCEED, 'found+"-"+g1', 'BCD-B'), 509 | ('(?i)(ab|ab*)bc', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-A'), 510 | ('(?i)a([bc]*)c*', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-BC'), 511 | ('(?i)a([bc]*)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), 512 | ('(?i)a([bc]+)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), 513 | ('(?i)a([bc]*)(c+d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-B-CD'), 514 | ('(?i)a[bcd]*dcdcde', 'ADCDCDE', SUCCEED, 'found', 'ADCDCDE'), 515 | ('(?i)a[bcd]+dcdcde', 'ADCDCDE', FAIL), 516 | ('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'), 517 | ('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'), 518 | ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'), 519 | ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'), 520 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), 521 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'), 522 | 
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL), 523 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL), 524 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), 525 | ('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'), 526 | ('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'), 527 | #('(?i)((((((((((a))))))))))\\41', 'AA', FAIL), 528 | #('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'), 529 | ('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'), 530 | ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'), 531 | ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'), 532 | ('(?i)multiple words of text', 'UH-UH', FAIL), 533 | ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'), 534 | ('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'), 535 | ('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'), 536 | ('(?i)[k]', 'AB', FAIL), 537 | # ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'), 538 | # ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'), 539 | ('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'), 540 | ('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), 541 | ('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), 542 | ('a(?!b).', 'abad', SUCCEED, 'found', 'ad'), 543 | ('a(?=d).', 'abad', SUCCEED, 'found', 'ad'), 544 | ('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'), 545 | ('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'), 546 | ('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'), 547 | ('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'), 548 | ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'), 549 | ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), 550 | 551 | # lookbehind: split by : but not if it is escaped by -. 
552 | ('(?<!-):(.*?)(?<!-):', 'a:bc-:de:f', SUCCEED, 'g1', 'bc-:de' ), 553 | # escaping with \ as we know it 554 | ('(?<!\\\):(.*?)(?<!\\\):', 'a:bc\\:de:f', SUCCEED, 'g1', 'bc\\:de' ), 555 | # terminating with ' and escaping with ? as in edifact 556 | ("(?<!\\?)'(.*?)(?<!\\?)'", "a'bc?'de'f", SUCCEED, 'g1', "bc?'de" ), 557 | 558 | # Comments using the (?#...) syntax 559 | 560 | ('w(?# comment', 'w', SYNTAX_ERROR), 561 | ('w(?# comment 1)xy(?# comment 2)z', 'wxyz', SUCCEED, 'found', 'wxyz'), 562 | 563 | # Check odd placement of embedded pattern modifiers 564 | 565 | # not an error under PCRE/PRE: 566 | ('w(?i)', 'W', SUCCEED, 'found', 'W'), 567 | # ('w(?i)', 'W', SYNTAX_ERROR), 568 | 569 | # Comments using the x embedded pattern modifier 570 | 571 | ("""(?x)w# comment 1 572 | x y 573 | # comment 2 574 | z""", 'wxyz', SUCCEED, 'found', 'wxyz'), 575 | 576 | # using the m embedded pattern modifier 577 | 578 | ('^abc', """jkl 579 | abc 580 | xyz""", FAIL), 581 | ('(?m)^abc', """jkl 582 | abc 583 | xyz""", SUCCEED, 'found', 'abc'), 584 | 585 | ('(?m)abc$', """jkl 586 | xyzabc 587 | 123""", SUCCEED, 'found', 'abc'), 588 | 589 | # using the s embedded pattern modifier 590 | 591 | ('a.b', 'a\nb', FAIL), 592 | ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), 593 | 594 | # test \w, etc. 
both inside and outside character classes 595 | 596 | ('\\w+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'), 597 | ('[\\w]+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'), 598 | ('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'), 599 | ('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'), 600 | ('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'), 601 | # not an error under PCRE/PRE: 602 | # ('[\\d-x]', '-', SYNTAX_ERROR), 603 | (r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), 604 | (r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), 605 | 606 | (r'\xff', '\377', SUCCEED, 'found', chr(255)), 607 | # new \x semantics 608 | (r'\x00ff', '\377', FAIL), 609 | # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), 610 | (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), 611 | ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), 612 | (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), 613 | (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'), 614 | 615 | # 616 | # post-1.5.2 additions 617 | 618 | # xmllib problem 619 | (r'(([a-z]+):)?([a-z]+)$', 'smil', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-smil'), 620 | # bug 110866: reference to undefined group 621 | (r'((.)\1+)', '', SYNTAX_ERROR), 622 | # bug 111869: search (PRE/PCRE fails on this one, SRE doesn't) 623 | (r'.*d', 'abc\nabd', SUCCEED, 'found', 'abd'), 624 | # bug 112468: various expected syntax errors 625 | (r'(', '', SYNTAX_ERROR), 626 | (r'[\41]', '!', SUCCEED, 'found', '!'), 627 | # bug 114033: nothing to repeat 628 | (r'(x?)?', 'x', SUCCEED, 'found', 'x'), 629 | # bug 115040: rescan if flags are modified inside pattern 630 | (r' (?x)foo ', 'foo', SUCCEED, 'found', 'foo'), 631 | # bug 115618: negative lookahead 632 | (r'(?<!abc)(d.f)', 'abcdefdof', SUCCEED, 'found', 'dof'), 633 | # bug 116251: character class bug 634 | (r'[\w-]+', 
'laser_beam', SUCCEED, 'found', 'laser_beam'), 635 | # bug 123769+127259: non-greedy backtracking bug 636 | (r'.*?\S *:', 'xx:', SUCCEED, 'found', 'xx:'), 637 | (r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'), 638 | (r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'), 639 | # bug 127259: \Z shouldn't depend on multiline mode 640 | (r'(?ms).*?x\s*\Z(.*)','xx\nx\n', SUCCEED, 'g1', ''), 641 | # bug 128899: uppercase literals under the ignorecase flag 642 | (r'(?i)M+', 'MMM', SUCCEED, 'found', 'MMM'), 643 | (r'(?i)m+', 'MMM', SUCCEED, 'found', 'MMM'), 644 | (r'(?i)[M]+', 'MMM', SUCCEED, 'found', 'MMM'), 645 | (r'(?i)[m]+', 'MMM', SUCCEED, 'found', 'MMM'), 646 | # bug 130748: ^* should be an error (nothing to repeat) 647 | (r'^*', '', SYNTAX_ERROR), 648 | # bug 133283: minimizing repeat problem 649 | (r'"(?:\\"|[^"])*?"', r'"\""', SUCCEED, 'found', r'"\""'), 650 | # bug 477728: minimizing repeat problem 651 | (r'^.*?$', 'one\ntwo\nthree\n', FAIL), 652 | # bug 483789: minimizing repeat problem 653 | (r'a[^>]*?b', 'a>b', FAIL), 654 | # bug 490573: minimizing repeat problem 655 | (r'^a*?$', 'foo', FAIL), 656 | # bug 470582: nested groups problem 657 | (r'^((a)c)?(ab)$', 'ab', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-ab'), 658 | # another minimizing repeat problem (capturing groups in assertions) 659 | ('^([ab]*?)(?=(b)?)c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), 660 | ('^([ab]*?)(?!(b))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), 661 | ('^([ab]*?)(?<!(a))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), 662 | ] 663 | 664 | try: 665 | u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'") 666 | except SyntaxError: 667 | pass 668 | else: 669 | tests.extend([ 670 | # bug 410271: \b broken under locales 671 | (r'\b.\b', 'a', SUCCEED, 'found', 'a'), 672 | (r'(?u)\b.\b', u, SUCCEED, 'found', u), 673 | (r'(?u)\w', u, SUCCEED, 'found', u), 674 | ]) 675 | -------------------------------------------------------------------------------- /tests/test_re.py: 
-------------------------------------------------------------------------------- 1 | from test.test_support import verbose, run_unittest, import_module 2 | import re2 as re 3 | from re import Scanner 4 | import sys, os, traceback 5 | from weakref import proxy 6 | 7 | # Misc tests from Tim Peters' re.doc 8 | 9 | # WARNING: Don't change details in these tests if you don't know 10 | # what you're doing. Some of these tests were carefuly modeled to 11 | # cover most of the code. 12 | 13 | import unittest 14 | 15 | class ReTests(unittest.TestCase): 16 | 17 | def test_weakref(self): 18 | s = 'QabbbcR' 19 | x = re.compile('ab+c') 20 | y = proxy(x) 21 | self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 22 | 23 | def test_search_star_plus(self): 24 | self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 25 | self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 26 | self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 27 | self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 28 | self.assertEqual(re.search('x', 'aaa'), None) 29 | self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 30 | self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 31 | self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 32 | self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 33 | self.assertEqual(re.match('a+', 'xxx'), None) 34 | 35 | def bump_num(self, matchobj): 36 | int_value = int(matchobj.group(0)) 37 | return str(int_value + 1) 38 | 39 | def test_basic_re_sub(self): 40 | self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 41 | self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 42 | '9.3 -3 24x100y') 43 | self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 44 | '9.3 -3 23x99y') 45 | 46 | self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 47 | self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 48 | 49 | s = r"\1\1" 50 | self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 51 | self.assertEqual(re.sub('(.)', re.escape(s), 
'x'), s) 52 | self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 53 | 54 | self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx') 55 | self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx') 56 | self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') 57 | self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') 58 | 59 | self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), 60 | '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') 61 | self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') 62 | self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), 63 | (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) 64 | 65 | self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') 66 | 67 | def test_bug_449964(self): 68 | # fails for group followed by other escape 69 | self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'), 70 | 'xx\bxx\b') 71 | 72 | def test_bug_449000(self): 73 | # Test for sub() on escaped characters 74 | self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 75 | 'abc\ndef\n') 76 | self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 77 | 'abc\ndef\n') 78 | self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 79 | 'abc\ndef\n') 80 | self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 81 | 'abc\ndef\n') 82 | 83 | def test_bug_1140(self): 84 | # re.sub(x, y, u'') should return u'', not '', and 85 | # re.sub(x, y, '') should return '', not u''. 86 | # Also: 87 | # re.sub(x, y, unicode(x)) should return unicode(y), and 88 | # re.sub(x, y, str(x)) should return 89 | # str(y) if isinstance(y, str) else unicode(y). 
90 | for x in 'x', u'x': 91 | for y in 'y', u'y': 92 | z = re.sub(x, y, u'') 93 | self.assertEqual(z, u'') 94 | self.assertEqual(type(z), unicode) 95 | # 96 | z = re.sub(x, y, '') 97 | self.assertEqual(z, '') 98 | self.assertEqual(type(z), str) 99 | # 100 | z = re.sub(x, y, unicode(x)) 101 | self.assertEqual(z, y) 102 | self.assertEqual(type(z), unicode) 103 | # 104 | z = re.sub(x, y, str(x)) 105 | self.assertEqual(z, y) 106 | self.assertEqual(type(z), type(y)) 107 | 108 | def test_bug_1661(self): 109 | # Verify that flags do not get silently ignored with compiled patterns 110 | pattern = re.compile('.') 111 | self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 112 | self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 113 | self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 114 | self.assertRaises(ValueError, re.compile, pattern, re.I) 115 | 116 | def test_bug_3629(self): 117 | # A regex that triggered a bug in the sre-code validator 118 | re.compile("(?P<quote>)(?(quote))") 119 | 120 | def test_sub_template_numeric_escape(self): 121 | # bug 776311 and friends 122 | self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 123 | self.assertEqual(re.sub('x', r'\000', 'x'), '\000') 124 | self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 125 | self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 126 | self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 127 | self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 128 | self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 129 | 130 | self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 131 | self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 132 | 133 | self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 134 | self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 135 | self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 136 | self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 137 | self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 138 | 139 | self.assertEqual(re.sub('x', r'\400', 
'x'), '\0') 140 | self.assertEqual(re.sub('x', r'\777', 'x'), '\377') 141 | 142 | self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') 143 | self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') 144 | self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') 145 | self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') 146 | self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') 147 | self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') 148 | self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') 149 | self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') 150 | self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' 151 | self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') 152 | self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' 153 | self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' 154 | 155 | # in python2.3 (etc), these loop endlessly in sre_parser.py 156 | self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 157 | self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 158 | 'xz8') 159 | self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 160 | 'xza') 161 | 162 | def test_qualified_re_sub(self): 163 | self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 164 | self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 165 | 166 | def test_bug_114660(self): 167 | self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 168 | 'hello there') 169 | 170 | def test_bug_462270(self): 171 | # Test for empty sub() behaviour, see SF bug #462270 172 | self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') 173 | self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') 174 | 175 | def test_symbolic_refs(self): 176 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') 177 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx') 178 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx') 179 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx') 180 
| self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx') 181 | self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx') 182 | self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx') 183 | self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx') 184 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx') 185 | 186 | def test_re_subn(self): 187 | self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 188 | self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 189 | self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 190 | self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 191 | self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 192 | 193 | def test_re_split(self): 194 | self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) 195 | self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) 196 | self.assertEqual(re.split("(:*)", ":a:b::c"), 197 | ['', ':', 'a', ':', 'b', '::', 'c']) 198 | self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) 199 | self.assertEqual(re.split("(:)*", ":a:b::c"), 200 | ['', ':', 'a', ':', 'b', ':', 'c']) 201 | self.assertEqual(re.split("([b:]+)", ":a:b::c"), 202 | ['', ':', 'a', ':b::', 'c']) 203 | self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 204 | ['', None, ':', 'a', None, ':', '', 'b', None, '', 205 | None, '::', 'c']) 206 | self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 207 | ['', 'a', '', '', 'c']) 208 | 209 | def test_qualified_re_split(self): 210 | self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 211 | self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) 212 | self.assertEqual(re.split("(:)", ":a:b::c", 2), 213 | ['', ':', 'a', ':', 'b::c']) 214 | self.assertEqual(re.split("(:*)", ":a:b::c", 2), 215 | ['', ':', 'a', ':', 'b::c']) 216 | 217 | def test_re_findall(self): 218 | self.assertEqual(re.findall(":+", "abc"), []) 219 | self.assertEqual(re.findall(":+", 
"a:b::c:::d"), [":", "::", ":::"]) 220 | self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) 221 | self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), 222 | (":", ":"), 223 | (":", "::")]) 224 | 225 | def test_bug_117612(self): 226 | self.assertEqual(re.findall(r"(a|(b))", "aba"), 227 | [("a", ""),("b", "b"),("a", "")]) 228 | 229 | def test_re_match(self): 230 | self.assertEqual(re.match('a', 'a').groups(), ()) 231 | self.assertEqual(re.match('(a)', 'a').groups(), ('a',)) 232 | self.assertEqual(re.match(r'(a)', 'a').group(0), 'a') 233 | self.assertEqual(re.match(r'(a)', 'a').group(1), 'a') 234 | self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a')) 235 | 236 | pat = re.compile('((a)|(b))(c)?') 237 | self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 238 | self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 239 | self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 240 | self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 241 | self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 242 | 243 | # A single group 244 | m = re.match('(a)', 'a') 245 | self.assertEqual(m.group(0), 'a') 246 | self.assertEqual(m.group(0), 'a') 247 | self.assertEqual(m.group(1), 'a') 248 | self.assertEqual(m.group(1, 1), ('a', 'a')) 249 | 250 | pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 251 | self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 252 | self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 253 | (None, 'b', None)) 254 | self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 255 | 256 | def test_re_groupref_exists(self): 257 | self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 258 | ('(', 'a')) 259 | self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), 260 | (None, 'a')) 261 | self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None) 262 | 
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None) 263 | self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 264 | ('a', 'b')) 265 | self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 266 | (None, 'd')) 267 | self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 268 | (None, 'd')) 269 | self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(), 270 | ('a', '')) 271 | 272 | # Tests for bug #1177831: exercise groups other than the first group 273 | p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 274 | self.assertEqual(p.match('abc').groups(), 275 | ('a', 'b', 'c')) 276 | self.assertEqual(p.match('ad').groups(), 277 | ('a', None, 'd')) 278 | self.assertEqual(p.match('abd'), None) 279 | self.assertEqual(p.match('ac'), None) 280 | 281 | 282 | def test_re_groupref(self): 283 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 284 | ('|', 'a')) 285 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 286 | (None, 'a')) 287 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None) 288 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None) 289 | self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 290 | ('a', 'a')) 291 | self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 292 | (None, None)) 293 | 294 | def test_groupdict(self): 295 | self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 296 | 'first second').groupdict(), 297 | {'first':'first', 'second':'second'}) 298 | 299 | def test_expand(self): 300 | self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 301 | "first second") 302 | .expand(r"\2 \1 \g<second> \g<first>"), 303 | "second first second first") 304 | 305 | def test_repeat_minmax(self): 306 | self.assertEqual(re.match("^(\w){1}$", "abc"), None) 307 | self.assertEqual(re.match("^(\w){1}?$", "abc"), None) 308 | self.assertEqual(re.match("^(\w){1,2}$", "abc"), None) 309 | self.assertEqual(re.match("^(\w){1,2}?$", 
"abc"), None) 310 | 311 | self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c") 312 | self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c") 313 | self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c") 314 | self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 315 | self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c") 316 | self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c") 317 | self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c") 318 | self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 319 | 320 | self.assertEqual(re.match("^x{1}$", "xxx"), None) 321 | self.assertEqual(re.match("^x{1}?$", "xxx"), None) 322 | self.assertEqual(re.match("^x{1,2}$", "xxx"), None) 323 | self.assertEqual(re.match("^x{1,2}?$", "xxx"), None) 324 | 325 | self.assertNotEqual(re.match("^x{3}$", "xxx"), None) 326 | self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None) 327 | self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None) 328 | self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) 329 | self.assertNotEqual(re.match("^x{3}?$", "xxx"), None) 330 | self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None) 331 | self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None) 332 | self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) 333 | 334 | self.assertEqual(re.match("^x{}$", "xxx"), None) 335 | self.assertNotEqual(re.match("^x{}$", "x{}"), None) 336 | 337 | def test_getattr(self): 338 | self.assertEqual(re.match("(a)", "a").pos, 0) 339 | self.assertEqual(re.match("(a)", "a").endpos, 1) 340 | self.assertEqual(re.match("(a)", "a").string, "a") 341 | self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 342 | self.assertNotEqual(re.match("(a)", "a").re, None) 343 | 344 | def test_special_escapes(self): 345 | self.assertEqual(re.search(r"\b(b.)\b", 346 | "abcd abc bcd bx").group(1), "bx") 347 | self.assertEqual(re.search(r"\B(b.)\B", 348 | "abc bcd bc abxd").group(1), "bx") 349 | 
self.assertEqual(re.search(r"\b(b.)\b", 350 | "abcd abc bcd bx", re.LOCALE).group(1), "bx") 351 | self.assertEqual(re.search(r"\B(b.)\B", 352 | "abc bcd bc abxd", re.LOCALE).group(1), "bx") 353 | self.assertEqual(re.search(r"\b(b.)\b", 354 | "abcd abc bcd bx", re.UNICODE).group(1), "bx") 355 | self.assertEqual(re.search(r"\B(b.)\B", 356 | "abc bcd bc abxd", re.UNICODE).group(1), "bx") 357 | self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 358 | self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 359 | self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) 360 | self.assertEqual(re.search(r"\b(b.)\b", 361 | u"abcd abc bcd bx").group(1), "bx") 362 | self.assertEqual(re.search(r"\B(b.)\B", 363 | u"abc bcd bc abxd").group(1), "bx") 364 | self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") 365 | self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") 366 | self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None) 367 | self.assertEqual(re.search(r"\d\D\w\W\s\S", 368 | "1aa! a").group(0), "1aa! a") 369 | self.assertEqual(re.search(r"\d\D\w\W\s\S", 370 | "1aa! a", re.LOCALE).group(0), "1aa! a") 371 | self.assertEqual(re.search(r"\d\D\w\W\s\S", 372 | "1aa! a", re.UNICODE).group(0), "1aa! 
a") 373 | 374 | def test_bigcharset(self): 375 | self.assertEqual(re.match(u"([\u2222\u2223])", 376 | u"\u2222").group(1), u"\u2222") 377 | self.assertEqual(re.match(u"([\u2222\u2223])", 378 | u"\u2222", re.UNICODE).group(1), u"\u2222") 379 | 380 | def test_anyall(self): 381 | self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 382 | "a\nb") 383 | self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 384 | "a\n\nb") 385 | 386 | def test_non_consuming(self): 387 | self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") 388 | self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") 389 | self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") 390 | self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a") 391 | self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 392 | self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 393 | self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 394 | 395 | self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 396 | self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 397 | self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 398 | self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 399 | 400 | def test_ignore_case(self): 401 | self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 402 | self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 403 | self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 404 | self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 405 | self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 406 | self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 407 | self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 408 | self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 409 | self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", 
re.I).group(1), "a a") 410 | self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 411 | 412 | def test_category(self): 413 | self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 414 | 415 | def test_getlower(self): 416 | import _sre 417 | self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) 418 | self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) 419 | self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) 420 | 421 | self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 422 | self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 423 | 424 | def test_not_literal(self): 425 | self.assertEqual(re.search("\s([^a])", " b").group(1), "b") 426 | self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb") 427 | 428 | def test_search_coverage(self): 429 | self.assertEqual(re.search("\s(b)", " b").group(1), "b") 430 | self.assertEqual(re.search("a\s", "a ").group(0), "a ") 431 | 432 | def test_re_escape(self): 433 | p="" 434 | # This had to change from the original test of range(0,256) 435 | # because we can't support non-ascii non-utf8 strings 436 | for i in range(0, 128): 437 | p = p + chr(i) 438 | self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None, 439 | True) 440 | self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1)) 441 | 442 | pat=re.compile(re.escape(p)) 443 | self.assertEqual(pat.match(p) is not None, True) 444 | self.assertEqual(pat.match(p).span(), (0,128)) 445 | 446 | def test_pickling(self): 447 | import pickle 448 | self.pickle_test(pickle) 449 | import cPickle 450 | self.pickle_test(cPickle) 451 | # old pickles expect the _compile() reconstructor in sre module 452 | import_module("sre", deprecated=True) 453 | from sre import _compile 454 | 455 | def pickle_test(self, pickle): 456 | oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') 457 | s = pickle.dumps(oldpat) 458 | newpat = pickle.loads(s) 459 | self.assertEqual(oldpat, newpat) 460 | 461 | def 
test_constants(self): 462 | self.assertEqual(re.I, re.IGNORECASE) 463 | self.assertEqual(re.L, re.LOCALE) 464 | self.assertEqual(re.M, re.MULTILINE) 465 | self.assertEqual(re.S, re.DOTALL) 466 | self.assertEqual(re.X, re.VERBOSE) 467 | 468 | def test_flags(self): 469 | for flag in [re.I, re.M, re.X, re.S, re.L]: 470 | self.assertNotEqual(re.compile('^pattern$', flag), None) 471 | 472 | def test_sre_character_literals(self): 473 | for i in [0, 8, 16, 32, 64, 127, 128, 255]: 474 | self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None) 475 | self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None) 476 | self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None) 477 | self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None) 478 | self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None) 479 | self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None) 480 | self.assertRaises(re.error, re.match, "\911", "") 481 | 482 | def test_sre_character_class_literals(self): 483 | for i in [0, 8, 16, 32, 64, 127, 128, 255]: 484 | self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None) 485 | self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None) 486 | self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None) 487 | self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None) 488 | self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None) 489 | self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None) 490 | self.assertRaises(re.error, re.match, "[\911]", "") 491 | 492 | def test_bug_113254(self): 493 | self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 494 | self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 495 | self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 496 | 497 | def test_bug_527371(self): 498 | # bug described in patches 527371/672491 499 | self.assertEqual(re.match(r'(a)?a','a').lastindex, None) 500 | self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 501 | 
self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 502 | self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a') 503 | self.assertEqual(re.match("((a))", "a").lastindex, 1) 504 | 505 | def test_bug_545855(self): 506 | # bug 545855 -- This pattern failed to cause a compile error as it 507 | # should, instead provoking a TypeError. 508 | self.assertRaises(re.error, re.compile, 'foo[a-') 509 | 510 | def test_bug_418626(self): 511 | # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 512 | # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 513 | # pattern '*?' on a long string. 514 | self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 515 | self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 516 | 20003) 517 | self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 518 | # non-simple '*?' still used to hit the recursion limit, before the 519 | # non-recursive scheme was implemented. 520 | self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 521 | 522 | def test_bug_612074(self): 523 | pat=u"["+re.escape(u"\u2039")+u"]" 524 | self.assertEqual(re.compile(pat) and 1, 1) 525 | 526 | def test_stack_overflow(self): 527 | # nasty cases that used to overflow the straightforward recursive 528 | # implementation of repeated groups. 
529 | self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 530 | self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 531 | self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 532 | 533 | def test_scanner(self): 534 | def s_ident(scanner, token): return token 535 | def s_operator(scanner, token): return "op%s" % token 536 | def s_float(scanner, token): return float(token) 537 | def s_int(scanner, token): return int(token) 538 | 539 | scanner = Scanner([ 540 | (r"[a-zA-Z_]\w*", s_ident), 541 | (r"\d+\.\d*", s_float), 542 | (r"\d+", s_int), 543 | (r"=|\+|-|\*|/", s_operator), 544 | (r"\s+", None), 545 | ]) 546 | 547 | self.assertNotEqual(scanner.scanner.scanner("").pattern, None) 548 | 549 | self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 550 | (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 551 | 'op+', 'bar'], '')) 552 | 553 | def test_bug_448951(self): 554 | # bug 448951 (similar to 429357, but with single char match) 555 | # (Also test greedy matches.) 
556 | for op in '','?','*': 557 | self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 558 | (None, None)) 559 | self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 560 | ('a:', 'a')) 561 | 562 | def test_bug_725106(self): 563 | # capturing groups in alternatives in repeats 564 | self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 565 | ('b', 'a')) 566 | self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 567 | ('c', 'b')) 568 | self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 569 | ('b', None)) 570 | self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 571 | ('b', None)) 572 | self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 573 | ('b', 'a')) 574 | self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 575 | ('c', 'b')) 576 | self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 577 | ('b', None)) 578 | self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 579 | ('b', None)) 580 | 581 | def test_bug_725149(self): 582 | # mark_stack_base restoring before restoring marks 583 | self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 584 | ('a', None)) 585 | self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 586 | ('a', None, None)) 587 | 588 | def test_bug_764548(self): 589 | # bug 764548, re.compile() barfs on str/unicode subclasses 590 | try: 591 | unicode 592 | except NameError: 593 | return # no problem if we have no unicode 594 | class my_unicode(unicode): pass 595 | pat = re.compile(my_unicode("abc")) 596 | self.assertEqual(pat.match("xyz"), None) 597 | 598 | def test_finditer(self): 599 | iter = re.finditer(r":+", "a:b::c:::d") 600 | self.assertEqual([item.group(0) for item in iter], 601 | [":", "::", ":::"]) 602 | 603 | def test_bug_926075(self): 604 | try: 605 | unicode 606 | except NameError: 607 | return # no problem if we have no unicode 608 | self.assert_(re.compile('bug_926075') is not 609 | re.compile(eval("u'bug_926075'"))) 610 | 611 | def 
test_bug_931848(self): 612 | try: 613 | unicode 614 | except NameError: 615 | pass 616 | pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') 617 | self.assertEqual(re.compile(pattern).split("a.b.c"), 618 | ['a','b','c']) 619 | 620 | def test_bug_581080(self): 621 | iter = re.finditer(r"\s", "a b") 622 | self.assertEqual(iter.next().span(), (1,2)) 623 | self.assertRaises(StopIteration, iter.next) 624 | 625 | scanner = re.compile(r"\s").scanner("a b") 626 | self.assertEqual(scanner.search().span(), (1, 2)) 627 | self.assertEqual(scanner.search(), None) 628 | 629 | def test_bug_817234(self): 630 | iter = re.finditer(r".*", "asdf") 631 | self.assertEqual(iter.next().span(), (0, 4)) 632 | self.assertEqual(iter.next().span(), (4, 4)) 633 | self.assertRaises(StopIteration, iter.next) 634 | 635 | def test_empty_array(self): 636 | # SF buf 1647541 637 | import array 638 | for typecode in 'cbBuhHiIlLfd': 639 | a = array.array(typecode) 640 | self.assertEqual(re.compile("bla").match(a), None) 641 | self.assertEqual(re.compile("").match(a).groups(), ()) 642 | 643 | def test_inline_flags(self): 644 | # Bug #1700 645 | upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow 646 | lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow 647 | 648 | p = re.compile(upper_char, re.I | re.U) 649 | q = p.match(lower_char) 650 | self.assertNotEqual(q, None) 651 | 652 | p = re.compile(lower_char, re.I | re.U) 653 | q = p.match(upper_char) 654 | self.assertNotEqual(q, None) 655 | 656 | p = re.compile('(?i)' + upper_char, re.U) 657 | q = p.match(lower_char) 658 | self.assertNotEqual(q, None) 659 | 660 | p = re.compile('(?i)' + lower_char, re.U) 661 | q = p.match(upper_char) 662 | self.assertNotEqual(q, None) 663 | 664 | p = re.compile('(?iu)' + upper_char) 665 | q = p.match(lower_char) 666 | self.assertNotEqual(q, None) 667 | 668 | p = re.compile('(?iu)' + lower_char) 669 | q = p.match(upper_char) 670 | self.assertNotEqual(q, None) 671 | 672 | def 
test_dollar_matches_twice(self): 673 | "$ matches the end of string, and just before the terminating \n" 674 | pattern = re.compile('$') 675 | self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 676 | self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 677 | self.assertEqual(pattern.sub('#', '\n'), '#\n#') 678 | 679 | pattern = re.compile('$', re.MULTILINE) 680 | self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 681 | self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 682 | self.assertEqual(pattern.sub('#', '\n'), '#\n#') 683 | 684 | def test_dealloc(self): 685 | # issue 3299: check for segfault in debug build 686 | import _sre 687 | # the overflow limit is different on wide and narrow builds and it 688 | # depends on the definition of SRE_CODE (see sre.h). 689 | # 2**128 should be big enough to overflow on both. For smaller values 690 | # a RuntimeError is raised instead of OverflowError. 691 | long_overflow = 2**128 692 | self.assertRaises(TypeError, re.finditer, "a", {}) 693 | self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) 694 | 695 | def run_re_tests(): 696 | from re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR 697 | if verbose: 698 | print 'Running re_tests test suite' 699 | else: 700 | # To save time, only run the first and last 10 tests 701 | #tests = tests[:10] + tests[-10:] 702 | pass 703 | 704 | for t in tests: 705 | sys.stdout.flush() 706 | pattern = s = outcome = repl = expected = None 707 | if len(t) == 5: 708 | pattern, s, outcome, repl, expected = t 709 | elif len(t) == 3: 710 | pattern, s, outcome = t 711 | else: 712 | raise ValueError, ('Test tuples should have 3 or 5 fields', t) 713 | 714 | try: 715 | obj = re.compile(pattern) 716 | except re.error: 717 | if outcome == SYNTAX_ERROR: pass # Expected a syntax error 718 | else: 719 | print '=== Syntax error:', t 720 | except KeyboardInterrupt: raise KeyboardInterrupt 721 | except: 722 | print '*** Unexpected error ***', t 723 | 
if verbose: 724 | traceback.print_exc(file=sys.stdout) 725 | else: 726 | try: 727 | result = obj.search(s) 728 | except re.error, msg: 729 | print '=== Unexpected exception', t, repr(msg) 730 | if outcome == SYNTAX_ERROR: 731 | # This should have been a syntax error; forget it. 732 | pass 733 | elif outcome == FAIL: 734 | if result is None: pass # No match, as expected 735 | else: print '=== Succeeded incorrectly', t 736 | elif outcome == SUCCEED: 737 | if result is not None: 738 | # Matched, as expected, so now we compute the 739 | # result string and compare it to our expected result. 740 | start, end = result.span(0) 741 | vardict={'found': result.group(0), 742 | 'groups': result.group(), 743 | 'flags': result.re.flags} 744 | for i in range(1, 100): 745 | try: 746 | gi = result.group(i) 747 | # Special hack because else the string concat fails: 748 | if gi is None: 749 | gi = "None" 750 | except IndexError: 751 | gi = "Error" 752 | vardict['g%d' % i] = gi 753 | for i in result.re.groupindex.keys(): 754 | try: 755 | gi = result.group(i) 756 | if gi is None: 757 | gi = "None" 758 | except IndexError: 759 | gi = "Error" 760 | vardict[i] = gi 761 | repl = eval(repl, vardict) 762 | if repl != expected: 763 | print '=== grouping error', t, 764 | print repr(repl) + ' should be ' + repr(expected) 765 | else: 766 | print '=== Failed incorrectly', t 767 | 768 | # Try the match on a unicode string, and check that it 769 | # still succeeds. 770 | try: 771 | result = obj.search(unicode(s, "latin-1")) 772 | if result is None: 773 | print '=== Fails on unicode match', t 774 | except NameError: 775 | continue # 1.5.2 776 | except TypeError: 777 | continue # unicode test case 778 | 779 | # Try the match on a unicode pattern, and check that it 780 | # still succeeds. 
781 | obj=re.compile(unicode(pattern, "latin-1")) 782 | result = obj.search(s) 783 | if result is None: 784 | print '=== Fails on unicode pattern match', t 785 | 786 | # Try the match with the search area limited to the extent 787 | # of the match and see if it still succeeds. \B will 788 | # break (because it won't match at the end or start of a 789 | # string), so we'll ignore patterns that feature it. 790 | 791 | if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ 792 | and result is not None: 793 | obj = re.compile(pattern) 794 | result = obj.search(s, result.start(0), result.end(0) + 1) 795 | if result is None: 796 | print '=== Failed on range-limited match', t 797 | 798 | # Try the match with IGNORECASE enabled, and check that it 799 | # still succeeds. 800 | obj = re.compile(pattern, re.IGNORECASE) 801 | result = obj.search(s) 802 | if result is None: 803 | print '=== Fails on case-insensitive match', t 804 | 805 | # Try the match with LOCALE enabled, and check that it 806 | # still succeeds. 807 | obj = re.compile(pattern, re.LOCALE) 808 | result = obj.search(s) 809 | if result is None: 810 | print '=== Fails on locale-sensitive match', t 811 | 812 | # Try the match with UNICODE locale enabled, and check 813 | # that it still succeeds. 814 | obj = re.compile(pattern, re.UNICODE) 815 | result = obj.search(s) 816 | if result is None: 817 | print '=== Fails on unicode-sensitive match', t 818 | 819 | def test_main(): 820 | run_unittest(ReTests) 821 | run_re_tests() 822 | 823 | if __name__ == "__main__": 824 | test_main() 825 | -------------------------------------------------------------------------------- /src/re2.pyx: -------------------------------------------------------------------------------- 1 | # cython: infer_types(False) 2 | # Import re flags to be compatible. 
def set_fallback_notification(level):
    """Select how falling back to the stdlib ``re`` module is reported.

    *level* must be one of FALLBACK_QUIETLY (0), FALLBACK_WARNING (1)
    or FALLBACK_EXCEPTION (2); any other value raises ValueError.
    """
    global current_notification
    level = int(level)
    if not (0 <= level <= 2):
        raise ValueError("This function expects a valid notification level.")
    current_notification = level
72 | return cpython.unicode.PyUnicode_DecodeUTF8(input, length, 'strict') 73 | 74 | cdef inline object unicode_to_bytestring(object pystring, int * encoded): 75 | # This function will convert a utf8 string to a bytestring object. 76 | if cpython.unicode.PyUnicode_Check(pystring): 77 | pystring = cpython.unicode.PyUnicode_EncodeUTF8(cpython.unicode.PyUnicode_AS_UNICODE(pystring), 78 | cpython.unicode.PyUnicode_GET_SIZE(pystring), 79 | "strict") 80 | encoded[0] = 1 81 | else: 82 | encoded[0] = 0 83 | return pystring 84 | 85 | cdef inline int pystring_to_bytestring(object pystring, char ** cstring, Py_ssize_t * length): 86 | # This function will convert a pystring to a bytesstring, placing 87 | # the char * in cstring, and the length in length. 88 | # First it will try treating it as a str object, but failing that 89 | # it will move to utf-8. If utf8 does not work, then it has to be 90 | # a non-supported encoding. 91 | return _re2.PyObject_AsCharBuffer(pystring, <_re2.const_char_ptr*> cstring, length) 92 | 93 | cdef extern from *: 94 | cdef void emit_ifndef_py_unicode_wide "#if !defined(Py_UNICODE_WIDE) //" () 95 | cdef void emit_endif "#endif //" () 96 | 97 | cdef class Match: 98 | cdef _re2.StringPiece * matches 99 | cdef _re2.const_stringintmap * named_groups 100 | 101 | cdef bint encoded 102 | cdef int _lastindex 103 | cdef int nmatches 104 | cdef int _pos 105 | cdef int _endpos 106 | cdef object match_string 107 | cdef object _pattern_object 108 | cdef tuple _groups 109 | cdef tuple _spans 110 | cdef dict _named_groups 111 | cdef dict _named_indexes 112 | 113 | def __init__(self, object pattern_object, int num_groups): 114 | self._lastindex = -1 115 | self._groups = None 116 | self._pos = 0 117 | self._endpos = -1 118 | self.matches = _re2.new_StringPiece_array(num_groups + 1) 119 | self.nmatches = num_groups 120 | self._pattern_object = pattern_object 121 | 122 | def __dealloc__(self): 123 | _re2.delete_StringPiece_array(self.matches) 124 | 125 | property re: 
def groups(self, default=None):
    """Return a tuple containing all the subgroups of the match (group 1
    onward).

    Groups that did not participate in the match are replaced by
    *default*.  To match the semantics of the stdlib ``re`` module,
    only non-participating groups (None) are substituted; a group that
    matched the empty string is returned as ''.
    """
    self.init_groups()
    if default is not None:
        # Substitute only for None: the previous `g or default` form
        # wrongly replaced empty-string matches with the default too.
        return tuple([default if g is None else g for g in self._groups[1:]])
    return self._groups[1:]
self.groupdict()[groupnum] 200 | 201 | idx = groupnum 202 | 203 | if idx > self.nmatches - 1: 204 | raise IndexError("no such group") 205 | return self._groups[idx] 206 | 207 | cdef object _convert_positions(self, positions): 208 | cdef char * s = self.match_string 209 | cdef int cpos = 0 210 | cdef int upos = 0 211 | cdef int size = len(self.match_string) 212 | cdef int c 213 | 214 | new_positions = [] 215 | i = 0 216 | num_positions = len(positions) 217 | if positions[i] == -1: 218 | new_positions.append(-1) 219 | inc(i) 220 | if i == num_positions: 221 | return new_positions 222 | if positions[i] == 0: 223 | new_positions.append(0) 224 | inc(i) 225 | if i == num_positions: 226 | return new_positions 227 | 228 | while cpos < size: 229 | c = <unsigned char>s[cpos] 230 | if c < 0x80: 231 | inc(cpos) 232 | inc(upos) 233 | elif c < 0xe0: 234 | cpos += 2 235 | inc(upos) 236 | elif c < 0xf0: 237 | cpos += 3 238 | inc(upos) 239 | else: 240 | cpos += 4 241 | inc(upos) 242 | # wide unicode chars get 2 unichars when python is compiled with --enable-unicode=ucs2 243 | # TODO: verify this 244 | emit_ifndef_py_unicode_wide() 245 | inc(upos) 246 | emit_endif() 247 | 248 | if positions[i] == cpos: 249 | new_positions.append(upos) 250 | inc(i) 251 | if i == num_positions: 252 | return new_positions 253 | 254 | def _convert_spans(self, spans): 255 | positions = [x for x,y in spans] + [y for x,y in spans] 256 | positions = sorted(set(positions)) 257 | posdict = dict(zip(positions, self._convert_positions(positions))) 258 | 259 | return [(posdict[x], posdict[y]) for x,y in spans] 260 | 261 | 262 | cdef _make_spans(self): 263 | if self._spans is not None: 264 | return 265 | 266 | cdef int start, end 267 | cdef char * s = self.match_string 268 | cdef _re2.StringPiece * piece 269 | 270 | spans = [] 271 | for i in range(self.nmatches): 272 | if self.matches[i].data() == NULL: 273 | spans.append((-1, -1)) 274 | else: 275 | piece = &self.matches[i] 276 | if piece.data() == NULL: 277 | 
def expand(self, template):
    """Expand a replacement *template* using the groups of this match.

    Supports single-digit numeric references (``\\1`` .. ``\\9``),
    the NUL escape ``\\0``, and named references ``\\g<name>``.  An
    escaped backslash (``\\\\``) yields one literal backslash; any
    other escape is passed through unchanged.
    """
    # TODO - This can be optimized to work a bit faster in C.
    pieces = template.split('\\')
    out = [pieces[0]]
    literal_next = False
    for piece in pieces[1:]:
        if literal_next:
            # The backslash preceding this segment was already consumed
            # as the second half of an escaped backslash.
            out.append(piece)
            literal_next = False
        elif piece == '':
            # Two adjacent backslashes: emit one literal backslash.
            # (The old code crashed here with IndexError on piece[0].)
            out.append('\\')
            literal_next = True
        elif piece[0].isdigit():
            if piece[0] == '0':
                # \0 is the NUL character escape, not a group reference.
                out.append('\x00' + piece[1:])
            else:
                out.append(self.group(int(piece[0])) + piece[1:])
        elif piece[:2] == 'g<' and '>' in piece:
            # Named group reference: \g<name>
            name, rest = piece[2:].split('>', 1)
            out.append(self.group(name) + rest)
        else:
            # Not a recognised template escape; keep it verbatim.
            out.append('\\' + piece)
    return ''.join(out)
self.groupdict() 348 | if group not in self._named_indexes: 349 | raise IndexError("no such group") 350 | return self._spans[self._named_indexes[group]] 351 | 352 | 353 | property lastindex: 354 | def __get__(self): 355 | self.init_groups() 356 | if self._lastindex < 1: 357 | return None 358 | else: 359 | return self._lastindex 360 | 361 | property lastgroup: 362 | def __get__(self): 363 | self.init_groups() 364 | cdef _re2.stringintmapiterator it 365 | 366 | if self._lastindex < 1: 367 | return None 368 | 369 | it = self.named_groups.begin() 370 | while it != self.named_groups.end(): 371 | if deref(it).second == self._lastindex: 372 | return cpp_to_pystring(deref(it).first) 373 | inc(it) 374 | 375 | return None 376 | 377 | 378 | cdef class Pattern: 379 | cdef _re2.RE2 * re_pattern 380 | cdef int ngroups 381 | cdef bint encoded 382 | cdef int _flags 383 | cdef public object pattern 384 | cdef object __weakref__ 385 | 386 | property flags: 387 | def __get__(self): 388 | return self._flags 389 | 390 | property groups: 391 | def __get__(self): 392 | return self.ngroups 393 | 394 | def __dealloc__(self): 395 | del self.re_pattern 396 | 397 | cdef _search(self, string, int pos, int endpos, _re2.re2_Anchor anchoring): 398 | """ 399 | Scan through string looking for a match, and return a corresponding 400 | Match instance. Return None if no position in the string matches. 
401 | """ 402 | cdef Py_ssize_t size 403 | cdef int result 404 | cdef char * cstring 405 | cdef int encoded = 0 406 | cdef _re2.StringPiece * sp 407 | cdef Match m = Match(self, self.ngroups + 1) 408 | 409 | if hasattr(string, 'tostring'): 410 | string = string.tostring() 411 | 412 | string = unicode_to_bytestring(string, &encoded) 413 | 414 | if pystring_to_bytestring(string, &cstring, &size) == -1: 415 | raise TypeError("expected string or buffer") 416 | 417 | if endpos >= 0 and endpos <= pos: 418 | return None 419 | 420 | if endpos >= 0 and endpos < size: 421 | size = endpos 422 | 423 | if pos > size: 424 | return None 425 | 426 | sp = new _re2.StringPiece(cstring, size) 427 | with nogil: 428 | result = self.re_pattern.Match(sp[0], <int>pos, <int>size, anchoring, m.matches, self.ngroups + 1) 429 | 430 | del sp 431 | if result == 0: 432 | return None 433 | m.encoded = <bint>(encoded) 434 | m.named_groups = _re2.addressof(self.re_pattern.NamedCapturingGroups()) 435 | m.nmatches = self.ngroups + 1 436 | m.match_string = string 437 | m._pos = pos 438 | if endpos == -1: 439 | m._endpos = len(string) 440 | else: 441 | m._endpos = endpos 442 | return m 443 | 444 | 445 | def search(self, string, int pos=0, int endpos=-1): 446 | """ 447 | Scan through string looking for a match, and return a corresponding 448 | Match instance. Return None if no position in the string matches. 449 | """ 450 | return self._search(string, pos, endpos, _re2.UNANCHORED) 451 | 452 | 453 | def match(self, string, int pos=0, int endpos=-1): 454 | """ 455 | Matches zero or more characters at the beginning of the string. 
456 | """ 457 | return self._search(string, pos, endpos, _re2.ANCHOR_START) 458 | 459 | cdef _print_pattern(self): 460 | cdef _re2.cpp_string * s 461 | s = <_re2.cpp_string *>_re2.addressofs(self.re_pattern.pattern()) 462 | print cpp_to_pystring(s[0]) + "\n" 463 | sys.stdout.flush() 464 | 465 | 466 | cdef _finditer(self, object string, int pos=0, int endpos=-1, int as_match=0): 467 | cdef Py_ssize_t size 468 | cdef int result 469 | cdef char * cstring 470 | cdef _re2.StringPiece * sp 471 | cdef Match m 472 | cdef list resultlist = [] 473 | cdef int encoded = 0 474 | 475 | string = unicode_to_bytestring(string, &encoded) 476 | if pystring_to_bytestring(string, &cstring, &size) == -1: 477 | raise TypeError("expected string or buffer") 478 | encoded = <bint>encoded 479 | 480 | if endpos != -1 and endpos < size: 481 | size = endpos 482 | 483 | sp = new _re2.StringPiece(cstring, size) 484 | 485 | while True: 486 | m = Match(self, self.ngroups + 1) 487 | with nogil: 488 | result = self.re_pattern.Match(sp[0], <int>pos, <int>size, _re2.UNANCHORED, m.matches, self.ngroups + 1) 489 | if result == 0: 490 | break 491 | m.encoded = encoded 492 | m.named_groups = _re2.addressof(self.re_pattern.NamedCapturingGroups()) 493 | m.nmatches = self.ngroups + 1 494 | m.match_string = string 495 | m._pos = pos 496 | if endpos == -1: 497 | m._endpos = len(string) 498 | else: 499 | m._endpos = endpos 500 | if as_match: 501 | if self.ngroups > 1: 502 | resultlist.append(m.groups("")) 503 | else: 504 | resultlist.append(m.group(self.ngroups)) 505 | else: 506 | resultlist.append(m) 507 | if pos == size: 508 | break 509 | # offset the pos to move to the next point 510 | if m.matches[0].length() == 0: 511 | pos += 1 512 | else: 513 | pos = m.matches[0].data() - cstring + m.matches[0].length() 514 | del sp 515 | return resultlist 516 | 517 | def finditer(self, object string, int pos=0, int endpos=-1): 518 | """ 519 | Return all non-overlapping matches of pattern in string as a list 520 | of 
match objects. 521 | """ 522 | # TODO This builds a list and returns its iterator. Probably could be more memory efficient 523 | return self._finditer(string, pos, endpos, 0).__iter__() 524 | 525 | def findall(self, object string, int pos=0, int endpos=-1): 526 | """ 527 | Return all non-overlapping matches of pattern in string as a list 528 | of strings. 529 | """ 530 | return self._finditer(string, pos, endpos, 1) 531 | 532 | def split(self, string, int maxsplit=0): 533 | """ 534 | split(string[, maxsplit = 0]) --> list 535 | Split a string by the occurances of the pattern. 536 | """ 537 | cdef Py_ssize_t size 538 | cdef int num_groups = 1 539 | cdef int result 540 | cdef int endpos 541 | cdef int pos = 0 542 | cdef int lookahead = 0 543 | cdef int num_split = 0 544 | cdef char * cstring 545 | cdef _re2.StringPiece * sp 546 | cdef _re2.StringPiece * matches 547 | cdef Match m 548 | cdef list resultlist = [] 549 | cdef int encoded = 0 550 | 551 | if maxsplit < 0: 552 | maxsplit = 0 553 | 554 | string = unicode_to_bytestring(string, &encoded) 555 | if pystring_to_bytestring(string, &cstring, &size) == -1: 556 | raise TypeError("expected string or buffer") 557 | 558 | encoded = <bint>encoded 559 | 560 | matches = _re2.new_StringPiece_array(self.ngroups + 1) 561 | sp = new _re2.StringPiece(cstring, size) 562 | 563 | while True: 564 | with nogil: 565 | result = self.re_pattern.Match(sp[0], <int>(pos + lookahead), <int>size, _re2.UNANCHORED, matches, self.ngroups + 1) 566 | if result == 0: 567 | break 568 | 569 | match_start = matches[0].data() - cstring 570 | match_end = match_start + matches[0].length() 571 | 572 | # If an empty match, just look ahead until you find something 573 | if match_start == match_end: 574 | if pos + lookahead == size: 575 | break 576 | lookahead += 1 577 | continue 578 | 579 | if encoded: 580 | resultlist.append(char_to_utf8(&sp.data()[pos], match_start - pos)) 581 | else: 582 | resultlist.append(sp.data()[pos:match_start]) 583 | if 
self.ngroups > 0: 584 | for group in range(self.ngroups): 585 | if matches[group + 1].data() == NULL: 586 | resultlist.append(None) 587 | else: 588 | if encoded: 589 | resultlist.append(char_to_utf8(matches[group + 1].data(), matches[group + 1].length())) 590 | else: 591 | resultlist.append(matches[group + 1].data()[:matches[group + 1].length()]) 592 | 593 | # offset the pos to move to the next point 594 | pos = match_end 595 | lookahead = 0 596 | 597 | num_split += 1 598 | if maxsplit and num_split >= maxsplit: 599 | break 600 | 601 | if encoded: 602 | resultlist.append(char_to_utf8(&sp.data()[pos], sp.length() - pos)) 603 | else: 604 | resultlist.append(sp.data()[pos:]) 605 | _re2.delete_StringPiece_array(matches) 606 | del sp 607 | return resultlist 608 | 609 | def sub(self, repl, string, int count=0): 610 | """ 611 | sub(repl, string[, count = 0]) --> newstring 612 | Return the string obtained by replacing the leftmost non-overlapping 613 | occurrences of pattern in string by the replacement repl. 614 | """ 615 | return self.subn(repl, string, count)[0] 616 | 617 | def subn(self, repl, string, int count=0): 618 | """ 619 | subn(repl, string[, count = 0]) --> (newstring, number of subs) 620 | Return the tuple (new_string, number_of_subs_made) found by replacing 621 | the leftmost non-overlapping occurrences of pattern with the 622 | replacement repl. 
623 | """ 624 | cdef Py_ssize_t size 625 | cdef char * cstring 626 | cdef _re2.cpp_string * fixed_repl 627 | cdef _re2.StringPiece * sp 628 | cdef _re2.cpp_string * input_str 629 | cdef total_replacements = 0 630 | cdef int string_encoded = 0 631 | cdef int repl_encoded = 0 632 | cdef int encoded = 0 633 | 634 | if callable(repl): 635 | # This is a callback, so let's use the custom function 636 | return self._subn_callback(repl, string, count) 637 | 638 | string = unicode_to_bytestring(string, &string_encoded) 639 | repl = unicode_to_bytestring(repl, &repl_encoded) 640 | if pystring_to_bytestring(repl, &cstring, &size) == -1: 641 | raise TypeError("expected string or buffer") 642 | 643 | fixed_repl = NULL 644 | cdef _re2.const_char_ptr s = cstring 645 | cdef _re2.const_char_ptr end = s + size 646 | cdef int c = 0 647 | while s < end: 648 | c = s[0] 649 | if (c == '\\'): 650 | s += 1 651 | if s == end: 652 | raise RegexError("Invalid rewrite pattern") 653 | c = s[0] 654 | if c == '\\' or (c >= '0' and c <= '9'): 655 | if fixed_repl != NULL: 656 | fixed_repl.push_back('\\') 657 | fixed_repl.push_back(c) 658 | else: 659 | if fixed_repl == NULL: 660 | fixed_repl = new _re2.cpp_string(cstring, s - cstring - 1) 661 | if c == 'n': 662 | fixed_repl.push_back('\n') 663 | else: 664 | fixed_repl.push_back('\\') 665 | fixed_repl.push_back('\\') 666 | fixed_repl.push_back(c) 667 | else: 668 | if fixed_repl != NULL: 669 | fixed_repl.push_back(c) 670 | 671 | s += 1 672 | if fixed_repl != NULL: 673 | sp = new _re2.StringPiece(fixed_repl.c_str()) 674 | else: 675 | sp = new _re2.StringPiece(cstring, size) 676 | 677 | input_str = new _re2.cpp_string(string) 678 | if not count: 679 | total_replacements = _re2.pattern_GlobalReplace(input_str, 680 | self.re_pattern[0], 681 | sp[0]) 682 | elif count == 1: 683 | total_replacements = _re2.pattern_Replace(input_str, 684 | self.re_pattern[0], 685 | sp[0]) 686 | else: 687 | del fixed_repl 688 | del input_str 689 | del sp 690 | raise 
NotImplementedError("So far pyre2 does not support custom replacement counts") 691 | 692 | if string_encoded or (repl_encoded and total_replacements > 0): 693 | result = cpp_to_utf8(input_str[0]) 694 | else: 695 | result = cpp_to_pystring(input_str[0]) 696 | del fixed_repl 697 | del input_str 698 | del sp 699 | return (result, total_replacements) 700 | 701 | def _subn_callback(self, callback, string, int count=0): 702 | """ 703 | This function is probably the hardest to implement correctly. 704 | This is my first attempt, but if anybody has a better solution, please help out. 705 | """ 706 | cdef Py_ssize_t size 707 | cdef int result 708 | cdef int endpos 709 | cdef int pos = 0 710 | cdef int encoded = 0 711 | cdef int num_repl = 0 712 | cdef char * cstring 713 | cdef _re2.StringPiece * sp 714 | cdef Match m 715 | cdef list resultlist = [] 716 | 717 | if count < 0: 718 | count = 0 719 | 720 | string = unicode_to_bytestring(string, &encoded) 721 | if pystring_to_bytestring(string, &cstring, &size) == -1: 722 | raise TypeError("expected string or buffer") 723 | encoded = <bint>encoded 724 | 725 | sp = new _re2.StringPiece(cstring, size) 726 | 727 | try: 728 | while True: 729 | m = Match(self, self.ngroups + 1) 730 | with nogil: 731 | result = self.re_pattern.Match(sp[0], <int>pos, <int>size, _re2.UNANCHORED, m.matches, self.ngroups + 1) 732 | if result == 0: 733 | break 734 | 735 | endpos = m.matches[0].data() - cstring 736 | if encoded: 737 | resultlist.append(char_to_utf8(&sp.data()[pos], endpos - pos)) 738 | else: 739 | resultlist.append(sp.data()[pos:endpos]) 740 | pos = endpos + m.matches[0].length() 741 | 742 | m.encoded = encoded 743 | m.named_groups = _re2.addressof(self.re_pattern.NamedCapturingGroups()) 744 | m.nmatches = self.ngroups + 1 745 | m.match_string = string 746 | resultlist.append(callback(m) or '') 747 | 748 | num_repl += 1 749 | if count and num_repl >= count: 750 | break 751 | 752 | if encoded: 753 | 
class Tokenizer:
    """Single-token lookahead scanner used when rewriting patterns.

    The upcoming token is always available in ``self.next`` (None once
    the input is exhausted).  A backslash escape such as ``\\d`` is
    treated as one two-character token.
    """

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__advance()

    def __advance(self):
        # Load the next token into self.next, consuming it from the input.
        if self.index >= len(self.string):
            self.next = None
            return
        token = self.string[self.index]
        if token[0] == "\\":
            # Escapes span two characters; a lone trailing backslash
            # is malformed input.
            try:
                token += self.string[self.index + 1]
            except IndexError:
                raise RegexError("bogus escape (end of line)")
        self.index += len(token)
        self.next = token

    def get(self):
        # Return the current token and advance the lookahead.
        current = self.next
        self.__advance()
        return current
+ strflags + ')') 821 | 822 | while 1: 823 | this = source.get() 824 | if this is None: 825 | break 826 | if flags & _X: 827 | if this in WHITESPACE: 828 | continue 829 | if this == "#": 830 | while 1: 831 | this = source.get() 832 | if this in (None, "\n"): 833 | break 834 | continue 835 | 836 | if this[0] not in '[\\': 837 | new_pattern.append(this) 838 | continue 839 | 840 | elif this == '[': 841 | new_pattern.append(this) 842 | while 1: 843 | this = source.get() 844 | if this is None: 845 | raise RegexError, "unexpected end of regular expression" 846 | elif this == ']': 847 | new_pattern.append(this) 848 | break 849 | elif this[0] == '\\': 850 | if flags & _U: 851 | if this[1] == 'd': 852 | new_pattern.append(r'\p{Nd}') 853 | elif this[1] == 'w': 854 | new_pattern.append(r'_\p{L}\p{Nd}') 855 | elif this[1] == 's': 856 | new_pattern.append(r'\s\p{Z}') 857 | elif this[1] == 'D': 858 | new_pattern.append(r'\P{Nd}') 859 | elif this[1] == 'W': 860 | # Since \w and \s are made out of several character groups, 861 | # I don't see a way to convert their complements into a group 862 | # without rewriting the whole expression, which seems too complicated. 
863 | 864 | raise CharClassProblemException() 865 | elif this[1] == 'S': 866 | raise CharClassProblemException() 867 | else: 868 | new_pattern.append(this) 869 | else: 870 | new_pattern.append(this) 871 | else: 872 | new_pattern.append(this) 873 | elif this[0] == '\\': 874 | if this[1] in '89': 875 | raise BackreferencesException() 876 | elif this[1] in '1234567': 877 | if source.next and source.next in '1234567': 878 | this += source.get() 879 | if source.next and source.next in '1234567': 880 | # all clear, this is an octal escape 881 | new_pattern.append(this) 882 | else: 883 | raise BackreferencesException() 884 | else: 885 | raise BackreferencesException() 886 | elif flags & _U: 887 | if this[1] == 'd': 888 | new_pattern.append(r'\p{Nd}') 889 | elif this[1] == 'w': 890 | new_pattern.append(r'[_\p{L}\p{Nd}]') 891 | elif this[1] == 's': 892 | new_pattern.append(r'[\s\p{Z}]') 893 | elif this[1] == 'D': 894 | new_pattern.append(r'[^\p{Nd}]') 895 | elif this[1] == 'W': 896 | new_pattern.append(r'[^_\p{L}\p{Nd}]') 897 | elif this[1] == 'S': 898 | new_pattern.append(r'[^\s\p{Z}]') 899 | else: 900 | new_pattern.append(this) 901 | else: 902 | new_pattern.append(this) 903 | 904 | return ''.join(new_pattern) 905 | 906 | 907 | 908 | def _compile(pattern, int flags=0, int max_mem=8388608): 909 | """ 910 | Compile a regular expression pattern, returning a pattern object. 
911 | """ 912 | cdef char * string 913 | cdef Py_ssize_t length 914 | cdef _re2.StringPiece * s 915 | cdef _re2.Options opts 916 | cdef int error_code 917 | cdef int encoded = 0 918 | 919 | if isinstance(pattern, (Pattern, SREPattern)): 920 | if flags: 921 | raise ValueError('Cannot process flags argument with a compiled pattern') 922 | return pattern 923 | 924 | cdef object original_pattern = pattern 925 | try: 926 | pattern = prepare_pattern(original_pattern, flags) 927 | except BackreferencesException: 928 | error_msg = "Backreferences not supported" 929 | if current_notification == <int>FALLBACK_EXCEPTION: 930 | # Raise an exception regardless of the type of error. 931 | raise RegexError(error_msg) 932 | elif current_notification == <int>FALLBACK_WARNING: 933 | warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) 934 | return re.compile(original_pattern, flags) 935 | except CharClassProblemException: 936 | error_msg = "\W and \S not supported inside character classes" 937 | if current_notification == <int>FALLBACK_EXCEPTION: 938 | # Raise an exception regardless of the type of error. 939 | raise RegexError(error_msg) 940 | elif current_notification == <int>FALLBACK_WARNING: 941 | warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) 942 | return re.compile(original_pattern, flags) 943 | 944 | # Set the options given the flags above. 945 | if flags & _I: 946 | opts.set_case_sensitive(0); 947 | 948 | opts.set_max_mem(max_mem) 949 | opts.set_log_errors(0) 950 | opts.set_encoding(_re2.EncodingUTF8) 951 | 952 | # We use this function to get the proper length of the string. 
953 | 954 | pattern = unicode_to_bytestring(pattern, &encoded) 955 | if pystring_to_bytestring(pattern, &string, &length) == -1: 956 | raise TypeError("first argument must be a string or compiled pattern") 957 | 958 | s = new _re2.StringPiece(string, length) 959 | 960 | cdef _re2.RE2 *re_pattern 961 | with nogil: 962 | re_pattern = new _re2.RE2(s[0], opts) 963 | 964 | if not re_pattern.ok(): 965 | # Something went wrong with the compilation. 966 | del s 967 | error_msg = cpp_to_pystring(re_pattern.error()) 968 | error_code = re_pattern.error_code() 969 | del re_pattern 970 | if current_notification == <int>FALLBACK_EXCEPTION: 971 | # Raise an exception regardless of the type of error. 972 | raise RegexError(error_msg) 973 | elif error_code not in (_re2.ErrorBadPerlOp, _re2.ErrorRepeatSize, 974 | _re2.ErrorBadEscape): 975 | # Raise an error because these will not be fixed by using the 976 | # ``re`` module. 977 | raise RegexError(error_msg) 978 | elif current_notification == <int>FALLBACK_WARNING: 979 | warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) 980 | return re.compile(original_pattern, flags) 981 | 982 | cdef Pattern pypattern = Pattern() 983 | pypattern.pattern = original_pattern 984 | pypattern.re_pattern = re_pattern 985 | pypattern.ngroups = re_pattern.NumberOfCapturingGroups() 986 | pypattern.encoded = <bint>encoded 987 | pypattern._flags = flags 988 | del s 989 | return pypattern 990 | 991 | def search(pattern, string, int flags=0): 992 | """ 993 | Scan through string looking for a match to the pattern, returning 994 | a match object or none if no match was found. 995 | """ 996 | return compile(pattern, flags).search(string) 997 | 998 | def match(pattern, string, int flags=0): 999 | """ 1000 | Try to apply the pattern at the start of the string, returning 1001 | a match object, or None if no match was found. 
1002 | """ 1003 | return compile(pattern, flags).match(string) 1004 | 1005 | def finditer(pattern, string, int flags=0): 1006 | """ 1007 | Return an list of all non-overlapping matches in the 1008 | string. For each match, the iterator returns a match object. 1009 | 1010 | Empty matches are included in the result. 1011 | """ 1012 | return compile(pattern, flags).finditer(string) 1013 | 1014 | def findall(pattern, string, int flags=0): 1015 | """ 1016 | Return an list of all non-overlapping matches in the 1017 | string. For each match, the iterator returns a match object. 1018 | 1019 | Empty matches are included in the result. 1020 | """ 1021 | return compile(pattern, flags).findall(string) 1022 | 1023 | def split(pattern, string, int maxsplit=0): 1024 | """ 1025 | Split the source string by the occurrences of the pattern, 1026 | returning a list containing the resulting substrings. 1027 | """ 1028 | return compile(pattern).split(string, maxsplit) 1029 | 1030 | def sub(pattern, repl, string, int count=0): 1031 | """ 1032 | Return the string obtained by replacing the leftmost 1033 | non-overlapping occurrences of the pattern in string by the 1034 | replacement repl. repl can be either a string or a callable; 1035 | if a string, backslash escapes in it are processed. If it is 1036 | a callable, it's passed the match object and must return 1037 | a replacement string to be used. 1038 | """ 1039 | return compile(pattern).sub(repl, string, count) 1040 | 1041 | def subn(pattern, repl, string, int count=0): 1042 | """ 1043 | Return a 2-tuple containing (new_string, number). 1044 | new_string is the string obtained by replacing the leftmost 1045 | non-overlapping occurrences of the pattern in the source 1046 | string by the replacement repl. number is the number of 1047 | substitutions that were made. repl can be either a string or a 1048 | callable; if a string, backslash escapes in it are processed. 
1049 | If it is a callable, it's passed the match object and must 1050 | return a replacement string to be used. 1051 | """ 1052 | return compile(pattern).subn(repl, string, count) 1053 | 1054 | _alphanum = {} 1055 | for c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890': 1056 | _alphanum[c] = 1 1057 | del c 1058 | 1059 | def escape(pattern): 1060 | "Escape all non-alphanumeric characters in pattern." 1061 | s = list(pattern) 1062 | alphanum = _alphanum 1063 | for i in range(len(pattern)): 1064 | c = pattern[i] 1065 | if ord(c) < 0x80 and c not in alphanum: 1066 | if c == "\000": 1067 | s[i] = "\\000" 1068 | else: 1069 | s[i] = "\\" + c 1070 | return pattern[:0].join(s) 1071 | 1072 | --------------------------------------------------------------------------------