├── README ├── tests ├── __init__.py ├── wikipages.xml.gz ├── pattern.txt ├── split.txt ├── issue4.txt ├── sub.txt ├── re2_test.py ├── match_expand.txt ├── finditer.txt ├── search.txt ├── findall.txt ├── namedgroups.txt ├── unicode.txt ├── performance.py ├── re_tests.py └── test_re.py ├── src ├── #clib.pxd# ├── _re2macros.h ├── _re2.pxd └── re2.pyx ├── .gitignore ├── Makefile ├── AUTHORS ├── MANIFEST.in ├── CHANGELIST ├── LICENSE ├── setup.py └── README.rst /README: -------------------------------------------------------------------------------- 1 | README.rst -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/#clib.pxd#: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/wikipages.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/axiak/pyre2/HEAD/tests/wikipages.xml.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | /build 3 | /dist 4 | src/re2.html 5 | src/re2.so 6 | re2.so 7 | *~ 8 | *.pyc 9 | *.swp 10 | *.egg-info 11 | -------------------------------------------------------------------------------- /tests/pattern.txt: -------------------------------------------------------------------------------- 1 | pattern tests 2 | ============= 3 | 4 | >>> import re2 5 | 6 | We should be able to get back what we put in. 
7 | 8 | >>> re2.compile("(foo|b[a]r?)").pattern 9 | '(foo|b[a]r?)' 10 | -------------------------------------------------------------------------------- /tests/split.txt: -------------------------------------------------------------------------------- 1 | Split tests 2 | =========== 3 | 4 | This one tests to make sure that utf8 data is parsed correctly. 5 | 6 | >>> import re2 as re 7 | >>> a = '我很好, 你呢?'.decode('utf8') 8 | >>> print re.split(' ', a) 9 | [u'\u6211\u5f88\u597d,', u'\u4f60\u5462?'] 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | rm -rf build &>/dev/null 3 | rm -rf src/*.so &>/dev/null 4 | rm -rf re2.so &>/dev/null 5 | rm -rf src/re2.cpp &>/dev/null 6 | python setup.py --cython build_ext --inplace 7 | 8 | test: all 9 | cp -v re2.so tests 10 | (cd tests && python re2_test.py) 11 | (cd tests && python test_re.py) 12 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | All contributors own the copyright to their own contributions, but agree 2 | to release each of their contributions under the BSD license included 3 | in this software. 
4 | 5 | Michael Axiak 6 | 7 | Contributors 8 | ============ 9 | 10 | Alec Berryman 11 | Israel Tsadok 12 | Alex Willmer 13 | -------------------------------------------------------------------------------- /tests/issue4.txt: -------------------------------------------------------------------------------- 1 | issue #4 2 | ======== 3 | 4 | >>> import re2 5 | >>> TERM_SPEC2 = re2.compile('([\W\d_]*)(([^\W\d_]*[-\.]*)*[^\W\d_])([\W\d_]*[^\W\d_]*)', re2.UNICODE) 6 | >>> TERM_SPEC2.search("a").groups() 7 | ('', 'a', '', '') 8 | 9 | 10 | Still broken because of unicode: 11 | >>> TERM_SPEC2.search(u"Hello").groups() 12 | (u'', u'Hello', u'Hell', u',') 13 | -------------------------------------------------------------------------------- /tests/sub.txt: -------------------------------------------------------------------------------- 1 | Tests of substitution 2 | ===================== 3 | 4 | This first test is just looking to replace things between parentheses 5 | with an empty string. 6 | 7 | 8 | >>> import re2 as re 9 | >>> import hashlib 10 | >>> import gzip 11 | >>> data = gzip.open('wikipages.xml.gz').read() 12 | >>> print hashlib.md5(re.sub('\(.*?\)', '', data)).hexdigest() 13 | b7a469f55ab76cd5887c81dbb0cfe6d3 14 | -------------------------------------------------------------------------------- /tests/re2_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import glob 5 | import doctest 6 | 7 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..')) 8 | 9 | os.chdir(os.path.dirname(__file__) or '.') 10 | 11 | def testall(): 12 | for file in glob.glob(os.path.join(os.path.dirname(__file__), "*.txt")): 13 | print "Testing %s..." 
% file 14 | doctest.testfile(os.path.join(".", os.path.basename(file))) 15 | 16 | if __name__ == "__main__": 17 | testall() 18 | -------------------------------------------------------------------------------- /tests/match_expand.txt: -------------------------------------------------------------------------------- 1 | Match Expand Tests 2 | ================== 3 | 4 | Match objects have an .expand() method which allows them to 5 | expand templates as if the .sub() method was called on the pattern. 6 | 7 | >>> import re2 as re 8 | >>> m = re.match(r"(\w+) (\w+)\W+(?P\w+)", "Isaac Newton, physicist") 9 | >>> m.expand(r"\2, \1") 10 | 'Newton, Isaac' 11 | >>> m.expand(r"\1 \g<title>") 12 | 'Isaac physicist' 13 | >>> m.expand(r"\0 \1 \2") 14 | '\x00 Isaac Newton' 15 | >>> m.expand(r"\3") 16 | 'physicist' 17 | 18 | -------------------------------------------------------------------------------- /src/_re2macros.h: -------------------------------------------------------------------------------- 1 | #ifndef __RE2MACROS_H 2 | #define __RE2MACROS_H 3 | 4 | #include <stdio.h> 5 | #include "re2/stringpiece.h" 6 | 7 | static inline re2::StringPiece * new_StringPiece_array(int n) 8 | { 9 | re2::StringPiece * sp = new re2::StringPiece[n]; 10 | return sp; 11 | } 12 | static inline void delete_StringPiece_array(re2::StringPiece* ptr) 13 | { 14 | delete[] ptr; 15 | } 16 | 17 | #define addressof(A) (&A) 18 | #define addressofs(A) (&A) 19 | 20 | #define as_char(A) (char *)(A) 21 | #define pattern_Replace(A, B, C) re2::RE2::Replace((A), (B), (C)) 22 | #define pattern_GlobalReplace(A, B, C) re2::RE2::GlobalReplace((A), (B), (C)) 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELIST 2 | include Makefile 3 | include LICENSE 4 | include README 5 | include tests/cnn_homepage.dat 6 | include tests/performance.py 7 | include 
tests/search.txt 8 | include tests/finditer.txt 9 | include tests/wikipages.xml.gz 10 | include tests/__init__.py 11 | include tests/match_expand.txt 12 | include tests/test.py 13 | include tests/pattern.txt 14 | include tests/sub.txt 15 | include tests/unicode.txt 16 | include tests/findall.txt 17 | include tests/split.txt 18 | include AUTHORS 19 | include README.rst 20 | include src/_re2macros.h 21 | include src/_re2.pxd 22 | include src/re2.cpp 23 | include src/re2.pyx 24 | include MANIFEST 25 | include setup.py 26 | -------------------------------------------------------------------------------- /tests/finditer.txt: -------------------------------------------------------------------------------- 1 | Simple tests for the ``finditer`` function. 2 | =========================================== 3 | 4 | >>> import re2 as re 5 | 6 | >>> len(list(re.finditer(r'\w+', open("cnn_homepage.dat").read()))) 7 | 14230 8 | 9 | >>> [m.group(1) for m in re.finditer(r'\n#hdr-editions(.*?)\n', open("cnn_homepage.dat").read())] 10 | [' a { text-decoration:none; }', ' li { padding:0 10px; }', ' ul li.no-pad-left span { font-size:12px; }'] 11 | 12 | >>> [m.group(1) for m in re.finditer(r'^#hdr-editions(.*?)$', open("cnn_homepage.dat").read(), re.M)] 13 | [' a { text-decoration:none; }', ' li { padding:0 10px; }', ' ul li.no-pad-left span { font-size:12px; }'] 14 | 15 | -------------------------------------------------------------------------------- /tests/search.txt: -------------------------------------------------------------------------------- 1 | These are simple tests of the ``search`` function 2 | ================================================= 3 | 4 | >>> import re2 as re 5 | >>> re.search("((?:[01]?\d{1,2}|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d{1,2}|2[0-4]\d|25[0-5])", "hello 28.224.2.1 test").group() 6 | '28.224.2.1' 7 | 8 | >>> re.search("(\d{3})\D?(\d{3})\D?(\d{4})", "800-555-1212").groups() 9 | ('800', '555', '1212') 10 | 11 | >>> input = 'a' * 999 12 | >>> 
len(re.search('(?:a{1000})?a{999}', input).group()) 13 | 999 14 | 15 | >>> re.search(r'\n#hdr-editions(.*?)\n', open("cnn_homepage.dat").read()).groups() 16 | (' a { text-decoration:none; }',) 17 | 18 | Verify some sanity checks 19 | 20 | >>> re.compile(r'x').search('x', 2000) 21 | >>> re.compile(r'x').search('x', 1, -300) 22 | 23 | -------------------------------------------------------------------------------- /tests/findall.txt: -------------------------------------------------------------------------------- 1 | findall tests 2 | ============= 3 | 4 | >>> import re2 5 | 6 | This one is from http://docs.python.org/library/re.html?#finding-all-adverbs: 7 | 8 | >>> re2.findall(r"\w+ly", "He was carefully disguised but captured quickly by police.") 9 | ['carefully', 'quickly'] 10 | 11 | This one makes sure all groups are found: 12 | 13 | >>> re2.findall(r"(\w+)=(\d+)", "foo=1,foo=2") 14 | [('foo', '1'), ('foo', '2')] 15 | 16 | When there's only one matched group, it should not be returned in a tuple: 17 | 18 | >>> re2.findall(r"(\w)\w", "fx") 19 | ['f'] 20 | 21 | Zero matches is an empty list: 22 | 23 | >>> re2.findall("(f)", "gggg") 24 | [] 25 | 26 | If pattern matches an empty string, do it only once at the end: 27 | 28 | >>> re2.findall(".*", "foo") 29 | ['foo', ''] 30 | 31 | >>> re2.findall("", "foo") 32 | ['', '', '', ''] 33 | -------------------------------------------------------------------------------- /tests/namedgroups.txt: -------------------------------------------------------------------------------- 1 | Testing some aspects of named groups 2 | ================================================= 3 | 4 | >>> import re2 as re 5 | 6 | >>> m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds") 7 | >>> m.start("first_name") 8 | 0 9 | >>> m.start("last_name") 10 | 8 11 | 12 | >>> m.span("last_name") 13 | (8, 16) 14 | >>> m.regs 15 | ((0, 16), (0, 7), (8, 16)) 16 | 17 | Make sure positions are converted properly for unicode 18 | 19 | >>> m 
= re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", u'\u05d9\u05e9\u05e8\u05d0\u05dc \u05e6\u05d3\u05d5\u05e7', re.UNICODE) 20 | >>> m.start("first_name") 21 | 0 22 | >>> m.start("last_name") 23 | 6 24 | >>> m.end("last_name") 25 | 10 26 | >>> m.regs 27 | ((0, 10), (0, 5), (6, 10)) 28 | >>> m.span(2) 29 | (6, 10) 30 | >>> m.span("last_name") 31 | (6, 10) 32 | 33 | -------------------------------------------------------------------------------- /CHANGELIST: -------------------------------------------------------------------------------- 1 | 0.2.12) 2 | - Fixed pyre2 to work with latest version of re2 (axiak) (issue #3) 3 | 4 | 0.2.10) 2010-12-08 5 | - Added .flags to pattern to make that transparent (axiak) 6 | - Added Python re unit tests (itsadok) 7 | - Fixed error compatibility (axiak) 8 | - Fixed group spans to be translated to their decoded positions (itsadok) 9 | - Fixed test_bug_1140 in unit test (itsadok) 10 | - Handle \n in replace manually (itsadok) 11 | - Return an interator from finditer (itsadok) 12 | - Have re.compile() accept SRE objects (moreati, itsadok) 13 | - Fixed findall to use group(1) if available (itsadok) 14 | - Fixed a mistaken use of verbose (itsadok) 15 | - Fixed a memory leak in replacement (itsadok) 16 | - Match delete[] to new[] calls to fix more memory leaks (itsadok) 17 | - Change split to handle empty matches to be more compatible with sre.c (itsadok) 18 | - Added group property to match re (itsadok) 19 | - Added the ability to fallback to old re in case of back references (itsadok) 20 | - Allow multiple arguments to group() (itsadok) 21 | - Fixed infinite loop in pathological case of findall(".*", "foo") 22 | 23 | 0.2.8) 2010-07-27 24 | - Added .expand() to group objects (axiak) 25 | - Input patterns are now kept for Python compatibility (alec) 26 | - Fixed 64-bit support (alec) 27 | - Fixed findall to support python symantics (alec) 28 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright (c) 2010, Michael Axiak <mike@axiak.net> 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 | 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | Neither the name of the <ORGANIZATION> nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 9 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
10 | -------------------------------------------------------------------------------- /tests/unicode.txt: -------------------------------------------------------------------------------- 1 | Here are some tests to make sure that utf-8 works 2 | ================================================= 3 | 4 | >>> import re2 as re 5 | >>> a = u'\u6211\u5f88\u597d' 6 | >>> c = re.compile(a[0]) 7 | >>> c.search(a).group() 8 | u'\u6211' 9 | 10 | Test unicode stickyness 11 | 12 | >>> re.sub(r'x', u'y', 'x') 13 | u'y' 14 | >>> re.sub(r'x', 'y', u'x') 15 | u'y' 16 | >>> re.sub(ur'x', 'y', 'x') 17 | 'y' 18 | >>> re.findall(ur'.', 'x') 19 | ['x'] 20 | >>> re.findall(ur'.', u'x') 21 | [u'x'] 22 | >>> re.split(ur',', '1,2,3') 23 | ['1', '2', '3'] 24 | >>> re.split(ur',', u'1,2,3') 25 | [u'1', u'2', u'3'] 26 | >>> re.search(ur'(\d)', '1').group(1) 27 | '1' 28 | >>> re.search(ur'(\d)', u'1').group(1) 29 | u'1' 30 | 31 | Test unicode character groups 32 | 33 | >>> re.search(r'\d', u'\u0661', re.UNICODE).group(0) 34 | u'\u0661' 35 | >>> int(re.search(r'\d', u'\u0661', re.UNICODE).group(0)) 36 | 1 37 | >>> re.search(r'\w', u'\u0401') 38 | >>> re.search(r'\w', u'\u0401', re.UNICODE).group(0) 39 | u'\u0401' 40 | >>> re.search(r'\s', u'\u1680', re.UNICODE).group(0) 41 | u'\u1680' 42 | >>> re.findall(r'[\s\d\w]', 'hey 123', re.UNICODE) 43 | ['h', 'e', 'y', ' ', '1', '2', '3'] 44 | >>> re.search(r'\D', u'\u0661x', re.UNICODE).group(0) 45 | u'x' 46 | >>> re.search(r'\W', u'\u0401!', re.UNICODE).group(0) 47 | u'!' 48 | >>> re.search(r'\S', u'\u1680x', re.UNICODE).group(0) 49 | u'x' 50 | >>> re.search(r'[\D]', u'\u0661x', re.UNICODE).group(0) 51 | u'x' 52 | >>> re.search(r'[\W]', u'\u0401!', re.UNICODE).group(0) 53 | u'!' 
54 | >>> re.search(r'[\S]', u'\u1680x', re.UNICODE).group(0) 55 | u'x' 56 | 57 | 58 | Group positions need to be fixed with unicode 59 | 60 | >>> re.search(r' (.)', u'\U0001d200xxx\u1234 x').span(1) 61 | (6, 7) 62 | >>> re.search(r' (.)', u'\U0001d200xxx\u1234 x'.encode('utf-8')).span(1) 63 | (11, 12) 64 | 65 | Pos and endpos also need to be corrected 66 | 67 | >>> re.compile(r'x').findall(u'\u1234x', 1, 2) 68 | [u'x'] 69 | 70 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import re 5 | from distutils.core import setup, Extension, Command 6 | 7 | MINIMUM_CYTHON_VERSION = '0.13' 8 | 9 | 10 | def cmp(a, b): 11 | return (a > b) - (a < b) 12 | 13 | 14 | class TestCommand(Command): 15 | description = 'Run packaged tests' 16 | user_options = [] 17 | def initialize_options(self): 18 | pass 19 | 20 | def finalize_options(self): 21 | pass 22 | 23 | def run(self): 24 | from tests import re2_test 25 | re2_test.testall() 26 | 27 | 28 | def version_compare(version1, version2): 29 | def normalize(v): 30 | return [int(x) for x in re.sub(r'(\.0+)*$','', v).split(".")] 31 | return cmp(normalize(version1), normalize(version2)) 32 | 33 | cmdclass = {'test': TestCommand} 34 | 35 | ext_files = [] 36 | if '--cython' in sys.argv[1:]: 37 | # Using Cython 38 | sys.argv.remove('--cython') 39 | from Cython.Compiler.Main import Version 40 | if version_compare(MINIMUM_CYTHON_VERSION, Version.version) > 0: 41 | raise ValueError("Cython is version %s, but needs to be at least %s." 
% 42 | (Version.version, MINIMUM_CYTHON_VERSION)) 43 | from Cython.Distutils import build_ext 44 | cmdclass['build_ext'] = build_ext 45 | ext_files.append("src/re2.pyx") 46 | else: 47 | # Building from C 48 | ext_files.append("src/re2.cpp") 49 | 50 | 51 | # Locate the re2 module 52 | _re2_prefixes = [ 53 | '/usr', 54 | '/usr/local', 55 | '/opt/', 56 | ] 57 | 58 | for re2_prefix in _re2_prefixes: 59 | if os.path.exists(os.path.join(re2_prefix, "include", "re2")): 60 | break 61 | else: 62 | re2_prefix = "" 63 | 64 | BASE_DIR = os.path.dirname(__file__) 65 | 66 | def get_long_description(): 67 | readme_f = open(os.path.join(BASE_DIR, "README.rst")) 68 | readme = readme_f.read() 69 | readme_f.close() 70 | return readme 71 | 72 | def get_authors(): 73 | author_re = re.compile(r'^\s*(.*?)\s+<.*?\@.*?>', re.M) 74 | authors_f = open(os.path.join(BASE_DIR, "AUTHORS")) 75 | authors = [match.group(1) for match in author_re.finditer(authors_f.read())] 76 | authors_f.close() 77 | return ', '.join(authors) 78 | 79 | def main(): 80 | setup( 81 | name="re2", 82 | version="0.2.23", 83 | description="Python wrapper for Google's RE2 using Cython", 84 | long_description=get_long_description(), 85 | author=get_authors(), 86 | license="New BSD License", 87 | author_email = "mike@axiak.net", 88 | url = "http://github.com/axiak/pyre2/", 89 | ext_modules = [ 90 | Extension( 91 | "re2", 92 | ext_files, 93 | language="c++", 94 | include_dirs=[os.path.join(re2_prefix, "include")] if re2_prefix else [], 95 | libraries=["re2"], 96 | extra_compile_args=['-std=c++11'], 97 | library_dirs=[os.path.join(re2_prefix, "lib")] if re2_prefix else [], 98 | runtime_library_dirs=[os.path.join(re2_prefix, "lib")] if re2_prefix else [], 99 | ) 100 | ], 101 | cmdclass=cmdclass, 102 | classifiers = [ 103 | 'License :: OSI Approved :: BSD License', 104 | 'Programming Language :: Cython', 105 | 'Programming Language :: Python :: 2.5', 106 | 'Programming Language :: Python :: 2.6', 107 | 'Intended Audience :: 
Developers', 108 | 'Topic :: Software Development :: Libraries :: Python Modules', 109 | ], 110 | ) 111 | 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /src/_re2.pxd: -------------------------------------------------------------------------------- 1 | cdef extern from *: 2 | ctypedef char* const_char_ptr "const char*" 3 | 4 | cdef extern from "<string>" namespace "std": 5 | cdef cppclass string: 6 | string(char *) 7 | string(char *, size_t n) 8 | const_char_ptr c_str() 9 | int length() 10 | void push_back(char c) 11 | 12 | ctypedef string cpp_string "std::string" 13 | ctypedef string const_string "const std::string" 14 | 15 | 16 | 17 | cdef extern from "<map>" namespace "std": 18 | cdef cppclass stringintmapiterator "std::map<std::string, int>::const_iterator": 19 | cpp_string first 20 | int second 21 | stringintmapiterator operator++() 22 | bint operator==(stringintmapiterator) 23 | stringintmapiterator& operator*(stringintmapiterator) 24 | bint operator!=(stringintmapiterator) 25 | 26 | cdef cppclass const_stringintmap "const std::map<std::string, int>": 27 | stringintmapiterator begin() 28 | stringintmapiterator end() 29 | int operator[](cpp_string) 30 | 31 | 32 | cdef extern from "Python.h": 33 | int PyObject_AsCharBuffer(object, const_char_ptr *, Py_ssize_t *) 34 | char * PyString_AS_STRING(object) 35 | 36 | cdef extern from "re2/stringpiece.h" namespace "re2": 37 | cdef cppclass StringPiece: 38 | StringPiece() 39 | StringPiece(const_char_ptr) 40 | StringPiece(const_char_ptr, int) 41 | const_char_ptr data() 42 | int copy(char * buf, size_t n, size_t pos) 43 | int length() 44 | 45 | ctypedef StringPiece const_StringPiece "const StringPiece" 46 | 47 | cdef extern from "re2/re2.h" namespace "re2": 48 | cdef enum Anchor: 49 | UNANCHORED "RE2::UNANCHORED" 50 | ANCHOR_START "RE2::ANCHOR_START" 51 | ANCHOR_BOTH "RE2::ANCHOR_BOTH" 52 | 53 | ctypedef Anchor re2_Anchor "RE2::Anchor" 
54 | 55 | cdef enum ErrorCode: 56 | NoError "RE2::NoError" 57 | ErrorInternal "RE2::ErrorInternal" 58 | # Parse errors 59 | ErrorBadEscape "RE2::ErrorBadEscape" # bad escape sequence 60 | ErrorBadCharClass "RE2::ErrorBadCharClass" # bad character class 61 | ErrorBadCharRange "RE2::ErrorBadCharRange" # bad character class range 62 | ErrorMissingBracket "RE2::ErrorMissingBracket" # missing closing ] 63 | ErrorMissingParen "RE2::ErrorMissingParen" # missing closing ) 64 | ErrorTrailingBackslash "RE2::ErrorTrailingBackslash" # trailing \ at end of regexp 65 | ErrorRepeatArgument "RE2::ErrorRepeatArgument" # repeat argument missing, e.g. "*" 66 | ErrorRepeatSize "RE2::ErrorRepeatSize" # bad repetition argument 67 | ErrorRepeatOp "RE2::ErrorRepeatOp" # bad repetition operator 68 | ErrorBadPerlOp "RE2::ErrorBadPerlOp" # bad perl operator 69 | ErrorBadUTF8 "RE2::ErrorBadUTF8" # invalid UTF-8 in regexp 70 | ErrorBadNamedCapture "RE2::ErrorBadNamedCapture" # bad named capture group 71 | ErrorPatternTooLarge "RE2::ErrorPatternTooLarge" # pattern too large (compile failed) 72 | 73 | cdef enum Encoding: 74 | EncodingUTF8 "RE2::Options::EncodingUTF8" 75 | EncodingLatin1 "RE2::Options::EncodingLatin1" 76 | 77 | ctypedef Encoding re2_Encoding "RE2::Options::Encoding" 78 | 79 | cdef cppclass Options "RE2::Options": 80 | Options() 81 | void set_posix_syntax(int b) 82 | void set_longest_match(int b) 83 | void set_log_errors(int b) 84 | void set_max_mem(int m) 85 | void set_literal(int b) 86 | void set_never_nl(int b) 87 | void set_case_sensitive(int b) 88 | void set_perl_classes(int b) 89 | void set_word_boundary(int b) 90 | void set_one_line(int b) 91 | int case_sensitive() 92 | void set_encoding(re2_Encoding encoding) 93 | 94 | ctypedef Options const_Options "const RE2::Options" 95 | 96 | cdef cppclass RE2: 97 | RE2(const_StringPiece pattern, Options option) nogil 98 | RE2(const_StringPiece pattern) nogil 99 | int Match(const_StringPiece text, int startpos, int endpos, 100 | Anchor 
anchor, StringPiece * match, int nmatch) nogil 101 | int NumberOfCapturingGroups() 102 | int ok() 103 | const_string pattern() 104 | cpp_string error() 105 | ErrorCode error_code() 106 | const_stringintmap& NamedCapturingGroups() 107 | 108 | ctypedef RE2 const_RE2 "const RE2" 109 | 110 | 111 | # This header is used for ways to hack^Wbypass the cython 112 | # issues. 113 | cdef extern from "_re2macros.h": 114 | StringPiece * new_StringPiece_array(int) nogil 115 | void delete_StringPiece_array(StringPiece* ptr) 116 | 117 | # This fixes the bug Cython #548 whereby reference returns 118 | # cannot be addressed, due to it not being an l-value 119 | const_stringintmap * addressof(const_stringintmap&) 120 | cpp_string * addressofs(cpp_string&) 121 | char * as_char(const_char_ptr) 122 | 123 | # This fixes the bug whereby namespaces are causing 124 | # cython to just break for Cpp arguments. 125 | int pattern_Replace(cpp_string *str, 126 | const_RE2 pattern, 127 | const_StringPiece rewrite) 128 | int pattern_GlobalReplace(cpp_string *str, 129 | const_RE2 pattern, 130 | const_StringPiece rewrite) 131 | -------------------------------------------------------------------------------- /tests/performance.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This module runs the performance tests to compare the ``re`` module with the 4 | ``re2`` module. You can just run it from the command line, assuming you have re2 5 | installed, and it will output a table in ReST format comparing everything. 6 | 7 | To add a test, you can add a function to the bottom of this page that uses the 8 | @register_test() decorator. Alternatively, you can create a module that uses it and 9 | import it. 
10 | """ 11 | from timeit import Timer 12 | import simplejson 13 | 14 | import re2 15 | import re 16 | try: 17 | import regex 18 | except ImportError: 19 | regex = None 20 | 21 | import os 22 | import gzip 23 | 24 | re2.set_fallback_notification(re2.FALLBACK_EXCEPTION) 25 | 26 | os.chdir(os.path.dirname(__file__) or '.') 27 | 28 | tests = {} 29 | 30 | setup_code = """\ 31 | import re2 32 | import re 33 | from __main__ import tests, current_re 34 | test = tests[%r] 35 | """ 36 | 37 | current_re = [None] 38 | 39 | 40 | 41 | 42 | def main(): 43 | benchmarks = {} 44 | # Run all of the performance comparisons. 45 | for testname, method in tests.items(): 46 | benchmarks[testname] = {} 47 | if regex is not None: 48 | modules = (re, re2, regex) 49 | else: 50 | modules = (re, re2) 51 | results = [None for module in modules] 52 | for i, module in enumerate(modules): 53 | # We pre-compile the pattern, because that's 54 | # what people do. 55 | current_re[0] = module.compile(method.pattern) 56 | 57 | results[i] = method(current_re[0], **method.data) 58 | 59 | # Run a test. 60 | t = Timer("test(current_re[0],**test.data)", 61 | setup_code % testname) 62 | benchmarks[testname][module.__name__] = (t.timeit(method.num_runs), 63 | method.__doc__.strip(), 64 | method.pattern, 65 | method.num_runs) 66 | for i in range(len(results) - 1): 67 | if results[i] != results[i + 1]: 68 | raise ValueError("re2 output is not the same as re output: %s" % testname) 69 | 70 | benchmarks_to_ReST(benchmarks) 71 | 72 | 73 | def benchmarks_to_ReST(benchmarks): 74 | """ 75 | Convert dictionary to a nice table for ReST. 
76 | """ 77 | if regex is not None: 78 | headers = ('Test', 'Description', '# total runs', '``re`` time(s)', '``re2`` time(s)', '% ``re`` time', '``regex`` time(s)', '% ``regex`` time') 79 | else: 80 | headers = ('Test', 'Description', '# total runs', '``re`` time(s)', '``re2`` time(s)', '% ``regex`` time') 81 | table = [headers] 82 | f = lambda x: "%0.3f" % x 83 | p = lambda x: "%0.2f%%" % (x * 100) 84 | 85 | for test, data in benchmarks.items(): 86 | row = [test, data["re"][1], str(data["re"][3]), f(data["re"][0]), f(data["re2"][0])] 87 | 88 | row.append(p(data["re2"][0] / data["re"][0])) 89 | if regex is not None: 90 | row.extend((f(data["regex"][0]), p(data["re2"][0] / data["regex"][0]))) 91 | table.append(row) 92 | col_sizes = [0] * len(table[0]) 93 | for col in range(len(table[0])): 94 | col_sizes[col] = max(len(row[col]) for row in table) 95 | 96 | def print_divider(symbol='-'): 97 | print '+' + '+'.join(symbol*col_size for col_size in col_sizes) + '+' 98 | def print_row(row): 99 | print '|' + '|'.join(item.ljust(col_sizes[i]) for i, item in enumerate(row)) + '|' 100 | 101 | print_divider() 102 | print_row(table[0]) 103 | print_divider('=') 104 | for row in table[1:]: 105 | print_row(row) 106 | print_divider() 107 | 108 | 109 | 110 | 111 | 112 | ############################################### 113 | # Tests for performance 114 | ############################################### 115 | 116 | 117 | # Convenient decorator for registering a new test. 118 | def register_test(name, pattern, num_runs = 100, **data): 119 | def decorator(method): 120 | tests[name] = method 121 | method.pattern = pattern 122 | method.num_runs = num_runs 123 | method.data = data 124 | 125 | return method 126 | return decorator 127 | 128 | 129 | # This is the only function to get data right now, 130 | # but I could imagine other functions as well. 
131 | _wikidata = None 132 | def getwikidata(): 133 | global _wikidata 134 | if _wikidata is None: 135 | _wikidata = gzip.open('wikipages.xml.gz').read() 136 | return _wikidata 137 | 138 | 139 | 140 | #register_test("Findall URI|Email", 141 | # r'([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)', 142 | # 2, 143 | # data=getwikidata()) 144 | def findall_uriemail(pattern, data): 145 | """ 146 | Find list of '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)' 147 | """ 148 | return len(pattern.findall(data)) 149 | 150 | 151 | 152 | #register_test("Replace WikiLinks", 153 | # r'(\[\[(^\|)+.*?\]\])', 154 | # data=getwikidata()) 155 | def replace_wikilinks(pattern, data): 156 | """ 157 | This test replaces links of the form [[Obama|Barack_Obama]] to Obama. 158 | """ 159 | return len(pattern.sub(r'\1', data)) 160 | 161 | 162 | 163 | #register_test("Remove WikiLinks", 164 | # r'(\[\[(^\|)+.*?\]\])', 165 | # data=getwikidata()) 166 | def remove_wikilinks(pattern, data): 167 | """ 168 | This test replaces links of the form [[Obama|Barack_Obama]] to the empty string 169 | """ 170 | return len(pattern.sub(r'', data)) 171 | 172 | 173 | 174 | 175 | 176 | #register_test("Remove WikiLinks", 177 | # r'(<page[^>]*>)', 178 | # data=getwikidata()) 179 | def split_pages(pattern, data): 180 | """ 181 | This test splits the data by the <page> tag. 182 | """ 183 | return len(pattern.split(data)) 184 | 185 | 186 | def getweblogdata(): 187 | return open(os.path.join(os.path.dirname(__file__), 'access.log')) 188 | 189 | @register_test("weblog scan", 190 | #r'^(\S+) (\S+) (\S+) \[(\d{1,2})/(\w{3})/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -(\d{4})\] "(\S+) (\S+) (\S+)" (\d+) (\d+|-) "([^"]+)" "([^"]+)"\n', 191 | # '(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ? (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (".*?"|-) (\S+) (\S+) (\S+) (\S+)', 192 | '(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) ? 
(\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+)', 193 | data=getweblogdata()) 194 | def weblog_matches(pattern, data): 195 | """ 196 | Match weblog data line by line. 197 | """ 198 | total=0 199 | for line in data.read()[:20000].splitlines(): 200 | p = pattern.search(line) 201 | #for p in pattern.finditer(data.read()[:20000]): 202 | if p: 203 | total += len(p.groups()) 204 | data.seek(0) 205 | 206 | return 0 207 | 208 | if __name__ == '__main__': 209 | main() 210 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | pyre2 3 | ===== 4 | 5 | .. contents:: 6 | 7 | Summary 8 | ======= 9 | 10 | pyre2 is a Python extension that wraps 11 | `Google's RE2 regular expression library 12 | <http://code.google.com/p/re2/>`_. 13 | 14 | This version of pyre2 is similar to the one you'd 15 | find at `facebook's github repository <http://github.com/facebook/pyre2/>`_ 16 | except that the stated goal of this version is to be a *drop-in replacement* for 17 | the ``re`` module. 18 | 19 | Backwards Compatibility 20 | ======================= 21 | 22 | The stated goal of this module is to be a drop-in replacement for ``re``. 23 | My hope is that some will be able to go to the top of their module and put:: 24 | 25 | try: 26 | import re2 as re 27 | except ImportError: 28 | import re 29 | 30 | That being said, there are features of the ``re`` module that this module may 31 | never have. For example, ``RE2`` does not handle lookahead assertions (``(?=...)``). 32 | For this reason, the module will automatically fall back to the original ``re`` module 33 | if there is a regex that it cannot handle. 34 | 35 | However, there are times when you may want to be notified of a failover. For this reason, 36 | I'm adding the single function ``set_fallback_notification`` to the module. 
37 | Thus, you can write:: 38 | 39 | try: 40 | import re2 as re 41 | except ImportError: 42 | import re 43 | else: 44 | re.set_fallback_notification(re.FALLBACK_WARNING) 45 | 46 | And in the above example, ``set_fallback_notification`` can handle 3 values: 47 | ``re.FALLBACK_QUIETLY`` (default), ``re.FALLBACK_WARNING`` (raises a warning), and 48 | ``re.FALLBACK_EXCEPTION`` (which raises an exception). 49 | 50 | **Note**: The re2 module treats byte strings as UTF-8. This is fully backwards compatible with 7-bit ascii. 51 | However, bytes containing values larger than 0x7f are going to be treated very differently in re2 than in re. 52 | The RE library quietly ignores invalid utf8 in input strings, and throws an exception on invalid utf8 in patterns. 53 | For example: 54 | 55 | >>> re.findall(r'.', '\x80\x81\x82') 56 | ['\x80', '\x81', '\x82'] 57 | >>> re2.findall(r'.', '\x80\x81\x82') 58 | [] 59 | 60 | If you require the use of regular expressions over an arbitrary stream of bytes, then this library might not be for you. 61 | 62 | Installation 63 | ============ 64 | 65 | To install, you must first install the prerequisites: 66 | 67 | * The `re2 library from Google <http://code.google.com/p/re2/>`_ 68 | * The Python development headers (e.g. *sudo apt-get install python-dev*) 69 | * A build environment with ``g++`` (e.g. *sudo apt-get install build-essential*) 70 | 71 | After the prerequisites are installed, you can try installing using ``easy_install``:: 72 | 73 | $ sudo easy_install re2 74 | 75 | if you have setuptools installed (or use ``pip``). 76 | 77 | If you don't want to use ``setuptools``, you can alternatively download the tarball from `pypi <http://pypi.python.org/pypi/re2/>`_. 78 | 79 | Alternative to those, you can clone this repository and try installing it from there. 
To do this, run:: 80 | 81 | $ git clone git://github.com/axiak/pyre2.git 82 | $ cd pyre2.git 83 | $ sudo python setup.py install 84 | 85 | If you want to make changes to the bindings, you must have Cython >=0.13. 86 | 87 | Unicode Support 88 | =============== 89 | 90 | One current issue is Unicode support. As you may know, ``RE2`` supports UTF8, 91 | which is certainly distinct from unicode. Right now the module will automatically 92 | encode any unicode string into utf8 for you, which is *slow* (it also has to 93 | decode utf8 strings back into unicode objects on every substitution or split). 94 | Therefore, you are better off using bytestrings in utf8 while working with RE2 95 | and doing any unicode encoding or decoding only after all the work is finished. 96 | 97 | Performance 98 | =========== 99 | 100 | Performance is of course the point of this module, so it better perform well. 101 | Regular expressions vary widely in complexity, and the salient feature of ``RE2`` is 102 | that it behaves well asymptotically. This being said, for very simple substitutions, 103 | I've found that occasionally python's regular ``re`` module is actually slightly faster. 104 | However, when the ``re`` module gets slow, it gets *really* slow, while this module 105 | buzzes along. 106 | 107 | In the below example, I'm running the data against 8MB of text from the colossal Wikipedia 108 | XML file. I'm running them multiple times, being careful to use the ``timeit`` module. 109 | To see more details, please see the `performance script <http://github.com/axiak/pyre2/tree/master/tests/performance.py>`_.
110 | 111 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 112 | |Test |Description |# total runs|``re`` time(s)|``re2`` time(s)|% ``re`` time|``regex`` time(s)|% ``regex`` time| 113 | +=================+===========================================================================+============+==============+===============+=============+=================+================+ 114 | |Findall URI|Email|Find list of '([a-zA-Z][a-zA-Z0-9]*)://([^ /]+)(/[^ ]*)?|([^ @]+)@([^ @]+)'|2 |19.961 |0.336 |1.68% |11.463 |2.93% | 115 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 116 | |Replace WikiLinks|This test replaces links of the form [[Obama|Barack_Obama]] to Obama. |100 |16.032 |2.622 |16.35% |2.895 |90.54% | 117 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 118 | |Remove WikiLinks |This test splits the data by the <page> tag. |100 |15.983 |1.406 |8.80% |2.252 |62.43% | 119 | +-----------------+---------------------------------------------------------------------------+------------+--------------+---------------+-------------+-----------------+----------------+ 120 | 121 | Feel free to add more speed tests to the bottom of the script and send a pull request my way! 122 | 123 | Current Status 124 | ============== 125 | 126 | pyre2 has only received basic testing. Please use it 127 | and let me know if you run into any issues! 128 | 129 | Contact 130 | ======= 131 | 132 | You can file bug reports on GitHub, or contact the author: 133 | `Mike Axiak contact page <http://mike.axiak.net/contact>`_. 
134 | 135 | Tests 136 | ===== 137 | 138 | If you would like to help, one thing that would be very useful 139 | is writing comprehensive tests for this. It's actually really easy: 140 | 141 | * Come up with regular expression problems using the regular python 're' module. 142 | * Write a session in python traceback format `Example <http://github.com/axiak/pyre2/blob/master/tests/search.txt>`_. 143 | * Replace your ``import re`` with ``import re2 as re``. 144 | * Save it as a .txt file in the tests directory. You can comment on it however you like and indent the code with 4 spaces. 145 | 146 | Missing Features 147 | ================ 148 | 149 | Currently the features missing are: 150 | 151 | * If you use substitution methods without a callback, a non 0/1 maxsplit argument is not supported. 152 | 153 | 154 | Credits 155 | ======= 156 | 157 | Though I ripped out the code, I'd like to thank David Reiss 158 | and Facebook for the initial inspiration. Plus, I got to 159 | gut this readme file! 160 | 161 | Moreover, this library would of course not be possible if not for 162 | the immense work of the team at RE2 and the few people who work 163 | on Cython. 164 | -------------------------------------------------------------------------------- /tests/re_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- mode: python -*- 3 | 4 | # Re test suite and benchmark suite v1.5 5 | 6 | # The 3 possible outcomes for each pattern 7 | [SUCCEED, FAIL, SYNTAX_ERROR] = range(3) 8 | 9 | # Benchmark suite (needs expansion) 10 | # 11 | # The benchmark suite does not test correctness, just speed. The 12 | # first element of each tuple is the regex pattern; the second is a 13 | # string to match it against. The benchmarking code will embed the 14 | # second string inside several sizes of padding, to test how regex 15 | # matching performs on large strings. 
benchmarks = [

    # Alternations sharing a common prefix, bare and grouped.
    ('Python|Perl', 'Perl'),                   # Alternation
    ('(Python|Perl)', 'Perl'),                 # Grouped alternation

    ('Python|Perl|Tcl', 'Perl'),               # Alternation
    ('(Python|Perl|Tcl)', 'Perl'),             # Grouped alternation

    ('(Python)\\1', 'PythonPython'),           # Backreference
    ('([0a-z][a-z0-9]*,)+', 'a5,b7,c9,'),      # Disable the fastmap optimization
    ('([a-z][a-z0-9]*,)+', 'a5,b7,c9,'),       # A few sets

    # Literal-text patterns of increasing pathology.
    ('Python', 'Python'),                      # Simple text literal
    ('.*Python', 'Python'),                    # Bad text literal
    ('.*Python.*', 'Python'),                  # Worse text literal
    ('.*(Python)', 'Python'),                  # Bad text literal with grouping

]

# Test suite (for verifying correctness)
#
# The test suite is a list of 5- or 3-tuples.  The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
#         1: the string to match against the pattern
#         2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR)
#         3: a string that will be eval()'ed to produce a test string.
#            This is an arbitrary Python expression; the available
#            variables are "found" (the whole match), and "g1", "g2", ...
#            up to "g99" contain the contents of each group, or the
#            string 'None' if the group wasn't given a value, or the
#            string 'Error' if the group index was out of range;
#            also "groups", the return value of m.group() (a tuple).
#         4: The expected result of evaluating the expression.
#            If the two don't match, an error is reported.
#
# If the regex isn't expected to work, the latter two elements can be omitted.
55 | 56 | tests = [ 57 | # Test ?P< and ?P= extensions 58 | ('(?P<foo_123', '', SYNTAX_ERROR), # Unterminated group identifier 59 | ('(?P<1>a)', '', SYNTAX_ERROR), # Begins with a digit 60 | ('(?P<!>a)', '', SYNTAX_ERROR), # Begins with an illegal char 61 | ('(?P<foo!>a)', '', SYNTAX_ERROR), # Begins with an illegal char 62 | 63 | # Same tests, for the ?P= form 64 | ('(?P<foo_123>a)(?P=foo_123', 'aa', SYNTAX_ERROR), 65 | ('(?P<foo_123>a)(?P=1)', 'aa', SYNTAX_ERROR), 66 | ('(?P<foo_123>a)(?P=!)', 'aa', SYNTAX_ERROR), 67 | ('(?P<foo_123>a)(?P=foo_124', 'aa', SYNTAX_ERROR), # Backref to undefined group 68 | 69 | ('(?P<foo_123>a)', 'a', SUCCEED, 'g1', 'a'), 70 | ('(?P<foo_123>a)(?P=foo_123)', 'aa', SUCCEED, 'g1', 'a'), 71 | 72 | # Test octal escapes 73 | ('\\1', 'a', SYNTAX_ERROR), # Backreference 74 | ('[\\1]', '\1', SUCCEED, 'found', '\1'), # Character 75 | ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), 76 | ('\\141', 'a', SUCCEED, 'found', 'a'), 77 | ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), 78 | 79 | # Test \0 is handled everywhere 80 | (r'\0', '\0', SUCCEED, 'found', '\0'), 81 | (r'[\0a]', '\0', SUCCEED, 'found', '\0'), 82 | (r'[a\0]', '\0', SUCCEED, 'found', '\0'), 83 | (r'[^a\0]', '\0', FAIL), 84 | 85 | # Test various letter escapes 86 | (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), 87 | (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), 88 | # NOTE: not an error under PCRE/PRE: 89 | # (r'\u', '', SYNTAX_ERROR), # A Perl escape 90 | (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), 91 | (r'\xff', '\377', SUCCEED, 'found', chr(255)), 92 | # new \x semantics 93 | (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), 94 | (r'\x00f', '\017', FAIL, 'found', chr(15)), 95 | (r'\x00fe', '\376', FAIL, 'found', chr(254)), 96 | # (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)), 
97 | # (r'\x00f', '\017', SUCCEED, 'found', chr(15)), 98 | # (r'\x00fe', '\376', SUCCEED, 'found', chr(254)), 99 | 100 | (r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", 101 | SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"), 102 | 103 | # Test that . only matches \n in DOTALL mode 104 | ('a.b', 'acb', SUCCEED, 'found', 'acb'), 105 | ('a.b', 'a\nb', FAIL), 106 | ('a.*b', 'acc\nccb', FAIL), 107 | ('a.{4,5}b', 'acc\nccb', FAIL), 108 | ('a.b', 'a\rb', SUCCEED, 'found', 'a\rb'), 109 | ('a.b(?s)', 'a\nb', SUCCEED, 'found', 'a\nb'), 110 | ('a.*(?s)b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), 111 | ('(?s)a.{4,5}b', 'acc\nccb', SUCCEED, 'found', 'acc\nccb'), 112 | ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), 113 | 114 | (')', '', SYNTAX_ERROR), # Unmatched right bracket 115 | ('', '', SUCCEED, 'found', ''), # Empty pattern 116 | ('abc', 'abc', SUCCEED, 'found', 'abc'), 117 | ('abc', 'xbc', FAIL), 118 | ('abc', 'axc', FAIL), 119 | ('abc', 'abx', FAIL), 120 | ('abc', 'xabcy', SUCCEED, 'found', 'abc'), 121 | ('abc', 'ababc', SUCCEED, 'found', 'abc'), 122 | ('ab*c', 'abc', SUCCEED, 'found', 'abc'), 123 | ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), 124 | ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), 125 | ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 126 | ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), 127 | ('ab+bc', 'abc', FAIL), 128 | ('ab+bc', 'abq', FAIL), 129 | ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 130 | ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), 131 | ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), 132 | ('ab?bc', 'abbbbc', FAIL), 133 | ('ab?c', 'abc', SUCCEED, 'found', 'abc'), 134 | ('^abc$', 'abc', SUCCEED, 'found', 'abc'), 135 | ('^abc$', 'abcc', FAIL), 136 | ('^abc', 'abcc', SUCCEED, 'found', 'abc'), 137 | ('^abc$', 'aabc', FAIL), 138 | ('abc$', 'aabc', SUCCEED, 'found', 'abc'), 139 | ('^', 'abc', SUCCEED, 'found+"-"', '-'), 140 | ('$', 'abc', SUCCEED, 'found+"-"', '-'), 141 | ('a.c', 'abc', SUCCEED, 'found', 
'abc'), 142 | ('a.c', 'axc', SUCCEED, 'found', 'axc'), 143 | ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), 144 | ('a.*c', 'axyzd', FAIL), 145 | ('a[bc]d', 'abc', FAIL), 146 | ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), 147 | ('a[b-d]e', 'abd', FAIL), 148 | ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), 149 | ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), 150 | ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), 151 | ('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'), 152 | # NOTE: not an error under PCRE/PRE: 153 | # ('a[b-]', 'a-', SYNTAX_ERROR), 154 | ('a[]b', '-', SYNTAX_ERROR), 155 | ('a[', '-', SYNTAX_ERROR), 156 | ('a\\', '-', SYNTAX_ERROR), 157 | ('abc)', '-', SYNTAX_ERROR), 158 | ('(abc', '-', SYNTAX_ERROR), 159 | ('a]', 'a]', SUCCEED, 'found', 'a]'), 160 | ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), 161 | ('a[\]]b', 'a]b', SUCCEED, 'found', 'a]b'), 162 | ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), 163 | ('a[^bc]d', 'abd', FAIL), 164 | ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), 165 | ('a[^-b]c', 'a-c', FAIL), 166 | ('a[^]b]c', 'a]c', FAIL), 167 | ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), 168 | ('\\ba\\b', 'a-', SUCCEED, '"-"', '-'), 169 | ('\\ba\\b', '-a', SUCCEED, '"-"', '-'), 170 | ('\\ba\\b', '-a-', SUCCEED, '"-"', '-'), 171 | ('\\by\\b', 'xy', FAIL), 172 | ('\\by\\b', 'yz', FAIL), 173 | ('\\by\\b', 'xyz', FAIL), 174 | ('x\\b', 'xyz', FAIL), 175 | ('x\\B', 'xyz', SUCCEED, '"-"', '-'), 176 | ('\\Bz', 'xyz', SUCCEED, '"-"', '-'), 177 | ('z\\B', 'xyz', FAIL), 178 | ('\\Bx', 'xyz', FAIL), 179 | ('\\Ba\\B', 'a-', FAIL, '"-"', '-'), 180 | ('\\Ba\\B', '-a', FAIL, '"-"', '-'), 181 | ('\\Ba\\B', '-a-', FAIL, '"-"', '-'), 182 | ('\\By\\B', 'xy', FAIL), 183 | ('\\By\\B', 'yz', FAIL), 184 | ('\\By\\b', 'xy', SUCCEED, '"-"', '-'), 185 | ('\\by\\B', 'yz', SUCCEED, '"-"', '-'), 186 | ('\\By\\B', 'xyz', SUCCEED, '"-"', '-'), 187 | ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), 188 | ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), 189 | ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), 190 | ('$b', 
'b', FAIL), 191 | ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), 192 | ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), 193 | ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), 194 | ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), 195 | ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), 196 | ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), 197 | ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), 198 | ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 199 | ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 200 | ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), 201 | (')(', '-', SYNTAX_ERROR), 202 | ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), 203 | ('abc', '', FAIL), 204 | ('a*', '', SUCCEED, 'found', ''), 205 | ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), 206 | ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), 207 | ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), 208 | ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), 209 | ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), 210 | ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), 211 | ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), 212 | ('^(ab|cd)e', 'abcde', FAIL, 'xg1y', 'xy'), 213 | ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), 214 | ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), 215 | ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), 216 | ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), 217 | ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), 218 | ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), 219 | ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), 220 | ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), 221 | ('a[bcd]+dcdcde', 'adcdcde', FAIL), 222 | ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), 223 | ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), 224 | ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), 225 | 
('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), 226 | ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 227 | ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), 228 | ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), 229 | ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), 230 | ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 231 | ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), 232 | ('multiple words of text', 'uh-uh', FAIL), 233 | ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), 234 | ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), 235 | ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), 236 | ('[k]', 'ab', FAIL), 237 | ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), 238 | ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 239 | ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 240 | ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), 241 | ('(a+).\\1$', 'aaaaa', SUCCEED, 'found+"-"+g1', 'aaaaa-aa'), 242 | ('^(a+).\\1$', 'aaaa', FAIL), 243 | ('(abc)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), 244 | ('([a-c]+)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), 245 | ('(a)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), 246 | ('(a+)\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), 247 | ('(a+)+\\1', 'aa', SUCCEED, 'found+"-"+g1', 'aa-a'), 248 | ('(a).+\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), 249 | ('(a)ba*\\1', 'aba', SUCCEED, 'found+"-"+g1', 'aba-a'), 250 | ('(aa|a)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), 251 | ('(a|aa)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), 252 | ('(a+)a\\1$', 'aaa', SUCCEED, 'found+"-"+g1', 'aaa-a'), 253 | ('([abc]*)\\1', 'abcabc', SUCCEED, 'found+"-"+g1', 'abcabc-abc'), 254 | ('(a)(b)c|ab', 'ab', SUCCEED, 'found+"-"+g1+"-"+g2', 'ab-None-None'), 255 | ('(a)+x', 'aaax', SUCCEED, 'found+"-"+g1', 'aaax-a'), 256 | ('([ac])+x', 'aacx', SUCCEED, 'found+"-"+g1', 
'aacx-c'), 257 | ('([^/]*/)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED, 'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'), 258 | ('([^.]*)\\.([^:]*):[T ]+(.*)', 'track1.title:TBlah blah blah', SUCCEED, 'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'), 259 | ('([^N]*N)+', 'abNNxyzN', SUCCEED, 'found+"-"+g1', 'abNNxyzN-xyzN'), 260 | ('([^N]*N)+', 'abNNxyz', SUCCEED, 'found+"-"+g1', 'abNN-N'), 261 | ('([abc]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'abcx-abc'), 262 | ('([abc]*)x', 'abc', FAIL), 263 | ('([xyz]*)x', 'abcx', SUCCEED, 'found+"-"+g1', 'x-'), 264 | ('(a)+b|aac', 'aac', SUCCEED, 'found+"-"+g1', 'aac-None'), 265 | 266 | # Test symbolic groups 267 | 268 | ('(?P<i d>aaa)a', 'aaaa', SYNTAX_ERROR), 269 | ('(?P<id>aaa)a', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aaa'), 270 | ('(?P<id>aa)(?P=id)', 'aaaa', SUCCEED, 'found+"-"+id', 'aaaa-aa'), 271 | ('(?P<id>aa)(?P=xd)', 'aaaa', SYNTAX_ERROR), 272 | 273 | # Test octal escapes/memory references 274 | 275 | ('\\1', 'a', SYNTAX_ERROR), 276 | ('\\09', chr(0) + '9', SUCCEED, 'found', chr(0) + '9'), 277 | ('\\141', 'a', SUCCEED, 'found', 'a'), 278 | ('(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)\\119', 'abcdefghijklk9', SUCCEED, 'found+"-"+g11', 'abcdefghijklk9-k'), 279 | 280 | # All tests from Perl 281 | 282 | ('abc', 'abc', SUCCEED, 'found', 'abc'), 283 | ('abc', 'xbc', FAIL), 284 | ('abc', 'axc', FAIL), 285 | ('abc', 'abx', FAIL), 286 | ('abc', 'xabcy', SUCCEED, 'found', 'abc'), 287 | ('abc', 'ababc', SUCCEED, 'found', 'abc'), 288 | ('ab*c', 'abc', SUCCEED, 'found', 'abc'), 289 | ('ab*bc', 'abc', SUCCEED, 'found', 'abc'), 290 | ('ab*bc', 'abbc', SUCCEED, 'found', 'abbc'), 291 | ('ab*bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 292 | ('ab{0,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 293 | ('ab+bc', 'abbc', SUCCEED, 'found', 'abbc'), 294 | ('ab+bc', 'abc', FAIL), 295 | ('ab+bc', 'abq', FAIL), 296 | ('ab{1,}bc', 'abq', FAIL), 297 | ('ab+bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 298 | 
('ab{1,}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 299 | ('ab{1,3}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 300 | ('ab{3,4}bc', 'abbbbc', SUCCEED, 'found', 'abbbbc'), 301 | ('ab{4,5}bc', 'abbbbc', FAIL), 302 | ('ab?bc', 'abbc', SUCCEED, 'found', 'abbc'), 303 | ('ab?bc', 'abc', SUCCEED, 'found', 'abc'), 304 | ('ab{0,1}bc', 'abc', SUCCEED, 'found', 'abc'), 305 | ('ab?bc', 'abbbbc', FAIL), 306 | ('ab?c', 'abc', SUCCEED, 'found', 'abc'), 307 | ('ab{0,1}c', 'abc', SUCCEED, 'found', 'abc'), 308 | ('^abc$', 'abc', SUCCEED, 'found', 'abc'), 309 | ('^abc$', 'abcc', FAIL), 310 | ('^abc', 'abcc', SUCCEED, 'found', 'abc'), 311 | ('^abc$', 'aabc', FAIL), 312 | ('abc$', 'aabc', SUCCEED, 'found', 'abc'), 313 | ('^', 'abc', SUCCEED, 'found', ''), 314 | ('$', 'abc', SUCCEED, 'found', ''), 315 | ('a.c', 'abc', SUCCEED, 'found', 'abc'), 316 | ('a.c', 'axc', SUCCEED, 'found', 'axc'), 317 | ('a.*c', 'axyzc', SUCCEED, 'found', 'axyzc'), 318 | ('a.*c', 'axyzd', FAIL), 319 | ('a[bc]d', 'abc', FAIL), 320 | ('a[bc]d', 'abd', SUCCEED, 'found', 'abd'), 321 | ('a[b-d]e', 'abd', FAIL), 322 | ('a[b-d]e', 'ace', SUCCEED, 'found', 'ace'), 323 | ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), 324 | ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), 325 | ('a[b-]', 'a-', SUCCEED, 'found', 'a-'), 326 | ('a[b-a]', '-', SYNTAX_ERROR), 327 | ('a[]b', '-', SYNTAX_ERROR), 328 | ('a[', '-', SYNTAX_ERROR), 329 | ('a]', 'a]', SUCCEED, 'found', 'a]'), 330 | ('a[]]b', 'a]b', SUCCEED, 'found', 'a]b'), 331 | ('a[^bc]d', 'aed', SUCCEED, 'found', 'aed'), 332 | ('a[^bc]d', 'abd', FAIL), 333 | ('a[^-b]c', 'adc', SUCCEED, 'found', 'adc'), 334 | ('a[^-b]c', 'a-c', FAIL), 335 | ('a[^]b]c', 'a]c', FAIL), 336 | ('a[^]b]c', 'adc', SUCCEED, 'found', 'adc'), 337 | ('ab|cd', 'abc', SUCCEED, 'found', 'ab'), 338 | ('ab|cd', 'abcd', SUCCEED, 'found', 'ab'), 339 | ('()ef', 'def', SUCCEED, 'found+"-"+g1', 'ef-'), 340 | ('*a', '-', SYNTAX_ERROR), 341 | ('(*)b', '-', SYNTAX_ERROR), 342 | ('$b', 'b', FAIL), 343 | ('a\\', '-', SYNTAX_ERROR), 344 
| ('a\\(b', 'a(b', SUCCEED, 'found+"-"+g1', 'a(b-Error'), 345 | ('a\\(*b', 'ab', SUCCEED, 'found', 'ab'), 346 | ('a\\(*b', 'a((b', SUCCEED, 'found', 'a((b'), 347 | ('a\\\\b', 'a\\b', SUCCEED, 'found', 'a\\b'), 348 | ('abc)', '-', SYNTAX_ERROR), 349 | ('(abc', '-', SYNTAX_ERROR), 350 | ('((a))', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'a-a-a'), 351 | ('(a)b(c)', 'abc', SUCCEED, 'found+"-"+g1+"-"+g2', 'abc-a-c'), 352 | ('a+b+c', 'aabbabc', SUCCEED, 'found', 'abc'), 353 | ('a{1,}b{1,}c', 'aabbabc', SUCCEED, 'found', 'abc'), 354 | ('a**', '-', SYNTAX_ERROR), 355 | ('a.+?c', 'abcabc', SUCCEED, 'found', 'abc'), 356 | ('(a+|b)*', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 357 | ('(a+|b){0,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 358 | ('(a+|b)+', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 359 | ('(a+|b){1,}', 'ab', SUCCEED, 'found+"-"+g1', 'ab-b'), 360 | ('(a+|b)?', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), 361 | ('(a+|b){0,1}', 'ab', SUCCEED, 'found+"-"+g1', 'a-a'), 362 | (')(', '-', SYNTAX_ERROR), 363 | ('[^ab]*', 'cde', SUCCEED, 'found', 'cde'), 364 | ('abc', '', FAIL), 365 | ('a*', '', SUCCEED, 'found', ''), 366 | ('([abc])*d', 'abbbcd', SUCCEED, 'found+"-"+g1', 'abbbcd-c'), 367 | ('([abc])*bcd', 'abcd', SUCCEED, 'found+"-"+g1', 'abcd-a'), 368 | ('a|b|c|d|e', 'e', SUCCEED, 'found', 'e'), 369 | ('(a|b|c|d|e)f', 'ef', SUCCEED, 'found+"-"+g1', 'ef-e'), 370 | ('abcd*efg', 'abcdefg', SUCCEED, 'found', 'abcdefg'), 371 | ('ab*', 'xabyabbbz', SUCCEED, 'found', 'ab'), 372 | ('ab*', 'xayabbbz', SUCCEED, 'found', 'a'), 373 | ('(ab|cd)e', 'abcde', SUCCEED, 'found+"-"+g1', 'cde-cd'), 374 | ('[abhgefdc]ij', 'hij', SUCCEED, 'found', 'hij'), 375 | ('^(ab|cd)e', 'abcde', FAIL), 376 | ('(abc|)ef', 'abcdef', SUCCEED, 'found+"-"+g1', 'ef-'), 377 | ('(a|b)c*d', 'abcd', SUCCEED, 'found+"-"+g1', 'bcd-b'), 378 | ('(ab|ab*)bc', 'abc', SUCCEED, 'found+"-"+g1', 'abc-a'), 379 | ('a([bc]*)c*', 'abc', SUCCEED, 'found+"-"+g1', 'abc-bc'), 380 | ('a([bc]*)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 
'abcd-bc-d'), 381 | ('a([bc]+)(c*d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-bc-d'), 382 | ('a([bc]*)(c+d)', 'abcd', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcd-b-cd'), 383 | ('a[bcd]*dcdcde', 'adcdcde', SUCCEED, 'found', 'adcdcde'), 384 | ('a[bcd]+dcdcde', 'adcdcde', FAIL), 385 | ('(ab|a)b*c', 'abc', SUCCEED, 'found+"-"+g1', 'abc-ab'), 386 | ('((a)(b)c)(d)', 'abcd', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'), 387 | ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED, 'found', 'alpha'), 388 | ('^a(bc+|b[eh])g|.h$', 'abh', SUCCEED, 'found+"-"+g1', 'bh-None'), 389 | ('(bc+d$|ef*g.|h?i(j|k))', 'effgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 390 | ('(bc+d$|ef*g.|h?i(j|k))', 'ij', SUCCEED, 'found+"-"+g1+"-"+g2', 'ij-ij-j'), 391 | ('(bc+d$|ef*g.|h?i(j|k))', 'effg', FAIL), 392 | ('(bc+d$|ef*g.|h?i(j|k))', 'bcdd', FAIL), 393 | ('(bc+d$|ef*g.|h?i(j|k))', 'reffgz', SUCCEED, 'found+"-"+g1+"-"+g2', 'effgz-effgz-None'), 394 | ('((((((((((a))))))))))', 'a', SUCCEED, 'g10', 'a'), 395 | ('((((((((((a))))))))))\\10', 'aa', SUCCEED, 'found', 'aa'), 396 | # Python does not have the same rules for \\41 so this is a syntax error 397 | # ('((((((((((a))))))))))\\41', 'aa', FAIL), 398 | # ('((((((((((a))))))))))\\41', 'a!', SUCCEED, 'found', 'a!'), 399 | ('((((((((((a))))))))))\\41', '', SYNTAX_ERROR), 400 | ('(?i)((((((((((a))))))))))\\41', '', SYNTAX_ERROR), 401 | ('(((((((((a)))))))))', 'a', SUCCEED, 'found', 'a'), 402 | ('multiple words of text', 'uh-uh', FAIL), 403 | ('multiple words', 'multiple words, yeah', SUCCEED, 'found', 'multiple words'), 404 | ('(.*)c(.*)', 'abcde', SUCCEED, 'found+"-"+g1+"-"+g2', 'abcde-ab-de'), 405 | ('\\((.*), (.*)\\)', '(a, b)', SUCCEED, 'g2+"-"+g1', 'b-a'), 406 | ('[k]', 'ab', FAIL), 407 | ('a[-]?c', 'ac', SUCCEED, 'found', 'ac'), 408 | ('(abc)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 409 | ('([a-c]*)\\1', 'abcabc', SUCCEED, 'g1', 'abc'), 410 | ('(?i)abc', 'ABC', SUCCEED, 'found', 'ABC'), 411 | ('(?i)abc', 'XBC', FAIL), 412 | ('(?i)abc', 
'AXC', FAIL), 413 | ('(?i)abc', 'ABX', FAIL), 414 | ('(?i)abc', 'XABCY', SUCCEED, 'found', 'ABC'), 415 | ('(?i)abc', 'ABABC', SUCCEED, 'found', 'ABC'), 416 | ('(?i)ab*c', 'ABC', SUCCEED, 'found', 'ABC'), 417 | ('(?i)ab*bc', 'ABC', SUCCEED, 'found', 'ABC'), 418 | ('(?i)ab*bc', 'ABBC', SUCCEED, 'found', 'ABBC'), 419 | ('(?i)ab*?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 420 | ('(?i)ab{0,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 421 | ('(?i)ab+?bc', 'ABBC', SUCCEED, 'found', 'ABBC'), 422 | ('(?i)ab+bc', 'ABC', FAIL), 423 | ('(?i)ab+bc', 'ABQ', FAIL), 424 | ('(?i)ab{1,}bc', 'ABQ', FAIL), 425 | ('(?i)ab+bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 426 | ('(?i)ab{1,}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 427 | ('(?i)ab{1,3}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 428 | ('(?i)ab{3,4}?bc', 'ABBBBC', SUCCEED, 'found', 'ABBBBC'), 429 | ('(?i)ab{4,5}?bc', 'ABBBBC', FAIL), 430 | ('(?i)ab??bc', 'ABBC', SUCCEED, 'found', 'ABBC'), 431 | ('(?i)ab??bc', 'ABC', SUCCEED, 'found', 'ABC'), 432 | ('(?i)ab{0,1}?bc', 'ABC', SUCCEED, 'found', 'ABC'), 433 | ('(?i)ab??bc', 'ABBBBC', FAIL), 434 | ('(?i)ab??c', 'ABC', SUCCEED, 'found', 'ABC'), 435 | ('(?i)ab{0,1}?c', 'ABC', SUCCEED, 'found', 'ABC'), 436 | ('(?i)^abc$', 'ABC', SUCCEED, 'found', 'ABC'), 437 | ('(?i)^abc$', 'ABCC', FAIL), 438 | ('(?i)^abc', 'ABCC', SUCCEED, 'found', 'ABC'), 439 | ('(?i)^abc$', 'AABC', FAIL), 440 | ('(?i)abc$', 'AABC', SUCCEED, 'found', 'ABC'), 441 | ('(?i)^', 'ABC', SUCCEED, 'found', ''), 442 | ('(?i)$', 'ABC', SUCCEED, 'found', ''), 443 | ('(?i)a.c', 'ABC', SUCCEED, 'found', 'ABC'), 444 | ('(?i)a.c', 'AXC', SUCCEED, 'found', 'AXC'), 445 | ('(?i)a.*?c', 'AXYZC', SUCCEED, 'found', 'AXYZC'), 446 | ('(?i)a.*c', 'AXYZD', FAIL), 447 | ('(?i)a[bc]d', 'ABC', FAIL), 448 | ('(?i)a[bc]d', 'ABD', SUCCEED, 'found', 'ABD'), 449 | ('(?i)a[b-d]e', 'ABD', FAIL), 450 | ('(?i)a[b-d]e', 'ACE', SUCCEED, 'found', 'ACE'), 451 | ('(?i)a[b-d]', 'AAC', SUCCEED, 'found', 'AC'), 452 | ('(?i)a[-b]', 'A-', SUCCEED, 'found', 
'A-'), 453 | ('(?i)a[b-]', 'A-', SUCCEED, 'found', 'A-'), 454 | ('(?i)a[b-a]', '-', SYNTAX_ERROR), 455 | ('(?i)a[]b', '-', SYNTAX_ERROR), 456 | ('(?i)a[', '-', SYNTAX_ERROR), 457 | ('(?i)a]', 'A]', SUCCEED, 'found', 'A]'), 458 | ('(?i)a[]]b', 'A]B', SUCCEED, 'found', 'A]B'), 459 | ('(?i)a[^bc]d', 'AED', SUCCEED, 'found', 'AED'), 460 | ('(?i)a[^bc]d', 'ABD', FAIL), 461 | ('(?i)a[^-b]c', 'ADC', SUCCEED, 'found', 'ADC'), 462 | ('(?i)a[^-b]c', 'A-C', FAIL), 463 | ('(?i)a[^]b]c', 'A]C', FAIL), 464 | ('(?i)a[^]b]c', 'ADC', SUCCEED, 'found', 'ADC'), 465 | ('(?i)ab|cd', 'ABC', SUCCEED, 'found', 'AB'), 466 | ('(?i)ab|cd', 'ABCD', SUCCEED, 'found', 'AB'), 467 | ('(?i)()ef', 'DEF', SUCCEED, 'found+"-"+g1', 'EF-'), 468 | ('(?i)*a', '-', SYNTAX_ERROR), 469 | ('(?i)(*)b', '-', SYNTAX_ERROR), 470 | ('(?i)$b', 'B', FAIL), 471 | ('(?i)a\\', '-', SYNTAX_ERROR), 472 | ('(?i)a\\(b', 'A(B', SUCCEED, 'found+"-"+g1', 'A(B-Error'), 473 | ('(?i)a\\(*b', 'AB', SUCCEED, 'found', 'AB'), 474 | ('(?i)a\\(*b', 'A((B', SUCCEED, 'found', 'A((B'), 475 | ('(?i)a\\\\b', 'A\\B', SUCCEED, 'found', 'A\\B'), 476 | ('(?i)abc)', '-', SYNTAX_ERROR), 477 | ('(?i)(abc', '-', SYNTAX_ERROR), 478 | ('(?i)((a))', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'A-A-A'), 479 | ('(?i)(a)b(c)', 'ABC', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABC-A-C'), 480 | ('(?i)a+b+c', 'AABBABC', SUCCEED, 'found', 'ABC'), 481 | ('(?i)a{1,}b{1,}c', 'AABBABC', SUCCEED, 'found', 'ABC'), 482 | ('(?i)a**', '-', SYNTAX_ERROR), 483 | ('(?i)a.+?c', 'ABCABC', SUCCEED, 'found', 'ABC'), 484 | ('(?i)a.*?c', 'ABCABC', SUCCEED, 'found', 'ABC'), 485 | ('(?i)a.{0,5}?c', 'ABCABC', SUCCEED, 'found', 'ABC'), 486 | ('(?i)(a+|b)*', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 487 | ('(?i)(a+|b){0,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 488 | ('(?i)(a+|b)+', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 489 | ('(?i)(a+|b){1,}', 'AB', SUCCEED, 'found+"-"+g1', 'AB-B'), 490 | ('(?i)(a+|b)?', 'AB', SUCCEED, 'found+"-"+g1', 'A-A'), 491 | ('(?i)(a+|b){0,1}', 'AB', SUCCEED, 
'found+"-"+g1', 'A-A'), 492 | ('(?i)(a+|b){0,1}?', 'AB', SUCCEED, 'found+"-"+g1', '-None'), 493 | ('(?i))(', '-', SYNTAX_ERROR), 494 | ('(?i)[^ab]*', 'CDE', SUCCEED, 'found', 'CDE'), 495 | ('(?i)abc', '', FAIL), 496 | ('(?i)a*', '', SUCCEED, 'found', ''), 497 | ('(?i)([abc])*d', 'ABBBCD', SUCCEED, 'found+"-"+g1', 'ABBBCD-C'), 498 | ('(?i)([abc])*bcd', 'ABCD', SUCCEED, 'found+"-"+g1', 'ABCD-A'), 499 | ('(?i)a|b|c|d|e', 'E', SUCCEED, 'found', 'E'), 500 | ('(?i)(a|b|c|d|e)f', 'EF', SUCCEED, 'found+"-"+g1', 'EF-E'), 501 | ('(?i)abcd*efg', 'ABCDEFG', SUCCEED, 'found', 'ABCDEFG'), 502 | ('(?i)ab*', 'XABYABBBZ', SUCCEED, 'found', 'AB'), 503 | ('(?i)ab*', 'XAYABBBZ', SUCCEED, 'found', 'A'), 504 | ('(?i)(ab|cd)e', 'ABCDE', SUCCEED, 'found+"-"+g1', 'CDE-CD'), 505 | ('(?i)[abhgefdc]ij', 'HIJ', SUCCEED, 'found', 'HIJ'), 506 | ('(?i)^(ab|cd)e', 'ABCDE', FAIL), 507 | ('(?i)(abc|)ef', 'ABCDEF', SUCCEED, 'found+"-"+g1', 'EF-'), 508 | ('(?i)(a|b)c*d', 'ABCD', SUCCEED, 'found+"-"+g1', 'BCD-B'), 509 | ('(?i)(ab|ab*)bc', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-A'), 510 | ('(?i)a([bc]*)c*', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-BC'), 511 | ('(?i)a([bc]*)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), 512 | ('(?i)a([bc]+)(c*d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-BC-D'), 513 | ('(?i)a([bc]*)(c+d)', 'ABCD', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCD-B-CD'), 514 | ('(?i)a[bcd]*dcdcde', 'ADCDCDE', SUCCEED, 'found', 'ADCDCDE'), 515 | ('(?i)a[bcd]+dcdcde', 'ADCDCDE', FAIL), 516 | ('(?i)(ab|a)b*c', 'ABC', SUCCEED, 'found+"-"+g1', 'ABC-AB'), 517 | ('(?i)((a)(b)c)(d)', 'ABCD', SUCCEED, 'g1+"-"+g2+"-"+g3+"-"+g4', 'ABC-A-B-D'), 518 | ('(?i)[a-zA-Z_][a-zA-Z0-9_]*', 'ALPHA', SUCCEED, 'found', 'ALPHA'), 519 | ('(?i)^a(bc+|b[eh])g|.h$', 'ABH', SUCCEED, 'found+"-"+g1', 'BH-None'), 520 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), 521 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'IJ', SUCCEED, 'found+"-"+g1+"-"+g2', 'IJ-IJ-J'), 522 | 
('(?i)(bc+d$|ef*g.|h?i(j|k))', 'EFFG', FAIL), 523 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'BCDD', FAIL), 524 | ('(?i)(bc+d$|ef*g.|h?i(j|k))', 'REFFGZ', SUCCEED, 'found+"-"+g1+"-"+g2', 'EFFGZ-EFFGZ-None'), 525 | ('(?i)((((((((((a))))))))))', 'A', SUCCEED, 'g10', 'A'), 526 | ('(?i)((((((((((a))))))))))\\10', 'AA', SUCCEED, 'found', 'AA'), 527 | #('(?i)((((((((((a))))))))))\\41', 'AA', FAIL), 528 | #('(?i)((((((((((a))))))))))\\41', 'A!', SUCCEED, 'found', 'A!'), 529 | ('(?i)(((((((((a)))))))))', 'A', SUCCEED, 'found', 'A'), 530 | ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a))))))))))', 'A', SUCCEED, 'g1', 'A'), 531 | ('(?i)(?:(?:(?:(?:(?:(?:(?:(?:(?:(a|b|c))))))))))', 'C', SUCCEED, 'g1', 'C'), 532 | ('(?i)multiple words of text', 'UH-UH', FAIL), 533 | ('(?i)multiple words', 'MULTIPLE WORDS, YEAH', SUCCEED, 'found', 'MULTIPLE WORDS'), 534 | ('(?i)(.*)c(.*)', 'ABCDE', SUCCEED, 'found+"-"+g1+"-"+g2', 'ABCDE-AB-DE'), 535 | ('(?i)\\((.*), (.*)\\)', '(A, B)', SUCCEED, 'g2+"-"+g1', 'B-A'), 536 | ('(?i)[k]', 'AB', FAIL), 537 | # ('(?i)abcd', 'ABCD', SUCCEED, 'found+"-"+\\found+"-"+\\\\found', 'ABCD-$&-\\ABCD'), 538 | # ('(?i)a(bc)d', 'ABCD', SUCCEED, 'g1+"-"+\\g1+"-"+\\\\g1', 'BC-$1-\\BC'), 539 | ('(?i)a[-]?c', 'AC', SUCCEED, 'found', 'AC'), 540 | ('(?i)(abc)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), 541 | ('(?i)([a-c]*)\\1', 'ABCABC', SUCCEED, 'g1', 'ABC'), 542 | ('a(?!b).', 'abad', SUCCEED, 'found', 'ad'), 543 | ('a(?=d).', 'abad', SUCCEED, 'found', 'ad'), 544 | ('a(?=c|d).', 'abad', SUCCEED, 'found', 'ad'), 545 | ('a(?:b|c|d)(.)', 'ace', SUCCEED, 'g1', 'e'), 546 | ('a(?:b|c|d)*(.)', 'ace', SUCCEED, 'g1', 'e'), 547 | ('a(?:b|c|d)+?(.)', 'ace', SUCCEED, 'g1', 'e'), 548 | ('a(?:b|(c|e){1,2}?|d)+?(.)', 'ace', SUCCEED, 'g1 + g2', 'ce'), 549 | ('^(.+)?B', 'AB', SUCCEED, 'g1', 'A'), 550 | 551 | # lookbehind: split by : but not if it is escaped by -. 
552 | ('(?<!-):(.*?)(?<!-):', 'a:bc-:de:f', SUCCEED, 'g1', 'bc-:de' ), 553 | # escaping with \ as we know it 554 | ('(?<!\\\):(.*?)(?<!\\\):', 'a:bc\\:de:f', SUCCEED, 'g1', 'bc\\:de' ), 555 | # terminating with ' and escaping with ? as in edifact 556 | ("(?<!\\?)'(.*?)(?<!\\?)'", "a'bc?'de'f", SUCCEED, 'g1', "bc?'de" ), 557 | 558 | # Comments using the (?#...) syntax 559 | 560 | ('w(?# comment', 'w', SYNTAX_ERROR), 561 | ('w(?# comment 1)xy(?# comment 2)z', 'wxyz', SUCCEED, 'found', 'wxyz'), 562 | 563 | # Check odd placement of embedded pattern modifiers 564 | 565 | # not an error under PCRE/PRE: 566 | ('w(?i)', 'W', SUCCEED, 'found', 'W'), 567 | # ('w(?i)', 'W', SYNTAX_ERROR), 568 | 569 | # Comments using the x embedded pattern modifier 570 | 571 | ("""(?x)w# comment 1 572 | x y 573 | # comment 2 574 | z""", 'wxyz', SUCCEED, 'found', 'wxyz'), 575 | 576 | # using the m embedded pattern modifier 577 | 578 | ('^abc', """jkl 579 | abc 580 | xyz""", FAIL), 581 | ('(?m)^abc', """jkl 582 | abc 583 | xyz""", SUCCEED, 'found', 'abc'), 584 | 585 | ('(?m)abc$', """jkl 586 | xyzabc 587 | 123""", SUCCEED, 'found', 'abc'), 588 | 589 | # using the s embedded pattern modifier 590 | 591 | ('a.b', 'a\nb', FAIL), 592 | ('(?s)a.b', 'a\nb', SUCCEED, 'found', 'a\nb'), 593 | 594 | # test \w, etc. 
both inside and outside character classes 595 | 596 | ('\\w+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'), 597 | ('[\\w]+', '--ab_cd0123--', SUCCEED, 'found', 'ab_cd0123'), 598 | ('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'), 599 | ('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'), 600 | ('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'), 601 | # not an error under PCRE/PRE: 602 | # ('[\\d-x]', '-', SYNTAX_ERROR), 603 | (r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), 604 | (r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), 605 | 606 | (r'\xff', '\377', SUCCEED, 'found', chr(255)), 607 | # new \x semantics 608 | (r'\x00ff', '\377', FAIL), 609 | # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), 610 | (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), 611 | ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), 612 | (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), 613 | (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'), 614 | 615 | # 616 | # post-1.5.2 additions 617 | 618 | # xmllib problem 619 | (r'(([a-z]+):)?([a-z]+)$', 'smil', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-smil'), 620 | # bug 110866: reference to undefined group 621 | (r'((.)\1+)', '', SYNTAX_ERROR), 622 | # bug 111869: search (PRE/PCRE fails on this one, SRE doesn't) 623 | (r'.*d', 'abc\nabd', SUCCEED, 'found', 'abd'), 624 | # bug 112468: various expected syntax errors 625 | (r'(', '', SYNTAX_ERROR), 626 | (r'[\41]', '!', SUCCEED, 'found', '!'), 627 | # bug 114033: nothing to repeat 628 | (r'(x?)?', 'x', SUCCEED, 'found', 'x'), 629 | # bug 115040: rescan if flags are modified inside pattern 630 | (r' (?x)foo ', 'foo', SUCCEED, 'found', 'foo'), 631 | # bug 115618: negative lookahead 632 | (r'(?<!abc)(d.f)', 'abcdefdof', SUCCEED, 'found', 'dof'), 633 | # bug 116251: character class bug 634 | (r'[\w-]+', 
'laser_beam', SUCCEED, 'found', 'laser_beam'), 635 | # bug 123769+127259: non-greedy backtracking bug 636 | (r'.*?\S *:', 'xx:', SUCCEED, 'found', 'xx:'), 637 | (r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'), 638 | (r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'), 639 | # bug 127259: \Z shouldn't depend on multiline mode 640 | (r'(?ms).*?x\s*\Z(.*)','xx\nx\n', SUCCEED, 'g1', ''), 641 | # bug 128899: uppercase literals under the ignorecase flag 642 | (r'(?i)M+', 'MMM', SUCCEED, 'found', 'MMM'), 643 | (r'(?i)m+', 'MMM', SUCCEED, 'found', 'MMM'), 644 | (r'(?i)[M]+', 'MMM', SUCCEED, 'found', 'MMM'), 645 | (r'(?i)[m]+', 'MMM', SUCCEED, 'found', 'MMM'), 646 | # bug 130748: ^* should be an error (nothing to repeat) 647 | (r'^*', '', SYNTAX_ERROR), 648 | # bug 133283: minimizing repeat problem 649 | (r'"(?:\\"|[^"])*?"', r'"\""', SUCCEED, 'found', r'"\""'), 650 | # bug 477728: minimizing repeat problem 651 | (r'^.*?$', 'one\ntwo\nthree\n', FAIL), 652 | # bug 483789: minimizing repeat problem 653 | (r'a[^>]*?b', 'a>b', FAIL), 654 | # bug 490573: minimizing repeat problem 655 | (r'^a*?$', 'foo', FAIL), 656 | # bug 470582: nested groups problem 657 | (r'^((a)c)?(ab)$', 'ab', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-ab'), 658 | # another minimizing repeat problem (capturing groups in assertions) 659 | ('^([ab]*?)(?=(b)?)c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), 660 | ('^([ab]*?)(?!(b))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), 661 | ('^([ab]*?)(?<!(a))c', 'abc', SUCCEED, 'g1+"-"+g2', 'ab-None'), 662 | ] 663 | 664 | try: 665 | u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'") 666 | except SyntaxError: 667 | pass 668 | else: 669 | tests.extend([ 670 | # bug 410271: \b broken under locales 671 | (r'\b.\b', 'a', SUCCEED, 'found', 'a'), 672 | (r'(?u)\b.\b', u, SUCCEED, 'found', u), 673 | (r'(?u)\w', u, SUCCEED, 'found', u), 674 | ]) 675 | -------------------------------------------------------------------------------- /tests/test_re.py: 
-------------------------------------------------------------------------------- 1 | from test.test_support import verbose, run_unittest, import_module 2 | import re2 as re 3 | from re import Scanner 4 | import sys, os, traceback 5 | from weakref import proxy 6 | 7 | # Misc tests from Tim Peters' re.doc 8 | 9 | # WARNING: Don't change details in these tests if you don't know 10 | # what you're doing. Some of these tests were carefuly modeled to 11 | # cover most of the code. 12 | 13 | import unittest 14 | 15 | class ReTests(unittest.TestCase): 16 | 17 | def test_weakref(self): 18 | s = 'QabbbcR' 19 | x = re.compile('ab+c') 20 | y = proxy(x) 21 | self.assertEqual(x.findall('QabbbcR'), y.findall('QabbbcR')) 22 | 23 | def test_search_star_plus(self): 24 | self.assertEqual(re.search('x*', 'axx').span(0), (0, 0)) 25 | self.assertEqual(re.search('x*', 'axx').span(), (0, 0)) 26 | self.assertEqual(re.search('x+', 'axx').span(0), (1, 3)) 27 | self.assertEqual(re.search('x+', 'axx').span(), (1, 3)) 28 | self.assertEqual(re.search('x', 'aaa'), None) 29 | self.assertEqual(re.match('a*', 'xxx').span(0), (0, 0)) 30 | self.assertEqual(re.match('a*', 'xxx').span(), (0, 0)) 31 | self.assertEqual(re.match('x*', 'xxxa').span(0), (0, 3)) 32 | self.assertEqual(re.match('x*', 'xxxa').span(), (0, 3)) 33 | self.assertEqual(re.match('a+', 'xxx'), None) 34 | 35 | def bump_num(self, matchobj): 36 | int_value = int(matchobj.group(0)) 37 | return str(int_value + 1) 38 | 39 | def test_basic_re_sub(self): 40 | self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x') 41 | self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'), 42 | '9.3 -3 24x100y') 43 | self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3), 44 | '9.3 -3 23x99y') 45 | 46 | self.assertEqual(re.sub('.', lambda m: r"\n", 'x'), '\\n') 47 | self.assertEqual(re.sub('.', r"\n", 'x'), '\n') 48 | 49 | s = r"\1\1" 50 | self.assertEqual(re.sub('(.)', s, 'x'), 'xx') 51 | self.assertEqual(re.sub('(.)', re.escape(s), 
'x'), s) 52 | self.assertEqual(re.sub('(.)', lambda m: s, 'x'), s) 53 | 54 | self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<a>', 'xx'), 'xxxx') 55 | self.assertEqual(re.sub('(?P<a>x)', '\g<a>\g<1>', 'xx'), 'xxxx') 56 | self.assertEqual(re.sub('(?P<unk>x)', '\g<unk>\g<unk>', 'xx'), 'xxxx') 57 | self.assertEqual(re.sub('(?P<unk>x)', '\g<1>\g<1>', 'xx'), 'xxxx') 58 | 59 | self.assertEqual(re.sub('a',r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D','a'), 60 | '\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D') 61 | self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), '\t\n\v\r\f\a') 62 | self.assertEqual(re.sub('a', '\t\n\v\r\f\a', 'a'), 63 | (chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7))) 64 | 65 | self.assertEqual(re.sub('^\s*', 'X', 'test'), 'Xtest') 66 | 67 | def test_bug_449964(self): 68 | # fails for group followed by other escape 69 | self.assertEqual(re.sub(r'(?P<unk>x)', '\g<1>\g<1>\\b', 'xx'), 70 | 'xx\bxx\b') 71 | 72 | def test_bug_449000(self): 73 | # Test for sub() on escaped characters 74 | self.assertEqual(re.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'), 75 | 'abc\ndef\n') 76 | self.assertEqual(re.sub('\r\n', r'\n', 'abc\r\ndef\r\n'), 77 | 'abc\ndef\n') 78 | self.assertEqual(re.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'), 79 | 'abc\ndef\n') 80 | self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'), 81 | 'abc\ndef\n') 82 | 83 | def test_bug_1140(self): 84 | # re.sub(x, y, u'') should return u'', not '', and 85 | # re.sub(x, y, '') should return '', not u''. 86 | # Also: 87 | # re.sub(x, y, unicode(x)) should return unicode(y), and 88 | # re.sub(x, y, str(x)) should return 89 | # str(y) if isinstance(y, str) else unicode(y). 
90 | for x in 'x', u'x': 91 | for y in 'y', u'y': 92 | z = re.sub(x, y, u'') 93 | self.assertEqual(z, u'') 94 | self.assertEqual(type(z), unicode) 95 | # 96 | z = re.sub(x, y, '') 97 | self.assertEqual(z, '') 98 | self.assertEqual(type(z), str) 99 | # 100 | z = re.sub(x, y, unicode(x)) 101 | self.assertEqual(z, y) 102 | self.assertEqual(type(z), unicode) 103 | # 104 | z = re.sub(x, y, str(x)) 105 | self.assertEqual(z, y) 106 | self.assertEqual(type(z), type(y)) 107 | 108 | def test_bug_1661(self): 109 | # Verify that flags do not get silently ignored with compiled patterns 110 | pattern = re.compile('.') 111 | self.assertRaises(ValueError, re.match, pattern, 'A', re.I) 112 | self.assertRaises(ValueError, re.search, pattern, 'A', re.I) 113 | self.assertRaises(ValueError, re.findall, pattern, 'A', re.I) 114 | self.assertRaises(ValueError, re.compile, pattern, re.I) 115 | 116 | def test_bug_3629(self): 117 | # A regex that triggered a bug in the sre-code validator 118 | re.compile("(?P<quote>)(?(quote))") 119 | 120 | def test_sub_template_numeric_escape(self): 121 | # bug 776311 and friends 122 | self.assertEqual(re.sub('x', r'\0', 'x'), '\0') 123 | self.assertEqual(re.sub('x', r'\000', 'x'), '\000') 124 | self.assertEqual(re.sub('x', r'\001', 'x'), '\001') 125 | self.assertEqual(re.sub('x', r'\008', 'x'), '\0' + '8') 126 | self.assertEqual(re.sub('x', r'\009', 'x'), '\0' + '9') 127 | self.assertEqual(re.sub('x', r'\111', 'x'), '\111') 128 | self.assertEqual(re.sub('x', r'\117', 'x'), '\117') 129 | 130 | self.assertEqual(re.sub('x', r'\1111', 'x'), '\1111') 131 | self.assertEqual(re.sub('x', r'\1111', 'x'), '\111' + '1') 132 | 133 | self.assertEqual(re.sub('x', r'\00', 'x'), '\x00') 134 | self.assertEqual(re.sub('x', r'\07', 'x'), '\x07') 135 | self.assertEqual(re.sub('x', r'\08', 'x'), '\0' + '8') 136 | self.assertEqual(re.sub('x', r'\09', 'x'), '\0' + '9') 137 | self.assertEqual(re.sub('x', r'\0a', 'x'), '\0' + 'a') 138 | 139 | self.assertEqual(re.sub('x', r'\400', 
'x'), '\0') 140 | self.assertEqual(re.sub('x', r'\777', 'x'), '\377') 141 | 142 | self.assertRaises(re.error, re.sub, 'x', r'\1', 'x') 143 | self.assertRaises(re.error, re.sub, 'x', r'\8', 'x') 144 | self.assertRaises(re.error, re.sub, 'x', r'\9', 'x') 145 | self.assertRaises(re.error, re.sub, 'x', r'\11', 'x') 146 | self.assertRaises(re.error, re.sub, 'x', r'\18', 'x') 147 | self.assertRaises(re.error, re.sub, 'x', r'\1a', 'x') 148 | self.assertRaises(re.error, re.sub, 'x', r'\90', 'x') 149 | self.assertRaises(re.error, re.sub, 'x', r'\99', 'x') 150 | self.assertRaises(re.error, re.sub, 'x', r'\118', 'x') # r'\11' + '8' 151 | self.assertRaises(re.error, re.sub, 'x', r'\11a', 'x') 152 | self.assertRaises(re.error, re.sub, 'x', r'\181', 'x') # r'\18' + '1' 153 | self.assertRaises(re.error, re.sub, 'x', r'\800', 'x') # r'\80' + '0' 154 | 155 | # in python2.3 (etc), these loop endlessly in sre_parser.py 156 | self.assertEqual(re.sub('(((((((((((x)))))))))))', r'\11', 'x'), 'x') 157 | self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\118', 'xyz'), 158 | 'xz8') 159 | self.assertEqual(re.sub('((((((((((y))))))))))(.)', r'\11a', 'xyz'), 160 | 'xza') 161 | 162 | def test_qualified_re_sub(self): 163 | self.assertEqual(re.sub('a', 'b', 'aaaaa'), 'bbbbb') 164 | self.assertEqual(re.sub('a', 'b', 'aaaaa', 1), 'baaaa') 165 | 166 | def test_bug_114660(self): 167 | self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'), 168 | 'hello there') 169 | 170 | def test_bug_462270(self): 171 | # Test for empty sub() behaviour, see SF bug #462270 172 | self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-') 173 | self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d') 174 | 175 | def test_symbolic_refs(self): 176 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') 177 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<', 'xx') 178 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g', 'xx') 179 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a a>', 'xx') 180 
| self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<1a1>', 'xx') 181 | self.assertRaises(IndexError, re.sub, '(?P<a>x)', '\g<ab>', 'xx') 182 | self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\g<b>', 'xx') 183 | self.assertRaises(re.error, re.sub, '(?P<a>x)|(?P<b>y)', '\\2', 'xx') 184 | self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<-1>', 'xx') 185 | 186 | def test_re_subn(self): 187 | self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) 188 | self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) 189 | self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) 190 | self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) 191 | self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2)) 192 | 193 | def test_re_split(self): 194 | self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c']) 195 | self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c']) 196 | self.assertEqual(re.split("(:*)", ":a:b::c"), 197 | ['', ':', 'a', ':', 'b', '::', 'c']) 198 | self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) 199 | self.assertEqual(re.split("(:)*", ":a:b::c"), 200 | ['', ':', 'a', ':', 'b', ':', 'c']) 201 | self.assertEqual(re.split("([b:]+)", ":a:b::c"), 202 | ['', ':', 'a', ':b::', 'c']) 203 | self.assertEqual(re.split("(b)|(:+)", ":a:b::c"), 204 | ['', None, ':', 'a', None, ':', '', 'b', None, '', 205 | None, '::', 'c']) 206 | self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), 207 | ['', 'a', '', '', 'c']) 208 | 209 | def test_qualified_re_split(self): 210 | self.assertEqual(re.split(":", ":a:b::c", 2), ['', 'a', 'b::c']) 211 | self.assertEqual(re.split(':', 'a:b:c:d', 2), ['a', 'b', 'c:d']) 212 | self.assertEqual(re.split("(:)", ":a:b::c", 2), 213 | ['', ':', 'a', ':', 'b::c']) 214 | self.assertEqual(re.split("(:*)", ":a:b::c", 2), 215 | ['', ':', 'a', ':', 'b::c']) 216 | 217 | def test_re_findall(self): 218 | self.assertEqual(re.findall(":+", "abc"), []) 219 | self.assertEqual(re.findall(":+", 
"a:b::c:::d"), [":", "::", ":::"]) 220 | self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"]) 221 | self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""), 222 | (":", ":"), 223 | (":", "::")]) 224 | 225 | def test_bug_117612(self): 226 | self.assertEqual(re.findall(r"(a|(b))", "aba"), 227 | [("a", ""),("b", "b"),("a", "")]) 228 | 229 | def test_re_match(self): 230 | self.assertEqual(re.match('a', 'a').groups(), ()) 231 | self.assertEqual(re.match('(a)', 'a').groups(), ('a',)) 232 | self.assertEqual(re.match(r'(a)', 'a').group(0), 'a') 233 | self.assertEqual(re.match(r'(a)', 'a').group(1), 'a') 234 | self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a')) 235 | 236 | pat = re.compile('((a)|(b))(c)?') 237 | self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None)) 238 | self.assertEqual(pat.match('b').groups(), ('b', None, 'b', None)) 239 | self.assertEqual(pat.match('ac').groups(), ('a', 'a', None, 'c')) 240 | self.assertEqual(pat.match('bc').groups(), ('b', None, 'b', 'c')) 241 | self.assertEqual(pat.match('bc').groups(""), ('b', "", 'b', 'c')) 242 | 243 | # A single group 244 | m = re.match('(a)', 'a') 245 | self.assertEqual(m.group(0), 'a') 246 | self.assertEqual(m.group(0), 'a') 247 | self.assertEqual(m.group(1), 'a') 248 | self.assertEqual(m.group(1, 1), ('a', 'a')) 249 | 250 | pat = re.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?') 251 | self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None)) 252 | self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), 253 | (None, 'b', None)) 254 | self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) 255 | 256 | def test_re_groupref_exists(self): 257 | self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), 258 | ('(', 'a')) 259 | self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), 260 | (None, 'a')) 261 | self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a)'), None) 262 | 
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a'), None) 263 | self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'ab').groups(), 264 | ('a', 'b')) 265 | self.assertEqual(re.match('^(?:(a)|c)((?(1)b|d))$', 'cd').groups(), 266 | (None, 'd')) 267 | self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'cd').groups(), 268 | (None, 'd')) 269 | self.assertEqual(re.match('^(?:(a)|c)((?(1)|d))$', 'a').groups(), 270 | ('a', '')) 271 | 272 | # Tests for bug #1177831: exercise groups other than the first group 273 | p = re.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))') 274 | self.assertEqual(p.match('abc').groups(), 275 | ('a', 'b', 'c')) 276 | self.assertEqual(p.match('ad').groups(), 277 | ('a', None, 'd')) 278 | self.assertEqual(p.match('abd'), None) 279 | self.assertEqual(p.match('ac'), None) 280 | 281 | 282 | def test_re_groupref(self): 283 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), 284 | ('|', 'a')) 285 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1?$', 'a').groups(), 286 | (None, 'a')) 287 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', 'a|'), None) 288 | self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a'), None) 289 | self.assertEqual(re.match(r'^(?:(a)|c)(\1)$', 'aa').groups(), 290 | ('a', 'a')) 291 | self.assertEqual(re.match(r'^(?:(a)|c)(\1)?$', 'c').groups(), 292 | (None, None)) 293 | 294 | def test_groupdict(self): 295 | self.assertEqual(re.match('(?P<first>first) (?P<second>second)', 296 | 'first second').groupdict(), 297 | {'first':'first', 'second':'second'}) 298 | 299 | def test_expand(self): 300 | self.assertEqual(re.match("(?P<first>first) (?P<second>second)", 301 | "first second") 302 | .expand(r"\2 \1 \g<second> \g<first>"), 303 | "second first second first") 304 | 305 | def test_repeat_minmax(self): 306 | self.assertEqual(re.match("^(\w){1}$", "abc"), None) 307 | self.assertEqual(re.match("^(\w){1}?$", "abc"), None) 308 | self.assertEqual(re.match("^(\w){1,2}$", "abc"), None) 309 | self.assertEqual(re.match("^(\w){1,2}?$", 
"abc"), None) 310 | 311 | self.assertEqual(re.match("^(\w){3}$", "abc").group(1), "c") 312 | self.assertEqual(re.match("^(\w){1,3}$", "abc").group(1), "c") 313 | self.assertEqual(re.match("^(\w){1,4}$", "abc").group(1), "c") 314 | self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 315 | self.assertEqual(re.match("^(\w){3}?$", "abc").group(1), "c") 316 | self.assertEqual(re.match("^(\w){1,3}?$", "abc").group(1), "c") 317 | self.assertEqual(re.match("^(\w){1,4}?$", "abc").group(1), "c") 318 | self.assertEqual(re.match("^(\w){3,4}?$", "abc").group(1), "c") 319 | 320 | self.assertEqual(re.match("^x{1}$", "xxx"), None) 321 | self.assertEqual(re.match("^x{1}?$", "xxx"), None) 322 | self.assertEqual(re.match("^x{1,2}$", "xxx"), None) 323 | self.assertEqual(re.match("^x{1,2}?$", "xxx"), None) 324 | 325 | self.assertNotEqual(re.match("^x{3}$", "xxx"), None) 326 | self.assertNotEqual(re.match("^x{1,3}$", "xxx"), None) 327 | self.assertNotEqual(re.match("^x{1,4}$", "xxx"), None) 328 | self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) 329 | self.assertNotEqual(re.match("^x{3}?$", "xxx"), None) 330 | self.assertNotEqual(re.match("^x{1,3}?$", "xxx"), None) 331 | self.assertNotEqual(re.match("^x{1,4}?$", "xxx"), None) 332 | self.assertNotEqual(re.match("^x{3,4}?$", "xxx"), None) 333 | 334 | self.assertEqual(re.match("^x{}$", "xxx"), None) 335 | self.assertNotEqual(re.match("^x{}$", "x{}"), None) 336 | 337 | def test_getattr(self): 338 | self.assertEqual(re.match("(a)", "a").pos, 0) 339 | self.assertEqual(re.match("(a)", "a").endpos, 1) 340 | self.assertEqual(re.match("(a)", "a").string, "a") 341 | self.assertEqual(re.match("(a)", "a").regs, ((0, 1), (0, 1))) 342 | self.assertNotEqual(re.match("(a)", "a").re, None) 343 | 344 | def test_special_escapes(self): 345 | self.assertEqual(re.search(r"\b(b.)\b", 346 | "abcd abc bcd bx").group(1), "bx") 347 | self.assertEqual(re.search(r"\B(b.)\B", 348 | "abc bcd bc abxd").group(1), "bx") 349 | 
self.assertEqual(re.search(r"\b(b.)\b", 350 | "abcd abc bcd bx", re.LOCALE).group(1), "bx") 351 | self.assertEqual(re.search(r"\B(b.)\B", 352 | "abc bcd bc abxd", re.LOCALE).group(1), "bx") 353 | self.assertEqual(re.search(r"\b(b.)\b", 354 | "abcd abc bcd bx", re.UNICODE).group(1), "bx") 355 | self.assertEqual(re.search(r"\B(b.)\B", 356 | "abc bcd bc abxd", re.UNICODE).group(1), "bx") 357 | self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") 358 | self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") 359 | self.assertEqual(re.search(r"^\Aabc\Z$", "\nabc\n", re.M), None) 360 | self.assertEqual(re.search(r"\b(b.)\b", 361 | u"abcd abc bcd bx").group(1), "bx") 362 | self.assertEqual(re.search(r"\B(b.)\B", 363 | u"abc bcd bc abxd").group(1), "bx") 364 | self.assertEqual(re.search(r"^abc$", u"\nabc\n", re.M).group(0), "abc") 365 | self.assertEqual(re.search(r"^\Aabc\Z$", u"abc", re.M).group(0), "abc") 366 | self.assertEqual(re.search(r"^\Aabc\Z$", u"\nabc\n", re.M), None) 367 | self.assertEqual(re.search(r"\d\D\w\W\s\S", 368 | "1aa! a").group(0), "1aa! a") 369 | self.assertEqual(re.search(r"\d\D\w\W\s\S", 370 | "1aa! a", re.LOCALE).group(0), "1aa! a") 371 | self.assertEqual(re.search(r"\d\D\w\W\s\S", 372 | "1aa! a", re.UNICODE).group(0), "1aa! 
a") 373 | 374 | def test_bigcharset(self): 375 | self.assertEqual(re.match(u"([\u2222\u2223])", 376 | u"\u2222").group(1), u"\u2222") 377 | self.assertEqual(re.match(u"([\u2222\u2223])", 378 | u"\u2222", re.UNICODE).group(1), u"\u2222") 379 | 380 | def test_anyall(self): 381 | self.assertEqual(re.match("a.b", "a\nb", re.DOTALL).group(0), 382 | "a\nb") 383 | self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), 384 | "a\n\nb") 385 | 386 | def test_non_consuming(self): 387 | self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") 388 | self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") 389 | self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") 390 | self.assertEqual(re.match("(a(?=\s[abc]*))", "a bc").group(1), "a") 391 | self.assertEqual(re.match(r"(a)(?=\s\1)", "a a").group(1), "a") 392 | self.assertEqual(re.match(r"(a)(?=\s\1*)", "a aa").group(1), "a") 393 | self.assertEqual(re.match(r"(a)(?=\s(abc|a))", "a a").group(1), "a") 394 | 395 | self.assertEqual(re.match(r"(a(?!\s[^a]))", "a a").group(1), "a") 396 | self.assertEqual(re.match(r"(a(?!\s[abc]))", "a d").group(1), "a") 397 | self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") 398 | self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") 399 | 400 | def test_ignore_case(self): 401 | self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 402 | self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 403 | self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b") 404 | self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb") 405 | self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b") 406 | self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb") 407 | self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a") 408 | self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa") 409 | self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", 
re.I).group(1), "a a") 410 | self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") 411 | 412 | def test_category(self): 413 | self.assertEqual(re.match(r"(\s)", " ").group(1), " ") 414 | 415 | def test_getlower(self): 416 | import _sre 417 | self.assertEqual(_sre.getlower(ord('A'), 0), ord('a')) 418 | self.assertEqual(_sre.getlower(ord('A'), re.LOCALE), ord('a')) 419 | self.assertEqual(_sre.getlower(ord('A'), re.UNICODE), ord('a')) 420 | 421 | self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") 422 | self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC") 423 | 424 | def test_not_literal(self): 425 | self.assertEqual(re.search("\s([^a])", " b").group(1), "b") 426 | self.assertEqual(re.search("\s([^a]*)", " bb").group(1), "bb") 427 | 428 | def test_search_coverage(self): 429 | self.assertEqual(re.search("\s(b)", " b").group(1), "b") 430 | self.assertEqual(re.search("a\s", "a ").group(0), "a ") 431 | 432 | def test_re_escape(self): 433 | p="" 434 | # This had to change from the original test of range(0,256) 435 | # because we can't support non-ascii non-utf8 strings 436 | for i in range(0, 128): 437 | p = p + chr(i) 438 | self.assertEqual(re.match(re.escape(chr(i)), chr(i)) is not None, 439 | True) 440 | self.assertEqual(re.match(re.escape(chr(i)), chr(i)).span(), (0,1)) 441 | 442 | pat=re.compile(re.escape(p)) 443 | self.assertEqual(pat.match(p) is not None, True) 444 | self.assertEqual(pat.match(p).span(), (0,128)) 445 | 446 | def test_pickling(self): 447 | import pickle 448 | self.pickle_test(pickle) 449 | import cPickle 450 | self.pickle_test(cPickle) 451 | # old pickles expect the _compile() reconstructor in sre module 452 | import_module("sre", deprecated=True) 453 | from sre import _compile 454 | 455 | def pickle_test(self, pickle): 456 | oldpat = re.compile('a(?:b|(c|e){1,2}?|d)+?(.)') 457 | s = pickle.dumps(oldpat) 458 | newpat = pickle.loads(s) 459 | self.assertEqual(oldpat, newpat) 460 | 461 | def 
test_constants(self): 462 | self.assertEqual(re.I, re.IGNORECASE) 463 | self.assertEqual(re.L, re.LOCALE) 464 | self.assertEqual(re.M, re.MULTILINE) 465 | self.assertEqual(re.S, re.DOTALL) 466 | self.assertEqual(re.X, re.VERBOSE) 467 | 468 | def test_flags(self): 469 | for flag in [re.I, re.M, re.X, re.S, re.L]: 470 | self.assertNotEqual(re.compile('^pattern$', flag), None) 471 | 472 | def test_sre_character_literals(self): 473 | for i in [0, 8, 16, 32, 64, 127, 128, 255]: 474 | self.assertNotEqual(re.match(r"\%03o" % i, chr(i)), None) 475 | self.assertNotEqual(re.match(r"\%03o0" % i, chr(i)+"0"), None) 476 | self.assertNotEqual(re.match(r"\%03o8" % i, chr(i)+"8"), None) 477 | self.assertNotEqual(re.match(r"\x%02x" % i, chr(i)), None) 478 | self.assertNotEqual(re.match(r"\x%02x0" % i, chr(i)+"0"), None) 479 | self.assertNotEqual(re.match(r"\x%02xz" % i, chr(i)+"z"), None) 480 | self.assertRaises(re.error, re.match, "\911", "") 481 | 482 | def test_sre_character_class_literals(self): 483 | for i in [0, 8, 16, 32, 64, 127, 128, 255]: 484 | self.assertNotEqual(re.match(r"[\%03o]" % i, chr(i)), None) 485 | self.assertNotEqual(re.match(r"[\%03o0]" % i, chr(i)), None) 486 | self.assertNotEqual(re.match(r"[\%03o8]" % i, chr(i)), None) 487 | self.assertNotEqual(re.match(r"[\x%02x]" % i, chr(i)), None) 488 | self.assertNotEqual(re.match(r"[\x%02x0]" % i, chr(i)), None) 489 | self.assertNotEqual(re.match(r"[\x%02xz]" % i, chr(i)), None) 490 | self.assertRaises(re.error, re.match, "[\911]", "") 491 | 492 | def test_bug_113254(self): 493 | self.assertEqual(re.match(r'(a)|(b)', 'b').start(1), -1) 494 | self.assertEqual(re.match(r'(a)|(b)', 'b').end(1), -1) 495 | self.assertEqual(re.match(r'(a)|(b)', 'b').span(1), (-1, -1)) 496 | 497 | def test_bug_527371(self): 498 | # bug described in patches 527371/672491 499 | self.assertEqual(re.match(r'(a)?a','a').lastindex, None) 500 | self.assertEqual(re.match(r'(a)(b)?b','ab').lastindex, 1) 501 | 
self.assertEqual(re.match(r'(?P<a>a)(?P<b>b)?b','ab').lastgroup, 'a') 502 | self.assertEqual(re.match("(?P<a>a(b))", "ab").lastgroup, 'a') 503 | self.assertEqual(re.match("((a))", "a").lastindex, 1) 504 | 505 | def test_bug_545855(self): 506 | # bug 545855 -- This pattern failed to cause a compile error as it 507 | # should, instead provoking a TypeError. 508 | self.assertRaises(re.error, re.compile, 'foo[a-') 509 | 510 | def test_bug_418626(self): 511 | # bugs 418626 at al. -- Testing Greg Chapman's addition of op code 512 | # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of 513 | # pattern '*?' on a long string. 514 | self.assertEqual(re.match('.*?c', 10000*'ab'+'cd').end(0), 20001) 515 | self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), 516 | 20003) 517 | self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) 518 | # non-simple '*?' still used to hit the recursion limit, before the 519 | # non-recursive scheme was implemented. 520 | self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001) 521 | 522 | def test_bug_612074(self): 523 | pat=u"["+re.escape(u"\u2039")+u"]" 524 | self.assertEqual(re.compile(pat) and 1, 1) 525 | 526 | def test_stack_overflow(self): 527 | # nasty cases that used to overflow the straightforward recursive 528 | # implementation of repeated groups. 
529 | self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x') 530 | self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x') 531 | self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x') 532 | 533 | def test_scanner(self): 534 | def s_ident(scanner, token): return token 535 | def s_operator(scanner, token): return "op%s" % token 536 | def s_float(scanner, token): return float(token) 537 | def s_int(scanner, token): return int(token) 538 | 539 | scanner = Scanner([ 540 | (r"[a-zA-Z_]\w*", s_ident), 541 | (r"\d+\.\d*", s_float), 542 | (r"\d+", s_int), 543 | (r"=|\+|-|\*|/", s_operator), 544 | (r"\s+", None), 545 | ]) 546 | 547 | self.assertNotEqual(scanner.scanner.scanner("").pattern, None) 548 | 549 | self.assertEqual(scanner.scan("sum = 3*foo + 312.50 + bar"), 550 | (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5, 551 | 'op+', 'bar'], '')) 552 | 553 | def test_bug_448951(self): 554 | # bug 448951 (similar to 429357, but with single char match) 555 | # (Also test greedy matches.) 
556 | for op in '','?','*': 557 | self.assertEqual(re.match(r'((.%s):)?z'%op, 'z').groups(), 558 | (None, None)) 559 | self.assertEqual(re.match(r'((.%s):)?z'%op, 'a:z').groups(), 560 | ('a:', 'a')) 561 | 562 | def test_bug_725106(self): 563 | # capturing groups in alternatives in repeats 564 | self.assertEqual(re.match('^((a)|b)*', 'abc').groups(), 565 | ('b', 'a')) 566 | self.assertEqual(re.match('^(([ab])|c)*', 'abc').groups(), 567 | ('c', 'b')) 568 | self.assertEqual(re.match('^((d)|[ab])*', 'abc').groups(), 569 | ('b', None)) 570 | self.assertEqual(re.match('^((a)c|[ab])*', 'abc').groups(), 571 | ('b', None)) 572 | self.assertEqual(re.match('^((a)|b)*?c', 'abc').groups(), 573 | ('b', 'a')) 574 | self.assertEqual(re.match('^(([ab])|c)*?d', 'abcd').groups(), 575 | ('c', 'b')) 576 | self.assertEqual(re.match('^((d)|[ab])*?c', 'abc').groups(), 577 | ('b', None)) 578 | self.assertEqual(re.match('^((a)c|[ab])*?c', 'abc').groups(), 579 | ('b', None)) 580 | 581 | def test_bug_725149(self): 582 | # mark_stack_base restoring before restoring marks 583 | self.assertEqual(re.match('(a)(?:(?=(b)*)c)*', 'abb').groups(), 584 | ('a', None)) 585 | self.assertEqual(re.match('(a)((?!(b)*))*', 'abb').groups(), 586 | ('a', None, None)) 587 | 588 | def test_bug_764548(self): 589 | # bug 764548, re.compile() barfs on str/unicode subclasses 590 | try: 591 | unicode 592 | except NameError: 593 | return # no problem if we have no unicode 594 | class my_unicode(unicode): pass 595 | pat = re.compile(my_unicode("abc")) 596 | self.assertEqual(pat.match("xyz"), None) 597 | 598 | def test_finditer(self): 599 | iter = re.finditer(r":+", "a:b::c:::d") 600 | self.assertEqual([item.group(0) for item in iter], 601 | [":", "::", ":::"]) 602 | 603 | def test_bug_926075(self): 604 | try: 605 | unicode 606 | except NameError: 607 | return # no problem if we have no unicode 608 | self.assert_(re.compile('bug_926075') is not 609 | re.compile(eval("u'bug_926075'"))) 610 | 611 | def 
test_bug_931848(self): 612 | try: 613 | unicode 614 | except NameError: 615 | pass 616 | pattern = eval('u"[\u002E\u3002\uFF0E\uFF61]"') 617 | self.assertEqual(re.compile(pattern).split("a.b.c"), 618 | ['a','b','c']) 619 | 620 | def test_bug_581080(self): 621 | iter = re.finditer(r"\s", "a b") 622 | self.assertEqual(iter.next().span(), (1,2)) 623 | self.assertRaises(StopIteration, iter.next) 624 | 625 | scanner = re.compile(r"\s").scanner("a b") 626 | self.assertEqual(scanner.search().span(), (1, 2)) 627 | self.assertEqual(scanner.search(), None) 628 | 629 | def test_bug_817234(self): 630 | iter = re.finditer(r".*", "asdf") 631 | self.assertEqual(iter.next().span(), (0, 4)) 632 | self.assertEqual(iter.next().span(), (4, 4)) 633 | self.assertRaises(StopIteration, iter.next) 634 | 635 | def test_empty_array(self): 636 | # SF buf 1647541 637 | import array 638 | for typecode in 'cbBuhHiIlLfd': 639 | a = array.array(typecode) 640 | self.assertEqual(re.compile("bla").match(a), None) 641 | self.assertEqual(re.compile("").match(a).groups(), ()) 642 | 643 | def test_inline_flags(self): 644 | # Bug #1700 645 | upper_char = unichr(0x1ea0) # Latin Capital Letter A with Dot Bellow 646 | lower_char = unichr(0x1ea1) # Latin Small Letter A with Dot Bellow 647 | 648 | p = re.compile(upper_char, re.I | re.U) 649 | q = p.match(lower_char) 650 | self.assertNotEqual(q, None) 651 | 652 | p = re.compile(lower_char, re.I | re.U) 653 | q = p.match(upper_char) 654 | self.assertNotEqual(q, None) 655 | 656 | p = re.compile('(?i)' + upper_char, re.U) 657 | q = p.match(lower_char) 658 | self.assertNotEqual(q, None) 659 | 660 | p = re.compile('(?i)' + lower_char, re.U) 661 | q = p.match(upper_char) 662 | self.assertNotEqual(q, None) 663 | 664 | p = re.compile('(?iu)' + upper_char) 665 | q = p.match(lower_char) 666 | self.assertNotEqual(q, None) 667 | 668 | p = re.compile('(?iu)' + lower_char) 669 | q = p.match(upper_char) 670 | self.assertNotEqual(q, None) 671 | 672 | def 
test_dollar_matches_twice(self): 673 | "$ matches the end of string, and just before the terminating \n" 674 | pattern = re.compile('$') 675 | self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#') 676 | self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#') 677 | self.assertEqual(pattern.sub('#', '\n'), '#\n#') 678 | 679 | pattern = re.compile('$', re.MULTILINE) 680 | self.assertEqual(pattern.sub('#', 'a\nb\n' ), 'a#\nb#\n#' ) 681 | self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#') 682 | self.assertEqual(pattern.sub('#', '\n'), '#\n#') 683 | 684 | def test_dealloc(self): 685 | # issue 3299: check for segfault in debug build 686 | import _sre 687 | # the overflow limit is different on wide and narrow builds and it 688 | # depends on the definition of SRE_CODE (see sre.h). 689 | # 2**128 should be big enough to overflow on both. For smaller values 690 | # a RuntimeError is raised instead of OverflowError. 691 | long_overflow = 2**128 692 | self.assertRaises(TypeError, re.finditer, "a", {}) 693 | self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) 694 | 695 | def run_re_tests(): 696 | from re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR 697 | if verbose: 698 | print 'Running re_tests test suite' 699 | else: 700 | # To save time, only run the first and last 10 tests 701 | #tests = tests[:10] + tests[-10:] 702 | pass 703 | 704 | for t in tests: 705 | sys.stdout.flush() 706 | pattern = s = outcome = repl = expected = None 707 | if len(t) == 5: 708 | pattern, s, outcome, repl, expected = t 709 | elif len(t) == 3: 710 | pattern, s, outcome = t 711 | else: 712 | raise ValueError, ('Test tuples should have 3 or 5 fields', t) 713 | 714 | try: 715 | obj = re.compile(pattern) 716 | except re.error: 717 | if outcome == SYNTAX_ERROR: pass # Expected a syntax error 718 | else: 719 | print '=== Syntax error:', t 720 | except KeyboardInterrupt: raise KeyboardInterrupt 721 | except: 722 | print '*** Unexpected error ***', t 723 | 
if verbose: 724 | traceback.print_exc(file=sys.stdout) 725 | else: 726 | try: 727 | result = obj.search(s) 728 | except re.error, msg: 729 | print '=== Unexpected exception', t, repr(msg) 730 | if outcome == SYNTAX_ERROR: 731 | # This should have been a syntax error; forget it. 732 | pass 733 | elif outcome == FAIL: 734 | if result is None: pass # No match, as expected 735 | else: print '=== Succeeded incorrectly', t 736 | elif outcome == SUCCEED: 737 | if result is not None: 738 | # Matched, as expected, so now we compute the 739 | # result string and compare it to our expected result. 740 | start, end = result.span(0) 741 | vardict={'found': result.group(0), 742 | 'groups': result.group(), 743 | 'flags': result.re.flags} 744 | for i in range(1, 100): 745 | try: 746 | gi = result.group(i) 747 | # Special hack because else the string concat fails: 748 | if gi is None: 749 | gi = "None" 750 | except IndexError: 751 | gi = "Error" 752 | vardict['g%d' % i] = gi 753 | for i in result.re.groupindex.keys(): 754 | try: 755 | gi = result.group(i) 756 | if gi is None: 757 | gi = "None" 758 | except IndexError: 759 | gi = "Error" 760 | vardict[i] = gi 761 | repl = eval(repl, vardict) 762 | if repl != expected: 763 | print '=== grouping error', t, 764 | print repr(repl) + ' should be ' + repr(expected) 765 | else: 766 | print '=== Failed incorrectly', t 767 | 768 | # Try the match on a unicode string, and check that it 769 | # still succeeds. 770 | try: 771 | result = obj.search(unicode(s, "latin-1")) 772 | if result is None: 773 | print '=== Fails on unicode match', t 774 | except NameError: 775 | continue # 1.5.2 776 | except TypeError: 777 | continue # unicode test case 778 | 779 | # Try the match on a unicode pattern, and check that it 780 | # still succeeds. 
781 | obj=re.compile(unicode(pattern, "latin-1")) 782 | result = obj.search(s) 783 | if result is None: 784 | print '=== Fails on unicode pattern match', t 785 | 786 | # Try the match with the search area limited to the extent 787 | # of the match and see if it still succeeds. \B will 788 | # break (because it won't match at the end or start of a 789 | # string), so we'll ignore patterns that feature it. 790 | 791 | if pattern[:2] != '\\B' and pattern[-2:] != '\\B' \ 792 | and result is not None: 793 | obj = re.compile(pattern) 794 | result = obj.search(s, result.start(0), result.end(0) + 1) 795 | if result is None: 796 | print '=== Failed on range-limited match', t 797 | 798 | # Try the match with IGNORECASE enabled, and check that it 799 | # still succeeds. 800 | obj = re.compile(pattern, re.IGNORECASE) 801 | result = obj.search(s) 802 | if result is None: 803 | print '=== Fails on case-insensitive match', t 804 | 805 | # Try the match with LOCALE enabled, and check that it 806 | # still succeeds. 807 | obj = re.compile(pattern, re.LOCALE) 808 | result = obj.search(s) 809 | if result is None: 810 | print '=== Fails on locale-sensitive match', t 811 | 812 | # Try the match with UNICODE locale enabled, and check 813 | # that it still succeeds. 814 | obj = re.compile(pattern, re.UNICODE) 815 | result = obj.search(s) 816 | if result is None: 817 | print '=== Fails on unicode-sensitive match', t 818 | 819 | def test_main(): 820 | run_unittest(ReTests) 821 | run_re_tests() 822 | 823 | if __name__ == "__main__": 824 | test_main() 825 | -------------------------------------------------------------------------------- /src/re2.pyx: -------------------------------------------------------------------------------- 1 | # cython: infer_types(False) 2 | # Import re flags to be compatible. 
def set_fallback_notification(level):
    """Select how falling back to the stdlib ``re`` module is reported.

    *level* must be one of FALLBACK_QUIETLY (0), FALLBACK_WARNING (1)
    or FALLBACK_EXCEPTION (2); any other value raises ValueError.
    """
    global current_notification
    level = int(level)
    if not (0 <= level <= 2):
        raise ValueError("This function expects a valid notification level.")
    current_notification = level
72 | return cpython.unicode.PyUnicode_DecodeUTF8(input, length, 'strict') 73 | 74 | cdef inline object unicode_to_bytestring(object pystring, int * encoded): 75 | # This function will convert a utf8 string to a bytestring object. 76 | if cpython.unicode.PyUnicode_Check(pystring): 77 | pystring = cpython.unicode.PyUnicode_EncodeUTF8(cpython.unicode.PyUnicode_AS_UNICODE(pystring), 78 | cpython.unicode.PyUnicode_GET_SIZE(pystring), 79 | "strict") 80 | encoded[0] = 1 81 | else: 82 | encoded[0] = 0 83 | return pystring 84 | 85 | cdef inline int pystring_to_bytestring(object pystring, char ** cstring, Py_ssize_t * length): 86 | # This function will convert a pystring to a bytesstring, placing 87 | # the char * in cstring, and the length in length. 88 | # First it will try treating it as a str object, but failing that 89 | # it will move to utf-8. If utf8 does not work, then it has to be 90 | # a non-supported encoding. 91 | return _re2.PyObject_AsCharBuffer(pystring, <_re2.const_char_ptr*> cstring, length) 92 | 93 | cdef extern from *: 94 | cdef void emit_ifndef_py_unicode_wide "#if !defined(Py_UNICODE_WIDE) //" () 95 | cdef void emit_endif "#endif //" () 96 | 97 | cdef class Match: 98 | cdef _re2.StringPiece * matches 99 | cdef _re2.const_stringintmap * named_groups 100 | 101 | cdef bint encoded 102 | cdef int _lastindex 103 | cdef int nmatches 104 | cdef int _pos 105 | cdef int _endpos 106 | cdef object match_string 107 | cdef object _pattern_object 108 | cdef tuple _groups 109 | cdef tuple _spans 110 | cdef dict _named_groups 111 | cdef dict _named_indexes 112 | 113 | def __init__(self, object pattern_object, int num_groups): 114 | self._lastindex = -1 115 | self._groups = None 116 | self._pos = 0 117 | self._endpos = -1 118 | self.matches = _re2.new_StringPiece_array(num_groups + 1) 119 | self.nmatches = num_groups 120 | self._pattern_object = pattern_object 121 | 122 | def __dealloc__(self): 123 | _re2.delete_StringPiece_array(self.matches) 124 | 125 | property re: 
def groups(self, default=None):
    """Return a tuple containing all the subgroups of the match (group 1
    onward).

    Groups that did not participate in the match are replaced by
    *default*.  To match the semantics of the stdlib ``re`` module,
    only non-participating groups (None) are substituted; a group that
    matched the empty string is returned as ''.
    """
    self.init_groups()
    if default is not None:
        # Substitute only for None: the previous `g or default` form
        # wrongly replaced empty-string matches with the default too.
        return tuple([default if g is None else g for g in self._groups[1:]])
    return self._groups[1:]
self.groupdict()[groupnum] 200 | 201 | idx = groupnum 202 | 203 | if idx > self.nmatches - 1: 204 | raise IndexError("no such group") 205 | return self._groups[idx] 206 | 207 | cdef object _convert_positions(self, positions): 208 | cdef char * s = self.match_string 209 | cdef int cpos = 0 210 | cdef int upos = 0 211 | cdef int size = len(self.match_string) 212 | cdef int c 213 | 214 | new_positions = [] 215 | i = 0 216 | num_positions = len(positions) 217 | if positions[i] == -1: 218 | new_positions.append(-1) 219 | inc(i) 220 | if i == num_positions: 221 | return new_positions 222 | if positions[i] == 0: 223 | new_positions.append(0) 224 | inc(i) 225 | if i == num_positions: 226 | return new_positions 227 | 228 | while cpos < size: 229 | c = <unsigned char>s[cpos] 230 | if c < 0x80: 231 | inc(cpos) 232 | inc(upos) 233 | elif c < 0xe0: 234 | cpos += 2 235 | inc(upos) 236 | elif c < 0xf0: 237 | cpos += 3 238 | inc(upos) 239 | else: 240 | cpos += 4 241 | inc(upos) 242 | # wide unicode chars get 2 unichars when python is compiled with --enable-unicode=ucs2 243 | # TODO: verify this 244 | emit_ifndef_py_unicode_wide() 245 | inc(upos) 246 | emit_endif() 247 | 248 | if positions[i] == cpos: 249 | new_positions.append(upos) 250 | inc(i) 251 | if i == num_positions: 252 | return new_positions 253 | 254 | def _convert_spans(self, spans): 255 | positions = [x for x,y in spans] + [y for x,y in spans] 256 | positions = sorted(set(positions)) 257 | posdict = dict(zip(positions, self._convert_positions(positions))) 258 | 259 | return [(posdict[x], posdict[y]) for x,y in spans] 260 | 261 | 262 | cdef _make_spans(self): 263 | if self._spans is not None: 264 | return 265 | 266 | cdef int start, end 267 | cdef char * s = self.match_string 268 | cdef _re2.StringPiece * piece 269 | 270 | spans = [] 271 | for i in range(self.nmatches): 272 | if self.matches[i].data() == NULL: 273 | spans.append((-1, -1)) 274 | else: 275 | piece = &self.matches[i] 276 | if piece.data() == NULL: 277 | 
def expand(self, template):
    """Expand a replacement *template* using the groups of this match.

    Supports single-digit numeric references (``\\1`` .. ``\\9``),
    the NUL escape ``\\0``, and named references ``\\g<name>``.  An
    escaped backslash (``\\\\``) yields one literal backslash; any
    other escape is passed through unchanged.
    """
    # TODO - This can be optimized to work a bit faster in C.
    pieces = template.split('\\')
    out = [pieces[0]]
    literal_next = False
    for piece in pieces[1:]:
        if literal_next:
            # The backslash preceding this segment was already consumed
            # as the second half of an escaped backslash.
            out.append(piece)
            literal_next = False
        elif piece == '':
            # Two adjacent backslashes: emit one literal backslash.
            # (The old code crashed here with IndexError on piece[0].)
            out.append('\\')
            literal_next = True
        elif piece[0].isdigit():
            if piece[0] == '0':
                # \0 is the NUL character escape, not a group reference.
                out.append('\x00' + piece[1:])
            else:
                out.append(self.group(int(piece[0])) + piece[1:])
        elif piece[:2] == 'g<' and '>' in piece:
            # Named group reference: \g<name>
            name, rest = piece[2:].split('>', 1)
            out.append(self.group(name) + rest)
        else:
            # Not a recognised template escape; keep it verbatim.
            out.append('\\' + piece)
    return ''.join(out)
self.groupdict() 348 | if group not in self._named_indexes: 349 | raise IndexError("no such group") 350 | return self._spans[self._named_indexes[group]] 351 | 352 | 353 | property lastindex: 354 | def __get__(self): 355 | self.init_groups() 356 | if self._lastindex < 1: 357 | return None 358 | else: 359 | return self._lastindex 360 | 361 | property lastgroup: 362 | def __get__(self): 363 | self.init_groups() 364 | cdef _re2.stringintmapiterator it 365 | 366 | if self._lastindex < 1: 367 | return None 368 | 369 | it = self.named_groups.begin() 370 | while it != self.named_groups.end(): 371 | if deref(it).second == self._lastindex: 372 | return cpp_to_pystring(deref(it).first) 373 | inc(it) 374 | 375 | return None 376 | 377 | 378 | cdef class Pattern: 379 | cdef _re2.RE2 * re_pattern 380 | cdef int ngroups 381 | cdef bint encoded 382 | cdef int _flags 383 | cdef public object pattern 384 | cdef object __weakref__ 385 | 386 | property flags: 387 | def __get__(self): 388 | return self._flags 389 | 390 | property groups: 391 | def __get__(self): 392 | return self.ngroups 393 | 394 | def __dealloc__(self): 395 | del self.re_pattern 396 | 397 | cdef _search(self, string, int pos, int endpos, _re2.re2_Anchor anchoring): 398 | """ 399 | Scan through string looking for a match, and return a corresponding 400 | Match instance. Return None if no position in the string matches. 
401 | """ 402 | cdef Py_ssize_t size 403 | cdef int result 404 | cdef char * cstring 405 | cdef int encoded = 0 406 | cdef _re2.StringPiece * sp 407 | cdef Match m = Match(self, self.ngroups + 1) 408 | 409 | if hasattr(string, 'tostring'): 410 | string = string.tostring() 411 | 412 | string = unicode_to_bytestring(string, &encoded) 413 | 414 | if pystring_to_bytestring(string, &cstring, &size) == -1: 415 | raise TypeError("expected string or buffer") 416 | 417 | if endpos >= 0 and endpos <= pos: 418 | return None 419 | 420 | if endpos >= 0 and endpos < size: 421 | size = endpos 422 | 423 | if pos > size: 424 | return None 425 | 426 | sp = new _re2.StringPiece(cstring, size) 427 | with nogil: 428 | result = self.re_pattern.Match(sp[0], <int>pos, <int>size, anchoring, m.matches, self.ngroups + 1) 429 | 430 | del sp 431 | if result == 0: 432 | return None 433 | m.encoded = <bint>(encoded) 434 | m.named_groups = _re2.addressof(self.re_pattern.NamedCapturingGroups()) 435 | m.nmatches = self.ngroups + 1 436 | m.match_string = string 437 | m._pos = pos 438 | if endpos == -1: 439 | m._endpos = len(string) 440 | else: 441 | m._endpos = endpos 442 | return m 443 | 444 | 445 | def search(self, string, int pos=0, int endpos=-1): 446 | """ 447 | Scan through string looking for a match, and return a corresponding 448 | Match instance. Return None if no position in the string matches. 449 | """ 450 | return self._search(string, pos, endpos, _re2.UNANCHORED) 451 | 452 | 453 | def match(self, string, int pos=0, int endpos=-1): 454 | """ 455 | Matches zero or more characters at the beginning of the string. 
456 | """ 457 | return self._search(string, pos, endpos, _re2.ANCHOR_START) 458 | 459 | cdef _print_pattern(self): 460 | cdef _re2.cpp_string * s 461 | s = <_re2.cpp_string *>_re2.addressofs(self.re_pattern.pattern()) 462 | print cpp_to_pystring(s[0]) + "\n" 463 | sys.stdout.flush() 464 | 465 | 466 | cdef _finditer(self, object string, int pos=0, int endpos=-1, int as_match=0): 467 | cdef Py_ssize_t size 468 | cdef int result 469 | cdef char * cstring 470 | cdef _re2.StringPiece * sp 471 | cdef Match m 472 | cdef list resultlist = [] 473 | cdef int encoded = 0 474 | 475 | string = unicode_to_bytestring(string, &encoded) 476 | if pystring_to_bytestring(string, &cstring, &size) == -1: 477 | raise TypeError("expected string or buffer") 478 | encoded = <bint>encoded 479 | 480 | if endpos != -1 and endpos < size: 481 | size = endpos 482 | 483 | sp = new _re2.StringPiece(cstring, size) 484 | 485 | while True: 486 | m = Match(self, self.ngroups + 1) 487 | with nogil: 488 | result = self.re_pattern.Match(sp[0], <int>pos, <int>size, _re2.UNANCHORED, m.matches, self.ngroups + 1) 489 | if result == 0: 490 | break 491 | m.encoded = encoded 492 | m.named_groups = _re2.addressof(self.re_pattern.NamedCapturingGroups()) 493 | m.nmatches = self.ngroups + 1 494 | m.match_string = string 495 | m._pos = pos 496 | if endpos == -1: 497 | m._endpos = len(string) 498 | else: 499 | m._endpos = endpos 500 | if as_match: 501 | if self.ngroups > 1: 502 | resultlist.append(m.groups("")) 503 | else: 504 | resultlist.append(m.group(self.ngroups)) 505 | else: 506 | resultlist.append(m) 507 | if pos == size: 508 | break 509 | # offset the pos to move to the next point 510 | if m.matches[0].length() == 0: 511 | pos += 1 512 | else: 513 | pos = m.matches[0].data() - cstring + m.matches[0].length() 514 | del sp 515 | return resultlist 516 | 517 | def finditer(self, object string, int pos=0, int endpos=-1): 518 | """ 519 | Return all non-overlapping matches of pattern in string as a list 520 | of 
match objects. 521 | """ 522 | # TODO This builds a list and returns its iterator. Probably could be more memory efficient 523 | return self._finditer(string, pos, endpos, 0).__iter__() 524 | 525 | def findall(self, object string, int pos=0, int endpos=-1): 526 | """ 527 | Return all non-overlapping matches of pattern in string as a list 528 | of strings. 529 | """ 530 | return self._finditer(string, pos, endpos, 1) 531 | 532 | def split(self, string, int maxsplit=0): 533 | """ 534 | split(string[, maxsplit = 0]) --> list 535 | Split a string by the occurances of the pattern. 536 | """ 537 | cdef Py_ssize_t size 538 | cdef int num_groups = 1 539 | cdef int result 540 | cdef int endpos 541 | cdef int pos = 0 542 | cdef int lookahead = 0 543 | cdef int num_split = 0 544 | cdef char * cstring 545 | cdef _re2.StringPiece * sp 546 | cdef _re2.StringPiece * matches 547 | cdef Match m 548 | cdef list resultlist = [] 549 | cdef int encoded = 0 550 | 551 | if maxsplit < 0: 552 | maxsplit = 0 553 | 554 | string = unicode_to_bytestring(string, &encoded) 555 | if pystring_to_bytestring(string, &cstring, &size) == -1: 556 | raise TypeError("expected string or buffer") 557 | 558 | encoded = <bint>encoded 559 | 560 | matches = _re2.new_StringPiece_array(self.ngroups + 1) 561 | sp = new _re2.StringPiece(cstring, size) 562 | 563 | while True: 564 | with nogil: 565 | result = self.re_pattern.Match(sp[0], <int>(pos + lookahead), <int>size, _re2.UNANCHORED, matches, self.ngroups + 1) 566 | if result == 0: 567 | break 568 | 569 | match_start = matches[0].data() - cstring 570 | match_end = match_start + matches[0].length() 571 | 572 | # If an empty match, just look ahead until you find something 573 | if match_start == match_end: 574 | if pos + lookahead == size: 575 | break 576 | lookahead += 1 577 | continue 578 | 579 | if encoded: 580 | resultlist.append(char_to_utf8(&sp.data()[pos], match_start - pos)) 581 | else: 582 | resultlist.append(sp.data()[pos:match_start]) 583 | if 
self.ngroups > 0: 584 | for group in range(self.ngroups): 585 | if matches[group + 1].data() == NULL: 586 | resultlist.append(None) 587 | else: 588 | if encoded: 589 | resultlist.append(char_to_utf8(matches[group + 1].data(), matches[group + 1].length())) 590 | else: 591 | resultlist.append(matches[group + 1].data()[:matches[group + 1].length()]) 592 | 593 | # offset the pos to move to the next point 594 | pos = match_end 595 | lookahead = 0 596 | 597 | num_split += 1 598 | if maxsplit and num_split >= maxsplit: 599 | break 600 | 601 | if encoded: 602 | resultlist.append(char_to_utf8(&sp.data()[pos], sp.length() - pos)) 603 | else: 604 | resultlist.append(sp.data()[pos:]) 605 | _re2.delete_StringPiece_array(matches) 606 | del sp 607 | return resultlist 608 | 609 | def sub(self, repl, string, int count=0): 610 | """ 611 | sub(repl, string[, count = 0]) --> newstring 612 | Return the string obtained by replacing the leftmost non-overlapping 613 | occurrences of pattern in string by the replacement repl. 614 | """ 615 | return self.subn(repl, string, count)[0] 616 | 617 | def subn(self, repl, string, int count=0): 618 | """ 619 | subn(repl, string[, count = 0]) --> (newstring, number of subs) 620 | Return the tuple (new_string, number_of_subs_made) found by replacing 621 | the leftmost non-overlapping occurrences of pattern with the 622 | replacement repl. 
623 | """ 624 | cdef Py_ssize_t size 625 | cdef char * cstring 626 | cdef _re2.cpp_string * fixed_repl 627 | cdef _re2.StringPiece * sp 628 | cdef _re2.cpp_string * input_str 629 | cdef total_replacements = 0 630 | cdef int string_encoded = 0 631 | cdef int repl_encoded = 0 632 | cdef int encoded = 0 633 | 634 | if callable(repl): 635 | # This is a callback, so let's use the custom function 636 | return self._subn_callback(repl, string, count) 637 | 638 | string = unicode_to_bytestring(string, &string_encoded) 639 | repl = unicode_to_bytestring(repl, &repl_encoded) 640 | if pystring_to_bytestring(repl, &cstring, &size) == -1: 641 | raise TypeError("expected string or buffer") 642 | 643 | fixed_repl = NULL 644 | cdef _re2.const_char_ptr s = cstring 645 | cdef _re2.const_char_ptr end = s + size 646 | cdef int c = 0 647 | while s < end: 648 | c = s[0] 649 | if (c == '\\'): 650 | s += 1 651 | if s == end: 652 | raise RegexError("Invalid rewrite pattern") 653 | c = s[0] 654 | if c == '\\' or (c >= '0' and c <= '9'): 655 | if fixed_repl != NULL: 656 | fixed_repl.push_back('\\') 657 | fixed_repl.push_back(c) 658 | else: 659 | if fixed_repl == NULL: 660 | fixed_repl = new _re2.cpp_string(cstring, s - cstring - 1) 661 | if c == 'n': 662 | fixed_repl.push_back('\n') 663 | else: 664 | fixed_repl.push_back('\\') 665 | fixed_repl.push_back('\\') 666 | fixed_repl.push_back(c) 667 | else: 668 | if fixed_repl != NULL: 669 | fixed_repl.push_back(c) 670 | 671 | s += 1 672 | if fixed_repl != NULL: 673 | sp = new _re2.StringPiece(fixed_repl.c_str()) 674 | else: 675 | sp = new _re2.StringPiece(cstring, size) 676 | 677 | input_str = new _re2.cpp_string(string) 678 | if not count: 679 | total_replacements = _re2.pattern_GlobalReplace(input_str, 680 | self.re_pattern[0], 681 | sp[0]) 682 | elif count == 1: 683 | total_replacements = _re2.pattern_Replace(input_str, 684 | self.re_pattern[0], 685 | sp[0]) 686 | else: 687 | del fixed_repl 688 | del input_str 689 | del sp 690 | raise 
NotImplementedError("So far pyre2 does not support custom replacement counts") 691 | 692 | if string_encoded or (repl_encoded and total_replacements > 0): 693 | result = cpp_to_utf8(input_str[0]) 694 | else: 695 | result = cpp_to_pystring(input_str[0]) 696 | del fixed_repl 697 | del input_str 698 | del sp 699 | return (result, total_replacements) 700 | 701 | def _subn_callback(self, callback, string, int count=0): 702 | """ 703 | This function is probably the hardest to implement correctly. 704 | This is my first attempt, but if anybody has a better solution, please help out. 705 | """ 706 | cdef Py_ssize_t size 707 | cdef int result 708 | cdef int endpos 709 | cdef int pos = 0 710 | cdef int encoded = 0 711 | cdef int num_repl = 0 712 | cdef char * cstring 713 | cdef _re2.StringPiece * sp 714 | cdef Match m 715 | cdef list resultlist = [] 716 | 717 | if count < 0: 718 | count = 0 719 | 720 | string = unicode_to_bytestring(string, &encoded) 721 | if pystring_to_bytestring(string, &cstring, &size) == -1: 722 | raise TypeError("expected string or buffer") 723 | encoded = <bint>encoded 724 | 725 | sp = new _re2.StringPiece(cstring, size) 726 | 727 | try: 728 | while True: 729 | m = Match(self, self.ngroups + 1) 730 | with nogil: 731 | result = self.re_pattern.Match(sp[0], <int>pos, <int>size, _re2.UNANCHORED, m.matches, self.ngroups + 1) 732 | if result == 0: 733 | break 734 | 735 | endpos = m.matches[0].data() - cstring 736 | if encoded: 737 | resultlist.append(char_to_utf8(&sp.data()[pos], endpos - pos)) 738 | else: 739 | resultlist.append(sp.data()[pos:endpos]) 740 | pos = endpos + m.matches[0].length() 741 | 742 | m.encoded = encoded 743 | m.named_groups = _re2.addressof(self.re_pattern.NamedCapturingGroups()) 744 | m.nmatches = self.ngroups + 1 745 | m.match_string = string 746 | resultlist.append(callback(m) or '') 747 | 748 | num_repl += 1 749 | if count and num_repl >= count: 750 | break 751 | 752 | if encoded: 753 | 
class Tokenizer:
    """Single-token lookahead scanner used when rewriting patterns.

    The upcoming token is always available in ``self.next`` (None once
    the input is exhausted).  A backslash escape such as ``\\d`` is
    treated as one two-character token.
    """

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__advance()

    def __advance(self):
        # Load the next token into self.next, consuming it from the input.
        if self.index >= len(self.string):
            self.next = None
            return
        token = self.string[self.index]
        if token[0] == "\\":
            # Escapes span two characters; a lone trailing backslash
            # is malformed input.
            try:
                token += self.string[self.index + 1]
            except IndexError:
                raise RegexError("bogus escape (end of line)")
        self.index += len(token)
        self.next = token

    def get(self):
        # Return the current token and advance the lookahead.
        current = self.next
        self.__advance()
        return current
+ strflags + ')') 821 | 822 | while 1: 823 | this = source.get() 824 | if this is None: 825 | break 826 | if flags & _X: 827 | if this in WHITESPACE: 828 | continue 829 | if this == "#": 830 | while 1: 831 | this = source.get() 832 | if this in (None, "\n"): 833 | break 834 | continue 835 | 836 | if this[0] not in '[\\': 837 | new_pattern.append(this) 838 | continue 839 | 840 | elif this == '[': 841 | new_pattern.append(this) 842 | while 1: 843 | this = source.get() 844 | if this is None: 845 | raise RegexError, "unexpected end of regular expression" 846 | elif this == ']': 847 | new_pattern.append(this) 848 | break 849 | elif this[0] == '\\': 850 | if flags & _U: 851 | if this[1] == 'd': 852 | new_pattern.append(r'\p{Nd}') 853 | elif this[1] == 'w': 854 | new_pattern.append(r'_\p{L}\p{Nd}') 855 | elif this[1] == 's': 856 | new_pattern.append(r'\s\p{Z}') 857 | elif this[1] == 'D': 858 | new_pattern.append(r'\P{Nd}') 859 | elif this[1] == 'W': 860 | # Since \w and \s are made out of several character groups, 861 | # I don't see a way to convert their complements into a group 862 | # without rewriting the whole expression, which seems too complicated. 
863 | 864 | raise CharClassProblemException() 865 | elif this[1] == 'S': 866 | raise CharClassProblemException() 867 | else: 868 | new_pattern.append(this) 869 | else: 870 | new_pattern.append(this) 871 | else: 872 | new_pattern.append(this) 873 | elif this[0] == '\\': 874 | if this[1] in '89': 875 | raise BackreferencesException() 876 | elif this[1] in '1234567': 877 | if source.next and source.next in '1234567': 878 | this += source.get() 879 | if source.next and source.next in '1234567': 880 | # all clear, this is an octal escape 881 | new_pattern.append(this) 882 | else: 883 | raise BackreferencesException() 884 | else: 885 | raise BackreferencesException() 886 | elif flags & _U: 887 | if this[1] == 'd': 888 | new_pattern.append(r'\p{Nd}') 889 | elif this[1] == 'w': 890 | new_pattern.append(r'[_\p{L}\p{Nd}]') 891 | elif this[1] == 's': 892 | new_pattern.append(r'[\s\p{Z}]') 893 | elif this[1] == 'D': 894 | new_pattern.append(r'[^\p{Nd}]') 895 | elif this[1] == 'W': 896 | new_pattern.append(r'[^_\p{L}\p{Nd}]') 897 | elif this[1] == 'S': 898 | new_pattern.append(r'[^\s\p{Z}]') 899 | else: 900 | new_pattern.append(this) 901 | else: 902 | new_pattern.append(this) 903 | 904 | return ''.join(new_pattern) 905 | 906 | 907 | 908 | def _compile(pattern, int flags=0, int max_mem=8388608): 909 | """ 910 | Compile a regular expression pattern, returning a pattern object. 
911 | """ 912 | cdef char * string 913 | cdef Py_ssize_t length 914 | cdef _re2.StringPiece * s 915 | cdef _re2.Options opts 916 | cdef int error_code 917 | cdef int encoded = 0 918 | 919 | if isinstance(pattern, (Pattern, SREPattern)): 920 | if flags: 921 | raise ValueError('Cannot process flags argument with a compiled pattern') 922 | return pattern 923 | 924 | cdef object original_pattern = pattern 925 | try: 926 | pattern = prepare_pattern(original_pattern, flags) 927 | except BackreferencesException: 928 | error_msg = "Backreferences not supported" 929 | if current_notification == <int>FALLBACK_EXCEPTION: 930 | # Raise an exception regardless of the type of error. 931 | raise RegexError(error_msg) 932 | elif current_notification == <int>FALLBACK_WARNING: 933 | warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) 934 | return re.compile(original_pattern, flags) 935 | except CharClassProblemException: 936 | error_msg = "\W and \S not supported inside character classes" 937 | if current_notification == <int>FALLBACK_EXCEPTION: 938 | # Raise an exception regardless of the type of error. 939 | raise RegexError(error_msg) 940 | elif current_notification == <int>FALLBACK_WARNING: 941 | warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) 942 | return re.compile(original_pattern, flags) 943 | 944 | # Set the options given the flags above. 945 | if flags & _I: 946 | opts.set_case_sensitive(0); 947 | 948 | opts.set_max_mem(max_mem) 949 | opts.set_log_errors(0) 950 | opts.set_encoding(_re2.EncodingUTF8) 951 | 952 | # We use this function to get the proper length of the string. 
953 | 954 | pattern = unicode_to_bytestring(pattern, &encoded) 955 | if pystring_to_bytestring(pattern, &string, &length) == -1: 956 | raise TypeError("first argument must be a string or compiled pattern") 957 | 958 | s = new _re2.StringPiece(string, length) 959 | 960 | cdef _re2.RE2 *re_pattern 961 | with nogil: 962 | re_pattern = new _re2.RE2(s[0], opts) 963 | 964 | if not re_pattern.ok(): 965 | # Something went wrong with the compilation. 966 | del s 967 | error_msg = cpp_to_pystring(re_pattern.error()) 968 | error_code = re_pattern.error_code() 969 | del re_pattern 970 | if current_notification == <int>FALLBACK_EXCEPTION: 971 | # Raise an exception regardless of the type of error. 972 | raise RegexError(error_msg) 973 | elif error_code not in (_re2.ErrorBadPerlOp, _re2.ErrorRepeatSize, 974 | _re2.ErrorBadEscape): 975 | # Raise an error because these will not be fixed by using the 976 | # ``re`` module. 977 | raise RegexError(error_msg) 978 | elif current_notification == <int>FALLBACK_WARNING: 979 | warnings.warn("WARNING: Using re module. Reason: %s" % error_msg) 980 | return re.compile(original_pattern, flags) 981 | 982 | cdef Pattern pypattern = Pattern() 983 | pypattern.pattern = original_pattern 984 | pypattern.re_pattern = re_pattern 985 | pypattern.ngroups = re_pattern.NumberOfCapturingGroups() 986 | pypattern.encoded = <bint>encoded 987 | pypattern._flags = flags 988 | del s 989 | return pypattern 990 | 991 | def search(pattern, string, int flags=0): 992 | """ 993 | Scan through string looking for a match to the pattern, returning 994 | a match object or none if no match was found. 995 | """ 996 | return compile(pattern, flags).search(string) 997 | 998 | def match(pattern, string, int flags=0): 999 | """ 1000 | Try to apply the pattern at the start of the string, returning 1001 | a match object, or None if no match was found. 
1002 | """ 1003 | return compile(pattern, flags).match(string) 1004 | 1005 | def finditer(pattern, string, int flags=0): 1006 | """ 1007 | Return an list of all non-overlapping matches in the 1008 | string. For each match, the iterator returns a match object. 1009 | 1010 | Empty matches are included in the result. 1011 | """ 1012 | return compile(pattern, flags).finditer(string) 1013 | 1014 | def findall(pattern, string, int flags=0): 1015 | """ 1016 | Return an list of all non-overlapping matches in the 1017 | string. For each match, the iterator returns a match object. 1018 | 1019 | Empty matches are included in the result. 1020 | """ 1021 | return compile(pattern, flags).findall(string) 1022 | 1023 | def split(pattern, string, int maxsplit=0): 1024 | """ 1025 | Split the source string by the occurrences of the pattern, 1026 | returning a list containing the resulting substrings. 1027 | """ 1028 | return compile(pattern).split(string, maxsplit) 1029 | 1030 | def sub(pattern, repl, string, int count=0): 1031 | """ 1032 | Return the string obtained by replacing the leftmost 1033 | non-overlapping occurrences of the pattern in string by the 1034 | replacement repl. repl can be either a string or a callable; 1035 | if a string, backslash escapes in it are processed. If it is 1036 | a callable, it's passed the match object and must return 1037 | a replacement string to be used. 1038 | """ 1039 | return compile(pattern).sub(repl, string, count) 1040 | 1041 | def subn(pattern, repl, string, int count=0): 1042 | """ 1043 | Return a 2-tuple containing (new_string, number). 1044 | new_string is the string obtained by replacing the leftmost 1045 | non-overlapping occurrences of the pattern in the source 1046 | string by the replacement repl. number is the number of 1047 | substitutions that were made. repl can be either a string or a 1048 | callable; if a string, backslash escapes in it are processed. 
1049 | If it is a callable, it's passed the match object and must 1050 | return a replacement string to be used. 1051 | """ 1052 | return compile(pattern).subn(repl, string, count) 1053 | 1054 | _alphanum = {} 1055 | for c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890': 1056 | _alphanum[c] = 1 1057 | del c 1058 | 1059 | def escape(pattern): 1060 | "Escape all non-alphanumeric characters in pattern." 1061 | s = list(pattern) 1062 | alphanum = _alphanum 1063 | for i in range(len(pattern)): 1064 | c = pattern[i] 1065 | if ord(c) < 0x80 and c not in alphanum: 1066 | if c == "\000": 1067 | s[i] = "\\000" 1068 | else: 1069 | s[i] = "\\" + c 1070 | return pattern[:0].join(s) 1071 | 1072 | --------------------------------------------------------------------------------