├── .github
    └── workflows
    │   └── main.yml
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── changelog.txt
├── docs
    ├── Features.html
    └── UnicodeProperties.rst
├── pyproject.toml
├── regex_3
    ├── __init__.py
    ├── _regex.c
    ├── _regex.h
    ├── _regex_core.py
    ├── _regex_unicode.c
    ├── _regex_unicode.h
    ├── regex.py
    └── test_regex.py
├── setup.py
└── tools
    └── build_regex_unicode.py


/.github/workflows/main.yml:
--------------------------------------------------------------------------------
  1 | # Build binary wheels and a source distribution, then publish them to PyPI.
  2 | name: cibuildwheel
  3 | 
  4 | # Triggered by pushing a tag (any tag name).
  5 | on:
  6 |   push:
  7 |     tags:
  8 |       - '*'
  9 | 
 10 | env:
 11 |   PYTHON_VER: '3.11'  # Python used to run the tests and cibuildwheel itself
 12 |   CIBW_BUILD: 'cp39-* cp310-* cp311-* cp312-* cp313-*'
 13 |   CIBW_TEST_COMMAND: 'python -m unittest regex.test_regex'
 14 | 
 15 | jobs:
 16 |   # Run the test suite on Ubuntu/macOS/Windows for every pushed tag.
 17 |   run_test:
 18 |     name: Run test on ${{ matrix.platform }}
 19 |     runs-on: ${{ matrix.platform }}
 20 | 
 21 |     strategy:
 22 |       matrix:
 23 |         platform: [ubuntu-latest, macos-latest, windows-latest]
 24 | 
 25 |     steps:
 26 |       - uses: actions/checkout@v4
 27 |       - uses: actions/setup-python@v5
 28 |         with:
 29 |           python-version: ${{ env.PYTHON_VER }}
 30 | 
 31 |       - name: Run test
 32 |         run: |
 33 |           python -m pip install -vv .
 34 |           python -m unittest -v regex.test_regex
 35 | 
 36 |   # Build Linux/macOS/Windows wheels, one job per runner platform.
 37 |   build_wheels:
 38 |     name: Build ${{ matrix.platform }} wheels
 39 |     if: github.event_name == 'push'
 40 |     runs-on: ${{ matrix.platform }}
 41 | 
 42 |     strategy:
 43 |       matrix:
 44 |         platform: [ubuntu-latest, macos-latest, windows-latest]
 45 | 
 46 |     env:
 47 |       # Architectures to build on macOS
 48 |       CIBW_ARCHS_MACOS: "x86_64 arm64 universal2"
 49 |       # Architectures to build on Windows
 50 |       CIBW_ARCHS_WINDOWS: "AMD64 x86 ARM64"
 51 | 
 52 |     steps:
 53 |       - uses: actions/checkout@v4
 54 |       - uses: actions/setup-python@v5
 55 |         with:
 56 |           python-version: ${{ env.PYTHON_VER }}
 57 | 
 58 |       - name: Install cibuildwheel & build wheels
 59 |         run: |
 60 |           python -m pip install -U cibuildwheel
 61 |           python -m cibuildwheel --output-dir wheelhouse
 62 | 
 63 |       - name: Upload wheels
 64 |         uses: actions/upload-artifact@v4
 65 |         with:
 66 |           name: regex-files-wheels-${{ matrix.platform }}
 67 |           path: wheelhouse/*.whl
 68 | 
 69 |      # I cannot get this to work!
 70 |      # - name: Create GitHub release
 71 |      #   uses: actions/create-release@v1
 72 |      #   env:
 73 |      #     GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 74 |      #   with:
 75 |      #     tag_name: ${{ github.ref }}
 76 |      #     release_name: regex ${{ github.ref }}
 77 | 
 78 |   # Build source distribution & manylinux1_x86_64 wheels
 79 |   # These two jobs build:
 80 |   #   1, build_wheels (above): manylinux1_i686 / manylinux2014_x86_64
 81 |   #   2, build_in_manylinux2010 (this): manylinux1_x86_64
 82 |   # manylinux2014_x86_64 wheels use a new memcpy() function
 83 |   # (memcpy@GLIBC_2.14), so the wheels are not compatible with
 84 |   # manylinux1_x86_64 environment. In order to be compatible as
 85 |   # much as possible, this job builds manylinux1_x86_64 wheels.
 86 |   build_in_manylinux2010:
 87 |     name: Build in manylinux2010 environment
 88 |     if: github.event_name == 'push'
 89 |     runs-on: ubuntu-latest
 90 | 
 91 |     env:
 92 |       # Generate manylinux1_x86_64 wheels.
 93 |       #     tag         pip      CPython with the pip      glibc
 94 |       # manylinux1     >=8.1.0  3.5.2+, 3.6.0+            2.5  (2006-09-29)
 95 |       # manylinux2010  >=19.0   3.7.3+, 3.8.0+            2.12 (2010-05-03)
 96 |       # manylinux2014  >=19.3   3.7.8+, 3.8.4+, 3.9.0+    2.17 (2012-12-25)
 97 |       # manylinux_x_y  >=20.3   3.8.10+, 3.9.5+, 3.10.0+  x.y
 98 |       # manylinux2010 images EOL on 2022-08-01, it doesn't support cp311.
 99 |       CIBW_BUILD: cp39-* cp310-*
100 |       CIBW_MANYLINUX_X86_64_IMAGE: manylinux2010
101 |       CIBW_ARCHS_LINUX: x86_64
102 | 
103 |     steps:
104 |       - uses: actions/checkout@v4
105 |       - uses: actions/setup-python@v5
106 |         with:
107 |           python-version: ${{ env.PYTHON_VER }}
108 | 
109 |       - name: Build source distribution & wheels
110 |         run: |
111 |           python setup.py sdist --formats=gztar
112 |           python -m pip install -U cibuildwheel
113 |           python -m cibuildwheel --output-dir wheelhouse
114 | 
115 |       - name: Upload source distribution
116 |         uses: actions/upload-artifact@v4
117 |         with:
118 |           name: regex-files-dist
119 |           path: dist/*.tar.gz
120 | 
121 |       - name: Upload manylinux1_x86_64 wheels
122 |         uses: actions/upload-artifact@v4
123 |         with:
124 |           name: regex-files-manylinux2010
125 |           path: wheelhouse/*.whl
126 | 
127 |   # Build and upload aarch64/ppc64le/s390x wheels (emulated with QEMU).
128 |   build_arch_wheels:
129 |     name: Build ${{ matrix.arch }} Linux wheels
130 |     if: github.event_name == 'push'
131 |     runs-on: ubuntu-latest
132 | 
133 |     strategy:
134 |       matrix:
135 |         arch: [aarch64, ppc64le, s390x]
136 |         # Building in QEMU is very slow, so each arch is split into two jobs via CIBW_SKIP.
137 |         skip_image: ["*musllinux*", "*manylinux*"]
138 | 
139 |     env:
140 |       CIBW_ARCHS: ${{ matrix.arch }}
141 |       CIBW_SKIP:  ${{ matrix.skip_image }}
142 | 
143 |     steps:
144 |       - uses: actions/checkout@v4
145 |       - uses: actions/setup-python@v5
146 |         with:
147 |           python-version: ${{ env.PYTHON_VER }}
148 | 
149 |       - name: Set up QEMU
150 |         uses: docker/setup-qemu-action@v3
151 | 
152 |       - name: Install cibuildwheel & build wheels
153 |         run: |
154 |           python -m pip install -U cibuildwheel
155 |           python -m cibuildwheel --output-dir wheelhouse
156 | 
157 |       - name: Upload ${{ matrix.arch }} wheels
158 |         uses: actions/upload-artifact@v4
159 |         with:
160 |           name: regex-files-arch-${{ matrix.arch }}-${{ strategy.job-index }}  # unique per matrix job
161 |           path: wheelhouse/*.whl
162 | 
163 |   # Upload to PyPI once every build job has succeeded (run_test is not a prerequisite).
164 |   upload_pypi:
165 |     name: Publish to PyPI
166 |     needs: [build_wheels, build_in_manylinux2010, build_arch_wheels]
167 |     runs-on: ubuntu-latest
168 | 
169 |     steps:
170 |       - uses: actions/download-artifact@v4
171 |         with:
172 |           pattern: regex-files-*
173 |           path: dist
174 |           merge-multiple: true
175 | 
176 |       - name: Upload to PyPI
177 |         uses: pypa/gh-action-pypi-publish@release/v1
178 |         with:
179 |           user: __token__
180 |           password: ${{ secrets.PYPI_TOKEN }}
181 |           skip-existing: true
182 |           verbose: true
183 |           print-hash: true
184 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | This work was derived from the 're' module of CPython 2.6 and CPython 3.1,
  2 | copyright (c) 1998-2001 by Secret Labs AB and licensed under CNRI's Python 1.6
  3 | license.
  4 | 
  5 | All additions and alterations are licensed under the Apache 2.0 License.
  6 | 
  7 | 
  8 |                                  Apache License
  9 |                            Version 2.0, January 2004
 10 |                         http://www.apache.org/licenses/
 11 | 
 12 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 13 | 
 14 |    1. Definitions.
 15 | 
 16 |       "License" shall mean the terms and conditions for use, reproduction,
 17 |       and distribution as defined by Sections 1 through 9 of this document.
 18 | 
 19 |       "Licensor" shall mean the copyright owner or entity authorized by
 20 |       the copyright owner that is granting the License.
 21 | 
 22 |       "Legal Entity" shall mean the union of the acting entity and all
 23 |       other entities that control, are controlled by, or are under common
 24 |       control with that entity. For the purposes of this definition,
 25 |       "control" means (i) the power, direct or indirect, to cause the
 26 |       direction or management of such entity, whether by contract or
 27 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 28 |       outstanding shares, or (iii) beneficial ownership of such entity.
 29 | 
 30 |       "You" (or "Your") shall mean an individual or Legal Entity
 31 |       exercising permissions granted by this License.
 32 | 
 33 |       "Source" form shall mean the preferred form for making modifications,
 34 |       including but not limited to software source code, documentation
 35 |       source, and configuration files.
 36 | 
 37 |       "Object" form shall mean any form resulting from mechanical
 38 |       transformation or translation of a Source form, including but
 39 |       not limited to compiled object code, generated documentation,
 40 |       and conversions to other media types.
 41 | 
 42 |       "Work" shall mean the work of authorship, whether in Source or
 43 |       Object form, made available under the License, as indicated by a
 44 |       copyright notice that is included in or attached to the work
 45 |       (an example is provided in the Appendix below).
 46 | 
 47 |       "Derivative Works" shall mean any work, whether in Source or Object
 48 |       form, that is based on (or derived from) the Work and for which the
 49 |       editorial revisions, annotations, elaborations, or other modifications
 50 |       represent, as a whole, an original work of authorship. For the purposes
 51 |       of this License, Derivative Works shall not include works that remain
 52 |       separable from, or merely link (or bind by name) to the interfaces of,
 53 |       the Work and Derivative Works thereof.
 54 | 
 55 |       "Contribution" shall mean any work of authorship, including
 56 |       the original version of the Work and any modifications or additions
 57 |       to that Work or Derivative Works thereof, that is intentionally
 58 |       submitted to Licensor for inclusion in the Work by the copyright owner
 59 |       or by an individual or Legal Entity authorized to submit on behalf of
 60 |       the copyright owner. For the purposes of this definition, "submitted"
 61 |       means any form of electronic, verbal, or written communication sent
 62 |       to the Licensor or its representatives, including but not limited to
 63 |       communication on electronic mailing lists, source code control systems,
 64 |       and issue tracking systems that are managed by, or on behalf of, the
 65 |       Licensor for the purpose of discussing and improving the Work, but
 66 |       excluding communication that is conspicuously marked or otherwise
 67 |       designated in writing by the copyright owner as "Not a Contribution."
 68 | 
 69 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 70 |       on behalf of whom a Contribution has been received by Licensor and
 71 |       subsequently incorporated within the Work.
 72 | 
 73 |    2. Grant of Copyright License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       copyright license to reproduce, prepare Derivative Works of,
 77 |       publicly display, publicly perform, sublicense, and distribute the
 78 |       Work and such Derivative Works in Source or Object form.
 79 | 
 80 |    3. Grant of Patent License. Subject to the terms and conditions of
 81 |       this License, each Contributor hereby grants to You a perpetual,
 82 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 83 |       (except as stated in this section) patent license to make, have made,
 84 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 85 |       where such license applies only to those patent claims licensable
 86 |       by such Contributor that are necessarily infringed by their
 87 |       Contribution(s) alone or by combination of their Contribution(s)
 88 |       with the Work to which such Contribution(s) was submitted. If You
 89 |       institute patent litigation against any entity (including a
 90 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 91 |       or a Contribution incorporated within the Work constitutes direct
 92 |       or contributory patent infringement, then any patent licenses
 93 |       granted to You under this License for that Work shall terminate
 94 |       as of the date such litigation is filed.
 95 | 
 96 |    4. Redistribution. You may reproduce and distribute copies of the
 97 |       Work or Derivative Works thereof in any medium, with or without
 98 |       modifications, and in Source or Object form, provided that You
 99 |       meet the following conditions:
100 | 
101 |       (a) You must give any other recipients of the Work or
102 |           Derivative Works a copy of this License; and
103 | 
104 |       (b) You must cause any modified files to carry prominent notices
105 |           stating that You changed the files; and
106 | 
107 |       (c) You must retain, in the Source form of any Derivative Works
108 |           that You distribute, all copyright, patent, trademark, and
109 |           attribution notices from the Source form of the Work,
110 |           excluding those notices that do not pertain to any part of
111 |           the Derivative Works; and
112 | 
113 |       (d) If the Work includes a "NOTICE" text file as part of its
114 |           distribution, then any Derivative Works that You distribute must
115 |           include a readable copy of the attribution notices contained
116 |           within such NOTICE file, excluding those notices that do not
117 |           pertain to any part of the Derivative Works, in at least one
118 |           of the following places: within a NOTICE text file distributed
119 |           as part of the Derivative Works; within the Source form or
120 |           documentation, if provided along with the Derivative Works; or,
121 |           within a display generated by the Derivative Works, if and
122 |           wherever such third-party notices normally appear. The contents
123 |           of the NOTICE file are for informational purposes only and
124 |           do not modify the License. You may add Your own attribution
125 |           notices within Derivative Works that You distribute, alongside
126 |           or as an addendum to the NOTICE text from the Work, provided
127 |           that such additional attribution notices cannot be construed
128 |           as modifying the License.
129 | 
130 |       You may add Your own copyright statement to Your modifications and
131 |       may provide additional or different license terms and conditions
132 |       for use, reproduction, or distribution of Your modifications, or
133 |       for any such Derivative Works as a whole, provided Your use,
134 |       reproduction, and distribution of the Work otherwise complies with
135 |       the conditions stated in this License.
136 | 
137 |    5. Submission of Contributions. Unless You explicitly state otherwise,
138 |       any Contribution intentionally submitted for inclusion in the Work
139 |       by You to the Licensor shall be under the terms and conditions of
140 |       this License, without any additional terms or conditions.
141 |       Notwithstanding the above, nothing herein shall supersede or modify
142 |       the terms of any separate license agreement you may have executed
143 |       with Licensor regarding such Contributions.
144 | 
145 |    6. Trademarks. This License does not grant permission to use the trade
146 |       names, trademarks, service marks, or product names of the Licensor,
147 |       except as required for reasonable and customary use in describing the
148 |       origin of the Work and reproducing the content of the NOTICE file.
149 | 
150 |    7. Disclaimer of Warranty. Unless required by applicable law or
151 |       agreed to in writing, Licensor provides the Work (and each
152 |       Contributor provides its Contributions) on an "AS IS" BASIS,
153 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
154 |       implied, including, without limitation, any warranties or conditions
155 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
156 |       PARTICULAR PURPOSE. You are solely responsible for determining the
157 |       appropriateness of using or redistributing the Work and assume any
158 |       risks associated with Your exercise of permissions under this License.
159 | 
160 |    8. Limitation of Liability. In no event and under no legal theory,
161 |       whether in tort (including negligence), contract, or otherwise,
162 |       unless required by applicable law (such as deliberate and grossly
163 |       negligent acts) or agreed to in writing, shall any Contributor be
164 |       liable to You for damages, including any direct, indirect, special,
165 |       incidental, or consequential damages of any character arising as a
166 |       result of this License or out of the use or inability to use the
167 |       Work (including but not limited to damages for loss of goodwill,
168 |       work stoppage, computer failure or malfunction, or any and all
169 |       other commercial damages or losses), even if such Contributor
170 |       has been advised of the possibility of such damages.
171 | 
172 |    9. Accepting Warranty or Additional Liability. While redistributing
173 |       the Work or Derivative Works thereof, You may choose to offer,
174 |       and charge a fee for, acceptance of support, warranty, indemnity,
175 |       or other liability obligations and/or rights consistent with this
176 |       License. However, in accepting such obligations, You may act only
177 |       on Your own behalf and on Your sole responsibility, not on behalf
178 |       of any other Contributor, and only if You agree to indemnify,
179 |       defend, and hold each Contributor harmless for any liability
180 |       incurred by, or claims asserted against, such Contributor by reason
181 |       of your accepting any such warranty or additional liability.
182 | 
183 |    END OF TERMS AND CONDITIONS
184 | 
185 |    APPENDIX: How to apply the Apache License to your work.
186 | 
187 |       To apply the Apache License to your work, attach the following
188 |       boilerplate notice, with the fields enclosed by brackets "[]"
189 |       replaced with your own identifying information. (Don't include
190 |       the brackets!)  The text should be enclosed in the appropriate
191 |       comment syntax for the file format. We also recommend that a
192 |       file or class name and description of purpose be included on the
193 |       same "printed page" as the copyright notice for easier
194 |       identification within third-party archives.
195 | 
196 |    Copyright 2020 Matthew Barnett
197 | 
198 |    Licensed under the Apache License, Version 2.0 (the "License");
199 |    you may not use this file except in compliance with the License.
200 |    You may obtain a copy of the License at
201 | 
202 |        http://www.apache.org/licenses/LICENSE-2.0
203 | 
204 |    Unless required by applicable law or agreed to in writing, software
205 |    distributed under the License is distributed on an "AS IS" BASIS,
206 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
207 |    See the License for the specific language governing permissions and
208 |    limitations under the License.
209 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Files to ship in the source distribution.
2 | # C sources, headers, and Python modules of the package:
3 | include regex_3/*.c regex_3/*.h regex_3/*.py
4 | # Documentation and the scripts under tools/:
5 | include docs/*.* tools/*.py
6 | # Licence text and build configuration:
7 | include LICENSE.txt pyproject.toml
8 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
   1 | Introduction
   2 | ------------
   3 | 
   4 | This regex implementation is backwards-compatible with the standard 're' module, but offers additional functionality.
   5 | 
   6 | Python 2
   7 | --------
   8 | 
   9 | Python 2 is no longer supported. The last release that supported Python 2 was 2021.11.10.
  10 | 
  11 | PyPy
  12 | ----
  13 | 
  14 | This module is targeted at CPython. It expects that all codepoints are the same width, so it won't behave properly with PyPy outside U+0000..U+007F because PyPy stores strings as UTF-8.
  15 | 
  16 | Multithreading
  17 | --------------
  18 | 
  19 | The regex module releases the GIL during matching on instances of the built-in (immutable) string classes, enabling other Python threads to run concurrently. It is also possible to force the regex module to release the GIL during matching by calling the matching methods with the keyword argument ``concurrent=True``. The behaviour is undefined if the string changes during matching, so use it *only* when it is guaranteed that that won't happen.
  20 | 
  21 | Unicode
  22 | -------
  23 | 
  24 | This module supports Unicode 16.0.0. Full Unicode case-folding is supported.
  25 | 
  26 | Flags
  27 | -----
  28 | 
  29 | There are 2 kinds of flag: scoped and global. Scoped flags can apply to only part of a pattern and can be turned on or off; global flags apply to the entire pattern and can only be turned on.
  30 | 
  31 | The scoped flags are: ``ASCII (?a)``, ``FULLCASE (?f)``, ``IGNORECASE (?i)``, ``LOCALE (?L)``, ``MULTILINE (?m)``, ``DOTALL (?s)``, ``UNICODE (?u)``, ``VERBOSE (?x)``, ``WORD (?w)``.
  32 | 
  33 | The global flags are: ``BESTMATCH (?b)``, ``ENHANCEMATCH (?e)``, ``POSIX (?p)``, ``REVERSE (?r)``, ``VERSION0 (?V0)``, ``VERSION1 (?V1)``.
  34 | 
  35 | If none of the ``ASCII``, ``LOCALE`` or ``UNICODE`` flags is specified, it will default to ``UNICODE`` if the regex pattern is a Unicode string and ``ASCII`` if it's a bytestring.
  36 | 
  37 | The ``ENHANCEMATCH`` flag makes fuzzy matching attempt to improve the fit of the next match that it finds.
  38 | 
  39 | The ``BESTMATCH`` flag makes fuzzy matching search for the best match instead of the next match.
  40 | 
  41 | Old vs new behaviour
  42 | --------------------
  43 | 
  44 | In order to be compatible with the re module, this module has 2 behaviours:
  45 | 
  46 | * **Version 0** behaviour (old behaviour, compatible with the re module):
  47 | 
  48 |   Please note that the re module's behaviour may change over time, and I'll endeavour to match that behaviour in version 0.
  49 | 
  50 |   * Indicated by the ``VERSION0`` flag.
  51 | 
  52 |   * Zero-width matches are not handled correctly in the re module before Python 3.7. The behaviour in those earlier versions is:
  53 | 
  54 |     * ``.split`` won't split a string at a zero-width match.
  55 | 
  56 |     * ``.sub`` will advance by one character after a zero-width match.
  57 | 
  58 |   * Inline flags apply to the entire pattern, and they can't be turned off.
  59 | 
  60 |   * Only simple sets are supported.
  61 | 
  62 |   * Case-insensitive matches in Unicode use simple case-folding by default.
  63 | 
  64 | * **Version 1** behaviour (new behaviour, possibly different from the re module):
  65 | 
  66 |   * Indicated by the ``VERSION1`` flag.
  67 | 
  68 |   * Zero-width matches are handled correctly.
  69 | 
  70 |   * Inline flags apply to the end of the group or pattern, and they can be turned off.
  71 | 
  72 |   * Nested sets and set operations are supported.
  73 | 
  74 |   * Case-insensitive matches in Unicode use full case-folding by default.
  75 | 
  76 | If no version is specified, the regex module will default to ``regex.DEFAULT_VERSION``.
  77 | 
  78 | Case-insensitive matches in Unicode
  79 | -----------------------------------
  80 | 
  81 | The regex module supports both simple and full case-folding for case-insensitive matches in Unicode. Use of full case-folding can be turned on using the ``FULLCASE`` flag. Please note that this flag affects how the ``IGNORECASE`` flag works; the ``FULLCASE`` flag itself does not turn on case-insensitive matching.
  82 | 
  83 | Version 0 behaviour: the flag is off by default.
  84 | 
  85 | Version 1 behaviour: the flag is on by default.
  86 | 
  87 | Nested sets and set operations
  88 | ------------------------------
  89 | 
  90 | It's not possible to support both simple sets, as used in the re module, and nested sets at the same time because of a difference in the meaning of an unescaped ``"["`` in a set.
  91 | 
  92 | For example, the pattern ``[[a-z]--[aeiou]]`` is treated in the version 0 behaviour (simple sets, compatible with the re module) as:
  93 | 
  94 | * Set containing "[" and the letters "a" to "z"
  95 | 
  96 | * Literal "--"
  97 | 
  98 | * Set containing letters "a", "e", "i", "o", "u"
  99 | 
 100 | * Literal "]"
 101 | 
 102 | but in the version 1 behaviour (nested sets, enhanced behaviour) as:
 103 | 
 104 | * Set which is:
 105 | 
 106 |   * Set containing the letters "a" to "z"
 107 | 
 108 | * but excluding:
 109 | 
 110 |   * Set containing the letters "a", "e", "i", "o", "u"
 111 | 
 112 | Version 0 behaviour: only simple sets are supported.
 113 | 
 114 | Version 1 behaviour: nested sets and set operations are supported.
 115 | 
 116 | Notes on named groups
 117 | ---------------------
 118 | 
 119 | All groups have a group number, starting from 1.
 120 | 
 121 | Groups with the same group name will have the same group number, and groups with a different group name will have a different group number.
 122 | 
 123 | The same name can be used by more than one group, with later captures 'overwriting' earlier captures. All the captures of the group will be available from the ``captures`` method of the match object.
 124 | 
 125 | Group numbers will be reused across different branches of a branch reset, e.g. ``(?|(first)|(second))`` has only group 1. If groups have different group names then they will, of course, have different group numbers, e.g. ``(?|(?P<foo>first)|(?P<bar>second))`` has group 1 ("foo") and group 2 ("bar").
 126 | 
 127 | In the regex ``(\s+)(?|(?P<foo>[A-Z]+)|(\w+) (?P<foo>[0-9]+))`` there are 2 groups:
 128 | 
 129 | * ``(\s+)`` is group 1.
 130 | 
 131 | * ``(?P<foo>[A-Z]+)`` is group 2, also called "foo".
 132 | 
 133 | * ``(\w+)`` is group 2 because of the branch reset.
 134 | 
 135 | * ``(?P<foo>[0-9]+)`` is group 2 because it's called "foo".
 136 | 
 137 | If you want to prevent ``(\w+)`` from being group 2, you need to name it (different name, different group number).
 138 | 
 139 | Additional features
 140 | -------------------
 141 | 
 142 | The issue numbers relate to the Python bug tracker, except where listed otherwise.
 143 | 
 144 | Added ``\p{Horiz_Space}`` and ``\p{Vert_Space}`` (`GitHub issue 477 <https://github.com/mrabarnett/mrab-regex/issues/477#issuecomment-1216779547>`_)
 145 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 146 | 
 147 | ``\p{Horiz_Space}`` or ``\p{H}`` matches horizontal whitespace and ``\p{Vert_Space}`` or ``\p{V}`` matches vertical whitespace.
 148 | 
 149 | Added support for lookaround in conditional pattern (`Hg issue 163 <https://github.com/mrabarnett/mrab-regex/issues/163>`_)
 150 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 151 | 
 152 | The test of a conditional pattern can be a lookaround.
 153 | 
 154 | .. sourcecode:: python
 155 | 
 156 |   >>> regex.match(r'(?(?=\d)\d+|\w+)', '123abc')
 157 |   <regex.Match object; span=(0, 3), match='123'>
 158 |   >>> regex.match(r'(?(?=\d)\d+|\w+)', 'abc123')
 159 |   <regex.Match object; span=(0, 6), match='abc123'>
 160 | 
 161 | This is not quite the same as putting a lookaround in the first branch of a pair of alternatives.
 162 | 
 163 | .. sourcecode:: python
 164 | 
 165 |   >>> print(regex.match(r'(?:(?=\d)\d+\b|\w+)', '123abc'))
 166 |   <regex.Match object; span=(0, 6), match='123abc'>
 167 |   >>> print(regex.match(r'(?(?=\d)\d+\b|\w+)', '123abc'))
 168 |   None
 169 | 
 170 | In the first example, the lookaround matched, but the remainder of the first branch failed to match, and so the second branch was attempted, whereas in the second example, the lookaround matched, and the first branch failed to match, but the second branch was **not** attempted.
 171 | 
 172 | Added POSIX matching (leftmost longest) (`Hg issue 150 <https://github.com/mrabarnett/mrab-regex/issues/150>`_)
 173 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 174 | 
 175 | The POSIX standard for regex is to return the leftmost longest match. This can be turned on using the ``POSIX`` flag.
 176 | 
 177 | .. sourcecode:: python
 178 | 
 179 |   >>> # Normal matching.
 180 |   >>> regex.search(r'Mr|Mrs', 'Mrs')
 181 |   <regex.Match object; span=(0, 2), match='Mr'>
 182 |   >>> regex.search(r'one(self)?(selfsufficient)?', 'oneselfsufficient')
 183 |   <regex.Match object; span=(0, 7), match='oneself'>
 184 |   >>> # POSIX matching.
 185 |   >>> regex.search(r'(?p)Mr|Mrs', 'Mrs')
 186 |   <regex.Match object; span=(0, 3), match='Mrs'>
 187 |   >>> regex.search(r'(?p)one(self)?(selfsufficient)?', 'oneselfsufficient')
 188 |   <regex.Match object; span=(0, 17), match='oneselfsufficient'>
 189 | 
 190 | Note that it will take longer to find matches because when it finds a match at a certain position, it won't return that immediately, but will keep looking to see if there's another longer match there.
 191 | 
 192 | Added ``(?(DEFINE)...)`` (`Hg issue 152 <https://github.com/mrabarnett/mrab-regex/issues/152>`_)
 193 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 194 | 
 195 | If there's no group called "DEFINE", then ... will be ignored except that any groups defined within it can be called and that the normal rules for numbering groups still apply.
 196 | 
 197 | .. sourcecode:: python
 198 | 
 199 |   >>> regex.search(r'(?(DEFINE)(?P<quant>\d+)(?P<item>\w+))(?&quant) (?&item)', '5 elephants')
 200 |   <regex.Match object; span=(0, 11), match='5 elephants'>
 201 | 
 202 | Added ``(*PRUNE)``, ``(*SKIP)`` and ``(*FAIL)`` (`Hg issue 153 <https://github.com/mrabarnett/mrab-regex/issues/153>`_)
 203 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 204 | 
 205 | ``(*PRUNE)`` discards the backtracking info up to that point. When used in an atomic group or a lookaround, it won't affect the enclosing pattern.
 206 | 
 207 | ``(*SKIP)`` is similar to ``(*PRUNE)``, except that it also sets where in the text the next attempt to match will start. When used in an atomic group or a lookaround, it won't affect the enclosing pattern.
 208 | 
 209 | ``(*FAIL)`` causes immediate backtracking. ``(*F)`` is a permitted abbreviation.
 210 | 
 211 | Added ``\K`` (`Hg issue 151 <https://github.com/mrabarnett/mrab-regex/issues/151>`_)
 212 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 213 | 
 214 | Keeps the part of the entire match after the position where ``\K`` occurred; the part before it is discarded.
 215 | 
 216 | It does not affect what groups return.
 217 | 
 218 | .. sourcecode:: python
 219 | 
 220 |   >>> m = regex.search(r'(\w\w\K\w\w\w)', 'abcdef')
 221 |   >>> m[0]
 222 |   'cde'
 223 |   >>> m[1]
 224 |   'abcde'
 225 |   >>>
 226 |   >>> m = regex.search(r'(?r)(\w\w\K\w\w\w)', 'abcdef')
 227 |   >>> m[0]
 228 |   'bc'
 229 |   >>> m[1]
 230 |   'bcdef'
 231 | 
 232 | Added capture subscripting for ``expandf`` and ``subf``/``subfn`` (`Hg issue 133 <https://github.com/mrabarnett/mrab-regex/issues/133>`_)
 233 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 234 | 
 235 | You can use subscripting to get the captures of a repeated group.
 236 | 
 237 | .. sourcecode:: python
 238 | 
 239 |   >>> m = regex.match(r"(\w)+", "abc")
 240 |   >>> m.expandf("{1}")
 241 |   'c'
 242 |   >>> m.expandf("{1[0]} {1[1]} {1[2]}")
 243 |   'a b c'
 244 |   >>> m.expandf("{1[-1]} {1[-2]} {1[-3]}")
 245 |   'c b a'
 246 |   >>>
 247 |   >>> m = regex.match(r"(?P<letter>\w)+", "abc")
 248 |   >>> m.expandf("{letter}")
 249 |   'c'
 250 |   >>> m.expandf("{letter[0]} {letter[1]} {letter[2]}")
 251 |   'a b c'
 252 |   >>> m.expandf("{letter[-1]} {letter[-2]} {letter[-3]}")
 253 |   'c b a'
 254 | 
 255 | Added support for referring to a group by number using ``(?P=...)``
 256 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 257 | 
 258 | This is in addition to the existing ``\g<...>``.
 259 | 
 260 | Fixed the handling of locale-sensitive regexes
 261 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 262 | 
 263 | The ``LOCALE`` flag is intended for legacy code and has limited support. You're still recommended to use Unicode instead.
 264 | 
 265 | Added partial matches (`Hg issue 102 <https://github.com/mrabarnett/mrab-regex/issues/102>`_)
 266 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 267 | 
 268 | A partial match is one that matches up to the end of the string, but that string has been truncated and you want to know whether a complete match could be possible if the string had not been truncated.
 269 | 
 270 | Partial matches are supported by ``match``, ``search``, ``fullmatch`` and ``finditer`` with the ``partial`` keyword argument.
 271 | 
 272 | Match objects have a ``partial`` attribute, which is ``True`` if it's a partial match.
 273 | 
 274 | For example, if you wanted a user to enter a 4-digit number and check it character by character as it was being entered:
 275 | 
 276 | .. sourcecode:: python
 277 | 
 278 |   >>> pattern = regex.compile(r'\d{4}')
 279 | 
 280 |   >>> # Initially, nothing has been entered:
 281 |   >>> print(pattern.fullmatch('', partial=True))
 282 |   <regex.Match object; span=(0, 0), match='', partial=True>
 283 | 
 284 |   >>> # An empty string is OK, but it's only a partial match.
 285 |   >>> # The user enters a letter:
 286 |   >>> print(pattern.fullmatch('a', partial=True))
 287 |   None
 288 |   >>> # It'll never match.
 289 | 
 290 |   >>> # The user deletes that and enters a digit:
 291 |   >>> print(pattern.fullmatch('1', partial=True))
 292 |   <regex.Match object; span=(0, 1), match='1', partial=True>
 293 |   >>> # It matches this far, but it's only a partial match.
 294 | 
 295 |   >>> # The user enters 2 more digits:
 296 |   >>> print(pattern.fullmatch('123', partial=True))
 297 |   <regex.Match object; span=(0, 3), match='123', partial=True>
 298 |   >>> # It matches this far, but it's only a partial match.
 299 | 
 300 |   >>> # The user enters another digit:
 301 |   >>> print(pattern.fullmatch('1234', partial=True))
 302 |   <regex.Match object; span=(0, 4), match='1234'>
 303 |   >>> # It's a complete match.
 304 | 
 305 |   >>> # If the user enters another digit:
 306 |   >>> print(pattern.fullmatch('12345', partial=True))
 307 |   None
 308 |   >>> # It's no longer a match.
 309 | 
 310 |   >>> # This is a partial match:
 311 |   >>> pattern.match('123', partial=True).partial
 312 |   True
 313 | 
 314 |   >>> # This is a complete match:
 315 |   >>> pattern.match('1233', partial=True).partial
 316 |   False
 317 | 
 318 | ``*`` operator not working correctly with sub() (`Hg issue 106 <https://github.com/mrabarnett/mrab-regex/issues/106>`_)
 319 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 320 | 
 321 | Sometimes it's not clear how zero-width matches should be handled. For example, should ``.*`` match 0 characters directly after matching >0 characters?
 322 | 
 323 | .. sourcecode:: python
 324 | 
 325 |   >>> regex.sub('.*', 'x', 'test')
 326 |   'xx'
 327 |   >>> regex.sub('.*?', '|', 'test')
 328 |   '|||||||||'
 329 | 
 330 | Added ``capturesdict`` (`Hg issue 86 <https://github.com/mrabarnett/mrab-regex/issues/86>`_)
 331 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 332 | 
 333 | ``capturesdict`` is a combination of ``groupdict`` and ``captures``:
 334 | 
 335 | ``groupdict`` returns a dict of the named groups and the last capture of those groups.
 336 | 
 337 | ``captures`` returns a list of all the captures of a group.
 338 | 
 339 | ``capturesdict`` returns a dict of the named groups and lists of all the captures of those groups.
 340 | 
 341 | .. sourcecode:: python
 342 | 
 343 |   >>> m = regex.match(r"(?:(?P<word>\w+) (?P<digits>\d+)\n)+", "one 1\ntwo 2\nthree 3\n")
 344 |   >>> m.groupdict()
 345 |   {'word': 'three', 'digits': '3'}
 346 |   >>> m.captures("word")
 347 |   ['one', 'two', 'three']
 348 |   >>> m.captures("digits")
 349 |   ['1', '2', '3']
 350 |   >>> m.capturesdict()
 351 |   {'word': ['one', 'two', 'three'], 'digits': ['1', '2', '3']}
 352 | 
 353 | Added ``allcaptures`` and ``allspans`` (`Git issue 474 <https://github.com/mrabarnett/mrab-regex/issues/474>`_)
 354 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 355 | 
 356 | ``allcaptures`` returns a list of all the captures of all the groups.
 357 | 
 358 | ``allspans`` returns a list of all the spans of all the captures of all the groups.
 359 | 
 360 | .. sourcecode:: python
 361 | 
 362 |   >>> m = regex.match(r"(?:(?P<word>\w+) (?P<digits>\d+)\n)+", "one 1\ntwo 2\nthree 3\n")
 363 |   >>> m.allcaptures()
 364 |   (['one 1\ntwo 2\nthree 3\n'], ['one', 'two', 'three'], ['1', '2', '3'])
 365 |   >>> m.allspans()
 366 |   ([(0, 20)], [(0, 3), (6, 9), (12, 17)], [(4, 5), (10, 11), (18, 19)])
 367 | 
 368 | Allow duplicate names of groups (`Hg issue 87 <https://github.com/mrabarnett/mrab-regex/issues/87>`_)
 369 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 370 | 
 371 | Group names can be duplicated.
 372 | 
 373 | .. sourcecode:: python
 374 | 
 375 |   >>> # With optional groups:
 376 |   >>>
 377 |   >>> # Both groups capture, the second capture 'overwriting' the first.
 378 |   >>> m = regex.match(r"(?P<item>\w+)? or (?P<item>\w+)?", "first or second")
 379 |   >>> m.group("item")
 380 |   'second'
 381 |   >>> m.captures("item")
 382 |   ['first', 'second']
 383 |   >>> # Only the second group captures.
 384 |   >>> m = regex.match(r"(?P<item>\w+)? or (?P<item>\w+)?", " or second")
 385 |   >>> m.group("item")
 386 |   'second'
 387 |   >>> m.captures("item")
 388 |   ['second']
 389 |   >>> # Only the first group captures.
 390 |   >>> m = regex.match(r"(?P<item>\w+)? or (?P<item>\w+)?", "first or ")
 391 |   >>> m.group("item")
 392 |   'first'
 393 |   >>> m.captures("item")
 394 |   ['first']
 395 |   >>>
 396 |   >>> # With mandatory groups:
 397 |   >>>
 398 |   >>> # Both groups capture, the second capture 'overwriting' the first.
 399 |   >>> m = regex.match(r"(?P<item>\w*) or (?P<item>\w*)?", "first or second")
 400 |   >>> m.group("item")
 401 |   'second'
 402 |   >>> m.captures("item")
 403 |   ['first', 'second']
 404 |   >>> # Again, both groups capture, the second capture 'overwriting' the first.
 405 |   >>> m = regex.match(r"(?P<item>\w*) or (?P<item>\w*)", " or second")
 406 |   >>> m.group("item")
 407 |   'second'
 408 |   >>> m.captures("item")
 409 |   ['', 'second']
 410 |   >>> # And yet again, both groups capture, the second capture 'overwriting' the first.
 411 |   >>> m = regex.match(r"(?P<item>\w*) or (?P<item>\w*)", "first or ")
 412 |   >>> m.group("item")
 413 |   ''
 414 |   >>> m.captures("item")
 415 |   ['first', '']
 416 | 
 417 | Added ``fullmatch`` (`issue #16203 <https://bugs.python.org/issue16203>`_)
 418 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 419 | 
 420 | ``fullmatch`` behaves like ``match``, except that it must match all of the string.
 421 | 
 422 | .. sourcecode:: python
 423 | 
 424 |   >>> print(regex.fullmatch(r"abc", "abc").span())
 425 |   (0, 3)
 426 |   >>> print(regex.fullmatch(r"abc", "abcx"))
 427 |   None
 428 |   >>> print(regex.fullmatch(r"abc", "abcx", endpos=3).span())
 429 |   (0, 3)
 430 |   >>> print(regex.fullmatch(r"abc", "xabcy", pos=1, endpos=4).span())
 431 |   (1, 4)
 432 |   >>>
 433 |   >>> regex.match(r"a.*?", "abcd").group(0)
 434 |   'a'
 435 |   >>> regex.fullmatch(r"a.*?", "abcd").group(0)
 436 |   'abcd'
 437 | 
 438 | Added ``subf`` and ``subfn``
 439 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 440 | 
 441 | ``subf`` and ``subfn`` are alternatives to ``sub`` and ``subn`` respectively. When passed a replacement string, they treat it as a format string.
 442 | 
 443 | .. sourcecode:: python
 444 | 
 445 |   >>> regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar")
 446 |   'foo bar => bar foo'
 447 |   >>> regex.subf(r"(?P<word1>\w+) (?P<word2>\w+)", "{word2} {word1}", "foo bar")
 448 |   'bar foo'
 449 | 
 450 | Added ``expandf`` to match object
 451 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 452 | 
 453 | ``expandf`` is an alternative to ``expand``. When passed a replacement string, it treats it as a format string.
 454 | 
 455 | .. sourcecode:: python
 456 | 
 457 |   >>> m = regex.match(r"(\w+) (\w+)", "foo bar")
 458 |   >>> m.expandf("{0} => {2} {1}")
 459 |   'foo bar => bar foo'
 460 |   >>>
 461 |   >>> m = regex.match(r"(?P<word1>\w+) (?P<word2>\w+)", "foo bar")
 462 |   >>> m.expandf("{word2} {word1}")
 463 |   'bar foo'
 464 | 
 465 | Detach searched string
 466 | ^^^^^^^^^^^^^^^^^^^^^^
 467 | 
 468 | A match object contains a reference to the string that was searched, via its ``string`` attribute. The ``detach_string`` method will 'detach' that string, making it available for garbage collection, which might save valuable memory if that string is very large.
 469 | 
 470 | .. sourcecode:: python
 471 | 
 472 |   >>> m = regex.search(r"\w+", "Hello world")
 473 |   >>> print(m.group())
 474 |   Hello
 475 |   >>> print(m.string)
 476 |   Hello world
 477 |   >>> m.detach_string()
 478 |   >>> print(m.group())
 479 |   Hello
 480 |   >>> print(m.string)
 481 |   None
 482 | 
 483 | Recursive patterns (`Hg issue 27 <https://github.com/mrabarnett/mrab-regex/issues/27>`_)
 484 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 485 | 
 486 | Recursive and repeated patterns are supported.
 487 | 
 488 | ``(?R)`` or ``(?0)`` tries to match the entire regex recursively. ``(?1)``, ``(?2)``, etc, try to match the relevant group.
 489 | 
 490 | ``(?&name)`` tries to match the named group.
 491 | 
 492 | .. sourcecode:: python
 493 | 
 494 |   >>> regex.match(r"(Tarzan|Jane) loves (?1)", "Tarzan loves Jane").groups()
 495 |   ('Tarzan',)
 496 |   >>> regex.match(r"(Tarzan|Jane) loves (?1)", "Jane loves Tarzan").groups()
 497 |   ('Jane',)
 498 | 
 499 |   >>> m = regex.search(r"(\w)(?:(?R)|(\w?))\1", "kayak")
 500 |   >>> m.group(0, 1, 2)
 501 |   ('kayak', 'k', None)
 502 | 
 503 | The first two examples show how the subpattern within the group is reused, but is *not* itself a group. In other words, ``"(Tarzan|Jane) loves (?1)"`` is equivalent to ``"(Tarzan|Jane) loves (?:Tarzan|Jane)"``.
 504 | 
 505 | It's possible to backtrack into a recursed or repeated group.
 506 | 
 507 | You can't call a group if there is more than one group with that group name or group number (``"ambiguous group reference"``).
 508 | 
 509 | The alternative forms ``(?P>name)`` and ``(?P&name)`` are also supported.
 510 | 
 511 | Full Unicode case-folding is supported
 512 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 513 | 
 514 | In version 1 behaviour, the regex module uses full case-folding when performing case-insensitive matches in Unicode.
 515 | 
 516 | .. sourcecode:: python
 517 | 
 518 |   >>> regex.match(r"(?iV1)strasse", "stra\N{LATIN SMALL LETTER SHARP S}e").span()
 519 |   (0, 6)
 520 |   >>> regex.match(r"(?iV1)stra\N{LATIN SMALL LETTER SHARP S}e", "STRASSE").span()
 521 |   (0, 7)
 522 | 
 523 | In version 0 behaviour, it uses simple case-folding for backward compatibility with the re module.
 524 | 
 525 | Approximate "fuzzy" matching (`Hg issue 12 <https://github.com/mrabarnett/mrab-regex/issues/12>`_, `Hg issue 41 <https://github.com/mrabarnett/mrab-regex/issues/41>`_, `Hg issue 109 <https://github.com/mrabarnett/mrab-regex/issues/109>`_)
 526 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 527 | 
 528 | Regex usually attempts an exact match, but sometimes an approximate, or "fuzzy", match is needed, for those cases where the text being searched may contain errors in the form of inserted, deleted or substituted characters.
 529 | 
 530 | A fuzzy regex specifies which types of errors are permitted, and, optionally, either the minimum and maximum or only the maximum permitted number of each type. (You cannot specify only a minimum.)
 531 | 
 532 | The 3 types of error are:
 533 | 
 534 | * Insertion, indicated by "i"
 535 | 
 536 | * Deletion, indicated by "d"
 537 | 
 538 | * Substitution, indicated by "s"
 539 | 
 540 | In addition, "e" indicates any type of error.
 541 | 
 542 | The fuzziness of a regex item is specified between "{" and "}" after the item.
 543 | 
 544 | Examples:
 545 | 
 546 | * ``foo`` match "foo" exactly
 547 | 
 548 | * ``(?:foo){i}`` match "foo", permitting insertions
 549 | 
 550 | * ``(?:foo){d}`` match "foo", permitting deletions
 551 | 
 552 | * ``(?:foo){s}`` match "foo", permitting substitutions
 553 | 
 554 | * ``(?:foo){i,s}`` match "foo", permitting insertions and substitutions
 555 | 
 556 | * ``(?:foo){e}`` match "foo", permitting errors
 557 | 
 558 | If a certain type of error is specified, then any type not specified will **not** be permitted.
 559 | 
 560 | In the following examples I'll omit the item and write only the fuzziness:
 561 | 
 562 | * ``{d<=3}`` permit at most 3 deletions, but no other types
 563 | 
 564 | * ``{i<=1,s<=2}`` permit at most 1 insertion and at most 2 substitutions, but no deletions
 565 | 
 566 | * ``{1<=e<=3}`` permit at least 1 and at most 3 errors
 567 | 
 568 | * ``{i<=2,d<=2,e<=3}`` permit at most 2 insertions, at most 2 deletions, at most 3 errors in total, but no substitutions
 569 | 
 570 | It's also possible to state the costs of each type of error and the maximum permitted total cost.
 571 | 
 572 | Examples:
 573 | 
 574 | * ``{2i+2d+1s<=4}`` each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
 575 | 
 576 | * ``{i<=1,d<=1,s<=1,2i+2d+1s<=4}`` at most 1 insertion, at most 1 deletion, at most 1 substitution; each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4
 577 | 
 578 | You can also use "<" instead of "<=" if you want an exclusive minimum or maximum.
 579 | 
 580 | You can add a test to perform on a character that's substituted or inserted.
 581 | 
 582 | Examples:
 583 | 
 584 | * ``{s<=2:[a-z]}`` at most 2 substitutions, which must be in the character set ``[a-z]``.
 585 | 
 586 | * ``{s<=2,i<=3:\d}`` at most 2 substitutions, at most 3 insertions, which must be digits.
 587 | 
 588 | By default, fuzzy matching searches for the first match that meets the given constraints. The ``ENHANCEMATCH`` flag will cause it to attempt to improve the fit (i.e. reduce the number of errors) of the match that it has found.
 589 | 
 590 | The ``BESTMATCH`` flag will make it search for the best match instead.
 591 | 
 592 | Further examples to note:
 593 | 
 594 | * ``regex.search("(dog){e}", "cat and dog")[1]`` returns ``"cat"`` because that matches ``"dog"`` with 3 errors (an unlimited number of errors is permitted).
 595 | 
 596 | * ``regex.search("(dog){e<=1}", "cat and dog")[1]`` returns ``" dog"`` (with a leading space) because that matches ``"dog"`` with 1 error, which is within the limit.
 597 | 
 598 | * ``regex.search("(?e)(dog){e<=1}", "cat and dog")[1]`` returns ``"dog"`` (without a leading space) because the fuzzy search matches ``" dog"`` with 1 error, which is within the limit, and the ``(?e)`` flag then makes it attempt a better fit.
 599 | 
 600 | In the first two examples there are perfect matches later in the string, but in neither case is it the first possible match.
 601 | 
 602 | The match object has an attribute ``fuzzy_counts`` which gives the total number of substitutions, insertions and deletions.
 603 | 
 604 | .. sourcecode:: python
 605 | 
 606 |   >>> # A 'raw' fuzzy match:
 607 |   >>> regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts
 608 |   (0, 0, 1)
 609 |   >>> # 0 substitutions, 0 insertions, 1 deletion.
 610 | 
 611 |   >>> # A better match might be possible if the ENHANCEMATCH flag is used:
 612 |   >>> regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", "cat").fuzzy_counts
 613 |   (0, 0, 0)
 614 |   >>> # 0 substitutions, 0 insertions, 0 deletions.
 615 | 
 616 | The match object also has an attribute ``fuzzy_changes`` which gives a tuple of the positions of the substitutions, insertions and deletions.
 617 | 
 618 | .. sourcecode:: python
 619 | 
 620 |   >>> m = regex.search('(fuu){i<=2,d<=2,e<=5}', 'anaconda foo bar')
 621 |   >>> m
 622 |   <regex.Match object; span=(7, 10), match='a f', fuzzy_counts=(0, 2, 2)>
 623 |   >>> m.fuzzy_changes
 624 |   ([], [7, 8], [10, 11])
 625 | 
 626 | What this means is that if the matched part of the string had been:
 627 | 
 628 | .. sourcecode:: python
 629 | 
 630 |   'anacondfuuoo bar'
 631 | 
 632 | it would've been an exact match.
 633 | 
 634 | However, there were insertions at positions 7 and 8:
 635 | 
 636 | .. sourcecode:: python
 637 | 
 638 |   'anaconda fuuoo bar'
 639 |           ^^
 640 | 
 641 | and deletions at positions 10 and 11:
 642 | 
 643 | .. sourcecode:: python
 644 | 
 645 |   'anaconda f~~oo bar'
 646 |              ^^
 647 | 
 648 | So the actual string was:
 649 | 
 650 | .. sourcecode:: python
 651 | 
 652 |   'anaconda foo bar'
 653 | 
 654 | Named lists ``\L<name>`` (`Hg issue 11 <https://github.com/mrabarnett/mrab-regex/issues/11>`_)
 655 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 656 | 
 657 | There are occasions where you may want to include a list (actually, a set) of options in a regex.
 658 | 
 659 | One way is to build the pattern like this:
 660 | 
 661 | .. sourcecode:: python
 662 | 
 663 |   >>> p = regex.compile(r"first|second|third|fourth|fifth")
 664 | 
 665 | but if the list is large, parsing the resulting regex can take considerable time, and care must also be taken that the strings are properly escaped and properly ordered, for example, "cats" before "cat".
 666 | 
 667 | The new alternative is to use a named list:
 668 | 
 669 | .. sourcecode:: python
 670 | 
 671 |   >>> option_set = ["first", "second", "third", "fourth", "fifth"]
 672 |   >>> p = regex.compile(r"\L<options>", options=option_set)
 673 | 
 674 | The order of the items is irrelevant; they are treated as a set. The named lists are available as the ``.named_lists`` attribute of the pattern object:
 675 | 
 676 | .. sourcecode:: python
 677 | 
 678 |   >>> print(p.named_lists)
 679 |   {'options': frozenset({'third', 'first', 'fifth', 'fourth', 'second'})}
 680 | 
 681 | If there are any unused keyword arguments, ``ValueError`` will be raised unless you tell it otherwise:
 682 | 
 683 | .. sourcecode:: python
 684 | 
 685 |   >>> option_set = ["first", "second", "third", "fourth", "fifth"]
 686 |   >>> p = regex.compile(r"\L<options>", options=option_set, other_options=[])
 687 |   Traceback (most recent call last):
 688 |     File "<stdin>", line 1, in <module>
 689 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 353, in compile
 690 |       return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern)
 691 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 500, in _compile
 692 |       complain_unused_args()
 693 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 483, in complain_unused_args
 694 |       raise ValueError('unused keyword argument {!a}'.format(any_one))
 695 |   ValueError: unused keyword argument 'other_options'
 696 |   >>> p = regex.compile(r"\L<options>", options=option_set, other_options=[], ignore_unused=True)
 697 |   >>> p = regex.compile(r"\L<options>", options=option_set, other_options=[], ignore_unused=False)
 698 |   Traceback (most recent call last):
 699 |     File "<stdin>", line 1, in <module>
 700 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 353, in compile
 701 |       return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern)
 702 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 500, in _compile
 703 |       complain_unused_args()
 704 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 483, in complain_unused_args
 705 |       raise ValueError('unused keyword argument {!a}'.format(any_one))
 706 |   ValueError: unused keyword argument 'other_options'
 707 |   >>>
 708 | 
 709 | Start and end of word
 710 | ^^^^^^^^^^^^^^^^^^^^^
 711 | 
 712 | ``\m`` matches at the start of a word.
 713 | 
 714 | ``\M`` matches at the end of a word.
 715 | 
 716 | Compare with ``\b``, which matches at the start or end of a word.
 717 | 
 718 | Unicode line separators
 719 | ^^^^^^^^^^^^^^^^^^^^^^^
 720 | 
 721 | Normally the only line separator is ``\n`` (``\x0A``), but if the ``WORD`` flag is turned on then the line separators are ``\x0D\x0A``, ``\x0A``, ``\x0B``, ``\x0C`` and ``\x0D``, plus ``\x85``, ``\u2028`` and ``\u2029`` when working with Unicode.
 722 | 
 723 | This affects the regex dot ``"."``, which, with the ``DOTALL`` flag turned off, matches any character except a line separator. It also affects the line anchors ``^`` and ``$`` (in multiline mode).
 724 | 
 725 | Set operators
 726 | ^^^^^^^^^^^^^
 727 | 
 728 | **Version 1 behaviour only**
 729 | 
 730 | Set operators have been added, and a set ``[...]`` can include nested sets.
 731 | 
 732 | The operators, in order of increasing precedence, are:
 733 | 
 734 | * ``||`` for union ("x||y" means "x or y")
 735 | 
 736 | * ``~~`` (double tilde) for symmetric difference ("x~~y" means "x or y, but not both")
 737 | 
 738 | * ``&&`` for intersection ("x&&y" means "x and y")
 739 | 
 740 | * ``--`` (double dash) for difference ("x--y" means "x but not y")
 741 | 
 742 | Implicit union, i.e. simple juxtaposition like in ``[ab]``, has the highest precedence. Thus, ``[ab&&cd]`` is the same as ``[[a||b]&&[c||d]]``.
 743 | 
 744 | Examples:
 745 | 
 746 | * ``[ab]`` # Set containing 'a' and 'b'
 747 | 
 748 | * ``[a-z]`` # Set containing 'a' .. 'z'
 749 | 
 750 | * ``[[a-z]--[qw]]`` # Set containing 'a' .. 'z', but not 'q' or 'w'
 751 | 
 752 | * ``[a-z--qw]`` # Same as above
 753 | 
 754 | * ``[\p{L}--QW]`` # Set containing all letters except 'Q' and 'W'
 755 | 
 756 | * ``[\p{N}--[0-9]]`` # Set containing all numbers except '0' .. '9'
 757 | 
 758 | * ``[\p{ASCII}&&\p{Letter}]`` # Set containing all characters which are ASCII and letter
 759 | 
 760 | regex.escape (`issue #2650 <https://bugs.python.org/issue2650>`_)
 761 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 762 | 
 763 | regex.escape has an additional keyword parameter ``special_only``. When True, only 'special' regex characters, such as '?', are escaped.
 764 | 
 765 | .. sourcecode:: python
 766 | 
 767 |   >>> regex.escape("foo!?", special_only=False)
 768 |   'foo\\!\\?'
 769 |   >>> regex.escape("foo!?", special_only=True)
 770 |   'foo!\\?'
 771 | 
 772 | regex.escape (`Hg issue 249 <https://github.com/mrabarnett/mrab-regex/issues/249>`_)
 773 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 774 | 
 775 | regex.escape has an additional keyword parameter ``literal_spaces``. When True, spaces are not escaped.
 776 | 
 777 | .. sourcecode:: python
 778 | 
 779 |   >>> regex.escape("foo bar!?", literal_spaces=False)
 780 |   'foo\\ bar!\\?'
 781 |   >>> regex.escape("foo bar!?", literal_spaces=True)
 782 |   'foo bar!\\?'
 783 | 
 784 | Repeated captures (`issue #7132 <https://bugs.python.org/issue7132>`_)
 785 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 786 | 
 787 | A match object has additional methods which return information on all the successful matches of a repeated group. These methods are:
 788 | 
 789 | * ``matchobject.captures([group1, ...])``
 790 | 
 791 |   * Returns a list of the strings matched in a group or groups. Compare with ``matchobject.group([group1, ...])``.
 792 | 
 793 | * ``matchobject.starts([group])``
 794 | 
 795 |   * Returns a list of the start positions. Compare with ``matchobject.start([group])``.
 796 | 
 797 | * ``matchobject.ends([group])``
 798 | 
 799 |   * Returns a list of the end positions. Compare with ``matchobject.end([group])``.
 800 | 
 801 | * ``matchobject.spans([group])``
 802 | 
 803 |   * Returns a list of the spans. Compare with ``matchobject.span([group])``.
 804 | 
 805 | .. sourcecode:: python
 806 | 
 807 |   >>> m = regex.search(r"(\w{3})+", "123456789")
 808 |   >>> m.group(1)
 809 |   '789'
 810 |   >>> m.captures(1)
 811 |   ['123', '456', '789']
 812 |   >>> m.start(1)
 813 |   6
 814 |   >>> m.starts(1)
 815 |   [0, 3, 6]
 816 |   >>> m.end(1)
 817 |   9
 818 |   >>> m.ends(1)
 819 |   [3, 6, 9]
 820 |   >>> m.span(1)
 821 |   (6, 9)
 822 |   >>> m.spans(1)
 823 |   [(0, 3), (3, 6), (6, 9)]
 824 | 
 825 | Atomic grouping ``(?>...)`` (`issue #433030 <https://bugs.python.org/issue433030>`_)
 826 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 827 | 
 828 | If the following pattern subsequently fails, then the subpattern as a whole will fail.
 829 | 
 830 | Possessive quantifiers
 831 | ^^^^^^^^^^^^^^^^^^^^^^
 832 | 
 833 | ``(?:...)?+`` ; ``(?:...)*+`` ; ``(?:...)++`` ; ``(?:...){min,max}+``
 834 | 
 835 | The subpattern is matched up to 'max' times. If the following pattern subsequently fails, then all the repeated subpatterns will fail as a whole. For example, ``(?:...)++`` is equivalent to ``(?>(?:...)+)``.
 836 | 
 837 | Scoped flags (`issue #433028 <https://bugs.python.org/issue433028>`_)
 838 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 839 | 
 840 | ``(?flags-flags:...)``
 841 | 
 842 | The flags will apply only to the subpattern. Flags can be turned on or off.
 843 | 
 844 | Definition of 'word' character (`issue #1693050 <https://bugs.python.org/issue1693050>`_)
 845 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 846 | 
 847 | The definition of a 'word' character has been expanded for Unicode. It conforms to the Unicode specification at ``http://www.unicode.org/reports/tr29/``.
 848 | 
 849 | Variable-length lookbehind
 850 | ^^^^^^^^^^^^^^^^^^^^^^^^^^
 851 | 
 852 | A lookbehind can match a variable-length string.
 853 | 
 854 | Flags argument for regex.split, regex.sub and regex.subn (`issue #3482 <https://bugs.python.org/issue3482>`_)
 855 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 856 | 
 857 | ``regex.split``, ``regex.sub`` and ``regex.subn`` support a 'flags' argument.
 858 | 
 859 | Pos and endpos arguments for regex.sub and regex.subn
 860 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 861 | 
 862 | ``regex.sub`` and ``regex.subn`` support 'pos' and 'endpos' arguments.
 863 | 
 864 | 'Overlapped' argument for regex.findall and regex.finditer
 865 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 866 | 
 867 | ``regex.findall`` and ``regex.finditer`` support an 'overlapped' flag which permits overlapped matches.
 868 | 
 869 | Splititer
 870 | ^^^^^^^^^
 871 | 
 872 | ``regex.splititer`` has been added. It's a generator equivalent of ``regex.split``.
 873 | 
 874 | Subscripting match objects for groups
 875 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 876 | 
 877 | A match object supports access to its groups via subscripting and slicing:
 878 | 
 879 | .. sourcecode:: python
 880 | 
 881 |   >>> m = regex.search(r"(?P<before>.*?)(?P<num>\d+)(?P<after>.*)", "pqr123stu")
 882 |   >>> print(m["before"])
 883 |   pqr
 884 |   >>> print(len(m))
 885 |   4
 886 |   >>> print(m[:])
 887 |   ('pqr123stu', 'pqr', '123', 'stu')
 888 | 
 889 | Named groups
 890 | ^^^^^^^^^^^^
 891 | 
 892 | Groups can be named with ``(?<name>...)`` as well as the existing ``(?P<name>...)``.
 893 | 
 894 | Group references
 895 | ^^^^^^^^^^^^^^^^
 896 | 
 897 | Groups can be referenced within a pattern with ``\g<name>``. This also allows there to be more than 99 groups.
 898 | 
 899 | Named characters ``\N{name}``
 900 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 901 | 
 902 | Named characters are supported. Note that only those known by Python's Unicode database will be recognised.
 903 | 
 904 | Unicode codepoint properties, including scripts and blocks
 905 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 906 | 
 907 | ``\p{property=value}``; ``\P{property=value}``; ``\p{value}`` ; ``\P{value}``
 908 | 
 909 | Many Unicode properties are supported, including blocks and scripts. ``\p{property=value}`` or ``\p{property:value}`` matches a character whose property ``property`` has value ``value``. The inverse of ``\p{property=value}`` is ``\P{property=value}`` or ``\p{^property=value}``.
 910 | 
 911 | If the short form ``\p{value}`` is used, the properties are checked in the order: ``General_Category``, ``Script``, ``Block``, binary property:
 912 | 
 913 | * ``Latin``, the 'Latin' script (``Script=Latin``).
 914 | 
 915 | * ``BasicLatin``, the 'BasicLatin' block (``Block=BasicLatin``).
 916 | 
 917 | * ``Alphabetic``, the 'Alphabetic' binary property (``Alphabetic=Yes``).
 918 | 
 919 | A short form starting with ``Is`` indicates a script or binary property:
 920 | 
 921 | * ``IsLatin``, the 'Latin' script (``Script=Latin``).
 922 | 
 923 | * ``IsAlphabetic``, the 'Alphabetic' binary property (``Alphabetic=Yes``).
 924 | 
 925 | A short form starting with ``In`` indicates a block property:
 926 | 
 927 | * ``InBasicLatin``, the 'BasicLatin' block (``Block=BasicLatin``).
 928 | 
 929 | POSIX character classes
 930 | ^^^^^^^^^^^^^^^^^^^^^^^
 931 | 
 932 | ``[[:alpha:]]``; ``[[:^alpha:]]``
 933 | 
 934 | POSIX character classes are supported. These are normally treated as an alternative form of ``\p{...}``.
 935 | 
 936 | The exceptions are ``alnum``, ``digit``, ``punct`` and ``xdigit``, whose definitions are different from those of Unicode.
 937 | 
 938 | ``[[:alnum:]]`` is equivalent to ``\p{posix_alnum}``.
 939 | 
 940 | ``[[:digit:]]`` is equivalent to ``\p{posix_digit}``.
 941 | 
 942 | ``[[:punct:]]`` is equivalent to ``\p{posix_punct}``.
 943 | 
 944 | ``[[:xdigit:]]`` is equivalent to ``\p{posix_xdigit}``.
 945 | 
 946 | Search anchor ``\G``
 947 | ^^^^^^^^^^^^^^^^^^^^
 948 | 
 949 | A search anchor has been added. It matches at the position where each search started/continued and can be used for contiguous matches or in negative variable-length lookbehinds to limit how far back the lookbehind goes:
 950 | 
 951 | .. sourcecode:: python
 952 | 
 953 |   >>> regex.findall(r"\w{2}", "abcd ef")
 954 |   ['ab', 'cd', 'ef']
 955 |   >>> regex.findall(r"\G\w{2}", "abcd ef")
 956 |   ['ab', 'cd']
 957 | 
 958 | * The search starts at position 0 and matches 'ab'.
 959 | 
 960 | * The search continues at position 2 and matches 'cd'.
 961 | 
 962 | * The search continues at position 4 and fails to match any letters.
 963 | 
 964 | * The anchor stops the search start position from being advanced, so there are no more results.
 965 | 
 966 | Reverse searching
 967 | ^^^^^^^^^^^^^^^^^
 968 | 
 969 | Searches can also work backwards:
 970 | 
 971 | .. sourcecode:: python
 972 | 
 973 |   >>> regex.findall(r".", "abc")
 974 |   ['a', 'b', 'c']
 975 |   >>> regex.findall(r"(?r).", "abc")
 976 |   ['c', 'b', 'a']
 977 | 
 978 | Note that the result of a reverse search is not necessarily the reverse of a forward search:
 979 | 
 980 | .. sourcecode:: python
 981 | 
 982 |   >>> regex.findall(r"..", "abcde")
 983 |   ['ab', 'cd']
 984 |   >>> regex.findall(r"(?r)..", "abcde")
 985 |   ['de', 'bc']
 986 | 
 987 | Matching a single grapheme ``\X``
 988 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 989 | 
 990 | The grapheme matcher is supported. It conforms to the Unicode specification at ``http://www.unicode.org/reports/tr29/``.
 991 | 
 992 | Branch reset ``(?|...|...)``
 993 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 994 | 
 995 | Group numbers will be reused across the alternatives, but groups with different names will have different group numbers.
 996 | 
 997 | .. sourcecode:: python
 998 | 
 999 |   >>> regex.match(r"(?|(first)|(second))", "first").groups()
1000 |   ('first',)
1001 |   >>> regex.match(r"(?|(first)|(second))", "second").groups()
1002 |   ('second',)
1003 | 
1004 | Note that there is only one group.
1005 | 
1006 | Default Unicode word boundary
1007 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1008 | 
1009 | The ``WORD`` flag changes the definition of a 'word boundary' to that of a default Unicode word boundary. This applies to ``\b`` and ``\B``.
1010 | 
1011 | Timeout
1012 | ^^^^^^^
1013 | 
1014 | The matching methods and functions support timeouts. The timeout (in seconds) applies to the entire operation:
1015 | 
1016 | .. sourcecode:: python
1017 | 
1018 |   >>> from time import sleep
1019 |   >>>
1020 |   >>> def fast_replace(m):
1021 |   ...     return 'X'
1022 |   ...
1023 |   >>> def slow_replace(m):
1024 |   ...     sleep(0.5)
1025 |   ...     return 'X'
1026 |   ...
1027 |   >>> regex.sub(r'[a-z]', fast_replace, 'abcde', timeout=2)
1028 |   'XXXXX'
1029 |   >>> regex.sub(r'[a-z]', slow_replace, 'abcde', timeout=2)
1030 |   Traceback (most recent call last):
1031 |     File "<stdin>", line 1, in <module>
1032 |     File "C:\Python310\lib\site-packages\regex\regex.py", line 278, in sub
1033 |       return pat.sub(repl, string, count, pos, endpos, concurrent, timeout)
1034 |   TimeoutError: regex timed out
1035 | 


--------------------------------------------------------------------------------
/changelog.txt:
--------------------------------------------------------------------------------
   1 | Version: 2025.5.18
   2 | 
   3 |     Updated main.yml to build Windows ARM64/aarch64 wheel.
   4 | 
   5 |     Updated licence text format in pyproject.toml.
   6 | 
   7 | Version: 2025.2.13
   8 | 
   9 |     Dropping support for Python 3.8 and removing it from main.yml.
  10 | 
  11 | Version: 2025.2.12
  12 | 
  13 |     Further fixes to main.yml.
  14 | 
  15 | Version: 2025.2.11
  16 | 
  17 |     Updated main.yml to Artifacts v4.
  18 | 
  19 | Version: 2025.2.10
  20 | 
  21 |     Git issue 551: Infinite loop on V1 search
  22 | 
  23 |     It's catastrophic backtracking due to the possibilities of full casefolding.
  24 | 
  25 |     Replacing `[\s\S]` with `(?s:.)` can help, but a proper fix would be more difficult.
  26 | 
  27 | Version: 2024.11.7
  28 | 
  29 |     Updated pyproject.toml and setup.py according to PEP 517.
  30 | 
  31 | Version: 2024.11.6
  32 | 
  33 |     Git issue 546: Partial match not working in some instances with non-greedy capture
  34 | 
  35 | Version: 2024.9.14
  36 | 
  37 |     Reverted to actions/download-artifact@v3 and actions/upload-artifact@v3 in main.yml because GitHub Actions failed when using them.
  38 | 
  39 | Version: 2024.9.13
  40 | 
  41 |     Updated to actions/upload-artifact@v4 in main.yml.
  42 | 
  43 | Version: 2024.9.12
  44 | 
  45 |     Updated to actions/download-artifact@v4 in main.yml.
  46 | 
  47 | Version: 2024.9.11
  48 | 
  49 |     Updated to Unicode 16.0.0.
  50 | 
  51 | Version: 2024.7.24
  52 | 
  53 |     Git issue 539: Bug: Partial matching fails on a simple example
  54 | 
  55 | Version: 2024.6.22
  56 | 
  57 |     Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation
  58 | 
  59 | Version: 2024.5.15
  60 | 
  61 |     Git issue 530: hangs with fuzzy and optionals
  62 | 
  63 |     It's not hanging, it'll finish eventually. It's just an example of catastrophic backtracking.
  64 | 
  65 |     The error printed when Ctrl+C is pressed does show a bug, though, which is now fixed.
  66 | 
  67 | Version: 2024.5.10
  68 | 
  69 |     Updated for Python 3.13.
  70 | 
  71 |     <time.h> now needs to be included explicitly because Python.h no longer includes it.
  72 | 
  73 | Version: 2024.4.28
  74 | 
  75 |     Git issue 527: `VERBOSE`/`X` flag breaks `\N` escapes
  76 | 
  77 | Version: 2024.4.16
  78 | 
  79 |     Git issue 525: segfault when fuzzy matching empty list
  80 | 
  81 | Version: 2023.12.25
  82 | 
  83 |     Cannot get release notification action in main.yml to work. Commenting it out for now.
  84 | 
  85 | Version: 2023.12.24
  86 | 
  87 |     Fixed invalid main.yml.
  88 | 
  89 | Version: 2023.12.23
  90 | 
  91 |     The escape function no longer escapes \x00. It's not necessary.
  92 | 
  93 |     Inline flags can now be turned off and apply to what follows.
  94 | 
  95 |     Added \R to match line endings.
  96 | 
  97 | Version: 2023.10.3
  98 | 
  99 |     Updated to Unicode 15.1.0.
 100 | 
 101 | Version: 2023.8.8
 102 | 
 103 |     Git issue 508: Regex doesn't build using CPython main (3.13.0a0)
 104 |     Removed usage of _PyBytes_Join and did a little tidying of the code that makes the result string.
 105 | 
 106 | Version: 2023.6.3
 107 | 
 108 |     Git issue 498: Conditional negative lookahead inside positive lookahead fails to match
 109 |     Conditional node needed an additional member that points to the true branch.
 110 | 
 111 | Version: 2023.5.5
 112 | 
 113 |     Removed semicolon after 'else' in 'munge_name'.
 114 | 
 115 | Version: 2023.5.4
 116 | 
 117 |     Fixed pyproject.toml and setup.py.
 118 | 
 119 | Version: 2023.5.3
 120 | 
 121 |     pyproject.toml was missing.
 122 | 
 123 | Version: 2023.5.2
 124 | 
 125 |     Added pyproject.toml.
 126 | 
 127 | Version: 2023.3.23
 128 | 
 129 |     Git issue 495: Running time for failing fullmatch increases rapidly with input length
 130 |     Re-enabled modified repeat guards due to regression in speed caused by excessive backtracking.
 131 | 
 132 | Version: 2023.3.22
 133 | 
 134 |     Git issue 494: Backtracking failure matching regex `^a?(a?)b?c\1$` against string `abca`
 135 |     Disabled repeat guards. They keep causing issues, and it's just simpler to rely on timeouts.
 136 | 
 137 | Version: 2022.10.31
 138 | 
 139 |     Updated text for supported Unicode and Python versions.
 140 | 
 141 | Version: 2022.9.13
 142 | 
 143 |     Updated to Unicode 15.0.0.
 144 | 
 145 | Version: 2022.9.11
 146 | 
 147 |     Updated version.
 148 | 
 149 | Version: 2022.8.17
 150 | 
 151 |     Git issue 477: \v for vertical spacing
 152 | 
 153 |     Added \p{HorizSpace} (\p{H}) and \p{VertSpace} (\p{V}).
 154 | 
 155 | Version: 2022.7.25
 156 | 
 157 |     Git issue 475: 2022.7.24 improperly released
 158 | 
 159 |     The file https://pypi.org/pypi/regex/2022.7.24/json was missing references to most of the wheels, so this is a new release in the hope that it was just a glitch in GitHub Actions.
 160 | 
 161 | Version: 2022.7.24
 162 | 
 163 |     Git issue 474: regex has no equivalent to re.Match.groups() for captures
 164 | 
 165 |     Added 'allcaptures' and 'allspans' methods to match objects.
 166 | 
 167 |     Fixed bug where compiling a pattern didn't always check for unused arguments.
 168 | 
 169 | Version: 2022.7.9
 170 | 
 171 |     Git issue 473: Emoji classified as letter
 172 | 
 173 |     The values for GC:Assigned and GC:LC were flipped.
 174 | 
 175 | Version: 2022.6.2
 176 | 
 177 |     Git issue 472: Revisit compilation flag to prevent adding a single explicitly compiled regex to the cache
 178 | 
 179 |     Added 'cache_pattern' parameter to 'compile' function to improve use of the cache.
 180 | 
 181 | Version: 2022.4.24
 182 | 
 183 |     Git issue 467: Scoped inline flags 'a', 'u' and 'L' affect global flags
 184 | 
 185 |     Those flags can now be scoped.
 186 | 
 187 | Version: 2022.3.15
 188 | 
 189 |     Git issue 457: Difference with `re`, when repl returns None
 190 | 
 191 |     Make regex consistent with re by treating a replacement template of None as ''.
 192 | 
 193 |     Also, now rejects invalid ASCII escapes like re module does.
 194 | 
 195 | Version: 2022.3.2
 196 | 
 197 |     Git issue 453: Document last supported python2 version
 198 | 
 199 |     Added a brief reference to the last version to support Python 2 in README.rst.
 200 | 
 201 |     Git issue 456: RegexFlag exists in re, but not regex
 202 | 
 203 |     Updated the flags to use enum now that regex supports only Python 3.6+.
 204 | 
 205 | Version: 2022.1.21
 206 | 
 207 |     Added 'python_requires' to setup.py now that Python 2 is no longer supported.
 208 | 
 209 | Version: 2022.1.18
 210 | 
 211 |     Updated version for new release.
 212 | 
 213 | Version: 2022.1.18
 214 | 
 215 |     * Dropped support for Python 2 and removed all references to Python <3.6, the earliest supported version.
 216 | 
 217 |     Removed Features.rst, which was just a duplicate of README.rst.
 218 | 
 219 | Version: 2021.11.9
 220 | 
 221 |     Git issue 443: 2021.11.9 source release is missing C headers
 222 | 
 223 |     Updated version.
 224 | 
 225 | Version: 2021.11.9
 226 | 
 227 |     Git issue 442: Fuzzy regex matching doesn't seem to test insertions correctly
 228 | 
 229 | Version: 2021.11.2
 230 | 
 231 |     Git issue 435: Unmatched groups: sub vs subf
 232 | 
 233 |     A similar fix also applies to expandf: unmatched groups should expand to an empty string.
 234 | 
 235 | Version: 2021.11.2
 236 | 
 237 |     Removed unused functions.
 238 | 
 239 |     Added long description type to setup.py.
 240 | 
 241 | Version: 2021.11.1
 242 | 
 243 |     Further changes for migration to GitHub.
 244 | 
 245 | Version: 2021.10.23
 246 | 
 247 |     Git issue 433: Disagreement between fuzzy_counts and fuzzy_changes
 248 | 
 249 |     Fuzzy changes were sometimes not removed when backtracking.
 250 | 
 251 | Version: 2021.10.21
 252 | 
 253 |     Removed Apple Silicon build from .travis.yml because it's not currently codesigned by Travis CI.
 254 | 
 255 | Version: 2021.10.8
 256 | 
 257 |     Git issue 428: match hangs on the following example - possible infinite loop?
 258 | 
 259 |     Fixed miscalculation of total error count when there's more than one fuzzy term.
 260 | 
 261 | Version: 2021.9.30
 262 | 
 263 |     Git issue 427: Possible bug with BESTMATCH
 264 | 
 265 | Version: 2021.9.24
 266 | 
 267 |     Updated to Unicode 14.0.0.
 268 | 
 269 | Version: 2021.8.27
 270 | 
 271 |     Git issue 421: 2021.8.27 results in "Fatal Python error: Segmentation fault"
 272 | 
 273 |     Fixed problems with use of fast searching tables in opposite direction.
 274 | 
 275 | Version: 2021.8.27
 276 | 
 277 |     Git issue 420: segmentation fault in finditer (maybe others)
 278 | 
 279 |     Fixed bugs in fast searches in reverse direction.
 280 | 
 281 | Version: 2021.8.21
 282 | 
 283 |     Updated version.
 284 | 
 285 | Version: 2021.8.3
 286 | 
 287 |     Forgot to update version!
 288 | 
 289 | Version: 2021.7.6
 290 | 
 291 |     Additional fix for Git issue 415.
 292 | 
 293 | Version: 2021.7.5
 294 | 
 295 |     Git issue 415: Fuzzy character restrictions don't apply to insertions at "right edge"
 296 | 
 297 | Version: 2021.7.1
 298 | 
 299 |     Git issue 407: API is not a drop-in replacement for python's re when it comes to typing
 300 | 
 301 |     Now exports Match object as well as Pattern object.
 302 | 
 303 |     Git issue 414: Memory optimization questions
 304 | 
 305 |     sys.getsizeof returns a more accurate size of a pattern object. It includes the size of internal data, but, as is the norm, does not include the size of public objects.
 306 | 
 307 | Version: 2021.4.4
 308 | 
 309 |     Git issue 408: regex fails with a quantified backreference but succeeds with repeated backref
 310 |     Git issue 407: API is not a drop-in replacement for python's re when it comes to typing
 311 | 
 312 | Version: 2021.3.17
 313 | 
 314 |     Git issue 403: Fuzzy matching with wrong distance (unnecessary substitutions)
 315 | 
 316 |     Reworked the fuzzy matching code.
 317 | 
 318 | Version: 2020.11.13
 319 | 
 320 |     Git issue 394: Unexpected behaviour in fuzzy matching with limited character set with IGNORECASE flag
 321 | 
 322 | Version: 2020.11.11
 323 | 
 324 |     Update version.
 325 | 
 326 | Version: 2020.11.2
 327 | 
 328 |     Updated list of supported Python versions.
 329 | 
 330 |     Added .travis.yml file.
 331 | 
 332 | Version: 2020.10.28
 333 | 
 334 |     Git issue 362: Any LICENSE work for this project?
 335 | 
 336 |     Changed licence to Apache 2.0 and added licence file.
 337 | 
 338 | Version: 2020.10.23
 339 | 
 340 |     Git issue 387: Compilaton flag to avoid storing compiled regexp in internal cache
 341 | 
 342 |     Slight reversion/revision. You can prevent explicitly-compiled patterns from being cached by using "cache_all(False)".
 343 | 
 344 | Version: 2020.10.22
 345 | 
 346 |     Git issue 387: Compilaton flag to avoid storing compiled regexp in internal cache
 347 | 
 348 |     No longer caches patterns that are compiled explicitly.
 349 | 
 350 | Version: 2020.10.15
 351 | 
 352 |     Git issue 386: GCC 10 warnings
 353 | 
 354 |     Fixed bugs in fuzzy_match_string_fld and fuzzy_match_group_fld.
 355 | 
 356 |     Added more braces around data in some Unicode tables.
 357 | 
 358 | Version: 2020.10.11
 359 | 
 360 |     Git issue 385: Comments in expressions
 361 | 
 362 |     Didn't parse regex comments properly when in VERBOSE mode.
 363 | 
 364 | Version: 2020.9.27
 365 | 
 366 |     Git issue 383: Memory Error - regex.findall
 367 | 
 368 |     The problem was caused by a lazy repeat looping forever, growing the backtracking stack. Greedy repeats were OK.
 369 | 
 370 | Version: 2020.7.14
 371 | 
 372 |     Git issue 377: request: \h for horizontal space
 373 | 
 374 |     Added \h as an alias to [[:blank:]].
 375 | 
 376 | Version: 2020.6.7
 377 | 
 378 |     Git issue 376: Is the \L option as efficient as it can be?
 379 | 
 380 |     Improved performance of string sets.
 381 | 
 382 | Version: 2020.6.7
 383 | 
 384 |     Git issue 376: Is the \L option as efficient as it can be?
 385 | 
 386 |     Switched StringSet to use fallback method due to inefficiencies in the engine. Needs more investigation.
 387 | 
 388 | Version: 2020.5.14
 389 | 
 390 |     Git issue 372: Regression from 2020.4.4 -> 2020.5.7 in non-fuzzy matching pattern
 391 | 
 392 |     Changed the 'state' member that's tested in is_repeat_guarded for a fuzzy match. The previously-used member wasn't initialised in a non-fuzzy match. The new test is a better one to use anyway.
 393 | 
 394 | Version: 2020.5.13
 395 | 
 396 |     Git issue 371: Specifying character set when fuzzy-matching allows characters not in the set
 397 | 
 398 |     fuzzy_ext_match and fuzzy_ext_match_group_fld didn't support sets!
 399 | 
 400 | Version: 2020.5.7
 401 | 
 402 |     Git issue 370: Confusions about Fuzzy matching behavior (prob a bug?)
 403 | 
 404 | Version: 2020.4.4
 405 | 
 406 |     Updated to Unicode 13.0.0.
 407 | 
 408 | Version: 2020.2.20
 409 | 
 410 |     Git issue 365: Memory leak occurs in fuzzy match at some substitution use cases
 411 | 
 412 | Version: 2020.2.18
 413 | 
 414 |     Git issue #364: Contradictory values in fuzzy_counts and fuzzy_changes
 415 | 
 416 | Version: 2020.1.7
 417 | 
 418 |     Issue 357: New exception "ValueError: unused keyword argument" breaks use case
 419 | 
 420 |     Added ignore_unused keyword argument.
 421 | 
 422 |     Issue 359: 2020.1.7 source distribution release contains \r\n line endings
 423 | 
 424 |     Fixed line endings for source distribution.
 425 | 
 426 |     Issue 360: Invalid modeline in `_regex.c`
 427 | 
 428 |     Removed vim modeline.
 429 | 
 430 | Version: 2020.1.7
 431 | 
 432 |     Fix to previous change.
 433 | 
 434 | Version: 2019.12.21
 435 | 
 436 |     Hg issue 353: fuzzy changes negative indexes
 437 | 
 438 |     Fuzzy change positions were off by 1 for deletions.
 439 | 
 440 | Version: 2019.12.18
 441 | 
 442 |     Another complaint from Linux.
 443 | 
 444 | Version: 2019.12.17
 445 | 
 446 |     New release and upload because of problem with source distribution.
 447 | 
 448 | Version: 2019.12.17
 449 | 
 450 |     New release and upload because of previous issues.
 451 | 
 452 | Version: 2019.12.17
 453 | 
 454 |     Make changes to setup.py.
 455 | 
 456 | Version: 2019.12.16
 457 | 
 458 |     Discarded changes for Linux.
 459 | 
 460 | Version: 2019.12.16
 461 | 
 462 |     Backed out changeset: f57e64d2085b
 463 | 
 464 | Version: 2019.12.15
 465 | 
 466 |     add bdist_wheel command to setup.py
 467 | 
 468 | Version: 2019.12.9
 469 | 
 470 |     Hg issue 348: '\X' (extended grapheme cluster) can't pass Unicode's GraphemeBreakTest (12.1.0)
 471 | 
 472 |     Fixed a couple of bugs in unicode_at_grapheme_boundary.
 473 | 
 474 | Version: 2019.10.31
 475 | 
 476 |     Made "Additional Features" linkable.
 477 | 
 478 |     Updated setup.
 479 | 
 480 |     Minor bug fix.
 481 | 
 482 | Version: 2019.8.19
 483 | 
 484 |     Hg issue 338: specifying allowed characters when fuzzy-matching
 485 | 
 486 |     Added character testing to a fuzzy constraint.
 487 | 
 488 | Version: 2019.6.8
 489 | 
 490 |     Hg issue 333: error when installing regex on PyPy2.7 v7.1.1 on Windows
 491 | 
 492 |     PyPy isn't officially supported, but this might fix it!
 493 | 
 494 | Version: 2019.6.5
 495 | 
 496 |     Updated for Python 3.8.
 497 | 
 498 | Version: 2019.6.2
 499 | 
 500 |     Updated to Unicode 12.1.0.
 501 | 
 502 | Version: 2019.5.25
 503 | 
 504 |     Hg issue 329: Wrong group matches when question mark quantifier is used within a look behind
 505 | 
 506 |     REPEAT_ONE was backtracking in the wrong direction, so it never hit the limit.
 507 | 
 508 | Version: 2019.4.14
 509 | 
 510 |     Hg issue 327: .fullmatch() causes MemoryError
 511 | 
 512 |     For fullmatch, added check for end/start of string for RE_OP_SUCCESS in try_match.
 513 | 
 514 | Version: 2019.4.12
 515 | 
 516 |     Missing brace in 'state_fini'.
 517 | 
 518 | Version: 2019.4.10
 519 | 
 520 |     Hg issue 325: module docstring not accessible
 521 | 
 522 |     Additional fix for regex_3/regex.py.
 523 | 
 524 |     Hg issue 326: Version is out of sync with PyPI
 525 | 
 526 | Version: 2019.3.12
 527 | 
 528 |     Hg issue 319: Support for a timespan parameter
 529 | 
 530 |     Added timeout parameter in Python 3. TimeoutError was added in Python 3.3, and as Python 2.7 will soon reach EOL, I'm not bothered about supporting timeouts for Python 2.
 531 | 
 532 | Version: 2019.3.9
 533 | 
 534 |     Hg issue 320: Abnormal performance
 535 | 
 536 |     Forgot about negative lookarounds! Previous change now applies only to positive lookarounds.
 537 | 
 538 | Version: 2019.3.8
 539 | 
 540 |     Hg issue 320: Abnormal performance
 541 | 
 542 |     Included firstset from lookaround, where appropriate.
 543 | 
 544 | Version: 2019.2.21
 545 | 
 546 |     Hg issue 316: __version__ no longer accessible via regex.__version__
 547 | 
 548 | Version: 2019.2.20
 549 | 
 550 |     Hg issue 314: Import error: "No module named regex._regex_core"
 551 | 
 552 |     Was OK for wheels, but not for setup.py.
 553 | 
 554 | Version: 2019.2.19
 555 | 
 556 |     Hg issue 313: test_regex.py ends up in site-packages/test_regex.py
 557 | 
 558 |     Tidied files away into subfolder in site-packages.
 559 | 
 560 | Version: 2019.2.18
 561 | 
 562 |     Fixed bug in unicode_at_grapheme_boundary.
 563 | 
 564 | Version: 2019.2.7
 565 | 
 566 |     Moved some StateData declarations to make the code conform to the C89 standard.
 567 | 
 568 | Version: 2019.2.6
 569 | 
 570 |     Lookarounds no longer save the repeat data. Lookarounds no longer save captures if they don't contain any groups.
 571 | 
 572 | Version: 2019.2.5
 573 | 
 574 |     Atomic groups no longer save the repeat data; that proved to be unnecessary.
 575 | 
 576 | Version: 2019.2.3
 577 | 
 578 |     Further improvements to the new code.
 579 | 
 580 | Version: 2019.1.24
 581 | 
 582 |     Hg issue 308: infinite search
 583 | 
 584 |     Fixed a re-allocation bug.
 585 | 
 586 | Version: 2019.1.23
 587 | 
 588 |     Major overhaul of code to use simpler stacks. The result is now much easier to understand and maintain!
 589 | 
 590 | Version: 2018.11.22
 591 | 
 592 |     Hg issue 304: Unreasonable edge case that used to work
 593 | 
 594 |     Now moves the minimum number of repeats out of a repeat if it contains a repeat. This allows a repeat guard to be put back, which reduces the chance of catastrophic backtracking.
 595 | 
 596 | Version: 2018.11.7
 597 | 
 598 |     Hg issue 301: TypeError: character mapping must return integer, None or unicode
 599 | 
 600 |     Fixed bug introduced by broken workflow.
 601 | 
 602 | Version: 2018.11.6
 603 | 
 604 |     Hg issue 300: segmentation fault
 605 | 
 606 |     Fixed a problem with not recording all fuzzy changes.
 607 | 
 608 |     Also fixed the check for prefix/suffix in branches: when fuzzy subpatterns were compared, their constraints weren't compared.
 609 | 
 610 | Version: 2018.11.3
 611 | 
 612 |     Hg issue 299
 613 | 
 614 |     Reworked the fix to perform a normal match and then fall back to a partial match if that was originally requested.
 615 | 
 616 | Version: 2018.11.2
 617 | 
 618 |     Hg issue 299: Partial gives misleading results with "open ended" regexp
 619 | 
 620 |     Added checks for end of text at start and end of repeats.
 621 | 
 622 | Version: 2018.8.29
 623 | 
 624 |     Hg issue 293: scx (Script Extensions) property currently matches incorrectly
 625 | 
 626 |     One of the tables for Script Extensions was partly incorrectly ordered.
 627 | 
 628 | Version: 2018.8.17
 629 | 
 630 |     Hg issue 291: Include Script Extensions as a supported Unicode property
 631 | 
 632 |     Added the Unicode Script Extensions property.
 633 | 
 634 | Version: 2018.7.11
 635 | 
 636 |     Hg issue #289: Regex and Python typing
 637 | 
 638 |     Added types for Pattern and Match.
 639 | 
 640 |     Hg issue #290: Turkish locale causes import of regex to fail
 641 | 
 642 |     str (bytestring) in Python 2 is locale-sensitive. Added a function to uppercase ASCII-range letters in a locale-insensitive way.
 643 | 
 644 | Version: 2018.6.21
 645 | 
 646 |     Hg issue 286: Regex matches with `re` but not with `regex` module
 647 | 
 648 | Version: 2018.6.20
 649 | 
 650 |     The reported positions of fuzzy changes were sometimes incorrect.
 651 | 
 652 | Version: 2018.6.9
 653 | 
 654 |     Updated Unicode word and grapheme boundaries for Unicode 11.0.0, which I had overlooked... :-(
 655 | 
 656 | Version: 2018.6.6
 657 | 
 658 |     Correction to filenames.
 659 | 
 660 | Version: 2018.6.5
 661 | 
 662 |     Updated to Unicode 11.0.0.
 663 | 
 664 | Version: 2018.2.21
 665 | 
 666 |     Hg issue 276: Partial Matches yield incorrect matches and bounds
 667 | 
 668 |     Fixed an off-by-one bug where a lazy repeat is followed by a character (quick check).
 669 | 
 670 | Version: 2018.2.8
 671 | 
 672 |     Hg issue 273: Missing unicode normalization quick check properties
 673 | 
 674 |     The Unicode normalization quick check properties weren't handled correctly.
 675 | 
 676 | Version: 2018.2.3
 677 | 
 678 |     Hg issue 271: Comment logic different between Re and Regex
 679 |     Hg issue 273: Missing unicode normalization quick check properties
 680 | 
 681 |     Made comments consistent with re module.
 682 | 
 683 |     Added more Unicode properties.
 684 | 
 685 | Version: 2018.1.10
 686 | 
 687 |     Further changes to match re module's behaviour on zero-width matching for Python 3.7.
 688 | 
 689 |     Changes to the locations of the source files.
 690 | 
 691 | Version: 2017.12.12
 692 | 
 693 |     Hg issue 268: Update the escape() documentation
 694 | 
 695 |     Added documentation for escape.
 696 | 
 697 |     Hg issue 269: Building a bdist using setuptools throws an error.
 698 | 
 699 |     Moved source files from subfolders to main folders for Python versions.
 700 | 
 701 | Version: 2017.12.9
 702 | 
 703 |     Further changes to match re module's behaviour on zero-width matching for Python 3.7.
 704 | 
 705 | Version: 2017.12.5
 706 | 
 707 |     Hg issue 266: fuzzy match alignment recovery
 708 | 
 709 |     Added 'fuzzy_changes' attribute to match object to indicate positions of changes in fuzzy match.
 710 | 
 711 |     Stopped supporting Python 2.5 and Python 3.1-3.2.
 712 | 
 713 |     Made changes to zero-width matching for Python 3.7.
 714 | 
 715 | Version: 2017.11.9
 716 | 
 717 |     Hg issue 264: Failure to import regex in pypy3-5.8.0
 718 | 
 719 | Version: 2017.11.8
 720 | 
 721 |     Hg issue 264: Failure to import regex in pypy3-5.8.0
 722 |     Hg issue 265: Invalid pointer in munmap_chunk (core dump) for specific inputs
 723 | 
 724 | Version: 2017.9.23
 725 | 
 726 |     Hg issue 253: Run into error under PyPy 5.8.0
 727 | 
 728 | Version: 2017.8.1
 729 | 
 730 |     Hg issue #240: Unable to build the project from source on OSX with PyPy3.5-5.7.1-beta
 731 | 
 732 |     Needed to compensate for the differences between CPython and PyPy.
 733 | 
 734 | Version: 2017.4.6
 735 | 
 736 |     Added setup.py for building from the Hg working directory.
 737 | 
 738 | Version: 2017.4.6
 739 | 
 740 |     Hg issue 236: Incorrect references to bugs.python.org issues
 741 | 
 742 | Version: 2017.2.8
 743 | 
 744 |     Failed to build on AIX using xlc because FALSE and TRUE were already #define'd. Replaced enum {FALSE, TRUE} with #define's.
 745 | 
 746 | Version: 2017.1.17
 747 | 
 748 |     Hg issue 230: Is it a bug of (?(DEFINE)...)
 749 | 
 750 |     Capture groups in (?(DEFINE)...) shouldn't be treated as 'visible' capture groups by .findall.
 751 | 
 752 | Version: 2017.1.14
 753 | 
 754 |     Hg issue 227: Performance trap of (?V1i) flags
 755 | 
 756 |     Further improvements. It now tries to split full case-folded literals into simple and full case-folded literals where full case-folding isn't needed because simple case-folding is faster.
 757 | 
 758 | Version: 2017.1.13
 759 | 
 760 |     Hg issue 227: Performance trap of (?V1i) flags
 761 | 
 762 | Version: 2016.12.27
 763 | 
 764 |     Hg issue 227: Incorrect behavior for ? operator with UNICODE + IGNORECASE
 765 | 
 766 |     'end_pos' wasn't always initialised in 'locate_required_string'.
 767 | 
 768 | Version: 2016.11.21
 769 | 
 770 |     Hg issue 226: Error matching at start of string
 771 | 
 772 |     Fuzzy matching of zero-width items wasn't quite right.
 773 | 
 774 | Version: 2016.11.18
 775 | 
 776 |     Hg issue 225: BESTMATCH in fuzzy match not working
 777 | 
 778 | Version: 2016.10.22
 779 | 
 780 |     Hg issue 221: Got an exception using PyPy
 781 | 
 782 |     pypy2-v5.4.1 appears not to support the buffer protocol on bytestrings (str in Python 2), so added code to handle them analogously to how unicode strings are handled.
 783 | 
 784 | Version: 2016.10.20
 785 | 
 786 |     Make setup use setuptools if it's available.
 787 | 
 788 | Version: 2016.10.12
 789 | 
 790 |     Hg issue 221: Got an exception using PyPy
 791 | 
 792 |     Added check for error in fold_case which, I hope, will reveal the exception that it's not reporting.
 793 | 
 794 | Version: 2016.9.22
 795 | 
 796 |     Hg issue 220: Misbehavior of group capture with OR operand
 797 | 
 798 |     Repeats should not be factored out of branches because a branch should be exhausted before trying the next one.
 799 | 
 800 | Version: 2016.9.13
 801 | 
 802 |     Also supported on Python 3.6.
 803 | 
 804 | Version: 2016.8.27
 805 | 
 806 |     Hg issue 219: Unicode word boundries
 807 | 
 808 |     For a Unicode word boundary (UAX #29), apostrophe in rule WB5a should include both U+0027 (APOSTROPHE) and U+2019 (RIGHT SINGLE QUOTATION MARK /
 809 |     curly apostrophe).
 810 | 
 811 | Version: 2016.7.21
 812 | 
 813 |     Hg issue 217: Core dump in conditional ahead match and matching \! character
 814 | 
 815 |     Fixed bug where it incorrectly tried to restore saved groups when none saved.
 816 | 
 817 | Version: 2016.7.14
 818 | 
 819 |     Hg Issue 216: Invalid match when using negative lookbehind and pipe
 820 | 
 821 |     The creation and position of a branch firstset wasn't always correct.
 822 | 
 823 | Version: 2016.6.25
 824 | 
 825 |     Updated to support Unicode 9.0.0.
 826 | 
 827 | Version: 2016.6.19
 828 | 
 829 |     Hg issue 214: tests failure when using python debug flavor
 830 | 
 831 |     Further tweaks re error handling.
 832 | 
 833 | Version: 2016.6.14
 834 | 
 835 |     Hg issue 213: Segmentation Fault
 836 | 
 837 |     Info about atomic groups wasn't pushed properly.
 838 | 
 839 | Version: 2016.6.5
 840 | 
 841 |     Hg issue 212: Unexpected matching difference with .*? between re and regex
 842 | 
 843 |     In 'add_repeat_guards', it wasn't propagating the status from the tail back towards the head across default nodes, so the guards weren't always correct.
 844 | 
 845 | Version: 2016.6.2
 846 | 
 847 |     Hg issue 211: Segmentation fault with recursive matches and atomic groups
 848 | 
 849 |     It wasn't saving the call frame and then restoring it when backtracking out of atomic groups.
 850 | 
 851 | Version: 2016.5.23
 852 | 
 853 |     Hg issue 206: Incompatible with re if single { in the pattern
 854 | 
 855 |     Brought the regex module's handling more in line with the re module. It now more readily treats an invalid fuzzy constraint as a literal.
 856 | 
 857 | Version: 2016.5.15
 858 | 
 859 |     Hg issue 208: Named list, (?ri) flags, Backreference
 860 | 
 861 |     Fixed more issues with the alignment of text in a buffer when using named lists.
 862 | 
 863 |     Also changed how a compiled regex is pickled to use a bytestring for the packed code list. (It was actually pickling an unpacked list of ints.)
 864 | 
 865 | Version: 2016.5.14
 866 | 
 867 |     Hg issue 205: Named list and (?ri) flags
 868 | 
 869 |     string_set_match_ign_fwdrev wasn't taking into account that it fills the folded buffer from the end when searching in reverse.
 870 | 
 871 | Version: 2016.5.13
 872 | 
 873 |     Hg issue 204: confusion of (?aif) flags
 874 | 
 875 |     The FULLCASE flag is now ignored if the ASCII flag is turned on.
 876 | 
 877 | Version: 2016.4.25
 878 | 
 879 |     Hg issue 203: partial matching bug
 880 | 
 881 |     The text position wasn't always set correctly before returning the status.
 882 | 
 883 | Version: 2016.4.16
 884 | 
 885 |     Hg issue 201: ENHANCEMATCH crashes interpreter
 886 | 
 887 |     Fixed an issue with restoring group captures.
 888 | 
 889 | Version: 2016.4.7
 890 | 
 891 |     Hg issue 199: Segfault in re.compile
 892 | 
 893 |     Removed copies of groups that weren't called.
 894 | 
 895 | Version: 2016.4.3
 896 | 
 897 |     Hg issue 197: ValueError in regex.compile
 898 |     Hg issue 198: ValueError in regex.compile
 899 | 
 900 |     It wasn't catching ValueError and then raising regex.error.
 901 | 
 902 | Version: 2016.4.2
 903 | 
 904 |     Hg issue 196: Fuzzy matching on repeated regex not working as expected
 905 | 
 906 |     Also reduced memory usage of pickle data.
 907 | 
 908 | Version: 2016.3.31
 909 | 
 910 |     Hg issue #194: .FULLCASE and Backreference
 911 | 
 912 |     Capture groups failed to match when using full case folding because of a bug in the handling of "I" (which needs to be treated specially to cope with the Turkic I).
 913 | 
 914 | Version: 2016.3.26
 915 | 
 916 |     Hg issue #193: Alternation and .REVERSE flag.
 917 | 
 918 |     The firstset before a branch was at the wrong end for a reverse pattern.
 919 | 
 920 |     Added back some tests that were accidentally omitted.
 921 | 
 922 | Version: 2016.3.24
 923 | 
 924 |     Hg issue 192: Named lists reverse matching doesn't work with IGNORECASE and V1
 925 | 
 926 |     string_set_match_fld_fwdrev wasn't taking into account that it fills the folded buffer from the end when searching in reverse.
 927 | 
 928 | Version: 2016.3.2
 929 | 
 930 |     Hg issue 190: Regression? Neverending regexp when upgrading to latest version.
 931 | 
 932 |     The fix for Hg issue 187 wasn't quite right. Managed to remove recursion entirely from 'add_repeat_guards'.
 933 | 
 934 | Version: 2016.2.25
 935 | 
 936 |     Hg issue 188: Crash during search
 937 | 
 938 |     Stopped calling 'try_match' recursively when the tail of a branch is a branch. Increasing alternatives to 40000 caused the stack to overflow.
 939 | 
 940 | Version: 2016.2.24
 941 | 
 942 |     Hg issue 187: Crash on Anaconda Python if large number of pattern
 943 | 
 944 |     Removed the recursion in 'use_nodes'. Increasing alternatives to 50000 caused the stack to overflow.
 945 | 
 946 | Version: 2016.2.23
 947 | 
 948 |     Hg issue 187: Crash on Anaconda Python if large number of pattern
 949 | 
 950 |     Reduced the amount of recursion in 'add_repeat_guards'. The large number of alternatives (25154) caused the stack to overflow.
 951 | 
 952 | Version: 2016.1.10
 953 | 
 954 |     Hg issue 177: Build fails on pypy (OS X 10.11, clang)
 955 | 
 956 |     Stripped out the #if...#endif that was added for Hg issue 135, which is no longer needed by more recent versions of PyPy.
 957 | 
 958 | Version: 2015.11.22
 959 | 
 960 |     Hg issue 180: bug of POSIX matching
 961 | 
 962 |     Fixed bug where the groups aren't always correct with POSIX matching.
 963 | 
 964 | Version: 2015.11.14
 965 | 
 966 |     Hg issue 172: Performance of V1 mode
 967 | 
 968 |     Fixed bug where RE_FLAG_FULLCASE wasn't turned off when RE_FLAG_IGNORECASE was turned off, leading to omission of first set.
 969 | 
 970 | Version: 2015.11.12
 971 | 
 972 |     Hg issue 171: Weird performance of V1 mode
 973 | 
 974 |     Fixed bug where RE_FLAG_FULLCASE wasn't turned off when RE_FLAG_IGNORECASE was turned off for required string, leading to required string not being used.
 975 | 
 976 | Version: 2015.11.9
 977 | 
 978 |     Hg issue 169: Performance
 979 | 
 980 |     Further tweaks to fuzzy matching.
 981 | 
 982 | Version: 2015.11.8
 983 | 
 984 |     Hg issue 167: Performance of Backreference
 985 | 
 986 |     No longer saves/restores groups or repeats around a lookaround if it doesn't contain any.
 987 | 
 988 | Version: 2015.11.7
 989 | 
 990 |     Hg issue 166: Performance
 991 | 
 992 |     Improved the performance of fuzzy matching.
 993 | 
 994 | Version: 2015.11.5
 995 | 
 996 |     Hg issue 165: Performance / hung search
 997 | 
 998 |     Made changes to fuzzy matching code, including refactoring the different kinds of fuzzy matching (exact/simple/enhanced/best).
 999 | 
1000 | Version: 2015.10.28
1001 | 
1002 |     Hg issue 163: allow lookarounds in conditionals
1003 | 
1004 |     Added support for a lookaround in a conditional pattern, e.g. r'(?(?=\d)\d+\b|\w+)'.
1005 | 
1006 | Version: 2015.10.22
1007 | 
1008 |     Hg issue 161: Unexpected fuzzy match results
1009 | 
1010 |     Fixed the bug and did some related tidying up.
1011 | 
1012 | Version: 2015.10.5
1013 | 
1014 |     Hg issue 158: Group issue with (?(DEFINE)...)
1015 | 
1016 |     The groups and repeats weren't restored properly when a lookaround completed and it contained a group call.
1017 | 
1018 | Version: 2015.10.1
1019 | 
1020 |     Hg issue 157: regression: segfault on complex lookaround
1021 | 
1022 |     Nested lookarounds/atomic groups didn't restore state correctly.
1023 | 
1024 | Version: 2015.9.28
1025 | 
1026 |     Hg issue 156: regression on atomic grouping
1027 | 
1028 |     It didn't initialise min_width when building the atomic group.
1029 | 
1030 | Version: 2015.9.23
1031 | 
1032 |     Hg issue 154: Segmentation fault 11 when working with an atomic group
1033 | 
1034 | Version: 2015.9.15
1035 | 
1036 |     Hg issue 150: Have an option for POSIX-compatible longest match of alternates
1037 | 
1038 |     Added POSIX matching (leftmost longest).
1039 | 
1040 | Version: 2015.7.20
1041 | 
1042 |     Hg issue #147: Fuzzy match can return match points beyond buffer end
1043 | 
1044 |     It wasn't checking for the edge of the text when case-folding and it was also advancing even when case-folding had failed.
1045 | 
1046 | Version: 2015.7.12
1047 | 
1048 |     Hg issue #146: Forced-fail (?!) works improperly in conditional
1049 | 
1050 |     Empty negative lookarounds weren't optimised correctly.
1051 | 
1052 |     The capture groups weren't being cleared before retrying after failure.
1053 | 
1054 | Version: 2015.6.24
1055 | 
1056 |     Hg issue #144: Latest version problem with matching 'R|R'
1057 | 
1058 |     The prefix of a set of branches was omitted.
1059 | 
1060 | Version: 2015.6.21
1061 | 
1062 |     Hg issue #143: Partial matches have incorrect span if prefix is '.' wildcard
1063 | 
1064 |     Didn't set state->match_pos if search_start returned a partial match status.
1065 | 
1066 | Version: 2015.6.19
1067 | 
1068 |     Updated to Unicode 8.0.
1069 | 
1070 |     Some performance tweaks.
1071 | 
1072 | Version: 2015.6.15
1073 | 
1074 |     Removed a few lines that should've been removed in the previous fix!
1075 | 
1076 | Version: 2015.6.14
1077 | 
1078 |     Fixed a bug where it could sometimes search for the same required string multiple times.
1079 | 
1080 | Version: 2015.6.10
1081 | 
1082 |     Hg issue #141: Crash on a certain partial match
1083 | 
1084 |     It didn't check the result of 'try_match' correctly in certain places (the status returned isn't limited to success and failure).
1085 | 
1086 | Version: 2015.6.9
1087 | 
1088 |     Hg issue #140: Replace with REVERSE and groups has unexpected behavior
1089 | 
1090 |     subx needed to add the template items to the list in reverse order when searching backwards because the list will be reversed after completion.
1091 | 
1092 | Version: 2015.6.4
1093 | 
1094 |     Hg issue #98: regex module is not thread safe because of _cache
1095 | 
1096 |     Now iterates over a snapshot of the cache keys in case the dict resizes.
1097 | 
1098 | Version: 2015.6.2
1099 | 
1100 |     Hg issue #139: Regular expression with multiple wildcards where first should match empty string does not always work
1101 | 
1102 |     The problem was caused by a negated character in the firstset, e.g. "[^a]".
1103 | 
1104 |     Fixed.
1105 | 
1106 | Version: 2015.5.28
1107 | 
1108 |     Hg issue #137: Posix character class :punct: does not seem to be supported
1109 | 
1110 |     It _is_ supported.
1111 | 
1112 |     Corrected Posix-style properties for 'alnum', 'digit', 'punct' and 'xdigit' which are different from those of Unicode. Now also available as \p{posix_alnum}, etc.
1113 | 
1114 |     Hg issue #138: grapheme anchored search not working properly
1115 | 
1116 |     Fixed.
1117 | 
1118 | Version: 2015.5.10
1119 | 
1120 |     Hg issue #136: Use a DFA when possible
1121 | 
1122 |     Made a slight tweak so that it now treats an optional subpattern like a repeated subpattern (e.g. "(?:xyz)?" -> "(?:xyz){0,1}") to make use of the repeat guards.
1123 | 
1124 |     The advantage over DFA is that it'll work even for those patterns that aren't compatible with DFA.
1125 | 
1126 | Version: 2015.5.7
1127 | 
1128 |     Hg issue 135: PyPy Support (with patch)
1129 | 
1130 |     It should now build on PyPy.
1131 | 
1132 | Version: 2015.3.18
1133 | 
1134 |     Hg issue 133: support for captures() in expandf().
1135 | 
1136 |     Now supported in expandf and subf.
1137 | 
1138 |     Issue 23692: Undocumented feature prevents re module from finding certain matches
1139 | 
1140 |     This also applied to regex, which failed to take into account group references by group capture tests, e.g. "(?(1)...|...)", when guarding against excessive repeats.
1141 | 
1142 | Version: 2014.12.24
1143 | 
1144 |     Hg issue 132: index out of range on null property \p{}
1145 | 
1146 |     It's now reported as an unknown property.
1147 | 
1148 | Version: 2014.12.15
1149 | 
1150 |     Hg issue 131: nested sets behaviour
1151 | 
1152 |     The set difference operator '--' wasn't handled correctly after an implicit set union.
1153 | 
1154 | Version: 2014.11.14
1155 | 
1156 |     Unreported issue: no such builtin as 'ascii' in Python 2. Fixed.
1157 | 
1158 | Version: 2014.11.13
1159 | 
1160 |     Hg issue 127: Infinite loop is found
1161 | 
1162 |     Not an infinite loop, but slow because of repeated backtracking on a very long chunk of text.
1163 | 
1164 |     This fix reduces the amount of backtracking and re-matching.
1165 | 
1166 | Version: 2014.11.3
1167 | 
1168 |     Hg issue 125: Reference to entire match (\g<0>) in Pattern.sub() doesn't work as of 2014.09.22 release.
1169 | 
1170 | Version: 2014.10.24
1171 | 
1172 |     Reverted licence.
1173 | 
1174 | Version: 2014.10.23
1175 | 
1176 |     Fixed bug in determining line number in regex.
1177 | 
1178 |     Changed licence to Apache License 2.0 and included copy in release.
1179 | 
1180 | Version: 2014.10.9
1181 | 
1182 |     Issue 22578: Add additional attributes to re.error
1183 | 
1184 |     Added the attributes .msg, .pattern, .pos, .lineno and .colno to the regex error class.
1185 | 
1186 | Version: 2014.10.7
1187 | 
1188 |     Fixed bug in partial matching when required string occurs after repeat.
1189 | 
1190 | Version: 2014.9.22
1191 | 
1192 |     Issue #22437: Added support for referring to a group by number using (?P=...). This is in addition to the existing \g<...>.
1193 | 
1194 |     Fixed bug in handling of cache for locale-sensitive patterns.
1195 | 
1196 | Version: 2014.9.18
1197 | 
1198 |     Adjusted line-endings in PKG-INFO.
1199 | 
1200 | Version: 2014.8.15
1201 | 
1202 |     Hg issue 115: Infinite loop when processing backreferences
1203 | 
1204 | Version: 2014.6.28
1205 | 
1206 |     Updated to Unicode 7.0.
1207 | 
1208 | Version: 2014.5.23
1209 | 
1210 |     Fixed fuzzy counts that were wrong when using BESTMATCH or ENHANCEMATCH flags.
1211 | 
1212 | Version: 2014.5.17
1213 | 
1214 |     Hg issue 112 in mrab-regex-hg: re: OK, but regex: SystemError
1215 | 
1216 | Version: 2014.4.10
1217 | 
1218 |     Hg issue 102: Partial matches
1219 |     Hg issue 109: Edit distance of fuzzy match
1220 | 
1221 |     Added partial matches.
1222 |     Added .fuzzy_counts attribute to match objects.
1223 | 
1224 | Version: 2014.2.19
1225 | 
1226 |     Unicode properties sometimes failed to match when the IGNORECASE flag was set.
1227 | 
1228 | Version: 2014.2.16
1229 | 
1230 |     Hg issue 108: Fails to build from source on s390x
1231 | 
1232 | Version: 2014.1.30
1233 | 
1234 |     Hg issue 106: * operator not working correctly with sub()
1235 | 
1236 |     Made to conform more to re module in version 0 behaviour.
1237 | 
1238 | Version: 2014.1.20
1239 | 
1240 |     Hg issue 105: FAIL: test_case_folding (__main__.RegexTests) with py2.7
1241 | 
1242 |     data.fold_len wasn't initialised.
1243 | 
1244 |     Also, deleted some unused functions.
1245 | 
1246 | Version: 2014.1.10
1247 | 
1248 |     Issue #17087: Improve the repr for regular expression match objects
1249 | 
1250 | Version: 2013.12.31
1251 | 
1252 |     Hg issue 101: findall() broken (seems like memory corruption)
1253 | 
1254 |     state->req_pos wasn't initialised when the required string is an initial string.
1255 | 
1256 | Version: 2013.11.29
1257 | 
1258 |     Hg issue 100: strange results from regex.search
1259 | 
1260 | Version: 2013.10.25
1261 | 
1262 |     Hg issue 98: regex module is not thread safe because of _cache
1263 | 
1264 | Version: 2013.10.24
1265 | 
1266 |     Further fixes for Hg issue 96.
1267 | 
1268 |     It should now correctly use sets for named lists, except when there's fuzzy matching.
1269 | 
1270 | Version: 2013.10.23
1271 | 
1272 |     Hg issue 96: compile '\L<...>' with 'i' flag was very slow
1273 | 
1274 | Version: 2013.10.22
1275 | 
1276 |     Hg issue 95: 'pos' for regex.error
1277 | 
1278 | Version: 2013.10.21
1279 | 
1280 |     Python crashes when executing regex updates pattern.findall
1281 | 
1282 | Version: 2013.10.12
1283 | 
1284 |     Updated to Unicode 6.3.
1285 | 
1286 | Version: 2013.10.4
1287 | 
1288 |     Issue #18468: re.group() should never return a bytearray
1289 | 
1290 |     Applies to Python 3.4 and later for compatibility with the re module.
1291 | 
1292 |     Also, some performance improvements.
1293 | 
1294 | Version: 2013.8.4
1295 | 
1296 |     Update for Python 3.4a1 release.
1297 | 
1298 | Version: 2013.6.26
1299 | 
1300 |     Performance improvements.
1301 | 
1302 | Version: 2013.6.5
1303 | 
1304 |     Hg issue 92: running the following regex causes a segfault
1305 | 
1306 | Version: 2013.5.21
1307 | 
1308 |     Hg issue 91: match.expand is extremely slow
1309 | 
1310 |     Also tidied up code.
1311 | 
1312 | Version: 2013.3.11
1313 | 
1314 |     Hg issue 89: Certain regexes extremely slow compared to re module
1315 | 
1316 |     Disabled one of the optimisations that appears to cause performance problems.
1317 | 
1318 | Version: 2013.2.22
1319 | 
1320 |     Fixed issue with LOCALE flag not working properly.
1321 | 
1322 | Version: 2013.2.16
1323 | 
1324 |     Fixed a locale-specific test. Whether b'\xE0' is a word character depends on the locale.
1325 | 
1326 | Version: 2013.1.26
1327 | 
1328 |     Another fix for Hg issue 87: Allow duplicate names of groups
1329 |     It didn't correctly handle a named group within a group of the same name.
1330 | 
1331 | Version: 2013.1.25
1332 | 
1333 |     Second attempt to fix
1334 | 
1335 | Version: 2013.1.24
1336 | 
1337 |     Hg issue 86: Enhance API of captures() to enable retrieval of ALL groups at once, as a dictionary
1338 |         Added capturesdict() method to match object.
1339 | 
1340 |     Hg issue 87: Allow duplicate names of groups
1341 |         Now allowed.
1342 | 
1343 |     Hg issue 88: regex.match() hangs
1344 |         Fixed.
1345 | 
1346 | Version: 2013.1.20
1347 | 
1348 |     Hg issue 85: Non-conformance to Unicode UAX#29 re: ZWJ / ZWNJ
1349 | 
1350 | Version: 2012.12.16
1351 | 
1352 |     Hg issue 83: slash handling in presence of a quantifier
1353 | 
1354 |     The bug was not limited to just slash!
1355 | 
1356 | Version: 2012.11.20
1357 | 
1358 |     Updated to Unicode 6.2.
1359 | 
1360 | Version: 2012.11.13
1361 | 
1362 |     Issue 16443: Add docstrings to regular expression match objects
1363 | 
1364 | Version: 2012.11.5
1365 | 
1366 |     Further performance improvements.
1367 | 
1368 | Version: 2012.10.31
1369 | 
1370 |     Performance improvements.
1371 | 
1372 | Version: 2012.10.17
1373 | 
1374 |     Added "fullmatch" method (issue #16203).
1375 | 
1376 |     Fixed bug (Hg issue #80). Now raises the correct error.
1377 | 
1378 | Version: 2012.10.8
1379 | 
1380 |     Added subf, subfn and expandf methods.
1381 | 
1382 |     Performed some refactoring.
1383 | 
1384 | Version: 2012.9.4
1385 | 
1386 |     Hg issue 78: "Captures" doesn't work for recursive calls
1387 | 
1388 | Version: 2012.8.25
1389 | 
1390 |     Added 'detach_string' method to match object.
1391 | 
1392 |     Made objects copyable.
1393 | 
1394 | Version: 2012.8.3
1395 | 
1396 |     Speed improvements.
1397 | 
1398 | Version: 2012.7.10
1399 | 
1400 |     Fixed bug in debug output in Python 2 version.
1401 | 
1402 |     Also expanded fuzzy info in debug output.
1403 | 
1404 | Version: 2012.7.9
1405 | 
1406 |     Hg issue 75: DEBUG flag
1407 | 
1408 |     Also made the debug output a little more readable by showing string literals and property names/values.
1409 | 
1410 | Version: 2012.7.8
1411 | 
1412 |     Hg issue 73: conditional patterns
1413 | 
1414 | Version: 2012.7.5
1415 | 
1416 |     Hg issue 71: non-greedy quantifier in lookbehind
1417 | 
1418 | Version: 2012.6.13
1419 | 
1420 |     Hg issue 69: Changing DEFAULT_VERSION does not actually work.
1421 | 
1422 |     DEFAULT_VERSION isn't part of the public API, but changing it should now work as expected.
1423 | 
1424 | Version: 2011.5.14
1425 | 
1426 |     Fixed bug in case-insensitive set.
1427 | 
1428 | Version: 2011.3.15
1429 | 
1430 |     Shared iterators now work in both Python 3 and Python 2.
1431 | 
1432 | 


--------------------------------------------------------------------------------
/docs/UnicodeProperties.rst:
--------------------------------------------------------------------------------
   1 | The following is a list of the 94 properties which are supported by this module:
   2 | 
   3 | Alphabetic [Alpha]
   4 |     No [F, False, N]
   5 |     Yes [T, True, Y]
   6 | 
   7 | Alphanumeric [AlNum]
   8 |     No [F, False, N]
   9 |     Yes [T, True, Y]
  10 | 
  11 | Any
  12 |     No [F, False, N]
  13 |     Yes [T, True, Y]
  14 | 
  15 | ASCII_Hex_Digit [AHex]
  16 |     No [F, False, N]
  17 |     Yes [T, True, Y]
  18 | 
  19 | Bidi_Class [bc]
  20 |     Arabic_Letter [AL]
  21 |     Arabic_Number [AN]
  22 |     Boundary_Neutral [BN]
  23 |     Common_Separator [CS]
  24 |     European_Number [EN]
  25 |     European_Separator [ES]
  26 |     European_Terminator [ET]
  27 |     First_Strong_Isolate [FSI]
  28 |     Left_To_Right [L]
  29 |     Left_To_Right_Embedding [LRE]
  30 |     Left_To_Right_Isolate [LRI]
  31 |     Left_To_Right_Override [LRO]
  32 |     Nonspacing_Mark [NSM]
  33 |     Other_Neutral [ON]
  34 |     Paragraph_Separator [B]
  35 |     Pop_Directional_Format [PDF]
  36 |     Pop_Directional_Isolate [PDI]
  37 |     Right_To_Left [R]
  38 |     Right_To_Left_Embedding [RLE]
  39 |     Right_To_Left_Isolate [RLI]
  40 |     Right_To_Left_Override [RLO]
  41 |     Segment_Separator [S]
  42 |     White_Space [WS]
  43 | 
  44 | Bidi_Control [Bidi_C]
  45 |     No [F, False, N]
  46 |     Yes [T, True, Y]
  47 | 
  48 | Bidi_Mirrored [Bidi_M]
  49 |     No [F, False, N]
  50 |     Yes [T, True, Y]
  51 | 
  52 | Blank
  53 |     No [F, False, N]
  54 |     Yes [T, True, Y]
  55 | 
  56 | Block [blk]
  57 |     Adlam
  58 |     Aegean_Numbers
  59 |     Ahom
  60 |     Alchemical_Symbols [Alchemical]
  61 |     Alphabetic_Presentation_Forms [Alphabetic_PF]
  62 |     Anatolian_Hieroglyphs
  63 |     Ancient_Greek_Musical_Notation [Ancient_Greek_Music]
  64 |     Ancient_Greek_Numbers
  65 |     Ancient_Symbols
  66 |     Arabic
  67 |     Arabic_Extended_A [Arabic_Ext_A]
  68 |     Arabic_Mathematical_Alphabetic_Symbols [Arabic_Math]
  69 |     Arabic_Presentation_Forms_A [Arabic_PF_A]
  70 |     Arabic_Presentation_Forms_B [Arabic_PF_B]
  71 |     Arabic_Supplement [Arabic_Sup]
  72 |     Armenian
  73 |     Arrows
  74 |     Avestan
  75 |     Balinese
  76 |     Bamum
  77 |     Bamum_Supplement [Bamum_Sup]
  78 |     Basic_Latin [ASCII]
  79 |     Bassa_Vah
  80 |     Batak
  81 |     Bengali
  82 |     Bhaiksuki
  83 |     Block_Elements
  84 |     Bopomofo
  85 |     Bopomofo_Extended [Bopomofo_Ext]
  86 |     Box_Drawing
  87 |     Brahmi
  88 |     Braille_Patterns [Braille]
  89 |     Buginese
  90 |     Buhid
  91 |     Byzantine_Musical_Symbols [Byzantine_Music]
  92 |     Carian
  93 |     Caucasian_Albanian
  94 |     Chakma
  95 |     Cham
  96 |     Cherokee
  97 |     Cherokee_Supplement [Cherokee_Sup]
  98 |     Chess_Symbols
  99 |     CJK_Compatibility [CJK_Compat]
 100 |     CJK_Compatibility_Forms [CJK_Compat_Forms]
 101 |     CJK_Compatibility_Ideographs [CJK_Compat_Ideographs]
 102 |     CJK_Compatibility_Ideographs_Supplement [CJK_Compat_Ideographs_Sup]
 103 |     CJK_Radicals_Supplement [CJK_Radicals_Sup]
 104 |     CJK_Strokes
 105 |     CJK_Symbols_And_Punctuation [CJK_Symbols]
 106 |     CJK_Unified_Ideographs [CJK]
 107 |     CJK_Unified_Ideographs_Extension_A [CJK_Ext_A]
 108 |     CJK_Unified_Ideographs_Extension_B [CJK_Ext_B]
 109 |     CJK_Unified_Ideographs_Extension_C [CJK_Ext_C]
 110 |     CJK_Unified_Ideographs_Extension_D [CJK_Ext_D]
 111 |     CJK_Unified_Ideographs_Extension_E [CJK_Ext_E]
 112 |     CJK_Unified_Ideographs_Extension_F [CJK_Ext_F]
 113 |     Combining_Diacritical_Marks [Diacriticals]
 114 |     Combining_Diacritical_Marks_Extended [Diacriticals_Ext]
 115 |     Combining_Diacritical_Marks_For_Symbols [Combining_Marks_For_Symbols, Diacriticals_For_Symbols]
 116 |     Combining_Diacritical_Marks_Supplement [Diacriticals_Sup]
 117 |     Combining_Half_Marks [Half_Marks]
 118 |     Common_Indic_Number_Forms [Indic_Number_Forms]
 119 |     Control_Pictures
 120 |     Coptic
 121 |     Coptic_Epact_Numbers
 122 |     Counting_Rod_Numerals [Counting_Rod]
 123 |     Cuneiform
 124 |     Cuneiform_Numbers_And_Punctuation [Cuneiform_Numbers]
 125 |     Currency_Symbols
 126 |     Cypriot_Syllabary
 127 |     Cyrillic
 128 |     Cyrillic_Extended_A [Cyrillic_Ext_A]
 129 |     Cyrillic_Extended_B [Cyrillic_Ext_B]
 130 |     Cyrillic_Extended_C [Cyrillic_Ext_C]
 131 |     Cyrillic_Supplement [Cyrillic_Sup, Cyrillic_Supplementary]
 132 |     Deseret
 133 |     Devanagari
 134 |     Devanagari_Extended [Devanagari_Ext]
 135 |     Dingbats
 136 |     Dogra
 137 |     Domino_Tiles [Domino]
 138 |     Duployan
 139 |     Early_Dynastic_Cuneiform
 140 |     Egyptian_Hieroglyphs
 141 |     Egyptian_Hieroglyph_Format_Controls
 142 |     Elbasan
 143 |     Elymaic
 144 |     Emoticons
 145 |     Enclosed_Alphanumerics [Enclosed_Alphanum]
 146 |     Enclosed_Alphanumeric_Supplement [Enclosed_Alphanum_Sup]
 147 |     Enclosed_CJK_Letters_And_Months [Enclosed_CJK]
 148 |     Enclosed_Ideographic_Supplement [Enclosed_Ideographic_Sup]
 149 |     Ethiopic
 150 |     Ethiopic_Extended [Ethiopic_Ext]
 151 |     Ethiopic_Extended_A [Ethiopic_Ext_A]
 152 |     Ethiopic_Supplement [Ethiopic_Sup]
 153 |     General_Punctuation [Punctuation]
 154 |     Geometric_Shapes
 155 |     Geometric_Shapes_Extended [Geometric_Shapes_Ext]
 156 |     Georgian
 157 |     Georgian_Extended [Georgian_Ext]
 158 |     Georgian_Supplement [Georgian_Sup]
 159 |     Glagolitic
 160 |     Glagolitic_Supplement [Glagolitic_Sup]
 161 |     Gothic
 162 |     Grantha
 163 |     Greek_And_Coptic [Greek]
 164 |     Greek_Extended [Greek_Ext]
 165 |     Gujarati
 166 |     Gunjala_Gondi
 167 |     Gurmukhi
 168 |     Halfwidth_And_Fullwidth_Forms [Half_And_Full_Forms]
 169 |     Hangul_Compatibility_Jamo [Compat_Jamo]
 170 |     Hangul_Jamo [Jamo]
 171 |     Hangul_Jamo_Extended_A [Jamo_Ext_A]
 172 |     Hangul_Jamo_Extended_B [Jamo_Ext_B]
 173 |     Hangul_Syllables [Hangul]
 174 |     Hanifi_Rohingya
 175 |     Hanunoo
 176 |     Hatran
 177 |     Hebrew
 178 |     High_Private_Use_Surrogates [High_PU_Surrogates]
 179 |     High_Surrogates
 180 |     Hiragana
 181 |     Ideographic_Description_Characters [IDC]
 182 |     Ideographic_Symbols_And_Punctuation [Ideographic_Symbols]
 183 |     Imperial_Aramaic
 184 |     Indic_Siyaq_Numbers
 185 |     Inscriptional_Pahlavi
 186 |     Inscriptional_Parthian
 187 |     IPA_Extensions [IPA_Ext]
 188 |     Javanese
 189 |     Kaithi
 190 |     Kana_Extended_A [Kana_Ext_A]
 191 |     Kana_Supplement [Kana_Sup]
 192 |     Kanbun
 193 |     Kangxi_Radicals [Kangxi]
 194 |     Kannada
 195 |     Katakana
 196 |     Katakana_Phonetic_Extensions [Katakana_Ext]
 197 |     Kayah_Li
 198 |     Kharoshthi
 199 |     Khmer
 200 |     Khmer_Symbols
 201 |     Khojki
 202 |     Khudawadi
 203 |     Lao
 204 |     Latin_1_Supplement [Latin_1, Latin_1_Sup]
 205 |     Latin_Extended_A [Latin_Ext_A]
 206 |     Latin_Extended_Additional [Latin_Ext_Additional]
 207 |     Latin_Extended_B [Latin_Ext_B]
 208 |     Latin_Extended_C [Latin_Ext_C]
 209 |     Latin_Extended_D [Latin_Ext_D]
 210 |     Latin_Extended_E [Latin_Ext_E]
 211 |     Lepcha
 212 |     Letterlike_Symbols
 213 |     Limbu
 214 |     Linear_A
 215 |     Linear_B_Ideograms
 216 |     Linear_B_Syllabary
 217 |     Lisu
 218 |     Low_Surrogates
 219 |     Lycian
 220 |     Lydian
 221 |     Mahajani
 222 |     Mahjong_Tiles [Mahjong]
 223 |     Makasar
 224 |     Malayalam
 225 |     Mandaic
 226 |     Manichaean
 227 |     Marchen
 228 |     Masaram_Gondi
 229 |     Mathematical_Alphanumeric_Symbols [Math_Alphanum]
 230 |     Mathematical_Operators [Math_Operators]
 231 |     Mayan_Numerals
 232 |     Medefaidrin
 233 |     Meetei_Mayek
 234 |     Meetei_Mayek_Extensions [Meetei_Mayek_Ext]
 235 |     Mende_Kikakui
 236 |     Meroitic_Cursive
 237 |     Meroitic_Hieroglyphs
 238 |     Miao
 239 |     Miscellaneous_Mathematical_Symbols_A [Misc_Math_Symbols_A]
 240 |     Miscellaneous_Mathematical_Symbols_B [Misc_Math_Symbols_B]
 241 |     Miscellaneous_Symbols [Misc_Symbols]
 242 |     Miscellaneous_Symbols_And_Arrows [Misc_Arrows]
 243 |     Miscellaneous_Symbols_And_Pictographs [Misc_Pictographs]
 244 |     Miscellaneous_Technical [Misc_Technical]
 245 |     Modi
 246 |     Modifier_Tone_Letters
 247 |     Mongolian
 248 |     Mongolian_Supplement [Mongolian_Sup]
 249 |     Mro
 250 |     Multani
 251 |     Musical_Symbols [Music]
 252 |     Myanmar
 253 |     Myanmar_Extended_A [Myanmar_Ext_A]
 254 |     Myanmar_Extended_B [Myanmar_Ext_B]
 255 |     Nabataean
 256 |     Nandinagari
 257 |     Newa
 258 |     New_Tai_Lue
 259 |     NKo
 260 |     No_Block [NB]
 261 |     Number_Forms
 262 |     Nushu
 263 |     Nyiakeng_Puachue_Hmong
 264 |     Ogham
 265 |     Old_Hungarian
 266 |     Old_Italic
 267 |     Old_North_Arabian
 268 |     Old_Permic
 269 |     Old_Persian
 270 |     Old_Sogdian
 271 |     Old_South_Arabian
 272 |     Old_Turkic
 273 |     Ol_Chiki
 274 |     Optical_Character_Recognition [OCR]
 275 |     Oriya
 276 |     Ornamental_Dingbats
 277 |     Osage
 278 |     Osmanya
 279 |     Ottoman_Siyaq_Numbers
 280 |     Pahawh_Hmong
 281 |     Palmyrene
 282 |     Pau_Cin_Hau
 283 |     Phags_Pa
 284 |     Phaistos_Disc [Phaistos]
 285 |     Phoenician
 286 |     Phonetic_Extensions [Phonetic_Ext]
 287 |     Phonetic_Extensions_Supplement [Phonetic_Ext_Sup]
 288 |     Playing_Cards
 289 |     Private_Use_Area [Private_Use, PUA]
 290 |     Psalter_Pahlavi
 291 |     Rejang
 292 |     Rumi_Numeral_Symbols [Rumi]
 293 |     Runic
 294 |     Samaritan
 295 |     Saurashtra
 296 |     Sharada
 297 |     Shavian
 298 |     Shorthand_Format_Controls
 299 |     Siddham
 300 |     Sinhala
 301 |     Sinhala_Archaic_Numbers
 302 |     Small_Form_Variants [Small_Forms]
 303 |     Small_Kana_Extension [Small_Kana_Ext]
 304 |     Sogdian
 305 |     Sora_Sompeng
 306 |     Soyombo
 307 |     Spacing_Modifier_Letters [Modifier_Letters]
 308 |     Specials
 309 |     Sundanese
 310 |     Sundanese_Supplement [Sundanese_Sup]
 311 |     Superscripts_And_Subscripts [Super_And_Sub]
 312 |     Supplemental_Arrows_A [Sup_Arrows_A]
 313 |     Supplemental_Arrows_B [Sup_Arrows_B]
 314 |     Supplemental_Arrows_C [Sup_Arrows_C]
 315 |     Supplemental_Mathematical_Operators [Sup_Math_Operators]
 316 |     Supplemental_Punctuation [Sup_Punctuation]
 317 |     Supplemental_Symbols_And_Pictographs [Sup_Symbols_And_Pictographs]
 318 |     Supplementary_Private_Use_Area_A [Sup_PUA_A]
 319 |     Supplementary_Private_Use_Area_B [Sup_PUA_B]
 320 |     Sutton_SignWriting
 321 |     Syloti_Nagri
 322 |     Symbols_And_Pictographs_Extended_A [Symbols_And_Pictographs_Ext_A]
 323 |     Syriac
 324 |     Syriac_Supplement [Syriac_Sup]
 325 |     Tagalog
 326 |     Tagbanwa
 327 |     Tags
 328 |     Tai_Le
 329 |     Tai_Tham
 330 |     Tai_Viet
 331 |     Tai_Xuan_Jing_Symbols [Tai_Xuan_Jing]
 332 |     Takri
 333 |     Tamil
 334 |     Tamil_Supplement [Tamil_Sup]
 335 |     Tangut
 336 |     Tangut_Components
 337 |     Telugu
 338 |     Thaana
 339 |     Thai
 340 |     Tibetan
 341 |     Tifinagh
 342 |     Tirhuta
 343 |     Transport_And_Map_Symbols [Transport_And_Map]
 344 |     Ugaritic
 345 |     Unified_Canadian_Aboriginal_Syllabics [Canadian_Syllabics, UCAS]
 346 |     Unified_Canadian_Aboriginal_Syllabics_Extended [UCAS_Ext]
 347 |     Vai
 348 |     Variation_Selectors [VS]
 349 |     Variation_Selectors_Supplement [VS_Sup]
 350 |     Vedic_Extensions [Vedic_Ext]
 351 |     Vertical_Forms
 352 |     Wancho
 353 |     Warang_Citi
 354 |     Yijing_Hexagram_Symbols [Yijing]
 355 |     Yi_Radicals
 356 |     Yi_Syllables
 357 |     Zanabazar_Square
 358 | 
 359 | Canonical_Combining_Class [ccc]
 360 |     Above [230, A]
 361 |     Above_Left [228, AL]
 362 |     Above_Right [232, AR]
 363 |     Attached_Above [214, ATA]
 364 |     Attached_Above_Right [216, ATAR]
 365 |     Attached_Below [202, ATB]
 366 |     Attached_Below_Left [200, ATBL]
 367 |     Below [220, B]
 368 |     Below_Left [218, BL]
 369 |     Below_Right [222, BR]
 370 |     CCC10 [10]
 371 |     CCC103 [103]
 372 |     CCC107 [107]
 373 |     CCC11 [11]
 374 |     CCC118 [118]
 375 |     CCC12 [12]
 376 |     CCC122 [122]
 377 |     CCC129 [129]
 378 |     CCC13 [13]
 379 |     CCC130 [130]
 380 |     CCC132 [132]
 381 |     CCC133 [133]
 382 |     CCC14 [14]
 383 |     CCC15 [15]
 384 |     CCC16 [16]
 385 |     CCC17 [17]
 386 |     CCC18 [18]
 387 |     CCC19 [19]
 388 |     CCC20 [20]
 389 |     CCC21 [21]
 390 |     CCC22 [22]
 391 |     CCC23 [23]
 392 |     CCC24 [24]
 393 |     CCC25 [25]
 394 |     CCC26 [26]
 395 |     CCC27 [27]
 396 |     CCC28 [28]
 397 |     CCC29 [29]
 398 |     CCC30 [30]
 399 |     CCC31 [31]
 400 |     CCC32 [32]
 401 |     CCC33 [33]
 402 |     CCC34 [34]
 403 |     CCC35 [35]
 404 |     CCC36 [36]
 405 |     CCC84 [84]
 406 |     CCC91 [91]
 407 |     Double_Above [234, DA]
 408 |     Double_Below [233, DB]
 409 |     Iota_Subscript [240, IS]
 410 |     Kana_Voicing [8, KV]
 411 |     Left [224, L]
 412 |     Not_Reordered [0, NR]
 413 |     Nukta [7, NK]
 414 |     Overlay [1, OV]
 415 |     Right [226, R]
 416 |     Virama [9, VR]
 417 | 
 418 | Cased
 419 |     No [F, False, N]
 420 |     Yes [T, True, Y]
 421 | 
 422 | Case_Ignorable [CI]
 423 |     No [F, False, N]
 424 |     Yes [T, True, Y]
 425 | 
 426 | Changes_When_Casefolded [CWCF]
 427 |     No [F, False, N]
 428 |     Yes [T, True, Y]
 429 | 
 430 | Changes_When_Casemapped [CWCM]
 431 |     No [F, False, N]
 432 |     Yes [T, True, Y]
 433 | 
 434 | Changes_When_Lowercased [CWL]
 435 |     No [F, False, N]
 436 |     Yes [T, True, Y]
 437 | 
 438 | Changes_When_Titlecased [CWT]
 439 |     No [F, False, N]
 440 |     Yes [T, True, Y]
 441 | 
 442 | Changes_When_Uppercased [CWU]
 443 |     No [F, False, N]
 444 |     Yes [T, True, Y]
 445 | 
 446 | Dash
 447 |     No [F, False, N]
 448 |     Yes [T, True, Y]
 449 | 
 450 | Decomposition_Type [dt]
 451 |     Canonical [Can]
 452 |     Circle [Enc]
 453 |     Compat [Com]
 454 |     Final [Fin]
 455 |     Font
 456 |     Fraction [Fra]
 457 |     Initial [Init]
 458 |     Isolated [Iso]
 459 |     Medial [Med]
 460 |     Narrow [Nar]
 461 |     Nobreak [Nb]
 462 |     None
 463 |     Small [Sml]
 464 |     Square [Sqr]
 465 |     Sub
 466 |     Super [Sup]
 467 |     Vertical [Vert]
 468 |     Wide
 469 | 
 470 | Default_Ignorable_Code_Point [DI]
 471 |     No [F, False, N]
 472 |     Yes [T, True, Y]
 473 | 
 474 | Deprecated [Dep]
 475 |     No [F, False, N]
 476 |     Yes [T, True, Y]
 477 | 
 478 | Diacritic [Dia]
 479 |     No [F, False, N]
 480 |     Yes [T, True, Y]
 481 | 
 482 | East_Asian_Width [ea]
 483 |     Ambiguous [A]
 484 |     Fullwidth [F]
 485 |     Halfwidth [H]
 486 |     Narrow [Na]
 487 |     Neutral [N]
 488 |     Wide [W]
 489 | 
 490 | Emoji
 491 |     No
 492 |     Yes
 493 | 
 494 | Emoji_Component
 495 |     No
 496 |     Yes
 497 | 
 498 | Emoji_Modifier
 499 |     No
 500 |     Yes
 501 | 
 502 | Emoji_Modifier_Base
 503 |     No
 504 |     Yes
 505 | 
 506 | Emoji_Presentation
 507 |     No
 508 |     Yes
 509 | 
 510 | Extended_Pictographic
 511 |     No
 512 |     Yes
 513 | 
 514 | Extender [Ext]
 515 |     No [F, False, N]
 516 |     Yes [T, True, Y]
 517 | 
 518 | General_Category [gc]
 519 |     Assigned
 520 |     Cased_Letter [LC]
 521 |     Close_Punctuation [Pe]
 522 |     Connector_Punctuation [Pc]
 523 |     Control [Cc, cntrl]
 524 |     Currency_Symbol [Sc]
 525 |     Dash_Punctuation [Pd]
 526 |     Decimal_Number [digit, Nd]
 527 |     Enclosing_Mark [Me]
 528 |     Final_Punctuation [Pf]
 529 |     Format [Cf]
 530 |     Initial_Punctuation [Pi]
 531 |     Letter [L, L&]
 532 |     Letter_Number [Nl]
 533 |     Line_Separator [Zl]
 534 |     Lowercase_Letter [Ll]
 535 |     Mark [Combining_Mark, M, M&]
 536 |     Math_Symbol [Sm]
 537 |     Modifier_Letter [Lm]
 538 |     Modifier_Symbol [Sk]
 539 |     Nonspacing_Mark [Mn]
 540 |     Number [N, N&]
 541 |     Open_Punctuation [Ps]
 542 |     Other [C, C&]
 543 |     Other_Letter [Lo]
 544 |     Other_Number [No]
 545 |     Other_Punctuation [Po]
 546 |     Other_Symbol [So]
 547 |     Paragraph_Separator [Zp]
 548 |     Private_Use [Co]
 549 |     Punctuation [P, P&, punct]
 550 |     Separator [Z, Z&]
 551 |     Space_Separator [Zs]
 552 |     Spacing_Mark [Mc]
 553 |     Surrogate [Cs]
 554 |     Symbol [S, S&]
 555 |     Titlecase_Letter [Lt]
 556 |     Unassigned [Cn]
 557 |     Uppercase_Letter [Lu]
 558 | 
 559 | Graph
 560 |     No [F, False, N]
 561 |     Yes [T, True, Y]
 562 | 
 563 | Grapheme_Base [Gr_Base]
 564 |     No [F, False, N]
 565 |     Yes [T, True, Y]
 566 | 
 567 | Grapheme_Cluster_Break [GCB]
 568 |     Control [CN]
 569 |     CR
 570 |     Extend [EX]
 571 |     E_Base [EB]
 572 |     E_Base_GAZ [EBG]
 573 |     E_Modifier [EM]
 574 |     Glue_After_Zwj [GAZ]
 575 |     L
 576 |     LF
 577 |     LV
 578 |     LVT
 579 |     Other [XX]
 580 |     Prepend [PP]
 581 |     Regional_Indicator [RI]
 582 |     SpacingMark [SM]
 583 |     T
 584 |     V
 585 |     ZWJ
 586 | 
 587 | Grapheme_Extend [Gr_Ext]
 588 |     No [F, False, N]
 589 |     Yes [T, True, Y]
 590 | 
 591 | Grapheme_Link [Gr_Link]
 592 |     No [F, False, N]
 593 |     Yes [T, True, Y]
 594 | 
 595 | Hangul_Syllable_Type [hst]
 596 |     Leading_Jamo [L]
 597 |     LVT_Syllable [LVT]
 598 |     LV_Syllable [LV]
 599 |     Not_Applicable [NA]
 600 |     Trailing_Jamo [T]
 601 |     Vowel_Jamo [V]
 602 | 
 603 | Hex_Digit [Hex]
 604 |     No [F, False, N]
 605 |     Yes [T, True, Y]
 606 | 
 607 | Hyphen
 608 |     No [F, False, N]
 609 |     Yes [T, True, Y]
 610 | 
 611 | Ideographic [Ideo]
 612 |     No [F, False, N]
 613 |     Yes [T, True, Y]
 614 | 
 615 | IDS_Binary_Operator [IDSB]
 616 |     No [F, False, N]
 617 |     Yes [T, True, Y]
 618 | 
 619 | IDS_Trinary_Operator [IDST]
 620 |     No [F, False, N]
 621 |     Yes [T, True, Y]
 622 | 
 623 | ID_Continue [IDC]
 624 |     No [F, False, N]
 625 |     Yes [T, True, Y]
 626 | 
 627 | ID_Start [IDS]
 628 |     No [F, False, N]
 629 |     Yes [T, True, Y]
 630 | 
 631 | Indic_Positional_Category [InPC]
 632 |     Bottom
 633 |     Bottom_And_Left
 634 |     Bottom_And_Right
 635 |     Left
 636 |     Left_And_Right
 637 |     NA
 638 |     Overstruck
 639 |     Right
 640 |     Top
 641 |     Top_And_Bottom
 642 |     Top_And_Bottom_And_Right
 643 |     Top_And_Left
 644 |     Top_And_Left_And_Right
 645 |     Top_And_Right
 646 |     Visual_Order_Left
 647 | 
 648 | Indic_Syllabic_Category [InSC]
 649 |     Avagraha
 650 |     Bindu
 651 |     Brahmi_Joining_Number
 652 |     Cantillation_Mark
 653 |     Consonant
 654 |     Consonant_Dead
 655 |     Consonant_Final
 656 |     Consonant_Head_Letter
 657 |     Consonant_Initial_Postfixed
 658 |     Consonant_Killer
 659 |     Consonant_Medial
 660 |     Consonant_Placeholder
 661 |     Consonant_Preceding_Repha
 662 |     Consonant_Prefixed
 663 |     Consonant_Subjoined
 664 |     Consonant_Succeeding_Repha
 665 |     Consonant_With_Stacker
 666 |     Gemination_Mark
 667 |     Invisible_Stacker
 668 |     Joiner
 669 |     Modifying_Letter
 670 |     Non_Joiner
 671 |     Nukta
 672 |     Number
 673 |     Number_Joiner
 674 |     Other
 675 |     Pure_Killer
 676 |     Register_Shifter
 677 |     Syllable_Modifier
 678 |     Tone_Letter
 679 |     Tone_Mark
 680 |     Virama
 681 |     Visarga
 682 |     Vowel
 683 |     Vowel_Dependent
 684 |     Vowel_Independent
 685 | 
 686 | Joining_Group [jg]
 687 |     African_Feh
 688 |     African_Noon
 689 |     African_Qaf
 690 |     Ain
 691 |     Alaph
 692 |     Alef
 693 |     Beh
 694 |     Beth
 695 |     Burushaski_Yeh_Barree
 696 |     Dal
 697 |     Dalath_Rish
 698 |     E
 699 |     Farsi_Yeh
 700 |     Fe
 701 |     Feh
 702 |     Final_Semkath
 703 |     Gaf
 704 |     Gamal
 705 |     Hah
 706 |     Hamza_On_Heh_Goal [Teh_Marbuta_Goal]
 707 |     Hanifi_Rohingya_Kinna_Ya
 708 |     Hanifi_Rohingya_Pa
 709 |     He
 710 |     Heh
 711 |     Heh_Goal
 712 |     Heth
 713 |     Kaf
 714 |     Kaph
 715 |     Khaph
 716 |     Knotted_Heh
 717 |     Lam
 718 |     Lamadh
 719 |     Malayalam_Bha
 720 |     Malayalam_Ja
 721 |     Malayalam_Lla
 722 |     Malayalam_Llla
 723 |     Malayalam_Nga
 724 |     Malayalam_Nna
 725 |     Malayalam_Nnna
 726 |     Malayalam_Nya
 727 |     Malayalam_Ra
 728 |     Malayalam_Ssa
 729 |     Malayalam_Tta
 730 |     Manichaean_Aleph
 731 |     Manichaean_Ayin
 732 |     Manichaean_Beth
 733 |     Manichaean_Daleth
 734 |     Manichaean_Dhamedh
 735 |     Manichaean_Five
 736 |     Manichaean_Gimel
 737 |     Manichaean_Heth
 738 |     Manichaean_Hundred
 739 |     Manichaean_Kaph
 740 |     Manichaean_Lamedh
 741 |     Manichaean_Mem
 742 |     Manichaean_Nun
 743 |     Manichaean_One
 744 |     Manichaean_Pe
 745 |     Manichaean_Qoph
 746 |     Manichaean_Resh
 747 |     Manichaean_Sadhe
 748 |     Manichaean_Samekh
 749 |     Manichaean_Taw
 750 |     Manichaean_Ten
 751 |     Manichaean_Teth
 752 |     Manichaean_Thamedh
 753 |     Manichaean_Twenty
 754 |     Manichaean_Waw
 755 |     Manichaean_Yodh
 756 |     Manichaean_Zayin
 757 |     Meem
 758 |     Mim
 759 |     Noon
 760 |     No_Joining_Group
 761 |     Nun
 762 |     Nya
 763 |     Pe
 764 |     Qaf
 765 |     Qaph
 766 |     Reh
 767 |     Reversed_Pe
 768 |     Rohingya_Yeh
 769 |     Sad
 770 |     Sadhe
 771 |     Seen
 772 |     Semkath
 773 |     Shin
 774 |     Straight_Waw
 775 |     Swash_Kaf
 776 |     Syriac_Waw
 777 |     Tah
 778 |     Taw
 779 |     Teh_Marbuta
 780 |     Teth
 781 |     Waw
 782 |     Yeh
 783 |     Yeh_Barree
 784 |     Yeh_With_Tail
 785 |     Yudh
 786 |     Yudh_He
 787 |     Zain
 788 |     Zhain
 789 | 
 790 | Joining_Type [jt]
 791 |     Dual_Joining [D]
 792 |     Join_Causing [C]
 793 |     Left_Joining [L]
 794 |     Non_Joining [U]
 795 |     Right_Joining [R]
 796 |     Transparent [T]
 797 | 
 798 | Join_Control [Join_C]
 799 |     No [F, False, N]
 800 |     Yes [T, True, Y]
 801 | 
 802 | Line_Break [lb]
 803 |     Alphabetic [AL]
 804 |     Ambiguous [AI]
 805 |     Break_After [BA]
 806 |     Break_Before [BB]
 807 |     Break_Both [B2]
 808 |     Break_Symbols [SY]
 809 |     Carriage_Return [CR]
 810 |     Close_Parenthesis [CP]
 811 |     Close_Punctuation [CL]
 812 |     Combining_Mark [CM]
 813 |     Complex_Context [SA]
 814 |     Conditional_Japanese_Starter [CJ]
 815 |     Contingent_Break [CB]
 816 |     Exclamation [EX]
 817 |     E_Base [EB]
 818 |     E_Modifier [EM]
 819 |     Glue [GL]
 820 |     H2
 821 |     H3
 822 |     Hebrew_Letter [HL]
 823 |     Hyphen [HY]
 824 |     Ideographic [ID]
 825 |     Infix_Numeric [IS]
 826 |     Inseparable [IN, Inseperable]
 827 |     JL
 828 |     JT
 829 |     JV
 830 |     Line_Feed [LF]
 831 |     Mandatory_Break [BK]
 832 |     Next_Line [NL]
 833 |     Nonstarter [NS]
 834 |     Numeric [NU]
 835 |     Open_Punctuation [OP]
 836 |     Postfix_Numeric [PO]
 837 |     Prefix_Numeric [PR]
 838 |     Quotation [QU]
 839 |     Regional_Indicator [RI]
 840 |     Space [SP]
 841 |     Surrogate [SG]
 842 |     Unknown [XX]
 843 |     Word_Joiner [WJ]
 844 |     ZWJ
 845 |     ZWSpace [ZW]
 846 | 
 847 | Logical_Order_Exception [LOE]
 848 |     No [F, False, N]
 849 |     Yes [T, True, Y]
 850 | 
 851 | Lowercase [Lower]
 852 |     No [F, False, N]
 853 |     Yes [T, True, Y]
 854 | 
 855 | Math
 856 |     No [F, False, N]
 857 |     Yes [T, True, Y]
 858 | 
 859 | NFC_Quick_Check [NFC_QC]
 860 |     Maybe [M]
 861 |     No [N]
 862 |     Yes [Y]
 863 | 
 864 | NFD_Quick_Check [NFD_QC]
 865 |     No [N]
 866 |     Yes [Y]
 867 | 
 868 | NFKC_Quick_Check [NFKC_QC]
 869 |     Maybe [M]
 870 |     No [N]
 871 |     Yes [Y]
 872 | 
 873 | NFKD_Quick_Check [NFKD_QC]
 874 |     No [N]
 875 |     Yes [Y]
 876 | 
 877 | Noncharacter_Code_Point [NChar]
 878 |     No [F, False, N]
 879 |     Yes [T, True, Y]
 880 | 
 881 | Numeric_Type [nt]
 882 |     Decimal [De]
 883 |     Digit [Di]
 884 |     None
 885 |     Numeric [Nu]
 886 | 
 887 | Numeric_Value [nv]
 888 |     -1/2
 889 |     0
 890 |     1
 891 |     1/10
 892 |     1/12
 893 |     1/16
 894 |     1/160
 895 |     1/2
 896 |     1/20
 897 |     1/3
 898 |     1/32
 899 |     1/320
 900 |     1/4
 901 |     1/40
 902 |     1/5
 903 |     1/6
 904 |     1/64
 905 |     1/7
 906 |     1/8
 907 |     1/80
 908 |     1/9
 909 |     10
 910 |     100
 911 |     1000
 912 |     10000
 913 |     100000
 914 |     1000000
 915 |     10000000
 916 |     100000000
 917 |     10000000000
 918 |     1000000000000
 919 |     11
 920 |     11/12
 921 |     11/2
 922 |     12
 923 |     13
 924 |     13/2
 925 |     14
 926 |     15
 927 |     15/2
 928 |     16
 929 |     17
 930 |     17/2
 931 |     18
 932 |     19
 933 |     2
 934 |     2/3
 935 |     2/5
 936 |     20
 937 |     200
 938 |     2000
 939 |     20000
 940 |     200000
 941 |     20000000
 942 |     21
 943 |     216000
 944 |     22
 945 |     23
 946 |     24
 947 |     25
 948 |     26
 949 |     27
 950 |     28
 951 |     29
 952 |     3
 953 |     3/16
 954 |     3/2
 955 |     3/20
 956 |     3/4
 957 |     3/5
 958 |     3/64
 959 |     3/8
 960 |     3/80
 961 |     30
 962 |     300
 963 |     3000
 964 |     30000
 965 |     300000
 966 |     31
 967 |     32
 968 |     33
 969 |     34
 970 |     35
 971 |     36
 972 |     37
 973 |     38
 974 |     39
 975 |     4
 976 |     4/5
 977 |     40
 978 |     400
 979 |     4000
 980 |     40000
 981 |     400000
 982 |     41
 983 |     42
 984 |     43
 985 |     432000
 986 |     44
 987 |     45
 988 |     46
 989 |     47
 990 |     48
 991 |     49
 992 |     5
 993 |     5/12
 994 |     5/2
 995 |     5/6
 996 |     5/8
 997 |     50
 998 |     500
 999 |     5000
1000 |     50000
1001 |     500000
1002 |     6
1003 |     60
1004 |     600
1005 |     6000
1006 |     60000
1007 |     600000
1008 |     7
1009 |     7/12
1010 |     7/2
1011 |     7/8
1012 |     70
1013 |     700
1014 |     7000
1015 |     70000
1016 |     700000
1017 |     8
1018 |     80
1019 |     800
1020 |     8000
1021 |     80000
1022 |     800000
1023 |     9
1024 |     9/2
1025 |     90
1026 |     900
1027 |     9000
1028 |     90000
1029 |     900000
1030 |     NaN
1031 | 
1032 | Other_Alphabetic [OAlpha]
1033 |     No [F, False, N]
1034 |     Yes [T, True, Y]
1035 | 
1036 | Other_Default_Ignorable_Code_Point [ODI]
1037 |     No [F, False, N]
1038 |     Yes [T, True, Y]
1039 | 
1040 | Other_Grapheme_Extend [OGr_Ext]
1041 |     No [F, False, N]
1042 |     Yes [T, True, Y]
1043 | 
1044 | Other_ID_Continue [OIDC]
1045 |     No [F, False, N]
1046 |     Yes [T, True, Y]
1047 | 
1048 | Other_ID_Start [OIDS]
1049 |     No [F, False, N]
1050 |     Yes [T, True, Y]
1051 | 
1052 | Other_Lowercase [OLower]
1053 |     No [F, False, N]
1054 |     Yes [T, True, Y]
1055 | 
1056 | Other_Math [OMath]
1057 |     No [F, False, N]
1058 |     Yes [T, True, Y]
1059 | 
1060 | Other_Uppercase [OUpper]
1061 |     No [F, False, N]
1062 |     Yes [T, True, Y]
1063 | 
1064 | Pattern_Syntax [Pat_Syn]
1065 |     No [F, False, N]
1066 |     Yes [T, True, Y]
1067 | 
1068 | Pattern_White_Space [Pat_WS]
1069 |     No [F, False, N]
1070 |     Yes [T, True, Y]
1071 | 
1072 | Posix_AlNum
1073 |     No [F, False, N]
1074 |     Yes [T, True, Y]
1075 | 
1076 | Posix_Digit
1077 |     No [F, False, N]
1078 |     Yes [T, True, Y]
1079 | 
1080 | Posix_Punct
1081 |     No [F, False, N]
1082 |     Yes [T, True, Y]
1083 | 
1084 | Posix_XDigit
1085 |     No [F, False, N]
1086 |     Yes [T, True, Y]
1087 | 
1088 | Prepended_Concatenation_Mark [PCM]
1089 |     No [F, False, N]
1090 |     Yes [T, True, Y]
1091 | 
1092 | Print
1093 |     No [F, False, N]
1094 |     Yes [T, True, Y]
1095 | 
1096 | Quotation_Mark [QMark]
1097 |     No [F, False, N]
1098 |     Yes [T, True, Y]
1099 | 
1100 | Radical
1101 |     No [F, False, N]
1102 |     Yes [T, True, Y]
1103 | 
1104 | Regional_Indicator [RI]
1105 |     No [F, False, N]
1106 |     Yes [T, True, Y]
1107 | 
1108 | Script [sc]
1109 |     Adlam [Adlm]
1110 |     Ahom
1111 |     Anatolian_Hieroglyphs [Hluw]
1112 |     Arabic [Arab]
1113 |     Armenian [Armn]
1114 |     Avestan [Avst]
1115 |     Balinese [Bali]
1116 |     Bamum [Bamu]
1117 |     Bassa_Vah [Bass]
1118 |     Batak [Batk]
1119 |     Bengali [Beng]
1120 |     Bhaiksuki [Bhks]
1121 |     Bopomofo [Bopo]
1122 |     Brahmi [Brah]
1123 |     Braille [Brai]
1124 |     Buginese [Bugi]
1125 |     Buhid [Buhd]
1126 |     Canadian_Aboriginal [Cans]
1127 |     Carian [Cari]
1128 |     Caucasian_Albanian [Aghb]
1129 |     Chakma [Cakm]
1130 |     Cham
1131 |     Cherokee [Cher]
1132 |     Common [Zyyy]
1133 |     Coptic [Copt, Qaac]
1134 |     Cuneiform [Xsux]
1135 |     Cypriot [Cprt]
1136 |     Cyrillic [Cyrl]
1137 |     Deseret [Dsrt]
1138 |     Devanagari [Deva]
1139 |     Dogra [Dogr]
1140 |     Duployan [Dupl]
1141 |     Egyptian_Hieroglyphs [Egyp]
1142 |     Elbasan [Elba]
1143 |     Elymaic [Elym]
1144 |     Ethiopic [Ethi]
1145 |     Georgian [Geor]
1146 |     Glagolitic [Glag]
1147 |     Gothic [Goth]
1148 |     Grantha [Gran]
1149 |     Greek [Grek]
1150 |     Gujarati [Gujr]
1151 |     Gunjala_Gondi [Gong]
1152 |     Gurmukhi [Guru]
1153 |     Han [Hani]
1154 |     Hangul [Hang]
1155 |     Hanifi_Rohingya [Rohg]
1156 |     Hanunoo [Hano]
1157 |     Hatran [Hatr]
1158 |     Hebrew [Hebr]
1159 |     Hiragana [Hira]
1160 |     Imperial_Aramaic [Armi]
1161 |     Inherited [Qaai, Zinh]
1162 |     Inscriptional_Pahlavi [Phli]
1163 |     Inscriptional_Parthian [Prti]
1164 |     Javanese [Java]
1165 |     Kaithi [Kthi]
1166 |     Kannada [Knda]
1167 |     Katakana [Kana]
1168 |     Katakana_Or_Hiragana [Hrkt]
1169 |     Kayah_Li [Kali]
1170 |     Kharoshthi [Khar]
1171 |     Khmer [Khmr]
1172 |     Khojki [Khoj]
1173 |     Khudawadi [Sind]
1174 |     Lao [Laoo]
1175 |     Latin [Latn]
1176 |     Lepcha [Lepc]
1177 |     Limbu [Limb]
1178 |     Linear_A [Lina]
1179 |     Linear_B [Linb]
1180 |     Lisu
1181 |     Lycian [Lyci]
1182 |     Lydian [Lydi]
1183 |     Mahajani [Mahj]
1184 |     Makasar [Maka]
1185 |     Malayalam [Mlym]
1186 |     Mandaic [Mand]
1187 |     Manichaean [Mani]
1188 |     Marchen [Marc]
1189 |     Masaram_Gondi [Gonm]
1190 |     Medefaidrin [Medf]
1191 |     Meetei_Mayek [Mtei]
1192 |     Mende_Kikakui [Mend]
1193 |     Meroitic_Cursive [Merc]
1194 |     Meroitic_Hieroglyphs [Mero]
1195 |     Miao [Plrd]
1196 |     Modi
1197 |     Mongolian [Mong]
1198 |     Mro [Mroo]
1199 |     Multani [Mult]
1200 |     Myanmar [Mymr]
1201 |     Nabataean [Nbat]
1202 |     Nandinagari [Nand]
1203 |     Newa
1204 |     New_Tai_Lue [Talu]
1205 |     Nko [Nkoo]
1206 |     Nushu [Nshu]
1207 |     Nyiakeng_Puachue_Hmong [Hmnp]
1208 |     Ogham [Ogam]
1209 |     Old_Hungarian [Hung]
1210 |     Old_Italic [Ital]
1211 |     Old_North_Arabian [Narb]
1212 |     Old_Permic [Perm]
1213 |     Old_Persian [Xpeo]
1214 |     Old_Sogdian [Sogo]
1215 |     Old_South_Arabian [Sarb]
1216 |     Old_Turkic [Orkh]
1217 |     Ol_Chiki [Olck]
1218 |     Oriya [Orya]
1219 |     Osage [Osge]
1220 |     Osmanya [Osma]
1221 |     Pahawh_Hmong [Hmng]
1222 |     Palmyrene [Palm]
1223 |     Pau_Cin_Hau [Pauc]
1224 |     Phags_Pa [Phag]
1225 |     Phoenician [Phnx]
1226 |     Psalter_Pahlavi [Phlp]
1227 |     Rejang [Rjng]
1228 |     Runic [Runr]
1229 |     Samaritan [Samr]
1230 |     Saurashtra [Saur]
1231 |     Sharada [Shrd]
1232 |     Shavian [Shaw]
1233 |     Siddham [Sidd]
1234 |     SignWriting [Sgnw]
1235 |     Sinhala [Sinh]
1236 |     Sogdian [Sogd]
1237 |     Sora_Sompeng [Sora]
1238 |     Soyombo [Soyo]
1239 |     Sundanese [Sund]
1240 |     Syloti_Nagri [Sylo]
1241 |     Syriac [Syrc]
1242 |     Tagalog [Tglg]
1243 |     Tagbanwa [Tagb]
1244 |     Tai_Le [Tale]
1245 |     Tai_Tham [Lana]
1246 |     Tai_Viet [Tavt]
1247 |     Takri [Takr]
1248 |     Tamil [Taml]
1249 |     Tangut [Tang]
1250 |     Telugu [Telu]
1251 |     Thaana [Thaa]
1252 |     Thai
1253 |     Tibetan [Tibt]
1254 |     Tifinagh [Tfng]
1255 |     Tirhuta [Tirh]
1256 |     Ugaritic [Ugar]
1257 |     Unknown [Zzzz]
1258 |     Vai [Vaii]
1259 |     Wancho [Wcho]
1260 |     Warang_Citi [Wara]
1261 |     Yi [Yiii]
1262 |     Zanabazar_Square [Zanb]
1263 | 
1264 | Script_Extensions [scx]
1265 |     Adlam [Adlm]
1266 |     Adlm Arab Mand Mani Phlp Rohg Sogd Syrc
1267 |     Ahom
1268 |     Anatolian_Hieroglyphs [Hluw]
1269 |     Arab Copt
1270 |     Arab Rohg
1271 |     Arab Rohg Syrc Thaa
1272 |     Arab Syrc
1273 |     Arab Syrc Thaa
1274 |     Arab Thaa
1275 |     Arabic [Arab]
1276 |     Armenian [Armn]
1277 |     Armn Geor
1278 |     Avestan [Avst]
1279 |     Balinese [Bali]
1280 |     Bamum [Bamu]
1281 |     Bassa_Vah [Bass]
1282 |     Batak [Batk]
1283 |     Beng Cakm Sylo
1284 |     Beng Deva
1285 |     Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
1286 |     Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
1287 |     Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
1288 |     Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
1289 |     Beng Deva Gran Knda
1290 |     Beng Deva Gran Knda Nand Orya Telu Tirh
1291 |     Bengali [Beng]
1292 |     Bhaiksuki [Bhks]
1293 |     Bopo Hang Hani Hira Kana
1294 |     Bopo Hang Hani Hira Kana Yiii
1295 |     Bopo Hani
1296 |     Bopomofo [Bopo]
1297 |     Brahmi [Brah]
1298 |     Braille [Brai]
1299 |     Bugi Java
1300 |     Buginese [Bugi]
1301 |     Buhd Hano Tagb Tglg
1302 |     Buhid [Buhd]
1303 |     Cakm Mymr Tale
1304 |     Canadian_Aboriginal [Cans]
1305 |     Carian [Cari]
1306 |     Caucasian_Albanian [Aghb]
1307 |     Chakma [Cakm]
1308 |     Cham
1309 |     Cherokee [Cher]
1310 |     Common [Zyyy]
1311 |     Coptic [Copt, Qaac]
1312 |     Cprt Lina Linb
1313 |     Cprt Linb
1314 |     Cuneiform [Xsux]
1315 |     Cypriot [Cprt]
1316 |     Cyrillic [Cyrl]
1317 |     Cyrl Glag
1318 |     Cyrl Latn
1319 |     Cyrl Perm
1320 |     Deseret [Dsrt]
1321 |     Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
1322 |     Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
1323 |     Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
1324 |     Deva Dogr Kthi Mahj
1325 |     Deva Gran
1326 |     Deva Gran Knda
1327 |     Deva Gran Latn
1328 |     Deva Knda Mlym Orya Taml Telu
1329 |     Deva Nand
1330 |     Deva Shrd
1331 |     Deva Taml
1332 |     Devanagari [Deva]
1333 |     Dogra [Dogr]
1334 |     Duployan [Dupl]
1335 |     Egyptian_Hieroglyphs [Egyp]
1336 |     Elbasan [Elba]
1337 |     Elymaic [Elym]
1338 |     Ethiopic [Ethi]
1339 |     Geor Latn
1340 |     Georgian [Geor]
1341 |     Glagolitic [Glag]
1342 |     Gothic [Goth]
1343 |     Gran Taml
1344 |     Grantha [Gran]
1345 |     Greek [Grek]
1346 |     Gujarati [Gujr]
1347 |     Gujr Khoj
1348 |     Gunjala_Gondi [Gong]
1349 |     Gurmukhi [Guru]
1350 |     Guru Mult
1351 |     Han [Hani]
1352 |     Hangul [Hang]
1353 |     Hani Hira Kana
1354 |     Hanifi_Rohingya [Rohg]
1355 |     Hanunoo [Hano]
1356 |     Hatran [Hatr]
1357 |     Hebrew [Hebr]
1358 |     Hira Kana
1359 |     Hiragana [Hira]
1360 |     Imperial_Aramaic [Armi]
1361 |     Inherited [Qaai, Zinh]
1362 |     Inscriptional_Pahlavi [Phli]
1363 |     Inscriptional_Parthian [Prti]
1364 |     Javanese [Java]
1365 |     Kaithi [Kthi]
1366 |     Kali Latn Mymr
1367 |     Kannada [Knda]
1368 |     Katakana [Kana]
1369 |     Kayah_Li [Kali]
1370 |     Kharoshthi [Khar]
1371 |     Khmer [Khmr]
1372 |     Khojki [Khoj]
1373 |     Khudawadi [Sind]
1374 |     Knda Nand
1375 |     Lao [Laoo]
1376 |     Latin [Latn]
1377 |     Latn Mong
1378 |     Lepcha [Lepc]
1379 |     Limbu [Limb]
1380 |     Linear_A [Lina]
1381 |     Linear_B [Linb]
1382 |     Lisu
1383 |     Lycian [Lyci]
1384 |     Lydian [Lydi]
1385 |     Mahajani [Mahj]
1386 |     Makasar [Maka]
1387 |     Malayalam [Mlym]
1388 |     Mandaic [Mand]
1389 |     Manichaean [Mani]
1390 |     Marchen [Marc]
1391 |     Masaram_Gondi [Gonm]
1392 |     Medefaidrin [Medf]
1393 |     Meetei_Mayek [Mtei]
1394 |     Mende_Kikakui [Mend]
1395 |     Meroitic_Cursive [Merc]
1396 |     Meroitic_Hieroglyphs [Mero]
1397 |     Miao [Plrd]
1398 |     Modi
1399 |     Mong Phag
1400 |     Mongolian [Mong]
1401 |     Mro [Mroo]
1402 |     Multani [Mult]
1403 |     Myanmar [Mymr]
1404 |     Nabataean [Nbat]
1405 |     Nandinagari [Nand]
1406 |     Newa
1407 |     New_Tai_Lue [Talu]
1408 |     Nko [Nkoo]
1409 |     Nushu [Nshu]
1410 |     Nyiakeng_Puachue_Hmong [Hmnp]
1411 |     Ogham [Ogam]
1412 |     Old_Hungarian [Hung]
1413 |     Old_Italic [Ital]
1414 |     Old_North_Arabian [Narb]
1415 |     Old_Permic [Perm]
1416 |     Old_Persian [Xpeo]
1417 |     Old_Sogdian [Sogo]
1418 |     Old_South_Arabian [Sarb]
1419 |     Old_Turkic [Orkh]
1420 |     Ol_Chiki [Olck]
1421 |     Oriya [Orya]
1422 |     Osage [Osge]
1423 |     Osmanya [Osma]
1424 |     Pahawh_Hmong [Hmng]
1425 |     Palmyrene [Palm]
1426 |     Pau_Cin_Hau [Pauc]
1427 |     Phags_Pa [Phag]
1428 |     Phoenician [Phnx]
1429 |     Psalter_Pahlavi [Phlp]
1430 |     Rejang [Rjng]
1431 |     Runic [Runr]
1432 |     Samaritan [Samr]
1433 |     Saurashtra [Saur]
1434 |     Sharada [Shrd]
1435 |     Shavian [Shaw]
1436 |     Siddham [Sidd]
1437 |     SignWriting [Sgnw]
1438 |     Sinhala [Sinh]
1439 |     Sogdian [Sogd]
1440 |     Sora_Sompeng [Sora]
1441 |     Soyombo [Soyo]
1442 |     Sundanese [Sund]
1443 |     Syloti_Nagri [Sylo]
1444 |     Syriac [Syrc]
1445 |     Tagalog [Tglg]
1446 |     Tagbanwa [Tagb]
1447 |     Tai_Le [Tale]
1448 |     Tai_Tham [Lana]
1449 |     Tai_Viet [Tavt]
1450 |     Takri [Takr]
1451 |     Tamil [Taml]
1452 |     Tangut [Tang]
1453 |     Telugu [Telu]
1454 |     Thaana [Thaa]
1455 |     Thai
1456 |     Tibetan [Tibt]
1457 |     Tifinagh [Tfng]
1458 |     Tirhuta [Tirh]
1459 |     Ugaritic [Ugar]
1460 |     Unknown [Zzzz]
1461 |     Vai [Vaii]
1462 |     Wancho [Wcho]
1463 |     Warang_Citi [Wara]
1464 |     Yi [Yiii]
1465 |     Zanabazar_Square [Zanb]
1466 | 
1467 | Sentence_Break [SB]
1468 |     ATerm [AT]
1469 |     Close [CL]
1470 |     CR
1471 |     Extend [EX]
1472 |     Format [FO]
1473 |     LF
1474 |     Lower [LO]
1475 |     Numeric [NU]
1476 |     OLetter [LE]
1477 |     Other [XX]
1478 |     SContinue [SC]
1479 |     Sep [SE]
1480 |     Sp
1481 |     STerm [ST]
1482 |     Upper [UP]
1483 | 
1484 | Sentence_Terminal [STerm]
1485 |     No [F, False, N]
1486 |     Yes [T, True, Y]
1487 | 
1488 | Soft_Dotted [SD]
1489 |     No [F, False, N]
1490 |     Yes [T, True, Y]
1491 | 
1492 | Terminal_Punctuation [Term]
1493 |     No [F, False, N]
1494 |     Yes [T, True, Y]
1495 | 
1496 | Unified_Ideograph [UIdeo]
1497 |     No [F, False, N]
1498 |     Yes [T, True, Y]
1499 | 
1500 | Uppercase [Upper]
1501 |     No [F, False, N]
1502 |     Yes [T, True, Y]
1503 | 
1504 | Variation_Selector [VS]
1505 |     No [F, False, N]
1506 |     Yes [T, True, Y]
1507 | 
1508 | White_Space [space, WSpace]
1509 |     No [F, False, N]
1510 |     Yes [T, True, Y]
1511 | 
1512 | Word
1513 |     No [F, False, N]
1514 |     Yes [T, True, Y]
1515 | 
1516 | Word_Break [WB]
1517 |     ALetter [LE]
1518 |     CR
1519 |     Double_Quote [DQ]
1520 |     Extend
1521 |     ExtendNumLet [EX]
1522 |     E_Base [EB]
1523 |     E_Base_GAZ [EBG]
1524 |     E_Modifier [EM]
1525 |     Format [FO]
1526 |     Glue_After_Zwj [GAZ]
1527 |     Hebrew_Letter [HL]
1528 |     Katakana [KA]
1529 |     LF
1530 |     MidLetter [ML]
1531 |     MidNum [MN]
1532 |     MidNumLet [MB]
1533 |     Newline [NL]
1534 |     Numeric [NU]
1535 |     Other [XX]
1536 |     Regional_Indicator [RI]
1537 |     Single_Quote [SQ]
1538 |     WSegSpace
1539 |     ZWJ
1540 | 
1541 | XDigit
1542 |     No [F, False, N]
1543 |     Yes [T, True, Y]
1544 | 
1545 | XID_Continue [XIDC]
1546 |     No [F, False, N]
1547 |     Yes [T, True, Y]
1548 | 
1549 | XID_Start [XIDS]
1550 |     No [F, False, N]
1551 |     Yes [T, True, Y]
1552 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools >= 77.0.3"]  # the SPDX string form of `license` below (PEP 639) is rejected by older setuptools
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "regex"
 7 | version = "2025.5.18"
 8 | description = "Alternative regular expression module, to replace re."
 9 | readme = "README.rst"
10 | authors = [
11 |     {name = "Matthew Barnett", email = "regex@mrabarnett.plus.com"},
12 | ]
13 | license = "Apache-2.0"  # SPDX license expression given as a plain string (not a {text=...}/{file=...} table)
14 | 
15 | classifiers = [
16 |     "Development Status :: 5 - Production/Stable",
17 |     "Intended Audience :: Developers",
18 |     "Operating System :: OS Independent",
19 |     "Programming Language :: Python :: 3.9",
20 |     "Programming Language :: Python :: 3.10",
21 |     "Programming Language :: Python :: 3.11",
22 |     "Programming Language :: Python :: 3.12",
23 |     "Programming Language :: Python :: 3.13",
24 |     "Topic :: Scientific/Engineering :: Information Analysis",
25 |     "Topic :: Software Development :: Libraries :: Python Modules",
26 |     "Topic :: Text Processing",
27 |     "Topic :: Text Processing :: General",
28 | ]
29 | 
30 | requires-python = ">= 3.9"  # same floor as the lowest "Python :: 3.x" classifier listed above
31 | 
32 | [project.urls]
33 | Homepage = "https://github.com/mrabarnett/mrab-regex"
34 | 
35 | [tool.setuptools]
36 | package-dir = {regex = "regex_3"}  # the importable "regex" package is built from the regex_3/ source directory
37 | py-modules = [  # Python modules are listed one by one; the test module is installed too
38 |     "regex.__init__",
39 |     "regex.regex",
40 |     "regex._regex_core",
41 |     "regex.test_regex",
42 | ]
43 | 


--------------------------------------------------------------------------------
/regex_3/__init__.py:
--------------------------------------------------------------------------------
1 | from .regex import *  # pull the submodule's star-exported names into the package namespace
2 | from . import regex  # bind the submodule itself so its __all__ can be read on the next line
3 | __all__ = regex.__all__  # the package advertises exactly the public names of the regex submodule
4 | 


--------------------------------------------------------------------------------
/regex_3/_regex.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Secret Labs' Regular Expression Engine
  3 |  *
  4 |  * regular expression matching engine
  5 |  *
  6 |  * Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
  7 |  *
  8 |  * NOTE: This file is generated by regex.py.  If you need
  9 |  * to change anything in here, edit regex.py and run it.
 10 |  *
 11 |  * 2010-01-16 mrab Re-written
 12 |  */
 13 | 
 14 | /* Supports Unicode version 12.1.0. */
 15 | 
 16 | #define RE_MAGIC 20100116  /* NOTE(review): same digits as the 2010-01-16 rewrite date in the header; presumably a version stamp -- confirm */
 17 | 
 18 | #include "_regex_unicode.h"
 19 | 
 20 | /* Operators: opcode numbers, consecutive from 0 to 97. The re_op_text[] table below gives their names; it appears to follow the same order. */
 21 | #define RE_OP_FAILURE 0
 22 | #define RE_OP_SUCCESS 1
 23 | #define RE_OP_ANY 2
 24 | #define RE_OP_ANY_ALL 3
 25 | #define RE_OP_ANY_ALL_REV 4
 26 | #define RE_OP_ANY_REV 5
 27 | #define RE_OP_ANY_U 6
 28 | #define RE_OP_ANY_U_REV 7
 29 | #define RE_OP_ATOMIC 8
 30 | #define RE_OP_BOUNDARY 9
 31 | #define RE_OP_BRANCH 10
 32 | #define RE_OP_CALL_REF 11
 33 | #define RE_OP_CHARACTER 12
 34 | #define RE_OP_CHARACTER_IGN 13
 35 | #define RE_OP_CHARACTER_IGN_REV 14
 36 | #define RE_OP_CHARACTER_REV 15
 37 | #define RE_OP_CONDITIONAL 16
 38 | #define RE_OP_DEFAULT_BOUNDARY 17
 39 | #define RE_OP_DEFAULT_END_OF_WORD 18
 40 | #define RE_OP_DEFAULT_START_OF_WORD 19
 41 | #define RE_OP_END 20
 42 | #define RE_OP_END_OF_LINE 21
 43 | #define RE_OP_END_OF_LINE_U 22
 44 | #define RE_OP_END_OF_STRING 23
 45 | #define RE_OP_END_OF_STRING_LINE 24
 46 | #define RE_OP_END_OF_STRING_LINE_U 25
 47 | #define RE_OP_END_OF_WORD 26
 48 | #define RE_OP_FUZZY 27
 49 | #define RE_OP_GRAPHEME_BOUNDARY 28
 50 | #define RE_OP_GREEDY_REPEAT 29
 51 | #define RE_OP_GROUP 30
 52 | #define RE_OP_GROUP_CALL 31
 53 | #define RE_OP_GROUP_EXISTS 32
 54 | #define RE_OP_KEEP 33
 55 | #define RE_OP_LAZY_REPEAT 34
 56 | #define RE_OP_LOOKAROUND 35
 57 | #define RE_OP_NEXT 36
 58 | #define RE_OP_PROPERTY 37
 59 | #define RE_OP_PROPERTY_IGN 38
 60 | #define RE_OP_PROPERTY_IGN_REV 39
 61 | #define RE_OP_PROPERTY_REV 40
 62 | #define RE_OP_PRUNE 41
 63 | #define RE_OP_RANGE 42
 64 | #define RE_OP_RANGE_IGN 43
 65 | #define RE_OP_RANGE_IGN_REV 44
 66 | #define RE_OP_RANGE_REV 45
 67 | #define RE_OP_REF_GROUP 46
 68 | #define RE_OP_REF_GROUP_FLD 47
 69 | #define RE_OP_REF_GROUP_FLD_REV 48
 70 | #define RE_OP_REF_GROUP_IGN 49
 71 | #define RE_OP_REF_GROUP_IGN_REV 50
 72 | #define RE_OP_REF_GROUP_REV 51
 73 | #define RE_OP_SEARCH_ANCHOR 52
 74 | #define RE_OP_SET_DIFF 53
 75 | #define RE_OP_SET_DIFF_IGN 54
 76 | #define RE_OP_SET_DIFF_IGN_REV 55
 77 | #define RE_OP_SET_DIFF_REV 56
 78 | #define RE_OP_SET_INTER 57
 79 | #define RE_OP_SET_INTER_IGN 58
 80 | #define RE_OP_SET_INTER_IGN_REV 59
 81 | #define RE_OP_SET_INTER_REV 60
 82 | #define RE_OP_SET_SYM_DIFF 61
 83 | #define RE_OP_SET_SYM_DIFF_IGN 62
 84 | #define RE_OP_SET_SYM_DIFF_IGN_REV 63
 85 | #define RE_OP_SET_SYM_DIFF_REV 64
 86 | #define RE_OP_SET_UNION 65
 87 | #define RE_OP_SET_UNION_IGN 66
 88 | #define RE_OP_SET_UNION_IGN_REV 67
 89 | #define RE_OP_SET_UNION_REV 68
 90 | #define RE_OP_SKIP 69
 91 | #define RE_OP_START_OF_LINE 70
 92 | #define RE_OP_START_OF_LINE_U 71
 93 | #define RE_OP_START_OF_STRING 72
 94 | #define RE_OP_START_OF_WORD 73
 95 | #define RE_OP_STRING 74
 96 | #define RE_OP_STRING_FLD 75
 97 | #define RE_OP_STRING_FLD_REV 76
 98 | #define RE_OP_STRING_IGN 77
 99 | #define RE_OP_STRING_IGN_REV 78
100 | #define RE_OP_STRING_REV 79
101 | #define RE_OP_FUZZY_EXT 80
102 | #define RE_OP_BODY_END 81
103 | #define RE_OP_BODY_START 82
104 | #define RE_OP_END_ATOMIC 83
105 | #define RE_OP_END_CONDITIONAL 84
106 | #define RE_OP_END_FUZZY 85
107 | #define RE_OP_END_GREEDY_REPEAT 86
108 | #define RE_OP_END_GROUP 87
109 | #define RE_OP_END_LAZY_REPEAT 88
110 | #define RE_OP_END_LOOKAROUND 89
111 | #define RE_OP_FUZZY_INSERT 90
112 | #define RE_OP_GREEDY_REPEAT_ONE 91
113 | #define RE_OP_GROUP_RETURN 92
114 | #define RE_OP_LAZY_REPEAT_ONE 93
115 | #define RE_OP_MATCH_BODY 94
116 | #define RE_OP_MATCH_TAIL 95
117 | #define RE_OP_START_GROUP 96
118 | #define RE_OP_TAIL_START 97
119 | /* Opcode names: re_op_text[n] is the name of the opcode whose value is n. */
120 | char* re_op_text[] = {
121 |     "RE_OP_FAILURE",
122 |     "RE_OP_SUCCESS",
123 |     "RE_OP_ANY",
124 |     "RE_OP_ANY_ALL",
125 |     "RE_OP_ANY_ALL_REV",
126 |     "RE_OP_ANY_REV",
127 |     "RE_OP_ANY_U",
128 |     "RE_OP_ANY_U_REV",
129 |     "RE_OP_ATOMIC",
130 |     "RE_OP_BOUNDARY",
131 |     "RE_OP_BRANCH",
132 |     "RE_OP_CALL_REF",
133 |     "RE_OP_CHARACTER",
134 |     "RE_OP_CHARACTER_IGN",
135 |     "RE_OP_CHARACTER_IGN_REV",
136 |     "RE_OP_CHARACTER_REV",
137 |     "RE_OP_CONDITIONAL",
138 |     "RE_OP_DEFAULT_BOUNDARY",
139 |     "RE_OP_DEFAULT_END_OF_WORD",
140 |     "RE_OP_DEFAULT_START_OF_WORD",
141 |     "RE_OP_END",
142 |     "RE_OP_END_OF_LINE",
143 |     "RE_OP_END_OF_LINE_U",
144 |     "RE_OP_END_OF_STRING",
145 |     "RE_OP_END_OF_STRING_LINE",
146 |     "RE_OP_END_OF_STRING_LINE_U",
147 |     "RE_OP_END_OF_WORD",
148 |     "RE_OP_FUZZY",
149 |     "RE_OP_GRAPHEME_BOUNDARY",
150 |     "RE_OP_GREEDY_REPEAT",
151 |     "RE_OP_GROUP",
152 |     "RE_OP_GROUP_CALL",
153 |     "RE_OP_GROUP_EXISTS",
154 |     "RE_OP_KEEP",
155 |     "RE_OP_LAZY_REPEAT",
156 |     "RE_OP_LOOKAROUND",
157 |     "RE_OP_NEXT",
158 |     "RE_OP_PROPERTY",
159 |     "RE_OP_PROPERTY_IGN",
160 |     "RE_OP_PROPERTY_IGN_REV",
161 |     "RE_OP_PROPERTY_REV",
162 |     "RE_OP_PRUNE",
163 |     "RE_OP_RANGE",
164 |     "RE_OP_RANGE_IGN",
165 |     "RE_OP_RANGE_IGN_REV",
166 |     "RE_OP_RANGE_REV",
167 |     "RE_OP_REF_GROUP",
168 |     "RE_OP_REF_GROUP_FLD",
169 |     "RE_OP_REF_GROUP_FLD_REV",
170 |     "RE_OP_REF_GROUP_IGN",
171 |     "RE_OP_REF_GROUP_IGN_REV",
172 |     "RE_OP_REF_GROUP_REV",
173 |     "RE_OP_SEARCH_ANCHOR",
174 |     "RE_OP_SET_DIFF",
175 |     "RE_OP_SET_DIFF_IGN",
176 |     "RE_OP_SET_DIFF_IGN_REV",
177 |     "RE_OP_SET_DIFF_REV",
178 |     "RE_OP_SET_INTER",
179 |     "RE_OP_SET_INTER_IGN",
180 |     "RE_OP_SET_INTER_IGN_REV",
181 |     "RE_OP_SET_INTER_REV",
182 |     "RE_OP_SET_SYM_DIFF",
183 |     "RE_OP_SET_SYM_DIFF_IGN",
184 |     "RE_OP_SET_SYM_DIFF_IGN_REV",
185 |     "RE_OP_SET_SYM_DIFF_REV",
186 |     "RE_OP_SET_UNION",
187 |     "RE_OP_SET_UNION_IGN",
188 |     "RE_OP_SET_UNION_IGN_REV",
189 |     "RE_OP_SET_UNION_REV",
190 |     "RE_OP_SKIP",
191 |     "RE_OP_START_OF_LINE",
192 |     "RE_OP_START_OF_LINE_U",
193 |     "RE_OP_START_OF_STRING",
194 |     "RE_OP_START_OF_WORD",
195 |     "RE_OP_STRING",
196 |     "RE_OP_STRING_FLD",
197 |     "RE_OP_STRING_FLD_REV",
198 |     "RE_OP_STRING_IGN",
199 |     "RE_OP_STRING_IGN_REV",
200 |     "RE_OP_STRING_REV",
201 |     "RE_OP_FUZZY_EXT",
202 |     "RE_OP_BODY_END",
203 |     "RE_OP_BODY_START",
204 |     "RE_OP_END_ATOMIC",
205 |     "RE_OP_END_CONDITIONAL",
206 |     "RE_OP_END_FUZZY",
207 |     "RE_OP_END_GREEDY_REPEAT",
208 |     "RE_OP_END_GROUP",
209 |     "RE_OP_END_LAZY_REPEAT",
210 |     "RE_OP_END_LOOKAROUND",
211 |     "RE_OP_FUZZY_INSERT",
212 |     "RE_OP_GREEDY_REPEAT_ONE",
213 |     "RE_OP_GROUP_RETURN",
214 |     "RE_OP_LAZY_REPEAT_ONE",
215 |     "RE_OP_MATCH_BODY",
216 |     "RE_OP_MATCH_TAIL",
217 |     "RE_OP_START_GROUP",
218 |     "RE_OP_TAIL_START"
219 | };
220 | /* Pattern flags: each RE_FLAG_* is a distinct single-bit mask (0x1..0x10000). */
221 | #define RE_FLAG_ASCII 0x80
222 | #define RE_FLAG_BESTMATCH 0x1000
223 | #define RE_FLAG_DEBUG 0x200
224 | #define RE_FLAG_DOTALL 0x10
225 | #define RE_FLAG_ENHANCEMATCH 0x8000
226 | #define RE_FLAG_FULLCASE 0x4000
227 | #define RE_FLAG_IGNORECASE 0x2
228 | #define RE_FLAG_LOCALE 0x4
229 | #define RE_FLAG_MULTILINE 0x8
230 | #define RE_FLAG_POSIX 0x10000
231 | #define RE_FLAG_REVERSE 0x400
232 | #define RE_FLAG_TEMPLATE 0x1
233 | #define RE_FLAG_UNICODE 0x20
234 | #define RE_FLAG_VERBOSE 0x40
235 | #define RE_FLAG_VERSION0 0x2000
236 | #define RE_FLAG_VERSION1 0x100
237 | #define RE_FLAG_WORD 0x800
238 | 


--------------------------------------------------------------------------------
/regex_3/_regex_unicode.h:
--------------------------------------------------------------------------------
  1 | typedef unsigned char RE_UINT8; /* NOTE(review): RE_*INTn names assume 8/16/32-bit char/short/int. */
  2 | typedef signed char RE_INT8;
  3 | typedef unsigned short RE_UINT16;
  4 | typedef signed short RE_INT16;
  5 | typedef unsigned int RE_UINT32;
  6 | typedef signed int RE_INT32;
  7 | /* Boolean type; FALSE/TRUE are (re)defined unless both already exist. */
  8 | typedef unsigned char BOOL;
  9 | #if !defined(FALSE) || !defined(TRUE)
 10 | #define FALSE 0
 11 | #define TRUE 1
 12 | #endif
 13 | /* Highest codepoints in the ASCII range and in the one-byte (locale) range. */
 14 | #define RE_ASCII_MAX 0x7F
 15 | #define RE_LOCALE_MAX 0xFF
 16 | /* Presumably output-array capacities for the case, folding and SCX lookups. */
 17 | #define RE_MAX_CASES 4
 18 | #define RE_MAX_FOLDED 3
 19 | #define RE_MAX_SCX 23
 20 | /* Property table entry (the element type of re_properties[]). */
 21 | typedef struct RE_Property {
 22 |     RE_UINT16 name; /* presumably an index into re_strings[] - TODO confirm */
 23 |     RE_UINT8 id;
 24 |     RE_UINT8 value_set; /* presumably selects a group of RE_PropertyValue entries */
 25 | } RE_Property;
 26 | /* Property value table entry (the element type of re_property_values[]). */
 27 | typedef struct RE_PropertyValue {
 28 |     RE_UINT16 name; /* presumably an index into re_strings[] - TODO confirm */
 29 |     RE_UINT8 value_set; /* presumably matches RE_Property.value_set - TODO confirm */
 30 |     RE_UINT16 id;
 31 | } RE_PropertyValue;
 32 | /* Entry type of re_get_property[]: takes a codepoint, returns a 32-bit value. */
 33 | typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 codepoint);
 34 | /* Property ids (the high 16 bits of constants such as RE_PROP_LOWER). */
 35 | #define RE_PROP_GC 0x1E
 36 | #define RE_PROP_CASED 0xA
 37 | #define RE_PROP_UPPERCASE 0x5C
 38 | #define RE_PROP_LOWERCASE 0x38
 39 | #define RE_PROP_SCX 0x56
 40 | /* Values 30..38: presumably the grouped (one-letter) general categories. */
 41 | #define RE_PROP_C 30
 42 | #define RE_PROP_L 31
 43 | #define RE_PROP_M 32
 44 | #define RE_PROP_N 33
 45 | #define RE_PROP_P 34
 46 | #define RE_PROP_S 35
 47 | #define RE_PROP_Z 36
 48 | #define RE_PROP_ASSIGNED 37
 49 | #define RE_PROP_CASEDLETTER 38
 50 | /* Values 0..29: presumably the two-letter general categories. */
 51 | #define RE_PROP_CN 0
 52 | #define RE_PROP_CC 1
 53 | #define RE_PROP_ZS 2
 54 | #define RE_PROP_PO 3
 55 | #define RE_PROP_SC 4
 56 | #define RE_PROP_PS 5
 57 | #define RE_PROP_PE 6
 58 | #define RE_PROP_SM 7
 59 | #define RE_PROP_PD 8
 60 | #define RE_PROP_ND 9
 61 | #define RE_PROP_LU 10
 62 | #define RE_PROP_SK 11
 63 | #define RE_PROP_PC 12
 64 | #define RE_PROP_LL 13
 65 | #define RE_PROP_SO 14
 66 | #define RE_PROP_LO 15
 67 | #define RE_PROP_PI 16
 68 | #define RE_PROP_CF 17
 69 | #define RE_PROP_NO 18
 70 | #define RE_PROP_PF 19
 71 | #define RE_PROP_LT 20
 72 | #define RE_PROP_LM 21
 73 | #define RE_PROP_MN 22
 74 | #define RE_PROP_ME 23
 75 | #define RE_PROP_MC 24
 76 | #define RE_PROP_NL 25
 77 | #define RE_PROP_ZL 26
 78 | #define RE_PROP_ZP 27
 79 | #define RE_PROP_CS 28
 80 | #define RE_PROP_CO 29
 81 | /* Group masks: bit n is set for each two-letter value n in the group. */
 82 | #define RE_PROP_C_MASK 0x30020003
 83 | #define RE_PROP_L_MASK 0x0030A400
 84 | #define RE_PROP_M_MASK 0x01C00000
 85 | #define RE_PROP_N_MASK 0x02040200
 86 | #define RE_PROP_P_MASK 0x00091168
 87 | #define RE_PROP_S_MASK 0x00004890
 88 | #define RE_PROP_Z_MASK 0x0C000004
 89 | /* Combined constants: (property id << 16) | value. */
 90 | #define RE_PROP_ALNUM 0x010001
 91 | #define RE_PROP_ALPHA 0x000001
 92 | #define RE_PROP_ANY 0x020001
 93 | #define RE_PROP_ASCII 0x080001
 94 | #define RE_PROP_BLANK 0x070001
 95 | #define RE_PROP_CNTRL 0x1E0001
 96 | #define RE_PROP_DIGIT 0x1E0009
 97 | #define RE_PROP_GRAPH 0x1F0001
 98 | #define RE_PROP_LOWER 0x380001
 99 | #define RE_PROP_PRINT 0x510001
100 | #define RE_PROP_SPACE 0x5F0001
101 | #define RE_PROP_UPPER 0x5C0001
102 | #define RE_PROP_WORD 0x600001
103 | #define RE_PROP_XDIGIT 0x620001
104 | #define RE_PROP_POSIX_ALNUM 0x4C0001
105 | #define RE_PROP_POSIX_DIGIT 0x4D0001
106 | #define RE_PROP_POSIX_PUNCT 0x4E0001
107 | #define RE_PROP_POSIX_XDIGIT 0x4F0001
108 | /* Word-break classes; presumably the values re_get_word_break() returns. */
109 | #define RE_WBREAK_OTHER 0
110 | #define RE_WBREAK_LF 1
111 | #define RE_WBREAK_NEWLINE 2
112 | #define RE_WBREAK_CR 3
113 | #define RE_WBREAK_WSEGSPACE 4
114 | #define RE_WBREAK_DOUBLEQUOTE 5
115 | #define RE_WBREAK_SINGLEQUOTE 6
116 | #define RE_WBREAK_MIDNUM 7
117 | #define RE_WBREAK_MIDNUMLET 8
118 | #define RE_WBREAK_NUMERIC 9
119 | #define RE_WBREAK_MIDLETTER 10
120 | #define RE_WBREAK_ALETTER 11
121 | #define RE_WBREAK_EXTENDNUMLET 12
122 | #define RE_WBREAK_FORMAT 13
123 | #define RE_WBREAK_EXTEND 14
124 | #define RE_WBREAK_HEBREWLETTER 15
125 | #define RE_WBREAK_ZWJ 16
126 | #define RE_WBREAK_KATAKANA 17
127 | #define RE_WBREAK_REGIONALINDICATOR 18
128 | #define RE_WBREAK_EBASE 19
129 | #define RE_WBREAK_EBASEGAZ 20
130 | #define RE_WBREAK_EMODIFIER 21
131 | #define RE_WBREAK_GLUEAFTERZWJ 22
132 | /* Grapheme-break classes; presumably from re_get_grapheme_cluster_break(). */
133 | #define RE_GBREAK_OTHER 0
134 | #define RE_GBREAK_CONTROL 1
135 | #define RE_GBREAK_LF 2
136 | #define RE_GBREAK_CR 3
137 | #define RE_GBREAK_EXTEND 4
138 | #define RE_GBREAK_PREPEND 5
139 | #define RE_GBREAK_SPACINGMARK 6
140 | #define RE_GBREAK_L 7
141 | #define RE_GBREAK_V 8
142 | #define RE_GBREAK_T 9
143 | #define RE_GBREAK_ZWJ 10
144 | #define RE_GBREAK_LV 11
145 | #define RE_GBREAK_LVT 12
146 | #define RE_GBREAK_REGIONALINDICATOR 13
147 | #define RE_GBREAK_EBASE 14
148 | #define RE_GBREAK_EBASEGAZ 15
149 | #define RE_GBREAK_EMODIFIER 16
150 | #define RE_GBREAK_GLUEAFTERZWJ 17
151 | /* Line-break classes; presumably the values re_get_line_break() returns. */
152 | #define RE_LBREAK_UNKNOWN 0
153 | #define RE_LBREAK_COMBININGMARK 1
154 | #define RE_LBREAK_BREAKAFTER 2
155 | #define RE_LBREAK_LINEFEED 3
156 | #define RE_LBREAK_MANDATORYBREAK 4
157 | #define RE_LBREAK_CARRIAGERETURN 5
158 | #define RE_LBREAK_SPACE 6
159 | #define RE_LBREAK_EXCLAMATION 7
160 | #define RE_LBREAK_QUOTATION 8
161 | #define RE_LBREAK_ALPHABETIC 9
162 | #define RE_LBREAK_PREFIXNUMERIC 10
163 | #define RE_LBREAK_POSTFIXNUMERIC 11
164 | #define RE_LBREAK_OPENPUNCTUATION 12
165 | #define RE_LBREAK_CLOSEPARENTHESIS 13
166 | #define RE_LBREAK_INFIXNUMERIC 14
167 | #define RE_LBREAK_HYPHEN 15
168 | #define RE_LBREAK_BREAKSYMBOLS 16
169 | #define RE_LBREAK_NUMERIC 17
170 | #define RE_LBREAK_CLOSEPUNCTUATION 18
171 | #define RE_LBREAK_NEXTLINE 19
172 | #define RE_LBREAK_GLUE 20
173 | #define RE_LBREAK_AMBIGUOUS 21
174 | #define RE_LBREAK_BREAKBEFORE 22
175 | #define RE_LBREAK_HEBREWLETTER 23
176 | #define RE_LBREAK_COMPLEXCONTEXT 24
177 | #define RE_LBREAK_JL 25
178 | #define RE_LBREAK_JV 26
179 | #define RE_LBREAK_JT 27
180 | #define RE_LBREAK_NONSTARTER 28
181 | #define RE_LBREAK_AKSARA 29
182 | #define RE_LBREAK_VIRAMA 30
183 | #define RE_LBREAK_AKSARASTART 31
184 | #define RE_LBREAK_IDEOGRAPHIC 32
185 | #define RE_LBREAK_VIRAMAFINAL 33
186 | #define RE_LBREAK_ZWSPACE 34
187 | #define RE_LBREAK_ZWJ 35
188 | #define RE_LBREAK_BREAKBOTH 36
189 | #define RE_LBREAK_INSEPARABLE 37
190 | #define RE_LBREAK_WORDJOINER 38
191 | #define RE_LBREAK_EBASE 39
192 | #define RE_LBREAK_CONDITIONALJAPANESESTARTER 40
193 | #define RE_LBREAK_H2 41
194 | #define RE_LBREAK_H3 42
195 | #define RE_LBREAK_SURROGATE 43
196 | #define RE_LBREAK_CONTINGENTBREAK 44
197 | #define RE_LBREAK_AKSARAPREBASE 45
198 | #define RE_LBREAK_REGIONALINDICATOR 46
199 | #define RE_LBREAK_EMODIFIER 47
200 | /* Indic-conjunct-break classes; presumably from re_get_indic_conjunct_break(). */
201 | #define RE_INCB_NONE 0
202 | #define RE_INCB_EXTEND 1
203 | #define RE_INCB_CONSONANT 2
204 | #define RE_INCB_LINKER 3
205 | /* Fixed-size lookup tables; declared here, defined outside this header. */
206 | extern char* re_strings[1530];
207 | extern RE_Property re_properties[185];
208 | extern RE_PropertyValue re_property_values[1680];
209 | extern RE_UINT16 re_expand_on_folding[104];
210 | extern RE_GetPropertyFunc re_get_property[101];
211 | /* Property getters. All but three take one codepoint and return RE_UINT32. */
212 | RE_UINT32 re_get_alphabetic(RE_UINT32 codepoint);
213 | RE_UINT32 re_get_alphanumeric(RE_UINT32 codepoint);
214 | RE_UINT32 re_get_any(RE_UINT32 codepoint);
215 | RE_UINT32 re_get_ascii_hex_digit(RE_UINT32 codepoint);
216 | RE_UINT32 re_get_bidi_class(RE_UINT32 codepoint);
217 | RE_UINT32 re_get_bidi_control(RE_UINT32 codepoint);
218 | RE_UINT32 re_get_bidi_mirrored(RE_UINT32 codepoint);
219 | RE_UINT32 re_get_blank(RE_UINT32 codepoint);
220 | RE_UINT32 re_get_block(RE_UINT32 codepoint);
221 | RE_UINT32 re_get_canonical_combining_class(RE_UINT32 codepoint);
222 | RE_UINT32 re_get_cased(RE_UINT32 codepoint);
223 | RE_UINT32 re_get_case_ignorable(RE_UINT32 codepoint);
224 | RE_UINT32 re_get_changes_when_casefolded(RE_UINT32 codepoint);
225 | RE_UINT32 re_get_changes_when_casemapped(RE_UINT32 codepoint);
226 | RE_UINT32 re_get_changes_when_lowercased(RE_UINT32 codepoint);
227 | RE_UINT32 re_get_changes_when_titlecased(RE_UINT32 codepoint);
228 | RE_UINT32 re_get_changes_when_uppercased(RE_UINT32 codepoint);
229 | RE_UINT32 re_get_dash(RE_UINT32 codepoint);
230 | RE_UINT32 re_get_decomposition_type(RE_UINT32 codepoint);
231 | RE_UINT32 re_get_default_ignorable_code_point(RE_UINT32 codepoint);
232 | RE_UINT32 re_get_deprecated(RE_UINT32 codepoint);
233 | RE_UINT32 re_get_diacritic(RE_UINT32 codepoint);
234 | RE_UINT32 re_get_east_asian_width(RE_UINT32 codepoint);
235 | RE_UINT32 re_get_emoji(RE_UINT32 codepoint);
236 | RE_UINT32 re_get_emoji_component(RE_UINT32 codepoint);
237 | RE_UINT32 re_get_emoji_modifier(RE_UINT32 codepoint);
238 | RE_UINT32 re_get_emoji_modifier_base(RE_UINT32 codepoint);
239 | RE_UINT32 re_get_emoji_presentation(RE_UINT32 codepoint);
240 | RE_UINT32 re_get_extended_pictographic(RE_UINT32 codepoint);
241 | RE_UINT32 re_get_extender(RE_UINT32 codepoint);
242 | RE_UINT32 re_get_general_category(RE_UINT32 codepoint);
243 | RE_UINT32 re_get_graph(RE_UINT32 codepoint);
244 | RE_UINT32 re_get_grapheme_base(RE_UINT32 codepoint);
245 | RE_UINT32 re_get_grapheme_cluster_break(RE_UINT32 codepoint);
246 | RE_UINT32 re_get_grapheme_extend(RE_UINT32 codepoint);
247 | RE_UINT32 re_get_grapheme_link(RE_UINT32 codepoint);
248 | RE_UINT32 re_get_hangul_syllable_type(RE_UINT32 codepoint);
249 | RE_UINT32 re_get_hex_digit(RE_UINT32 codepoint);
250 | RE_UINT32 re_get_horiz_space(RE_UINT32 codepoint);
251 | RE_UINT32 re_get_hyphen(RE_UINT32 codepoint);
252 | RE_UINT32 re_get_id_compat_math_continue(RE_UINT32 codepoint);
253 | RE_UINT32 re_get_id_compat_math_start(RE_UINT32 codepoint);
254 | RE_UINT32 re_get_id_continue(RE_UINT32 codepoint);
255 | RE_UINT32 re_get_ideographic(RE_UINT32 codepoint);
256 | RE_UINT32 re_get_ids_binary_operator(RE_UINT32 codepoint);
257 | RE_UINT32 re_get_id_start(RE_UINT32 codepoint);
258 | RE_UINT32 re_get_ids_trinary_operator(RE_UINT32 codepoint);
259 | RE_UINT32 re_get_ids_unary_operator(RE_UINT32 codepoint);
260 | RE_UINT32 re_get_indic_conjunct_break(RE_UINT32 codepoint);
261 | RE_UINT32 re_get_indic_positional_category(RE_UINT32 codepoint);
262 | RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 codepoint);
263 | RE_UINT32 re_get_join_control(RE_UINT32 codepoint);
264 | RE_UINT32 re_get_joining_group(RE_UINT32 codepoint);
265 | RE_UINT32 re_get_joining_type(RE_UINT32 codepoint);
266 | RE_UINT32 re_get_line_break(RE_UINT32 codepoint);
267 | RE_UINT32 re_get_logical_order_exception(RE_UINT32 codepoint);
268 | RE_UINT32 re_get_lowercase(RE_UINT32 codepoint);
269 | RE_UINT32 re_get_math(RE_UINT32 codepoint);
270 | RE_UINT32 re_get_modifier_combining_mark(RE_UINT32 codepoint);
271 | RE_UINT32 re_get_nfc_quick_check(RE_UINT32 codepoint);
272 | RE_UINT32 re_get_nfd_quick_check(RE_UINT32 codepoint);
273 | RE_UINT32 re_get_nfkc_quick_check(RE_UINT32 codepoint);
274 | RE_UINT32 re_get_nfkd_quick_check(RE_UINT32 codepoint);
275 | RE_UINT32 re_get_noncharacter_code_point(RE_UINT32 codepoint);
276 | RE_UINT32 re_get_numeric_type(RE_UINT32 codepoint);
277 | RE_UINT32 re_get_numeric_value(RE_UINT32 codepoint);
278 | RE_UINT32 re_get_other_alphabetic(RE_UINT32 codepoint);
279 | RE_UINT32 re_get_other_default_ignorable_code_point(RE_UINT32 codepoint);
280 | RE_UINT32 re_get_other_grapheme_extend(RE_UINT32 codepoint);
281 | RE_UINT32 re_get_other_id_continue(RE_UINT32 codepoint);
282 | RE_UINT32 re_get_other_id_start(RE_UINT32 codepoint);
283 | RE_UINT32 re_get_other_lowercase(RE_UINT32 codepoint);
284 | RE_UINT32 re_get_other_math(RE_UINT32 codepoint);
285 | RE_UINT32 re_get_other_uppercase(RE_UINT32 codepoint);
286 | RE_UINT32 re_get_pattern_syntax(RE_UINT32 codepoint);
287 | RE_UINT32 re_get_pattern_white_space(RE_UINT32 codepoint);
288 | RE_UINT32 re_get_posix_alnum(RE_UINT32 codepoint);
289 | RE_UINT32 re_get_posix_digit(RE_UINT32 codepoint);
290 | RE_UINT32 re_get_posix_punct(RE_UINT32 codepoint);
291 | RE_UINT32 re_get_posix_xdigit(RE_UINT32 codepoint);
292 | RE_UINT32 re_get_prepended_concatenation_mark(RE_UINT32 codepoint);
293 | RE_UINT32 re_get_print(RE_UINT32 codepoint);
294 | RE_UINT32 re_get_quotation_mark(RE_UINT32 codepoint);
295 | RE_UINT32 re_get_radical(RE_UINT32 codepoint);
296 | RE_UINT32 re_get_regional_indicator(RE_UINT32 codepoint);
297 | RE_UINT32 re_get_script(RE_UINT32 codepoint);
298 | int re_get_script_extensions(RE_UINT32 codepoint, RE_UINT8* scripts); /* presumably fills scripts (<= RE_MAX_SCX) and returns the count - TODO confirm */
299 | RE_UINT32 re_get_sentence_break(RE_UINT32 codepoint);
300 | RE_UINT32 re_get_sentence_terminal(RE_UINT32 codepoint);
301 | RE_UINT32 re_get_soft_dotted(RE_UINT32 codepoint);
302 | RE_UINT32 re_get_terminal_punctuation(RE_UINT32 codepoint);
303 | RE_UINT32 re_get_unified_ideograph(RE_UINT32 codepoint);
304 | RE_UINT32 re_get_uppercase(RE_UINT32 codepoint);
305 | RE_UINT32 re_get_variation_selector(RE_UINT32 codepoint);
306 | RE_UINT32 re_get_vert_space(RE_UINT32 codepoint);
307 | RE_UINT32 re_get_white_space(RE_UINT32 codepoint);
308 | RE_UINT32 re_get_word(RE_UINT32 codepoint);
309 | RE_UINT32 re_get_word_break(RE_UINT32 codepoint);
310 | RE_UINT32 re_get_xdigit(RE_UINT32 codepoint);
311 | RE_UINT32 re_get_xid_continue(RE_UINT32 codepoint);
312 | RE_UINT32 re_get_xid_start(RE_UINT32 codepoint);
313 | int re_get_all_cases(RE_UINT32 codepoint, RE_UINT32* cases); /* presumably writes <= RE_MAX_CASES entries - TODO confirm */
314 | RE_UINT32 re_get_simple_case_folding(RE_UINT32 codepoint);
315 | int re_get_full_case_folding(RE_UINT32 codepoint, RE_UINT32* folded); /* presumably writes <= RE_MAX_FOLDED entries - TODO confirm */
316 | 


--------------------------------------------------------------------------------
/regex_3/regex.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Secret Labs' Regular Expression Engine
  3 | #
  4 | # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.
  5 | #
  6 | # This version of the SRE library can be redistributed under CNRI's
  7 | # Python 1.6 license.  For any other use, please contact Secret Labs
  8 | # AB (info@pythonware.com).
  9 | #
 10 | # Portions of this engine have been developed in cooperation with
 11 | # CNRI.  Hewlett-Packard provided funding for 1.6 integration and
 12 | # other compatibility work.
 13 | #
 14 | # 2010-01-16 mrab Python front-end re-written and extended
 15 | 
 16 | r"""Support for regular expressions (RE).
 17 | 
 18 | This module provides regular expression matching operations similar to those
 19 | found in Perl. It supports both 8-bit and Unicode strings; both the pattern and
 20 | the strings being processed can contain null bytes and characters outside the
 21 | US ASCII range.
 22 | 
 23 | Regular expressions can contain both special and ordinary characters. Most
 24 | ordinary characters, like "A", "a", or "0", are the simplest regular
 25 | expressions; they simply match themselves. You can concatenate ordinary
 26 | characters, so last matches the string 'last'.
 27 | 
 28 | There are a few differences between the old (legacy) behaviour and the new
 29 | (enhanced) behaviour, which are indicated by VERSION0 or VERSION1.
 30 | 
 31 | The special characters are:
 32 |     "."                 Matches any character except a newline.
 33 |     "^"                 Matches the start of the string.
 34 |     "$"                 Matches the end of the string or just before the
 35 |                         newline at the end of the string.
 36 |     "*"                 Matches 0 or more (greedy) repetitions of the preceding
 37 |                         RE. Greedy means that it will match as many repetitions
 38 |                         as possible.
 39 |     "+"                 Matches 1 or more (greedy) repetitions of the preceding
 40 |                         RE.
 41 |     "?"                 Matches 0 or 1 (greedy) of the preceding RE.
 42 |     *?,+?,??            Non-greedy versions of the previous three special
 43 |                         characters.
 44 |     *+,++,?+            Possessive versions of the previous three special
 45 |                         characters.
 46 |     {m,n}               Matches from m to n repetitions of the preceding RE.
 47 |     {m,n}?              Non-greedy version of the above.
 48 |     {m,n}+              Possessive version of the above.
 49 |     {...}               Fuzzy matching constraints.
 50 |     "\\"                Either escapes special characters or signals a special
 51 |                         sequence.
 52 |     [...]               Indicates a set of characters. A "^" as the first
 53 |                         character indicates a complementing set.
 54 |     "|"                 A|B, creates an RE that will match either A or B.
 55 |     (...)               Matches the RE inside the parentheses. The contents are
 56 |                         captured and can be retrieved or matched later in the
 57 |                         string.
 58 |     (?flags-flags)      VERSION1: Sets/clears the flags for the remainder of
 59 |                         the group or pattern; VERSION0: Sets the flags for the
 60 |                         entire pattern.
 61 |     (?:...)             Non-capturing version of regular parentheses.
 62 |     (?>...)             Atomic non-capturing version of regular parentheses.
 63 |     (?flags-flags:...)  Non-capturing version of regular parentheses with local
 64 |                         flags.
 65 |     (?P<name>...)       The substring matched by the group is accessible by
 66 |                         name.
 67 |     (?<name>...)        The substring matched by the group is accessible by
 68 |                         name.
 69 |     (?P=name)           Matches the text matched earlier by the group named
 70 |                         name.
 71 |     (?#...)             A comment; ignored.
 72 |     (?=...)             Matches if ... matches next, but doesn't consume the
 73 |                         string.
 74 |     (?!...)             Matches if ... doesn't match next.
 75 |     (?<=...)            Matches if preceded by ....
 76 |     (?<!...)            Matches if not preceded by ....
 77 |     (?(id)yes|no)       Matches yes pattern if group id matched, the (optional)
 78 |                         no pattern otherwise.
 79 |     (?(DEFINE)...)      If there's no group called "DEFINE", then ... will be
 80 |                         ignored, but any group definitions will be available.
 81 |     (?|...|...)         (?|A|B), creates an RE that will match either A or B,
 82 |                         but reuses capture group numbers across the
 83 |                         alternatives.
 84 |     (*FAIL)             Forces matching to fail, which means immediate
 85 |                         backtracking.
 86 |     (*F)                Abbreviation for (*FAIL).
 87 |     (*PRUNE)            Discards the current backtracking information. Its
 88 |                         effect doesn't extend outside an atomic group or a
 89 |                         lookaround.
 90 |     (*SKIP)             Similar to (*PRUNE), except that it also sets where in
 91 |                         the text the next attempt at matching the entire
 92 |                         pattern will start. Its effect doesn't extend outside
 93 |                         an atomic group or a lookaround.
 94 | 
 95 | The fuzzy matching constraints are: "i" to permit insertions, "d" to permit
 96 | deletions, "s" to permit substitutions, "e" to permit any of these. Limits are
 97 | optional with "<=" and "<". If any type of error is provided then any type not
 98 | provided is not permitted.
 99 | 
100 | A cost equation may be provided.
101 | 
102 | Examples:
103 |     (?:fuzzy){i<=2}
104 |     (?:fuzzy){i<=1,s<=2,d<=1,1i+1s+1d<3}
105 | 
106 | VERSION1: Set operators are supported, and a set can include nested sets. The
107 | set operators, in order of increasing precedence, are:
108 |     ||  Set union ("x||y" means "x or y").
109 |     ~~  (double tilde) Symmetric set difference ("x~~y" means "x or y, but not
110 |         both").
111 |     &&  Set intersection ("x&&y" means "x and y").
112 |     --  (double dash) Set difference ("x--y" means "x but not y").
113 | 
114 | Implicit union, i.e. simple juxtaposition as in [ab], has the highest
115 | precedence.
116 | 
117 | VERSION0 and VERSION1:
118 | The special sequences consist of "\\" and a character from the list below. If
119 | the ordinary character is not on the list, then the resulting RE will match the
120 | second character.
121 |     \number         Matches the contents of the group of the same number if
122 |                     number is no more than 2 digits, otherwise the character
123 |                     with the 3-digit octal code.
124 |     \a              Matches the bell character.
125 |     \A              Matches only at the start of the string.
126 |     \b              Matches the empty string, but only at the start or end of a
127 |                     word.
128 |     \B              Matches the empty string, but not at the start or end of a
129 |                     word.
130 |     \d              Matches any decimal digit; equivalent to the set [0-9] when
131 |                     matching a bytestring or a Unicode string with the ASCII
132 |                     flag, or the whole range of Unicode digits when matching a
133 |                     Unicode string.
134 |     \D              Matches any non-digit character; equivalent to [^\d].
135 |     \f              Matches the formfeed character.
136 |     \g<name>        Matches the text matched by the group named name.
137 |     \G              Matches the empty string, but only at the position where
138 |                     the search started.
139 |     \h              Matches horizontal whitespace.
140 |     \K              Keeps only what follows for the entire match.
141 |     \L<name>        Named list. The list is provided as a keyword argument.
142 |     \m              Matches the empty string, but only at the start of a word.
143 |     \M              Matches the empty string, but only at the end of a word.
144 |     \n              Matches the newline character.
145 |     \N{name}        Matches the named character.
146 |     \p{name=value}  Matches the character if its property has the specified
147 |                     value.
148 |     \P{name=value}  Matches the character if its property doesn't have the
149 |                     specified value.
150 |     \r              Matches the carriage-return character.
151 |     \s              Matches any whitespace character; equivalent to
152 |                     [ \t\n\r\f\v].
153 |     \S              Matches any non-whitespace character; equivalent to [^\s].
154 |     \t              Matches the tab character.
155 |     \uXXXX          Matches the Unicode codepoint with 4-digit hex code XXXX.
156 |     \UXXXXXXXX      Matches the Unicode codepoint with 8-digit hex code
157 |                     XXXXXXXX.
158 |     \v              Matches the vertical tab character.
159 |     \w              Matches any alphanumeric character; equivalent to
160 |                     [a-zA-Z0-9_] when matching a bytestring or a Unicode string
161 |                     with the ASCII flag, or the whole range of Unicode
162 |                     alphanumeric characters (letters plus digits plus
163 |                     underscore) when matching a Unicode string. With LOCALE, it
164 |                     will match the set [0-9_] plus characters defined as
165 |                     letters for the current locale.
166 |     \W              Matches the complement of \w; equivalent to [^\w].
167 |     \xXX            Matches the character with 2-digit hex code XX.
168 |     \X              Matches a grapheme.
169 |     \Z              Matches only at the end of the string.
170 |     \\              Matches a literal backslash.
171 | 
172 | This module exports the following functions:
173 |     match      Match a regular expression pattern at the beginning of a string.
174 |     fullmatch  Match a regular expression pattern against all of a string.
175 |     search     Search a string for the presence of a pattern.
176 |     sub        Substitute occurrences of a pattern found in a string using a
177 |                template string.
178 |     subf       Substitute occurrences of a pattern found in a string using a
179 |                format string.
180 |     subn       Same as sub, but also return the number of substitutions made.
181 |     subfn      Same as subf, but also return the number of substitutions made.
182 |     split      Split a string by the occurrences of a pattern. VERSION1: will
183 |                split at zero-width match; VERSION0: won't split at zero-width
184 |                match.
185 |     splititer  Return an iterator yielding the parts of a split string.
186 |     findall    Find all occurrences of a pattern in a string.
187 |     finditer   Return an iterator yielding a match object for each match.
188 |     compile    Compile a pattern into a Pattern object.
189 |     purge      Clear the regular expression cache.
190 |     escape     Backslash all non-alphanumerics or special characters in a
191 |                string.
192 | 
193 | Most of the functions support a concurrent parameter: if True, the GIL will be
194 | released during matching, allowing other Python threads to run concurrently. If
195 | the string changes during matching, the behaviour is undefined. This parameter
196 | is not needed when working on the builtin (immutable) string classes.
197 | 
198 | Some of the functions in this module take flags as optional parameters. Most of
199 | these flags can also be set within an RE:
200 |     A   a   ASCII         Make \w, \W, \b, \B, \d, and \D match the
201 |                           corresponding ASCII character categories. Default
202 |                           when matching a bytestring.
203 |     B   b   BESTMATCH     Find the best fuzzy match (default is first).
204 |     D       DEBUG         Print the parsed pattern.
205 |     E   e   ENHANCEMATCH  Attempt to improve the fit after finding the first
206 |                           fuzzy match.
207 |     F   f   FULLCASE      Use full case-folding when performing
208 |                           case-insensitive matching in Unicode.
209 |     I   i   IGNORECASE    Perform case-insensitive matching.
210 |     L   L   LOCALE        Make \w, \W, \b, \B, \d, and \D dependent on the
211 |                           current locale. (One byte per character only.)
212 |     M   m   MULTILINE     "^" matches the beginning of lines (after a newline)
213 |                           as well as the string. "$" matches the end of lines
214 |                           (before a newline) as well as the end of the string.
215 |     P   p   POSIX         Perform POSIX-standard matching (leftmost longest).
216 |     R   r   REVERSE       Searches backwards.
217 |     S   s   DOTALL        "." matches any character at all, including the
218 |                           newline.
219 |     U   u   UNICODE       Make \w, \W, \b, \B, \d, and \D dependent on the
220 |                           Unicode locale. Default when matching a Unicode
221 |                           string.
222 |     V0  V0  VERSION0      Turn on the old legacy behaviour.
223 |     V1  V1  VERSION1      Turn on the new enhanced behaviour. This flag
224 |                           includes the FULLCASE flag.
225 |     W   w   WORD          Make \b and \B work with default Unicode word breaks
226 |                           and make ".", "^" and "$" work with Unicode line
227 |                           breaks.
228 |     X   x   VERBOSE       Ignore whitespace and comments for nicer looking REs.
229 | 
230 | This module also defines an exception 'error'.
231 | 
232 | """
233 | 
234 | # Public symbols: the names exported by a star-import of this module.
235 | __all__ = ["cache_all", "compile", "DEFAULT_VERSION", "escape", "findall",
236 |   "finditer", "fullmatch", "match", "purge", "search", "split", "splititer",
237 |   "sub", "subf", "subfn", "subn", "template", "Scanner", "A", "ASCII", "B",
238 |   "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", "S", "DOTALL", "F",
239 |   "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P", "POSIX",
240 |   "R", "REVERSE", "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1",
241 |   "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex", "__version__",
242 |   "__doc__", "RegexFlag"]
243 | 
244 | __version__ = "2.5.153"
245 | 
246 | # --------------------------------------------------------------------
247 | # Public interface.
248 | 
def match(pattern, string, flags=0, pos=None, endpos=None, partial=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Apply the pattern at the start of the string.

    The pattern is compiled with the given flags (extra keyword arguments are
    handed to the compiler as named lists) and its match() method is then
    called on the string. Returns a match object, or None if no match was
    found.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    result = compiled.match(string, pos, endpos, concurrent, partial, timeout)
    return result
255 | 
def fullmatch(pattern, string, flags=0, pos=None, endpos=None, partial=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Apply the pattern against the whole of the string.

    The pattern is compiled with the given flags (extra keyword arguments are
    handed to the compiler as named lists) and its fullmatch() method is then
    called on the string. Returns a match object, or None if no match was
    found.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    result = compiled.fullmatch(string, pos, endpos, concurrent, partial,
      timeout)
    return result
262 | 
def search(pattern, string, flags=0, pos=None, endpos=None, partial=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Look through the string for a match to the pattern.

    The pattern is compiled with the given flags (extra keyword arguments are
    handed to the compiler as named lists) and its search() method is then
    called on the string. Returns a match object, or None if no match was
    found.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    result = compiled.search(string, pos, endpos, concurrent, partial, timeout)
    return result
269 | 
def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Replace occurrences of the pattern in string with repl.

    The leftmost (or, with a reverse pattern, the rightmost) non-overlapping
    occurrences are replaced and the resulting string is returned.

    repl may be a string or a callable. A string has its backslash escapes
    processed; a callable is passed the match object and must return the
    replacement string to use.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    replaced = compiled.sub(repl, string, count, pos, endpos, concurrent,
      timeout)
    return replaced
279 | 
def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Replace occurrences of the pattern in string using a format.

    The leftmost (or, with a reverse pattern, the rightmost) non-overlapping
    occurrences are replaced and the resulting string is returned.

    format may be a string or a callable. A string is treated as a format
    string; a callable is passed the match object and must return the
    replacement string to use.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    replaced = compiled.subf(format, string, count, pos, endpos, concurrent,
      timeout)
    return replaced
289 | 
def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Like sub(), but also report how many replacements were made.

    Returns a 2-tuple (new_string, number). new_string is the source string
    with the leftmost (or, with a reverse pattern, the rightmost)
    non-overlapping occurrences of the pattern replaced by repl; number is the
    number of substitutions that were made.

    repl may be a string or a callable. A string has its backslash escapes
    processed; a callable is passed the match object and must return the
    replacement string to use.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    outcome = compiled.subn(repl, string, count, pos, endpos, concurrent,
      timeout)
    return outcome
301 | 
def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Like subf(), but also report how many replacements were made.

    Returns a 2-tuple (new_string, number). new_string is the source string
    with the leftmost (or, with a reverse pattern, the rightmost)
    non-overlapping occurrences of the pattern replaced using format; number
    is the number of substitutions that were made.

    format may be a string or a callable. A string is treated as a format
    string; a callable is passed the match object and must return the
    replacement string to use.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    outcome = compiled.subfn(format, string, count, pos, endpos, concurrent,
      timeout)
    return outcome
313 | 
def split(pattern, string, maxsplit=0, flags=0, concurrent=None, timeout=None,
  ignore_unused=False, **kwargs):
    """Split the source string at the occurrences of the pattern.

    Returns a list of the resulting substrings. If the pattern contains
    capturing parentheses, the text of all of its groups is returned as part
    of that list as well. If maxsplit is nonzero, at most maxsplit splits
    occur and the remainder of the string becomes the final element of the
    list.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    parts = compiled.split(string, maxsplit, concurrent, timeout)
    return parts
324 | 
def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None,
  timeout=None, ignore_unused=False, **kwargs):
    """Return an iterator that yields the parts of a split string.

    The pattern is compiled with the given flags (extra keyword arguments are
    handed to the compiler as named lists) and its splititer() method is then
    called on the string.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    parts_iter = compiled.splititer(string, maxsplit, concurrent, timeout)
    return parts_iter
330 | 
def findall(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return a list of all the matches in the string.

    The matches may overlap if overlapped is True. If the pattern contains one
    or more groups, a list of groups is returned instead; that list holds
    tuples when the pattern has more than one group. Empty matches are
    included in the result.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    found = compiled.findall(string, pos, endpos, overlapped, concurrent,
      timeout)
    return found
339 | 
def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
  partial=False, concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return an iterator over all the matches in the string.

    The matches may overlap if overlapped is True. The iterator returns a
    match object for each match. Empty matches are included in the result.
    """
    compiled = _compile(pattern, flags, ignore_unused, kwargs, True)
    # Note the argument order expected by the pattern method: 'concurrent'
    # comes before 'partial'.
    matches = compiled.finditer(string, pos, endpos, overlapped, concurrent,
      partial, timeout)
    return matches
348 | 
def compile(pattern, flags=0, ignore_unused=False, cache_pattern=None, **kwargs):
    """Compile a regular expression pattern, returning a pattern object.

    cache_pattern says whether the compiled pattern should be cached. When it
    is None, the module-wide setting controlled by cache_all() is used
    instead.
    """
    use_cache = _cache_all if cache_pattern is None else cache_pattern
    return _compile(pattern, flags, ignore_unused, kwargs, use_cache)
354 | 
def purge():
    """Clear the regular expression caches.

    Empties every cache this module maintains:

    * _cache             - the compiled patterns;
    * _named_args        - the named lists required by each known pattern;
    * _locale_sensitive  - whether each known pattern has an inline locale
                           flag;
    * _replacement_cache - the compiled replacement templates.

    The first three are trimmed together by _shrink_cache, so they are cleared
    together here too; leaving _named_args or _replacement_cache populated
    would keep entries for purged patterns alive. All of them are rebuilt
    lazily on the next compilation or substitution.
    """
    _cache.clear()
    _named_args.clear()
    _locale_sensitive.clear()
    _replacement_cache.clear()
359 | 
# Module-wide default for compile(): should explicitly compiled patterns be
# cached as well?
_cache_all = True

def cache_all(value=True):
    """Set whether all patterns are cached, even those that are compiled
    explicitly.

    Passing None changes nothing and returns the current setting; any other
    value is stored as the new setting and None is returned.
    """
    global _cache_all

    if value is not None:
        _cache_all = value
        return None

    return _cache_all
def template(pattern, flags=0):
    """Compile a template pattern, returning a pattern object.

    The TEMPLATE flag is added to the given flags. No named lists are passed
    and the compiled pattern is not cached.
    """
    template_flags = flags | TEMPLATE
    return _compile(pattern, template_flags, False, {}, False)
376 | 
def escape(pattern, special_only=True, literal_spaces=False):
    """Escape a string so that it can be used as a literal in a pattern.

    If special_only is True, only special characters (the metacharacters and
    whitespace) are escaped; otherwise every non-alphanumeric character is
    escaped. If literal_spaces is True, spaces are left unescaped.

    A bytes pattern is returned as bytes, anything else as a str.
    """
    # Work on text internally; latin-1 maps each byte to one character.
    was_bytes = isinstance(pattern, bytes)
    text = pattern.decode("latin-1") if was_bytes else pattern

    # Pick the rule that decides whether a character needs a backslash.
    if special_only:
        def needs_escape(ch):
            return ch in _METACHARS or ch.isspace()
    else:
        def needs_escape(ch):
            return ch not in _ALNUM

    pieces = []
    for ch in text:
        keep_space = ch == " " and literal_spaces
        if keep_space or not needs_escape(ch):
            pieces.append(ch)
        else:
            pieces.extend(("\\", ch))

    escaped = "".join(pieces)

    # Hand back the same type that was passed in.
    if isinstance(pattern, bytes):
        escaped = escaped.encode("latin-1")

    return escaped
413 | 
414 | # --------------------------------------------------------------------
415 | # Internals.
416 | 
417 | import regex._regex_core as _regex_core
418 | import regex._regex as _regex
419 | from threading import RLock as _RLock
420 | from locale import getpreferredencoding as _getpreferredencoding
421 | from regex._regex_core import *
422 | from regex._regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError,
423 |   _UnscopedFlagSet, _check_group_features, _compile_firstset,
424 |   _compile_replacement, _flatten_code, _fold_case, _get_required_string,
425 |   _parse_pattern, _shrink_cache)
426 | from regex._regex_core import (ALNUM as _ALNUM, Info as _Info, OP as _OP, Source
427 |   as _Source, Fuzzy as _Fuzzy)
428 | 
# Version 0 is the old behaviour, compatible with the original 're' module.
# Version 1 is the new behaviour, which differs slightly.

# The version assumed when a pattern specifies neither VERSION0 nor VERSION1.
DEFAULT_VERSION = VERSION0

# The characters that escape() treats as special when special_only is true.
_METACHARS = frozenset("()[]{}?*+|^$\\.-#&~")

# Keep the core module's default in step with this one.
_regex_core.DEFAULT_VERSION = DEFAULT_VERSION

# Caches for the patterns and replacements.
#
# _cache:             cache key -> compiled pattern (see _compile for the key).
# _cache_lock:        held while the pattern caches are being shrunk.
# _named_args:        (pattern, type, flags) -> the named lists it requires.
# _replacement_cache: (pattern string, flags, template) -> compiled template.
# _locale_sensitive:  (type, pattern) -> whether it has an inline locale flag.
_cache = {}
_cache_lock = _RLock()
_named_args = {}
_replacement_cache = {}
_locale_sensitive = {}

# Maximum size of the cache.
# _MAXCACHE bounds the pattern cache, _MAXREPCACHE the replacement cache.
_MAXCACHE = 500
_MAXREPCACHE = 500
448 | 
def _compile(pattern, flags, ignore_unused, kwargs, cache_it):
    """Compiles a regular expression to a PatternObject.

    Arguments:
        pattern: the pattern as a str or bytes, or an already-compiled
            Pattern (which is returned unchanged, provided flags is zero).
        flags: the flags passed by the caller.
        ignore_unused: if true, don't complain about keyword arguments that
            the pattern doesn't use.
        kwargs: dict of the caller's extra keyword arguments; these supply the
            named lists referenced by the pattern.
        cache_it: whether to look the pattern up in, and store it in, the
            cache. Forced to False when the DEBUG flag is set.

    Returns the compiled pattern object.

    Raises:
        error: the pattern is invalid, or a named list that a cached pattern
            is known to need was not supplied.
        ValueError: conflicting flags, flags given with a compiled pattern,
            the UNICODE flag with a bytes pattern, or an unused keyword
            argument (unless ignore_unused is true).
        TypeError: pattern is not a str, bytes or compiled pattern.
    """

    # Re-read the package-level DEFAULT_VERSION on every call so that a value
    # assigned to regex.DEFAULT_VERSION after import is picked up here.
    global DEFAULT_VERSION
    try:
        from regex import DEFAULT_VERSION
    except ImportError:
        pass

    # We won't bother to cache the pattern if we're debugging.
    if (flags & DEBUG) != 0:
        cache_it = False

    # What locale is this pattern using?
    # A pattern that hasn't been seen before is assumed to be locale-sensitive
    # (the default of True) until it has been parsed.
    locale_key = (type(pattern), pattern)
    if _locale_sensitive.get(locale_key, True) or (flags & LOCALE) != 0:
        # This pattern is, or might be, locale-sensitive.
        pattern_locale = _getpreferredencoding()
    else:
        # This pattern is definitely not locale-sensitive.
        pattern_locale = None

    def complain_unused_args():
        # Reads 'args_needed' from the enclosing scope, so it must only be
        # called once that name has been bound.
        if ignore_unused:
            return

        # Complain about any unused keyword arguments, possibly resulting from a typo.
        unused_kwargs = set(kwargs) - {k for k, v in args_needed}
        if unused_kwargs:
            any_one = next(iter(unused_kwargs))
            raise ValueError('unused keyword argument {!a}'.format(any_one))

    if cache_it:
        try:
            # Do we know what keyword arguments are needed?
            # (A KeyError here means that the pattern hasn't been seen before.)
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Are we being provided with its required keyword arguments?
            args_supplied = set()
            if args_needed:
                for k, v in args_needed:
                    try:
                        args_supplied.add((k, frozenset(kwargs[k])))
                    except KeyError:
                        raise error("missing named list: {!r}".format(k))

            complain_unused_args()

            args_supplied = frozenset(args_supplied)

            # Have we already seen this regular expression and named list?
            pattern_key = (pattern, type(pattern), flags, args_supplied,
              DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # It's a new pattern, or new named list for a known pattern.
            pass

    # Guess the encoding from the class of the pattern string.
    if isinstance(pattern, str):
        guess_encoding = UNICODE
    elif isinstance(pattern, bytes):
        guess_encoding = ASCII
    elif isinstance(pattern, Pattern):
        # It's already compiled, so there's nothing to do except reject flags.
        if flags:
            raise ValueError("cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Set the default version in the core code in case it has been changed.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    global_flags = flags

    # Parse the pattern. The parser raises _UnscopedFlagSet when it needs the
    # parse to be restarted with a different set of global flags, so keep
    # trying until a parse runs to completion.
    while True:
        caught_exception = None
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Remember the global flags for the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # The error is re-created and raised outside the 'except' block rather
        # than re-raised inside it - presumably to keep the parser's internal
        # frames and exception chaining out of the traceback.
        if caught_exception:
            raise error(caught_exception.msg, caught_exception.pattern,
              caught_exception.pos)

    # The parser stopped before reaching the end of the pattern.
    if not source.at_end():
        raise error("unbalanced parenthesis", pattern, source.pos)

    # Check the global flags for conflicts.
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version not in (0, VERSION0, VERSION1):
        raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

    if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE):
        raise ValueError("ASCII, LOCALE and UNICODE flags are mutually incompatible")

    if isinstance(pattern, bytes) and (info.flags & UNICODE):
        raise ValueError("cannot use UNICODE flag with a bytes pattern")

    # No encoding flag was given, so choose one from the type of the pattern.
    if not (info.flags & _ALL_ENCODINGS):
        if isinstance(pattern, str):
            info.flags |= UNICODE
        else:
            info.flags |= ASCII

    reverse = bool(info.flags & REVERSE)
    fuzzy = isinstance(parsed, _Fuzzy)

    # Remember whether this pattern has an inline locale flag.
    _locale_sensitive[locale_key] = info.inline_locale

    # Fix the group references.
    caught_exception = None
    try:
        parsed.fix_groups(pattern, reverse, False)
    except error as e:
        caught_exception = e

    # As above, the error is re-created outside the 'except' block.
    if caught_exception:
        raise error(caught_exception.msg, caught_exception.pattern,
          caught_exception.pos)

    # Should we print the parsed pattern?
    if flags & DEBUG:
        parsed.dump(indent=0, reverse=reverse)

    # Optimise the parsed pattern.
    parsed = parsed.optimise(info, reverse)
    parsed = parsed.pack_characters(info)

    # Get the required string.
    req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags)

    # Build the named lists.
    # 'named_lists' keeps the values as supplied, whereas 'named_list_indexes'
    # holds, at each list's index, the (possibly case-folded) items.
    named_lists = {}
    named_list_indexes = [None] * len(info.named_lists_used)
    args_needed = set()
    for key, index in info.named_lists_used.items():
        name, case_flags = key
        values = frozenset(kwargs[name])
        if case_flags:
            items = frozenset(_fold_case(info, v) for v in values)
        else:
            items = values
        named_lists[name] = values
        named_list_indexes[index] = items
        args_needed.add((name, values))

    # Now that the named lists actually used are known, check for extras.
    complain_unused_args()

    # Check the features of the groups.
    _check_group_features(info, parsed)

    # Compile the parsed pattern. The result is a list of tuples.
    code = parsed.compile(reverse)

    # Is there a group call to the pattern as a whole?
    key = (0, reverse, fuzzy)
    ref = info.call_refs.get(key)
    if ref is not None:
        code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )]

    # Add the final 'success' opcode.
    code += [(_OP.SUCCESS, )]

    # Compile the additional copies of the groups that we need.
    for group, rev, fuz in info.additional_groups:
        code += group.compile(rev, fuz)

    # Flatten the code into a list of ints.
    code = _flatten_code(code)

    if not parsed.has_simple_start():
        # Get the first set, if possible.
        # If one is available, its code is placed in front of the main code;
        # otherwise the pattern is simply left without one.
        try:
            fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
            fs_code = _flatten_code(fs_code)
            code = fs_code + code
        except _FirstSetError:
            pass

    # The named capture groups.
    # This is the inverse of info.group_index.
    index_group = dict((v, n) for n, v in info.group_index.items())

    # Create the PatternObject.
    #
    # Local flags like IGNORECASE affect the code generation, but aren't needed
    # by the PatternObject itself. Conversely, global flags like LOCALE _don't_
    # affect the code generation but _are_ needed by the PatternObject.
    compiled_pattern = _regex.compile(pattern, info.flags | version, code,
      info.group_index, index_group, named_lists, named_list_indexes,
      req_offset, req_chars, req_flags, info.group_count)

    # Do we need to reduce the size of the cache?
    if len(_cache) >= _MAXCACHE:
        with _cache_lock:
            _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)

    if cache_it:
        # The locale is only part of the cache key when the pattern turned out
        # to use the LOCALE flag.
        if (info.flags & LOCALE) == 0:
            pattern_locale = None

        args_needed = frozenset(args_needed)

        # Store this regular expression and named list.
        pattern_key = (pattern, type(pattern), flags, args_needed,
          DEFAULT_VERSION, pattern_locale)
        _cache[pattern_key] = compiled_pattern

        # Store what keyword arguments are needed.
        _named_args[args_key] = args_needed

    return compiled_pattern
673 | 
def _compile_replacement_helper(pattern, template):
    """Compiles a replacement template.

    Returns a list in which runs of literal characters have been joined into
    strings (str for a str template, bytes otherwise) and are interleaved with
    the items that _compile_replacement produces for group references. The
    result is cached by pattern string, pattern flags and template.
    """
    # This function is called by the _regex module.

    # Have we seen this before?
    cache_key = pattern.pattern, pattern.flags, template
    cached = _replacement_cache.get(cache_key)
    if cached is not None:
        return cached

    # A full cache is simply emptied.
    if len(_replacement_cache) >= _MAXREPCACHE:
        _replacement_cache.clear()

    is_unicode = isinstance(template, str)
    source = _Source(template)

    def make_string(char_codes):
        # Build a literal of the same type as the template.
        if is_unicode:
            return "".join(chr(c) for c in char_codes)
        return bytes(char_codes)

    pieces = []
    pending = []
    while True:
        ch = source.get()
        if not ch:
            break

        if ch != "\\":
            # An ordinary character: collect its code point.
            pending.append(ord(ch))
            continue

        # '_compile_replacement' will return either an int group reference
        # or a string literal. It returns items (plural) in order to handle
        # a 2-character literal (an invalid escape sequence).
        is_group, items = _compile_replacement(source, pattern, is_unicode)
        if not is_group:
            pending.extend(items)
            continue

        # It's a group, so first flush the literal.
        if pending:
            pieces.append(make_string(pending))
            pending = []
        pieces.extend(items)

    # Flush whatever literal text remains.
    if pending:
        pieces.append(make_string(pending))

    _replacement_cache[cache_key] = pieces

    return pieces
725 | 
# We define Pattern here after all the support objects have been defined.
#
# The types are obtained by compiling an empty pattern and matching it against
# an empty string. _compile only refers to the name 'Pattern' when its argument
# is neither a str nor a bytes, so compiling '' works before Pattern is bound.
_pat = _compile('', 0, False, {}, False)
Pattern = type(_pat)
Match = type(_pat.match(''))
# The throwaway pattern was compiled with cache_it=False and isn't needed again.
del _pat

# Make Pattern public for typing annotations.
# (Match is exported in the same way.)
__all__.append("Pattern")
__all__.append("Match")

# We'll define an alias for the 'compile' function so that the repr of a
# pattern object is eval-able.
Regex = compile
739 | 
740 | # Register myself for pickling.
741 | import copyreg as _copy_reg
742 | 
def _pickle(pattern):
    """Reduction function for compiled patterns.

    Returns the pair (_regex.compile, pattern._pickled_data); presumably
    _pickled_data holds the arguments with which _regex.compile rebuilds the
    pattern when it is unpickled.
    """
    rebuild_args = pattern._pickled_data
    return _regex.compile, rebuild_args

# Tell copyreg to use the reduction function for Pattern objects.
_copy_reg.pickle(Pattern, _pickle)
747 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | from setuptools import setup, Extension
 4 | from os.path import join
 5 | 
 6 | setup(
 7 |     ext_modules=[Extension('regex._regex', [join('regex_3', '_regex.c'),
 8 |       join('regex_3', '_regex_unicode.c')])],
 9 | )
10 | 


--------------------------------------------------------------------------------