├── .github └── workflows │ └── main.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── changelog.txt ├── docs ├── Features.html └── UnicodeProperties.rst ├── pyproject.toml ├── regex_3 ├── __init__.py ├── _regex.c ├── _regex.h ├── _regex_core.py ├── _regex_unicode.c ├── _regex_unicode.h ├── regex.py └── test_regex.py ├── setup.py └── tools └── build_regex_unicode.py /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # Automatically build binary wheels and source packages. 2 | name: cibuildwheel 3 | 4 | # Build on every branch push with tag. 5 | on: 6 | push: 7 | tags: 8 | - '*' 9 | 10 | env: 11 | PYTHON_VER: '3.11' # Python to run test/cibuildwheel 12 | CIBW_BUILD: cp39-* cp310-* cp311-* cp312-* cp313-* 13 | CIBW_TEST_COMMAND: python -m unittest regex.test_regex 14 | 15 | jobs: 16 | # Run test on Ubuntu/macOS/Windows for every commit. 17 | run_test: 18 | name: Run test on ${{ matrix.platform }} 19 | runs-on: ${{ matrix.platform }} 20 | 21 | strategy: 22 | matrix: 23 | platform: [ubuntu-latest, macos-latest, windows-latest] 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | - uses: actions/setup-python@v4 28 | with: 29 | python-version: ${{ env.PYTHON_VER }} 30 | 31 | - name: Run test 32 | run: | 33 | python -m pip install -vv . 34 | python -m unittest -v regex.test_regex 35 | 36 | # Build Linux/macOS/Windows wheels. 
37 | build_wheels: 38 | name: Build ${{ matrix.platform }} wheels 39 | if: github.event_name == 'push' 40 | runs-on: ${{ matrix.platform }} 41 | 42 | strategy: 43 | matrix: 44 | platform: [ubuntu-latest, macos-latest, windows-latest] 45 | 46 | env: 47 | # macOS archs 48 | CIBW_ARCHS_MACOS: "x86_64 arm64 universal2" 49 | # Windows archs 50 | CIBW_ARCHS_WINDOWS: "AMD64 x86 ARM64" 51 | 52 | steps: 53 | - uses: actions/checkout@v3 54 | - uses: actions/setup-python@v4 55 | with: 56 | python-version: ${{ env.PYTHON_VER }} 57 | 58 | - name: Install cibuildwheel & build wheels 59 | run: | 60 | python -m pip install -U cibuildwheel 61 | python -m cibuildwheel --output-dir wheelhouse 62 | 63 | - name: Upload wheels 64 | uses: actions/upload-artifact@v4 65 | with: 66 | name: regex-files-wheels-${{ matrix.platform }} 67 | path: wheelhouse/*.whl 68 | 69 | # I cannot get this to work! 70 | # - name: Create GitHub release 71 | # uses: actions/create-release@v1 72 | # env: 73 | # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 74 | # with: 75 | # tag_name: ${{ github.ref }} 76 | # release_name: regex ${{ github.ref }} 77 | 78 | # Build source distribution & manylinux1_x86_64 wheels 79 | # These two jobs build: 80 | # 1, build_wheels (above): manylinux1_i686 / manylinux2014_x86_64 81 | # 2, build_in_manylinux2010 (this): manylinux1_x86_64 82 | # manylinux2014_x86_64 wheels use a new memcpy() function 83 | # (memcpy@GLIBC_2.14), so the wheels are not compatible with 84 | # manylinux1_x86_64 environment. In order to be compatible as 85 | # much as possible, this job builds manylinux1_x86_64 wheels. 86 | build_in_manylinux2010: 87 | name: Build in manylinux2010 environment 88 | if: github.event_name == 'push' 89 | runs-on: ubuntu-latest 90 | 91 | env: 92 | # Generate manylinux1_x86_64 wheels. 
93 | # tag pip CPython with the pip glibc 94 | # manylinux1 >=8.1.0 3.5.2+, 3.6.0+ 2.5 (2006-09-29) 95 | # manylinux2010 >=19.0 3.7.3+, 3.8.0+ 2.12 (2010-05-03) 96 | # manylinux2014 >=19.3 3.7.8+, 3.8.4+, 3.9.0+ 2.17 (2012-12-25) 97 | # manylinux_x_y >=20.3 3.8.10+, 3.9.5+, 3.10.0+ x.y 98 | # manylinux2010 images EOL on 2022-08-01, it doesn't support cp311. 99 | CIBW_BUILD: cp39-* cp310-* 100 | CIBW_MANYLINUX_X86_64_IMAGE: manylinux2010 101 | CIBW_ARCHS_LINUX: x86_64 102 | 103 | steps: 104 | - uses: actions/checkout@v3 105 | - uses: actions/setup-python@v4 106 | with: 107 | python-version: ${{ env.PYTHON_VER }} 108 | 109 | - name: Build source distribution & wheels 110 | run: | 111 | python setup.py sdist --formats=gztar 112 | python -m pip install -U cibuildwheel 113 | python -m cibuildwheel --output-dir wheelhouse 114 | 115 | - name: Upload source distribution 116 | uses: actions/upload-artifact@v4 117 | with: 118 | name: regex-files-dist 119 | path: dist/*.tar.gz 120 | 121 | - name: Upload manylinux1_x86_64 wheels 122 | uses: actions/upload-artifact@v4 123 | with: 124 | name: regex-files-manylinux2010 125 | path: wheelhouse/*.whl 126 | 127 | # Build and upload aarch64/ppc64le/s390x wheels. 128 | build_arch_wheels: 129 | name: Build ${{ matrix.arch }} Linux wheels 130 | if: github.event_name == 'push' 131 | runs-on: ubuntu-latest 132 | 133 | strategy: 134 | matrix: 135 | arch: [aarch64, ppc64le, s390x] 136 | # Building in QEMU is very slow, so parallelize the tasks. 
137 | skip_image: ["*musllinux*", "*manylinux*"] 138 | 139 | env: 140 | CIBW_ARCHS: ${{ matrix.arch }} 141 | CIBW_SKIP: ${{ matrix.skip_image }} 142 | 143 | steps: 144 | - uses: actions/checkout@v3 145 | - uses: actions/setup-python@v4 146 | with: 147 | python-version: ${{ env.PYTHON_VER }} 148 | 149 | - name: Set up QEMU 150 | uses: docker/setup-qemu-action@v2 151 | 152 | - name: Install cibuildwheel & build wheels 153 | run: | 154 | python -m pip install -U cibuildwheel 155 | python -m cibuildwheel --output-dir wheelhouse 156 | 157 | - name: Upload ${{ matrix.arch }} wheels 158 | uses: actions/upload-artifact@v4 159 | with: 160 | name: regex-files-arch-${{ matrix.arch }} 161 | path: wheelhouse/*.whl 162 | 163 | # Upload to PyPI 164 | upload_pypi: 165 | name: Publish to PyPI 166 | needs: [build_wheels, build_in_manylinux2010, build_arch_wheels] 167 | runs-on: ubuntu-latest 168 | 169 | steps: 170 | - uses: actions/download-artifact@v4 171 | with: 172 | pattern: regex-files-* 173 | path: dist 174 | merge-multiple: true 175 | 176 | - name: Upload to PyPI 177 | uses: pypa/gh-action-pypi-publish@release/v1 178 | with: 179 | user: __token__ 180 | password: ${{ secrets.PYPI_TOKEN }} 181 | skip_existing: true 182 | verbose: true 183 | print_hash: true 184 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | This work was derived from the 're' module of CPython 2.6 and CPython 3.1, 2 | copyright (c) 1998-2001 by Secret Labs AB and licensed under CNRI's Python 1.6 3 | license. 4 | 5 | All additions and alterations are licensed under the Apache 2.0 License. 6 | 7 | 8 | Apache License 9 | Version 2.0, January 2004 10 | http://www.apache.org/licenses/ 11 | 12 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 13 | 14 | 1. Definitions. 
15 | 16 | "License" shall mean the terms and conditions for use, reproduction, 17 | and distribution as defined by Sections 1 through 9 of this document. 18 | 19 | "Licensor" shall mean the copyright owner or entity authorized by 20 | the copyright owner that is granting the License. 21 | 22 | "Legal Entity" shall mean the union of the acting entity and all 23 | other entities that control, are controlled by, or are under common 24 | control with that entity. For the purposes of this definition, 25 | "control" means (i) the power, direct or indirect, to cause the 26 | direction or management of such entity, whether by contract or 27 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 28 | outstanding shares, or (iii) beneficial ownership of such entity. 29 | 30 | "You" (or "Your") shall mean an individual or Legal Entity 31 | exercising permissions granted by this License. 32 | 33 | "Source" form shall mean the preferred form for making modifications, 34 | including but not limited to software source code, documentation 35 | source, and configuration files. 36 | 37 | "Object" form shall mean any form resulting from mechanical 38 | transformation or translation of a Source form, including but 39 | not limited to compiled object code, generated documentation, 40 | and conversions to other media types. 41 | 42 | "Work" shall mean the work of authorship, whether in Source or 43 | Object form, made available under the License, as indicated by a 44 | copyright notice that is included in or attached to the work 45 | (an example is provided in the Appendix below). 46 | 47 | "Derivative Works" shall mean any work, whether in Source or Object 48 | form, that is based on (or derived from) the Work and for which the 49 | editorial revisions, annotations, elaborations, or other modifications 50 | represent, as a whole, an original work of authorship. 
For the purposes 51 | of this License, Derivative Works shall not include works that remain 52 | separable from, or merely link (or bind by name) to the interfaces of, 53 | the Work and Derivative Works thereof. 54 | 55 | "Contribution" shall mean any work of authorship, including 56 | the original version of the Work and any modifications or additions 57 | to that Work or Derivative Works thereof, that is intentionally 58 | submitted to Licensor for inclusion in the Work by the copyright owner 59 | or by an individual or Legal Entity authorized to submit on behalf of 60 | the copyright owner. For the purposes of this definition, "submitted" 61 | means any form of electronic, verbal, or written communication sent 62 | to the Licensor or its representatives, including but not limited to 63 | communication on electronic mailing lists, source code control systems, 64 | and issue tracking systems that are managed by, or on behalf of, the 65 | Licensor for the purpose of discussing and improving the Work, but 66 | excluding communication that is conspicuously marked or otherwise 67 | designated in writing by the copyright owner as "Not a Contribution." 68 | 69 | "Contributor" shall mean Licensor and any individual or Legal Entity 70 | on behalf of whom a Contribution has been received by Licensor and 71 | subsequently incorporated within the Work. 72 | 73 | 2. Grant of Copyright License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | copyright license to reproduce, prepare Derivative Works of, 77 | publicly display, publicly perform, sublicense, and distribute the 78 | Work and such Derivative Works in Source or Object form. 79 | 80 | 3. Grant of Patent License. 
Subject to the terms and conditions of 81 | this License, each Contributor hereby grants to You a perpetual, 82 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 83 | (except as stated in this section) patent license to make, have made, 84 | use, offer to sell, sell, import, and otherwise transfer the Work, 85 | where such license applies only to those patent claims licensable 86 | by such Contributor that are necessarily infringed by their 87 | Contribution(s) alone or by combination of their Contribution(s) 88 | with the Work to which such Contribution(s) was submitted. If You 89 | institute patent litigation against any entity (including a 90 | cross-claim or counterclaim in a lawsuit) alleging that the Work 91 | or a Contribution incorporated within the Work constitutes direct 92 | or contributory patent infringement, then any patent licenses 93 | granted to You under this License for that Work shall terminate 94 | as of the date such litigation is filed. 95 | 96 | 4. Redistribution. 
You may reproduce and distribute copies of the 97 | Work or Derivative Works thereof in any medium, with or without 98 | modifications, and in Source or Object form, provided that You 99 | meet the following conditions: 100 | 101 | (a) You must give any other recipients of the Work or 102 | Derivative Works a copy of this License; and 103 | 104 | (b) You must cause any modified files to carry prominent notices 105 | stating that You changed the files; and 106 | 107 | (c) You must retain, in the Source form of any Derivative Works 108 | that You distribute, all copyright, patent, trademark, and 109 | attribution notices from the Source form of the Work, 110 | excluding those notices that do not pertain to any part of 111 | the Derivative Works; and 112 | 113 | (d) If the Work includes a "NOTICE" text file as part of its 114 | distribution, then any Derivative Works that You distribute must 115 | include a readable copy of the attribution notices contained 116 | within such NOTICE file, excluding those notices that do not 117 | pertain to any part of the Derivative Works, in at least one 118 | of the following places: within a NOTICE text file distributed 119 | as part of the Derivative Works; within the Source form or 120 | documentation, if provided along with the Derivative Works; or, 121 | within a display generated by the Derivative Works, if and 122 | wherever such third-party notices normally appear. The contents 123 | of the NOTICE file are for informational purposes only and 124 | do not modify the License. You may add Your own attribution 125 | notices within Derivative Works that You distribute, alongside 126 | or as an addendum to the NOTICE text from the Work, provided 127 | that such additional attribution notices cannot be construed 128 | as modifying the License. 
129 | 130 | You may add Your own copyright statement to Your modifications and 131 | may provide additional or different license terms and conditions 132 | for use, reproduction, or distribution of Your modifications, or 133 | for any such Derivative Works as a whole, provided Your use, 134 | reproduction, and distribution of the Work otherwise complies with 135 | the conditions stated in this License. 136 | 137 | 5. Submission of Contributions. Unless You explicitly state otherwise, 138 | any Contribution intentionally submitted for inclusion in the Work 139 | by You to the Licensor shall be under the terms and conditions of 140 | this License, without any additional terms or conditions. 141 | Notwithstanding the above, nothing herein shall supersede or modify 142 | the terms of any separate license agreement you may have executed 143 | with Licensor regarding such Contributions. 144 | 145 | 6. Trademarks. This License does not grant permission to use the trade 146 | names, trademarks, service marks, or product names of the Licensor, 147 | except as required for reasonable and customary use in describing the 148 | origin of the Work and reproducing the content of the NOTICE file. 149 | 150 | 7. Disclaimer of Warranty. Unless required by applicable law or 151 | agreed to in writing, Licensor provides the Work (and each 152 | Contributor provides its Contributions) on an "AS IS" BASIS, 153 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 154 | implied, including, without limitation, any warranties or conditions 155 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 156 | PARTICULAR PURPOSE. You are solely responsible for determining the 157 | appropriateness of using or redistributing the Work and assume any 158 | risks associated with Your exercise of permissions under this License. 159 | 160 | 8. Limitation of Liability. 
In no event and under no legal theory, 161 | whether in tort (including negligence), contract, or otherwise, 162 | unless required by applicable law (such as deliberate and grossly 163 | negligent acts) or agreed to in writing, shall any Contributor be 164 | liable to You for damages, including any direct, indirect, special, 165 | incidental, or consequential damages of any character arising as a 166 | result of this License or out of the use or inability to use the 167 | Work (including but not limited to damages for loss of goodwill, 168 | work stoppage, computer failure or malfunction, or any and all 169 | other commercial damages or losses), even if such Contributor 170 | has been advised of the possibility of such damages. 171 | 172 | 9. Accepting Warranty or Additional Liability. While redistributing 173 | the Work or Derivative Works thereof, You may choose to offer, 174 | and charge a fee for, acceptance of support, warranty, indemnity, 175 | or other liability obligations and/or rights consistent with this 176 | License. However, in accepting such obligations, You may act only 177 | on Your own behalf and on Your sole responsibility, not on behalf 178 | of any other Contributor, and only if You agree to indemnify, 179 | defend, and hold each Contributor harmless for any liability 180 | incurred by, or claims asserted against, such Contributor by reason 181 | of your accepting any such warranty or additional liability. 182 | 183 | END OF TERMS AND CONDITIONS 184 | 185 | APPENDIX: How to apply the Apache License to your work. 186 | 187 | To apply the Apache License to your work, attach the following 188 | boilerplate notice, with the fields enclosed by brackets "[]" 189 | replaced with your own identifying information. (Don't include 190 | the brackets!) The text should be enclosed in the appropriate 191 | comment syntax for the file format. 
We also recommend that a 192 | file or class name and description of purpose be included on the 193 | same "printed page" as the copyright notice for easier 194 | identification within third-party archives. 195 | 196 | Copyright 2020 Matthew Barnett 197 | 198 | Licensed under the Apache License, Version 2.0 (the "License"); 199 | you may not use this file except in compliance with the License. 200 | You may obtain a copy of the License at 201 | 202 | http://www.apache.org/licenses/LICENSE-2.0 203 | 204 | Unless required by applicable law or agreed to in writing, software 205 | distributed under the License is distributed on an "AS IS" BASIS, 206 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 207 | See the License for the specific language governing permissions and 208 | limitations under the License. 209 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include regex_3/*.c 2 | include regex_3/*.h 3 | include regex_3/*.py 4 | include docs/*.* 5 | include tools/*.py 6 | include LICENSE.txt 7 | include pyproject.toml 8 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ------------ 3 | 4 | This regex implementation is backwards-compatible with the standard 're' module, but offers additional functionality. 5 | 6 | Python 2 7 | -------- 8 | 9 | Python 2 is no longer supported. The last release that supported Python 2 was 2021.11.10. 10 | 11 | PyPy 12 | ---- 13 | 14 | This module is targeted at CPython. It expects that all codepoints are the same width, so it won't behave properly with PyPy outside U+0000..U+007F because PyPy stores strings as UTF-8. 
15 | 16 | Multithreading 17 | -------------- 18 | 19 | The regex module releases the GIL during matching on instances of the built-in (immutable) string classes, enabling other Python threads to run concurrently. It is also possible to force the regex module to release the GIL during matching by calling the matching methods with the keyword argument ``concurrent=True``. The behaviour is undefined if the string changes during matching, so use it *only* when it is guaranteed that that won't happen. 20 | 21 | Unicode 22 | ------- 23 | 24 | This module supports Unicode 16.0.0. Full Unicode case-folding is supported. 25 | 26 | Flags 27 | ----- 28 | 29 | There are 2 kinds of flag: scoped and global. Scoped flags can apply to only part of a pattern and can be turned on or off; global flags apply to the entire pattern and can only be turned on. 30 | 31 | The scoped flags are: ``ASCII (?a)``, ``FULLCASE (?f)``, ``IGNORECASE (?i)``, ``LOCALE (?L)``, ``MULTILINE (?m)``, ``DOTALL (?s)``, ``UNICODE (?u)``, ``VERBOSE (?x)``, ``WORD (?w)``. 32 | 33 | The global flags are: ``BESTMATCH (?b)``, ``ENHANCEMATCH (?e)``, ``POSIX (?p)``, ``REVERSE (?r)``, ``VERSION0 (?V0)``, ``VERSION1 (?V1)``. 34 | 35 | If neither the ``ASCII``, ``LOCALE`` nor ``UNICODE`` flag is specified, it will default to ``UNICODE`` if the regex pattern is a Unicode string and ``ASCII`` if it's a bytestring. 36 | 37 | The ``ENHANCEMATCH`` flag makes fuzzy matching attempt to improve the fit of the next match that it finds. 38 | 39 | The ``BESTMATCH`` flag makes fuzzy matching search for the best match instead of the next match. 40 | 41 | Old vs new behaviour 42 | -------------------- 43 | 44 | In order to be compatible with the re module, this module has 2 behaviours: 45 | 46 | * **Version 0** behaviour (old behaviour, compatible with the re module): 47 | 48 | Please note that the re module's behaviour may change over time, and I'll endeavour to match that behaviour in version 0. 
49 | 50 | * Indicated by the ``VERSION0`` flag. 51 | 52 | * Zero-width matches are not handled correctly in the re module before Python 3.7. The behaviour in those earlier versions is: 53 | 54 | * ``.split`` won't split a string at a zero-width match. 55 | 56 | * ``.sub`` will advance by one character after a zero-width match. 57 | 58 | * Inline flags apply to the entire pattern, and they can't be turned off. 59 | 60 | * Only simple sets are supported. 61 | 62 | * Case-insensitive matches in Unicode use simple case-folding by default. 63 | 64 | * **Version 1** behaviour (new behaviour, possibly different from the re module): 65 | 66 | * Indicated by the ``VERSION1`` flag. 67 | 68 | * Zero-width matches are handled correctly. 69 | 70 | * Inline flags apply to the end of the group or pattern, and they can be turned off. 71 | 72 | * Nested sets and set operations are supported. 73 | 74 | * Case-insensitive matches in Unicode use full case-folding by default. 75 | 76 | If no version is specified, the regex module will default to ``regex.DEFAULT_VERSION``. 77 | 78 | Case-insensitive matches in Unicode 79 | ----------------------------------- 80 | 81 | The regex module supports both simple and full case-folding for case-insensitive matches in Unicode. Use of full case-folding can be turned on using the ``FULLCASE`` flag. Please note that this flag affects how the ``IGNORECASE`` flag works; the ``FULLCASE`` flag itself does not turn on case-insensitive matching. 82 | 83 | Version 0 behaviour: the flag is off by default. 84 | 85 | Version 1 behaviour: the flag is on by default. 86 | 87 | Nested sets and set operations 88 | ------------------------------ 89 | 90 | It's not possible to support both simple sets, as used in the re module, and nested sets at the same time because of a difference in the meaning of an unescaped ``"["`` in a set. 
91 | 92 | For example, the pattern ``[[a-z]--[aeiou]]`` is treated in the version 0 behaviour (simple sets, compatible with the re module) as: 93 | 94 | * Set containing "[" and the letters "a" to "z" 95 | 96 | * Literal "--" 97 | 98 | * Set containing letters "a", "e", "i", "o", "u" 99 | 100 | * Literal "]" 101 | 102 | but in the version 1 behaviour (nested sets, enhanced behaviour) as: 103 | 104 | * Set which is: 105 | 106 | * Set containing the letters "a" to "z" 107 | 108 | * but excluding: 109 | 110 | * Set containing the letters "a", "e", "i", "o", "u" 111 | 112 | Version 0 behaviour: only simple sets are supported. 113 | 114 | Version 1 behaviour: nested sets and set operations are supported. 115 | 116 | Notes on named groups 117 | --------------------- 118 | 119 | All groups have a group number, starting from 1. 120 | 121 | Groups with the same group name will have the same group number, and groups with a different group name will have a different group number. 122 | 123 | The same name can be used by more than one group, with later captures 'overwriting' earlier captures. All the captures of the group will be available from the ``captures`` method of the match object. 124 | 125 | Group numbers will be reused across different branches of a branch reset, eg. ``(?|(first)|(second))`` has only group 1. If groups have different group names then they will, of course, have different group numbers, eg. ``(?|(?P<foo>first)|(?P<bar>second))`` has group 1 ("foo") and group 2 ("bar"). 126 | 127 | In the regex ``(\s+)(?|(?P<foo>[A-Z]+)|(\w+) (?P<foo>[0-9]+)`` there are 2 groups: 128 | 129 | * ``(\s+)`` is group 1. 130 | 131 | * ``(?P<foo>[A-Z]+)`` is group 2, also called "foo". 132 | 133 | * ``(\w+)`` is group 2 because of the branch reset. 134 | 135 | * ``(?P<foo>[0-9]+)`` is group 2 because it's called "foo". 136 | 137 | If you want to prevent ``(\w+)`` from being group 2, you need to name it (different name, different group number). 
138 | 139 | Additional features 140 | ------------------- 141 | 142 | The issue numbers relate to the Python bug tracker, except where listed otherwise. 143 | 144 | Added ``\p{Horiz_Space}`` and ``\p{Vert_Space}`` (`GitHub issue 477 `_) 145 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 146 | 147 | ``\p{Horiz_Space}`` or ``\p{H}`` matches horizontal whitespace and ``\p{Vert_Space}`` or ``\p{V}`` matches vertical whitespace. 148 | 149 | Added support for lookaround in conditional pattern (`Hg issue 163 `_) 150 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 151 | 152 | The test of a conditional pattern can be a lookaround. 153 | 154 | .. sourcecode:: python 155 | 156 | >>> regex.match(r'(?(?=\d)\d+|\w+)', '123abc') 157 | <regex.Match object; span=(0, 3), match='123'> 158 | >>> regex.match(r'(?(?=\d)\d+|\w+)', 'abc123') 159 | <regex.Match object; span=(0, 6), match='abc123'> 160 | 161 | This is not quite the same as putting a lookaround in the first branch of a pair of alternatives. 162 | 163 | .. sourcecode:: python 164 | 165 | >>> print(regex.match(r'(?:(?=\d)\d+\b|\w+)', '123abc')) 166 | <regex.Match object; span=(0, 6), match='123abc'> 167 | >>> print(regex.match(r'(?(?=\d)\d+\b|\w+)', '123abc')) 168 | None 169 | 170 | In the first example, the lookaround matched, but the remainder of the first branch failed to match, and so the second branch was attempted, whereas in the second example, the lookaround matched, and the first branch failed to match, but the second branch was **not** attempted. 171 | 172 | Added POSIX matching (leftmost longest) (`Hg issue 150 `_) 173 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 174 | 175 | The POSIX standard for regex is to return the leftmost longest match. This can be turned on using the ``POSIX`` flag. 176 | 177 | .. sourcecode:: python 178 | 179 | >>> # Normal matching. 
180 | >>> regex.search(r'Mr|Mrs', 'Mrs') 181 | <regex.Match object; span=(0, 2), match='Mr'> 182 | >>> regex.search(r'one(self)?(selfsufficient)?', 'oneselfsufficient') 183 | <regex.Match object; span=(0, 7), match='oneself'> 184 | >>> # POSIX matching. 185 | >>> regex.search(r'(?p)Mr|Mrs', 'Mrs') 186 | <regex.Match object; span=(0, 3), match='Mrs'> 187 | >>> regex.search(r'(?p)one(self)?(selfsufficient)?', 'oneselfsufficient') 188 | <regex.Match object; span=(0, 17), match='oneselfsufficient'> 189 | 190 | Note that it will take longer to find matches because when it finds a match at a certain position, it won't return that immediately, but will keep looking to see if there's another longer match there. 191 | 192 | Added ``(?(DEFINE)...)`` (`Hg issue 152 `_) 193 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 194 | 195 | If there's no group called "DEFINE", then ... will be ignored except that any groups defined within it can be called and that the normal rules for numbering groups still apply. 196 | 197 | .. sourcecode:: python 198 | 199 | >>> regex.search(r'(?(DEFINE)(?P<quant>\d+)(?P<item>\w+))(?&quant) (?&item)', '5 elephants') 200 | <regex.Match object; span=(0, 11), match='5 elephants'> 201 | 202 | Added ``(*PRUNE)``, ``(*SKIP)`` and ``(*FAIL)`` (`Hg issue 153 `_) 203 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 204 | 205 | ``(*PRUNE)`` discards the backtracking info up to that point. When used in an atomic group or a lookaround, it won't affect the enclosing pattern. 206 | 207 | ``(*SKIP)`` is similar to ``(*PRUNE)``, except that it also sets where in the text the next attempt to match will start. When used in an atomic group or a lookaround, it won't affect the enclosing pattern. 208 | 209 | ``(*FAIL)`` causes immediate backtracking. ``(*F)`` is a permitted abbreviation. 210 | 211 | Added ``\K`` (`Hg issue 151 `_) 212 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 213 | 214 | Keeps the part of the entire match after the position where ``\K`` occurred; the part before it is discarded. 215 | 216 | It does not affect what groups return. 217 | 218 | .. 
sourcecode:: python 219 | 220 | >>> m = regex.search(r'(\w\w\K\w\w\w)', 'abcdef') 221 | >>> m[0] 222 | 'cde' 223 | >>> m[1] 224 | 'abcde' 225 | >>> 226 | >>> m = regex.search(r'(?r)(\w\w\K\w\w\w)', 'abcdef') 227 | >>> m[0] 228 | 'bc' 229 | >>> m[1] 230 | 'bcdef' 231 | 232 | Added capture subscripting for ``expandf`` and ``subf``/``subfn`` (`Hg issue 133 `_) 233 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 234 | 235 | You can use subscripting to get the captures of a repeated group. 236 | 237 | .. sourcecode:: python 238 | 239 | >>> m = regex.match(r"(\w)+", "abc") 240 | >>> m.expandf("{1}") 241 | 'c' 242 | >>> m.expandf("{1[0]} {1[1]} {1[2]}") 243 | 'a b c' 244 | >>> m.expandf("{1[-1]} {1[-2]} {1[-3]}") 245 | 'c b a' 246 | >>> 247 | >>> m = regex.match(r"(?P<letter>\w)+", "abc") 248 | >>> m.expandf("{letter}") 249 | 'c' 250 | >>> m.expandf("{letter[0]} {letter[1]} {letter[2]}") 251 | 'a b c' 252 | >>> m.expandf("{letter[-1]} {letter[-2]} {letter[-3]}") 253 | 'c b a' 254 | 255 | Added support for referring to a group by number using ``(?P=...)`` 256 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 257 | 258 | This is in addition to the existing ``\g<...>``. 259 | 260 | Fixed the handling of locale-sensitive regexes 261 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 262 | 263 | The ``LOCALE`` flag is intended for legacy code and has limited support. You're still recommended to use Unicode instead. 264 | 265 | Added partial matches (`Hg issue 102 `_) 266 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 267 | 268 | A partial match is one that matches up to the end of string, but that string has been truncated and you want to know whether a complete match could be possible if the string had not been truncated. 
269 | 270 | Partial matches are supported by ``match``, ``search``, ``fullmatch`` and ``finditer`` with the ``partial`` keyword argument. 271 | 272 | Match objects have a ``partial`` attribute, which is ``True`` if it's a partial match. 273 | 274 | For example, if you wanted a user to enter a 4-digit number and check it character by character as it was being entered: 275 | 276 | .. sourcecode:: python 277 | 278 | >>> pattern = regex.compile(r'\d{4}') 279 | 280 | >>> # Initially, nothing has been entered: 281 | >>> print(pattern.fullmatch('', partial=True)) 282 | 283 | 284 | >>> # An empty string is OK, but it's only a partial match. 285 | >>> # The user enters a letter: 286 | >>> print(pattern.fullmatch('a', partial=True)) 287 | None 288 | >>> # It'll never match. 289 | 290 | >>> # The user deletes that and enters a digit: 291 | >>> print(pattern.fullmatch('1', partial=True)) 292 | 293 | >>> # It matches this far, but it's only a partial match. 294 | 295 | >>> # The user enters 2 more digits: 296 | >>> print(pattern.fullmatch('123', partial=True)) 297 | 298 | >>> # It matches this far, but it's only a partial match. 299 | 300 | >>> # The user enters another digit: 301 | >>> print(pattern.fullmatch('1234', partial=True)) 302 | 303 | >>> # It's a complete match. 304 | 305 | >>> # If the user enters another digit: 306 | >>> print(pattern.fullmatch('12345', partial=True)) 307 | None 308 | >>> # It's no longer a match. 309 | 310 | >>> # This is a partial match: 311 | >>> pattern.match('123', partial=True).partial 312 | True 313 | 314 | >>> # This is a complete match: 315 | >>> pattern.match('1233', partial=True).partial 316 | False 317 | 318 | ``*`` operator not working correctly with sub() (`Hg issue 106 `_) 319 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 320 | 321 | Sometimes it's not clear how zero-width matches should be handled. 
For example, should ``.*`` match 0 characters directly after matching >0 characters? 322 | 323 | .. sourcecode:: python 324 | 325 | >>> regex.sub('.*', 'x', 'test') 326 | 'xx' 327 | >>> regex.sub('.*?', '|', 'test') 328 | '|||||||||' 329 | 330 | Added ``capturesdict`` (`Hg issue 86 `_) 331 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 332 | 333 | ``capturesdict`` is a combination of ``groupdict`` and ``captures``: 334 | 335 | ``groupdict`` returns a dict of the named groups and the last capture of those groups. 336 | 337 | ``captures`` returns a list of all the captures of a group 338 | 339 | ``capturesdict`` returns a dict of the named groups and lists of all the captures of those groups. 340 | 341 | .. sourcecode:: python 342 | 343 | >>> m = regex.match(r"(?:(?P\w+) (?P\d+)\n)+", "one 1\ntwo 2\nthree 3\n") 344 | >>> m.groupdict() 345 | {'word': 'three', 'digits': '3'} 346 | >>> m.captures("word") 347 | ['one', 'two', 'three'] 348 | >>> m.captures("digits") 349 | ['1', '2', '3'] 350 | >>> m.capturesdict() 351 | {'word': ['one', 'two', 'three'], 'digits': ['1', '2', '3']} 352 | 353 | Added ``allcaptures`` and ``allspans`` (`Git issue 474 `_) 354 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 355 | 356 | ``allcaptures`` returns a list of all the captures of all the groups. 357 | 358 | ``allspans`` returns a list of all the spans of the all captures of all the groups. 359 | 360 | .. 
sourcecode:: python 361 | 362 | >>> m = regex.match(r"(?:(?P\w+) (?P\d+)\n)+", "one 1\ntwo 2\nthree 3\n") 363 | >>> m.allcaptures() 364 | (['one 1\ntwo 2\nthree 3\n'], ['one', 'two', 'three'], ['1', '2', '3']) 365 | >>> m.allspans() 366 | ([(0, 20)], [(0, 3), (6, 9), (12, 17)], [(4, 5), (10, 11), (18, 19)]) 367 | 368 | Allow duplicate names of groups (`Hg issue 87 `_) 369 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 370 | 371 | Group names can be duplicated. 372 | 373 | .. sourcecode:: python 374 | 375 | >>> # With optional groups: 376 | >>> 377 | >>> # Both groups capture, the second capture 'overwriting' the first. 378 | >>> m = regex.match(r"(?P\w+)? or (?P\w+)?", "first or second") 379 | >>> m.group("item") 380 | 'second' 381 | >>> m.captures("item") 382 | ['first', 'second'] 383 | >>> # Only the second group captures. 384 | >>> m = regex.match(r"(?P\w+)? or (?P\w+)?", " or second") 385 | >>> m.group("item") 386 | 'second' 387 | >>> m.captures("item") 388 | ['second'] 389 | >>> # Only the first group captures. 390 | >>> m = regex.match(r"(?P\w+)? or (?P\w+)?", "first or ") 391 | >>> m.group("item") 392 | 'first' 393 | >>> m.captures("item") 394 | ['first'] 395 | >>> 396 | >>> # With mandatory groups: 397 | >>> 398 | >>> # Both groups capture, the second capture 'overwriting' the first. 399 | >>> m = regex.match(r"(?P\w*) or (?P\w*)?", "first or second") 400 | >>> m.group("item") 401 | 'second' 402 | >>> m.captures("item") 403 | ['first', 'second'] 404 | >>> # Again, both groups capture, the second capture 'overwriting' the first. 405 | >>> m = regex.match(r"(?P\w*) or (?P\w*)", " or second") 406 | >>> m.group("item") 407 | 'second' 408 | >>> m.captures("item") 409 | ['', 'second'] 410 | >>> # And yet again, both groups capture, the second capture 'overwriting' the first. 
411 | >>> m = regex.match(r"(?P\w*) or (?P\w*)", "first or ") 412 | >>> m.group("item") 413 | '' 414 | >>> m.captures("item") 415 | ['first', ''] 416 | 417 | Added ``fullmatch`` (`issue #16203 `_) 418 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 419 | 420 | ``fullmatch`` behaves like ``match``, except that it must match all of the string. 421 | 422 | .. sourcecode:: python 423 | 424 | >>> print(regex.fullmatch(r"abc", "abc").span()) 425 | (0, 3) 426 | >>> print(regex.fullmatch(r"abc", "abcx")) 427 | None 428 | >>> print(regex.fullmatch(r"abc", "abcx", endpos=3).span()) 429 | (0, 3) 430 | >>> print(regex.fullmatch(r"abc", "xabcy", pos=1, endpos=4).span()) 431 | (1, 4) 432 | >>> 433 | >>> regex.match(r"a.*?", "abcd").group(0) 434 | 'a' 435 | >>> regex.fullmatch(r"a.*?", "abcd").group(0) 436 | 'abcd' 437 | 438 | Added ``subf`` and ``subfn`` 439 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 440 | 441 | ``subf`` and ``subfn`` are alternatives to ``sub`` and ``subn`` respectively. When passed a replacement string, they treat it as a format string. 442 | 443 | .. sourcecode:: python 444 | 445 | >>> regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar") 446 | 'foo bar => bar foo' 447 | >>> regex.subf(r"(?P\w+) (?P\w+)", "{word2} {word1}", "foo bar") 448 | 'bar foo' 449 | 450 | Added ``expandf`` to match object 451 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 452 | 453 | ``expandf`` is an alternative to ``expand``. When passed a replacement string, it treats it as a format string. 454 | 455 | .. sourcecode:: python 456 | 457 | >>> m = regex.match(r"(\w+) (\w+)", "foo bar") 458 | >>> m.expandf("{0} => {2} {1}") 459 | 'foo bar => bar foo' 460 | >>> 461 | >>> m = regex.match(r"(?P\w+) (?P\w+)", "foo bar") 462 | >>> m.expandf("{word2} {word1}") 463 | 'bar foo' 464 | 465 | Detach searched string 466 | ^^^^^^^^^^^^^^^^^^^^^^ 467 | 468 | A match object contains a reference to the string that was searched, via its ``string`` attribute. 
The ``detach_string`` method will 'detach' that string, making it available for garbage collection, which might save valuable memory if that string is very large. 469 | 470 | .. sourcecode:: python 471 | 472 | >>> m = regex.search(r"\w+", "Hello world") 473 | >>> print(m.group()) 474 | Hello 475 | >>> print(m.string) 476 | Hello world 477 | >>> m.detach_string() 478 | >>> print(m.group()) 479 | Hello 480 | >>> print(m.string) 481 | None 482 | 483 | Recursive patterns (`Hg issue 27 `_) 484 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 485 | 486 | Recursive and repeated patterns are supported. 487 | 488 | ``(?R)`` or ``(?0)`` tries to match the entire regex recursively. ``(?1)``, ``(?2)``, etc, try to match the relevant group. 489 | 490 | ``(?&name)`` tries to match the named group. 491 | 492 | .. sourcecode:: python 493 | 494 | >>> regex.match(r"(Tarzan|Jane) loves (?1)", "Tarzan loves Jane").groups() 495 | ('Tarzan',) 496 | >>> regex.match(r"(Tarzan|Jane) loves (?1)", "Jane loves Tarzan").groups() 497 | ('Jane',) 498 | 499 | >>> m = regex.search(r"(\w)(?:(?R)|(\w?))\1", "kayak") 500 | >>> m.group(0, 1, 2) 501 | ('kayak', 'k', None) 502 | 503 | The first two examples show how the subpattern within the group is reused, but is _not_ itself a group. In other words, ``"(Tarzan|Jane) loves (?1)"`` is equivalent to ``"(Tarzan|Jane) loves (?:Tarzan|Jane)"``. 504 | 505 | It's possible to backtrack into a recursed or repeated group. 506 | 507 | You can't call a group if there is more than one group with that group name or group number (``"ambiguous group reference"``). 508 | 509 | The alternative forms ``(?P>name)`` and ``(?P&name)`` are also supported. 510 | 511 | Full Unicode case-folding is supported 512 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 513 | 514 | In version 1 behaviour, the regex module uses full case-folding when performing case-insensitive matches in Unicode. 515 | 516 | .. 
sourcecode:: python 517 | 518 | >>> regex.match(r"(?iV1)strasse", "stra\N{LATIN SMALL LETTER SHARP S}e").span() 519 | (0, 6) 520 | >>> regex.match(r"(?iV1)stra\N{LATIN SMALL LETTER SHARP S}e", "STRASSE").span() 521 | (0, 7) 522 | 523 | In version 0 behaviour, it uses simple case-folding for backward compatibility with the re module. 524 | 525 | Approximate "fuzzy" matching (`Hg issue 12 `_, `Hg issue 41 `_, `Hg issue 109 `_) 526 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 527 | 528 | Regex usually attempts an exact match, but sometimes an approximate, or "fuzzy", match is needed, for those cases where the text being searched may contain errors in the form of inserted, deleted or substituted characters. 529 | 530 | A fuzzy regex specifies which types of errors are permitted, and, optionally, either the minimum and maximum or only the maximum permitted number of each type. (You cannot specify only a minimum.) 531 | 532 | The 3 types of error are: 533 | 534 | * Insertion, indicated by "i" 535 | 536 | * Deletion, indicated by "d" 537 | 538 | * Substitution, indicated by "s" 539 | 540 | In addition, "e" indicates any type of error. 541 | 542 | The fuzziness of a regex item is specified between "{" and "}" after the item. 543 | 544 | Examples: 545 | 546 | * ``foo`` match "foo" exactly 547 | 548 | * ``(?:foo){i}`` match "foo", permitting insertions 549 | 550 | * ``(?:foo){d}`` match "foo", permitting deletions 551 | 552 | * ``(?:foo){s}`` match "foo", permitting substitutions 553 | 554 | * ``(?:foo){i,s}`` match "foo", permitting insertions and substitutions 555 | 556 | * ``(?:foo){e}`` match "foo", permitting errors 557 | 558 | If a certain type of error is specified, then any type not specified will **not** be permitted. 
559 | 560 | In the following examples I'll omit the item and write only the fuzziness: 561 | 562 | * ``{d<=3}`` permit at most 3 deletions, but no other types 563 | 564 | * ``{i<=1,s<=2}`` permit at most 1 insertion and at most 2 substitutions, but no deletions 565 | 566 | * ``{1<=e<=3}`` permit at least 1 and at most 3 errors 567 | 568 | * ``{i<=2,d<=2,e<=3}`` permit at most 2 insertions, at most 2 deletions, at most 3 errors in total, but no substitutions 569 | 570 | It's also possible to state the costs of each type of error and the maximum permitted total cost. 571 | 572 | Examples: 573 | 574 | * ``{2i+2d+1s<=4}`` each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4 575 | 576 | * ``{i<=1,d<=1,s<=1,2i+2d+1s<=4}`` at most 1 insertion, at most 1 deletion, at most 1 substitution; each insertion costs 2, each deletion costs 2, each substitution costs 1, the total cost must not exceed 4 577 | 578 | You can also use "<" instead of "<=" if you want an exclusive minimum or maximum. 579 | 580 | You can add a test to perform on a character that's substituted or inserted. 581 | 582 | Examples: 583 | 584 | * ``{s<=2:[a-z]}`` at most 2 substitutions, which must be in the character set ``[a-z]``. 585 | 586 | * ``{s<=2,i<=3:\d}`` at most 2 substitutions, at most 3 insertions, which must be digits. 587 | 588 | By default, fuzzy matching searches for the first match that meets the given constraints. The ``ENHANCEMATCH`` flag will cause it to attempt to improve the fit (i.e. reduce the number of errors) of the match that it has found. 589 | 590 | The ``BESTMATCH`` flag will make it search for the best match instead. 591 | 592 | Further examples to note: 593 | 594 | * ``regex.search("(dog){e}", "cat and dog")[1]`` returns ``"cat"`` because that matches ``"dog"`` with 3 errors (an unlimited number of errors is permitted). 
595 | 596 | * ``regex.search("(dog){e<=1}", "cat and dog")[1]`` returns ``" dog"`` (with a leading space) because that matches ``"dog"`` with 1 error, which is within the limit. 597 | 598 | * ``regex.search("(?e)(dog){e<=1}", "cat and dog")[1]`` returns ``"dog"`` (without a leading space) because the fuzzy search matches ``" dog"`` with 1 error, which is within the limit, and the ``(?e)`` then makes it attempt a better fit. 599 | 600 | In the first two examples there are perfect matches later in the string, but in neither case is it the first possible match. 601 | 602 | The match object has an attribute ``fuzzy_counts`` which gives the total number of substitutions, insertions and deletions. 603 | 604 | .. sourcecode:: python 605 | 606 | >>> # A 'raw' fuzzy match: 607 | >>> regex.fullmatch(r"(?:cats|cat){e<=1}", "cat").fuzzy_counts 608 | (0, 0, 1) 609 | >>> # 0 substitutions, 0 insertions, 1 deletion. 610 | 611 | >>> # A better match might be possible if the ENHANCEMATCH flag is used: 612 | >>> regex.fullmatch(r"(?e)(?:cats|cat){e<=1}", "cat").fuzzy_counts 613 | (0, 0, 0) 614 | >>> # 0 substitutions, 0 insertions, 0 deletions. 615 | 616 | The match object also has an attribute ``fuzzy_changes`` which gives a tuple of the positions of the substitutions, insertions and deletions. 617 | 618 | .. sourcecode:: python 619 | 620 | >>> m = regex.search('(fuu){i<=2,d<=2,e<=5}', 'anaconda foo bar') 621 | >>> m 622 | <regex.Match object; span=(7, 10), match='a f', fuzzy_counts=(0, 2, 2)> 623 | >>> m.fuzzy_changes 624 | ([], [7, 8], [10, 11]) 625 | 626 | What this means is that if the matched part of the string had been: 627 | 628 | .. sourcecode:: python 629 | 630 | 'anacondfuuoo bar' 631 | 632 | it would've been an exact match. 633 | 634 | However, there were insertions at positions 7 and 8: 635 | 636 | .. sourcecode:: python 637 | 638 | 'anaconda fuuoo bar' 639 | ^^ 640 | 641 | and deletions at positions 10 and 11: 642 | 643 | .. sourcecode:: python 644 | 645 | 'anaconda f~~oo bar' 646 | ^^ 647 | 648 | So the actual string was: 649 | 650 | .. 
sourcecode:: python 651 | 652 | 'anaconda foo bar' 653 | 654 | Named lists ``\L`` (`Hg issue 11 `_) 655 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 656 | 657 | There are occasions where you may want to include a list (actually, a set) of options in a regex. 658 | 659 | One way is to build the pattern like this: 660 | 661 | .. sourcecode:: python 662 | 663 | >>> p = regex.compile(r"first|second|third|fourth|fifth") 664 | 665 | but if the list is large, parsing the resulting regex can take considerable time, and care must also be taken that the strings are properly escaped and properly ordered, for example, "cats" before "cat". 666 | 667 | The new alternative is to use a named list: 668 | 669 | .. sourcecode:: python 670 | 671 | >>> option_set = ["first", "second", "third", "fourth", "fifth"] 672 | >>> p = regex.compile(r"\L", options=option_set) 673 | 674 | The order of the items is irrelevant, they are treated as a set. The named lists are available as the ``.named_lists`` attribute of the pattern object : 675 | 676 | .. sourcecode:: python 677 | 678 | >>> print(p.named_lists) 679 | {'options': frozenset({'third', 'first', 'fifth', 'fourth', 'second'})} 680 | 681 | If there are any unused keyword arguments, ``ValueError`` will be raised unless you tell it otherwise: 682 | 683 | .. 
sourcecode:: python 684 | 685 | >>> option_set = ["first", "second", "third", "fourth", "fifth"] 686 | >>> p = regex.compile(r"\L", options=option_set, other_options=[]) 687 | Traceback (most recent call last): 688 | File "", line 1, in 689 | File "C:\Python310\lib\site-packages\regex\regex.py", line 353, in compile 690 | return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern) 691 | File "C:\Python310\lib\site-packages\regex\regex.py", line 500, in _compile 692 | complain_unused_args() 693 | File "C:\Python310\lib\site-packages\regex\regex.py", line 483, in complain_unused_args 694 | raise ValueError('unused keyword argument {!a}'.format(any_one)) 695 | ValueError: unused keyword argument 'other_options' 696 | >>> p = regex.compile(r"\L", options=option_set, other_options=[], ignore_unused=True) 697 | >>> p = regex.compile(r"\L", options=option_set, other_options=[], ignore_unused=False) 698 | Traceback (most recent call last): 699 | File "", line 1, in 700 | File "C:\Python310\lib\site-packages\regex\regex.py", line 353, in compile 701 | return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern) 702 | File "C:\Python310\lib\site-packages\regex\regex.py", line 500, in _compile 703 | complain_unused_args() 704 | File "C:\Python310\lib\site-packages\regex\regex.py", line 483, in complain_unused_args 705 | raise ValueError('unused keyword argument {!a}'.format(any_one)) 706 | ValueError: unused keyword argument 'other_options' 707 | >>> 708 | 709 | Start and end of word 710 | ^^^^^^^^^^^^^^^^^^^^^ 711 | 712 | ``\m`` matches at the start of a word. 713 | 714 | ``\M`` matches at the end of a word. 715 | 716 | Compare with ``\b``, which matches at the start or end of a word. 
717 | 718 | Unicode line separators 719 | ^^^^^^^^^^^^^^^^^^^^^^^ 720 | 721 | Normally the only line separator is ``\n`` (``\x0A``), but if the ``WORD`` flag is turned on then the line separators are ``\x0D\x0A``, ``\x0A``, ``\x0B``, ``\x0C`` and ``\x0D``, plus ``\x85``, ``\u2028`` and ``\u2029`` when working with Unicode. 722 | 723 | This affects the regex dot ``"."``, which, with the ``DOTALL`` flag turned off, matches any character except a line separator. It also affects the line anchors ``^`` and ``$`` (in multiline mode). 724 | 725 | Set operators 726 | ^^^^^^^^^^^^^ 727 | 728 | **Version 1 behaviour only** 729 | 730 | Set operators have been added, and a set ``[...]`` can include nested sets. 731 | 732 | The operators, in order of increasing precedence, are: 733 | 734 | * ``||`` for union ("x||y" means "x or y") 735 | 736 | * ``~~`` (double tilde) for symmetric difference ("x~~y" means "x or y, but not both") 737 | 738 | * ``&&`` for intersection ("x&&y" means "x and y") 739 | 740 | * ``--`` (double dash) for difference ("x--y" means "x but not y") 741 | 742 | Implicit union, ie, simple juxtaposition like in ``[ab]``, has the highest precedence. Thus, ``[ab&&cd]`` is the same as ``[[a||b]&&[c||d]]``. 743 | 744 | Examples: 745 | 746 | * ``[ab]`` # Set containing 'a' and 'b' 747 | 748 | * ``[a-z]`` # Set containing 'a' .. 'z' 749 | 750 | * ``[[a-z]--[qw]]`` # Set containing 'a' .. 'z', but not 'q' or 'w' 751 | 752 | * ``[a-z--qw]`` # Same as above 753 | 754 | * ``[\p{L}--QW]`` # Set containing all letters except 'Q' and 'W' 755 | 756 | * ``[\p{N}--[0-9]]`` # Set containing all numbers except '0' .. '9' 757 | 758 | * ``[\p{ASCII}&&\p{Letter}]`` # Set containing all characters which are ASCII and letter 759 | 760 | regex.escape (`issue #2650 `_) 761 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 762 | 763 | regex.escape has an additional keyword parameter ``special_only``. When True, only 'special' regex characters, such as '?', are escaped. 
764 | 765 | .. sourcecode:: python 766 | 767 | >>> regex.escape("foo!?", special_only=False) 768 | 'foo\\!\\?' 769 | >>> regex.escape("foo!?", special_only=True) 770 | 'foo!\\?' 771 | 772 | regex.escape (`Hg issue 249 `_) 773 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 774 | 775 | regex.escape has an additional keyword parameter ``literal_spaces``. When True, spaces are not escaped. 776 | 777 | .. sourcecode:: python 778 | 779 | >>> regex.escape("foo bar!?", literal_spaces=False) 780 | 'foo\\ bar!\\?' 781 | >>> regex.escape("foo bar!?", literal_spaces=True) 782 | 'foo bar!\\?' 783 | 784 | Repeated captures (`issue #7132 `_) 785 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 786 | 787 | A match object has additional methods which return information on all the successful matches of a repeated group. These methods are: 788 | 789 | * ``matchobject.captures([group1, ...])`` 790 | 791 | * Returns a list of the strings matched in a group or groups. Compare with ``matchobject.group([group1, ...])``. 792 | 793 | * ``matchobject.starts([group])`` 794 | 795 | * Returns a list of the start positions. Compare with ``matchobject.start([group])``. 796 | 797 | * ``matchobject.ends([group])`` 798 | 799 | * Returns a list of the end positions. Compare with ``matchobject.end([group])``. 800 | 801 | * ``matchobject.spans([group])`` 802 | 803 | * Returns a list of the spans. Compare with ``matchobject.span([group])``. 804 | 805 | .. 
sourcecode:: python 806 | 807 | >>> m = regex.search(r"(\w{3})+", "123456789") 808 | >>> m.group(1) 809 | '789' 810 | >>> m.captures(1) 811 | ['123', '456', '789'] 812 | >>> m.start(1) 813 | 6 814 | >>> m.starts(1) 815 | [0, 3, 6] 816 | >>> m.end(1) 817 | 9 818 | >>> m.ends(1) 819 | [3, 6, 9] 820 | >>> m.span(1) 821 | (6, 9) 822 | >>> m.spans(1) 823 | [(0, 3), (3, 6), (6, 9)] 824 | 825 | Atomic grouping ``(?>...)`` (`issue #433030 `_) 826 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 827 | 828 | If the following pattern subsequently fails, then the subpattern as a whole will fail. 829 | 830 | Possessive quantifiers 831 | ^^^^^^^^^^^^^^^^^^^^^^ 832 | 833 | ``(?:...)?+`` ; ``(?:...)*+`` ; ``(?:...)++`` ; ``(?:...){min,max}+`` 834 | 835 | The subpattern is matched up to 'max' times. If the following pattern subsequently fails, then all the repeated subpatterns will fail as a whole. For example, ``(?:...)++`` is equivalent to ``(?>(?:...)+)``. 836 | 837 | Scoped flags (`issue #433028 `_) 838 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 839 | 840 | ``(?flags-flags:...)`` 841 | 842 | The flags will apply only to the subpattern. Flags can be turned on or off. 843 | 844 | Definition of 'word' character (`issue #1693050 `_) 845 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 846 | 847 | The definition of a 'word' character has been expanded for Unicode. It conforms to the Unicode specification at ``http://www.unicode.org/reports/tr29/``. 848 | 849 | Variable-length lookbehind 850 | ^^^^^^^^^^^^^^^^^^^^^^^^^^ 851 | 852 | A lookbehind can match a variable-length string. 853 | 854 | Flags argument for regex.split, regex.sub and regex.subn (`issue #3482 `_) 855 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 856 | 857 | ``regex.split``, ``regex.sub`` and ``regex.subn`` support a 'flags' argument. 
858 | 859 | Pos and endpos arguments for regex.sub and regex.subn 860 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 861 | 862 | ``regex.sub`` and ``regex.subn`` support 'pos' and 'endpos' arguments. 863 | 864 | 'Overlapped' argument for regex.findall and regex.finditer 865 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 866 | 867 | ``regex.findall`` and ``regex.finditer`` support an 'overlapped' flag which permits overlapped matches. 868 | 869 | Splititer 870 | ^^^^^^^^^ 871 | 872 | ``regex.splititer`` has been added. It's a generator equivalent of ``regex.split``. 873 | 874 | Subscripting match objects for groups 875 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 876 | 877 | A match object accepts access to the groups via subscripting and slicing: 878 | 879 | .. sourcecode:: python 880 | 881 | >>> m = regex.search(r"(?P.*?)(?P\d+)(?P.*)", "pqr123stu") 882 | >>> print(m["before"]) 883 | pqr 884 | >>> print(len(m)) 885 | 4 886 | >>> print(m[:]) 887 | ('pqr123stu', 'pqr', '123', 'stu') 888 | 889 | Named groups 890 | ^^^^^^^^^^^^ 891 | 892 | Groups can be named with ``(?...)`` as well as the existing ``(?P...)``. 893 | 894 | Group references 895 | ^^^^^^^^^^^^^^^^ 896 | 897 | Groups can be referenced within a pattern with ``\g``. This also allows there to be more than 99 groups. 898 | 899 | Named characters ``\N{name}`` 900 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 901 | 902 | Named characters are supported. Note that only those known by Python's Unicode database will be recognised. 903 | 904 | Unicode codepoint properties, including scripts and blocks 905 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 906 | 907 | ``\p{property=value}``; ``\P{property=value}``; ``\p{value}`` ; ``\P{value}`` 908 | 909 | Many Unicode properties are supported, including blocks and scripts. ``\p{property=value}`` or ``\p{property:value}`` matches a character whose property ``property`` has value ``value``. 
The inverse of ``\p{property=value}`` is ``\P{property=value}`` or ``\p{^property=value}``. 910 | 911 | If the short form ``\p{value}`` is used, the properties are checked in the order: ``General_Category``, ``Script``, ``Block``, binary property: 912 | 913 | * ``Latin``, the 'Latin' script (``Script=Latin``). 914 | 915 | * ``BasicLatin``, the 'BasicLatin' block (``Block=BasicLatin``). 916 | 917 | * ``Alphabetic``, the 'Alphabetic' binary property (``Alphabetic=Yes``). 918 | 919 | A short form starting with ``Is`` indicates a script or binary property: 920 | 921 | * ``IsLatin``, the 'Latin' script (``Script=Latin``). 922 | 923 | * ``IsAlphabetic``, the 'Alphabetic' binary property (``Alphabetic=Yes``). 924 | 925 | A short form starting with ``In`` indicates a block property: 926 | 927 | * ``InBasicLatin``, the 'BasicLatin' block (``Block=BasicLatin``). 928 | 929 | POSIX character classes 930 | ^^^^^^^^^^^^^^^^^^^^^^^ 931 | 932 | ``[[:alpha:]]``; ``[[:^alpha:]]`` 933 | 934 | POSIX character classes are supported. These are normally treated as an alternative form of ``\p{...}``. 935 | 936 | The exceptions are ``alnum``, ``digit``, ``punct`` and ``xdigit``, whose definitions are different from those of Unicode. 937 | 938 | ``[[:alnum:]]`` is equivalent to ``\p{posix_alnum}``. 939 | 940 | ``[[:digit:]]`` is equivalent to ``\p{posix_digit}``. 941 | 942 | ``[[:punct:]]`` is equivalent to ``\p{posix_punct}``. 943 | 944 | ``[[:xdigit:]]`` is equivalent to ``\p{posix_xdigit}``. 945 | 946 | Search anchor ``\G`` 947 | ^^^^^^^^^^^^^^^^^^^^ 948 | 949 | A search anchor has been added. It matches at the position where each search started/continued and can be used for contiguous matches or in negative variable-length lookbehinds to limit how far back the lookbehind goes: 950 | 951 | .. 
sourcecode:: python 952 | 953 | >>> regex.findall(r"\w{2}", "abcd ef") 954 | ['ab', 'cd', 'ef'] 955 | >>> regex.findall(r"\G\w{2}", "abcd ef") 956 | ['ab', 'cd'] 957 | 958 | * The search starts at position 0 and matches 'ab'. 959 | 960 | * The search continues at position 2 and matches 'cd'. 961 | 962 | * The search continues at position 4 and fails to match any letters. 963 | 964 | * The anchor stops the search start position from being advanced, so there are no more results. 965 | 966 | Reverse searching 967 | ^^^^^^^^^^^^^^^^^ 968 | 969 | Searches can also work backwards: 970 | 971 | .. sourcecode:: python 972 | 973 | >>> regex.findall(r".", "abc") 974 | ['a', 'b', 'c'] 975 | >>> regex.findall(r"(?r).", "abc") 976 | ['c', 'b', 'a'] 977 | 978 | Note that the result of a reverse search is not necessarily the reverse of a forward search: 979 | 980 | .. sourcecode:: python 981 | 982 | >>> regex.findall(r"..", "abcde") 983 | ['ab', 'cd'] 984 | >>> regex.findall(r"(?r)..", "abcde") 985 | ['de', 'bc'] 986 | 987 | Matching a single grapheme ``\X`` 988 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 989 | 990 | The grapheme matcher is supported. It conforms to the Unicode specification at ``http://www.unicode.org/reports/tr29/``. 991 | 992 | Branch reset ``(?|...|...)`` 993 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 994 | 995 | Group numbers will be reused across the alternatives, but groups with different names will have different group numbers. 996 | 997 | .. sourcecode:: python 998 | 999 | >>> regex.match(r"(?|(first)|(second))", "first").groups() 1000 | ('first',) 1001 | >>> regex.match(r"(?|(first)|(second))", "second").groups() 1002 | ('second',) 1003 | 1004 | Note that there is only one group. 1005 | 1006 | Default Unicode word boundary 1007 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 1008 | 1009 | The ``WORD`` flag changes the definition of a 'word boundary' to that of a default Unicode word boundary. This applies to ``\b`` and ``\B``. 
1010 | 1011 | Timeout 1012 | ^^^^^^^ 1013 | 1014 | The matching methods and functions support timeouts. The timeout (in seconds) applies to the entire operation: 1015 | 1016 | .. sourcecode:: python 1017 | 1018 | >>> from time import sleep 1019 | >>> 1020 | >>> def fast_replace(m): 1021 | ... return 'X' 1022 | ... 1023 | >>> def slow_replace(m): 1024 | ... sleep(0.5) 1025 | ... return 'X' 1026 | ... 1027 | >>> regex.sub(r'[a-z]', fast_replace, 'abcde', timeout=2) 1028 | 'XXXXX' 1029 | >>> regex.sub(r'[a-z]', slow_replace, 'abcde', timeout=2) 1030 | Traceback (most recent call last): 1031 | File "", line 1, in 1032 | File "C:\Python310\lib\site-packages\regex\regex.py", line 278, in sub 1033 | return pat.sub(repl, string, count, pos, endpos, concurrent, timeout) 1034 | TimeoutError: regex timed out 1035 | -------------------------------------------------------------------------------- /changelog.txt: -------------------------------------------------------------------------------- 1 | Version: 2025.5.18 2 | 3 | Updated main.yml to build Windows ARM64/aarch64 wheel. 4 | 5 | Updated licence text format in pyproject.toml. 6 | 7 | Version: 2025.2.13 8 | 9 | Dropping support for Python 3.8 and removing it from main.yml. 10 | 11 | Version: 2025.2.12 12 | 13 | Further fixes to main.yml. 14 | 15 | Version: 2025.2.11 16 | 17 | Updated main.yml to Artifacts v4. 18 | 19 | Version: 2025.2.10 20 | 21 | Git issue 551: Infinite loop on V1 search 22 | 23 | It's catastrophic backtracking due to the possibilities of full casefolding. 24 | 25 | Replacing `[\s\S]` with `(?s:.)` can help, but a proper fix would be more difficult. 26 | 27 | Version: 2024.11.7 28 | 29 | Updated pyproject.toml and setup.py according to PEP 517. 
30 | 31 | Version: 2024.11.6 32 | 33 | Git issue 546: Partial match not working in some instances with non-greedy capture 34 | 35 | Version: 2024.9.14 36 | 37 | Reverted to actions/download-artifact@v3 and actions/upload-artifact@v3 in main.yml because GitHub Actions failed when using them. 38 | 39 | Version: 2024.9.13 40 | 41 | Updated to actions/upload-artifact@v4 in main.yml. 42 | 43 | Version: 2024.9.12 44 | 45 | Updated to actions/download-artifact@v4 in main.yml. 46 | 47 | Version: 2024.9.11 48 | 49 | Updated to Unicode 16.0.0. 50 | 51 | Version: 2024.7.24 52 | 53 | Git issue 539: Bug: Partial matching fails on a simple example 54 | 55 | Version: 2024.6.22 56 | 57 | Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation 58 | 59 | Version: 2024.5.15 60 | 61 | Git issue 530: hangs with fuzzy and optionals 62 | 63 | It's not hanging, it'll finish eventually. It's just an example of catastrophic backtracking. 64 | 65 | The error printed when Ctrl+C is pressed does show a bug, though, which is now fixed. 66 | 67 | Version: 2024.5.10 68 | 69 | Updated for Python 3.13. 70 | 71 | <ctype.h> now needs to be included explicitly because Python.h no longer includes it. 72 | 73 | Version: 2024.4.28 74 | 75 | Git issue 527: `VERBOSE`/`X` flag breaks `\N` escapes 76 | 77 | Version: 2024.4.16 78 | 79 | Git issue 525: segfault when fuzzy matching empty list 80 | 81 | Version: 2023.12.25 82 | 83 | Cannot get release notification action in main.yml to work. Commenting it out for now. 84 | 85 | Version: 2023.12.24 86 | 87 | Fixed invalid main.yml. 88 | 89 | Version: 2023.12.23 90 | 91 | The escape function no longer escapes \x00. It's not necessary. 92 | 93 | Inline flags can now be turned off and apply to what follows. 94 | 95 | Added \R to match line endings. 96 | 97 | Version: 2023.10.3 98 | 99 | Updated to Unicode 15.1.0. 
100 | 101 | Version: 2023.8.8 102 | 103 | Git issue 508: Regex doesn't build using CPython main (3.13.0a0) 104 | Removed usage of _PyBytes_Join and did a little tidying of the code that makes the result string. 105 | 106 | Version: 2023.6.3 107 | 108 | Git issue 498: Conditional negative lookahead inside positive lookahead fails to match 109 | Conditional node needed an additional member that points to the true branch. 110 | 111 | Version: 2023.5.5 112 | 113 | Removed semicolon after 'else' in 'munge_name'. 114 | 115 | Version: 2023.5.4 116 | 117 | Fixed pyproject.toml and setup.py. 118 | 119 | Version: 2023.5.3 120 | 121 | pyproject.toml was missing. 122 | 123 | Version: 2023.5.2 124 | 125 | Added pyproject.toml. 126 | 127 | Version: 2023.3.23 128 | 129 | Git issue 495: Running time for failing fullmatch increases rapidly with input length 130 | Re-enabled modified repeat guards due to regression in speed caused by excessive backtracking. 131 | 132 | Version: 2023.3.22 133 | 134 | Git issue 494: Backtracking failure matching regex `^a?(a?)b?c\1$` against string `abca` 135 | Disabled repeat guards. They keep causing issues, and it's just simpler to rely on timeouts. 136 | 137 | Version: 2022.10.31 138 | 139 | Updated text for supported Unicode and Python versions. 140 | 141 | Version: 2022.9.13 142 | 143 | Updated to Unicode 15.0.0. 144 | 145 | Version: 2022.9.11 146 | 147 | Updated version. 148 | 149 | Version: 2022.8.17 150 | 151 | Git issue 477: \v for vertical spacing 152 | 153 | Added \p{HorizSpace} (\p{H}) and \p{VertSpace} (\p{V}). 154 | 155 | Version: 2022.7.25 156 | 157 | Git issue 475: 2022.7.24 improperly released 158 | 159 | The file https://pypi.org/pypi/regex/2022.7.24/json was missing references to most of the wheels, so this is a new release in the hope that it was just a glitch in GitHub Actions. 
160 | 161 | Version: 2022.7.24 162 | 163 | Git issue 474: regex has no equivalent to re.Match.groups() for captures 164 | 165 | Added 'allcaptures' and 'allspans' methods to match objects. 166 | 167 | Fixed bug where compiling a pattern didn't always check for unused arguments. 168 | 169 | Version: 2022.7.9 170 | 171 | Git issue 473: Emoji classified as letter 172 | 173 | The values for GC:Assigned and GC:LC were flipped. 174 | 175 | Version: 2022.6.2 176 | 177 | Git issue 472: Revisit compilation flag to prevent adding a single explicitly compiled regex to the cache 178 | 179 | Added 'cache_pattern' parameter to 'compile' function to improve use of the cache. 180 | 181 | Version: 2022.4.24 182 | 183 | Git issue 467: Scoped inline flags 'a', 'u' and 'L' affect global flags 184 | 185 | Those flags can now be scoped. 186 | 187 | Version: 2022.3.15 188 | 189 | Git issue 457: Difference with `re`, when repl returns None 190 | 191 | Make regex consistent with re by treating a replacement template of None as ''. 192 | 193 | Also, now rejects invalid ASCII escapes like re module does. 194 | 195 | Version: 2022.3.2 196 | 197 | Git issue 453: Document last supported python2 version 198 | 199 | Added a brief reference to the last version to support Python 2 in README.rst. 200 | 201 | Git issue 456: RegexFlag exists in re, but not regex 202 | 203 | Updated the flags to use enum now that regex supports only Python 3.6+. 204 | 205 | Version: 2022.1.21 206 | 207 | Added 'python_requires' to setup.py now that Python 2 is no longer supported. 208 | 209 | Version: 2022.1.18 210 | 211 | Updated version for new release. 212 | 213 | Version: 2022.1.18 214 | 215 | * Dropped support for Python 2 and removed all references to Python <3.6, the earliest supported version. 216 | 217 | Removed Features.rst, which was just a duplicate of README.rst. 218 | 219 | Version: 2021.11.9 220 | 221 | Git issue 443: 2021.11.9 source release is missing C headers 222 | 223 | Updated version.
224 | 225 | Version: 2021.11.9 226 | 227 | Git issue 442: Fuzzy regex matching doesn't seem to test insertions correctly 228 | 229 | Version: 2021.11.2 230 | 231 | Git issue 435: Unmatched groups: sub vs subf 232 | 233 | A similar fix also applies to expandf: unmatched groups should expand to an empty string. 234 | 235 | Version: 2021.11.2 236 | 237 | Removed unused functions. 238 | 239 | Added long description type to setup.py. 240 | 241 | Version: 2021.11.1 242 | 243 | Further changes for migration to Github. 244 | 245 | Version: 2021.10.23 246 | 247 | Git issue 433: Disagreement between fuzzy_counts and fuzzy_changes 248 | 249 | Fuzzy changes were sometimes not removed when backtracking. 250 | 251 | Version: 2021.10.21 252 | 253 | Removed Apple Silicon build from .travis.yml because it's not currently codesigned by Travis CI. 254 | 255 | Version: 2021.10.8 256 | 257 | Git issue 428: match hangs on the following example - possible infinite loop? 258 | 259 | Fixed miscalculation of total error count when there's more than one fuzzy term. 260 | 261 | Version: 2021.9.30 262 | 263 | Git issue 427: Possible bug with BESTMATCH 264 | 265 | Version: 2021.9.24 266 | 267 | Updated to Unicode 14.0.0. 268 | 269 | Version: 2021.8.27 270 | 271 | Git issue 421: 2021.8.27 results in "Fatal Python error: Segmentation fault" 272 | 273 | Fixed problems with use of fast searching tables in opposite direction. 274 | 275 | Version: 2021.8.27 276 | 277 | Git issue 420: segmentation fault in finditer (maybe others) 278 | 279 | Fixed a bug in fast searches in reverse direction. 280 | 281 | Version: 2021.8.21 282 | 283 | Updated version. 284 | 285 | Version: 2021.8.3 286 | 287 | Forgot to update version! 288 | 289 | Version: 2021.7.6 290 | 291 | Additional fix for Git issue 415.
292 | 293 | Version: 2021.7.5 294 | 295 | Git issue 415: Fuzzy character restrictions don't apply to insertions at "right edge" 296 | 297 | Version: 2021.7.1 298 | 299 | Git issue 407: API is not a drop-in replacement for python's re when it comes to typing 300 | 301 | Now exports Match object as well as Pattern object. 302 | 303 | Git issue 414: Memory optimization questions 304 | 305 | sys.getsizeof returns a more accurate size of a pattern object. It includes the size of internal data, but, as is the norm, does not include the size of public objects. 306 | 307 | Version: 2021.4.4 308 | 309 | Git issue 408: regex fails with a quantified backreference but succeeds with repeated backref 310 | Git issue 407: API is not a drop-in replacement for python's re when it comes to typing 311 | 312 | Version: 2021.3.17 313 | 314 | Git issue 403: Fuzzy matching with wrong distance (unnecessary substitutions) 315 | 316 | Reworked the fuzzy matching code. 317 | 318 | Version: 2020.11.13 319 | 320 | Git issue 394: Unexpected behaviour in fuzzy matching with limited character set with IGNORECASE flag 321 | 322 | Version: 2020.11.11 323 | 324 | Update version. 325 | 326 | Version: 2020.11.2 327 | 328 | Updated list of supported Python versions. 329 | 330 | Added .travis.yml file. 331 | 332 | Version: 2020.10.28 333 | 334 | Git issue 362: Any LICENSE work for this project? 335 | 336 | Changed licence to Apache 2.0 and added licence file. 337 | 338 | Version: 2020.10.23 339 | 340 | Git issue 387: Compilaton flag to avoid storing compiled regexp in internal cache 341 | 342 | Slight reversion/revision. You can prevent explicitly-compiled patterns from being cached by using "cache_all(False)". 343 | 344 | Version: 2020.10.22 345 | 346 | Git issue 387: Compilaton flag to avoid storing compiled regexp in internal cache 347 | 348 | No longer caches patterns that are compiled explicitly. 
349 | 350 | Version: 2020.10.15 351 | 352 | Git issue 386: GCC 10 warnings 353 | 354 | Fixed bugs in fuzzy_match_string_fld and fuzzy_match_group_fld. 355 | 356 | Added more braces around data in some Unicode tables. 357 | 358 | Version: 2020.10.11 359 | 360 | Git issue 385: Comments in expressions 361 | 362 | Didn't parse regex comments properly when in VERBOSE mode. 363 | 364 | Version: 2020.9.27 365 | 366 | Git issue 383: Memory Error - regex.findall 367 | 368 | The problem was caused by a lazy repeat looping forever, growing the backtracking stack. Greedy repeats were OK. 369 | 370 | Version: 2020.7.14 371 | 372 | Git issue 377: request: \h for horizontal space 373 | 374 | Added \h as an alias to [[:blank:]]. 375 | 376 | Version: 2020.6.7 377 | 378 | Git issue 376: Is the \L option as efficient as it can be? 379 | 380 | Improved performance of string sets. 381 | 382 | Version: 2020.6.7 383 | 384 | Git issue 376: Is the \L option as efficient as it can be? 385 | 386 | Switched StringSet to use fallback method due to inefficiencies in the engine. Needs more investigation. 387 | 388 | Version: 2020.5.14 389 | 390 | Git issue 372: Regression from 2020.4.4 -> 2020.5.7 in non-fuzzy matching pattern 391 | 392 | Changed the 'state' member that's tested in is_repeat_guarded for a fuzzy match. The previously-used member wasn't initialised in a non-fuzzy match. The new test is a better one to use anyway. 393 | 394 | Version: 2020.5.13 395 | 396 | Git issue 371: Specifying character set when fuzzy-matching allows characters not in the set 397 | 398 | fuzzy_ext_match and fuzzy_ext_match_group_fld didn't support sets! 399 | 400 | Version: 2020.5.7 401 | 402 | Git issue 370: Confusions about Fuzzy matching behavior (prob a bug?) 403 | 404 | Version: 2020.4.4 405 | 406 | Updated to Unicode 13.0.0.
407 | 408 | Version: 2020.2.20 409 | 410 | Git issue 365: Memory leak occurs in fuzzy match at some substitution use cases 411 | 412 | Version: 2020.2.18 413 | 414 | Git issue #364: Contradictory values in fuzzy_counts and fuzzy_changes 415 | 416 | Version: 2020.1.7 417 | 418 | Issue 357: New exception "ValueError: unused keyword argument" breaks use case 419 | 420 | Added ignore_unused keyword argument. 421 | 422 | Issue 359: 2020.1.7 source distribution release contains \r\n line endings 423 | 424 | Fixed line endings for source distribution. 425 | 426 | Issue 360: Invalid modeline in `_regex.c` 427 | 428 | Removed vim modeline. 429 | 430 | Version: 2020.1.7 431 | 432 | Fix to previous change. 433 | 434 | Version: 2019.12.21 435 | 436 | Hg issue 353: fuzzy changes negative indexes 437 | 438 | Fuzzy change positions were off by 1 for deletions. 439 | 440 | Version: 2019.12.18 441 | 442 | Another complaint from Linux. 443 | 444 | Version: 2019.12.17 445 | 446 | New release and upload because of problem with source distribution. 447 | 448 | Version: 2019.12.17 449 | 450 | New release and upload because of previous issues. 451 | 452 | Version: 2019.12.17 453 | 454 | Make changes to setup.py. 455 | 456 | Version: 2019.12.16 457 | 458 | Discarded changes for Linux. 459 | 460 | Version: 2019.12.16 461 | 462 | Backed out changeset: f57e64d2085b 463 | 464 | Version: 2019.12.15 465 | 466 | add bdist_wheel command to setup.py 467 | 468 | Version: 2019.12.9 469 | 470 | Hg issue 348: '\X' (extended grapheme cluster) can't pass Unicode's GraphemeBreakTest (12.1.0) 471 | 472 | Fixed a couple of bugs in unicode_at_grapheme_boundary. 473 | 474 | Version: 2019.10.31 475 | 476 | Made "Additional Features" linkable. 477 | 478 | Updated setup. 479 | 480 | Minor bug fix. 481 | 482 | Version: 2019.8.19 483 | 484 | Hg issue 338: specifying allowed characters when fuzzy-matching 485 | 486 | Added character testing to a fuzzy constraint. 
487 | 488 | Version: 2019.6.8 489 | 490 | Hg issue 333: error when installing regex on PyPy2.7 v7.1.1 on Windows 491 | 492 | PyPy isn't officially supported, but this might fix it! 493 | 494 | Version: 2019.6.5 495 | 496 | Updated for Python 3.8. 497 | 498 | Version: 2019.6.2 499 | 500 | Updated to Unicode 12.1.0. 501 | 502 | Version: 2019.5.25 503 | 504 | Hg issue 329: Wrong group matches when question mark quantifier is used within a look behind 505 | 506 | REPEAT_ONE was backtracking in the wrong direction, so it never hit the limit. 507 | 508 | Version: 2019.4.14 509 | 510 | Hg issue 327: .fullmatch() causes MemoryError 511 | 512 | For fullmatch, added check for end/start of string for RE_OP_SUCCESS in try_match. 513 | 514 | Version: 2019.4.12 515 | 516 | Missing brace in 'state_fini'. 517 | 518 | Version: 2019.4.10 519 | 520 | Hg issue 325: module docstring not accessible 521 | 522 | Additional fix for regex_3/regex.py. 523 | 524 | Hg issue 326: Version is out of sync with PyPI 525 | 526 | Version: 2019.3.12 527 | 528 | Hg issue 319: Support for a timespan parameter 529 | 530 | Added timeout parameter in Python 3. TimeoutError was added in Python 3.3, and as Python 2.7 will soon reach EOL, I'm not bothered about supporting timeouts for Python 2. 531 | 532 | Version: 2019.3.9 533 | 534 | Hg issue 320: Abnormal performance 535 | 536 | Forgot about negative lookarounds! Previous change now applies only to positive lookarounds. 537 | 538 | Version: 2019.3.8 539 | 540 | Hg issue 320: Abnormal performance 541 | 542 | Included firstset from lookaround, where appropriate. 543 | 544 | Version: 2019.2.21 545 | 546 | Hg issue 316: __version__ no longer accessible via regex.__version__ 547 | 548 | Version: 2019.2.20 549 | 550 | Hg issue 314: Import error: "No module named regex._regex_core" 551 | 552 | Was OK for wheels, but not for setup.py. 
553 | 554 | Version: 2019.2.19 555 | 556 | Hg issue 313: test_regex.py ends up in site-packages/test_regex.py 557 | 558 | Tidied files away into subfolder in site-packages. 559 | 560 | Version: 2019.2.18 561 | 562 | Fixed bug in unicode_at_grapheme_boundary. 563 | 564 | Version: 2019.2.7 565 | 566 | Moved some StateData declarations to make the code conform to the C89 standard. 567 | 568 | Version: 2019.2.6 569 | 570 | Lookarounds no longer save the repeat data. Lookarounds no longer save captures if they don't contain any groups. 571 | 572 | Version: 2019.2.5 573 | 574 | Atomic groups no longer save the repeat data; that proved to be unnecessary. 575 | 576 | Version: 2019.2.3 577 | 578 | Further improvements to the new code. 579 | 580 | Version: 2019.1.24 581 | 582 | Hg issue 308: infinite search 583 | 584 | Fixed a re-allocation bug. 585 | 586 | Version: 2019.1.23 587 | 588 | Major overhaul of code to use simpler stacks. The result is now much easier to understand and maintain! 589 | 590 | Version: 2018.11.22 591 | 592 | Hg issue 304: Unreasonable edge case that used to work 593 | 594 | Now moves the minimum number of repeats out of a repeat if it contains a repeat. This allows a repeat guard to be put back, which reduces the chance of catastrophic backtracking. 595 | 596 | Version: 2018.11.7 597 | 598 | Hg issue 301: TypeError: character mapping must return integer, None or unicode 599 | 600 | Fixed bug introduced by broken workflow. 601 | 602 | Version: 2018.11.6 603 | 604 | Hg issue 300: segmentation fault 605 | 606 | Fixed a problem with not recording all fuzzy changes. 607 | 608 | Also fixed the check for prefix/suffix in branches: when fuzzy subpatterns were compared, their constraints weren't compared. 609 | 610 | Version: 2018.11.3 611 | 612 | Hg issue 299 613 | 614 | Reworked the fix to perform a normal match and then fall back to a partial match if that was originally requested.
615 | 616 | Version: 2018.11.2 617 | 618 | Hg issue 299: Partial gives misleading results with "open ended" regexp 619 | 620 | Added checks for end of text at start and end of repeats. 621 | 622 | Version: 2018.8.29 623 | 624 | Hg issue 293: scx (Script Extensions) property currently matches incorrectly 625 | 626 | One of the tables for Script Extensions was partly incorrectly ordered. 627 | 628 | Version: 2018.8.17 629 | 630 | Hg issue 291: Include Script Extensions as a supported Unicode property 631 | 632 | Added the Unicode Script Extensions property. 633 | 634 | Version: 2018.7.11 635 | 636 | Hg issue #289: Regex and Python typing 637 | 638 | Added types for Pattern and Match. 639 | 640 | Hg issue #290: Turkish locale causes import of regex to fail 641 | 642 | str (bytestring) in Python 2 is locale-sensitive. Added a function to uppercase ASCII-range letters in a locale-insensitive way. 643 | 644 | Version: 2018.6.21 645 | 646 | Hg issue 286: Regex matches with `re` but not with `regex` module 647 | 648 | Version: 2018.6.20 649 | 650 | The reported positions of fuzzy changes were sometimes incorrect. 651 | 652 | Version: 2018.6.9 653 | 654 | Updated Unicode word and grapheme boundaries for Unicode 11.0.0, which I had overlooked... :-( 655 | 656 | Version: 2018.6.6 657 | 658 | Correction to filenames. 659 | 660 | Version: 2018.6.5 661 | 662 | Updated to Unicode 11.0.0. 663 | 664 | Version: 2018.2.21 665 | 666 | Hg issue 276: Partial Matches yield incorrect matches and bounds 667 | 668 | Fixed an off-by-one bug where a lazy repeat is followed by a character (quick check). 669 | 670 | Version: 2018.2.8 671 | 672 | Hg issue 273: Missing unicode normalization quick check properties 673 | 674 | The Unicode normalization quick check properties weren't handled correctly. 
675 | 676 | Version: 2018.2.3 677 | 678 | Hg issue 271: Comment logic different between Re and Regex 679 | Hg issue 273: Missing unicode normalization quick check properties 680 | 681 | Made comments consistent with re module. 682 | 683 | Added more Unicode properties. 684 | 685 | Version: 2018.1.10 686 | 687 | Further changes to match re module's behaviour on zero-width matching for Python 3.7. 688 | 689 | Changes to the locations of the source files. 690 | 691 | Version: 2017.12.12 692 | 693 | Hg issue 268: Update the escape() documentation 694 | 695 | Added documentation for escape. 696 | 697 | Hg issue 269: Building a bdist using setuptools throws an error. 698 | 699 | Moved source files from subfolders to main folders for Python versions. 700 | 701 | Version: 2017.12.9 702 | 703 | Further changes to match re module's behaviour on zero-width matching for Python 3.7. 704 | 705 | Version: 2017.12.5 706 | 707 | Hg issue 266: fuzzy match alignment recovery 708 | 709 | Added 'fuzzy_changes' attribute to match object to indicate positions of changes in fuzzy match. 710 | 711 | Stopped supporting Python 2.5 and Python 3.1-3.2. 712 | 713 | Made changes to zero-width matching for Python 3.7. 714 | 715 | Version: 2017.11.9 716 | 717 | Hg issue 264: Failure to import regex in pypy3-5.8.0 718 | 719 | Version: 2017.11.8 720 | 721 | Hg issue 264: Failure to import regex in pypy3-5.8.0 722 | Hg issue 265: Invalid pointer in munmap_chunk (core dump) for specific inputs 723 | 724 | Version: 2017.9.23 725 | 726 | Hg issue 253: Run into error under PyPy 5.8.0 727 | 728 | Version: 2017.8.1 729 | 730 | Hg issue #240: Unable to build the project from source on OSX with PyPy3.5-5.7.1-beta 731 | 732 | Needed to compensate for the differences between CPython and PyPy. 733 | 734 | Version: 2017.4.6 735 | 736 | Added setup.py for building from the Hg working directory. 
737 | 738 | Version: 2017.4.6 739 | 740 | Hg issue 236: Incorrect references to bugs.python.org issues 741 | 742 | Version: 2017.2.8 743 | 744 | Failed to build on AIX using xlc because FALSE and TRUE were already #define'd. Replaced enum {FALSE, TRUE} with #define's. 745 | 746 | Version: 2017.1.17 747 | 748 | Hg issue 230: Is it a bug of (?(DEFINE)...) 749 | 750 | Capture groups in (?(DEFINE)...) shouldn't be treated as 'visible' capture groups by .findall. 751 | 752 | Version: 2017.1.14 753 | 754 | Hg issue 227: Performance trap of (?V1i) flags 755 | 756 | Further improvements. It now tries to split full case-folded literals into simple and full case-folded literals where full case-folding isn't needed because simple case-folding is faster. 757 | 758 | Version: 2017.1.13 759 | 760 | Hg issue 227: Performance trap of (?V1i) flags 761 | 762 | Version: 2016.12.27 763 | 764 | Hg issue 227: Incorrect behavior for ? operator with UNICODE + IGNORECASE 765 | 766 | 'end_pos' wasn't always initialised in 'locate_required_string'. 767 | 768 | Version: 2016.11.21 769 | 770 | Hg issue 226: Error matching at start of string 771 | 772 | Fuzzy matching of zero-width items wasn't quite right. 773 | 774 | Version: 2016.11.18 775 | 776 | Hg issue 225: BESTMATCH in fuzzy match not working 777 | 778 | Version: 2016.10.22 779 | 780 | Hg issue 221: Got an exception using PyPy 781 | 782 | pypy2-v5.4.1 appears not to support the buffer protocol on bytestrings (str in Python 2), so added code to handle them analogously to how unicode strings are handled. 783 | 784 | Version: 2016.10.20 785 | 786 | Make setup use setuptools if it's available. 787 | 788 | Version: 2016.10.12 789 | 790 | Hg issue 221: Got an exception using PyPy 791 | 792 | Added check for error in fold_case which, I hope, will reveal the exception that it's not reporting. 
793 | 794 | Version: 2016.9.22 795 | 796 | Hg issue 220: Misbehavior of group capture with OR operand 797 | 798 | Repeats should not be factored out of branches because a branch should be exhausted before trying the next one. 799 | 800 | Version: 2016.9.13 801 | 802 | Also supported on Python 3.6. 803 | 804 | Version: 2016.8.27 805 | 806 | Hg issue 219: Unicode word boundries 807 | 808 | For a Unicode word boundary (UAX #29), apostrophe in rule WB5a should include both U+0027 (APOSTROPHE) and U+2019 (RIGHT SINGLE QUOTATION MARK / 809 | * curly apostrophe). 810 | 811 | Version: 2016.7.21 812 | 813 | Hg issue 217: Core dump in conditional ahead match and matching \! character 814 | 815 | Fixed bug where it incorrectly tried to restore saved groups when none saved. 816 | 817 | Version: 2016.7.14 818 | 819 | Hg Issue 216: Invalid match when using negative lookbehind and pipe 820 | 821 | The creation and position of a branch firstset wasn't always correct. 822 | 823 | Version: 2016.6.25 824 | 825 | Updated to support Unicode 9.0.0. 826 | 827 | Version: 2016.6.19 828 | 829 | Hg issue 214: tests failure when using python debug flavor 830 | 831 | Further tweaks re error handling. 832 | 833 | Version: 2016.6.14 834 | 835 | Hg issue 213: Segmentation Fault 836 | 837 | Info about atomic groups wasn't pushed properly. 838 | 839 | Version: 2016.6.5 840 | 841 | Hg issue 212: Unexpected matching difference with .*? between re and regex 842 | 843 | In 'add_repeat_guards', it wasn't propagating the status from the tail back towards the head across default nodes, so the guards weren't always correct. 844 | 845 | Version: 2016.6.2 846 | 847 | Hg issue 211: Segmentation fault with recursive matches and atomic groups 848 | 849 | It wasn't saving the call frame and then restoring it when backtracking out of atomic groups. 
850 | 851 | Version: 2016.5.23 852 | 853 | Hg issue 206: Incompatible with re if single { in the pattern 854 | 855 | Brought the regex module's handling more in line with the re module. It now more readily treats an invalid fuzzy constraint as a literal. 856 | 857 | Version: 2016.5.15 858 | 859 | Hg issue 208: Named list, (?ri) flags, Backreference 860 | 861 | Fixed more issues with the alignment of text in a buffer when using named lists. 862 | 863 | Also changed how a compiled regex is pickled to use a bytestring for the packed code list. (It was actually pickling an unpacked list of ints.) 864 | 865 | Version: 2016.5.14 866 | 867 | Hg issue 205: Named list and (?ri) flags 868 | 869 | string_set_match_ign_fwdrev wasn't taking into account that it fills the folded buffer from the end when searching in reverse. 870 | 871 | Version: 2016.5.13 872 | 873 | Hg issue 204: confusion of (?aif) flags 874 | 875 | The FULLCASE flag is now ignored if the ASCII flag is turned on. 876 | 877 | Version: 2016.4.25 878 | 879 | Hg issue 203: partial matching bug 880 | 881 | The text position wasn't always set correctly before returning the status. 882 | 883 | Version: 2016.4.16 884 | 885 | Hg issue 201: ENHANCEMATCH crashes interpreter 886 | 887 | Fixed an issue with restoring group captures. 888 | 889 | Version: 2016.4.7 890 | 891 | Hg issue 199: Segfault in re.compile 892 | 893 | Removed copies of groups that weren't called. 894 | 895 | Version: 2016.4.3 896 | 897 | Hg issue 197: ValueError in regex.compile 898 | Hg issue 198: ValueError in regex.compile 899 | 900 | It wasn't catching ValueError and then raising regex.error. 901 | 902 | Version: 2016.4.2 903 | 904 | Hg issue 196: Fuzzy matching on repeated regex not working as expected 905 | 906 | Also reduced memory usage of pickle data. 
907 | 908 | Version: 2016.3.31 909 | 910 | # Hg issue #194: .FULLCASE and Backreference 911 | 912 | Capture groups failed to match when using full case folding because of a bug in the handling of "I" (which needs to be treated specially to cope with the Turkic I). 913 | 914 | Version: 2016.3.26 915 | 916 | Hg issue #193: Alternation and .REVERSE flag. 917 | 918 | The firstset before a branch was at the wrong end for a reverse pattern. 919 | 920 | Added back some tests that were accidentally omitted. 921 | 922 | Version: 2016.3.24 923 | 924 | Hg issue 192: Named lists reverse matching doesn't work with IGNORECASE and V1 925 | 926 | string_set_match_fld_fwdrev wasn't taking into account that it fills the folded buffer from the end when searching in reverse. 927 | 928 | Version: 2016.3.2 929 | 930 | Hg issue 190: Regression? Neverending regexp when upgrading to latest version. 931 | 932 | The fix for Hg issue 187 wasn't quite right. Managed to remove recursion entirely from 'add_repeat_guards'. 933 | 934 | Version: 2016.2.25 935 | 936 | Hg issue 188: Crash during search 937 | 938 | Stopped calling 'try_match' recursively when the tail of a branch is a branch. Increasing alternatives to 40000 caused the stack to overflow. 939 | 940 | Version: 2016.2.24 941 | 942 | Hg issue 187: Crash on Anaconda Python if large number of pattern 943 | 944 | Remove the recursion in 'use_nodes'. Increasing alternatives to 50000 caused the stack to overflow. 945 | 946 | Version: 2016.2.23 947 | 948 | Hg issue 187: Crash on Anaconda Python if large number of pattern 949 | 950 | Reduced the amount of recursion in 'add_repeat_guards'. The large number of alternatives (25154) caused the stack to overflow. 951 | 952 | Version: 2016.1.10 953 | 954 | Hg issue 177: Build fails on pypy (OS X 10.11, clang) 955 | 956 | Stripped out the #if...#endif that was added for Hg issue 135, which is no longer needed by more recent versions of PyPy.
957 | 958 | Version: 2015.11.22 959 | 960 | Hg issue 180: bug of POSIX matching 961 | 962 | Fixed bug where the groups aren't always correct with POSIX matching. 963 | 964 | Version: 2015.11.14 965 | 966 | Hg issue 172: Performance of V1 mode 967 | 968 | Fixed bug where RE_FLAG_FULLCASE not turned off when RE_FLAG_IGNORECASE turned off, leading to omission of first set. 969 | 970 | Version: 2015.11.12 971 | 972 | Hg issue 171: Weird performance of V1 mode 973 | 974 | Fixed bug where RE_FLAG_FULLCASE not turned off when RE_FLAG_IGNORECASE turned off for required string, leading to required string not being used. 975 | 976 | Version: 2015.11.9 977 | 978 | Hg issue 169: Performance 979 | 980 | Further tweaks to fuzzy matching. 981 | 982 | Version: 2015.11.8 983 | 984 | Hg issue 167: Performance of Backreference 985 | 986 | No longer saves/restores groups or repeats around a lookaround if it doesn't contain any. 987 | 988 | Version: 2015.11.7 989 | 990 | Hg issue 166: Performance 991 | 992 | Improved the performance of fuzzy matching. 993 | 994 | Version: 2015.11.5 995 | 996 | Hg issue 165: Performance / hung search 997 | 998 | Made changes to fuzzy matching code, including refactoring different kinds of fuzzy matching (exact/simple/enhanced/best). 999 | 1000 | Version: 2015.10.28 1001 | 1002 | Hg issue 163: allow lookarounds in conditionals 1003 | 1004 | Added support for a lookaround in a conditional pattern, e.g. r'(?(?=\d)\d+\b|\w+)'. 1005 | 1006 | Version: 2015.10.22 1007 | 1008 | Hg issue 161: Unexpected fuzzy match results 1009 | 1010 | Fixed the bug and did some related tidying up. 1011 | 1012 | Version: 2015.10.5 1013 | 1014 | Hg issue 158: Group issue with (?(DEFINE)...) 1015 | 1016 | The groups and repeats weren't restored properly when a lookaround completed and it contained a group call.
1017 | 1018 | Version: 2015.10.1 1019 | 1020 | Hg issue 157: regression: segfault on complex lookaround 1021 | 1022 | Nested lookarounds/atomic groups didn't restore state correctly. 1023 | 1024 | Version: 2015.9.28 1025 | 1026 | Hg issue 156: regression on atomic grouping 1027 | 1028 | It didn't initialise min_width when building the atomic group. 1029 | 1030 | Version: 2015.9.23 1031 | 1032 | Hg issue 154: Segmentation fault 11 when working with an atomic group 1033 | 1034 | Version: 2015.9.15 1035 | 1036 | Hg issue 150: Have an option for POSIX-compatible longest match of alternates 1037 | 1038 | Added POSIX matching (leftmost longest). 1039 | 1040 | Version: 2015.7.20 1041 | 1042 | Hg issue #147: Fuzzy match can return match points beyond buffer end 1043 | 1044 | It wasn't checking for the edge of the text when case-folding and it was also advancing even when case-folding had failed. 1045 | 1046 | Version: 2015.7.12 1047 | 1048 | Hg issue #146: Forced-fail (?!) works improperly in conditional 1049 | 1050 | Empty negative lookarounds weren't optimised correctly. 1051 | 1052 | The capture groups weren't being cleared before retrying after failure. 1053 | 1054 | Version: 2015.6.24 1055 | 1056 | Hg issue #144: Latest version problem with matching 'R|R' 1057 | 1058 | The prefix of a set of branches was omitted. 1059 | 1060 | Version: 2015.6.21 1061 | 1062 | Hg issue #143: Partial matches have incorrect span if prefix is '.' wildcard 1063 | 1064 | Didn't set state->match_pos if search_start returned a partial match status. 1065 | 1066 | Version: 2015.6.19 1067 | 1068 | Updated to Unicode 8.0. 1069 | 1070 | Some performance tweaks. 1071 | 1072 | Version: 2015.6.15 1073 | 1074 | Removed a few lines that should've been removed in the previous fix! 1075 | 1076 | Version: 2015.6.14 1077 | 1078 | Fixed a bug where it could sometimes search for the same required string multiple times. 
1079 | 1080 | Version: 2015.6.10 1081 | 1082 | Hg issue #141: Crash on a certain partial match 1083 | 1084 | It didn't check the result of 'try_match' correctly in certain places (the status returned isn't limited to success and failure). 1085 | 1086 | Version: 2015.6.9 1087 | 1088 | Hg issue #140: Replace with REVERSE and groups has unexpected behavior 1089 | 1090 | subx needed to add the template items to the list in reverse order when searching backwards because the list will be reversed after completion. 1091 | 1092 | Version: 2015.6.4 1093 | 1094 | Hg issue #98: regex module is not thread safe because of _cache 1095 | 1096 | Now iterates over a snapshot of the cache keys in case the dict resizes. 1097 | 1098 | Version: 2015.6.2 1099 | 1100 | Hg issue #139: Regular expression with multiple wildcards where first should match empty string does not always work 1101 | 1102 | The problem was caused by a negative-character in the firstset, eg "[^a]". 1103 | 1104 | Fixed. 1105 | 1106 | Version: 2015.5.28 1107 | 1108 | Hg issue #137: Posix character class :punct: does not seem to be supported 1109 | 1110 | It _is_ supported. 1111 | 1112 | Corrected Posix-style properties for 'alnum', 'digit', 'punct' and 'xdigit' which are different from those of Unicode. Now also available as \p{posix_alnum}, etc. 1113 | 1114 | Hg issue #138: grapheme anchored search not working properly 1115 | 1116 | Fixed. 1117 | 1118 | Version: 2015.5.10 1119 | 1120 | Hg issue #136: Use a DFA when possible 1121 | 1122 | Made a slight tweak so that it now treats an optional subpattern like a repeated subpattern (e.g. "(?:xyz)?" -> "(?:xyz){0,1}") to make use of the repeat guards. 1123 | 1124 | The advantage over DFA is that it'll work even for those patterns that aren't compatible with DFA. 1125 | 1126 | Version: 2015.5.7 1127 | 1128 | Hg issue 135: PyPy Support (with patch) 1129 | 1130 | It should now build on PyPy.
1131 | 1132 | Version: 2015.3.18 1133 | 1134 | Hg issue 133: support for captures() in expandf(). 1135 | 1136 | Now supported in expandf and subf. 1137 | 1138 | Issue 23692: Undocumented feature prevents re module from finding certain matches 1139 | 1140 | This also applied to regex which failed to take into account group references by group capture tests, e.g. "(?(1)...|...)", when guarding against excessive repeats. 1141 | 1142 | Version: 2014.12.24 1143 | 1144 | Hg issue 132: index out of range on null property \p{} 1145 | 1146 | It's now reported as an unknown property. 1147 | 1148 | Version: 2014.12.15 1149 | 1150 | Hg issue 131: nested sets behaviour 1151 | 1152 | The set difference operator '--' wasn't handled correctly after an implicit set union. 1153 | 1154 | Version: 2014.11.14 1155 | 1156 | Unreported issue: no such builtin as 'ascii' in Python 2. Fixed. 1157 | 1158 | Version: 2014.11.13 1159 | 1160 | Hg issue 127: Infinite loop is found 1161 | 1162 | Not an infinite loop, but slow because of repeated backtracking on a very long chunk of text. 1163 | 1164 | This fix reduces the amount of backtracking and re-matching. 1165 | 1166 | Version: 2014.11.3 1167 | 1168 | Hg issue 125: Reference to entire match (\g<0>) in Pattern.sub() doesn't work as of 2014.09.22 release. 1169 | 1170 | Version: 2014.10.24 1171 | 1172 | Reverted licence. 1173 | 1174 | Version: 2014.10.23 1175 | 1176 | Fixed bug in determining line number in regex. 1177 | 1178 | Changed licence to Apache License 2.0 and included copy in release. 1179 | 1180 | Version: 2014.10.9 1181 | 1182 | Issue 22578: Add additional attributes to re.error 1183 | 1184 | Added the attributes .msg, .pattern, .pos, .lineno and .colno to the regex error class. 1185 | 1186 | Version: 2014.10.7 1187 | 1188 | Fixed bug in partial matching when required string occurs after repeat. 1189 | 1190 | Version: 2014.9.22 1191 | 1192 | Issue #22437: Added support for referring to a group by number using (?P=...).
This is in addition to the existing \g<...>. 1193 | 1194 | Fixed bug in handling of cache for locale-sensitive patterns. 1195 | 1196 | Version: 2014.9.18 1197 | 1198 | Adjusted line-endings in PKG-INFO. 1199 | 1200 | Version: 2014.8.15 1201 | 1202 | Hg issue 115: Infinite loop when processing backreferences 1203 | 1204 | Version: 2014.6.28 1205 | 1206 | Updated to Unicode 7.0. 1207 | 1208 | Version: 2014.5.23 1209 | 1210 | Fixed fuzzy counts that were wrong when using BESTMATCH or ENHANCEMATCH flags. 1211 | 1212 | Version: 2014.5.17 1213 | 1214 | Hg issue 112 in mrab-regex-hg: re: OK, but regex: SystemError 1215 | 1216 | Version: 2014.4.10 1217 | 1218 | Hg issue 102: Partial matches 1219 | Hg issue 109: Edit distance of fuzzy match 1220 | 1221 | Added partial matches. 1222 | Added .fuzzy_counts attribute to match objects. 1223 | 1224 | Version: 2014.2.19 1225 | 1226 | Unicode properties sometimes failed to match when the IGNORECASE flag was set. 1227 | 1228 | Version: 2014.2.16 1229 | 1230 | Hg issue 108: Fails to build from source on s390x 1231 | 1232 | Version: 2014.1.30 1233 | 1234 | Hg issue 106: * operator not working correctly with sub() 1235 | 1236 | Made to conform more to re module in version 0 behaviour. 1237 | 1238 | Version: 2014.1.20 1239 | 1240 | Hg issue 105: FAIL: test_case_folding (__main__.RegexTests) with py2.7 1241 | 1242 | data.fold_len wasn't initialised. 1243 | 1244 | Also, deleted some unused functions. 1245 | 1246 | Version: 2014.1.10 1247 | 1248 | Issue #17087: Improve the repr for regular expression match objects 1249 | 1250 | Version: 2013.12.31 1251 | 1252 | Hg issue 101: findall() broken (seems like memory corruption) 1253 | 1254 | state->req_pos wasn't initialised when the required string is an initial string. 
1255 | 1256 | Version: 2013.11.29 1257 | 1258 | Hg issue 100: strange results from regex.search 1259 | 1260 | Version: 2013.10.25 1261 | 1262 | Hg issue 98: regex module is not thread safe because of _cache 1263 | 1264 | Version: 2013.10.24 1265 | 1266 | Further fixes for Hg issue 96. 1267 | 1268 | It should now correctly use sets for named lists, except when there's fuzzy matching. 1269 | 1270 | Version: 2013.10.23 1271 | 1272 | Hg issue 96: compile '\L<...>' with 'i' flag was very slow 1273 | 1274 | Version: 2013.10.22 1275 | 1276 | Hg issue 95: 'pos' for regex.error 1277 | 1278 | Version: 2013.10.21 1279 | 1280 | Python crashes when executing regex updates pattern.findall 1281 | 1282 | Version: 2013.10.12 1283 | 1284 | Updated to Unicode 6.3. 1285 | 1286 | Version: 2013.10.4 1287 | 1288 | Issue #18468: re.group() should never return a bytearray 1289 | 1290 | Applies to Python 3.4 and later for compatibility with the re module. 1291 | 1292 | Also, some performance improvements. 1293 | 1294 | Version: 2013.8.4 1295 | 1296 | Update for Python 3.4a1 release. 1297 | 1298 | Version: 2013.6.26 1299 | 1300 | Performance improvements. 1301 | 1302 | Version: 2013.6.5 1303 | 1304 | Hg issue 92: running the following regex causes a segfault 1305 | 1306 | Version: 2013.5.21 1307 | 1308 | Hg issue 91: match.expand is extremely slow 1309 | 1310 | Also tidied up code. 1311 | 1312 | Version: 2013.3.11 1313 | 1314 | Hg issue 89: Certain regexes extremely slow compared to re module 1315 | 1316 | Disabled one of the optimisations that appears to cause performance problems. 1317 | 1318 | Version: 2013.2.22 1319 | 1320 | Fixed issue with LOCALE flag not working properly. 1321 | 1322 | Version: 2013.2.16 1323 | 1324 | Fixed a locale-specific test. Whether b'\xE0' is a word character depends on the locale.
1325 | 1326 | Version: 2013.1.26 1327 | 1328 | Another fix for Hg issue 87: Allow duplicate names of groups 1329 | It didn't correctly handle a named group within a group of the same name. 1330 | 1331 | Version: 2013.1.25 1332 | 1333 | Second attempt to fix 1334 | 1335 | Version: 2013.1.24 1336 | 1337 | Hg issue 86: Enhance API of captures() to enable retrieval of ALL groups at once, as a dictionary 1338 | Added capturesdict() method to match object. 1339 | 1340 | Hg issue 87: Allow duplicate names of groups 1341 | Now allowed. 1342 | 1343 | Hg issue 88: regex.match() hangs 1344 | Fixed. 1345 | 1346 | Version: 2013.1.20 1347 | 1348 | Hg issue 85: Non-conformance to Unicode UAX#29 re: ZWJ / ZWNJ 1349 | 1350 | Version: 2012.12.16 1351 | 1352 | Hg issue 83: slash handling in presence of a quantifier 1353 | 1354 | The bug was not limited to just slash! 1355 | 1356 | Version: 2012.11.20 1357 | 1358 | Updated to Unicode 6.2. 1359 | 1360 | Version: 2012.11.13 1361 | 1362 | Issue 16443: Add docstrings to regular expression match objects 1363 | 1364 | Version: 2012.11.5 1365 | 1366 | Further performance improvements. 1367 | 1368 | Version: 2012.10.31 1369 | 1370 | Performance improvements. 1371 | 1372 | Version: 2012.10.17 1373 | 1374 | Added "fullmatch" method (issue #16203). 1375 | 1376 | Fixed bug (Hg issue #80). Now raises the correct error. 1377 | 1378 | Version: 2012.10.8 1379 | 1380 | Added subf, subfn and expandf methods. 1381 | 1382 | Performed some refactoring. 1383 | 1384 | Version: 2012.9.4 1385 | 1386 | Hg issue 78: "Captures" doesn't work for recursive calls 1387 | 1388 | Version: 2012.8.25 1389 | 1390 | Added 'detach_string' method to match object. 1391 | 1392 | Made objects copyable. 1393 | 1394 | Version: 2012.8.3 1395 | 1396 | Speed improvements. 1397 | 1398 | Version: 2012.7.10 1399 | 1400 | Fixed bug in debug output in Python 2 version. 1401 | 1402 | Also expanded fuzzy info in debug output.
1403 | 1404 | Version: 2012.7.9 1405 | 1406 | Hg issue 75: DEBUG flag 1407 | 1408 | Also made the debug output a little more readable by showing string literals and property names/values. 1409 | 1410 | Version: 2012.7.8 1411 | 1412 | Hg issue 73: conditional patterns 1413 | 1414 | Version: 2012.7.5 1415 | 1416 | Hg issue 71: non-greedy quantifier in lookbehind 1417 | 1418 | Version: 2012.6.13 1419 | 1420 | Hg issue 69: Changing DEFAULT_VERSION does not actually work. 1421 | 1422 | DEFAULT_VERSION isn't part of the public API, but changing it should now work as expected. 1423 | 1424 | Version: 2011.5.14 1425 | 1426 | Fixed bug in case-insensitive set. 1427 | 1428 | Version: 2011.3.15 1429 | 1430 | Shared iterators now work in both Python 3 and Python 2. 1431 | 1432 | -------------------------------------------------------------------------------- /docs/UnicodeProperties.rst: -------------------------------------------------------------------------------- 1 | The following is a list of the 94 properties which are supported by this module: 2 | 3 | Alphabetic [Alpha] 4 | No [F, False, N] 5 | Yes [T, True, Y] 6 | 7 | Alphanumeric [AlNum] 8 | No [F, False, N] 9 | Yes [T, True, Y] 10 | 11 | Any 12 | No [F, False, N] 13 | Yes [T, True, Y] 14 | 15 | ASCII_Hex_Digit [AHex] 16 | No [F, False, N] 17 | Yes [T, True, Y] 18 | 19 | Bidi_Class [bc] 20 | Arabic_Letter [AL] 21 | Arabic_Number [AN] 22 | Boundary_Neutral [BN] 23 | Common_Separator [CS] 24 | European_Number [EN] 25 | European_Separator [ES] 26 | European_Terminator [ET] 27 | First_Strong_Isolate [FSI] 28 | Left_To_Right [L] 29 | Left_To_Right_Embedding [LRE] 30 | Left_To_Right_Isolate [LRI] 31 | Left_To_Right_Override [LRO] 32 | Nonspacing_Mark [NSM] 33 | Other_Neutral [ON] 34 | Paragraph_Separator [B] 35 | Pop_Directional_Format [PDF] 36 | Pop_Directional_Isolate [PDI] 37 | Right_To_Left [R] 38 | Right_To_Left_Embedding [RLE] 39 | Right_To_Left_Isolate [RLI] 40 | Right_To_Left_Override [RLO] 41 | Segment_Separator [S] 
42 | White_Space [WS] 43 | 44 | Bidi_Control [Bidi_C] 45 | No [F, False, N] 46 | Yes [T, True, Y] 47 | 48 | Bidi_Mirrored [Bidi_M] 49 | No [F, False, N] 50 | Yes [T, True, Y] 51 | 52 | Blank 53 | No [F, False, N] 54 | Yes [T, True, Y] 55 | 56 | Block [blk] 57 | Adlam 58 | Aegean_Numbers 59 | Ahom 60 | Alchemical_Symbols [Alchemical] 61 | Alphabetic_Presentation_Forms [Alphabetic_PF] 62 | Anatolian_Hieroglyphs 63 | Ancient_Greek_Musical_Notation [Ancient_Greek_Music] 64 | Ancient_Greek_Numbers 65 | Ancient_Symbols 66 | Arabic 67 | Arabic_Extended_A [Arabic_Ext_A] 68 | Arabic_Mathematical_Alphabetic_Symbols [Arabic_Math] 69 | Arabic_Presentation_Forms_A [Arabic_PF_A] 70 | Arabic_Presentation_Forms_B [Arabic_PF_B] 71 | Arabic_Supplement [Arabic_Sup] 72 | Armenian 73 | Arrows 74 | Avestan 75 | Balinese 76 | Bamum 77 | Bamum_Supplement [Bamum_Sup] 78 | Basic_Latin [ASCII] 79 | Bassa_Vah 80 | Batak 81 | Bengali 82 | Bhaiksuki 83 | Block_Elements 84 | Bopomofo 85 | Bopomofo_Extended [Bopomofo_Ext] 86 | Box_Drawing 87 | Brahmi 88 | Braille_Patterns [Braille] 89 | Buginese 90 | Buhid 91 | Byzantine_Musical_Symbols [Byzantine_Music] 92 | Carian 93 | Caucasian_Albanian 94 | Chakma 95 | Cham 96 | Cherokee 97 | Cherokee_Supplement [Cherokee_Sup] 98 | Chess_Symbols 99 | CJK_Compatibility [CJK_Compat] 100 | CJK_Compatibility_Forms [CJK_Compat_Forms] 101 | CJK_Compatibility_Ideographs [CJK_Compat_Ideographs] 102 | CJK_Compatibility_Ideographs_Supplement [CJK_Compat_Ideographs_Sup] 103 | CJK_Radicals_Supplement [CJK_Radicals_Sup] 104 | CJK_Strokes 105 | CJK_Symbols_And_Punctuation [CJK_Symbols] 106 | CJK_Unified_Ideographs [CJK] 107 | CJK_Unified_Ideographs_Extension_A [CJK_Ext_A] 108 | CJK_Unified_Ideographs_Extension_B [CJK_Ext_B] 109 | CJK_Unified_Ideographs_Extension_C [CJK_Ext_C] 110 | CJK_Unified_Ideographs_Extension_D [CJK_Ext_D] 111 | CJK_Unified_Ideographs_Extension_E [CJK_Ext_E] 112 | CJK_Unified_Ideographs_Extension_F [CJK_Ext_F] 113 | Combining_Diacritical_Marks 
[Diacriticals] 114 | Combining_Diacritical_Marks_Extended [Diacriticals_Ext] 115 | Combining_Diacritical_Marks_For_Symbols [Combining_Marks_For_Symbols, Diacriticals_For_Symbols] 116 | Combining_Diacritical_Marks_Supplement [Diacriticals_Sup] 117 | Combining_Half_Marks [Half_Marks] 118 | Common_Indic_Number_Forms [Indic_Number_Forms] 119 | Control_Pictures 120 | Coptic 121 | Coptic_Epact_Numbers 122 | Counting_Rod_Numerals [Counting_Rod] 123 | Cuneiform 124 | Cuneiform_Numbers_And_Punctuation [Cuneiform_Numbers] 125 | Currency_Symbols 126 | Cypriot_Syllabary 127 | Cyrillic 128 | Cyrillic_Extended_A [Cyrillic_Ext_A] 129 | Cyrillic_Extended_B [Cyrillic_Ext_B] 130 | Cyrillic_Extended_C [Cyrillic_Ext_C] 131 | Cyrillic_Supplement [Cyrillic_Sup, Cyrillic_Supplementary] 132 | Deseret 133 | Devanagari 134 | Devanagari_Extended [Devanagari_Ext] 135 | Dingbats 136 | Dogra 137 | Domino_Tiles [Domino] 138 | Duployan 139 | Early_Dynastic_Cuneiform 140 | Egyptian_Hieroglyphs 141 | Egyptian_Hieroglyph_Format_Controls 142 | Elbasan 143 | Elymaic 144 | Emoticons 145 | Enclosed_Alphanumerics [Enclosed_Alphanum] 146 | Enclosed_Alphanumeric_Supplement [Enclosed_Alphanum_Sup] 147 | Enclosed_CJK_Letters_And_Months [Enclosed_CJK] 148 | Enclosed_Ideographic_Supplement [Enclosed_Ideographic_Sup] 149 | Ethiopic 150 | Ethiopic_Extended [Ethiopic_Ext] 151 | Ethiopic_Extended_A [Ethiopic_Ext_A] 152 | Ethiopic_Supplement [Ethiopic_Sup] 153 | General_Punctuation [Punctuation] 154 | Geometric_Shapes 155 | Geometric_Shapes_Extended [Geometric_Shapes_Ext] 156 | Georgian 157 | Georgian_Extended [Georgian_Ext] 158 | Georgian_Supplement [Georgian_Sup] 159 | Glagolitic 160 | Glagolitic_Supplement [Glagolitic_Sup] 161 | Gothic 162 | Grantha 163 | Greek_And_Coptic [Greek] 164 | Greek_Extended [Greek_Ext] 165 | Gujarati 166 | Gunjala_Gondi 167 | Gurmukhi 168 | Halfwidth_And_Fullwidth_Forms [Half_And_Full_Forms] 169 | Hangul_Compatibility_Jamo [Compat_Jamo] 170 | Hangul_Jamo [Jamo] 171 | 
Hangul_Jamo_Extended_A [Jamo_Ext_A] 172 | Hangul_Jamo_Extended_B [Jamo_Ext_B] 173 | Hangul_Syllables [Hangul] 174 | Hanifi_Rohingya 175 | Hanunoo 176 | Hatran 177 | Hebrew 178 | High_Private_Use_Surrogates [High_PU_Surrogates] 179 | High_Surrogates 180 | Hiragana 181 | Ideographic_Description_Characters [IDC] 182 | Ideographic_Symbols_And_Punctuation [Ideographic_Symbols] 183 | Imperial_Aramaic 184 | Indic_Siyaq_Numbers 185 | Inscriptional_Pahlavi 186 | Inscriptional_Parthian 187 | IPA_Extensions [IPA_Ext] 188 | Javanese 189 | Kaithi 190 | Kana_Extended_A [Kana_Ext_A] 191 | Kana_Supplement [Kana_Sup] 192 | Kanbun 193 | Kangxi_Radicals [Kangxi] 194 | Kannada 195 | Katakana 196 | Katakana_Phonetic_Extensions [Katakana_Ext] 197 | Kayah_Li 198 | Kharoshthi 199 | Khmer 200 | Khmer_Symbols 201 | Khojki 202 | Khudawadi 203 | Lao 204 | Latin_1_Supplement [Latin_1, Latin_1_Sup] 205 | Latin_Extended_A [Latin_Ext_A] 206 | Latin_Extended_Additional [Latin_Ext_Additional] 207 | Latin_Extended_B [Latin_Ext_B] 208 | Latin_Extended_C [Latin_Ext_C] 209 | Latin_Extended_D [Latin_Ext_D] 210 | Latin_Extended_E [Latin_Ext_E] 211 | Lepcha 212 | Letterlike_Symbols 213 | Limbu 214 | Linear_A 215 | Linear_B_Ideograms 216 | Linear_B_Syllabary 217 | Lisu 218 | Low_Surrogates 219 | Lycian 220 | Lydian 221 | Mahajani 222 | Mahjong_Tiles [Mahjong] 223 | Makasar 224 | Malayalam 225 | Mandaic 226 | Manichaean 227 | Marchen 228 | Masaram_Gondi 229 | Mathematical_Alphanumeric_Symbols [Math_Alphanum] 230 | Mathematical_Operators [Math_Operators] 231 | Mayan_Numerals 232 | Medefaidrin 233 | Meetei_Mayek 234 | Meetei_Mayek_Extensions [Meetei_Mayek_Ext] 235 | Mende_Kikakui 236 | Meroitic_Cursive 237 | Meroitic_Hieroglyphs 238 | Miao 239 | Miscellaneous_Mathematical_Symbols_A [Misc_Math_Symbols_A] 240 | Miscellaneous_Mathematical_Symbols_B [Misc_Math_Symbols_B] 241 | Miscellaneous_Symbols [Misc_Symbols] 242 | Miscellaneous_Symbols_And_Arrows [Misc_Arrows] 243 | Miscellaneous_Symbols_And_Pictographs 
[Misc_Pictographs] 244 | Miscellaneous_Technical [Misc_Technical] 245 | Modi 246 | Modifier_Tone_Letters 247 | Mongolian 248 | Mongolian_Supplement [Mongolian_Sup] 249 | Mro 250 | Multani 251 | Musical_Symbols [Music] 252 | Myanmar 253 | Myanmar_Extended_A [Myanmar_Ext_A] 254 | Myanmar_Extended_B [Myanmar_Ext_B] 255 | Nabataean 256 | Nandinagari 257 | Newa 258 | New_Tai_Lue 259 | NKo 260 | No_Block [NB] 261 | Number_Forms 262 | Nushu 263 | Nyiakeng_Puachue_Hmong 264 | Ogham 265 | Old_Hungarian 266 | Old_Italic 267 | Old_North_Arabian 268 | Old_Permic 269 | Old_Persian 270 | Old_Sogdian 271 | Old_South_Arabian 272 | Old_Turkic 273 | Ol_Chiki 274 | Optical_Character_Recognition [OCR] 275 | Oriya 276 | Ornamental_Dingbats 277 | Osage 278 | Osmanya 279 | Ottoman_Siyaq_Numbers 280 | Pahawh_Hmong 281 | Palmyrene 282 | Pau_Cin_Hau 283 | Phags_Pa 284 | Phaistos_Disc [Phaistos] 285 | Phoenician 286 | Phonetic_Extensions [Phonetic_Ext] 287 | Phonetic_Extensions_Supplement [Phonetic_Ext_Sup] 288 | Playing_Cards 289 | Private_Use_Area [Private_Use, PUA] 290 | Psalter_Pahlavi 291 | Rejang 292 | Rumi_Numeral_Symbols [Rumi] 293 | Runic 294 | Samaritan 295 | Saurashtra 296 | Sharada 297 | Shavian 298 | Shorthand_Format_Controls 299 | Siddham 300 | Sinhala 301 | Sinhala_Archaic_Numbers 302 | Small_Form_Variants [Small_Forms] 303 | Small_Kana_Extension [Small_Kana_Ext] 304 | Sogdian 305 | Sora_Sompeng 306 | Soyombo 307 | Spacing_Modifier_Letters [Modifier_Letters] 308 | Specials 309 | Sundanese 310 | Sundanese_Supplement [Sundanese_Sup] 311 | Superscripts_And_Subscripts [Super_And_Sub] 312 | Supplemental_Arrows_A [Sup_Arrows_A] 313 | Supplemental_Arrows_B [Sup_Arrows_B] 314 | Supplemental_Arrows_C [Sup_Arrows_C] 315 | Supplemental_Mathematical_Operators [Sup_Math_Operators] 316 | Supplemental_Punctuation [Sup_Punctuation] 317 | Supplemental_Symbols_And_Pictographs [Sup_Symbols_And_Pictographs] 318 | Supplementary_Private_Use_Area_A [Sup_PUA_A] 319 | Supplementary_Private_Use_Area_B 
[Sup_PUA_B] 320 | Sutton_SignWriting 321 | Syloti_Nagri 322 | Symbols_And_Pictographs_Extended_A [Symbols_And_Pictographs_Ext_A] 323 | Syriac 324 | Syriac_Supplement [Syriac_Sup] 325 | Tagalog 326 | Tagbanwa 327 | Tags 328 | Tai_Le 329 | Tai_Tham 330 | Tai_Viet 331 | Tai_Xuan_Jing_Symbols [Tai_Xuan_Jing] 332 | Takri 333 | Tamil 334 | Tamil_Supplement [Tamil_Sup] 335 | Tangut 336 | Tangut_Components 337 | Telugu 338 | Thaana 339 | Thai 340 | Tibetan 341 | Tifinagh 342 | Tirhuta 343 | Transport_And_Map_Symbols [Transport_And_Map] 344 | Ugaritic 345 | Unified_Canadian_Aboriginal_Syllabics [Canadian_Syllabics, UCAS] 346 | Unified_Canadian_Aboriginal_Syllabics_Extended [UCAS_Ext] 347 | Vai 348 | Variation_Selectors [VS] 349 | Variation_Selectors_Supplement [VS_Sup] 350 | Vedic_Extensions [Vedic_Ext] 351 | Vertical_Forms 352 | Wancho 353 | Warang_Citi 354 | Yijing_Hexagram_Symbols [Yijing] 355 | Yi_Radicals 356 | Yi_Syllables 357 | Zanabazar_Square 358 | 359 | Canonical_Combining_Class [ccc] 360 | Above [230, A] 361 | Above_Left [228, AL] 362 | Above_Right [232, AR] 363 | Attached_Above [214, ATA] 364 | Attached_Above_Right [216, ATAR] 365 | Attached_Below [202, ATB] 366 | Attached_Below_Left [200, ATBL] 367 | Below [220, B] 368 | Below_Left [218, BL] 369 | Below_Right [222, BR] 370 | CCC10 [10] 371 | CCC103 [103] 372 | CCC107 [107] 373 | CCC11 [11] 374 | CCC118 [118] 375 | CCC12 [12] 376 | CCC122 [122] 377 | CCC129 [129] 378 | CCC13 [13] 379 | CCC130 [130] 380 | CCC132 [132] 381 | CCC133 [133] 382 | CCC14 [14] 383 | CCC15 [15] 384 | CCC16 [16] 385 | CCC17 [17] 386 | CCC18 [18] 387 | CCC19 [19] 388 | CCC20 [20] 389 | CCC21 [21] 390 | CCC22 [22] 391 | CCC23 [23] 392 | CCC24 [24] 393 | CCC25 [25] 394 | CCC26 [26] 395 | CCC27 [27] 396 | CCC28 [28] 397 | CCC29 [29] 398 | CCC30 [30] 399 | CCC31 [31] 400 | CCC32 [32] 401 | CCC33 [33] 402 | CCC34 [34] 403 | CCC35 [35] 404 | CCC36 [36] 405 | CCC84 [84] 406 | CCC91 [91] 407 | Double_Above [234, DA] 408 | Double_Below [233, DB] 
409 | Iota_Subscript [240, IS] 410 | Kana_Voicing [8, KV] 411 | Left [224, L] 412 | Not_Reordered [0, NR] 413 | Nukta [7, NK] 414 | Overlay [1, OV] 415 | Right [226, R] 416 | Virama [9, VR] 417 | 418 | Cased 419 | No [F, False, N] 420 | Yes [T, True, Y] 421 | 422 | Case_Ignorable [CI] 423 | No [F, False, N] 424 | Yes [T, True, Y] 425 | 426 | Changes_When_Casefolded [CWCF] 427 | No [F, False, N] 428 | Yes [T, True, Y] 429 | 430 | Changes_When_Casemapped [CWCM] 431 | No [F, False, N] 432 | Yes [T, True, Y] 433 | 434 | Changes_When_Lowercased [CWL] 435 | No [F, False, N] 436 | Yes [T, True, Y] 437 | 438 | Changes_When_Titlecased [CWT] 439 | No [F, False, N] 440 | Yes [T, True, Y] 441 | 442 | Changes_When_Uppercased [CWU] 443 | No [F, False, N] 444 | Yes [T, True, Y] 445 | 446 | Dash 447 | No [F, False, N] 448 | Yes [T, True, Y] 449 | 450 | Decomposition_Type [dt] 451 | Canonical [Can] 452 | Circle [Enc] 453 | Compat [Com] 454 | Final [Fin] 455 | Font 456 | Fraction [Fra] 457 | Initial [Init] 458 | Isolated [Iso] 459 | Medial [Med] 460 | Narrow [Nar] 461 | Nobreak [Nb] 462 | None 463 | Small [Sml] 464 | Square [Sqr] 465 | Sub 466 | Super [Sup] 467 | Vertical [Vert] 468 | Wide 469 | 470 | Default_Ignorable_Code_Point [DI] 471 | No [F, False, N] 472 | Yes [T, True, Y] 473 | 474 | Deprecated [Dep] 475 | No [F, False, N] 476 | Yes [T, True, Y] 477 | 478 | Diacritic [Dia] 479 | No [F, False, N] 480 | Yes [T, True, Y] 481 | 482 | East_Asian_Width [ea] 483 | Ambiguous [A] 484 | Fullwidth [F] 485 | Halfwidth [H] 486 | Narrow [Na] 487 | Neutral [N] 488 | Wide [W] 489 | 490 | Emoji 491 | No 492 | Yes 493 | 494 | Emoji_Component 495 | No 496 | Yes 497 | 498 | Emoji_Modifier 499 | No 500 | Yes 501 | 502 | Emoji_Modifier_Base 503 | No 504 | Yes 505 | 506 | Emoji_Presentation 507 | No 508 | Yes 509 | 510 | Extended_Pictographic 511 | No 512 | Yes 513 | 514 | Extender [Ext] 515 | No [F, False, N] 516 | Yes [T, True, Y] 517 | 518 | General_Category [gc] 519 | Assigned 520 | 
Cased_Letter [LC] 521 | Close_Punctuation [Pe] 522 | Connector_Punctuation [Pc] 523 | Control [Cc, cntrl] 524 | Currency_Symbol [Sc] 525 | Dash_Punctuation [Pd] 526 | Decimal_Number [digit, Nd] 527 | Enclosing_Mark [Me] 528 | Final_Punctuation [Pf] 529 | Format [Cf] 530 | Initial_Punctuation [Pi] 531 | Letter [L, L&] 532 | Letter_Number [Nl] 533 | Line_Separator [Zl] 534 | Lowercase_Letter [Ll] 535 | Mark [Combining_Mark, M, M&] 536 | Math_Symbol [Sm] 537 | Modifier_Letter [Lm] 538 | Modifier_Symbol [Sk] 539 | Nonspacing_Mark [Mn] 540 | Number [N, N&] 541 | Open_Punctuation [Ps] 542 | Other [C, C&] 543 | Other_Letter [Lo] 544 | Other_Number [No] 545 | Other_Punctuation [Po] 546 | Other_Symbol [So] 547 | Paragraph_Separator [Zp] 548 | Private_Use [Co] 549 | Punctuation [P, P&, punct] 550 | Separator [Z, Z&] 551 | Space_Separator [Zs] 552 | Spacing_Mark [Mc] 553 | Surrogate [Cs] 554 | Symbol [S, S&] 555 | Titlecase_Letter [Lt] 556 | Unassigned [Cn] 557 | Uppercase_Letter [Lu] 558 | 559 | Graph 560 | No [F, False, N] 561 | Yes [T, True, Y] 562 | 563 | Grapheme_Base [Gr_Base] 564 | No [F, False, N] 565 | Yes [T, True, Y] 566 | 567 | Grapheme_Cluster_Break [GCB] 568 | Control [CN] 569 | CR 570 | Extend [EX] 571 | E_Base [EB] 572 | E_Base_GAZ [EBG] 573 | E_Modifier [EM] 574 | Glue_After_Zwj [GAZ] 575 | L 576 | LF 577 | LV 578 | LVT 579 | Other [XX] 580 | Prepend [PP] 581 | Regional_Indicator [RI] 582 | SpacingMark [SM] 583 | T 584 | V 585 | ZWJ 586 | 587 | Grapheme_Extend [Gr_Ext] 588 | No [F, False, N] 589 | Yes [T, True, Y] 590 | 591 | Grapheme_Link [Gr_Link] 592 | No [F, False, N] 593 | Yes [T, True, Y] 594 | 595 | Hangul_Syllable_Type [hst] 596 | Leading_Jamo [L] 597 | LVT_Syllable [LVT] 598 | LV_Syllable [LV] 599 | Not_Applicable [NA] 600 | Trailing_Jamo [T] 601 | Vowel_Jamo [V] 602 | 603 | Hex_Digit [Hex] 604 | No [F, False, N] 605 | Yes [T, True, Y] 606 | 607 | Hyphen 608 | No [F, False, N] 609 | Yes [T, True, Y] 610 | 611 | Ideographic [Ideo] 612 | No [F, False, 
N] 613 | Yes [T, True, Y] 614 | 615 | IDS_Binary_Operator [IDSB] 616 | No [F, False, N] 617 | Yes [T, True, Y] 618 | 619 | IDS_Trinary_Operator [IDST] 620 | No [F, False, N] 621 | Yes [T, True, Y] 622 | 623 | ID_Continue [IDC] 624 | No [F, False, N] 625 | Yes [T, True, Y] 626 | 627 | ID_Start [IDS] 628 | No [F, False, N] 629 | Yes [T, True, Y] 630 | 631 | Indic_Positional_Category [InPC] 632 | Bottom 633 | Bottom_And_Left 634 | Bottom_And_Right 635 | Left 636 | Left_And_Right 637 | NA 638 | Overstruck 639 | Right 640 | Top 641 | Top_And_Bottom 642 | Top_And_Bottom_And_Right 643 | Top_And_Left 644 | Top_And_Left_And_Right 645 | Top_And_Right 646 | Visual_Order_Left 647 | 648 | Indic_Syllabic_Category [InSC] 649 | Avagraha 650 | Bindu 651 | Brahmi_Joining_Number 652 | Cantillation_Mark 653 | Consonant 654 | Consonant_Dead 655 | Consonant_Final 656 | Consonant_Head_Letter 657 | Consonant_Initial_Postfixed 658 | Consonant_Killer 659 | Consonant_Medial 660 | Consonant_Placeholder 661 | Consonant_Preceding_Repha 662 | Consonant_Prefixed 663 | Consonant_Subjoined 664 | Consonant_Succeeding_Repha 665 | Consonant_With_Stacker 666 | Gemination_Mark 667 | Invisible_Stacker 668 | Joiner 669 | Modifying_Letter 670 | Non_Joiner 671 | Nukta 672 | Number 673 | Number_Joiner 674 | Other 675 | Pure_Killer 676 | Register_Shifter 677 | Syllable_Modifier 678 | Tone_Letter 679 | Tone_Mark 680 | Virama 681 | Visarga 682 | Vowel 683 | Vowel_Dependent 684 | Vowel_Independent 685 | 686 | Joining_Group [jg] 687 | African_Feh 688 | African_Noon 689 | African_Qaf 690 | Ain 691 | Alaph 692 | Alef 693 | Beh 694 | Beth 695 | Burushaski_Yeh_Barree 696 | Dal 697 | Dalath_Rish 698 | E 699 | Farsi_Yeh 700 | Fe 701 | Feh 702 | Final_Semkath 703 | Gaf 704 | Gamal 705 | Hah 706 | Hamza_On_Heh_Goal [Teh_Marbuta_Goal] 707 | Hanifi_Rohingya_Kinna_Ya 708 | Hanifi_Rohingya_Pa 709 | He 710 | Heh 711 | Heh_Goal 712 | Heth 713 | Kaf 714 | Kaph 715 | Khaph 716 | Knotted_Heh 717 | Lam 718 | Lamadh 719 | 
Malayalam_Bha 720 | Malayalam_Ja 721 | Malayalam_Lla 722 | Malayalam_Llla 723 | Malayalam_Nga 724 | Malayalam_Nna 725 | Malayalam_Nnna 726 | Malayalam_Nya 727 | Malayalam_Ra 728 | Malayalam_Ssa 729 | Malayalam_Tta 730 | Manichaean_Aleph 731 | Manichaean_Ayin 732 | Manichaean_Beth 733 | Manichaean_Daleth 734 | Manichaean_Dhamedh 735 | Manichaean_Five 736 | Manichaean_Gimel 737 | Manichaean_Heth 738 | Manichaean_Hundred 739 | Manichaean_Kaph 740 | Manichaean_Lamedh 741 | Manichaean_Mem 742 | Manichaean_Nun 743 | Manichaean_One 744 | Manichaean_Pe 745 | Manichaean_Qoph 746 | Manichaean_Resh 747 | Manichaean_Sadhe 748 | Manichaean_Samekh 749 | Manichaean_Taw 750 | Manichaean_Ten 751 | Manichaean_Teth 752 | Manichaean_Thamedh 753 | Manichaean_Twenty 754 | Manichaean_Waw 755 | Manichaean_Yodh 756 | Manichaean_Zayin 757 | Meem 758 | Mim 759 | Noon 760 | No_Joining_Group 761 | Nun 762 | Nya 763 | Pe 764 | Qaf 765 | Qaph 766 | Reh 767 | Reversed_Pe 768 | Rohingya_Yeh 769 | Sad 770 | Sadhe 771 | Seen 772 | Semkath 773 | Shin 774 | Straight_Waw 775 | Swash_Kaf 776 | Syriac_Waw 777 | Tah 778 | Taw 779 | Teh_Marbuta 780 | Teth 781 | Waw 782 | Yeh 783 | Yeh_Barree 784 | Yeh_With_Tail 785 | Yudh 786 | Yudh_He 787 | Zain 788 | Zhain 789 | 790 | Joining_Type [jt] 791 | Dual_Joining [D] 792 | Join_Causing [C] 793 | Left_Joining [L] 794 | Non_Joining [U] 795 | Right_Joining [R] 796 | Transparent [T] 797 | 798 | Join_Control [Join_C] 799 | No [F, False, N] 800 | Yes [T, True, Y] 801 | 802 | Line_Break [lb] 803 | Alphabetic [AL] 804 | Ambiguous [AI] 805 | Break_After [BA] 806 | Break_Before [BB] 807 | Break_Both [B2] 808 | Break_Symbols [SY] 809 | Carriage_Return [CR] 810 | Close_Parenthesis [CP] 811 | Close_Punctuation [CL] 812 | Combining_Mark [CM] 813 | Complex_Context [SA] 814 | Conditional_Japanese_Starter [CJ] 815 | Contingent_Break [CB] 816 | Exclamation [EX] 817 | E_Base [EB] 818 | E_Modifier [EM] 819 | Glue [GL] 820 | H2 821 | H3 822 | Hebrew_Letter [HL] 823 | Hyphen [HY] 824 
| Ideographic [ID] 825 | Infix_Numeric [IS] 826 | Inseparable [IN, Inseperable] 827 | JL 828 | JT 829 | JV 830 | Line_Feed [LF] 831 | Mandatory_Break [BK] 832 | Next_Line [NL] 833 | Nonstarter [NS] 834 | Numeric [NU] 835 | Open_Punctuation [OP] 836 | Postfix_Numeric [PO] 837 | Prefix_Numeric [PR] 838 | Quotation [QU] 839 | Regional_Indicator [RI] 840 | Space [SP] 841 | Surrogate [SG] 842 | Unknown [XX] 843 | Word_Joiner [WJ] 844 | ZWJ 845 | ZWSpace [ZW] 846 | 847 | Logical_Order_Exception [LOE] 848 | No [F, False, N] 849 | Yes [T, True, Y] 850 | 851 | Lowercase [Lower] 852 | No [F, False, N] 853 | Yes [T, True, Y] 854 | 855 | Math 856 | No [F, False, N] 857 | Yes [T, True, Y] 858 | 859 | NFC_Quick_Check [NFC_QC] 860 | Maybe [M] 861 | No [N] 862 | Yes [Y] 863 | 864 | NFD_Quick_Check [NFD_QC] 865 | No [N] 866 | Yes [Y] 867 | 868 | NFKC_Quick_Check [NFKC_QC] 869 | Maybe [M] 870 | No [N] 871 | Yes [Y] 872 | 873 | NFKD_Quick_Check [NFKD_QC] 874 | No [N] 875 | Yes [Y] 876 | 877 | Noncharacter_Code_Point [NChar] 878 | No [F, False, N] 879 | Yes [T, True, Y] 880 | 881 | Numeric_Type [nt] 882 | Decimal [De] 883 | Digit [Di] 884 | None 885 | Numeric [Nu] 886 | 887 | Numeric_Value [nv] 888 | -1/2 889 | 0 890 | 1 891 | 1/10 892 | 1/12 893 | 1/16 894 | 1/160 895 | 1/2 896 | 1/20 897 | 1/3 898 | 1/32 899 | 1/320 900 | 1/4 901 | 1/40 902 | 1/5 903 | 1/6 904 | 1/64 905 | 1/7 906 | 1/8 907 | 1/80 908 | 1/9 909 | 10 910 | 100 911 | 1000 912 | 10000 913 | 100000 914 | 1000000 915 | 10000000 916 | 100000000 917 | 10000000000 918 | 1000000000000 919 | 11 920 | 11/12 921 | 11/2 922 | 12 923 | 13 924 | 13/2 925 | 14 926 | 15 927 | 15/2 928 | 16 929 | 17 930 | 17/2 931 | 18 932 | 19 933 | 2 934 | 2/3 935 | 2/5 936 | 20 937 | 200 938 | 2000 939 | 20000 940 | 200000 941 | 20000000 942 | 21 943 | 216000 944 | 22 945 | 23 946 | 24 947 | 25 948 | 26 949 | 27 950 | 28 951 | 29 952 | 3 953 | 3/16 954 | 3/2 955 | 3/20 956 | 3/4 957 | 3/5 958 | 3/64 959 | 3/8 960 | 3/80 961 | 30 962 | 300 963 | 
3000 964 | 30000 965 | 300000 966 | 31 967 | 32 968 | 33 969 | 34 970 | 35 971 | 36 972 | 37 973 | 38 974 | 39 975 | 4 976 | 4/5 977 | 40 978 | 400 979 | 4000 980 | 40000 981 | 400000 982 | 41 983 | 42 984 | 43 985 | 432000 986 | 44 987 | 45 988 | 46 989 | 47 990 | 48 991 | 49 992 | 5 993 | 5/12 994 | 5/2 995 | 5/6 996 | 5/8 997 | 50 998 | 500 999 | 5000 1000 | 50000 1001 | 500000 1002 | 6 1003 | 60 1004 | 600 1005 | 6000 1006 | 60000 1007 | 600000 1008 | 7 1009 | 7/12 1010 | 7/2 1011 | 7/8 1012 | 70 1013 | 700 1014 | 7000 1015 | 70000 1016 | 700000 1017 | 8 1018 | 80 1019 | 800 1020 | 8000 1021 | 80000 1022 | 800000 1023 | 9 1024 | 9/2 1025 | 90 1026 | 900 1027 | 9000 1028 | 90000 1029 | 900000 1030 | NaN 1031 | 1032 | Other_Alphabetic [OAlpha] 1033 | No [F, False, N] 1034 | Yes [T, True, Y] 1035 | 1036 | Other_Default_Ignorable_Code_Point [ODI] 1037 | No [F, False, N] 1038 | Yes [T, True, Y] 1039 | 1040 | Other_Grapheme_Extend [OGr_Ext] 1041 | No [F, False, N] 1042 | Yes [T, True, Y] 1043 | 1044 | Other_ID_Continue [OIDC] 1045 | No [F, False, N] 1046 | Yes [T, True, Y] 1047 | 1048 | Other_ID_Start [OIDS] 1049 | No [F, False, N] 1050 | Yes [T, True, Y] 1051 | 1052 | Other_Lowercase [OLower] 1053 | No [F, False, N] 1054 | Yes [T, True, Y] 1055 | 1056 | Other_Math [OMath] 1057 | No [F, False, N] 1058 | Yes [T, True, Y] 1059 | 1060 | Other_Uppercase [OUpper] 1061 | No [F, False, N] 1062 | Yes [T, True, Y] 1063 | 1064 | Pattern_Syntax [Pat_Syn] 1065 | No [F, False, N] 1066 | Yes [T, True, Y] 1067 | 1068 | Pattern_White_Space [Pat_WS] 1069 | No [F, False, N] 1070 | Yes [T, True, Y] 1071 | 1072 | Posix_AlNum 1073 | No [F, False, N] 1074 | Yes [T, True, Y] 1075 | 1076 | Posix_Digit 1077 | No [F, False, N] 1078 | Yes [T, True, Y] 1079 | 1080 | Posix_Punct 1081 | No [F, False, N] 1082 | Yes [T, True, Y] 1083 | 1084 | Posix_XDigit 1085 | No [F, False, N] 1086 | Yes [T, True, Y] 1087 | 1088 | Prepended_Concatenation_Mark [PCM] 1089 | No [F, False, N] 1090 | Yes [T, True, Y] 
1091 | 1092 | Print 1093 | No [F, False, N] 1094 | Yes [T, True, Y] 1095 | 1096 | Quotation_Mark [QMark] 1097 | No [F, False, N] 1098 | Yes [T, True, Y] 1099 | 1100 | Radical 1101 | No [F, False, N] 1102 | Yes [T, True, Y] 1103 | 1104 | Regional_Indicator [RI] 1105 | No [F, False, N] 1106 | Yes [T, True, Y] 1107 | 1108 | Script [sc] 1109 | Adlam [Adlm] 1110 | Ahom 1111 | Anatolian_Hieroglyphs [Hluw] 1112 | Arabic [Arab] 1113 | Armenian [Armn] 1114 | Avestan [Avst] 1115 | Balinese [Bali] 1116 | Bamum [Bamu] 1117 | Bassa_Vah [Bass] 1118 | Batak [Batk] 1119 | Bengali [Beng] 1120 | Bhaiksuki [Bhks] 1121 | Bopomofo [Bopo] 1122 | Brahmi [Brah] 1123 | Braille [Brai] 1124 | Buginese [Bugi] 1125 | Buhid [Buhd] 1126 | Canadian_Aboriginal [Cans] 1127 | Carian [Cari] 1128 | Caucasian_Albanian [Aghb] 1129 | Chakma [Cakm] 1130 | Cham 1131 | Cherokee [Cher] 1132 | Common [Zyyy] 1133 | Coptic [Copt, Qaac] 1134 | Cuneiform [Xsux] 1135 | Cypriot [Cprt] 1136 | Cyrillic [Cyrl] 1137 | Deseret [Dsrt] 1138 | Devanagari [Deva] 1139 | Dogra [Dogr] 1140 | Duployan [Dupl] 1141 | Egyptian_Hieroglyphs [Egyp] 1142 | Elbasan [Elba] 1143 | Elymaic [Elym] 1144 | Ethiopic [Ethi] 1145 | Georgian [Geor] 1146 | Glagolitic [Glag] 1147 | Gothic [Goth] 1148 | Grantha [Gran] 1149 | Greek [Grek] 1150 | Gujarati [Gujr] 1151 | Gunjala_Gondi [Gong] 1152 | Gurmukhi [Guru] 1153 | Han [Hani] 1154 | Hangul [Hang] 1155 | Hanifi_Rohingya [Rohg] 1156 | Hanunoo [Hano] 1157 | Hatran [Hatr] 1158 | Hebrew [Hebr] 1159 | Hiragana [Hira] 1160 | Imperial_Aramaic [Armi] 1161 | Inherited [Qaai, Zinh] 1162 | Inscriptional_Pahlavi [Phli] 1163 | Inscriptional_Parthian [Prti] 1164 | Javanese [Java] 1165 | Kaithi [Kthi] 1166 | Kannada [Knda] 1167 | Katakana [Kana] 1168 | Katakana_Or_Hiragana [Hrkt] 1169 | Kayah_Li [Kali] 1170 | Kharoshthi [Khar] 1171 | Khmer [Khmr] 1172 | Khojki [Khoj] 1173 | Khudawadi [Sind] 1174 | Lao [Laoo] 1175 | Latin [Latn] 1176 | Lepcha [Lepc] 1177 | Limbu [Limb] 1178 | Linear_A [Lina] 1179 | Linear_B 
[Linb] 1180 | Lisu 1181 | Lycian [Lyci] 1182 | Lydian [Lydi] 1183 | Mahajani [Mahj] 1184 | Makasar [Maka] 1185 | Malayalam [Mlym] 1186 | Mandaic [Mand] 1187 | Manichaean [Mani] 1188 | Marchen [Marc] 1189 | Masaram_Gondi [Gonm] 1190 | Medefaidrin [Medf] 1191 | Meetei_Mayek [Mtei] 1192 | Mende_Kikakui [Mend] 1193 | Meroitic_Cursive [Merc] 1194 | Meroitic_Hieroglyphs [Mero] 1195 | Miao [Plrd] 1196 | Modi 1197 | Mongolian [Mong] 1198 | Mro [Mroo] 1199 | Multani [Mult] 1200 | Myanmar [Mymr] 1201 | Nabataean [Nbat] 1202 | Nandinagari [Nand] 1203 | Newa 1204 | New_Tai_Lue [Talu] 1205 | Nko [Nkoo] 1206 | Nushu [Nshu] 1207 | Nyiakeng_Puachue_Hmong [Hmnp] 1208 | Ogham [Ogam] 1209 | Old_Hungarian [Hung] 1210 | Old_Italic [Ital] 1211 | Old_North_Arabian [Narb] 1212 | Old_Permic [Perm] 1213 | Old_Persian [Xpeo] 1214 | Old_Sogdian [Sogo] 1215 | Old_South_Arabian [Sarb] 1216 | Old_Turkic [Orkh] 1217 | Ol_Chiki [Olck] 1218 | Oriya [Orya] 1219 | Osage [Osge] 1220 | Osmanya [Osma] 1221 | Pahawh_Hmong [Hmng] 1222 | Palmyrene [Palm] 1223 | Pau_Cin_Hau [Pauc] 1224 | Phags_Pa [Phag] 1225 | Phoenician [Phnx] 1226 | Psalter_Pahlavi [Phlp] 1227 | Rejang [Rjng] 1228 | Runic [Runr] 1229 | Samaritan [Samr] 1230 | Saurashtra [Saur] 1231 | Sharada [Shrd] 1232 | Shavian [Shaw] 1233 | Siddham [Sidd] 1234 | SignWriting [Sgnw] 1235 | Sinhala [Sinh] 1236 | Sogdian [Sogd] 1237 | Sora_Sompeng [Sora] 1238 | Soyombo [Soyo] 1239 | Sundanese [Sund] 1240 | Syloti_Nagri [Sylo] 1241 | Syriac [Syrc] 1242 | Tagalog [Tglg] 1243 | Tagbanwa [Tagb] 1244 | Tai_Le [Tale] 1245 | Tai_Tham [Lana] 1246 | Tai_Viet [Tavt] 1247 | Takri [Takr] 1248 | Tamil [Taml] 1249 | Tangut [Tang] 1250 | Telugu [Telu] 1251 | Thaana [Thaa] 1252 | Thai 1253 | Tibetan [Tibt] 1254 | Tifinagh [Tfng] 1255 | Tirhuta [Tirh] 1256 | Ugaritic [Ugar] 1257 | Unknown [Zzzz] 1258 | Vai [Vaii] 1259 | Wancho [Wcho] 1260 | Warang_Citi [Wara] 1261 | Yi [Yiii] 1262 | Zanabazar_Square [Zanb] 1263 | 1264 | Script_Extensions [scx] 1265 | Adlam [Adlm] 1266 | 
Adlm Arab Mand Mani Phlp Rohg Sogd Syrc 1267 | Ahom 1268 | Anatolian_Hieroglyphs [Hluw] 1269 | Arab Copt 1270 | Arab Rohg 1271 | Arab Rohg Syrc Thaa 1272 | Arab Syrc 1273 | Arab Syrc Thaa 1274 | Arab Thaa 1275 | Arabic [Arab] 1276 | Armenian [Armn] 1277 | Armn Geor 1278 | Avestan [Avst] 1279 | Balinese [Bali] 1280 | Bamum [Bamu] 1281 | Bassa_Vah [Bass] 1282 | Batak [Batk] 1283 | Beng Cakm Sylo 1284 | Beng Deva 1285 | Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh 1286 | Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh 1287 | Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh 1288 | Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh 1289 | Beng Deva Gran Knda 1290 | Beng Deva Gran Knda Nand Orya Telu Tirh 1291 | Bengali [Beng] 1292 | Bhaiksuki [Bhks] 1293 | Bopo Hang Hani Hira Kana 1294 | Bopo Hang Hani Hira Kana Yiii 1295 | Bopo Hani 1296 | Bopomofo [Bopo] 1297 | Brahmi [Brah] 1298 | Braille [Brai] 1299 | Bugi Java 1300 | Buginese [Bugi] 1301 | Buhd Hano Tagb Tglg 1302 | Buhid [Buhd] 1303 | Cakm Mymr Tale 1304 | Canadian_Aboriginal [Cans] 1305 | Carian [Cari] 1306 | Caucasian_Albanian [Aghb] 1307 | Chakma [Cakm] 1308 | Cham 1309 | Cherokee [Cher] 1310 | Common [Zyyy] 1311 | Coptic [Copt, Qaac] 1312 | Cprt Lina Linb 1313 | Cprt Linb 1314 | Cuneiform [Xsux] 1315 | Cypriot [Cprt] 1316 | Cyrillic [Cyrl] 1317 | Cyrl Glag 1318 | Cyrl Latn 1319 | Cyrl Perm 1320 | Deseret [Dsrt] 1321 | Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh 1322 | Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh 1323 | Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh 1324 | Deva Dogr Kthi Mahj 1325 | Deva Gran 1326 | Deva Gran Knda 1327 | Deva Gran Latn 1328 | Deva Knda Mlym Orya Taml Telu 1329 | Deva Nand 1330 | Deva Shrd 1331 | Deva Taml 1332 | Devanagari [Deva] 1333 | Dogra [Dogr] 1334 | Duployan [Dupl] 1335 | 
Egyptian_Hieroglyphs [Egyp] 1336 | Elbasan [Elba] 1337 | Elymaic [Elym] 1338 | Ethiopic [Ethi] 1339 | Geor Latn 1340 | Georgian [Geor] 1341 | Glagolitic [Glag] 1342 | Gothic [Goth] 1343 | Gran Taml 1344 | Grantha [Gran] 1345 | Greek [Grek] 1346 | Gujarati [Gujr] 1347 | Gujr Khoj 1348 | Gunjala_Gondi [Gong] 1349 | Gurmukhi [Guru] 1350 | Guru Mult 1351 | Han [Hani] 1352 | Hangul [Hang] 1353 | Hani Hira Kana 1354 | Hanifi_Rohingya [Rohg] 1355 | Hanunoo [Hano] 1356 | Hatran [Hatr] 1357 | Hebrew [Hebr] 1358 | Hira Kana 1359 | Hiragana [Hira] 1360 | Imperial_Aramaic [Armi] 1361 | Inherited [Qaai, Zinh] 1362 | Inscriptional_Pahlavi [Phli] 1363 | Inscriptional_Parthian [Prti] 1364 | Javanese [Java] 1365 | Kaithi [Kthi] 1366 | Kali Latn Mymr 1367 | Kannada [Knda] 1368 | Katakana [Kana] 1369 | Kayah_Li [Kali] 1370 | Kharoshthi [Khar] 1371 | Khmer [Khmr] 1372 | Khojki [Khoj] 1373 | Khudawadi [Sind] 1374 | Knda Nand 1375 | Lao [Laoo] 1376 | Latin [Latn] 1377 | Latn Mong 1378 | Lepcha [Lepc] 1379 | Limbu [Limb] 1380 | Linear_A [Lina] 1381 | Linear_B [Linb] 1382 | Lisu 1383 | Lycian [Lyci] 1384 | Lydian [Lydi] 1385 | Mahajani [Mahj] 1386 | Makasar [Maka] 1387 | Malayalam [Mlym] 1388 | Mandaic [Mand] 1389 | Manichaean [Mani] 1390 | Marchen [Marc] 1391 | Masaram_Gondi [Gonm] 1392 | Medefaidrin [Medf] 1393 | Meetei_Mayek [Mtei] 1394 | Mende_Kikakui [Mend] 1395 | Meroitic_Cursive [Merc] 1396 | Meroitic_Hieroglyphs [Mero] 1397 | Miao [Plrd] 1398 | Modi 1399 | Mong Phag 1400 | Mongolian [Mong] 1401 | Mro [Mroo] 1402 | Multani [Mult] 1403 | Myanmar [Mymr] 1404 | Nabataean [Nbat] 1405 | Nandinagari [Nand] 1406 | Newa 1407 | New_Tai_Lue [Talu] 1408 | Nko [Nkoo] 1409 | Nushu [Nshu] 1410 | Nyiakeng_Puachue_Hmong [Hmnp] 1411 | Ogham [Ogam] 1412 | Old_Hungarian [Hung] 1413 | Old_Italic [Ital] 1414 | Old_North_Arabian [Narb] 1415 | Old_Permic [Perm] 1416 | Old_Persian [Xpeo] 1417 | Old_Sogdian [Sogo] 1418 | Old_South_Arabian [Sarb] 1419 | Old_Turkic [Orkh] 1420 | Ol_Chiki [Olck] 1421 | Oriya 
[Orya] 1422 | Osage [Osge] 1423 | Osmanya [Osma] 1424 | Pahawh_Hmong [Hmng] 1425 | Palmyrene [Palm] 1426 | Pau_Cin_Hau [Pauc] 1427 | Phags_Pa [Phag] 1428 | Phoenician [Phnx] 1429 | Psalter_Pahlavi [Phlp] 1430 | Rejang [Rjng] 1431 | Runic [Runr] 1432 | Samaritan [Samr] 1433 | Saurashtra [Saur] 1434 | Sharada [Shrd] 1435 | Shavian [Shaw] 1436 | Siddham [Sidd] 1437 | SignWriting [Sgnw] 1438 | Sinhala [Sinh] 1439 | Sogdian [Sogd] 1440 | Sora_Sompeng [Sora] 1441 | Soyombo [Soyo] 1442 | Sundanese [Sund] 1443 | Syloti_Nagri [Sylo] 1444 | Syriac [Syrc] 1445 | Tagalog [Tglg] 1446 | Tagbanwa [Tagb] 1447 | Tai_Le [Tale] 1448 | Tai_Tham [Lana] 1449 | Tai_Viet [Tavt] 1450 | Takri [Takr] 1451 | Tamil [Taml] 1452 | Tangut [Tang] 1453 | Telugu [Telu] 1454 | Thaana [Thaa] 1455 | Thai 1456 | Tibetan [Tibt] 1457 | Tifinagh [Tfng] 1458 | Tirhuta [Tirh] 1459 | Ugaritic [Ugar] 1460 | Unknown [Zzzz] 1461 | Vai [Vaii] 1462 | Wancho [Wcho] 1463 | Warang_Citi [Wara] 1464 | Yi [Yiii] 1465 | Zanabazar_Square [Zanb] 1466 | 1467 | Sentence_Break [SB] 1468 | ATerm [AT] 1469 | Close [CL] 1470 | CR 1471 | Extend [EX] 1472 | Format [FO] 1473 | LF 1474 | Lower [LO] 1475 | Numeric [NU] 1476 | OLetter [LE] 1477 | Other [XX] 1478 | SContinue [SC] 1479 | Sep [SE] 1480 | Sp 1481 | STerm [ST] 1482 | Upper [UP] 1483 | 1484 | Sentence_Terminal [STerm] 1485 | No [F, False, N] 1486 | Yes [T, True, Y] 1487 | 1488 | Soft_Dotted [SD] 1489 | No [F, False, N] 1490 | Yes [T, True, Y] 1491 | 1492 | Terminal_Punctuation [Term] 1493 | No [F, False, N] 1494 | Yes [T, True, Y] 1495 | 1496 | Unified_Ideograph [UIdeo] 1497 | No [F, False, N] 1498 | Yes [T, True, Y] 1499 | 1500 | Uppercase [Upper] 1501 | No [F, False, N] 1502 | Yes [T, True, Y] 1503 | 1504 | Variation_Selector [VS] 1505 | No [F, False, N] 1506 | Yes [T, True, Y] 1507 | 1508 | White_Space [space, WSpace] 1509 | No [F, False, N] 1510 | Yes [T, True, Y] 1511 | 1512 | Word 1513 | No [F, False, N] 1514 | Yes [T, True, Y] 1515 | 1516 | Word_Break [WB] 1517 | 
ALetter [LE] 1518 | CR 1519 | Double_Quote [DQ] 1520 | Extend 1521 | ExtendNumLet [EX] 1522 | E_Base [EB] 1523 | E_Base_GAZ [EBG] 1524 | E_Modifier [EM] 1525 | Format [FO] 1526 | Glue_After_Zwj [GAZ] 1527 | Hebrew_Letter [HL] 1528 | Katakana [KA] 1529 | LF 1530 | MidLetter [ML] 1531 | MidNum [MN] 1532 | MidNumLet [MB] 1533 | Newline [NL] 1534 | Numeric [NU] 1535 | Other [XX] 1536 | Regional_Indicator [RI] 1537 | Single_Quote [SQ] 1538 | WSegSpace 1539 | ZWJ 1540 | 1541 | XDigit 1542 | No [F, False, N] 1543 | Yes [T, True, Y] 1544 | 1545 | XID_Continue [XIDC] 1546 | No [F, False, N] 1547 | Yes [T, True, Y] 1548 | 1549 | XID_Start [XIDS] 1550 | No [F, False, N] 1551 | Yes [T, True, Y] 1552 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools > 61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "regex" 7 | version = "2025.5.18" 8 | description = "Alternative regular expression module, to replace re." 
9 | readme = "README.rst" 10 | authors = [ 11 | {name = "Matthew Barnett", email = "regex@mrabarnett.plus.com"}, 12 | ] 13 | license = "Apache-2.0" 14 | 15 | classifiers = [ 16 | "Development Status :: 5 - Production/Stable", 17 | "Intended Audience :: Developers", 18 | "Operating System :: OS Independent", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Programming Language :: Python :: 3.13", 24 | "Topic :: Scientific/Engineering :: Information Analysis", 25 | "Topic :: Software Development :: Libraries :: Python Modules", 26 | "Topic :: Text Processing", 27 | "Topic :: Text Processing :: General", 28 | ] 29 | 30 | requires-python = ">= 3.9" 31 | 32 | [project.urls] 33 | Homepage = "https://github.com/mrabarnett/mrab-regex" 34 | 35 | [tool.setuptools] 36 | package-dir = {regex = "regex_3"} 37 | py-modules = [ 38 | "regex.__init__", 39 | "regex.regex", 40 | "regex._regex_core", 41 | "regex.test_regex", 42 | ] 43 | -------------------------------------------------------------------------------- /regex_3/__init__.py: -------------------------------------------------------------------------------- 1 | from .regex import * 2 | from . import regex 3 | __all__ = regex.__all__ 4 | -------------------------------------------------------------------------------- /regex_3/_regex.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Secret Labs' Regular Expression Engine 3 | * 4 | * regular expression matching engine 5 | * 6 | * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. 7 | * 8 | * NOTE: This file is generated by regex.py. If you need 9 | * to change anything in here, edit regex.py and run it. 10 | * 11 | * 2010-01-16 mrab Re-written 12 | */ 13 | 14 | /* Supports Unicode version 12.1.0. 
*/ 15 | 16 | #define RE_MAGIC 20100116 17 | 18 | #include "_regex_unicode.h" 19 | 20 | /* Operators. */ 21 | #define RE_OP_FAILURE 0 22 | #define RE_OP_SUCCESS 1 23 | #define RE_OP_ANY 2 24 | #define RE_OP_ANY_ALL 3 25 | #define RE_OP_ANY_ALL_REV 4 26 | #define RE_OP_ANY_REV 5 27 | #define RE_OP_ANY_U 6 28 | #define RE_OP_ANY_U_REV 7 29 | #define RE_OP_ATOMIC 8 30 | #define RE_OP_BOUNDARY 9 31 | #define RE_OP_BRANCH 10 32 | #define RE_OP_CALL_REF 11 33 | #define RE_OP_CHARACTER 12 34 | #define RE_OP_CHARACTER_IGN 13 35 | #define RE_OP_CHARACTER_IGN_REV 14 36 | #define RE_OP_CHARACTER_REV 15 37 | #define RE_OP_CONDITIONAL 16 38 | #define RE_OP_DEFAULT_BOUNDARY 17 39 | #define RE_OP_DEFAULT_END_OF_WORD 18 40 | #define RE_OP_DEFAULT_START_OF_WORD 19 41 | #define RE_OP_END 20 42 | #define RE_OP_END_OF_LINE 21 43 | #define RE_OP_END_OF_LINE_U 22 44 | #define RE_OP_END_OF_STRING 23 45 | #define RE_OP_END_OF_STRING_LINE 24 46 | #define RE_OP_END_OF_STRING_LINE_U 25 47 | #define RE_OP_END_OF_WORD 26 48 | #define RE_OP_FUZZY 27 49 | #define RE_OP_GRAPHEME_BOUNDARY 28 50 | #define RE_OP_GREEDY_REPEAT 29 51 | #define RE_OP_GROUP 30 52 | #define RE_OP_GROUP_CALL 31 53 | #define RE_OP_GROUP_EXISTS 32 54 | #define RE_OP_KEEP 33 55 | #define RE_OP_LAZY_REPEAT 34 56 | #define RE_OP_LOOKAROUND 35 57 | #define RE_OP_NEXT 36 58 | #define RE_OP_PROPERTY 37 59 | #define RE_OP_PROPERTY_IGN 38 60 | #define RE_OP_PROPERTY_IGN_REV 39 61 | #define RE_OP_PROPERTY_REV 40 62 | #define RE_OP_PRUNE 41 63 | #define RE_OP_RANGE 42 64 | #define RE_OP_RANGE_IGN 43 65 | #define RE_OP_RANGE_IGN_REV 44 66 | #define RE_OP_RANGE_REV 45 67 | #define RE_OP_REF_GROUP 46 68 | #define RE_OP_REF_GROUP_FLD 47 69 | #define RE_OP_REF_GROUP_FLD_REV 48 70 | #define RE_OP_REF_GROUP_IGN 49 71 | #define RE_OP_REF_GROUP_IGN_REV 50 72 | #define RE_OP_REF_GROUP_REV 51 73 | #define RE_OP_SEARCH_ANCHOR 52 74 | #define RE_OP_SET_DIFF 53 75 | #define RE_OP_SET_DIFF_IGN 54 76 | #define RE_OP_SET_DIFF_IGN_REV 55 77 | #define 
RE_OP_SET_DIFF_REV 56 78 | #define RE_OP_SET_INTER 57 79 | #define RE_OP_SET_INTER_IGN 58 80 | #define RE_OP_SET_INTER_IGN_REV 59 81 | #define RE_OP_SET_INTER_REV 60 82 | #define RE_OP_SET_SYM_DIFF 61 83 | #define RE_OP_SET_SYM_DIFF_IGN 62 84 | #define RE_OP_SET_SYM_DIFF_IGN_REV 63 85 | #define RE_OP_SET_SYM_DIFF_REV 64 86 | #define RE_OP_SET_UNION 65 87 | #define RE_OP_SET_UNION_IGN 66 88 | #define RE_OP_SET_UNION_IGN_REV 67 89 | #define RE_OP_SET_UNION_REV 68 90 | #define RE_OP_SKIP 69 91 | #define RE_OP_START_OF_LINE 70 92 | #define RE_OP_START_OF_LINE_U 71 93 | #define RE_OP_START_OF_STRING 72 94 | #define RE_OP_START_OF_WORD 73 95 | #define RE_OP_STRING 74 96 | #define RE_OP_STRING_FLD 75 97 | #define RE_OP_STRING_FLD_REV 76 98 | #define RE_OP_STRING_IGN 77 99 | #define RE_OP_STRING_IGN_REV 78 100 | #define RE_OP_STRING_REV 79 101 | #define RE_OP_FUZZY_EXT 80 102 | #define RE_OP_BODY_END 81 103 | #define RE_OP_BODY_START 82 104 | #define RE_OP_END_ATOMIC 83 105 | #define RE_OP_END_CONDITIONAL 84 106 | #define RE_OP_END_FUZZY 85 107 | #define RE_OP_END_GREEDY_REPEAT 86 108 | #define RE_OP_END_GROUP 87 109 | #define RE_OP_END_LAZY_REPEAT 88 110 | #define RE_OP_END_LOOKAROUND 89 111 | #define RE_OP_FUZZY_INSERT 90 112 | #define RE_OP_GREEDY_REPEAT_ONE 91 113 | #define RE_OP_GROUP_RETURN 92 114 | #define RE_OP_LAZY_REPEAT_ONE 93 115 | #define RE_OP_MATCH_BODY 94 116 | #define RE_OP_MATCH_TAIL 95 117 | #define RE_OP_START_GROUP 96 118 | #define RE_OP_TAIL_START 97 119 | 120 | char* re_op_text[] = { 121 | "RE_OP_FAILURE", 122 | "RE_OP_SUCCESS", 123 | "RE_OP_ANY", 124 | "RE_OP_ANY_ALL", 125 | "RE_OP_ANY_ALL_REV", 126 | "RE_OP_ANY_REV", 127 | "RE_OP_ANY_U", 128 | "RE_OP_ANY_U_REV", 129 | "RE_OP_ATOMIC", 130 | "RE_OP_BOUNDARY", 131 | "RE_OP_BRANCH", 132 | "RE_OP_CALL_REF", 133 | "RE_OP_CHARACTER", 134 | "RE_OP_CHARACTER_IGN", 135 | "RE_OP_CHARACTER_IGN_REV", 136 | "RE_OP_CHARACTER_REV", 137 | "RE_OP_CONDITIONAL", 138 | "RE_OP_DEFAULT_BOUNDARY", 139 | 
"RE_OP_DEFAULT_END_OF_WORD", 140 | "RE_OP_DEFAULT_START_OF_WORD", 141 | "RE_OP_END", 142 | "RE_OP_END_OF_LINE", 143 | "RE_OP_END_OF_LINE_U", 144 | "RE_OP_END_OF_STRING", 145 | "RE_OP_END_OF_STRING_LINE", 146 | "RE_OP_END_OF_STRING_LINE_U", 147 | "RE_OP_END_OF_WORD", 148 | "RE_OP_FUZZY", 149 | "RE_OP_GRAPHEME_BOUNDARY", 150 | "RE_OP_GREEDY_REPEAT", 151 | "RE_OP_GROUP", 152 | "RE_OP_GROUP_CALL", 153 | "RE_OP_GROUP_EXISTS", 154 | "RE_OP_KEEP", 155 | "RE_OP_LAZY_REPEAT", 156 | "RE_OP_LOOKAROUND", 157 | "RE_OP_NEXT", 158 | "RE_OP_PROPERTY", 159 | "RE_OP_PROPERTY_IGN", 160 | "RE_OP_PROPERTY_IGN_REV", 161 | "RE_OP_PROPERTY_REV", 162 | "RE_OP_PRUNE", 163 | "RE_OP_RANGE", 164 | "RE_OP_RANGE_IGN", 165 | "RE_OP_RANGE_IGN_REV", 166 | "RE_OP_RANGE_REV", 167 | "RE_OP_REF_GROUP", 168 | "RE_OP_REF_GROUP_FLD", 169 | "RE_OP_REF_GROUP_FLD_REV", 170 | "RE_OP_REF_GROUP_IGN", 171 | "RE_OP_REF_GROUP_IGN_REV", 172 | "RE_OP_REF_GROUP_REV", 173 | "RE_OP_SEARCH_ANCHOR", 174 | "RE_OP_SET_DIFF", 175 | "RE_OP_SET_DIFF_IGN", 176 | "RE_OP_SET_DIFF_IGN_REV", 177 | "RE_OP_SET_DIFF_REV", 178 | "RE_OP_SET_INTER", 179 | "RE_OP_SET_INTER_IGN", 180 | "RE_OP_SET_INTER_IGN_REV", 181 | "RE_OP_SET_INTER_REV", 182 | "RE_OP_SET_SYM_DIFF", 183 | "RE_OP_SET_SYM_DIFF_IGN", 184 | "RE_OP_SET_SYM_DIFF_IGN_REV", 185 | "RE_OP_SET_SYM_DIFF_REV", 186 | "RE_OP_SET_UNION", 187 | "RE_OP_SET_UNION_IGN", 188 | "RE_OP_SET_UNION_IGN_REV", 189 | "RE_OP_SET_UNION_REV", 190 | "RE_OP_SKIP", 191 | "RE_OP_START_OF_LINE", 192 | "RE_OP_START_OF_LINE_U", 193 | "RE_OP_START_OF_STRING", 194 | "RE_OP_START_OF_WORD", 195 | "RE_OP_STRING", 196 | "RE_OP_STRING_FLD", 197 | "RE_OP_STRING_FLD_REV", 198 | "RE_OP_STRING_IGN", 199 | "RE_OP_STRING_IGN_REV", 200 | "RE_OP_STRING_REV", 201 | "RE_OP_FUZZY_EXT", 202 | "RE_OP_BODY_END", 203 | "RE_OP_BODY_START", 204 | "RE_OP_END_ATOMIC", 205 | "RE_OP_END_CONDITIONAL", 206 | "RE_OP_END_FUZZY", 207 | "RE_OP_END_GREEDY_REPEAT", 208 | "RE_OP_END_GROUP", 209 | "RE_OP_END_LAZY_REPEAT", 210 | 
"RE_OP_END_LOOKAROUND", 211 | "RE_OP_FUZZY_INSERT", 212 | "RE_OP_GREEDY_REPEAT_ONE", 213 | "RE_OP_GROUP_RETURN", 214 | "RE_OP_LAZY_REPEAT_ONE", 215 | "RE_OP_MATCH_BODY", 216 | "RE_OP_MATCH_TAIL", 217 | "RE_OP_START_GROUP", 218 | "RE_OP_TAIL_START" 219 | }; 220 | 221 | #define RE_FLAG_ASCII 0x80 222 | #define RE_FLAG_BESTMATCH 0x1000 223 | #define RE_FLAG_DEBUG 0x200 224 | #define RE_FLAG_DOTALL 0x10 225 | #define RE_FLAG_ENHANCEMATCH 0x8000 226 | #define RE_FLAG_FULLCASE 0x4000 227 | #define RE_FLAG_IGNORECASE 0x2 228 | #define RE_FLAG_LOCALE 0x4 229 | #define RE_FLAG_MULTILINE 0x8 230 | #define RE_FLAG_POSIX 0x10000 231 | #define RE_FLAG_REVERSE 0x400 232 | #define RE_FLAG_TEMPLATE 0x1 233 | #define RE_FLAG_UNICODE 0x20 234 | #define RE_FLAG_VERBOSE 0x40 235 | #define RE_FLAG_VERSION0 0x2000 236 | #define RE_FLAG_VERSION1 0x100 237 | #define RE_FLAG_WORD 0x800 238 | -------------------------------------------------------------------------------- /regex_3/_regex_unicode.h: -------------------------------------------------------------------------------- 1 | typedef unsigned char RE_UINT8; 2 | typedef signed char RE_INT8; 3 | typedef unsigned short RE_UINT16; 4 | typedef signed short RE_INT16; 5 | typedef unsigned int RE_UINT32; 6 | typedef signed int RE_INT32; 7 | 8 | typedef unsigned char BOOL; 9 | #if !defined(FALSE) || !defined(TRUE) 10 | #define FALSE 0 11 | #define TRUE 1 12 | #endif 13 | 14 | #define RE_ASCII_MAX 0x7F 15 | #define RE_LOCALE_MAX 0xFF 16 | 17 | #define RE_MAX_CASES 4 18 | #define RE_MAX_FOLDED 3 19 | #define RE_MAX_SCX 23 20 | 21 | typedef struct RE_Property { 22 | RE_UINT16 name; 23 | RE_UINT8 id; 24 | RE_UINT8 value_set; 25 | } RE_Property; 26 | 27 | typedef struct RE_PropertyValue { 28 | RE_UINT16 name; 29 | RE_UINT8 value_set; 30 | RE_UINT16 id; 31 | } RE_PropertyValue; 32 | 33 | typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 codepoint); 34 | 35 | #define RE_PROP_GC 0x1E 36 | #define RE_PROP_CASED 0xA 37 | #define RE_PROP_UPPERCASE 0x5C 
38 | #define RE_PROP_LOWERCASE 0x38 39 | #define RE_PROP_SCX 0x56 40 | 41 | #define RE_PROP_C 30 42 | #define RE_PROP_L 31 43 | #define RE_PROP_M 32 44 | #define RE_PROP_N 33 45 | #define RE_PROP_P 34 46 | #define RE_PROP_S 35 47 | #define RE_PROP_Z 36 48 | #define RE_PROP_ASSIGNED 37 49 | #define RE_PROP_CASEDLETTER 38 50 | 51 | #define RE_PROP_CN 0 52 | #define RE_PROP_CC 1 53 | #define RE_PROP_ZS 2 54 | #define RE_PROP_PO 3 55 | #define RE_PROP_SC 4 56 | #define RE_PROP_PS 5 57 | #define RE_PROP_PE 6 58 | #define RE_PROP_SM 7 59 | #define RE_PROP_PD 8 60 | #define RE_PROP_ND 9 61 | #define RE_PROP_LU 10 62 | #define RE_PROP_SK 11 63 | #define RE_PROP_PC 12 64 | #define RE_PROP_LL 13 65 | #define RE_PROP_SO 14 66 | #define RE_PROP_LO 15 67 | #define RE_PROP_PI 16 68 | #define RE_PROP_CF 17 69 | #define RE_PROP_NO 18 70 | #define RE_PROP_PF 19 71 | #define RE_PROP_LT 20 72 | #define RE_PROP_LM 21 73 | #define RE_PROP_MN 22 74 | #define RE_PROP_ME 23 75 | #define RE_PROP_MC 24 76 | #define RE_PROP_NL 25 77 | #define RE_PROP_ZL 26 78 | #define RE_PROP_ZP 27 79 | #define RE_PROP_CS 28 80 | #define RE_PROP_CO 29 81 | 82 | #define RE_PROP_C_MASK 0x30020003 83 | #define RE_PROP_L_MASK 0x0030A400 84 | #define RE_PROP_M_MASK 0x01C00000 85 | #define RE_PROP_N_MASK 0x02040200 86 | #define RE_PROP_P_MASK 0x00091168 87 | #define RE_PROP_S_MASK 0x00004890 88 | #define RE_PROP_Z_MASK 0x0C000004 89 | 90 | #define RE_PROP_ALNUM 0x010001 91 | #define RE_PROP_ALPHA 0x000001 92 | #define RE_PROP_ANY 0x020001 93 | #define RE_PROP_ASCII 0x080001 94 | #define RE_PROP_BLANK 0x070001 95 | #define RE_PROP_CNTRL 0x1E0001 96 | #define RE_PROP_DIGIT 0x1E0009 97 | #define RE_PROP_GRAPH 0x1F0001 98 | #define RE_PROP_LOWER 0x380001 99 | #define RE_PROP_PRINT 0x510001 100 | #define RE_PROP_SPACE 0x5F0001 101 | #define RE_PROP_UPPER 0x5C0001 102 | #define RE_PROP_WORD 0x600001 103 | #define RE_PROP_XDIGIT 0x620001 104 | #define RE_PROP_POSIX_ALNUM 0x4C0001 105 | #define RE_PROP_POSIX_DIGIT 
0x4D0001 106 | #define RE_PROP_POSIX_PUNCT 0x4E0001 107 | #define RE_PROP_POSIX_XDIGIT 0x4F0001 108 | 109 | #define RE_WBREAK_OTHER 0 110 | #define RE_WBREAK_LF 1 111 | #define RE_WBREAK_NEWLINE 2 112 | #define RE_WBREAK_CR 3 113 | #define RE_WBREAK_WSEGSPACE 4 114 | #define RE_WBREAK_DOUBLEQUOTE 5 115 | #define RE_WBREAK_SINGLEQUOTE 6 116 | #define RE_WBREAK_MIDNUM 7 117 | #define RE_WBREAK_MIDNUMLET 8 118 | #define RE_WBREAK_NUMERIC 9 119 | #define RE_WBREAK_MIDLETTER 10 120 | #define RE_WBREAK_ALETTER 11 121 | #define RE_WBREAK_EXTENDNUMLET 12 122 | #define RE_WBREAK_FORMAT 13 123 | #define RE_WBREAK_EXTEND 14 124 | #define RE_WBREAK_HEBREWLETTER 15 125 | #define RE_WBREAK_ZWJ 16 126 | #define RE_WBREAK_KATAKANA 17 127 | #define RE_WBREAK_REGIONALINDICATOR 18 128 | #define RE_WBREAK_EBASE 19 129 | #define RE_WBREAK_EBASEGAZ 20 130 | #define RE_WBREAK_EMODIFIER 21 131 | #define RE_WBREAK_GLUEAFTERZWJ 22 132 | 133 | #define RE_GBREAK_OTHER 0 134 | #define RE_GBREAK_CONTROL 1 135 | #define RE_GBREAK_LF 2 136 | #define RE_GBREAK_CR 3 137 | #define RE_GBREAK_EXTEND 4 138 | #define RE_GBREAK_PREPEND 5 139 | #define RE_GBREAK_SPACINGMARK 6 140 | #define RE_GBREAK_L 7 141 | #define RE_GBREAK_V 8 142 | #define RE_GBREAK_T 9 143 | #define RE_GBREAK_ZWJ 10 144 | #define RE_GBREAK_LV 11 145 | #define RE_GBREAK_LVT 12 146 | #define RE_GBREAK_REGIONALINDICATOR 13 147 | #define RE_GBREAK_EBASE 14 148 | #define RE_GBREAK_EBASEGAZ 15 149 | #define RE_GBREAK_EMODIFIER 16 150 | #define RE_GBREAK_GLUEAFTERZWJ 17 151 | 152 | #define RE_LBREAK_UNKNOWN 0 153 | #define RE_LBREAK_COMBININGMARK 1 154 | #define RE_LBREAK_BREAKAFTER 2 155 | #define RE_LBREAK_LINEFEED 3 156 | #define RE_LBREAK_MANDATORYBREAK 4 157 | #define RE_LBREAK_CARRIAGERETURN 5 158 | #define RE_LBREAK_SPACE 6 159 | #define RE_LBREAK_EXCLAMATION 7 160 | #define RE_LBREAK_QUOTATION 8 161 | #define RE_LBREAK_ALPHABETIC 9 162 | #define RE_LBREAK_PREFIXNUMERIC 10 163 | #define RE_LBREAK_POSTFIXNUMERIC 11 164 | #define 
RE_LBREAK_OPENPUNCTUATION 12 165 | #define RE_LBREAK_CLOSEPARENTHESIS 13 166 | #define RE_LBREAK_INFIXNUMERIC 14 167 | #define RE_LBREAK_HYPHEN 15 168 | #define RE_LBREAK_BREAKSYMBOLS 16 169 | #define RE_LBREAK_NUMERIC 17 170 | #define RE_LBREAK_CLOSEPUNCTUATION 18 171 | #define RE_LBREAK_NEXTLINE 19 172 | #define RE_LBREAK_GLUE 20 173 | #define RE_LBREAK_AMBIGUOUS 21 174 | #define RE_LBREAK_BREAKBEFORE 22 175 | #define RE_LBREAK_HEBREWLETTER 23 176 | #define RE_LBREAK_COMPLEXCONTEXT 24 177 | #define RE_LBREAK_JL 25 178 | #define RE_LBREAK_JV 26 179 | #define RE_LBREAK_JT 27 180 | #define RE_LBREAK_NONSTARTER 28 181 | #define RE_LBREAK_AKSARA 29 182 | #define RE_LBREAK_VIRAMA 30 183 | #define RE_LBREAK_AKSARASTART 31 184 | #define RE_LBREAK_IDEOGRAPHIC 32 185 | #define RE_LBREAK_VIRAMAFINAL 33 186 | #define RE_LBREAK_ZWSPACE 34 187 | #define RE_LBREAK_ZWJ 35 188 | #define RE_LBREAK_BREAKBOTH 36 189 | #define RE_LBREAK_INSEPARABLE 37 190 | #define RE_LBREAK_WORDJOINER 38 191 | #define RE_LBREAK_EBASE 39 192 | #define RE_LBREAK_CONDITIONALJAPANESESTARTER 40 193 | #define RE_LBREAK_H2 41 194 | #define RE_LBREAK_H3 42 195 | #define RE_LBREAK_SURROGATE 43 196 | #define RE_LBREAK_CONTINGENTBREAK 44 197 | #define RE_LBREAK_AKSARAPREBASE 45 198 | #define RE_LBREAK_REGIONALINDICATOR 46 199 | #define RE_LBREAK_EMODIFIER 47 200 | 201 | #define RE_INCB_NONE 0 202 | #define RE_INCB_EXTEND 1 203 | #define RE_INCB_CONSONANT 2 204 | #define RE_INCB_LINKER 3 205 | 206 | extern char* re_strings[1530]; 207 | extern RE_Property re_properties[185]; 208 | extern RE_PropertyValue re_property_values[1680]; 209 | extern RE_UINT16 re_expand_on_folding[104]; 210 | extern RE_GetPropertyFunc re_get_property[101]; 211 | 212 | RE_UINT32 re_get_alphabetic(RE_UINT32 codepoint); 213 | RE_UINT32 re_get_alphanumeric(RE_UINT32 codepoint); 214 | RE_UINT32 re_get_any(RE_UINT32 codepoint); 215 | RE_UINT32 re_get_ascii_hex_digit(RE_UINT32 codepoint); 216 | RE_UINT32 re_get_bidi_class(RE_UINT32 codepoint); 
217 | RE_UINT32 re_get_bidi_control(RE_UINT32 codepoint); 218 | RE_UINT32 re_get_bidi_mirrored(RE_UINT32 codepoint); 219 | RE_UINT32 re_get_blank(RE_UINT32 codepoint); 220 | RE_UINT32 re_get_block(RE_UINT32 codepoint); 221 | RE_UINT32 re_get_canonical_combining_class(RE_UINT32 codepoint); 222 | RE_UINT32 re_get_cased(RE_UINT32 codepoint); 223 | RE_UINT32 re_get_case_ignorable(RE_UINT32 codepoint); 224 | RE_UINT32 re_get_changes_when_casefolded(RE_UINT32 codepoint); 225 | RE_UINT32 re_get_changes_when_casemapped(RE_UINT32 codepoint); 226 | RE_UINT32 re_get_changes_when_lowercased(RE_UINT32 codepoint); 227 | RE_UINT32 re_get_changes_when_titlecased(RE_UINT32 codepoint); 228 | RE_UINT32 re_get_changes_when_uppercased(RE_UINT32 codepoint); 229 | RE_UINT32 re_get_dash(RE_UINT32 codepoint); 230 | RE_UINT32 re_get_decomposition_type(RE_UINT32 codepoint); 231 | RE_UINT32 re_get_default_ignorable_code_point(RE_UINT32 codepoint); 232 | RE_UINT32 re_get_deprecated(RE_UINT32 codepoint); 233 | RE_UINT32 re_get_diacritic(RE_UINT32 codepoint); 234 | RE_UINT32 re_get_east_asian_width(RE_UINT32 codepoint); 235 | RE_UINT32 re_get_emoji(RE_UINT32 codepoint); 236 | RE_UINT32 re_get_emoji_component(RE_UINT32 codepoint); 237 | RE_UINT32 re_get_emoji_modifier(RE_UINT32 codepoint); 238 | RE_UINT32 re_get_emoji_modifier_base(RE_UINT32 codepoint); 239 | RE_UINT32 re_get_emoji_presentation(RE_UINT32 codepoint); 240 | RE_UINT32 re_get_extended_pictographic(RE_UINT32 codepoint); 241 | RE_UINT32 re_get_extender(RE_UINT32 codepoint); 242 | RE_UINT32 re_get_general_category(RE_UINT32 codepoint); 243 | RE_UINT32 re_get_graph(RE_UINT32 codepoint); 244 | RE_UINT32 re_get_grapheme_base(RE_UINT32 codepoint); 245 | RE_UINT32 re_get_grapheme_cluster_break(RE_UINT32 codepoint); 246 | RE_UINT32 re_get_grapheme_extend(RE_UINT32 codepoint); 247 | RE_UINT32 re_get_grapheme_link(RE_UINT32 codepoint); 248 | RE_UINT32 re_get_hangul_syllable_type(RE_UINT32 codepoint); 249 | RE_UINT32 re_get_hex_digit(RE_UINT32 
codepoint); 250 | RE_UINT32 re_get_horiz_space(RE_UINT32 codepoint); 251 | RE_UINT32 re_get_hyphen(RE_UINT32 codepoint); 252 | RE_UINT32 re_get_id_compat_math_continue(RE_UINT32 codepoint); 253 | RE_UINT32 re_get_id_compat_math_start(RE_UINT32 codepoint); 254 | RE_UINT32 re_get_id_continue(RE_UINT32 codepoint); 255 | RE_UINT32 re_get_ideographic(RE_UINT32 codepoint); 256 | RE_UINT32 re_get_ids_binary_operator(RE_UINT32 codepoint); 257 | RE_UINT32 re_get_id_start(RE_UINT32 codepoint); 258 | RE_UINT32 re_get_ids_trinary_operator(RE_UINT32 codepoint); 259 | RE_UINT32 re_get_ids_unary_operator(RE_UINT32 codepoint); 260 | RE_UINT32 re_get_indic_conjunct_break(RE_UINT32 codepoint); 261 | RE_UINT32 re_get_indic_positional_category(RE_UINT32 codepoint); 262 | RE_UINT32 re_get_indic_syllabic_category(RE_UINT32 codepoint); 263 | RE_UINT32 re_get_join_control(RE_UINT32 codepoint); 264 | RE_UINT32 re_get_joining_group(RE_UINT32 codepoint); 265 | RE_UINT32 re_get_joining_type(RE_UINT32 codepoint); 266 | RE_UINT32 re_get_line_break(RE_UINT32 codepoint); 267 | RE_UINT32 re_get_logical_order_exception(RE_UINT32 codepoint); 268 | RE_UINT32 re_get_lowercase(RE_UINT32 codepoint); 269 | RE_UINT32 re_get_math(RE_UINT32 codepoint); 270 | RE_UINT32 re_get_modifier_combining_mark(RE_UINT32 codepoint); 271 | RE_UINT32 re_get_nfc_quick_check(RE_UINT32 codepoint); 272 | RE_UINT32 re_get_nfd_quick_check(RE_UINT32 codepoint); 273 | RE_UINT32 re_get_nfkc_quick_check(RE_UINT32 codepoint); 274 | RE_UINT32 re_get_nfkd_quick_check(RE_UINT32 codepoint); 275 | RE_UINT32 re_get_noncharacter_code_point(RE_UINT32 codepoint); 276 | RE_UINT32 re_get_numeric_type(RE_UINT32 codepoint); 277 | RE_UINT32 re_get_numeric_value(RE_UINT32 codepoint); 278 | RE_UINT32 re_get_other_alphabetic(RE_UINT32 codepoint); 279 | RE_UINT32 re_get_other_default_ignorable_code_point(RE_UINT32 codepoint); 280 | RE_UINT32 re_get_other_grapheme_extend(RE_UINT32 codepoint); 281 | RE_UINT32 re_get_other_id_continue(RE_UINT32 
codepoint); 282 | RE_UINT32 re_get_other_id_start(RE_UINT32 codepoint); 283 | RE_UINT32 re_get_other_lowercase(RE_UINT32 codepoint); 284 | RE_UINT32 re_get_other_math(RE_UINT32 codepoint); 285 | RE_UINT32 re_get_other_uppercase(RE_UINT32 codepoint); 286 | RE_UINT32 re_get_pattern_syntax(RE_UINT32 codepoint); 287 | RE_UINT32 re_get_pattern_white_space(RE_UINT32 codepoint); 288 | RE_UINT32 re_get_posix_alnum(RE_UINT32 codepoint); 289 | RE_UINT32 re_get_posix_digit(RE_UINT32 codepoint); 290 | RE_UINT32 re_get_posix_punct(RE_UINT32 codepoint); 291 | RE_UINT32 re_get_posix_xdigit(RE_UINT32 codepoint); 292 | RE_UINT32 re_get_prepended_concatenation_mark(RE_UINT32 codepoint); 293 | RE_UINT32 re_get_print(RE_UINT32 codepoint); 294 | RE_UINT32 re_get_quotation_mark(RE_UINT32 codepoint); 295 | RE_UINT32 re_get_radical(RE_UINT32 codepoint); 296 | RE_UINT32 re_get_regional_indicator(RE_UINT32 codepoint); 297 | RE_UINT32 re_get_script(RE_UINT32 codepoint); 298 | int re_get_script_extensions(RE_UINT32 codepoint, RE_UINT8* scripts); 299 | RE_UINT32 re_get_sentence_break(RE_UINT32 codepoint); 300 | RE_UINT32 re_get_sentence_terminal(RE_UINT32 codepoint); 301 | RE_UINT32 re_get_soft_dotted(RE_UINT32 codepoint); 302 | RE_UINT32 re_get_terminal_punctuation(RE_UINT32 codepoint); 303 | RE_UINT32 re_get_unified_ideograph(RE_UINT32 codepoint); 304 | RE_UINT32 re_get_uppercase(RE_UINT32 codepoint); 305 | RE_UINT32 re_get_variation_selector(RE_UINT32 codepoint); 306 | RE_UINT32 re_get_vert_space(RE_UINT32 codepoint); 307 | RE_UINT32 re_get_white_space(RE_UINT32 codepoint); 308 | RE_UINT32 re_get_word(RE_UINT32 codepoint); 309 | RE_UINT32 re_get_word_break(RE_UINT32 codepoint); 310 | RE_UINT32 re_get_xdigit(RE_UINT32 codepoint); 311 | RE_UINT32 re_get_xid_continue(RE_UINT32 codepoint); 312 | RE_UINT32 re_get_xid_start(RE_UINT32 codepoint); 313 | int re_get_all_cases(RE_UINT32 codepoint, RE_UINT32* cases); 314 | RE_UINT32 re_get_simple_case_folding(RE_UINT32 codepoint); 315 | int 
re_get_full_case_folding(RE_UINT32 codepoint, RE_UINT32* folded); 316 | -------------------------------------------------------------------------------- /regex_3/regex.py: -------------------------------------------------------------------------------- 1 | # 2 | # Secret Labs' Regular Expression Engine 3 | # 4 | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 5 | # 6 | # This version of the SRE library can be redistributed under CNRI's 7 | # Python 1.6 license. For any other use, please contact Secret Labs 8 | # AB (info@pythonware.com). 9 | # 10 | # Portions of this engine have been developed in cooperation with 11 | # CNRI. Hewlett-Packard provided funding for 1.6 integration and 12 | # other compatibility work. 13 | # 14 | # 2010-01-16 mrab Python front-end re-written and extended 15 | 16 | r"""Support for regular expressions (RE). 17 | 18 | This module provides regular expression matching operations similar to those 19 | found in Perl. It supports both 8-bit and Unicode strings; both the pattern and 20 | the strings being processed can contain null bytes and characters outside the 21 | US ASCII range. 22 | 23 | Regular expressions can contain both special and ordinary characters. Most 24 | ordinary characters, like "A", "a", or "0", are the simplest regular 25 | expressions; they simply match themselves. You can concatenate ordinary 26 | characters, so last matches the string 'last'. 27 | 28 | There are a few differences between the old (legacy) behaviour and the new 29 | (enhanced) behaviour, which are indicated by VERSION0 or VERSION1. 30 | 31 | The special characters are: 32 | "." Matches any character except a newline. 33 | "^" Matches the start of the string. 34 | "$" Matches the end of the string or just before the 35 | newline at the end of the string. 36 | "*" Matches 0 or more (greedy) repetitions of the preceding 37 | RE. Greedy means that it will match as many repetitions 38 | as possible. 
39 | "+" Matches 1 or more (greedy) repetitions of the preceding 40 | RE. 41 | "?" Matches 0 or 1 (greedy) of the preceding RE. 42 | *?,+?,?? Non-greedy versions of the previous three special 43 | characters. 44 | *+,++,?+ Possessive versions of the previous three special 45 | characters. 46 | {m,n} Matches from m to n repetitions of the preceding RE. 47 | {m,n}? Non-greedy version of the above. 48 | {m,n}+ Possessive version of the above. 49 | {...} Fuzzy matching constraints. 50 | "\\" Either escapes special characters or signals a special 51 | sequence. 52 | [...] Indicates a set of characters. A "^" as the first 53 | character indicates a complementing set. 54 | "|" A|B, creates an RE that will match either A or B. 55 | (...) Matches the RE inside the parentheses. The contents are 56 | captured and can be retrieved or matched later in the 57 | string. 58 | (?flags-flags) VERSION1: Sets/clears the flags for the remainder of 59 | the group or pattern; VERSION0: Sets the flags for the 60 | entire pattern. 61 | (?:...) Non-capturing version of regular parentheses. 62 | (?>...) Atomic non-capturing version of regular parentheses. 63 | (?flags-flags:...) Non-capturing version of regular parentheses with local 64 | flags. 65 | (?P<name>...) The substring matched by the group is accessible by 66 | name. 67 | (?<name>...) The substring matched by the group is accessible by 68 | name. 69 | (?P=name) Matches the text matched earlier by the group named 70 | name. 71 | (?#...) A comment; ignored. 72 | (?=...) Matches if ... matches next, but doesn't consume the 73 | string. 74 | (?!...) Matches if ... doesn't match next. 75 | (?<=...) Matches if preceded by .... 76 | (?<!...) Matches if not preceded by .... 77 | [… lines 77–135 missing from this extract …] 136 | \g<name> Matches the text matched by the group named name. 137 | \G Matches the empty string, but only at the position where 138 | the search started. 139 | \h Matches horizontal whitespace. 140 | \K Keeps only what follows for the entire match. 141 | \L Named list. The list is provided as a keyword argument. 
142 | \m Matches the empty string, but only at the start of a word. 143 | \M Matches the empty string, but only at the end of a word. 144 | \n Matches the newline character. 145 | \N{name} Matches the named character. 146 | \p{name=value} Matches the character if its property has the specified 147 | value. 148 | \P{name=value} Matches the character if its property hasn't the specified 149 | value. 150 | \r Matches the carriage-return character. 151 | \s Matches any whitespace character; equivalent to 152 | [ \t\n\r\f\v]. 153 | \S Matches any non-whitespace character; equivalent to [^\s]. 154 | \t Matches the tab character. 155 | \uXXXX Matches the Unicode codepoint with 4-digit hex code XXXX. 156 | \UXXXXXXXX Matches the Unicode codepoint with 8-digit hex code 157 | XXXXXXXX. 158 | \v Matches the vertical tab character. 159 | \w Matches any alphanumeric character; equivalent to 160 | [a-zA-Z0-9_] when matching a bytestring or a Unicode string 161 | with the ASCII flag, or the whole range of Unicode 162 | alphanumeric characters (letters plus digits plus 163 | underscore) when matching a Unicode string. With LOCALE, it 164 | will match the set [0-9_] plus characters defined as 165 | letters for the current locale. 166 | \W Matches the complement of \w; equivalent to [^\w]. 167 | \xXX Matches the character with 2-digit hex code XX. 168 | \X Matches a grapheme. 169 | \Z Matches only at the end of the string. 170 | \\ Matches a literal backslash. 171 | 172 | This module exports the following functions: 173 | match Match a regular expression pattern at the beginning of a string. 174 | fullmatch Match a regular expression pattern against all of a string. 175 | search Search a string for the presence of a pattern. 176 | sub Substitute occurrences of a pattern found in a string using a 177 | template string. 178 | subf Substitute occurrences of a pattern found in a string using a 179 | format string. 180 | subn Same as sub, but also return the number of substitutions made. 
181 | subfn Same as subf, but also return the number of substitutions made. 182 | split Split a string by the occurrences of a pattern. VERSION1: will 183 | split at zero-width match; VERSION0: won't split at zero-width 184 | match. 185 | splititer Return an iterator yielding the parts of a split string. 186 | findall Find all occurrences of a pattern in a string. 187 | finditer Return an iterator yielding a match object for each match. 188 | compile Compile a pattern into a Pattern object. 189 | purge Clear the regular expression cache. 190 | escape Backslash all non-alphanumerics or special characters in a 191 | string. 192 | 193 | Most of the functions support a concurrent parameter: if True, the GIL will be 194 | released during matching, allowing other Python threads to run concurrently. If 195 | the string changes during matching, the behaviour is undefined. This parameter 196 | is not needed when working on the builtin (immutable) string classes. 197 | 198 | Some of the functions in this module take flags as optional parameters. Most of 199 | these flags can also be set within an RE: 200 | A a ASCII Make \w, \W, \b, \B, \d, and \D match the 201 | corresponding ASCII character categories. Default 202 | when matching a bytestring. 203 | B b BESTMATCH Find the best fuzzy match (default is first). 204 | D DEBUG Print the parsed pattern. 205 | E e ENHANCEMATCH Attempt to improve the fit after finding the first 206 | fuzzy match. 207 | F f FULLCASE Use full case-folding when performing 208 | case-insensitive matching in Unicode. 209 | I i IGNORECASE Perform case-insensitive matching. 210 | L L LOCALE Make \w, \W, \b, \B, \d, and \D dependent on the 211 | current locale. (One byte per character only.) 212 | M m MULTILINE "^" matches the beginning of lines (after a newline) 213 | as well as the string. "$" matches the end of lines 214 | (before a newline) as well as the end of the string. 215 | P p POSIX Perform POSIX-standard matching (leftmost longest). 
216 | R r REVERSE Searches backwards. 217 | S s DOTALL "." matches any character at all, including the 218 | newline. 219 | U u UNICODE Make \w, \W, \b, \B, \d, and \D dependent on the 220 | Unicode locale. Default when matching a Unicode 221 | string. 222 | V0 V0 VERSION0 Turn on the old legacy behaviour. 223 | V1 V1 VERSION1 Turn on the new enhanced behaviour. This flag 224 | includes the FULLCASE flag. 225 | W w WORD Make \b and \B work with default Unicode word breaks 226 | and make ".", "^" and "$" work with Unicode line 227 | breaks. 228 | X x VERBOSE Ignore whitespace and comments for nicer looking REs. 229 | 230 | This module also defines an exception 'error'. 231 | 232 | """ 233 | 234 | # Public symbols. 235 | __all__ = ["cache_all", "compile", "DEFAULT_VERSION", "escape", "findall", 236 | "finditer", "fullmatch", "match", "purge", "search", "split", "splititer", 237 | "sub", "subf", "subfn", "subn", "template", "Scanner", "A", "ASCII", "B", 238 | "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH", "S", "DOTALL", "F", 239 | "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P", "POSIX", 240 | "R", "REVERSE", "T", "TEMPLATE", "U", "UNICODE", "V0", "VERSION0", "V1", 241 | "VERSION1", "X", "VERBOSE", "W", "WORD", "error", "Regex", "__version__", 242 | "__doc__", "RegexFlag"] 243 | 244 | __version__ = "2.5.153" 245 | 246 | # -------------------------------------------------------------------- 247 | # Public interface. 
# All of the functions below compile the pattern through _compile(), which
# also caches it. Extra keyword arguments are passed through as the pattern's
# named lists; a keyword argument that the pattern doesn't use raises
# ValueError unless ignore_unused is true.

def match(pattern, string, flags=0, pos=None, endpos=None, partial=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Try to apply the pattern at the start of the string, returning a match
    object, or None if no match was found."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.match(string, pos, endpos, concurrent, partial, timeout)

def fullmatch(pattern, string, flags=0, pos=None, endpos=None, partial=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Try to apply the pattern against all of the string, returning a match
    object, or None if no match was found."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.fullmatch(string, pos, endpos, concurrent, partial, timeout)

def search(pattern, string, flags=0, pos=None, endpos=None, partial=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Search through string looking for a match to the pattern, returning a
    match object, or None if no match was found."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.search(string, pos, endpos, concurrent, partial, timeout)

def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return the string obtained by replacing the leftmost (or rightmost with a
    reverse pattern) non-overlapping occurrences of the pattern in string by the
    replacement repl. repl can be either a string or a callable; if a string,
    backslash escapes in it are processed; if a callable, it's passed the match
    object and must return a replacement string to be used."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.sub(repl, string, count, pos, endpos, concurrent, timeout)

def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return the string obtained by replacing the leftmost (or rightmost with a
    reverse pattern) non-overlapping occurrences of the pattern in string by the
    replacement format. format can be either a string or a callable; if a string,
    it's treated as a format string; if a callable, it's passed the match object
    and must return a replacement string to be used."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.subf(format, string, count, pos, endpos, concurrent, timeout)

def subn(pattern, repl, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return a 2-tuple containing (new_string, number). new_string is the string
    obtained by replacing the leftmost (or rightmost with a reverse pattern)
    non-overlapping occurrences of the pattern in the source string by the
    replacement repl. number is the number of substitutions that were made. repl
    can be either a string or a callable; if a string, backslash escapes in it
    are processed; if a callable, it's passed the match object and must return a
    replacement string to be used."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.subn(repl, string, count, pos, endpos, concurrent, timeout)

def subfn(pattern, format, string, count=0, flags=0, pos=None, endpos=None,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return a 2-tuple containing (new_string, number). new_string is the string
    obtained by replacing the leftmost (or rightmost with a reverse pattern)
    non-overlapping occurrences of the pattern in the source string by the
    replacement format. number is the number of substitutions that were made. format
    can be either a string or a callable; if a string, it's treated as a format
    string; if a callable, it's passed the match object and must return a
    replacement string to be used."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.subfn(format, string, count, pos, endpos, concurrent, timeout)

def split(pattern, string, maxsplit=0, flags=0, concurrent=None, timeout=None,
  ignore_unused=False, **kwargs):
    """Split the source string by the occurrences of the pattern, returning a
    list containing the resulting substrings. If capturing parentheses are used
    in pattern, then the text of all groups in the pattern are also returned as
    part of the resulting list. If maxsplit is nonzero, at most maxsplit splits
    occur, and the remainder of the string is returned as the final element of
    the list."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.split(string, maxsplit, concurrent, timeout)

def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None,
  timeout=None, ignore_unused=False, **kwargs):
    "Return an iterator yielding the parts of a split string."
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.splititer(string, maxsplit, concurrent, timeout)

def findall(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
  concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return a list of all matches in the string. The matches may be overlapped
    if overlapped is True. If one or more groups are present in the pattern,
    return a list of groups; this will be a list of tuples if the pattern has
    more than one group. Empty matches are included in the result."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.findall(string, pos, endpos, overlapped, concurrent, timeout)

def finditer(pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
  partial=False, concurrent=None, timeout=None, ignore_unused=False, **kwargs):
    """Return an iterator over all matches in the string. The matches may be
    overlapped if overlapped is True. For each match, the iterator returns a
    match object. Empty matches are included in the result."""
    pat = _compile(pattern, flags, ignore_unused, kwargs, True)
    return pat.finditer(string, pos, endpos, overlapped, concurrent, partial,
      timeout)

def compile(pattern, flags=0, ignore_unused=False, cache_pattern=None, **kwargs):
    """Compile a regular expression pattern, returning a pattern object.

    If cache_pattern is None, the global setting controlled by cache_all() decides
    whether the compiled pattern is cached; otherwise cache_pattern itself
    decides."""
    if cache_pattern is None:
        cache_pattern = _cache_all
    return _compile(pattern, flags, ignore_unused, kwargs, cache_pattern)

def purge():
    "Clear the regular expression caches."
    _cache.clear()
    _locale_sensitive.clear()
    # The named-list bookkeeping and the compiled replacement templates are
    # caches too; clear them as well so that a purge really releases everything
    # that was cached (they are simply rebuilt on demand).
    _named_args.clear()
    _replacement_cache.clear()

# Whether to cache all patterns.
_cache_all = True

def cache_all(value=True):
    """Sets whether to cache all patterns, even those that are compiled
    explicitly. Passing None has no effect, but returns the current setting;
    any other value is stored as the new setting and None is returned."""
    global _cache_all

    if value is None:
        return _cache_all

    _cache_all = value

def template(pattern, flags=0):
    "Compile a template pattern, returning a pattern object."
    # Template patterns are never cached and take no named lists.
    return _compile(pattern, flags | TEMPLATE, False, {}, False)

def escape(pattern, special_only=True, literal_spaces=False):
    """Escape a string for use as a literal in a pattern. If special_only is
    True, escape only special characters, else escape all non-alphanumeric
    characters. If literal_spaces is True, don't escape spaces.

    The result has the same type (str or bytes) as the pattern."""
    # Work on text; latin-1 maps every byte to the codepoint of the same value,
    # so the round trip through str is lossless for bytes.
    is_bytes = isinstance(pattern, bytes)
    text = pattern.decode("latin-1") if is_bytes else pattern

    # Decide which characters may be copied through unescaped.
    if special_only:
        def is_plain(c):
            return not (c in _METACHARS or c.isspace())
    else:
        def is_plain(c):
            return c in _ALNUM

    parts = []
    for c in text:
        if (c == " " and literal_spaces) or is_plain(c):
            parts.append(c)
        else:
            parts.append("\\" + c)

    result = "".join(parts)

    # Convert it back to bytes if necessary.
    if is_bytes:
        result = result.encode("latin-1")

    return result

# --------------------------------------------------------------------
# Internals.

import regex._regex_core as _regex_core
import regex._regex as _regex
from threading import RLock as _RLock
from locale import getpreferredencoding as _getpreferredencoding
from regex._regex_core import *
from regex._regex_core import (_ALL_VERSIONS, _ALL_ENCODINGS, _FirstSetError,
  _UnscopedFlagSet, _check_group_features, _compile_firstset,
  _compile_replacement, _flatten_code, _fold_case, _get_required_string,
  _parse_pattern, _shrink_cache)
from regex._regex_core import (ALNUM as _ALNUM, Info as _Info, OP as _OP, Source
  as _Source, Fuzzy as _Fuzzy)

# Version 0 is the old behaviour, compatible with the original 're' module.
# Version 1 is the new behaviour, which differs slightly.

DEFAULT_VERSION = VERSION0

_METACHARS = frozenset("()[]{}?*+|^$\\.-#&~")

_regex_core.DEFAULT_VERSION = DEFAULT_VERSION

# Caches for the patterns and replacements.
_cache = {}
_cache_lock = _RLock()
_named_args = {}
_replacement_cache = {}
_locale_sensitive = {}

# Maximum size of the cache.
_MAXCACHE = 500
_MAXREPCACHE = 500

def _compile(pattern, flags, ignore_unused, kwargs, cache_it):
    """Compiles a regular expression to a PatternObject.

    pattern is a str, a bytes object or an already compiled pattern (which is
    returned unchanged, provided that flags is 0). kwargs holds the named lists
    for the pattern; a keyword that the pattern doesn't use raises ValueError
    unless ignore_unused is true. If cache_it is true (and DEBUG isn't set), the
    cache is consulted first and the newly compiled pattern is stored in it."""

    # Pick up the current default version in case the user has changed it.
    global DEFAULT_VERSION
    try:
        from regex import DEFAULT_VERSION
    except ImportError:
        pass

    # Debugging prints the parsed pattern, so don't use the cache for it.
    if flags & DEBUG:
        cache_it = False

    # Work out the locale for this pattern. Unless we already know that the
    # pattern has no inline locale flag, assume that it might be
    # locale-sensitive.
    locale_key = (type(pattern), pattern)
    maybe_locale_sensitive = (_locale_sensitive.get(locale_key, True) or
      (flags & LOCALE) != 0)
    pattern_locale = _getpreferredencoding() if maybe_locale_sensitive else None

    def complain_unused_args():
        # Reject keyword arguments that the pattern doesn't use (they're
        # possibly the result of a typo), unless told to ignore them.
        if not ignore_unused:
            known_names = {name for name, _ in args_needed}
            unused_kwargs = set(kwargs) - known_names
            if unused_kwargs:
                any_one = next(iter(unused_kwargs))
                raise ValueError('unused keyword argument {!a}'.format(any_one))

    if cache_it:
        try:
            # Which named lists does this pattern need, if we've seen it before?
            args_key = pattern, type(pattern), flags
            args_needed = _named_args[args_key]

            # Collect the named lists that have been supplied for it.
            args_supplied = set()
            if args_needed:
                for name, _ in args_needed:
                    try:
                        args_supplied.add((name, frozenset(kwargs[name])))
                    except KeyError:
                        raise error("missing named list: {!r}".format(name))

            complain_unused_args()

            args_supplied = frozenset(args_supplied)

            # Look for this combination of pattern and named lists.
            pattern_key = (pattern, type(pattern), flags, args_supplied,
              DEFAULT_VERSION, pattern_locale)
            return _cache[pattern_key]
        except KeyError:
            # Either the pattern or its set of named lists is new to us.
            pass

    # The class of the pattern string gives us a first guess at the encoding.
    if isinstance(pattern, str):
        guess_encoding = UNICODE
    elif isinstance(pattern, bytes):
        guess_encoding = ASCII
    elif isinstance(pattern, Pattern):
        if flags:
            raise ValueError("cannot process flags argument with a compiled pattern")

        return pattern
    else:
        raise TypeError("first argument must be a string or compiled pattern")

    # Tell the core code which default version is currently in force.
    _regex_core.DEFAULT_VERSION = DEFAULT_VERSION

    global_flags = flags

    # Parse the pattern, starting again whenever an unscoped inline flag turns
    # up, so that the flag applies to the pattern as a whole.
    while True:
        caught_exception = None
        try:
            source = _Source(pattern)
            info = _Info(global_flags, source.char_type, kwargs)
            info.guess_encoding = guess_encoding
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            break
        except _UnscopedFlagSet:
            # Carry the global flags over to the next attempt.
            global_flags = info.global_flags
        except error as e:
            caught_exception = e

        # Raise a fresh exception from outside the handler.
        if caught_exception:
            raise error(caught_exception.msg, caught_exception.pattern,
              caught_exception.pos)

    if not source.at_end():
        raise error("unbalanced parenthesis", pattern, source.pos)

    # Make sure that the global flags don't conflict with each other.
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version not in (0, VERSION0, VERSION1):
        raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

    if (info.flags & _ALL_ENCODINGS) not in (0, ASCII, LOCALE, UNICODE):
        raise ValueError("ASCII, LOCALE and UNICODE flags are mutually incompatible")

    if isinstance(pattern, bytes) and (info.flags & UNICODE):
        raise ValueError("cannot use UNICODE flag with a bytes pattern")

    # No explicit encoding, so fall back on the class of the pattern string.
    if not (info.flags & _ALL_ENCODINGS):
        if isinstance(pattern, str):
            info.flags |= UNICODE
        else:
            info.flags |= ASCII

    reverse = bool(info.flags & REVERSE)
    fuzzy = isinstance(parsed, _Fuzzy)

    # Remember whether this pattern has an inline locale flag.
    _locale_sensitive[locale_key] = info.inline_locale

    # Fix the group references, again raising a fresh exception on failure.
    caught_exception = None
    try:
        parsed.fix_groups(pattern, reverse, False)
    except error as e:
        caught_exception = e

    if caught_exception:
        raise error(caught_exception.msg, caught_exception.pattern,
          caught_exception.pos)

    # Print the parsed pattern if we're debugging.
    if flags & DEBUG:
        parsed.dump(indent=0, reverse=reverse)

    # Optimise the parsed pattern.
    parsed = parsed.optimise(info, reverse)
    parsed = parsed.pack_characters(info)

    # Get the required string.
    req_offset, req_chars, req_flags = _get_required_string(parsed, info.flags)

    # Build the named lists, case-folding the items where the pattern asks
    # for that.
    named_lists = {}
    named_list_indexes = [None] * len(info.named_lists_used)
    args_needed = set()
    for (name, case_flags), index in info.named_lists_used.items():
        values = frozenset(kwargs[name])
        if case_flags:
            items = frozenset(_fold_case(info, v) for v in values)
        else:
            items = values
        named_lists[name] = values
        named_list_indexes[index] = items
        args_needed.add((name, values))

    complain_unused_args()

    # Check the features of the groups.
    _check_group_features(info, parsed)

    # Compile the parsed pattern. The result is a list of tuples.
    code = parsed.compile(reverse)

    # If there's a group call to the pattern as a whole, wrap the code in one.
    call_key = (0, reverse, fuzzy)
    ref = info.call_refs.get(call_key)
    if ref is not None:
        code = [(_OP.CALL_REF, ref)] + code + [(_OP.END, )]

    # Add the final 'success' opcode.
    code += [(_OP.SUCCESS, )]

    # Compile the additional copies of the groups that we need.
    for group, rev, fuz in info.additional_groups:
        code += group.compile(rev, fuz)

    # Flatten the code into a list of ints.
    code = _flatten_code(code)

    if not parsed.has_simple_start():
        # Prepend the code for the first set, if there is one.
        try:
            fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
            fs_code = _flatten_code(fs_code)
            code = fs_code + code
        except _FirstSetError:
            pass

    # Map the group indexes back to the names of the capture groups.
    index_group = {v: n for n, v in info.group_index.items()}

    # Create the PatternObject.
    #
    # Local flags like IGNORECASE affect the code generation, but aren't needed
    # by the PatternObject itself. Conversely, global flags like LOCALE _don't_
    # affect the code generation but _are_ needed by the PatternObject.
    compiled_pattern = _regex.compile(pattern, info.flags | version, code,
      info.group_index, index_group, named_lists, named_list_indexes,
      req_offset, req_chars, req_flags, info.group_count)

    # Shrink the cache if it has reached its maximum size.
    if len(_cache) >= _MAXCACHE:
        with _cache_lock:
            _shrink_cache(_cache, _named_args, _locale_sensitive, _MAXCACHE)

    if cache_it:
        # Only patterns that use the LOCALE flag are tied to a locale.
        if (info.flags & LOCALE) == 0:
            pattern_locale = None

        args_needed = frozenset(args_needed)

        # Store this regular expression and named list.
        pattern_key = (pattern, type(pattern), flags, args_needed,
          DEFAULT_VERSION, pattern_locale)
        _cache[pattern_key] = compiled_pattern

        # Store what keyword arguments are needed.
        _named_args[args_key] = args_needed

    return compiled_pattern

def _compile_replacement_helper(pattern, template):
    """Compiles a replacement template.

    The result is a list whose items are the literal parts of the template (of
    the same type as the template) and whatever _compile_replacement returns for
    the group references. The result is cached."""
    # This function is called by the _regex module.

    # Use the cached version if we've compiled this template before.
    key = pattern.pattern, pattern.flags, template
    cached = _replacement_cache.get(key)
    if cached is not None:
        return cached

    # The replacement cache is simply emptied when it becomes full.
    if len(_replacement_cache) >= _MAXREPCACHE:
        _replacement_cache.clear()

    is_unicode = isinstance(template, str)
    source = _Source(template)

    # Literals are collected as character codes and then converted to the type
    # of the template.
    if is_unicode:
        def make_string(char_codes):
            return "".join(map(chr, char_codes))
    else:
        make_string = bytes

    compiled = []
    literal = []

    def flush_literal():
        # Move the pending literal, if any, into the compiled template.
        if literal:
            compiled.append(make_string(literal))
            del literal[:]

    while True:
        ch = source.get()
        if not ch:
            break

        if ch != "\\":
            literal.append(ord(ch))
            continue

        # '_compile_replacement' will return either an int group reference
        # or a string literal. It returns items (plural) in order to handle
        # a 2-character literal (an invalid escape sequence).
        is_group, items = _compile_replacement(source, pattern, is_unicode)
        if is_group:
            # A group reference ends the current literal.
            flush_literal()
            compiled.extend(items)
        else:
            literal.extend(items)

    flush_literal()

    _replacement_cache[key] = compiled

    return compiled

# We define Pattern here after all the support objects have been defined.
_pat = _compile('', 0, False, {}, False)
Pattern = type(_pat)
Match = type(_pat.match(''))
del _pat

# Make Pattern and Match public for typing annotations.
__all__.append("Pattern")
__all__.append("Match")

# We'll define an alias for the 'compile' function so that the repr of a
# pattern object is eval-able.
Regex = compile

# Register myself for pickling.
741 | import copyreg as _copy_reg 742 | 743 | def _pickle(pattern): 744 | return _regex.compile, pattern._pickled_data 745 | 746 | _copy_reg.pickle(Pattern, _pickle) 747 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from setuptools import setup, Extension 4 | from os.path import join 5 | 6 | setup( 7 | ext_modules=[Extension('regex._regex', [join('regex_3', '_regex.c'), 8 | join('regex_3', '_regex_unicode.c')])], 9 | ) 10 | --------------------------------------------------------------------------------