├── .flake8 ├── .github └── workflows │ └── main.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── regexploit ├── __init__.py ├── ast │ ├── __init__.py │ ├── at.py │ ├── branch.py │ ├── categories.py │ ├── char.py │ ├── groupref.py │ ├── ranges.py │ ├── repeat.py │ ├── sequence.py │ └── sre.py ├── bin │ ├── __init__.py │ ├── files.py │ ├── javascript │ │ ├── .eslintrc.yml │ │ ├── cli.js │ │ ├── find.js │ │ ├── index.js │ │ ├── package-lock.json │ │ ├── package.json │ │ └── test │ │ │ └── test.js │ ├── regexploit-python-env │ ├── regexploit.py │ ├── regexploit_csharp.py │ ├── regexploit_js.py │ ├── regexploit_python_ast.py │ └── regexploit_yaml.py ├── found_regex.py ├── hook.py ├── languages │ ├── __init__.py │ ├── csharp_string_extractor.py │ ├── javascript.py │ └── python_node_visitor.py ├── output │ ├── __init__.py │ └── text.py └── redos.py ├── requirements-dev.txt ├── setup.py └── tests ├── test.cs ├── test_at.py ├── test_character.py ├── test_csharp.py ├── test_javascript.py ├── test_python_ast.py ├── test_redos.py └── test_repeat.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = 3 | E203, # whitespace before ':' 4 | E501, # Line length 5 | E722, # do not use bare 'except' 6 | W503, # line break before binary operator 7 | application-import-names=regexploit 8 | import-order-style=pycharm 9 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test-python: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install dependencies 20 | 
run: | 21 | python -m pip install --upgrade pip 22 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi 23 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 24 | - name: Lint with black and flake8 25 | run: | 26 | black --check regexploit tests 27 | flake8 regexploit tests 28 | - name: Test 29 | run: | 30 | pip install -e . 31 | pytest 32 | 33 | test-node: 34 | 35 | runs-on: ubuntu-latest 36 | defaults: 37 | run: 38 | working-directory: regexploit/bin/javascript 39 | 40 | strategy: 41 | matrix: 42 | node-version: [12.x, 15.x] 43 | 44 | steps: 45 | - uses: actions/checkout@v2 46 | - name: Use Node.js ${{ matrix.node-version }} 47 | uses: actions/setup-node@v1 48 | with: 49 | node-version: ${{ matrix.node-version }} 50 | - run: npm install 51 | - run: npm run build --if-present 52 | - run: npm test 53 | env: 54 | CI: true 55 | - run: npm run lint --if-present 56 | 57 | 58 | build-python: 59 | 60 | needs: [test-python, test-node] 61 | runs-on: ubuntu-latest 62 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 63 | steps: 64 | - uses: actions/checkout@v2 65 | - name: Set up Python 3.9 66 | uses: actions/setup-python@v2 67 | with: 68 | python-version: 3.9 69 | - name: Build 70 | run: | 71 | pip install wheel 72 | python setup.py sdist bdist_wheel 73 | - name: Upload artifacts 74 | uses: actions/upload-artifact@v2 75 | with: 76 | name: build 77 | path: dist/* 78 | - name: Publish package 79 | uses: pypa/gh-action-pypi-publish@master 80 | with: 81 | user: __token__ 82 | password: ${{ secrets.PYPI_API_TOKEN }} 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.flake8 3 | !.github 4 | __pycache__ 5 | *.egg-info 6 | *.log 7 | node_modules 8 | build 9 | dist 10 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Doyensec LLC. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include regexploit/bin/javascript/*.js 4 | include regexploit/bin/javascript/*.json 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regexploit 2 | 3 | ![regexploit_logo](https://user-images.githubusercontent.com/6027823/110626827-7f46db80-81a1-11eb-9a3d-3e3376bd9a4f.png) 4 | 5 | Find regexes which are vulnerable to Regular Expression Denial of Service (ReDoS). 6 | 7 | **More info on [the Doyensec blog](https://blog.doyensec.com/2021/03/11/regexploit.html)** 8 | 9 | Many default regular expression parsers have unbounded worst-case complexity. Regex matching may be quick when presented with a matching input string. However, certain non-matching input strings can make the regular expression matcher go into crazy backtracking loops and take ages to process. 
This can cause denial of service, as the CPU will be stuck trying to match the regex. 10 | 11 | This tool is designed to: 12 | * find regular expressions which are vulnerable to ReDoS 13 | * give an example malicious string which will cause catastrophic backtracking 14 | 15 | ## Worst-case complexity 16 | 17 | This reflects the complexity of the regular expression matcher's backtracking procedure with respect to the length of the entered string. 18 | 19 | Cubic complexity here means that if the vulnerable part of the string is doubled in length, the execution time should be about 8 times longer (2^3). 20 | For exponential ReDoS with starred stars e.g. `(a*)*$` a fudge factor is used and the complexity will be greater than 10. 21 | 22 | For exploitability, cubic complexity or higher is typically required unless truly giant strings are allowed as input. 23 | 24 | ## Example 25 | 26 | Run `regexploit` and enter the regular expression `v\w*_\w*_\w*$` at the command line. 27 | 28 | ``` 29 | $ regexploit 30 | v\w*_\w*_\w*$ 31 | Pattern: v\w*_\w*_\w*$ 32 | --- 33 | Worst-case complexity: 3 ⭐⭐⭐ (cubic) 34 | Repeated character: [5f:_] 35 | Final character to cause backtracking: [^WORD] 36 | Example: 'v' + '_' * 3456 + '!' 37 | ``` 38 | 39 | The part `\w*_\w*_\w*` contains three overlapping repeating groups (\w matches letters, digits *and underscores*). As shown in the line `Repeated character: [5f:_]`, a long string of `_` (0x5f) will match this section in many different ways. The worst-case complexity is 3 as there are 3 infinitely repeating groups. An example to cause ReDoS is given: it consists of the required prefix `v`, a long string of `_` and then a `!` (non-word character) to cause backtracking. Not all ReDoSes require a particular character at the end, but in this case, a long string of `_` will match the regex successfully and won't backtrack. 
The line `Final character to cause backtracking: [^WORD]` shows that a non-matching character (not a word character) is required at the end to prevent matching and cause ReDoS. 40 | 41 | As another example, install a module version vulnerable to ReDoS such as `pip install ua-parser==0.9.0`. 42 | To scan the installed python modules run `regexploit-python-env`. 43 | 44 | ``` 45 | Importing ua_parser.user_agent_parser 46 | Vulnerable regex in /somewhere/.env/lib/python3.9/site-packages/ua_parser/user_agent_parser.py #183 47 | Pattern: \bSmartWatch *\( *([^;]+) *; *([^;]+) *; 48 | Context: self.user_agent_re = re.compile(self.pattern) 49 | --- 50 | Worst-case complexity: 3 ⭐⭐⭐ 51 | Repeated character: [20] 52 | Example: 'SmartWatch(' + ' ' * 3456 53 | 54 | Worst-case complexity: 3 ⭐⭐⭐ 55 | Repeated character: [20] 56 | Example: 'SmartWatch(0;' + ' ' * 3456 57 | 58 | Vulnerable regex in /somewhere/.env/lib/python3.9/site-packages/ua_parser/user_agent_parser.py #183 59 | Pattern: ; *([^;/]+) Build[/ ]Huawei(MT1-U06|[A-Z]+\d+[^\);]+)[^\);]*\) 60 | Context: self.user_agent_re = re.compile(self.pattern) 61 | --- 62 | Worst-case complexity: 3 ⭐⭐⭐ 63 | Repeated character: [[0-9]] 64 | Example: ';0 Build/HuaweiA' + '0' * 3456 65 | ... 66 | ``` 67 | 68 | For each vulnerable regular expression it prints one or more malicious strings to trigger ReDoS. Setting your user agent to `;0 Build/HuaweiA000000000000000...` and browsing a website using an old version of ua-parser may cause the server to take a long time to process your request, probably ending in status 502. 69 | 70 | # Installation 71 | 72 | Python 3.8+ is required. To extract regexes from JavaScript / TypeScript code, NodeJS 12+ is also required. 
73 | 74 | Optionally make a virtual environment 75 | 76 | ```bash 77 | python3 -m venv .env 78 | source .env/bin/activate 79 | ``` 80 | 81 | Now actually install with pip 82 | 83 | ``` 84 | pip install regexploit 85 | ``` 86 | 87 | # Usage 88 | 89 | ## Regexploit with a list of regexes 90 | 91 | Enter regular expressions via stdin (one per line) into `regexploit`. 92 | 93 | ```bash 94 | regexploit 95 | ``` 96 | 97 | or via a file 98 | 99 | ```bash 100 | cat myregexes.txt | regexploit 101 | ``` 102 | 103 | ## Extract regexes automatically 104 | 105 | There is built-in support for parsing regexes out of Python, JavaScript, TypeScript, C#, YAML and JSON. 106 | ### Python code 107 | 108 | Parses Python code (without executing it) via the AST to find regexes. The regexes are then analysed for ReDoS. 109 | 110 | ```bash 111 | regexploit-py my-project/ 112 | regexploit-py "my-project/**/*.py" --glob 113 | ``` 114 | ### Javascript / Typescript 115 | 116 | This will use the bundled NodeJS package in `regexploit/bin/javascript` which parses your JavaScript as an AST with [eslint](https://github.com/typescript-eslint/typescript-eslint/tree/master/packages/parser) and prints out all regexes. 117 | 118 | Those regexes are fed into the python ReDoS finder. 119 | 120 | ```bash 121 | regexploit-js my-module/my-file.js another/file.js some/folder/ 122 | regexploit-js "my-project/node_modules/**/*.js" --glob 123 | ``` 124 | 125 | N.B. there are differences between javascript and python regex parsing so there may be some errors. I'm [not sure I want](https://hackernoon.com/the-madness-of-parsing-real-world-javascript-regexps-d9ee336df983) to write a JS regex AST! 126 | 127 | ### Python imports 128 | 129 | Search for regexes in all the python modules currently installed in your path / env. This means you can `pip install` whatever modules you are interested in and they will be analysed. Cpython code is included. 
130 | 131 | ```bash 132 | regexploit-python-env 133 | ``` 134 | 135 | N.B. this doesn't parse the python code to an AST and will only find regexes compiled automatically on module import. Modules are actually imported, **so code in the modules will be executed**. This is helpful for finding regexes which are built up from smaller strings on load e.g. [CVE-2021-25292 in Pillow](https://github.com/python-pillow/Pillow/commit/3bce145966374dd39ce58a6fc0083f8d1890719c) 136 | 137 | ### JSON / YAML 138 | 139 | Yaml support requires pyyaml, which can be installed with `pip install regexploit[yaml]`. 140 | 141 | ```bash 142 | regexploit-json *.json 143 | regexploit-yaml *.yaml 144 | ``` 145 | ### C# (.NET) 146 | 147 | ```bash 148 | regexploit-csharp something.cs 149 | ``` 150 | # :trophy: Bugs reported :trophy: 151 | 152 | * [CVE-2020-5243: uap-core](https://github.com/ua-parser/uap-core/security/advisories/GHSA-cmcx-xhr8-3w9p) affecting uap-python, [uap-ruby](https://github.com/ua-parser/uap-ruby/security/advisories/GHSA-pcqq-5962-hvcw), etc. 
(User-Agent header parsing) 153 | * [CVE-2020-8492: cpython's urllib.request](https://github.com/python/cpython/commit/0b297d4ff1c0e4480ad33acae793fbaf4bf015b4) (WWW-Authenticate header parsing) 154 | * [CVE-2021-21236: CairoSVG](https://github.com/advisories/GHSA-hq37-853p-g5cf) (SVG parsing) 155 | * [CVE-2021-21240: httplib2](https://github.com/httplib2/httplib2/security/advisories/GHSA-93xj-8mrv-444m) (WWW-Authenticate header parsing) 156 | * [CVE-2021-25292: python-pillow](https://github.com/python-pillow/Pillow/commit/3bce145966374dd39ce58a6fc0083f8d1890719c) (PDF parsing) 157 | * [CVE-2021-26813: python-markdown2](https://github.com/trentm/python-markdown2/pull/387) (Markdown parsing) 158 | * [CVE-2021-27290: npm/ssri](https://doyensec.com/resources/Doyensec_Advisory_ssri_redos.pdf) (SRI parsing) 159 | * [CVE-2021-27291: pygments](https://github.com/pygments/pygments/commit/2e7e8c4a7b318f4032493773732754e418279a14) lexers for ADL, CADL, Ceylon, Evoque, Factor, Logos, Matlab, Octave, ODIN, Scilab & Varnish VCL (Syntax highlighting) 160 | * [CVE-2021-27292: ua-parser-js](https://github.com/faisalman/ua-parser-js/commit/809439e20e273ce0d25c1d04e111dcf6011eb566) (User-Agent header parsing) 161 | * [CVE-2021-27293: RestSharp](https://github.com/restsharp/RestSharp/issues/1556) (JSON deserialisation in a .NET C# package) 162 | * [bpo-38804: cpython's http.cookiejar](https://github.com/python/cpython/pull/17157) (Set-Cookie header parsing) 163 | * [SimpleCrawler (archived)](https://doyensec.com/resources/Doyensec_Advisory_simplecrawler_redos.pdf) (HTML parsing) 164 | * [CVE-2021-28092: is-svg](https://github.com/sindresorhus/is-svg/commit/01f8a087fab8a69c3ac9085fbb16035907ab6a5b) (SVG parsing) 165 | * [nuget.org, NuGetGallery](https://github.com/NuGet/NuGetGallery/commit/25d2d3b32b2d9f0b1ca6e0a105b0210c2c4820f4) and [NuGet.Client](https://github.com/NuGet/NuGet.Client/commit/a0671e946ce71dc59def5cc8a67c6457d66f33bf) (Parsing NuGet package IDs) 166 | * [markdown 
(python)](https://github.com/Python-Markdown/markdown/pull/1130) (Markdown parsing) 167 | * [ansi-html (nodejs)](https://github.com/Tjatse/ansi-html/issues/19) (ANSI parsing) 168 | * Plus unpublished bugs in a handful of pypi, npm, ruby and nuget packages 169 | 170 | ## Credits 171 | 172 | This tool has been created by Ben Caller of [Doyensec LLC](https://www.doyensec.com) during research time. 173 | 174 | ![alt text](https://doyensec.com/images/logo.svg "Doyensec Logo") 175 | -------------------------------------------------------------------------------- /regexploit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/__init__.py -------------------------------------------------------------------------------- /regexploit/ast/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/ast/__init__.py -------------------------------------------------------------------------------- /regexploit/ast/at.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional 3 | 4 | from regexploit.ast.char import Character 5 | from regexploit.ast.repeat import InfiniteRepeat, Repeat 6 | 7 | 8 | @dataclass 9 | class EndOfString: 10 | character: Optional[Character] = None 11 | 12 | @property 13 | def starriness(self): 14 | return 0 15 | 16 | @property 17 | def minimum_length(self): 18 | return 1 # Meaningless really here 19 | 20 | def overall_character_class(self): 21 | return self.character 22 | 23 | def __repr__(self) -> str: 24 | return f"${self.character}" 25 | 26 | def __and__(self, other: Character) -> Optional[Character]: 27 | return other & self.character 28 | 29 | def example(self): 30 | 
return "\n" # ish 31 | 32 | def set_character(self, previous_elems: List): 33 | """ 34 | To force backtracking, the dollar will have to not match any previous groups until a mandatory group. 35 | This can perhaps be made more lenient. 36 | 37 | To cause backtracking on a long string of a's: 38 | a*a*a*$ -> Any [^a] 39 | [ab]+a*a*a*$ -> Any [^ab] (baaaaaaaaaaaab does not backtrack) 40 | b+a*a*a*$ -> Any [^a] 41 | .a*a*a*$ -> Any [^a] 42 | .+a*a*a*$ -> Cannot backtrack because everything gets matched by .+ :( 43 | """ 44 | self.character = None 45 | for elem in reversed(previous_elems): 46 | if elem.minimum_length > 0 and not isinstance(elem, InfiniteRepeat): 47 | return # xa*[ab]*a*$ -> [ab] 48 | c = ( 49 | elem.maximal_character_class() 50 | if isinstance(elem, Repeat) 51 | else elem.overall_character_class() 52 | ) 53 | if c: 54 | if elem.minimum_length > 0 and (self.character & c) != self.character: 55 | # c is smaller than self.character (i.e. c is not an ANY) 56 | # x+a*[ab]*a*$ -> [ab] 57 | return 58 | self.character |= c 59 | -------------------------------------------------------------------------------- /regexploit/ast/branch.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Iterator, List, Optional 3 | 4 | from regexploit.ast.at import EndOfString 5 | from regexploit.ast.char import Character 6 | from regexploit.ast.repeat import FiniteRepeat, InfiniteRepeat 7 | from regexploit.ast.sequence import Sequence 8 | 9 | 10 | @dataclass(frozen=True) 11 | class Branch: 12 | branches: List 13 | optional: bool = False 14 | 15 | def get_branches(self) -> Iterator: 16 | for b in self.branches: 17 | yield b 18 | if self.optional: 19 | yield None 20 | 21 | @property 22 | def starriness(self) -> int: 23 | return max(b.starriness for b in self.branches) 24 | 25 | @property 26 | def minimum_length(self) -> int: 27 | return 0 if self.optional else min(b.minimum_length for b in 
# --- regexploit/ast/branch.py ---


def make_branch(branches: List):
    """
    Build the simplest node representing an alternation of ``branches``.

    A single alternative is returned untouched. Otherwise empty alternatives
    (falsy entries and ``EndOfString`` markers) are dropped and remembered as
    "this branch is optional". If every remaining alternative is a single
    ``Character`` they are merged into one character class; anything else
    becomes a ``Branch``. Returns ``None`` when nothing non-empty remains.
    """
    if len(branches) == 1:
        return branches[0]
    optional = False
    non_empty_branches = [b for b in branches if b and not isinstance(b, EndOfString)]
    if not non_empty_branches:
        return None
    if len(non_empty_branches) < len(branches):
        # (ab|cd|) -> (ab|cd)?
        optional = True
    if all(isinstance(b, Character) for b in non_empty_branches):
        # (a|b) -> [ab], (a|b|) -> [ab]?
        c = None
        for b in non_empty_branches:
            # The first step is `None | b`, which Character.__ror__ turns into `b`.
            c |= b
        if optional:
            return FiniteRepeat(c, 0, 1)
        else:
            return c

    return Branch(non_empty_branches, optional)


# --- regexploit/ast/categories.py ---


class Category(Enum):
    """Character categories (\\d, \\w, \\s) and their negations."""

    DIGIT = auto()
    NOT_DIGIT = auto()
    WORD = auto()
    NOT_WORD = auto()
    SPACE = auto()
    NOT_SPACE = auto()

    @property
    def is_positive(self) -> bool:
        """True for DIGIT/WORD/SPACE, False for the NOT_* members."""
        return not self.name.startswith("NOT_")

    def negate(self) -> "Category":
        """Return the complementary member (DIGIT <-> NOT_DIGIT, ...)."""
        if self.is_positive:
            return Category[f"NOT_{self.name}"]
        else:
            return Category[self.name[4:]]  # strip the "NOT_" prefix

    def example(self) -> str:
        """Return a sample one-character string belonging to this category."""
        return EXAMPLE_FOR_CAT[self]

    def contains(self, literal: int) -> bool:
        """Return whether the code point ``literal`` belongs to this category."""
        c = chr(literal)
        unicat = unicodedata.category(c)
        if self is Category.DIGIT:
            return unicat == "Nd"
        if self is Category.NOT_DIGIT:
            return unicat != "Nd"
        if self is Category.WORD:
            return (
                unicat[0] == "L" or unicat == "Nd" or literal == 0x5F
            )  # underscore is a word character
        if self is Category.NOT_WORD:
            return unicat[0] != "L" and unicat != "Nd" and literal != 0x5F
        if self is Category.SPACE:
            return unicat == "Zs" or c in (" ", "\n", "\t", "\r", "\f", "\v")
        if self is Category.NOT_SPACE:
            return unicat != "Zs" and c not in (" ", "\n", "\t", "\r", "\f", "\v")


# Optional cache of pre-computed code points, keyed by category.
CATS = {}


def list_category(category, full_unicode: bool = False):
    """
    Yield every code point that belongs to ``category``.

    Only code points 0-255 are scanned unless ``full_unicode`` is true, in
    which case the whole range up to ``sys.maxunicode`` is scanned. A non-empty
    entry in ``CATS`` is used instead of scanning.
    NOTE(review): the cache is keyed by category only, so a cached entry is
    returned regardless of ``full_unicode`` - confirm before populating CATS.
    """
    if cached := CATS.get(category):
        yield from cached
        # Previously execution fell through and yielded every value a second time.
        return
    if not isinstance(category, Category):
        # Unknown category objects have never matched anything; keep that.
        return
    for data in range((sys.maxunicode + 1) if full_unicode else 256):
        # Category.contains is the single definition of category membership.
        if category.contains(data):
            yield data


def covers_any(categories: Set[Category]) -> bool:
    """Return True if the set holds a category together with its negation."""
    for c in categories:
        if c.is_positive and c.negate() in categories:
            return True
    return False


# CATS[sre_parse.CATEGORY_DIGIT] = list(list_category(sre_parse.CATEGORY_DIGIT))
# CATS[sre_parse.CATEGORY_SPACE] = list(list_category(sre_parse.CATEGORY_SPACE))
# CATS[sre_parse.CATEGORY_WORD] = list(list_category(sre_parse.CATEGORY_WORD))
EXAMPLE_FOR_CAT = {
    Category.DIGIT: "4",
    Category.NOT_DIGIT: "!",
    Category.WORD: "w",
    Category.NOT_WORD: "$",
    Category.SPACE: " ",
    Category.NOT_SPACE: ".",
}


# --- regexploit/ast/char.py ---


@dataclass(frozen=True)
class Character:
    """
    A set of characters described by literals and/or categories.

    ``positive=False`` means "anything except these". An instance with no
    literals, no categories and ``positive=True`` matches any character.
    """

    literals: Optional[Set[int]] = None
    categories: Optional[Set[Category]] = None
    positive: bool = True

    @staticmethod
    def ANY() -> "Character":
        """Return the class that matches every character."""
        return Character()

    @staticmethod
    def LITERAL(literal: int) -> "Character":
        """Return the class matching exactly the code point ``literal``."""
        return Character({literal})

    @property
    def minimum_length(self) -> int:
        return 1

    @property
    def starriness(self) -> int:
        return 0

    def __hash__(self) -> int:
        # The fields are sets, so hash an order-independent tuple view of them.
        # Category is a plain Enum whose members do not support "<", so sort by
        # name; sorting the members directly raised TypeError for two or more.
        return hash(
            (
                self.positive,
                tuple(sorted(self.literals)) if self.literals else None,
                tuple(sorted(self.categories, key=lambda cat: cat.name))
                if self.categories
                else None,
            )
        )

    def exact_character_class(self) -> "Character":
        return self

    def overall_character_class(self) -> "Character":
        return self

    def maximal_character_class(self) -> "Character":
        return self

    @property
    def is_any(self) -> bool:
        return self.literals is None and self.categories is None and self.positive

    @property
    def _is_positive_literal(self) -> bool:
        return self.positive and self.literals is not None and self.categories is None

    @property
    def _is_negative_literal(self) -> bool:
        return (
            not self.positive and self.literals is not None and self.categories is None
        )

    @property
    def _is_positive_category(self) -> bool:
        return self.positive and self.literals is None and self.categories is not None

    @property
    def _is_negative_category(self) -> bool:
        return (
            not self.positive and self.literals is None and self.categories is not None
        )

    def expand_categories(self) -> "Character":
        """
        This is the nuclear option where we expand the categories into literals.
        Can be huge in unicode.
        """
        if self.categories:
            lits: Set[int] = set(self.literals) if self.literals else set()
            for c in self.categories:
                lits.update(list_category(c))
            return Character(literals=lits, positive=self.positive)

        return self

    def __and__(self, other: "Optional[Character]") -> "Optional[Character]":
        """Intersect two classes; ``None`` stands for "matches nothing"."""
        if other is None:
            return None
        if self.is_any:
            return other
        if other.is_any:
            return self

        # [ab] & [bc] -> [b]
        if self._is_positive_literal and other._is_positive_literal:
            lits = self.literals & other.literals
            if not lits:
                return None
            return Character(literals=lits)
        if self._is_positive_category and other._is_positive_category:
            cats = self.categories & other.categories
            if not cats:
                return None
            return Character(categories=cats)
        # [^ab] & [^bc] -> [^abc]
        if self._is_negative_literal and other._is_negative_literal:
            return Character(literals=self.literals | other.literals, positive=False)
        if self._is_negative_category and other._is_negative_category:
            categories = self.categories | other.categories
            if covers_any(categories):  # [^\d] & [^\D] = nothing
                return None
            return Character(categories=categories, positive=False)
        # [ab] & [^bc] -> [a]
        if self._is_positive_literal and other._is_negative_literal:
            lits = self.literals - other.literals
            if not lits:
                return None
            return Character(literals=lits)
        if other._is_positive_literal and self._is_negative_literal:
            lits = other.literals - self.literals
            if not lits:
                return None
            return Character(literals=lits)

        # TODO: be less lazy and sort out the general case without expanding everything if possible
        return self.expand_categories() & other.expand_categories()

    def __rand__(self, other: "Optional[Character]") -> "Optional[Character]":
        return self & other

    def __or__(self, other: "Optional[Character]") -> "Optional[Character]":
        """Union of two classes; ``None`` contributes nothing."""
        if other is None:
            return self
        if self.is_any or other.is_any:
            return Character.ANY()
        if self == other:
            return self
        # De Morgan: a | b == not(not a & not b)
        if nor := (self.negate() & other.negate()):  # Slow, but logical
            return nor.negate()
        else:
            return Character.ANY()

    def __ror__(self, other: "Optional[Character]") -> "Optional[Character]":
        return self | other

    def __repr__(self) -> str:
        if self.is_any:
            return "."
        result = "["
        if not self.positive:
            result += "^"
        more = False
        if self.literals is not None:
            lits, ranges = lits_to_ranges(self.literals)
            result += ",".join(literal_repr(o) for o in lits)
            if lits and ranges:
                result += ","
            result += ",".join(range_repr(r) for r in ranges)
            more = True
        if self.categories is not None:
            if more:
                result += ";"
            result += ",".join(c.name for c in self.categories)
            more = True
        return result + "]"

    def example(self) -> str:
        """
        Return one character matched by this class.

        Printable characters are preferred. Raises ``NotImplementedError``
        when no example can be derived.
        """
        for c in nice_characters():
            if self.matches(c):
                return chr(c)

        if self.positive:
            if self.literals:
                if len(self.literals) > 1:
                    # Try to avoid \n due to false positives with the . character and flags
                    return chr(next(o for o in self.literals if o != 0xA))
                return chr(next(iter(self.literals)))
            elif self.categories:
                return sorted(self.categories, key=lambda c: 0 if c.is_positive else 1)[
                    0
                ].example()

        raise NotImplementedError(self)

    def negate(self) -> "Optional[Character]":
        """Return the complement, or ``None`` for the complement of "any"."""
        if self.is_any:
            return None
        return Character(
            literals=self.literals,
            categories=self.categories,
            positive=not self.positive,
        )

    def contains(self, subgroup: "Character") -> bool:
        """
        Return whether every character of ``subgroup`` is in this class.

        Only same-kind positive classes are supported; other combinations
        raise ``NotImplementedError``.
        """
        if self.is_any:
            return True
        if subgroup.is_any:
            return False
        if subgroup == self:
            return True

        if self._is_positive_literal and subgroup._is_positive_literal:
            return not (subgroup.literals - self.literals)
        if self._is_positive_category and subgroup._is_positive_category:
            return not (subgroup.categories - self.categories)

        raise NotImplementedError  # Lazy, TODO: do full match

    def matches(self, literal: int) -> bool:
        """Return whether the code point ``literal`` is matched by this class."""
        if self.is_any:
            return True
        if self.literals is not None and literal in self.literals:
            return self.positive
        if self.categories:
            for cat in self.categories:
                if cat.contains(literal):
                    return self.positive
        return not self.positive


def nice_characters():
    """Yield the code points of printable characters, minus the last five whitespace ones."""
    for c in string.printable[:-5]:
        yield ord(c)


def literal_repr(literal: int) -> str:
    """Render a code point: alphanumerics as themselves, everything else as hex."""
    c = chr(literal)
    if c in string.digits or c in string.ascii_letters:
        return c
    elif c in string.punctuation:
        return f"{literal:02x}:{c}"
    return f"{literal:02x}"


def range_repr(r: "Range") -> str:
    """Render a Range as ``[min-max]``."""
    return "[{}-{}]".format(literal_repr(r.min_val), literal_repr(r.max_val))
# --- regexploit/ast/groupref.py ---


def subpattern_to_groupref(subpattern):
    """
    Return the form of a captured ``subpattern`` used for a group reference.

    Nodes with zero starriness (and ``None``) are returned unchanged. Otherwise
    the tree is rebuilt: finite repeats keep their bounds, infinite repeats
    become finite repeats of ``minimum .. minimum + 1``, and branches and
    sequences are converted element by element.
    """
    if subpattern is None or subpattern.starriness == 0:
        return subpattern

    if isinstance(subpattern, repeat.FiniteRepeat):
        inner = subpattern_to_groupref(subpattern.repeat)
        return subpattern.alter_repeat(inner)

    if isinstance(subpattern, repeat.InfiniteRepeat):
        inner = subpattern_to_groupref(subpattern.repeat)
        floor = subpattern.minimum_repeats
        return repeat.FiniteRepeat(inner, floor, floor + 1)

    if isinstance(subpattern, Branch):
        converted = [subpattern_to_groupref(b) for b in subpattern.branches]
        return Branch(converted, subpattern.optional)

    if isinstance(subpattern, Sequence):
        return Sequence([subpattern_to_groupref(e) for e in subpattern.elements])

    return subpattern


# --- regexploit/ast/ranges.py ---


@dataclass(frozen=True)
class Range:
    """Inclusive span of consecutive code points."""

    min_val: int
    max_val: int


def lits_to_ranges(
    literals: Iterator[int],
) -> Tuple[Set[int], Set[Range]]:
    """
    Split code points into isolated literals and runs of consecutive values.

    Returns ``(literals, ranges)``. A run that is followed by a gap becomes a
    ``Range`` only when it has three or more members; the final run becomes a
    ``Range`` as soon as it has two.
    NOTE(review): that asymmetry for two-element runs is kept exactly as it
    was because visible repr output depends on it - confirm whether intended.
    """
    singles: Set[int] = set()
    spans: Set[Range] = set()
    run: List[int] = []

    for value in sorted(literals):
        if run and run[-1] != value - 1:
            # Gap found: flush the run collected so far and start a new one.
            if len(run) >= 3:
                spans.add(Range(run[0], run[-1]))
            else:
                singles.update(run)
            run = [value]
        else:
            run.append(value)

    # Flush whatever is left after the last value.
    if len(run) == 1:
        singles.add(run[0])
    elif run:
        spans.add(Range(run[0], run[-1]))

    return singles, spans
# --- regexploit/ast/repeat.py ---


@dataclass(frozen=True)
class Repeat:
    """Base for repeated nodes: ``repeat`` occurs at least ``minimum_repeats`` times."""

    repeat: Any
    minimum_repeats: int

    def example(self) -> str:
        """Return the shortest example: the inner example repeated the minimum number of times."""
        if self.minimum_repeats == 0:
            return ""
        return self.repeat.example() * self.minimum_repeats

    @property
    def minimum_length(self) -> int:
        return self.minimum_repeats * self.repeat.minimum_length

    @property
    def starriness(self) -> int:
        return self.repeat.starriness  # ? and {1,30} are not that starry

    def exact_character_class(self) -> "Optional[Character]":
        """
        Repeated character e.g. [bc] for [bc]*, or [a] for (aaa)*
        """
        return self.repeat.exact_character_class()

    def overall_character_class(self) -> "Optional[Character]":
        """
        (23)+ -> None, (22)* -> 2
        """
        return self.repeat.overall_character_class()

    def maximal_character_class(self) -> "Character":
        """
        (23)+ -> [23], (22)* -> 2, (23*)* -> [23]
        Useful for finding a way to kill a sequence like a(bc*)*$
        """
        return self.repeat.maximal_character_class()


@dataclass(frozen=True)
class InfiniteRepeat(Repeat):
    """Unbounded repeat (``*``, ``+``, ``{n,}``)."""

    # When set, overrides the starriness computed from the repeated node.
    forced_starriness: Optional[int] = None

    @property
    def starriness(self) -> int:
        if self.forced_starriness is not None:
            return self.forced_starriness
        # a*a*a* is cubic whereas (a*)* is exponential but here we just call it 10
        return 1 + self.repeat.starriness * 10

    def __repr__(self) -> str:
        return f"{self.repeat}{{{self.minimum_repeats}+}}"

    def alter_repeat(self, repeat) -> "InfiniteRepeat":
        """Return a copy repeating ``repeat`` instead (``forced_starriness`` is not carried over)."""
        return InfiniteRepeat(repeat, self.minimum_repeats)


@dataclass(frozen=True)
class FiniteRepeat(Repeat):
    """Bounded repeat (``?``, ``{n,m}``)."""

    maximum_repeats: int

    def __repr__(self) -> str:
        return f"{self.repeat}{{{self.minimum_repeats},{self.maximum_repeats}}}"

    def alter_repeat(self, repeat) -> "FiniteRepeat":
        """Return a copy with the same bounds that repeats ``repeat`` instead."""
        return FiniteRepeat(repeat, self.minimum_repeats, self.maximum_repeats)


# --- regexploit/ast/sequence.py ---


@dataclass(frozen=True)
class Sequence:
    """Ordered concatenation of regex nodes."""

    elements: List

    @property
    def starriness(self):
        return sum(e.starriness for e in self.elements)

    def __len__(self):
        return len(self.elements)

    def example(self) -> str:
        return "".join(e.example() for e in self.elements)

    @property
    def minimum_length(self) -> int:
        accum: int = 0
        for e in self.elements:
            accum += e.minimum_length
        return accum

    def exact_character_class(self) -> "Optional[Character]":
        """
        aa*a -> a, abc -> None, [ab][abc] -> None

        Returns the character class shared by every element, or None when the
        elements do not all have exactly the same class.
        """
        if not self.elements:
            return None
        first = self.elements[0].exact_character_class()
        if first is None:
            return None
        for element in self.elements[1:]:
            # Compare character classes, not the nodes themselves: a* is not
            # equal to a, but both have the exact class [a].
            if element.exact_character_class() != first:
                return None
        return first

    def overall_character_class(self) -> "Optional[Character]":
        """
        aa*a -> a, abc -> None, [ab][abc] -> [ab]
        a?b -> b, a+b -> None, [ab]+b* -> b
        """
        c = Character.ANY()
        for e in self.elements:
            c &= e.overall_character_class()
            if not c:
                return None
        return c

    def matching_repeats(self):
        """Complicated way to get the possible character classes for a sequence"""
        c = Character.ANY()
        has_mandatory = False
        optionals = []
        starriness = 0
        minimum_length = 0
        for e in self.elements:
            if e.minimum_length:
                # Mandatory element: it narrows the class shared by the whole sequence.
                c &= e.overall_character_class()
                if not c:
                    return None
                has_mandatory = True
                starriness += e.starriness
                minimum_length += e.minimum_length
            elif e.starriness > 0:
                optionals.append(e)
        # Map of candidate character class -> accumulated starriness.
        possibilities = {c: starriness} if has_mandatory else {}
        for e in optionals:
            if new_c := e.overall_character_class() & c:
                if new_c in possibilities:
                    possibilities[new_c] += e.starriness
                else:
                    possibilities[new_c] = e.starriness

        if len(possibilities) > 1:
            # (a*[ab]*a*[bc]*[bcd]*.+a*)*@ has classes {.: 1, [a]: 5, [[a-b]]: 2, [[b-c]]: 3, [[b-d]]: 2, [b]: 3}
            # This could blow up!
            poss_chars = list(possibilities.items())
            merged_chars = {}
            while poss_chars:
                c_a, s_a = poss_chars.pop()
                for c_b, s_b in poss_chars:
                    if (merged := c_a & c_b) is not None:
                        if merged == c_a:
                            possibilities[c_a] += s_b
                        elif merged == c_b:
                            possibilities[c_b] += s_a
                        else:
                            if merged not in merged_chars:
                                merged_chars[merged] = set()
                            merged_chars[merged] |= {(c_a, s_a), (c_b, s_b)}
            for merged, set_of_chars in merged_chars.items():
                possibilities[merged] = sum(s for _, s in set_of_chars)

        for cc, s in possibilities.items():
            if s:
                yield InfiniteRepeat(cc, minimum_length, forced_starriness=s)

    def maximal_character_class(self) -> "Character":
        """
        Only useful when this Sequence is inside a Repeat
        a*b -> [ab], ab* -> [ab]
        Since forcing backtracking for (bc*)$
        """
        c = None
        for e in self.elements:
            if (mcc := e.maximal_character_class()) is not None:
                c = mcc | c
        return c

    def __repr__(self) -> str:
        return "SEQ{ " + " ".join(str(e) for e in self.elements) + " }"
# --- regexploit/ast/sre.py ---

# Aliases describing the (opcode, data) pairs produced by sre_parse.parse().
SreConstant = sre_constants._NamedIntConstant
SreOpData = Union[Tuple, List, int, SreConstant, None]
SreOp = Tuple[SreConstant, SreOpData]


class SreOpParser:
    """
    Convert the op list produced by ``sre_parse.parse`` into regexploit nodes.

    Each opcode is dispatched to the ``from_<OPCODE>`` method of the same
    name. The parser is stateful: ``_groups`` remembers the node built for
    each capture group, and ``negative_lookahead`` holds a character class
    recorded by a preceding ``(?!x)`` that the next consuming op must apply.
    """

    def __init__(self):
        self._groups = {}
        self.negative_lookahead: Optional[Character] = None

    def parse_sre(self, pattern: str, flags: int = 0):
        """Parse ``pattern`` and return a single node, a Sequence, or None if nothing remains."""
        return self.sequence_or_singleton(sre_parse.parse(pattern, flags))

    def parse_op(self, op: SreConstant, data: SreOpData):
        """Dispatch one op to ``from_<op.name>``; an unsupported opcode raises AttributeError."""
        return getattr(self, f"from_{op.name}")(data)

    def sequence_or_singleton(self, ops: List[SreOp]):
        """
        Parse ``ops`` in order and combine the results.

        Ops that parse to None are dropped and nested Sequences are flattened.
        Returns None for no elements, the element itself when there is exactly
        one, otherwise a Sequence.
        """
        elems = []
        for p in (self.parse_op(*op) for op in ops):
            if p is not None:
                if isinstance(p, Sequence):
                    elems.extend(p.elements)
                else:
                    elems.append(p)
        if len(elems) == 0:
            return None
        if len(elems) == 1:
            return elems[0]
        return Sequence(elems)

    def from_SUBPATTERN(self, data: Tuple[int, int, int, List[SreOp]]):
        """Parse a group body and remember it under its group reference (``data[0]``)."""
        ref = data[0]
        elements = data[3]
        result = self.sequence_or_singleton(elements)
        self._groups[ref] = result
        return result

    def from_MAX_REPEAT(
        self,
        data: Tuple[
            int,
            Union[int, SreConstant],
            List[SreOp],
        ],
    ) -> Union[FiniteRepeat, InfiniteRepeat, Branch, None]:
        """
        Build a repeat node from ``(minimum, maximum, elements)``.

        A pending negative lookahead is consumed here. When the repeated item
        is a single Character that must occur at least once, the lookahead is
        applied to the first occurrence only, by splitting it off into a
        Sequence.
        NOTE(review): ``negative_lookahead & repeatable`` is None when the two
        classes are disjoint, which would put a None element in the Sequence -
        confirm how callers handle that.
        """
        minimum, maximum, elements = data
        infinite = maximum is sre_constants.MAXREPEAT
        # TODO support negative lookahead before repeat with minimum = 0
        negative_lookahead = self.use_negative_lookahead()
        repeatable = self.sequence_or_singleton(elements)
        if repeatable is None:
            return None
        if (
            minimum == 0
            and maximum == 1
            and repeatable.starriness
            and not repeatable.overall_character_class()
        ):
            # Interesting (starry) optional sequences as branches (ab*)? -> (ab*|)
            return make_branch([repeatable, None])
        if infinite:
            if (
                negative_lookahead is not None
                and minimum > 0
                and isinstance(repeatable, Character)
            ):
                return Sequence(
                    [
                        negative_lookahead & repeatable,
                        InfiniteRepeat(repeatable, minimum - 1),
                    ]
                )
            return InfiniteRepeat(repeatable, minimum)
        if (
            negative_lookahead is not None
            and minimum > 0
            and maximum > 1
            and isinstance(repeatable, Character)
        ):
            return Sequence(
                [
                    negative_lookahead & repeatable,
                    FiniteRepeat(repeatable, minimum - 1, maximum - 1),
                ]
            )
        return FiniteRepeat(repeatable, minimum, maximum)

    def from_MIN_REPEAT(self, data):
        """Lazy repeats are modelled exactly like greedy ones."""
        return self.from_MAX_REPEAT(data)

    def from_BRANCH(
        self, data: Tuple[None, List[List[SreOp]]]
    ) -> Union[Branch, FiniteRepeat, Character, None]:
        """Parse every alternative, giving each the same pending negative lookahead."""
        # sre already transforms (a|b|c) -> [abc]
        branches = data[1]
        negative_lookahead = self.use_negative_lookahead()
        processed_branches = []
        for branch in branches:
            # Re-arm the lookahead so it applies to the start of each alternative.
            self.negative_lookahead = negative_lookahead
            processed_branches.append(self.sequence_or_singleton(branch))
        self.negative_lookahead = None
        return make_branch(processed_branches)

    def from_AT(self, at: SreConstant):
        """Return EndOfString for an end anchor; other anchors yield None. Discards any pending lookahead."""
        # TODO: handling for multiline
        # TODO: handling for \\b
        self.use_negative_lookahead()
        if at is sre_constants.AT_END:
            return EndOfString()
        return None

    def from_ANY(self, _: None) -> Character:
        """``.`` becomes the pending lookahead class if there is one, otherwise "any"."""
        if negative_lookahead := self.use_negative_lookahead():
            return negative_lookahead
        return Character.ANY()

    def from_LITERAL(self, literal: int) -> Character:
        """Single literal, intersected with a pending lookahead class (the intersection may be None)."""
        if negative_lookahead := self.use_negative_lookahead():
            return Character.LITERAL(literal) & negative_lookahead
        return Character.LITERAL(literal)

    def from_NOT_LITERAL(self, not_literal: int) -> Character:
        """Negated single literal, e.g. ``[^a]``, intersected with a pending lookahead class."""
        if negative_lookahead := self.use_negative_lookahead():
            return (
                Character(literals={not_literal}, positive=False) & negative_lookahead
            )
        return Character(literals={not_literal}, positive=False)

    def from_IN(self, data: List[SreOp]) -> Optional[Character]:
        """
        Build a Character from a bracket expression's LITERAL/RANGE/CATEGORY items.

        Returns None for a negated set whose categories already cover
        everything (e.g. ``[^\\d\\D]``).
        """
        literals: Optional[Set[int]] = None
        categories: Optional[Set] = None
        positive = True
        if len(data) > 1 and data[0] == (sre_constants.NEGATE, None):
            positive = False
            data = data[1:]
        for in_op, in_data in data:
            if in_op is sre_constants.LITERAL:
                if literals is None:
                    literals = set()
                literals.add(in_data)
            elif in_op is sre_constants.RANGE:
                if literals is None:
                    literals = set()
                min_val, max_val = in_data
                literals.update(range(min_val, max_val + 1))
            elif in_op is sre_constants.CATEGORY:
                if categories is None:
                    categories = set()
                # Drop the first 9 characters of the constant's name
                # (presumably the "CATEGORY_" prefix) to get the Category member.
                categories.add(Category[in_data.name[9:]])

        if categories and covers_any(categories):
            return self.from_ANY(None) if positive else None
        if negative_lookahead := self.use_negative_lookahead():
            return Character(literals, categories, positive) & negative_lookahead
        return Character(literals, categories, positive)

    def from_GROUPREF(self, ref: int):
        """Backreference: reuse the node stored for group ``ref``, converted by subpattern_to_groupref."""
        return subpattern_to_groupref(self._groups.get(ref))

    @staticmethod
    def from_GROUPREF_EXISTS(_) -> None:
        return None  # No intention to implement this properly

    @staticmethod
    def from_ASSERT(_) -> None:
        return None  # No intention to implement this properly

    def from_ASSERT_NOT(self, data) -> None:
        """
        Record a single-character negative assertion as pending state.

        Only handled when ``typ == 1`` and the assertion body is exactly one
        LITERAL, NOT_LITERAL or IN op. The negated class is stored in
        ``self.negative_lookahead``, intersected with any class already
        pending. Always returns None because the assertion consumes no input.
        """
        typ, ops = data
        if typ == 1:
            if len(ops) == 1:
                character_op = ops[0]
                if character_op[0] in (
                    sre_constants.LITERAL,
                    sre_constants.NOT_LITERAL,
                    sre_constants.IN,
                ):
                    # Take the pending class first so it is not applied while
                    # parsing the assertion body itself.
                    negative_lookahead = self.use_negative_lookahead()
                    not_assertion = self.parse_op(*character_op)
                    if not_assertion and (assertion := not_assertion.negate()):
                        self.negative_lookahead = assertion
                        if negative_lookahead is not None:
                            self.negative_lookahead &= negative_lookahead
                    else:
                        # Nothing usable: restore what was pending before.
                        self.negative_lookahead = negative_lookahead

        return None  # No intention to implement this fully

    def use_negative_lookahead(self) -> Optional[Character]:
        """Return the pending lookahead class and clear it; returns None when nothing is pending."""
        if self.negative_lookahead is not None:
            negative_lookahead = self.negative_lookahead
            self.negative_lookahead = None
            return negative_lookahead
def file_generator(
    files_argument: List[str],
    is_glob: bool,
    filename_globs: List[str],
    ignore: Optional[List[str]] = None,
):
    """
    Yield the paths selected by ``files_argument``.

    With ``is_glob`` every argument is expanded as a recursive glob. Otherwise
    a directory argument is searched recursively for each of
    ``filename_globs`` and any other argument is yielded as given. Paths
    containing any of the ``ignore`` substrings are skipped.
    """

    def candidates():
        # Same expansion rules as the module-level helper, kept local.
        if is_glob:
            for pattern in files_argument:
                yield from iglob(pattern, recursive=True)
            return
        for entry in files_argument:
            if not os.path.isdir(entry):
                yield entry
                continue
            for name_glob in filename_globs:
                yield from iglob(os.path.join(entry, "**", name_glob), recursive=True)

    if not ignore:
        yield from candidates()
        return

    for path in candidates():
        if not any(fragment in path for fragment in ignore):
            yield path
error 59 | handle-callback-err: error 60 | id-blacklist: error 61 | id-denylist: error 62 | id-length: error 63 | id-match: error 64 | implicit-arrow-linebreak: error 65 | indent: 'off' 66 | indent-legacy: 'off' 67 | init-declarations: error 68 | jsx-quotes: error 69 | key-spacing: error 70 | keyword-spacing: 71 | - error 72 | - after: true 73 | before: true 74 | line-comment-position: error 75 | linebreak-style: 76 | - error 77 | - unix 78 | lines-around-comment: error 79 | lines-around-directive: error 80 | lines-between-class-members: error 81 | max-classes-per-file: error 82 | max-depth: error 83 | max-len: 'off' 84 | max-lines: error 85 | max-lines-per-function: error 86 | max-nested-callbacks: error 87 | max-params: error 88 | max-statements: 'off' 89 | max-statements-per-line: error 90 | multiline-comment-style: error 91 | new-cap: error 92 | new-parens: error 93 | newline-after-var: 'off' 94 | newline-before-return: 'off' 95 | newline-per-chained-call: error 96 | no-alert: error 97 | no-array-constructor: error 98 | no-await-in-loop: 'off' 99 | no-bitwise: error 100 | no-buffer-constructor: error 101 | no-caller: error 102 | no-catch-shadow: error 103 | no-confusing-arrow: error 104 | no-console: 'off' 105 | no-constructor-return: error 106 | no-continue: error 107 | no-div-regex: error 108 | no-duplicate-imports: error 109 | no-else-return: error 110 | no-empty-function: error 111 | no-eq-null: error 112 | no-eval: error 113 | no-extend-native: error 114 | no-extra-bind: error 115 | no-extra-label: error 116 | no-extra-parens: 'off' 117 | no-floating-decimal: error 118 | no-implicit-coercion: error 119 | no-implicit-globals: 'off' 120 | no-implied-eval: error 121 | no-inline-comments: error 122 | no-invalid-this: error 123 | no-iterator: error 124 | no-label-var: error 125 | no-labels: error 126 | no-lone-blocks: error 127 | no-lonely-if: error 128 | no-loop-func: error 129 | no-loss-of-precision: error 130 | no-magic-numbers: 'off' 131 | 
no-mixed-operators: error 132 | no-mixed-requires: error 133 | no-multi-assign: error 134 | no-multi-spaces: error 135 | no-multi-str: error 136 | no-multiple-empty-lines: error 137 | no-native-reassign: error 138 | no-negated-condition: error 139 | no-negated-in-lhs: error 140 | no-nested-ternary: error 141 | no-new: error 142 | no-new-func: error 143 | no-new-object: error 144 | no-new-require: error 145 | no-new-wrappers: error 146 | no-nonoctal-decimal-escape: error 147 | no-octal-escape: error 148 | no-param-reassign: error 149 | no-path-concat: error 150 | no-plusplus: error 151 | no-process-env: error 152 | no-process-exit: error 153 | no-promise-executor-return: error 154 | no-proto: error 155 | no-restricted-exports: error 156 | no-restricted-globals: error 157 | no-restricted-imports: error 158 | no-restricted-modules: error 159 | no-restricted-properties: error 160 | no-restricted-syntax: error 161 | no-return-assign: 162 | - error 163 | - except-parens 164 | no-return-await: error 165 | no-script-url: error 166 | no-self-compare: error 167 | no-sequences: error 168 | no-shadow: error 169 | no-spaced-func: error 170 | no-sync: error 171 | no-tabs: error 172 | no-template-curly-in-string: error 173 | no-ternary: 'off' 174 | no-throw-literal: error 175 | no-trailing-spaces: error 176 | no-undef-init: error 177 | no-undefined: error 178 | no-underscore-dangle: error 179 | no-unmodified-loop-condition: error 180 | no-unneeded-ternary: error 181 | no-unreachable-loop: error 182 | no-unused-expressions: error 183 | no-use-before-define: error 184 | no-useless-backreference: error 185 | no-useless-call: error 186 | no-useless-computed-key: error 187 | no-useless-concat: error 188 | no-useless-constructor: error 189 | no-useless-rename: error 190 | no-useless-return: error 191 | no-var: error 192 | no-void: error 193 | no-warning-comments: error 194 | no-whitespace-before-property: error 195 | nonblock-statement-body-position: error 196 | object-curly-newline: 
error 197 | object-curly-spacing: 198 | - error 199 | - always 200 | object-shorthand: error 201 | one-var: off 202 | one-var-declaration-per-line: error 203 | operator-assignment: error 204 | operator-linebreak: error 205 | padded-blocks: 'off' 206 | padding-line-between-statements: error 207 | prefer-arrow-callback: error 208 | prefer-const: error 209 | prefer-destructuring: error 210 | prefer-exponentiation-operator: error 211 | prefer-named-capture-group: error 212 | prefer-numeric-literals: error 213 | prefer-object-spread: error 214 | prefer-promise-reject-errors: error 215 | prefer-reflect: error 216 | prefer-regex-literals: error 217 | prefer-rest-params: error 218 | prefer-spread: error 219 | prefer-template: error 220 | quote-props: 'off' 221 | quotes: 222 | - error 223 | - single 224 | radix: error 225 | require-atomic-updates: error 226 | require-await: error 227 | require-jsdoc: error 228 | require-unicode-regexp: error 229 | rest-spread-spacing: 230 | - error 231 | - never 232 | semi: 'off' 233 | semi-spacing: error 234 | semi-style: 235 | - error 236 | - last 237 | sort-imports: error 238 | sort-keys: 'off' 239 | sort-vars: error 240 | space-before-blocks: error 241 | space-before-function-paren: 'off' 242 | space-in-parens: 243 | - error 244 | - never 245 | space-infix-ops: error 246 | space-unary-ops: 247 | - error 248 | - nonwords: false 249 | words: true 250 | spaced-comment: 251 | - error 252 | - always 253 | strict: 254 | - error 255 | - never 256 | switch-colon-spacing: error 257 | symbol-description: error 258 | template-curly-spacing: error 259 | template-tag-spacing: error 260 | unicode-bom: 261 | - error 262 | - never 263 | valid-jsdoc: error 264 | vars-on-top: error 265 | wrap-iife: error 266 | wrap-regex: error 267 | yield-star-spacing: error 268 | yoda: 269 | - error 270 | - never 271 | -------------------------------------------------------------------------------- /regexploit/bin/javascript/cli.js: 
-------------------------------------------------------------------------------- 1 | const fs = require('fs').promises; 2 | const findRegex = require('./find'); 3 | 4 | module.exports = { 5 | async * parseFile(filename) { 6 | try { 7 | const code = await fs.readFile(filename) 8 | yield* this.parseCode(code, filename); 9 | } catch (error) { 10 | yield JSON.stringify({ error, filename }); 11 | } 12 | }, 13 | 14 | * parseCode(code, filename) { 15 | try { 16 | for (const regex of findRegex.extractRegexesFromSource(code, filename)) { 17 | yield JSON.stringify({ 18 | ...regex, 19 | filename, 20 | }); 21 | } 22 | } catch (error) { 23 | yield JSON.stringify({ error, filename }); 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /regexploit/bin/javascript/find.js: -------------------------------------------------------------------------------- 1 | const parser = require('@typescript-eslint/parser'); 2 | 3 | module.exports = { 4 | * extractRegexesFromSource(content, filename) { 5 | // options https://github.com/typescript-eslint/typescript-eslint/blob/master/packages/types/src/parser-options.ts 6 | const tree = parser.parse(content, { 7 | ecmaFeatures: { 8 | jsx: true 9 | }, 10 | comment: false, 11 | ecmaVersion: 2020, 12 | errorOnTypeScriptSyntacticAndSemanticIssues: false, 13 | errorOnUnknownASTType: false, 14 | range: true, 15 | loc: true, 16 | filename, 17 | }); 18 | yield* this.walkASTForRegexes(tree); 19 | }, 20 | 21 | * walkASTForRegexes(tree) { 22 | if (!tree) { 23 | return; 24 | } 25 | if (tree.regex) { 26 | yield { 27 | 'pattern': tree.regex.pattern, 28 | 'flags': tree.regex.flags, 29 | 'lineno': tree.loc.start.line, 30 | } 31 | return; 32 | } 33 | if ( 34 | (tree.type == 'NewExpression' || tree.type == 'CallExpression') && 35 | tree.callee && tree.callee.name == 'RegExp' && tree.arguments && tree.arguments[0].type == 'Literal' 36 | ) { 37 | yield { 38 | 'pattern': tree.arguments[0].value, 39 | 'flags': 
tree.arguments.length > 1 && tree.arguments[1].type == 'Literal' ? tree.arguments[1].value : '', 40 | 'lineno': tree.loc.start.line, 41 | } 42 | return; 43 | } 44 | for (const element of Object.values(tree)) { 45 | if (element && typeof element == 'object') { 46 | yield* this.walkASTForRegexes(element); 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /regexploit/bin/javascript/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const cli = require('./cli'), 3 | readline = require('readline'); 4 | 5 | const args = process.argv.slice(2); 6 | 7 | 8 | if (args.length == 1 && args[0] == '-') { 9 | process.stdin.setEncoding('utf-8'); 10 | let data = ''; 11 | readline.createInterface({ input: process.stdin }). 12 | on('line', (line) => (data += line)). 13 | on('close', () => { 14 | for (const output of cli.parseCode(data)) { 15 | console.log(output); 16 | } 17 | }) 18 | } else { 19 | (async () => { 20 | for (const filename of args) { 21 | for await (const output of cli.parseFile(filename)) { 22 | console.log(output); 23 | } 24 | } 25 | })() 26 | } 27 | -------------------------------------------------------------------------------- /regexploit/bin/javascript/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract-regexes-from-ast", 3 | "version": "0.0.1", 4 | "description": "Parse a javascript or typescript file and output most regular expressions", 5 | "main": "index.js", 6 | "engines" : { 7 | "node" : ">=12" 8 | }, 9 | "scripts": { 10 | "lint": "eslint *.js", 11 | "test": "mocha" 12 | }, 13 | "author": "", 14 | "dependencies": { 15 | "@typescript-eslint/parser": "^4.0.0", 16 | "typescript": "^4.0.0" 17 | }, 18 | "devDependencies": { 19 | "better-assert": "^1.0.0", 20 | "eslint": "^7.0.0", 21 | "mocha": "^8.0.0" 22 | } 23 | } 24 | 
-------------------------------------------------------------------------------- /regexploit/bin/javascript/test/test.js: -------------------------------------------------------------------------------- 1 | const assert = require('better-assert'); 2 | const cli = require('../cli'); 3 | const findRegex = require('../find'); 4 | 5 | describe('findRegex', function() { 6 | describe('extractRegexesFromSource()', function() { 7 | it('should be able to return nothing', function() { 8 | assert([...findRegex.extractRegexesFromSource('abc')].length == 0); 9 | }); 10 | it('should find a literal regex', function() { 11 | let found = [...findRegex.extractRegexesFromSource('const a = /ab+c/g')]; 12 | assert(found.length == 1); 13 | assert(found[0].pattern == 'ab+c'); 14 | assert(found[0].flags == 'g'); 15 | assert(found[0].lineno == 1); 16 | }); 17 | it('should find the RegExp constructor', function() { 18 | let found = [...findRegex.extractRegexesFromSource('const a = [\nnew RegExp("one"),\nRegExp("two", "flags")]')]; 19 | assert(found.length == 2); 20 | assert(found[0].pattern == 'one'); 21 | assert(found[0].flags == ''); 22 | assert(found[0].lineno == 2); 23 | assert(found[1].pattern == 'two'); 24 | assert(found[1].flags == 'flags'); 25 | assert(found[1].lineno == 3); 26 | }); 27 | var burriedTests = [ 28 | "var a = {b: /abc/}", 29 | "function x() { return function* () { yield /abc/ } }", 30 | "function x(y = /abc/) { return y; }", 31 | "a ? 
/abc/ : null", 32 | "if(/abc/){}", // a bit stupid 33 | "[12, abc, /abc/, ...ghi]", 34 | "for (const a of x.match(/abc/)) {}" 35 | ] 36 | burriedTests.forEach(function (code) { 37 | it('should find burried regex ' + code, function() { 38 | let found = [...findRegex.extractRegexesFromSource(code)]; 39 | assert(found.length == 1); 40 | assert(found[0].pattern == 'abc'); 41 | assert(found[0].flags == ''); 42 | }); 43 | }); 44 | }); 45 | }); 46 | 47 | describe("cli", function() { 48 | describe("parseCode()", function() { 49 | it('should find a literal regex', function() { 50 | let found = [...cli.parseCode('/a(((b)+c))/im', 'fname')]; 51 | assert(found.length == 1); 52 | const output = JSON.parse(found[0]); 53 | assert(output.pattern == 'a(((b)+c))'); 54 | assert(output.flags == 'im'); 55 | assert(output.lineno == '1'); 56 | assert(output.filename == 'fname'); 57 | assert(!output.error); 58 | }); 59 | it('should return errors if necessary', function() { 60 | let found = [...cli.parseCode('/!#~')]; 61 | assert(found.length == 1); 62 | assert(JSON.parse(found[0]).error); 63 | assert(!JSON.parse(found[0]).pattern); 64 | }); 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /regexploit/bin/regexploit-python-env: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import regexploit.hook 3 | 4 | regexploit.hook.install() 5 | 6 | import importlib 7 | import pkgutil 8 | import sys 9 | 10 | from regexploit.ast.sre import SreOpParser 11 | from regexploit.redos import find 12 | from regexploit.output.text import TextOutput 13 | 14 | # Load python modules and process regexes which are compiled on import by hooking re.compile 15 | 16 | 17 | def main(): 18 | def onerror(name): 19 | print("Cannot load", name) 20 | 21 | names = tuple(sys.argv[1:]) if len(sys.argv) > 1 else None 22 | sys.argv = sys.argv[:1] 23 | if names: 24 | regexploit.hook.regexes.clear() 25 | 26 | output 
= TextOutput() 27 | for p in pkgutil.walk_packages(sys.path, onerror=onerror): 28 | # Importing some modules is disruptive https://xkcd.com/353/ 29 | if ( 30 | not names 31 | and p.name not in ("antigravity", "rstpep2html", "setup") 32 | and not p.name.startswith(("test", "pip", "setuptools", "idlelib", "rst2")) 33 | and not p.name.endswith(("__main__", ".main", ".conftest")) 34 | and ".test" not in p.name 35 | ) or (names and p.name.startswith(names)): 36 | print(f"Importing {p.name}") 37 | try: 38 | importlib.import_module(p.name) 39 | hooked_regex: regexploit.hook.CompiledRegex 40 | for hooked_regex in regexploit.hook.get_and_clear_regexes(): 41 | output.next() 42 | parsed = SreOpParser().parse_sre( 43 | hooked_regex.pattern, hooked_regex.flags 44 | ) 45 | for redos in find(parsed): 46 | if redos.starriness > 2: 47 | output.record( 48 | redos, 49 | hooked_regex.pattern, 50 | filename=hooked_regex.last_tb.filename, 51 | lineno=hooked_regex.last_tb.lineno, 52 | context=hooked_regex.last_tb.line, 53 | ) 54 | 55 | except Exception as e: 56 | print("Cannot load", p, e) 57 | print(f"Processed {output.regexes} regexes") 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /regexploit/bin/regexploit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import fileinput 4 | import logging 5 | import re 6 | import sys 7 | import traceback 8 | 9 | from regexploit.ast.sre import SreOpParser 10 | from regexploit.languages.javascript import fix_js_regex 11 | from regexploit.output.text import TextOutput 12 | from regexploit.redos import find 13 | 14 | 15 | def find_redos(pattern: str, flags: int, output: TextOutput, parser): 16 | try: 17 | parsed = parser(pattern, flags) 18 | except Exception as e: 19 | print(f"Error parsing: {pattern}", e) 20 | return 21 | output.next() 22 | for redos in find(parsed): 23 | if 
redos.starriness > 2: 24 | output.record(redos, pattern) 25 | yield redos 26 | 27 | 28 | def python(pattern: str, flags: int): 29 | return SreOpParser().parse_sre(pattern, flags) 30 | 31 | 32 | def javascript(pattern: str, flags: int): 33 | try: 34 | return SreOpParser().parse_sre(pattern) 35 | except: 36 | try: 37 | fixed = fix_js_regex(pattern) 38 | re.compile(fixed) 39 | except: 40 | raise 41 | 42 | try: 43 | return SreOpParser().parse_sre(fixed) 44 | except: 45 | print(traceback.format_exc()) 46 | raise 47 | 48 | 49 | def main(): 50 | parser = argparse.ArgumentParser( 51 | description="Parse regexes from stdin and scan them for ReDoS" 52 | ) 53 | parser.add_argument( 54 | "-f", 55 | "--flavour", 56 | "--flavor", 57 | choices=["python", "js"], 58 | default="python", 59 | help="Regex language", 60 | ) 61 | parser.add_argument( 62 | "-v", "--verbose", action="count", default=0, help="Verbose logging" 63 | ) 64 | parser.add_argument( 65 | "-u", 66 | "--unescape", 67 | action="store_true", 68 | help="Unescape the regular expressions before parsing them (e.g. double backslashes)", 69 | ) 70 | args = parser.parse_args() 71 | sys.argv = sys.argv[:1] 72 | if args.verbose == 1: 73 | logging.basicConfig(level=logging.INFO) 74 | elif args.verbose > 1: 75 | logging.basicConfig(level=logging.DEBUG) 76 | 77 | isatty = sys.stdin.isatty() 78 | if isatty: 79 | print("Welcome to Regexploit. 
Enter your regexes:") 80 | output = TextOutput() 81 | try: 82 | for line in fileinput.input(): 83 | found = False 84 | line = line.rstrip("\n") 85 | if args.unescape: 86 | # \\d -> \d 87 | line = line.encode().decode("unicode_escape") 88 | for _ in find_redos( 89 | line, 0, output, javascript if args.flavour == "js" else python 90 | ): 91 | found = True 92 | if isatty and not found: 93 | print("No ReDoS found.") 94 | except KeyboardInterrupt: 95 | pass 96 | 97 | 98 | if __name__ == "__main__": 99 | main() 100 | -------------------------------------------------------------------------------- /regexploit/bin/regexploit_csharp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import re 5 | import traceback 6 | import warnings 7 | 8 | from regexploit.ast.sre import SreOpParser 9 | from regexploit.bin.files import file_generator 10 | from regexploit.languages.csharp_string_extractor import find_regexes 11 | from regexploit.languages.javascript import fix_js_regex 12 | from regexploit.output.text import TextOutput 13 | from regexploit.redos import find 14 | 15 | 16 | def handle_file(filename: str, output: TextOutput): 17 | with open(filename, "rb") as f: 18 | code = f.read() 19 | for regex in find_regexes(code): 20 | pattern = regex.pattern 21 | if len(pattern) < 5: 22 | continue # (.+)+ 23 | if pattern.count("*") + pattern.count("+") + pattern.count(",}") < 2: 24 | continue # no ReDoS possible 25 | try: 26 | logging.debug("%s#%s: %s", filename, regex.lineno, pattern) 27 | parsed = SreOpParser().parse_sre(pattern, regex.flags) 28 | except: 29 | try: 30 | fixed = fix_js_regex(pattern) 31 | re.compile(fixed, regex.flags) 32 | except: 33 | if regex.definitely_regex: 34 | print( 35 | f"Error parsing: {pattern} from {filename} line {regex.lineno}\n" 36 | ) 37 | continue 38 | try: 39 | parsed = SreOpParser().parse_sre(fixed, regex.flags) 40 | except: 41 | print(f"Error in 
regexploit parsing: {pattern} from {filename}") 42 | print(traceback.format_exc()) 43 | continue 44 | try: 45 | output.next() 46 | for redos in find(parsed): 47 | if redos.starriness > 2: 48 | context = None 49 | try: 50 | context = code.splitlines()[regex.lineno - 1].decode().strip() 51 | except UnicodeDecodeError: 52 | pass 53 | output.record( 54 | redos, 55 | pattern, 56 | filename=filename, 57 | lineno=regex.lineno, 58 | context=context, 59 | ) 60 | except Exception: 61 | print(f"Error finding ReDoS: {pattern} from {filename} #{regex.lineno}") 62 | print(traceback.format_exc()) 63 | 64 | 65 | def main(): 66 | with warnings.catch_warnings(): 67 | warnings.simplefilter( 68 | "ignore", category=FutureWarning 69 | ) # Some csharp/js regexes are weird 70 | parser = argparse.ArgumentParser( 71 | description="Parse regexes out of C# files and scan them for ReDoS" 72 | ) 73 | parser.add_argument("files", nargs="+", help="C# files") 74 | parser.add_argument( 75 | "--glob", action="store_true", help="Glob the input filenames (**/*)" 76 | ) 77 | parser.add_argument("--verbose", action="store_true", help="Verbose logging") 78 | parser.add_argument( 79 | "--ignore", action="append", help="Paths containing this string are ignored" 80 | ) 81 | args = parser.parse_args() 82 | 83 | if args.verbose: 84 | logging.basicConfig(level=logging.DEBUG) 85 | 86 | output = TextOutput(js_flavour=True) 87 | files = file_generator(args.files, args.glob, ["*.cs"], args.ignore) 88 | for filename in files: 89 | logging.debug(filename) 90 | handle_file(filename, output) 91 | print(f"Processed {output.regexes} regexes") 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /regexploit/bin/regexploit_js.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import io 4 | import json 5 | import logging 6 | import os.path 7 | import re 
import subprocess
import sys
import traceback
import warnings

from regexploit.ast.sre import SreOpParser
from regexploit.bin.files import file_generator
from regexploit.languages.javascript import fix_js_regex
from regexploit.output.text import TextOutput
from regexploit.redos import find


def handle_line_from_node(line: str, output: TextOutput):
    """Handle one line of output from the node helper.

    A line is expected to be a JSON object describing either a regex
    (``pattern``, ``filename``, ``lineno``) or a failure (``error``,
    ``filename``). Regexes that can have a ReDoS are parsed, retrying with
    fix_js_regex() when the raw pattern does not parse, and findings with
    starriness > 2 are recorded on *output*.
    """
    try:
        regex = json.loads(line)
    except json.JSONDecodeError:
        # Anything else written to the helper's stdout (for example a
        # diagnostic printed by one of its dependencies) must not abort the scan.
        logging.warning("Ignoring non-JSON output from node: %r", line)
        return
    if not isinstance(regex, dict):
        logging.warning("Ignoring unexpected output from node: %r", line)
        return

    pattern = regex.get("pattern")
    # The helper forwards the value of any literal passed to RegExp(), so the
    # pattern is not guaranteed to be a string.
    if isinstance(pattern, str) and pattern:
        if (pattern_len := len(pattern)) < 5:
            return  # (.+)+
        if pattern_len == 8059 and pattern.startswith("\\u{1F3F4}(?:\\u{E0067"):
            return  # annoying emoji regex
        if pattern.count("*") + pattern.count("+") + pattern.count(",}") < 2:
            return  # no ReDoS possible
        filename = regex["filename"]
        lineno = regex["lineno"]
        try:
            logging.debug("%s#%s: %s", filename, lineno, pattern)
            parsed = SreOpParser().parse_sre(pattern)
        except Exception:
            try:
                fixed = fix_js_regex(pattern)
                re.compile(fixed)
            except Exception:
                print(f"Error parsing: {pattern} from {filename}\n")
                return
            try:
                parsed = SreOpParser().parse_sre(fixed)
            except Exception:
                print(f"Error in regexploit parsing: {pattern} from {filename}")
                print(traceback.format_exc())
                return
        output.next()
        try:
            for redos in find(parsed):
                if redos.starriness > 2:
                    output.record(redos, pattern, filename=filename, lineno=lineno)
        except Exception:
            print(f"Error finding ReDoS: {pattern} from {filename}")
            print(traceback.format_exc())
    elif "error" in regex:
        # Test for the key, not the value: an Error object serialised by
        # JSON.stringify can arrive as an empty (falsy) dict.
        print("ERR", regex["error"], regex.get("filename"))


def process_files(filenames, nodejs_executable, output):
    """Run the node helper over *filenames* and feed its output to the scanner.

    The helper is ``javascript/index.js`` next to this file. It is executed
    directly unless *nodejs_executable* is given, in which case it is run with
    that interpreter.

    Returns the helper's exit status.
    """
    args = [
        os.path.join(os.path.split(__file__)[0], "javascript", "index.js"),
        *filenames,
    ]
    if nodejs_executable:
        args = [nodejs_executable] + args
    logging.debug("Processing batch: %s", list(filenames))
    # The context manager closes the pipe and reaps the child even if
    # handling a line raises.
    with subprocess.Popen(args, stdout=subprocess.PIPE) as node:
        for line in io.TextIOWrapper(node.stdout, encoding="utf-8"):
            handle_line_from_node(line, output)
        # stdout reaching EOF does not mean the process has exited, so wait
        # for it; poll() could still return None here.
        return node.wait()


def main():
    """Command-line entry point: scan JavaScript/TypeScript files for ReDoS."""
    if not os.path.isdir(
        os.path.join(os.path.split(__file__)[0], "javascript", "node_modules")
    ):
        path = os.path.join(os.path.split(__file__)[0], "javascript")
        print("The JavaScript & TypeScript parsers require some node modules.\n")
        print(f"Run (cd {path}; npm install)")
        sys.exit(1)
    with warnings.catch_warnings():
        warnings.simplefilter(
            "ignore", category=FutureWarning
        )  # Some js regexes are weird
        parser = argparse.ArgumentParser(
            description="Parse regexes out of javascript files and scan them for ReDoS"
        )
        parser.add_argument("files", nargs="+", help="Javascript or typescript files")
        parser.add_argument(
            "--node",
            help="Location of nodejs executable (rather than using node from PATH)",
        )
        parser.add_argument(
            "--glob", action="store_true", help="Glob the input filenames (**/*)"
        )
        parser.add_argument("--verbose", action="store_true", help="Verbose logging")
        parser.add_argument(
            "--ignore", action="append", help="Paths containing this string are ignored"
        )
        args = parser.parse_args()

        if args.verbose:
            logging.basicConfig(level=logging.DEBUG)

        output = TextOutput(js_flavour=True)
        files = file_generator(args.files, args.glob, ["*.js", "*.ts"], args.ignore)
        # Hand the files to node 50 at a time. The loop must fall through to
        # the summary below rather than return when the files run out.
        batch_size = 50
        batch = []
        for filename in files:
            batch.append(filename)
            if len(batch) == batch_size:
                process_files(batch, args.node, output)
                batch = []
        if batch:
            process_files(batch, args.node, output)
        print(f"Processed {output.regexes} regexes")


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /regexploit/bin/regexploit_python_ast.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse
import ast
import logging
import re
import traceback
import warnings

from regexploit.ast.sre import SreOpParser
from regexploit.bin.files import file_generator
from regexploit.languages.python_node_visitor import PythonNodeVisitor
from regexploit.output.text import TextOutput
from regexploit.redos import find


def handle_file(filename: str, output: TextOutput):
    """Scan the string constants of the Python file *filename* for ReDoS.

    The file is parsed with ``ast`` and candidate strings are collected by
    PythonNodeVisitor. Strings that are not valid regexes are skipped
    silently; findings with starriness > 2 are recorded on *output*.
    """
    with open(filename, "rb") as f:
        code = f.read()
    try:
        code_ast = ast.parse(code)
        pnv = PythonNodeVisitor()
        pnv.visit(code_ast)
    except RecursionError:
        print(f"RecursionError parsing AST for {filename}")
        return
    except SyntaxError as e:
        print(f"Bad Python3 syntax in {filename}: {e}")
        return
    except ValueError as e:
        # ast.parse() rejects source containing null bytes with ValueError
        # on some Python versions.
        print(f"Cannot parse {filename}: {e}")
        return
    source_lines = code.splitlines()  # split once, not once per finding
    for regex in pnv.patterns:
        try:
            parsed = SreOpParser().parse_sre(regex.pattern, regex.flags)
        except (re.error, OverflowError, RecursionError):
            # We will have many strings which aren't actually regexes.
            # OverflowError is what `re` raises for an oversized repeat count.
            continue
        try:
            output.next()
            for redos in find(parsed):
                if redos.starriness > 2:
                    context = None
                    try:
                        context = source_lines[regex.lineno - 1].decode().strip()
                    except UnicodeDecodeError:
                        pass
                    output.record(
                        redos,
                        regex.pattern,
                        filename=filename,
                        lineno=regex.lineno,
                        context=context,
                    )
        except Exception:
            print(
                f"Error finding ReDoS: {regex.pattern} from {filename} #{regex.lineno}"
            )
            print(traceback.format_exc())


def main():
    """Command-line entry point: scan the given Python files."""
    with warnings.catch_warnings():
        # Some weird regexes emit warnings
        warnings.simplefilter("ignore", category=FutureWarning)
        warnings.simplefilter("ignore", category=DeprecationWarning)
        parser = argparse.ArgumentParser(
            description="Parse regexes out of python files and scan them for ReDoS"
        )
        parser.add_argument("files", nargs="+", help="Python files or directories")
        parser.add_argument(
            "--glob", action="store_true", help="Glob the input filenames (**/*)"
        )
        parser.add_argument("--verbose", action="store_true", help="Verbose logging")
        parser.add_argument(
            "--ignore", action="append", help="Paths containing this string are ignored"
        )
        args = parser.parse_args()

        if args.verbose:
            logging.basicConfig(level=logging.DEBUG)

        files = file_generator(args.files, args.glob, ["*.py"], args.ignore)
        output = TextOutput()
        for filename in files:
            logging.debug(filename)
            handle_file(filename, output)
        print(f"Processed {output.regexes} regexes")


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /regexploit/bin/regexploit_yaml.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
import argparse
import json
import logging
import re
import traceback
import warnings

from regexploit.ast.sre import SreOpParser
from regexploit.bin.files import file_generator
from regexploit.output.text import TextOutput
from regexploit.redos import find


def get_json(filename: str):
    """Return the parsed JSON document in *filename*, or None if it is invalid."""
    with open(filename, "rb") as f:
        try:
            return json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # UnicodeDecodeError: the bytes are not valid in the detected encoding
            print(f"Error parsing JSON from {filename}")
            return


def handle_file(yamljson, filename: str, output: TextOutput):
    """Scan a parsed document; anything but a list or dict at the top is ignored."""
    if isinstance(yamljson, (list, dict)):
        YamlJsonWalker(filename, output).handle(yamljson)


class YamlJsonWalker:
    """Walk a parsed YAML/JSON document and scan its strings for ReDoS.

    List items and dictionary values are visited recursively; dictionary keys
    are not scanned.
    """

    def __init__(self, filename: str, output: TextOutput):
        self.filename = filename
        self.output = output
        # ids of the containers already visited: YAML aliases can make a
        # document refer to itself, which would otherwise recurse forever.
        self._seen = set()

    def handle(self, elem):
        """Scan *elem* if it is a string longer than 5 characters, else descend."""
        if isinstance(elem, str) and len(elem) > 5:
            try:
                parsed = SreOpParser().parse_sre(elem)
            except (re.error, OverflowError, RecursionError):
                # We will have many strings which aren't actually regexes.
                # OverflowError is what `re` raises for an oversized repeat count.
                return
            try:
                self.output.next()
                for redos in find(parsed):
                    if redos.starriness > 2:
                        self.output.record(
                            redos,
                            elem,
                            filename=self.filename,
                        )
            except Exception:
                print(f"Error finding ReDoS: {elem} from {self.filename}")
                print(traceback.format_exc())
        elif isinstance(elem, (list, dict)):
            if id(elem) in self._seen:
                return
            self._seen.add(id(elem))
            children = elem.values() if isinstance(elem, dict) else elem
            for _elem in children:
                self.handle(_elem)


def main(get_object=get_json):
    """Command-line entry point.

    *get_object* loads one file into Python objects. With the default JSON
    loader only ``*.json`` files are matched; with any other loader
    ``*.yaml``, ``*.yml`` and ``*.json`` are.
    """
    with warnings.catch_warnings():
        # Some weird regexes emit warnings
        warnings.simplefilter("ignore", category=FutureWarning)
        warnings.simplefilter("ignore", category=DeprecationWarning)
        parser = argparse.ArgumentParser(
            description="Parse regexes out of YAML files (strings, lists and dictionary values) and scan them for ReDoS"
        )
        parser.add_argument("files", nargs="+", help="YAML files")
        parser.add_argument(
            "--glob", action="store_true", help="Glob the input filenames (**/*)"
        )
        parser.add_argument("--verbose", action="store_true", help="Verbose logging")
        parser.add_argument(
            "--ignore", action="append", help="Paths containing this string are ignored"
        )
        args = parser.parse_args()

        if args.verbose:
            logging.basicConfig(level=logging.DEBUG)

        files = file_generator(
            args.files,
            args.glob,
            ["*.json"] if get_object is get_json else ["*.yaml", "*.yml", "*.json"],
            args.ignore,
        )
        output = TextOutput()
        for filename in files:
            logging.debug(filename)
            handle_file(get_object(filename), filename, output)
        print(f"Processed {output.regexes} regexes")


def main_yaml():
    """Entry point for YAML input; needs the optional pyyaml dependency."""
    # Guard only the import, so that an ImportError raised while scanning is
    # not misreported as pyyaml being missing.
    try:
        from yaml import safe_load, YAMLError
    except ImportError:
        print(
            "Pyyaml extra required: Install regexploit with 'pip install regexploit[yaml]' or run 'pip install pyyaml'"
        )
        raise

    def get_yaml(filename: str):
        with open(filename, "rb") as f:
            try:
                return safe_load(f.read())
            except YAMLError:
                print(f"Error parsing YAML from {filename}")
                return

    main(get_object=get_yaml)


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /regexploit/found_regex.py:
# --------------------------------------------------------------------------------
from dataclasses import dataclass


@dataclass(frozen=True)
class FoundRegex:
    """A string extracted from source code that may be a regular expression."""

    lineno: int  # line of the source file the string was found on
    pattern: str  # the string itself
    flags: int  # `re` flags to parse the pattern with
    definitely_regex: bool  # True when the extractor saw it used as a regex

# --------------------------------------------------------------------------------
# /regexploit/hook.py:
# --------------------------------------------------------------------------------
# The module records any regexes used by python code for later inspection.
# Import this hook and install() before loading other modules or start python with `python -i /path/to/hook.py`
import re
import traceback

# By default, the re and traceback modules will not be hooked

regexes = set()


class CompiledRegex:
    """A pattern passed to re.compile, with the stack it was compiled from.

    Two instances are equal when pattern and flags match, so the module-level
    ``regexes`` set keeps a single entry (the first one seen) per regex.
    """

    def __init__(self, pattern, flags, traceback):
        self.pattern = pattern
        self.flags = flags
        self.traceback = traceback

    def __eq__(self, other) -> bool:
        # Must agree with __hash__. Without it equality falls back to
        # identity and the set never removes duplicates.
        if not isinstance(other, CompiledRegex):
            return NotImplemented
        return (self.pattern, self.flags) == (other.pattern, other.flags)

    def __hash__(self) -> int:
        # ignore the traceback for now
        return hash((self.pattern, self.flags))

    def __repr__(self) -> str:
        return f"({self.pattern} at {self.last_tb.filename})"

    @property
    def last_tb(self):
        """The innermost recorded stack frame, i.e. the re.compile call site."""
        return self.traceback[-1]


class WrappedRegex:
    """Stand-in for a compiled pattern that prints every matching call made on it."""

    def __init__(self, regex):
        self.regex = regex

    def __getattr__(self, name):
        # Only reached for attributes the wrapper does not define itself.
        # Forward them (pattern, flags, groups, ...) so code that inspects the
        # compiled pattern keeps working when it is handed a wrapper.
        if name == "regex":
            # Not set yet (e.g. while copying): avoid infinite recursion.
            raise AttributeError(name)
        return getattr(self.regex, name)

    def run_and_log(self, method, args, kwargs):
        """Print the pattern, the call and the calling stack, then run the call."""
        print("Pattern:", repr(self.regex.pattern[:200]))
        print(f"{method}()", *(repr(a) for a in args))
        print(*traceback.format_stack()[2:-4])
        return getattr(self.regex, method)(*args, **kwargs)

    def search(self, *args, **kwargs):
        return self.run_and_log("search", args, kwargs)

    def match(self, *args, **kwargs):
        return self.run_and_log("match", args, kwargs)

    def fullmatch(self, *args, **kwargs):
        return self.run_and_log("fullmatch", args, kwargs)

    def sub(self, *args, **kwargs):
        return self.run_and_log("sub", args, kwargs)

    def subn(self, *args, **kwargs):
        return self.run_and_log("subn", args, kwargs)

    def split(self, *args, **kwargs):
        return self.run_and_log("split", args, kwargs)

    def findall(self, *args, **kwargs):
        return self.run_and_log("findall", args, kwargs)

    def finditer(self, *args, **kwargs):
        return self.run_and_log("finditer", args, kwargs)


def get_and_clear_regexes():
    """
    Retrieves regexes that have been `re.compile`-ed and removes them from the `regexes` set.
    """
    while True:
        try:
            yield regexes.pop()
        except KeyError:
            return


def install(log_all_uses: bool = False):
    """
    Activate the hook.

    Replaces `re.compile` with a wrapper that records every pattern in
    `regexes` before compiling it. With `log_all_uses`, the compiled pattern
    is returned wrapped in a `WrappedRegex`. Calling install() again once the
    hook is active does nothing.
    """
    if not hasattr(re.compile, "_is_hook"):
        old_compile = re.compile

        def compile(pattern, flags=0):
            tb = traceback.extract_stack()[:-1]  # Ignore our hook
            regexes.add(CompiledRegex(pattern, flags, tb))
            regex = old_compile(pattern, flags)
            if log_all_uses:
                return WrappedRegex(regex)
            else:
                return regex

        compile._is_hook = True
        re.compile = compile


if __name__ == "__main__":
    install()

# --------------------------------------------------------------------------------
# /regexploit/languages/__init__.py:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/languages/__init__.py
# --------------------------------------------------------------------------------
# /regexploit/languages/csharp_string_extractor.py:
# --------------------------------------------------------------------------------
import bisect
import logging
import re

from regexploit.found_regex import FoundRegex


def make_token_regex(*token_specification):
    """Compile (name, regex) pairs into one alternation of named groups."""
    return re.compile("|".join("(?P<%s>%s)" % pair for pair in token_specification))


# One token set per scanner mode; find_regexes() switches between them.
TOKENS_BASE = make_token_regex(
    ("LINE_COMMENT", r"//"),
    ("MULTILINE_COMMENT", r"/\*"),
    ("INDENT_OR_PREPROCESSOR", r"(?:^|\n)\s*#?"),
    ("SEMI_COLON", r";"),
    ("DOUBLE_QUOTE_CHAR_LITERAL", r"'\\?\"'"),
    ("NEW_REGEX", r"new\s+[\w.]*?Regex\("),
    ("BEGIN_VERBATIM_STRING", r'(\$@|@\$?)"'),
    ("BEGIN_STRING", r'\$?"'),
)
TOKENS_LINE_COMMENT = make_token_regex(("END_COMMENT", "\n"))
TOKENS_MULTILINE_COMMENT = make_token_regex(("END_COMMENT", r"\*/"))
TOKENS_VERBATIM_STRING = make_token_regex(
    ("LITERAL_QUOTE", r'""'),
    ("END_VERBATIM_STRING", r'"'),
)
TOKENS_STRING = make_token_regex(
    ("LITERAL_BACKSLASH", r"\\\\"),
    ("LITERAL_QUOTE", r'\\"'),
    ("END_STRING", '"'),
)
TOKENS_END_NEW_REGEX = make_token_regex(("SEMI_COLON", ";"))


def find_regexes(code):
    """Yield a FoundRegex for every string literal in the C# source *code* (bytes).

    A string that directly follows ``new ...Regex(`` is yielded with
    ``definitely_regex=True`` once the closing ``;`` is reached, with flags
    derived from the option names between the string and the ``;``. Every
    other string is yielded immediately with ``definitely_regex=False``.

    The scanner is a small state machine: ``mode`` is the token set for the
    current context. Each ``break`` restarts ``finditer`` from ``cursor`` with
    the new mode; the ``for ... else`` ends the scan when the current mode
    finds no further token.
    """
    code = code.decode("utf-8", "replace")
    cursor: int = 0
    mode: re.Pattern = TOKENS_BASE
    reached_end: bool = False
    inside_new_regex: bool = False
    buffered_regex = None
    interpolated: bool = False  # TODO: interpolated $ strings
    newline_positions = make_lines(code)
    seen_line = 0

    while not reached_end:
        for mo in mode.finditer(code, cursor):
            kind = mo.lastgroup
            value = mo.group()
            # print(kind, value.replace('\n', '\\n'), code[cursor:mo.start()].replace('\n', '\\n'), code[mo.start():mo.end()].replace('\n', '\\n'))
            if kind == "END_COMMENT":
                mode = TOKENS_BASE
                cursor = mo.end()
                break
            elif kind == "LINE_COMMENT":
                mode = TOKENS_LINE_COMMENT
                cursor = mo.end()
                break
            elif kind == "INDENT_OR_PREPROCESSOR":
                if value and value[-1] == "#":  # Preprocessor
                    # Skipped to the end of the line, like a line comment.
                    mode = TOKENS_LINE_COMMENT
                    cursor = mo.end()
                    break
            elif kind == "MULTILINE_COMMENT":
                mode = TOKENS_MULTILINE_COMMENT
                cursor = mo.end()
                break
            elif kind == "SEMI_COLON":
                if inside_new_regex and buffered_regex is not None:
                    char_index, line, string = buffered_regex
                    # Text between the end of the pattern string and the ";"
                    flag_string = code[char_index : mo.start()]
                    flags = 0
                    # TODO: https://docs.microsoft.com/en-us/dotnet/api/system.text.regularexpressions.regexoptions
                    if "IgnoreCase" in flag_string:
                        flags |= re.I
                    if "Multiline" in flag_string:
                        flags |= re.M
                    if "IgnorePatternWhitespace" in flag_string:
                        flags |= re.X
                    yield FoundRegex(line, string, flags, True)
                    mode = TOKENS_BASE
                    cursor = mo.end()
                    inside_new_regex = False
                    buffered_regex = None
                    break
                inside_new_regex = False
                buffered_regex = None
            elif kind == "NEW_REGEX":
                inside_new_regex = True
                buffered_regex = None
            elif kind == "BEGIN_VERBATIM_STRING":
                interpolated = "$" in value
                mode = TOKENS_VERBATIM_STRING
                cursor = mo.end()
                break
            elif kind == "BEGIN_STRING":
                interpolated = "$" in value  # noqa: F841
                mode = TOKENS_STRING
                cursor = mo.end()
                break
            elif kind in ["END_VERBATIM_STRING", "END_STRING"]:
                string = code[cursor : mo.start()]
                if kind == "END_STRING":
                    try:
                        # unicode_escape decodes bytes as Latin-1, so encode
                        # as Latin-1 (escaping what does not fit) to keep
                        # non-ASCII characters intact.
                        string = string.encode("latin-1", "backslashreplace").decode(
                            "unicode_escape"
                        )
                    except UnicodeDecodeError:
                        logging.warning(f"Unable to process: {string}")
                        string = string.encode().decode("utf-8", "replace")
                else:
                    string = string.replace('""', '"')
                line = line_of(cursor, newline_positions, seen_line)
                seen_line = line - 1
                cursor = mo.end()
                if inside_new_regex:
                    # Hold the string until the ";" so the options can be read.
                    buffered_regex = (cursor, line, string)
                    mode = TOKENS_END_NEW_REGEX
                else:
                    flags = (
                        re.X if kind == "END_VERBATIM_STRING" and "\n" in string else 0
                    )
                    yield FoundRegex(line, string, flags, False)
                    mode = TOKENS_BASE
                break
        else:
            reached_end = True


def make_lines(code):
    """Return the index of every newline character in *code*, in order."""
    return [m.start() for m in re.finditer("\n", code)]


def line_of(character_index: int, newline_positions, seen_line: int):
    """Return the 1-based line number of *character_index*.

    *newline_positions* is the sorted list from make_lines(). *seen_line* is
    the number of leading entries known to lie before *character_index*; the
    search starts there.

    A position after the last newline is on line ``len(newline_positions) + 1``.
    The previous linear scan fell off the end of its loop in that case and
    returned a line number that was too small (0 for a two-line file).
    """
    if not newline_positions:
        return 1
    start = min(max(seen_line, 0), len(newline_positions))
    # Index of the first newline after the character == number of newlines
    # at or before it.
    return bisect.bisect_right(newline_positions, character_index, start) + 1

# --------------------------------------------------------------------------------
# /regexploit/languages/javascript.py:
# --------------------------------------------------------------------------------
import re


# There's quite a lot wrong here, but it'll do for
now. 5 | # Wow, looking back on this, this is still horrific. 6 | CARAT_FIX = re.compile(r"(?") 8 | HYPHEN_FIX_1 = re.compile(r"(? str: 13 | """Alter a javascript regex so that python can parse it. May accidentally alter meaning.""" 14 | pattern = CARAT_FIX.sub(r"\^", pattern) 15 | pattern = NAMED_GROUP_FIX.sub(r"(?P<\1>", pattern) 16 | pattern = HYPHEN_FIX_1.sub(r"\1\-", pattern) 17 | pattern = HYPHEN_FIX_2.sub(r"\1\-\2", pattern) 18 | return pattern 19 | -------------------------------------------------------------------------------- /regexploit/languages/python_node_visitor.py: -------------------------------------------------------------------------------- 1 | import ast # The python library not regexploit.ast 2 | import re 3 | from typing import List, Union 4 | 5 | from regexploit.found_regex import FoundRegex 6 | 7 | 8 | RE_FUNC_TO_FLAGS_POS = { 9 | "compile": 1, 10 | "search": 2, 11 | "match": 2, 12 | "fullmatch": 2, 13 | "findall": 2, 14 | "finditer": 2, 15 | "split": 3, 16 | "sub": 4, 17 | "subn": 4, 18 | } 19 | 20 | 21 | class PythonNodeVisitor(ast.NodeVisitor): 22 | """ 23 | Try to extract regular expressions from python code by walking the AST. 24 | """ 25 | 26 | def __init__(self): 27 | self.patterns: List[FoundRegex] = [] 28 | 29 | def maybe_pattern(self, lineno: int, pattern: str): 30 | """Check if the pattern could possibly have ReDoS: if so, add it.""" 31 | if pattern.count("*") + pattern.count("+") + pattern.count(",}") >= 2: 32 | # Could have ReDoS 33 | # Now check if it still looks like a docstring 34 | if " * * *" in pattern: 35 | return # Looks like cron (of course could just be really silly regex) 36 | if pattern.count("\n") < 5 or "?" 
def visit_body_without_docstring(
    self,
    node: Union[ast.FunctionDef, ast.AsyncFunctionDef, ast.Module, ast.ClassDef],
):
    """Visit every child of ``node`` except its leading docstring.

    The docstring is the first body statement when that statement is a bare
    string-constant expression.  Children are visited in the same order as
    ``generic_visit`` would use, and the tree is left untouched: earlier
    versions rebound ``node.body`` to drop the docstring, which silently
    altered the AST owned by the caller.  Leading non-string constants
    (bytes, numbers) are not docstrings and are now visited like any other
    statement.
    """
    docstring_stmt = None
    body = node.body
    if body and isinstance(body[0], ast.Expr):
        first_value = body[0].value
        if isinstance(first_value, ast.Constant) and isinstance(
            first_value.value, str
        ):
            docstring_stmt = body[0]  # Ignore docstring

    for child in ast.iter_child_nodes(node):
        if child is not docstring_stmt:
            self.visit(child)
class RegexFlagVisitor(ast.NodeVisitor):
    """Collect ``re`` flags referenced in an expression such as ``re.X | re.M``.

    Every attribute access of the form ``re.<NAME>`` whose name equals its
    own upper-cased form is looked up on the ``re`` module and OR-ed into
    ``flags``; names the module does not define contribute nothing.
    """

    def __init__(self):
        self.flags: int = 0

    def visit_Attribute(self, node: ast.Attribute):
        owner = node.value
        if not isinstance(owner, ast.Name) or owner.id != "re":
            return
        name = node.attr
        if name != name.upper():
            return
        # A missing attribute ORs in 0, i.e. leaves the flags unchanged.
        self.flags |= getattr(re, name, 0)

    @staticmethod
    def get_flags(node) -> int:
        """Return the combined flags found anywhere under ``node``."""
        visitor = RegexFlagVisitor()
        visitor.visit(node)
        return visitor.flags
def record(self, redos, pattern, *, filename=None, lineno=None, context=None):
    """Print one finding, preceded by a header the first time per regex.

    The header (location if ``filename`` is given, the pattern, optional
    context and a ``---`` rule) is printed only while
    ``self.first_for_regex`` is set, and clears that flag.  The finding
    itself, its complexity rating, repeated character, optional killer and
    an example string are printed on every call.
    """
    if self.first_for_regex:
        if filename:
            if lineno is None:
                print(f"Vulnerable regex in {filename}")
            else:
                print(f"Vulnerable regex in {filename} #{lineno}")
        print(f"Pattern: {pattern}")
        if context:
            print(f"Context: {context}")
        print("---")
        self.first_for_regex = False

    print(redos)

    # One star per unit of starriness, capped at ten.
    stars = "\u2b50" * min(10, redos.starriness)
    if redos.starriness > 10:
        degree = "exponential"
    elif redos.starriness > 0:
        degree = POLYNOMIAL_DEGREES[redos.starriness - 1]
    else:
        degree = "?"

    print(f"Worst-case complexity: {redos.starriness} {stars} ({degree})")
    print(f"Repeated character: {redos.repeated_character}")
    if redos.killer:
        print(f"Final character to cause backtracking: {redos.killer}")
    print(f"Example: {redos.example(self.js_flavour)}\n")
def find(sequence, flags: int = 0) -> List[Redos]:
    """
    Returns Redos objects sorted by severity (most starry first), then sorted by example_prefix (shortest first).

    Duplicate findings are dropped, keeping the first occurrence.  ``flags``
    is accepted but not read by this function.
    """
    unique: List[Redos] = []
    for candidate in find_redos(sequence):
        if candidate not in unique:
            unique.append(candidate)
    # A tuple key keeps starriness strictly dominant.  The previous key,
    # ``-starriness * 1000 + len(prefix)``, let a finding whose example
    # prefix was 1000+ characters long sort below a less starry one.
    return sorted(unique, key=lambda r: (-r.starriness, len(r.example_prefix)))
def find_redos(sequence_with_branches) -> Iterator[Redos]:
    """Yield the findings of every branch-free expansion of the input.

    A non-Sequence input is first wrapped in a single-element Sequence.
    """
    logging.debug(sequence_with_branches)
    if isinstance(sequence_with_branches, Sequence):
        root = sequence_with_branches
    else:
        # singleton like Branch (ab|cd)
        root = Sequence([sequence_with_branches])
    for branchless in expand_branches(root):
        yield from find_redos_in_branchless_sequence(branchless)
logging.debug( 125 | "Make ReDoS %d %d %s %d", 126 | sequence_start, 127 | continue_from, 128 | repeated_character, 129 | starriness, 130 | ) 131 | for current_index in range(continue_from, len(seq)): 132 | elem = seq.elements[current_index] 133 | 134 | if isinstance(elem, EndOfString): 135 | # May need to go back before the matching sequence to calculate $ 136 | elem.set_character(seq.elements[:current_index]) 137 | 138 | eoc = elem.overall_character_class() 139 | new_c = repeated_character & eoc 140 | logging.debug("%s & %s = %s (for %s)", repeated_character, eoc, new_c, elem) 141 | 142 | # Handle optional elements 143 | if elem.minimum_length == 0: 144 | if elem.starriness: 145 | # If we have a*, we branch and try with and without it 146 | if new_c != repeated_character: 147 | # Only branch if we have [ab]a* : if we have aa* or a[ab]* then the character class doesn't change 148 | # Try without this element 149 | yield from make_redos( 150 | seq, 151 | sequence_start, 152 | current_index + 1, 153 | repeated_character, 154 | starriness, 155 | ) 156 | else: 157 | continue # Don't care about finite repeats (abc)? or a{,4} 158 | 159 | # print(repeated_character, "+", elem.overall_character_class(), "->", new_c) 160 | if new_c is None: 161 | # This element will force backtracking as it's incompatible with `repeated_character` 162 | if elem.minimum_length and starriness > 2: 163 | yield redos_found( 164 | seq, 165 | sequence_start, 166 | current_index, 167 | repeated_character, 168 | starriness, 169 | None, 170 | ) 171 | return 172 | 173 | starriness += elem.starriness 174 | repeated_character = new_c 175 | character_history.append(new_c) 176 | 177 | # Everything matched! 
def redos_found(
    seq: Sequence,
    start: int,
    backtrack_at: int,
    repeated_character: Character,
    starriness: int,
    killer: Optional[Character],
) -> Redos:
    """Build the Redos record for a finding inside ``seq``.

    Elements before ``start`` form the prefix sequence; elements from
    ``start`` up to and including ``backtrack_at`` form the redos sequence.
    """
    # TODO: Try to include some skipped optional parts (like `?`) just to make it nicer
    logging.debug("ReDoS found")
    elements = seq.elements
    prefix = Sequence(elements[:start])
    vulnerable = Sequence(elements[start : backtrack_at + 1])
    return Redos(
        starriness=starriness,
        prefix_sequence=prefix,
        redos_sequence=vulnerable,
        repeated_character=repeated_character,
        killer=killer,
    )
long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="regexploit", 8 | version="1.0.0", 9 | author="Ben Caller :: Doyensec", 10 | author_email="REMOVETHISPREFIX.ben@doyensec.com", 11 | description="Find regular expressions vulnerable to ReDoS", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/doyensec/regexploit", 15 | packages=setuptools.find_packages(), 16 | include_package_data=True, 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | 'License :: OSI Approved :: Apache Software License', 20 | "Operating System :: OS Independent", 21 | ], 22 | python_requires=">=3.8", 23 | extras_require={ 24 | "yaml": ['pyyaml>=5.3.1'] 25 | }, 26 | scripts=[ 27 | # Easy-install uses imports, so can miss findings 28 | "regexploit/bin/regexploit-python-env", 29 | ], 30 | entry_points={ 31 | "console_scripts": [ 32 | "regexploit=regexploit.bin.regexploit:main", 33 | "regexploit-js=regexploit.bin.regexploit_js:main", 34 | "regexploit-py=regexploit.bin.regexploit_python_ast:main", 35 | "regexploit-yaml=regexploit.bin.regexploit_yaml:main_yaml", 36 | "regexploit-json=regexploit.bin.regexploit_yaml:main", 37 | "regexploit-csharp=regexploit.bin.regexploit_csharp:main", 38 | ], 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /tests/test.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text.RegularExpressions; 3 | 4 | public class Example 5 | { 6 | #line 1 "C:\Users\test" 7 | public static void Main() 8 | { 9 | /****"@" 10 | ; " @ '"\ 11 | */ 12 | string input = "Not a regex*****"; 13 | string regex = "\\w+_[\\w\"]+_\\w+w"; 14 | /**/ 15 | string pattern = @"x""\d+.\d+.\d+!"; 16 | char c = '"'; 17 | char d = '\"'; 18 | Regex r = new Regex(@"\b(?\w+)\s+x\b", RegexOptions.IgnoreCase); 19 | Regex r = new Regex( 20 | "\\b(?\\w+)\\s+\\b", 21 | // What? 
def test_whole_string():
    """After ``a*a*`` the end-of-string character is expected to be ``[a]``."""
    preceding = from_regex(r"a*a*").elements
    end_anchor = EndOfString()
    end_anchor.set_character(preceding)
    assert end_anchor.character == from_regex(r"[a]")
def test_literal_mixed_and():
    """Intersecting a literal set with a negated set works in either order."""
    expected = "[az]"
    operand_orders = (
        ("[abcz]", "[^bcd]"),
        ("[^bcd]", "[abcz]"),
    )
    for left, right in operand_orders:
        assert from_regex(left) & from_regex(right) == from_regex(expected)
def test_category_category_covers_all():
    """A class holding a category and its complement reports ``is_any``."""
    for pattern in (r"[\s\S]", r"[\Dd\d]"):
        assert from_regex(pattern).is_any is True
@pytest.mark.parametrize(
    "category_identifier,category_enum,not_character",
    [
        ("w", Category.WORD, "-"),
        ("W", Category.NOT_WORD, "_"),
        ("W", Category.NOT_WORD, "9"),
        ("s", Category.SPACE, "\x00"),
        ("S", Category.NOT_SPACE, "\f"),
    ],
)
def test_not_categories(
    category_identifier: str, category_enum: Category, not_character: str
):
    """Characters outside a category neither intersect it nor are contained by it."""
    codepoint = ord(not_character)
    expanded = from_regex("\\" + category_identifier).expand_categories()
    assert expanded & Character.LITERAL(codepoint) is None
    assert not category_enum.contains(codepoint)
def test_handle_file():
    """Processing tests/test.cs calls ``next`` five times and ``record`` three times."""
    sink = Mock(spec=["next", "record"])
    handle_file("tests/test.cs", sink)
    observed = (sink.next.call_count, sink.record.call_count)
    assert observed == (5, 3)
def patterns_from_code(code: str):
    """Dedent and parse ``code``, then return the patterns a PythonNodeVisitor collects."""
    visitor = PythonNodeVisitor()
    tree = ast.parse(textwrap.dedent(code))
    visitor.visit(tree)
    return visitor.patterns
def test_backtrack():
    """The top finding for ``[abc]+\\w+[ab]+a`` needs a ``[^a]`` killer."""
    findings = find_redos(r"[abc]+\w+[ab]+a")
    worst = findings[0]
    assert worst.starriness == 3
    assert len(worst.redos_sequence) == 4
    expected_killer = from_regex("[^a]")
    assert worst.killer == expected_killer
    assert worst.example() == "'b' * 3456"
def test_dollar():
    """Repeated ``b*`` before ``$`` is reported with ``b`` repeated and ``[^b]`` as killer."""
    findings = find_redos(r"^a+(b*b*b*)$")
    worst = findings[0]
    assert worst.starriness == 3
    expected_repeat = from_regex(r"b")
    expected_killer = from_regex(r"[^b]")
    assert worst.repeated_character == expected_repeat
    assert worst.killer == expected_killer
151 | rs = find_redos(r"\(\s*(<)?([\s\S]*?)(?(2)>)(?:\s+'([\s\S]*?)')?\s*\)") 152 | r = rs[0] 153 | assert r.starriness == 3 154 | assert r.repeated_character == from_regex(r"\s") 155 | assert r.example_prefix == "(" 156 | assert not r.killer 157 | 158 | 159 | def test_backtrack_repeated_char(): 160 | # (' ' * 3456 + '\t') won't backtrack because of the .* after 161 | rs = find_redos(r"#\s*\s*\s*([^ \t]+)(.*)$") 162 | r = rs[0] 163 | assert r.starriness == 3 164 | assert r.repeated_character == from_regex(r"\s") 165 | assert r.example_prefix == "#" 166 | assert r.killer == from_regex(r"[ \t]") 167 | assert ( 168 | r.example() == "'#' + ' ' * 3456" 169 | ), "Merge repeated character and killer in example if possible" 170 | 171 | 172 | @pytest.mark.parametrize( 173 | "r", 174 | [ 175 | r"a+", 176 | r"a(aa)+a", 177 | r"a*", 178 | r"\w*b?c*(def|gh+i|$|\b||)+", 179 | ], 180 | ) 181 | def test_groupref(r): 182 | rs = find_redos(fr"({r})(a+)\1(a+)b") 183 | r = rs[0] 184 | assert r.starriness == 3 185 | assert r.repeated_character == from_regex(r"a") 186 | 187 | 188 | def test_groupref_not_starry_itself(): 189 | rs = find_redos(r"(a+)(a+)\1b") 190 | assert not rs 191 | 192 | 193 | def test_groupref_false_positive(): 194 | # from codemirror 195 | rs = find_redos(r"^([*\-_])(?:\s*\1){2,}\s*$") 196 | assert not rs 197 | 198 | 199 | def test_optional_starry(): 200 | # ua-parser CFNetwork 201 | rs = find_redos(r"(\d+).?(\d+)?.?(\d+)?.?(\d+)?C") 202 | r = rs[0] 203 | assert r.starriness == 4 204 | assert r.repeated_character == from_regex(r"\d") 205 | 206 | 207 | def test_negative_lookahead(): 208 | # The final (?!c) isn't actually doing anything yet 209 | rs = find_redos(r"[abc]+(?!c)[abc]+(?!b)([abc]+[abc])(?!c)[abc]*x") 210 | r = rs[0] 211 | assert r.starriness == 4 212 | assert r.repeated_character == from_regex(r"a") 213 | 214 | 215 | @pytest.mark.parametrize( 216 | "r", 217 | [ 218 | r"(a?b+)+c", 219 | r"(x*[ab]*x?[bc]*x?)*c", 220 | r"(x?[ab]+x?[bc]+\w*x?)*c", 221 | ], 222 
| ) 223 | def test_regexlib_sequence_exponential(r): 224 | rs = find_redos(r) 225 | r = rs[0] 226 | assert r.starriness > 10 227 | assert r.repeated_character == from_regex(r"b") 228 | assert r.killer is None 229 | 230 | 231 | def test_dt_branch_exponential(): 232 | rs = find_redos(r"a(z|\w*b)*d") 233 | r = rs[0] 234 | assert r.starriness == 11 235 | assert r.repeated_character == from_regex(r"b") 236 | assert r.killer is None 237 | 238 | 239 | def test_node_forge_false_positive(): 240 | rs = find_redos(r"\s*([^=]*)=?([^;]*)(;|$)") 241 | assert not rs 242 | 243 | 244 | def test_ruby_maruku_false_positive(): 245 | rs = find_redos(r"(\S.*\S)*\s*") 246 | assert not rs 247 | 248 | 249 | def test_real_httplib2(): 250 | rs = find_redos( 251 | r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(? Union[Repeat, Character]: 10 | (parsed_char,) = sre_parse(pattern) 11 | repeat = SreOpParser().parse_op(*parsed_char) 12 | return repeat 13 | 14 | 15 | def test_star(): 16 | r = from_regex(r"(abc)*") 17 | assert r.starriness == 1 18 | assert r.minimum_length == 0 19 | assert r.exact_character_class() is None 20 | 21 | 22 | def test_question(): 23 | r = from_regex(r"(abc)?") 24 | assert r.starriness == 0 25 | assert r.minimum_length == 0 26 | assert r.maximum_repeats == 1 27 | assert r.exact_character_class() is None 28 | 29 | 30 | def test_plus(): 31 | r = from_regex(r"(?:abc)+") 32 | assert r.starriness == 1 33 | assert r.minimum_length == 3 34 | assert r.exact_character_class() is None 35 | 36 | 37 | def test_character_class(): 38 | r = from_regex(r"a{4,}") 39 | assert r.starriness == 1 40 | assert r.minimum_length == 4 41 | assert r.exact_character_class() == from_regex(r"a") 42 | 43 | 44 | def test_subsequence_character_class(): 45 | r = from_regex(r"(a?b+)*") 46 | assert r.starriness == 11 47 | assert r.minimum_length == 0 48 | assert r.exact_character_class() is None 49 | assert r.overall_character_class() is None 50 | inner_repeats = 
list(r.repeat.matching_repeats()) 51 | assert len(inner_repeats) == 1 52 | assert inner_repeats[0].overall_character_class() == from_regex(r"b") 53 | 54 | 55 | def test_negative_lookahead_infinite(): 56 | r = SreOpParser().parse_sre(r"(?!b)[a-d]+") 57 | assert r == SreOpParser().parse_sre(r"[acd][a-d]*") 58 | 59 | 60 | def test_negative_lookahead_finite(): 61 | r = SreOpParser().parse_sre(r"(?!b)[a-d]{1,3}") 62 | assert r == SreOpParser().parse_sre(r"[acd][a-d]{0,2}") 63 | 64 | 65 | def test_exponential_starriness(): 66 | r = from_regex(r"(?:(?:a{4,})*)+") 67 | assert r.starriness == 111 # ((1 * 10) * 10) + 1 68 | assert r.minimum_length == 0 69 | assert r.exact_character_class() == from_regex(r"a") 70 | 71 | 72 | def test_exponential_starriness2(): 73 | r = from_regex(r"(?:(?:a{4,}bc+)*)+") 74 | assert r.starriness == 211 # ((2 * 10) * 10) + 1 75 | assert r.minimum_length == 0 76 | assert r.exact_character_class() is None 77 | --------------------------------------------------------------------------------