├── .flake8
├── .github
    └── workflows
    │   └── main.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── regexploit
    ├── __init__.py
    ├── ast
    │   ├── __init__.py
    │   ├── at.py
    │   ├── branch.py
    │   ├── categories.py
    │   ├── char.py
    │   ├── groupref.py
    │   ├── ranges.py
    │   ├── repeat.py
    │   ├── sequence.py
    │   └── sre.py
    ├── bin
    │   ├── __init__.py
    │   ├── files.py
    │   ├── javascript
    │   │   ├── .eslintrc.yml
    │   │   ├── cli.js
    │   │   ├── find.js
    │   │   ├── index.js
    │   │   ├── package-lock.json
    │   │   ├── package.json
    │   │   └── test
    │   │   │   └── test.js
    │   ├── regexploit-python-env
    │   ├── regexploit.py
    │   ├── regexploit_csharp.py
    │   ├── regexploit_js.py
    │   ├── regexploit_python_ast.py
    │   └── regexploit_yaml.py
    ├── found_regex.py
    ├── hook.py
    ├── languages
    │   ├── __init__.py
    │   ├── csharp_string_extractor.py
    │   ├── javascript.py
    │   └── python_node_visitor.py
    ├── output
    │   ├── __init__.py
    │   └── text.py
    └── redos.py
├── requirements-dev.txt
├── setup.py
└── tests
    ├── test.cs
    ├── test_at.py
    ├── test_character.py
    ├── test_csharp.py
    ├── test_javascript.py
    ├── test_python_ast.py
    ├── test_redos.py
    └── test_repeat.py


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | # Ignored: E203 whitespace before ':', E501 line length,
3 | # E722 do not use bare 'except', W503 line break before binary operator
4 | ignore =
5 |   E203, E501,
6 |   E722, W503
7 | application-import-names=regexploit
8 | import-order-style=pycharm
9 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | name: Python package
 2 | 
 3 | on: [push]  # every push runs the test jobs; build-python additionally requires a tag ref
 4 | 
 5 | jobs:
 6 |   test-python:  # lint and unit-test the Python package on each matrix version
 7 | 
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       matrix:
11 |         python-version: ["3.8", "3.9"]  # quoted: a bare 3.10 would be parsed as the float 3.1
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v2
15 |     - name: Set up Python ${{ matrix.python-version }}
16 |       uses: actions/setup-python@v2
17 |       with:
18 |         python-version: ${{ matrix.python-version }}
19 |     - name: Install dependencies  # each requirements file is optional and only installed if present
20 |       run: |
21 |         python -m pip install --upgrade pip
22 |         if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; fi
23 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
24 |     - name: Lint with black and flake8
25 |       run: |
26 |         black --check regexploit tests
27 |         flake8 regexploit tests
28 |     - name: Test  # editable install first so pytest imports the checked-out package
29 |       run: |
30 |         pip install -e .
31 |         pytest
32 | 
33 |   test-node:  # test the bundled NodeJS regex extractor
34 | 
35 |     runs-on: ubuntu-latest
36 |     defaults:
37 |       run:
38 |         working-directory: regexploit/bin/javascript  # every `run` step below executes from this directory
39 | 
40 |     strategy:
41 |       matrix:
42 |         node-version: [12.x, 15.x]
43 | 
44 |     steps:
45 |     - uses: actions/checkout@v2
46 |     - name: Use Node.js ${{ matrix.node-version }}
47 |       uses: actions/setup-node@v1
48 |       with:
49 |         node-version: ${{ matrix.node-version }}
50 |     - run: npm install
51 |     - run: npm run build --if-present  # --if-present: skipped without error when no build script exists
52 |     - run: npm test
53 |       env:
54 |         CI: true
55 |     - run: npm run lint --if-present
56 | 
57 | 
58 |   build-python:  # build sdist + wheel and publish to PyPI; only for pushed tags, after both test jobs pass
59 | 
60 |     needs: [test-python, test-node]
61 |     runs-on: ubuntu-latest
62 |     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
63 |     steps:
64 |     - uses: actions/checkout@v2
65 |     - name: Set up Python 3.9
66 |       uses: actions/setup-python@v2
67 |       with:
68 |         python-version: "3.9"  # quoted so the version is a string, not a float
69 |     - name: Build
70 |       run: |
71 |         pip install wheel
72 |         python setup.py sdist bdist_wheel
73 |     - name: Upload artifacts
74 |       uses: actions/upload-artifact@v4  # v1/v2 of the artifact actions have been retired by GitHub
75 |       with:
76 |         name: build
77 |         path: dist/*
78 |     - name: Publish package
79 |       uses: pypa/gh-action-pypi-publish@release/v1  # the action's master branch has been sunset
80 |       with:
81 |         user: __token__
82 |         password: ${{ secrets.PYPI_API_TOKEN }}
83 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .*
 2 | !.flake8
 3 | !.github
 4 | __pycache__
 5 | *.egg-info
 6 | *.log
 7 | node_modules
 8 | build
 9 | dist
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2020 Doyensec LLC.
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include regexploit/bin/javascript/*.js
4 | include regexploit/bin/javascript/*.json
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Regexploit
  2 | 
  3 | ![regexploit_logo](https://user-images.githubusercontent.com/6027823/110626827-7f46db80-81a1-11eb-9a3d-3e3376bd9a4f.png)
  4 | 
  5 | Find regexes which are vulnerable to Regular Expression Denial of Service (ReDoS).
  6 | 
  7 | **More info on [the Doyensec blog](https://blog.doyensec.com/2021/03/11/regexploit.html)**
  8 | 
  9 | Many default regular expression parsers have unbounded worst-case complexity. Regex matching may be quick when presented with a matching input string. However, certain non-matching input strings can make the regular expression matcher go into crazy backtracking loops and take ages to process. This can cause denial of service, as the CPU will be stuck trying to match the regex.
 10 | 
 11 | This tool is designed to:
 12 | *  find regular expressions which are vulnerable to ReDoS
 13 | *  give an example malicious string which will cause catastrophic backtracking
 14 | 
 15 | ## Worst-case complexity
 16 | 
 17 | This reflects the complexity of the regular expression matcher's backtracking procedure with respect to the length of the entered string.
 18 | 
 19 | Cubic complexity here means that if the vulnerable part of the string is doubled in length, the execution time should be about 8 times longer (2^3).
 20 | For exponential ReDoS with starred stars e.g. `(a*)*$` a fudge factor is used and the complexity will be greater than 10.
 21 | 
 22 | For exploitability, cubic complexity or higher is typically required unless truly giant strings are allowed as input.
 23 | 
 24 | ## Example
 25 | 
 26 | Run `regexploit` and enter the regular expression `v\w*_\w*_\w*$` at the command line.
 27 | 
 28 | ```
 29 | $ regexploit
 30 | v\w*_\w*_\w*$
 31 | Pattern: v\w*_\w*_\w*$
 32 | ---
 33 | Worst-case complexity: 3 ⭐⭐⭐ (cubic)
 34 | Repeated character: [5f:_]
 35 | Final character to cause backtracking: [^WORD]
 36 | Example: 'v' + '_' * 3456 + '!'
 37 | ```
 38 | 
 39 | The part `\w*_\w*_\w*` contains three overlapping repeating groups (\w matches letters, digits *and underscores*). As shown in the line `Repeated character: [5f:_]`, a long string of `_` (0x5f) will match this section in many different ways. The worst-case complexity is 3 as there are 3 infinitely repeating groups. An example to cause ReDoS is given: it consists of the required prefix `v`, a long string of `_` and then a `!` (non-word character) to cause backtracking. Not all ReDoSes require a particular character at the end, but in this case, a long string of `_` will match the regex successfully and won't backtrack. The line `Final character to cause backtracking: [^WORD]` shows that a non-matching character (not a word character) is required at the end to prevent matching and cause ReDoS.
 40 | 
 41 | As another example, install a module version vulnerable to ReDoS such as `pip install ua-parser==0.9.0`.
 42 | To scan the installed python modules run `regexploit-python-env`.
 43 | 
 44 | ```
 45 | Importing ua_parser.user_agent_parser
 46 | Vulnerable regex in /somewhere/.env/lib/python3.9/site-packages/ua_parser/user_agent_parser.py #183
 47 | Pattern: \bSmartWatch *\( *([^;]+) *; *([^;]+) *;
 48 | Context: self.user_agent_re = re.compile(self.pattern)
 49 | ---
 50 | Worst-case complexity: 3 ⭐⭐⭐
 51 | Repeated character: [20]
 52 | Example: 'SmartWatch(' + ' ' * 3456
 53 | 
 54 | Worst-case complexity: 3 ⭐⭐⭐
 55 | Repeated character: [20]
 56 | Example: 'SmartWatch(0;' + ' ' * 3456
 57 | 
 58 | Vulnerable regex in /somewhere/.env/lib/python3.9/site-packages/ua_parser/user_agent_parser.py #183
 59 | Pattern: ; *([^;/]+) Build[/ ]Huawei(MT1-U06|[A-Z]+\d+[^\);]+)[^\);]*\)
 60 | Context: self.user_agent_re = re.compile(self.pattern)
 61 | ---
 62 | Worst-case complexity: 3 ⭐⭐⭐
 63 | Repeated character: [[0-9]]
 64 | Example: ';0 Build/HuaweiA' + '0' * 3456
 65 | ...
 66 | ```
 67 | 
 68 | For each vulnerable regular expression it prints one or more malicious strings to trigger ReDoS. Setting your user agent to `;0 Build/HuaweiA000000000000000...` and browsing a website using an old version of ua-parser may cause the server to take a long time to process your request, probably ending in status 502.
 69 | 
 70 | # Installation
 71 | 
 72 | Python 3.8+ is required. To extract regexes from JavaScript / TypeScript code, NodeJS 12+ is also required.
 73 | 
 74 | Optionally make a virtual environment
 75 | 
 76 | ```bash
 77 | python3 -m venv .env
 78 | source .env/bin/activate
 79 | ```
 80 | 
 81 | Now actually install with pip
 82 | 
 83 | ```
 84 | pip install regexploit
 85 | ```
 86 | 
 87 | # Usage
 88 | 
 89 | ## Regexploit with a list of regexes
 90 | 
 91 | Enter regular expressions via stdin (one per line) into `regexploit`.
 92 | 
 93 | ```bash
 94 | regexploit
 95 | ```
 96 | 
 97 | or via a file
 98 | 
 99 | ```bash
100 | cat myregexes.txt | regexploit
101 | ```
102 | 
103 | ## Extract regexes automatically
104 | 
105 | There is built-in support for parsing regexes out of Python, JavaScript, TypeScript, C#, YAML and JSON.
106 | ### Python code
107 | 
108 | Parses Python code (without executing it) via the AST to find regexes. The regexes are then analysed for ReDoS.
109 | 
110 | ```bash
111 | regexploit-py my-project/
112 | regexploit-py "my-project/**/*.py" --glob
113 | ```
114 | ### Javascript / Typescript
115 | 
116 | This will use the bundled NodeJS package in `regexploit/bin/javascript` which parses your JavaScript as an AST with [eslint](https://github.com/typescript-eslint/typescript-eslint/tree/master/packages/parser) and prints out all regexes.
117 | 
118 | Those regexes are fed into the python ReDoS finder.
119 | 
120 | ```bash
121 | regexploit-js my-module/my-file.js another/file.js some/folder/
122 | regexploit-js "my-project/node_modules/**/*.js" --glob
123 | ```
124 | 
125 | N.B. there are differences between javascript and python regex parsing so there may be some errors. I'm [not sure I want](https://hackernoon.com/the-madness-of-parsing-real-world-javascript-regexps-d9ee336df983) to write a JS regex AST!
126 | 
127 | ### Python imports
128 | 
129 | Search for regexes in all the python modules currently installed in your path / env. This means you can `pip install` whatever modules you are interested in and they will be analysed. CPython code is included.
130 | 
131 | ```bash
132 | regexploit-python-env
133 | ```
134 | 
135 | N.B. this doesn't parse the python code to an AST and will only find regexes compiled automatically on module import. Modules are actually imported, **so code in the modules will be executed**. This is helpful for finding regexes which are built up from smaller strings on load e.g. [CVE-2021-25292 in Pillow](https://github.com/python-pillow/Pillow/commit/3bce145966374dd39ce58a6fc0083f8d1890719c)
136 | 
137 | ### JSON / YAML
138 | 
139 | Yaml support requires pyyaml, which can be installed with `pip install regexploit[yaml]`.
140 | 
141 | ```bash
142 | regexploit-json *.json
143 | regexploit-yaml *.yaml
144 | ```
145 | ### C# (.NET)
146 | 
147 | ```bash
148 | regexploit-csharp something.cs
149 | ```
150 | # :trophy: Bugs reported :trophy:
151 | 
152 | * [CVE-2020-5243: uap-core](https://github.com/ua-parser/uap-core/security/advisories/GHSA-cmcx-xhr8-3w9p) affecting uap-python, [uap-ruby](https://github.com/ua-parser/uap-ruby/security/advisories/GHSA-pcqq-5962-hvcw), etc. (User-Agent header parsing)
153 | * [CVE-2020-8492: cpython's urllib.request](https://github.com/python/cpython/commit/0b297d4ff1c0e4480ad33acae793fbaf4bf015b4) (WWW-Authenticate header parsing)
154 | * [CVE-2021-21236: CairoSVG](https://github.com/advisories/GHSA-hq37-853p-g5cf) (SVG parsing)
155 | * [CVE-2021-21240: httplib2](https://github.com/httplib2/httplib2/security/advisories/GHSA-93xj-8mrv-444m) (WWW-Authenticate header parsing)
156 | * [CVE-2021-25292: python-pillow](https://github.com/python-pillow/Pillow/commit/3bce145966374dd39ce58a6fc0083f8d1890719c) (PDF parsing)
157 | * [CVE-2021-26813: python-markdown2](https://github.com/trentm/python-markdown2/pull/387) (Markdown parsing)
158 | * [CVE-2021-27290: npm/ssri](https://doyensec.com/resources/Doyensec_Advisory_ssri_redos.pdf) (SRI parsing)
159 | * [CVE-2021-27291: pygments](https://github.com/pygments/pygments/commit/2e7e8c4a7b318f4032493773732754e418279a14) lexers for ADL, CADL, Ceylon, Evoque, Factor, Logos, Matlab, Octave, ODIN, Scilab & Varnish VCL (Syntax highlighting)
160 | * [CVE-2021-27292: ua-parser-js](https://github.com/faisalman/ua-parser-js/commit/809439e20e273ce0d25c1d04e111dcf6011eb566) (User-Agent header parsing)
161 | * [CVE-2021-27293: RestSharp](https://github.com/restsharp/RestSharp/issues/1556) (JSON deserialisation in a .NET C# package)
162 | * [bpo-38804: cpython's http.cookiejar](https://github.com/python/cpython/pull/17157) (Set-Cookie header parsing)
163 | * [SimpleCrawler (archived)](https://doyensec.com/resources/Doyensec_Advisory_simplecrawler_redos.pdf) (HTML parsing)
164 | * [CVE-2021-28092: is-svg](https://github.com/sindresorhus/is-svg/commit/01f8a087fab8a69c3ac9085fbb16035907ab6a5b) (SVG parsing)
165 | * [nuget.org, NuGetGallery](https://github.com/NuGet/NuGetGallery/commit/25d2d3b32b2d9f0b1ca6e0a105b0210c2c4820f4) and [NuGet.Client](https://github.com/NuGet/NuGet.Client/commit/a0671e946ce71dc59def5cc8a67c6457d66f33bf) (Parsing NuGet package IDs)
166 | * [markdown (python)](https://github.com/Python-Markdown/markdown/pull/1130) (Markdown parsing)
167 | * [ansi-html (nodejs)](https://github.com/Tjatse/ansi-html/issues/19) (ANSI parsing)
168 | * Plus unpublished bugs in a handful of pypi, npm, ruby and nuget packages
169 | 
170 | ## Credits
171 | 
172 | This tool was created by Ben Caller of [Doyensec LLC](https://www.doyensec.com) during research time.
173 | 
174 | ![alt text](https://doyensec.com/images/logo.svg "Doyensec Logo")
175 | 


--------------------------------------------------------------------------------
/regexploit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/__init__.py


--------------------------------------------------------------------------------
/regexploit/ast/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/ast/__init__.py


--------------------------------------------------------------------------------
/regexploit/ast/at.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import List, Optional
 3 | 
 4 | from regexploit.ast.char import Character
 5 | from regexploit.ast.repeat import InfiniteRepeat, Repeat
 6 | 
 7 | 
 8 | @dataclass
 9 | class EndOfString:  # AST node whose repr is "$" followed by the stored character class
10 |     character: Optional[Character] = None  # reset and recomputed by set_character()
11 | 
12 |     @property
13 |     def starriness(self):  # always 0 for this node
14 |         return 0
15 | 
16 |     @property
17 |     def minimum_length(self):
18 |         return 1  # Meaningless really here
19 | 
20 |     def overall_character_class(self):  # returns the stored character unchanged (may be None)
21 |         return self.character
22 | 
23 |     def __repr__(self) -> str:
24 |         return f"${self.character}"
25 | 
26 |     def __and__(self, other: Character) -> Optional[Character]:  # intersects `other` with the stored character
27 |         return other & self.character
28 | 
29 |     def example(self):
30 |         return "\n"  # ish
31 | 
32 |     def set_character(self, previous_elems: List):
33 |         """
34 |         To force backtracking, the dollar will have to not match any previous groups until a mandatory group.
35 |         This can perhaps be made more lenient.
36 | 
37 |         To cause backtracking on a long string of a's:
38 |         a*a*a*$ -> Any [^a]
39 |         [ab]+a*a*a*$ -> Any [^ab] (baaaaaaaaaaaab does not backtrack)
40 |         b+a*a*a*$ -> Any [^a]
41 |         .a*a*a*$ -> Any [^a]
42 |         .+a*a*a*$ -> Cannot backtrack because everything gets matched by .+ :(
43 |         """
44 |         self.character = None  # start empty; classes of preceding elements are unioned in below
45 |         for elem in reversed(previous_elems):  # last element first; presumably previous_elems is in pattern order
46 |             if elem.minimum_length > 0 and not isinstance(elem, InfiniteRepeat):
47 |                 return  # xa*[ab]*a*$ -> [ab]
48 |             c = (
49 |                 elem.maximal_character_class()
50 |                 if isinstance(elem, Repeat)
51 |                 else elem.overall_character_class()
52 |             )
53 |             if c:
54 |                 if elem.minimum_length > 0 and (self.character & c) != self.character:  # NOTE(review): self.character can still be None here; presumably Character handles None in reflected & and | - confirm in char.py
55 |                     # c is smaller than self.character (i.e. c is not an ANY)
56 |                     # x+a*[ab]*a*$ -> [ab]
57 |                     return
58 |                 self.character |= c
59 | 


--------------------------------------------------------------------------------
/regexploit/ast/branch.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Iterator, List, Optional
 3 | 
 4 | from regexploit.ast.at import EndOfString
 5 | from regexploit.ast.char import Character
 6 | from regexploit.ast.repeat import FiniteRepeat, InfiniteRepeat
 7 | from regexploit.ast.sequence import Sequence
 8 | 
 9 | 
10 | @dataclass(frozen=True)
11 | class Branch:  # alternation node; repr is "BR( a | b )" with a trailing "?" when optional
12 |     branches: List  # the alternatives; make_branch passes only the non-empty ones
13 |     optional: bool = False  # make_branch sets this when an alternative was dropped, e.g. (ab|cd|)
14 | 
15 |     def get_branches(self) -> Iterator:  # yields every branch, then None if the node is optional
16 |         for b in self.branches:
17 |             yield b
18 |         if self.optional:
19 |             yield None
20 | 
21 |     @property
22 |     def starriness(self) -> int:  # largest starriness among the branches
23 |         return max(b.starriness for b in self.branches)
24 | 
25 |     @property
26 |     def minimum_length(self) -> int:  # 0 when optional, otherwise the smallest branch minimum
27 |         return 0 if self.optional else min(b.minimum_length for b in self.branches)
28 | 
29 |     def overall_character_class(self) -> Optional[Character]:  # intersection of all branch classes, or None
30 |         c = Character.ANY()
31 |         for b in self.branches:
32 |             c &= b.overall_character_class()
33 |             if c is None:  # stop as soon as the running intersection is None
34 |                 return None
35 |         return c
36 | 
37 |     def maximal_character_class(self):
38 |         return None  # Really?
39 | 
40 |     def example(self) -> str:  # "" when optional, otherwise the first branch's example
41 |         if self.optional:
42 |             return ""
43 |         return self.branches[0].example()
44 | 
45 |     def __len__(self) -> int:  # number of branches, plus one if optional
46 |         return len(self.branches) + int(self.optional)
47 | 
48 |     def __repr__(self) -> str:
49 |         middle = " | ".join(str(b) for b in self.branches)
50 |         return f"BR( {middle} ){'?' if self.optional else ''}"
51 | 
52 |     def matching_repeats(self):  # generator over repeats found in branches with starriness > 0
53 |         for b in self.branches:
54 |             if b.starriness > 0:
55 |                 if isinstance(b, InfiniteRepeat):
56 |                     yield b
57 |                 elif isinstance(b, Sequence):  # other branch types (e.g. a nested Branch) yield nothing
58 |                     yield from b.matching_repeats()
60 | 
def make_branch(branches: List):
    """
    Build the simplest node that represents an alternation.

    A single alternative is returned unchanged. Empty alternatives and
    end-of-string anchors are dropped and make the result optional.
    Alternatives that are all plain characters collapse into one character
    class: (a|b) -> [ab], (a|b|) -> [ab]?. Returns None when nothing is left.
    """
    if len(branches) == 1:
        return branches[0]

    kept = []
    for candidate in branches:
        if candidate and not isinstance(candidate, EndOfString):
            kept.append(candidate)
    if not kept:
        return None

    # (ab|cd|) -> (ab|cd)?
    optional = len(kept) < len(branches)

    for candidate in kept:
        if not isinstance(candidate, Character):
            return Branch(kept, optional)

    # (a|b) -> [ab], (a|b|) -> [ab]?
    merged = None
    for candidate in kept:
        merged |= candidate
    return FiniteRepeat(merged, 0, 1) if optional else merged
82 | 


--------------------------------------------------------------------------------
/regexploit/ast/categories.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import unicodedata
 3 | from enum import Enum, auto
 4 | from typing import Set
 5 | 
 6 | 
 7 | class Category(Enum):
 8 |     DIGIT = auto()
 9 |     NOT_DIGIT = auto()
10 |     WORD = auto()
11 |     NOT_WORD = auto()
12 |     SPACE = auto()
13 |     NOT_SPACE = auto()
14 | 
15 |     @property
16 |     def is_positive(self) -> bool:
17 |         return not self.name.startswith("NOT_")
18 | 
19 |     def negate(self) -> "Category":
20 |         if self.is_positive:
21 |             return Category[f"NOT_{self.name}"]
22 |         else:
23 |             return Category[self.name[4:]]
24 | 
25 |     def example(self) -> str:
26 |         return EXAMPLE_FOR_CAT[self]
27 | 
28 |     def contains(self, literal: int) -> bool:
29 |         c = chr(literal)
30 |         unicat = unicodedata.category(c)
31 |         if self is Category.DIGIT:
32 |             return unicat == "Nd"
33 |         if self is Category.NOT_DIGIT:
34 |             return unicat != "Nd"
35 |         if self is Category.WORD:
36 |             return (
37 |                 unicat[0] == "L" or unicat == "Nd" or literal == 0x5F
38 |             )  # underscore is a word character
39 |         if self is Category.NOT_WORD:
40 |             return unicat[0] != "L" and unicat != "Nd" and literal != 0x5F
41 |         if self is Category.SPACE:
42 |             return unicat == "Zs" or c in (" ", "\n", "\t", "\r", "\f", "\v")
43 |         if self is Category.NOT_SPACE:
44 |             return unicat != "Zs" and c not in (" ", "\n", "\t", "\r", "\f", "\v")
45 | 
46 | 
# Optional cache of precomputed code points, keyed by category.
CATS = {}


def list_category(category, full_unicode: bool = False):
    """
    Yield every code point that belongs to ``category``.

    Only the first 256 code points are scanned unless ``full_unicode`` is
    true, in which case the whole Unicode range is walked (slow).

    A non-empty entry in ``CATS`` is yielded instead of scanning. The cache is
    keyed by category only, so a cached entry is used whatever the value of
    ``full_unicode``.
    """
    if cached := CATS.get(category):
        yield from cached
        # Stop here: falling through would yield every code point twice.
        return
    limit = (sys.maxunicode + 1) if full_unicode else 256
    for data in range(limit):
        # Category.contains holds the membership rules; reuse it rather than
        # keeping a second copy of the same tests here.
        if category.contains(data):
            yield data
74 | 
75 | 
def covers_any(categories: Set[Category]) -> bool:
    """
    Tell whether the set holds a category together with its negation
    (e.g. DIGIT and NOT_DIGIT), which between them cover every character.
    """
    return any(
        cat.is_positive and cat.negate() in categories for cat in categories
    )
81 | 
82 | 
# Priming of the CATS cache is disabled: the lines below are commented out
# and still refer to sre_parse constants rather than Category members.
# CATS[sre_parse.CATEGORY_DIGIT] = list(list_category(sre_parse.CATEGORY_DIGIT))
# CATS[sre_parse.CATEGORY_SPACE] = list(list_category(sre_parse.CATEGORY_SPACE))
# CATS[sre_parse.CATEGORY_WORD] = list(list_category(sre_parse.CATEGORY_WORD))

# Sample character for each category; looked up by Category.example().
EXAMPLE_FOR_CAT = {
    Category.DIGIT: "4",
    Category.NOT_DIGIT: "!",
    Category.WORD: "w",
    Category.NOT_WORD: "$",
    Category.SPACE: " ",
    Category.NOT_SPACE: ".",
}
94 | 


--------------------------------------------------------------------------------
/regexploit/ast/char.py:
--------------------------------------------------------------------------------
  1 | import string
  2 | from dataclasses import dataclass
  3 | from typing import Optional, Set
  4 | 
  5 | from regexploit.ast.categories import Category, covers_any, list_category
  6 | from regexploit.ast.ranges import Range, lits_to_ranges
  7 | 
  8 | 
  9 | @dataclass(frozen=True)
 10 | class Character:
 11 |     literals: Optional[Set[int]] = None
 12 |     categories: Optional[Set[Category]] = None
 13 |     positive: bool = True
 14 | 
 15 |     @staticmethod
 16 |     def ANY() -> "Character":
 17 |         return Character()
 18 | 
 19 |     @staticmethod
 20 |     def LITERAL(literal: int) -> "Character":
 21 |         return Character({literal})
 22 | 
 23 |     @property
 24 |     def minimum_length(self) -> int:
 25 |         return 1
 26 | 
 27 |     @property
 28 |     def starriness(self) -> int:
 29 |         return 0
 30 | 
 31 |     def __hash__(self) -> int:
 32 |         return hash(
 33 |             (
 34 |                 self.positive,
 35 |                 tuple(sorted(self.literals)) if self.literals else None,
 36 |                 tuple(sorted(self.categories)) if self.categories else None,
 37 |             )
 38 |         )
 39 | 
 40 |     def exact_character_class(self) -> "Character":
 41 |         return self
 42 | 
 43 |     def overall_character_class(self) -> "Character":
 44 |         return self
 45 | 
 46 |     def maximal_character_class(self) -> "Character":
 47 |         return self
 48 | 
 49 |     @property
 50 |     def is_any(self) -> bool:
 51 |         return self.literals is None and self.categories is None and self.positive
 52 | 
 53 |     @property
 54 |     def _is_positive_literal(self) -> bool:
 55 |         return self.positive and self.literals is not None and self.categories is None
 56 | 
 57 |     @property
 58 |     def _is_negative_literal(self) -> bool:
 59 |         return (
 60 |             not self.positive and self.literals is not None and self.categories is None
 61 |         )
 62 | 
 63 |     @property
 64 |     def _is_positive_category(self) -> bool:
 65 |         return self.positive and self.literals is None and self.categories is not None
 66 | 
 67 |     @property
 68 |     def _is_negative_category(self) -> bool:
 69 |         return (
 70 |             not self.positive and self.literals is None and self.categories is not None
 71 |         )
 72 | 
 73 |     def expand_categories(self) -> "Character":
 74 |         """
 75 |         This is the nuclear option where we expand the categories into literals.
 76 |         Can be huge in unicode.
 77 |         """
 78 |         if self.categories:
 79 |             lits: Set[int] = set(self.literals) if self.literals else set()
 80 |             for c in self.categories:
 81 |                 lits.update(list_category(c))
 82 |             return Character(literals=lits, positive=self.positive)
 83 | 
 84 |         return self
 85 | 
 86 |     def __and__(self, other: "Optional[Character]") -> "Optional[Character]":
 87 |         if other is None:
 88 |             return None
 89 |         if self.is_any:
 90 |             return other
 91 |         if other.is_any:
 92 |             return self
 93 | 
 94 |         # [ab] & [bc] -> [c]
 95 |         if self._is_positive_literal and other._is_positive_literal:
 96 |             lits = self.literals & other.literals
 97 |             if not lits:
 98 |                 return None
 99 |             return Character(literals=lits)
100 |         if self._is_positive_category and other._is_positive_category:
101 |             cats = self.categories & other.categories
102 |             if not cats:
103 |                 return None
104 |             return Character(categories=cats)
105 |         # [^ab] & [^bc] -> [^abc]
106 |         if self._is_negative_literal and other._is_negative_literal:
107 |             return Character(literals=self.literals | other.literals, positive=False)
108 |         if self._is_negative_category and other._is_negative_category:
109 |             categories = self.categories | other.categories
110 |             if covers_any(categories):  # [^\d] & [^\D] = nothing
111 |                 return None
112 |             return Character(categories=categories, positive=False)
113 |         # [ab] & [^bc] -> [a]
114 |         if self._is_positive_literal and other._is_negative_literal:
115 |             lits = self.literals - other.literals
116 |             if not lits:
117 |                 return None
118 |             return Character(literals=lits)
119 |         if other._is_positive_literal and self._is_negative_literal:
120 |             lits = other.literals - self.literals
121 |             if not lits:
122 |                 return None
123 |             return Character(literals=lits)
124 | 
125 |         # TODO: be less lazy and sort out the general case without expanding everything if possible
126 |         return self.expand_categories() & other.expand_categories()
127 | 
128 |     def __rand__(self, other: "Optional[Character]") -> "Optional[Character]":
129 |         return self & other
130 | 
131 |     def __or__(self, other: "Optional[Character]") -> "Optional[Character]":
132 |         if other is None:
133 |             return self
134 |         if self.is_any or other.is_any:
135 |             return Character.ANY()
136 |         if self == other:
137 |             return self
138 |         if nor := (self.negate() & other.negate()):  # Slow, but logical
139 |             return nor.negate()
140 |         else:
141 |             return Character.ANY()
142 | 
143 |     def __ror__(self, other: "Optional[Character]") -> "Optional[Character]":
144 |         return self | other
145 | 
146 |     def __repr__(self) -> str:
147 |         if self.is_any:
148 |             return "."
149 |         result = "["
150 |         if not self.positive:
151 |             result += "^"
152 |         more = False
153 |         if self.literals is not None:
154 |             lits, ranges = lits_to_ranges(self.literals)
155 |             result += ",".join(literal_repr(o) for o in lits)
156 |             if lits and ranges:
157 |                 result += ","
158 |             result += ",".join(range_repr(r) for r in ranges)
159 |             more = True
160 |         if self.categories is not None:
161 |             if more:
162 |                 result += ";"
163 |             result += ",".join(c.name for c in self.categories)
164 |             more = True
165 |         return result + "]"
166 | 
167 |     def example(self) -> str:
168 |         for c in nice_characters():
169 |             if self.matches(c):
170 |                 return chr(c)
171 | 
172 |         if self.positive:
173 |             if self.literals:
174 |                 if len(self.literals) > 1:
175 |                     # Try to avoid \n due to false positives with the . character and flags
176 |                     return chr(next(o for o in self.literals if o != 0xA))
177 |                 return chr(next(iter(self.literals)))
178 |             elif self.categories:
179 |                 return sorted(self.categories, key=lambda c: 0 if c.is_positive else 1)[
180 |                     0
181 |                 ].example()
182 | 
183 |         raise NotImplementedError(self)
184 | 
185 |     def negate(self) -> "Optional[Character]":
186 |         if self.is_any:
187 |             return None
188 |         return Character(
189 |             literals=self.literals,
190 |             categories=self.categories,
191 |             positive=not self.positive,
192 |         )
193 | 
194 |     def contains(self, subgroup: "Character") -> bool:
195 |         if self.is_any:
196 |             return True
197 |         if subgroup.is_any:
198 |             return False
199 |         if subgroup == self:
200 |             return True
201 | 
202 |         if self._is_positive_literal and subgroup._is_positive_literal:
203 |             return not (subgroup.literals - self.literals)
204 |         if self._is_positive_category and subgroup._is_positive_category:
205 |             return not (subgroup.categories - self.categories)
206 | 
207 |         raise NotImplementedError  # Lazy, TODO: do full match
208 | 
209 |     def matches(self, literal: int) -> bool:
210 |         if self.is_any:
211 |             return True
212 |         if self.literals is not None and literal in self.literals:
213 |             return self.positive
214 |         if self.categories:
215 |             for cat in self.categories:
216 |                 if cat.contains(literal):
217 |                     return self.positive
218 |         return not self.positive
219 | 
220 | 
def nice_characters():
    """
    Yield the code points of ``string.printable`` minus its last five
    characters (the trailing whitespace other than the plain space).
    """
    yield from map(ord, string.printable[:-5])
224 | 
225 | 
def literal_repr(literal: int) -> str:
    """
    Render a code point for display: ASCII letters and digits as themselves,
    ASCII punctuation as ``hex:char``, anything else as bare hex.
    """
    char = chr(literal)
    alphanumeric = char in string.digits or char in string.ascii_letters
    if alphanumeric:
        return char
    if char not in string.punctuation:
        return f"{literal:02x}"
    return f"{literal:02x}:{char}"
233 | 
234 | 
def range_repr(r: Range) -> str:
    """Render a range as ``[low-high]`` using ``literal_repr`` for both bounds."""
    low = literal_repr(r.min_val)
    high = literal_repr(r.max_val)
    return "[{}-{}]".format(low, high)
237 | 


--------------------------------------------------------------------------------
/regexploit/ast/groupref.py:
--------------------------------------------------------------------------------
 1 | import regexploit.ast.repeat as repeat
 2 | from regexploit.ast.branch import Branch
 3 | from regexploit.ast.sequence import Sequence
 4 | 
 5 | 
 6 | def subpattern_to_groupref(subpattern):
 7 |     if subpattern is None:
 8 |         return None
 9 |     if subpattern.starriness == 0:
10 |         return subpattern
11 |     if isinstance(subpattern, repeat.FiniteRepeat):
12 |         return subpattern.alter_repeat(
13 |             subpattern_to_groupref(subpattern.repeat),
14 |         )
15 |     if isinstance(subpattern, repeat.InfiniteRepeat):
16 |         return repeat.FiniteRepeat(
17 |             subpattern_to_groupref(subpattern.repeat),
18 |             subpattern.minimum_repeats,
19 |             subpattern.minimum_repeats + 1,
20 |         )
21 |     if isinstance(subpattern, Branch):
22 |         return Branch(
23 |             [subpattern_to_groupref(b) for b in subpattern.branches],
24 |             subpattern.optional,
25 |         )
26 |     if isinstance(subpattern, Sequence):
27 |         return Sequence([subpattern_to_groupref(e) for e in subpattern.elements])
28 |     return subpattern
29 | 


--------------------------------------------------------------------------------
/regexploit/ast/ranges.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Iterator, List, Set, Tuple
 3 | 
 4 | 
@dataclass(frozen=True)
class Range:
    """A run of consecutive values, stored as its first and last value."""

    # First value of the run.
    min_val: int
    # Last value of the run (inclusive).
    max_val: int
 9 | 
10 | 
def lits_to_ranges(
    literals: Iterator[int],
) -> Tuple[Set[int], Set[Range]]:
    """
    Split literals into isolated values and runs of consecutive values.

    Runs of three or more consecutive values become a ``Range``; shorter runs
    are returned as individual literals. The same rule applies to every run,
    including the last one.

    :param literals: any iterable of integers (it is sorted here).
    :return: ``(lits, ranges)`` as two sets.
    """
    lits: Set[int] = set()
    ranges: Set[Range] = set()

    def flush(run: List[int]) -> None:
        # A run of one or two values reads better as plain literals.
        if len(run) < 3:
            lits.update(run)
        else:
            ranges.add(Range(run[0], run[-1]))

    buf: List[int] = []
    for lit in sorted(literals):
        if buf and buf[-1] != lit - 1:
            # Discontinuity: close the current run and start a new one.
            flush(buf)
            buf = [lit]
        else:
            buf.append(lit)

    # Close the final run with the same rule as the others.
    flush(buf)

    return lits, ranges
34 | 


--------------------------------------------------------------------------------
/regexploit/ast/repeat.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Any, Optional
 3 | 
 4 | from regexploit.ast.char import Character
 5 | 
 6 | 
 7 | @dataclass(frozen=True)
 8 | class Repeat:
 9 |     repeat: Any
10 |     minimum_repeats: int
11 | 
12 |     def example(self) -> str:
13 |         if self.minimum_repeats == 0:
14 |             return ""
15 |         return self.repeat.example() * self.minimum_repeats
16 | 
17 |     @property
18 |     def minimum_length(self) -> int:
19 |         return self.minimum_repeats * self.repeat.minimum_length
20 | 
21 |     @property
22 |     def starriness(self) -> int:
23 |         return self.repeat.starriness  # ? and {1,30} are not that starry
24 | 
25 |     def exact_character_class(self) -> Optional[Character]:
26 |         """
27 |         Repeated character e.g. [bc] for [bc]*, or [a] for (aaa)*
28 |         """
29 |         return self.repeat.exact_character_class()
30 | 
31 |     def overall_character_class(self) -> Optional[Character]:
32 |         """
33 |         (23)+ -> None, (22)* -> 2
34 |         """
35 |         return self.repeat.overall_character_class()
36 | 
37 |     def maximal_character_class(self) -> Character:
38 |         """
39 |         (23)+ -> [23], (22)* -> 2, (23*)* -> [23]
40 |         Useful for finding a way to kill a sequence like a(bc*)*$
41 |         """
42 |         return self.repeat.maximal_character_class()
43 | 
44 | 
@dataclass(frozen=True)
class InfiniteRepeat(Repeat):
    """Repeat without an upper bound on the number of repetitions."""

    # When not None, ``starriness`` returns this value instead of computing one.
    forced_starriness: Optional[int] = None

    @property
    def starriness(self) -> int:
        """The forced value when set, else 1 plus ten times the inner starriness."""
        if self.forced_starriness is not None:
            return self.forced_starriness
        # a*a*a* is cubic whereas (a*)* is exponential but here we just call it 10
        return 1 + self.repeat.starriness * 10

    def __repr__(self) -> str:
        return f"{self.repeat}{{{self.minimum_repeats}+}}"

    def alter_repeat(self, repeat) -> "InfiniteRepeat":
        """
        Return a new repeat of ``repeat`` with the same ``minimum_repeats``.

        Note that ``forced_starriness`` is not carried over to the copy.
        """
        return InfiniteRepeat(repeat, self.minimum_repeats)
61 | 
62 | 
@dataclass(frozen=True)
class FiniteRepeat(Repeat):
    """Repeat with an upper bound on the number of repetitions."""

    # Greatest number of repetitions.
    maximum_repeats: int

    def __repr__(self) -> str:
        return f"{self.repeat}{{{self.minimum_repeats},{self.maximum_repeats}}}"

    def alter_repeat(self, repeat) -> "FiniteRepeat":
        """Return a new repeat of ``repeat`` with the same bounds."""
        return FiniteRepeat(repeat, self.minimum_repeats, self.maximum_repeats)
72 | 


--------------------------------------------------------------------------------
/regexploit/ast/sequence.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from typing import List, Optional
  3 | 
  4 | from regexploit.ast.char import Character
  5 | from regexploit.ast.repeat import InfiniteRepeat
  6 | 
  7 | 
  8 | @dataclass(frozen=True)
  9 | class Sequence:
 10 |     elements: List
 11 | 
 12 |     @property
 13 |     def starriness(self):
 14 |         return sum(e.starriness for e in self.elements)
 15 | 
 16 |     def __len__(self):
 17 |         return len(self.elements)
 18 | 
 19 |     def example(self) -> str:
 20 |         return "".join(e.example() for e in self.elements)
 21 | 
 22 |     @property
 23 |     def minimum_length(self) -> int:
 24 |         accum: int = 0
 25 |         for e in self.elements:
 26 |             accum += e.minimum_length
 27 |         return accum
 28 | 
 29 |     def exact_character_class(self) -> Optional[Character]:
 30 |         """
 31 |         aa*a -> a, abc -> None, [ab][abc] -> None
 32 |         """
 33 |         first = self.elements[0].exact_character_class()
 34 |         if first is None:
 35 |             return None
 36 |         for c in self.elements[1:]:
 37 |             if c != first:
 38 |                 return None
 39 |         return c
 40 | 
 41 |     def overall_character_class(self) -> Optional[Character]:
 42 |         """
 43 |         aa*a -> a, abc -> None, [ab][abc] -> [ab]
 44 |         a?b -> b, a+b -> None, [ab]+b* -> b
 45 |         """
 46 |         c = Character.ANY()
 47 |         for e in self.elements:
 48 |             c &= e.overall_character_class()
 49 |             if not c:
 50 |                 return None
 51 |         return c
 52 | 
 53 |     def matching_repeats(self):
 54 |         """Complicated way to get the possible character classes for a sequence"""
 55 |         c = Character.ANY()
 56 |         has_mandatory = False
 57 |         optionals = []
 58 |         starriness = 0
 59 |         minimum_length = 0
 60 |         for e in self.elements:
 61 |             if e.minimum_length:
 62 |                 c &= e.overall_character_class()
 63 |                 if not c:
 64 |                     return None
 65 |                 has_mandatory = True
 66 |                 starriness += e.starriness
 67 |                 minimum_length += e.minimum_length
 68 |             elif e.starriness > 0:
 69 |                 optionals.append(e)
 70 |         possibilities = {c: starriness} if has_mandatory else {}
 71 |         for e in optionals:
 72 |             if new_c := e.overall_character_class() & c:
 73 |                 if new_c in possibilities:
 74 |                     possibilities[new_c] += e.starriness
 75 |                 else:
 76 |                     possibilities[new_c] = e.starriness
 77 | 
 78 |         if len(possibilities) > 1:
 79 |             # (a*[ab]*a*[bc]*[bcd]*.+a*)*@ has classes {.: 1, [a]: 5, [[a-b]]: 2, [[b-c]]: 3, [[b-d]]: 2, [b]: 3}
 80 |             # This could blow up!
 81 |             poss_chars = list(possibilities.items())
 82 |             merged_chars = {}
 83 |             while poss_chars:
 84 |                 c_a, s_a = poss_chars.pop()
 85 |                 for c_b, s_b in poss_chars:
 86 |                     if (merged := c_a & c_b) is not None:
 87 |                         if merged == c_a:
 88 |                             possibilities[c_a] += s_b
 89 |                         elif merged == c_b:
 90 |                             possibilities[c_b] += s_a
 91 |                         else:
 92 |                             if merged not in merged_chars:
 93 |                                 merged_chars[merged] = set()
 94 |                             merged_chars[merged] |= {(c_a, s_a), (c_b, s_b)}
 95 |             for merged, set_of_chars in merged_chars.items():
 96 |                 possibilities[merged] = sum(s for _, s in set_of_chars)
 97 | 
 98 |         for cc, s in possibilities.items():
 99 |             if s:
100 |                 yield InfiniteRepeat(cc, minimum_length, forced_starriness=s)
101 | 
102 |     def maximal_character_class(self) -> Character:
103 |         """
104 |         Only useful when this Sequence is inside a Repeat
105 |         a*b -> [ab], ab* -> [ab]
106 |         Since forcing backtracking for (bc*)$
107 |         """
108 |         c = None
109 |         for e in self.elements:
110 |             if (mcc := e.maximal_character_class()) is not None:
111 |                 c = mcc | c
112 |         return c
113 | 
114 |     def __repr__(self) -> str:
115 |         return "SEQ{ " + " ".join(str(e) for e in self.elements) + " }"
116 | 


--------------------------------------------------------------------------------
/regexploit/ast/sre.py:
--------------------------------------------------------------------------------
  1 | import sre_constants
  2 | import sre_parse
  3 | from typing import List, Optional, Set, Tuple, Union  # noqa: I100, I201
  4 | 
  5 | from regexploit.ast.at import EndOfString
  6 | from regexploit.ast.branch import Branch, make_branch
  7 | from regexploit.ast.categories import Category, covers_any
  8 | from regexploit.ast.char import Character
  9 | from regexploit.ast.groupref import subpattern_to_groupref
 10 | from regexploit.ast.repeat import FiniteRepeat, InfiniteRepeat
 11 | from regexploit.ast.sequence import Sequence
 12 | 
# Class of the opcode constants from sre_constants (a private name there).
SreConstant = sre_constants._NamedIntConstant
# Payload that accompanies an opcode.
SreOpData = Union[Tuple, List, int, SreConstant, None]
# One parsed operation: (opcode, payload).
SreOp = Tuple[SreConstant, SreOpData]
 16 | 
 17 | 
 18 | class SreOpParser:
 19 |     def __init__(self):
 20 |         self._groups = {}
 21 |         self.negative_lookahead: Optional[Character] = None
 22 | 
 23 |     def parse_sre(self, pattern: str, flags: int = 0):
 24 |         return self.sequence_or_singleton(sre_parse.parse(pattern, flags))
 25 | 
 26 |     def parse_op(self, op: SreConstant, data: SreOpData):
 27 |         return getattr(self, f"from_{op.name}")(data)
 28 | 
 29 |     def sequence_or_singleton(self, ops: List[SreOp]):
 30 |         elems = []
 31 |         for p in (self.parse_op(*op) for op in ops):
 32 |             if p is not None:
 33 |                 if isinstance(p, Sequence):
 34 |                     elems.extend(p.elements)
 35 |                 else:
 36 |                     elems.append(p)
 37 |         if len(elems) == 0:
 38 |             return None
 39 |         if len(elems) == 1:
 40 |             return elems[0]
 41 |         return Sequence(elems)
 42 | 
 43 |     def from_SUBPATTERN(self, data: Tuple[int, int, int, List[SreOp]]):
 44 |         ref = data[0]
 45 |         elements = data[3]
 46 |         result = self.sequence_or_singleton(elements)
 47 |         self._groups[ref] = result
 48 |         return result
 49 | 
 50 |     def from_MAX_REPEAT(
 51 |         self,
 52 |         data: Tuple[
 53 |             int,
 54 |             Union[int, SreConstant],
 55 |             List[SreOp],
 56 |         ],
 57 |     ) -> Union[FiniteRepeat, InfiniteRepeat, Branch, None]:
 58 |         minimum, maximum, elements = data
 59 |         infinite = maximum is sre_constants.MAXREPEAT
 60 |         # TODO support negative lookahead before repeat with minimum = 0
 61 |         negative_lookahead = self.use_negative_lookahead()
 62 |         repeatable = self.sequence_or_singleton(elements)
 63 |         if repeatable is None:
 64 |             return None
 65 |         if (
 66 |             minimum == 0
 67 |             and maximum == 1
 68 |             and repeatable.starriness
 69 |             and not repeatable.overall_character_class()
 70 |         ):
 71 |             # Interesting (starry) optional sequences as branches (ab*)? -> (ab*|)
 72 |             return make_branch([repeatable, None])
 73 |         if infinite:
 74 |             if (
 75 |                 negative_lookahead is not None
 76 |                 and minimum > 0
 77 |                 and isinstance(repeatable, Character)
 78 |             ):
 79 |                 return Sequence(
 80 |                     [
 81 |                         negative_lookahead & repeatable,
 82 |                         InfiniteRepeat(repeatable, minimum - 1),
 83 |                     ]
 84 |                 )
 85 |             return InfiniteRepeat(repeatable, minimum)
 86 |         if (
 87 |             negative_lookahead is not None
 88 |             and minimum > 0
 89 |             and maximum > 1
 90 |             and isinstance(repeatable, Character)
 91 |         ):
 92 |             return Sequence(
 93 |                 [
 94 |                     negative_lookahead & repeatable,
 95 |                     FiniteRepeat(repeatable, minimum - 1, maximum - 1),
 96 |                 ]
 97 |             )
 98 |         return FiniteRepeat(repeatable, minimum, maximum)
 99 | 
100 |     def from_MIN_REPEAT(self, data):
101 |         return self.from_MAX_REPEAT(data)
102 | 
103 |     def from_BRANCH(
104 |         self, data: Tuple[None, List[List[SreOp]]]
105 |     ) -> Union[Branch, FiniteRepeat, Character, None]:
106 |         # sre already transforms (a|b|c) -> [abc]
107 |         branches = data[1]
108 |         negative_lookahead = self.use_negative_lookahead()
109 |         processed_branches = []
110 |         for branch in branches:
111 |             self.negative_lookahead = negative_lookahead
112 |             processed_branches.append(self.sequence_or_singleton(branch))
113 |         self.negative_lookahead = None
114 |         return make_branch(processed_branches)
115 | 
116 |     def from_AT(self, at: SreConstant):
117 |         # TODO: handling for multiline
118 |         # TODO: handling for \\b
119 |         self.use_negative_lookahead()
120 |         if at is sre_constants.AT_END:
121 |             return EndOfString()
122 |         return None
123 | 
124 |     def from_ANY(self, _: None) -> Character:
125 |         if negative_lookahead := self.use_negative_lookahead():
126 |             return negative_lookahead
127 |         return Character.ANY()
128 | 
129 |     def from_LITERAL(self, literal: int) -> Character:
130 |         if negative_lookahead := self.use_negative_lookahead():
131 |             return Character.LITERAL(literal) & negative_lookahead
132 |         return Character.LITERAL(literal)
133 | 
134 |     def from_NOT_LITERAL(self, not_literal: int) -> Character:
135 |         if negative_lookahead := self.use_negative_lookahead():
136 |             return (
137 |                 Character(literals={not_literal}, positive=False) & negative_lookahead
138 |             )
139 |         return Character(literals={not_literal}, positive=False)
140 | 
141 |     def from_IN(self, data: List[SreOp]) -> Character:
142 |         literals: Optional[Set[int]] = None
143 |         categories: Optional[Set] = None
144 |         positive = True
145 |         if len(data) > 1 and data[0] == (sre_constants.NEGATE, None):
146 |             positive = False
147 |             data = data[1:]
148 |         for in_op, in_data in data:
149 |             if in_op is sre_constants.LITERAL:
150 |                 if literals is None:
151 |                     literals = set()
152 |                 literals.add(in_data)
153 |             elif in_op is sre_constants.RANGE:
154 |                 if literals is None:
155 |                     literals = set()
156 |                 min_val, max_val = in_data
157 |                 literals.update(range(min_val, max_val + 1))
158 |             elif in_op is sre_constants.CATEGORY:
159 |                 if categories is None:
160 |                     categories = set()
161 |                 categories.add(Category[in_data.name[9:]])
162 | 
163 |         if categories and covers_any(categories):
164 |             return self.from_ANY(None) if positive else None
165 |         if negative_lookahead := self.use_negative_lookahead():
166 |             return Character(literals, categories, positive) & negative_lookahead
167 |         return Character(literals, categories, positive)
168 | 
169 |     def from_GROUPREF(self, ref: int):
170 |         return subpattern_to_groupref(self._groups.get(ref))
171 | 
172 |     @staticmethod
173 |     def from_GROUPREF_EXISTS(_) -> None:
174 |         return None  # No intention to implement this properly
175 | 
176 |     @staticmethod
177 |     def from_ASSERT(_) -> None:
178 |         return None  # No intention to implement this properly
179 | 
180 |     def from_ASSERT_NOT(self, data) -> None:
181 |         typ, ops = data
182 |         if typ == 1:
183 |             if len(ops) == 1:
184 |                 character_op = ops[0]
185 |                 if character_op[0] in (
186 |                     sre_constants.LITERAL,
187 |                     sre_constants.NOT_LITERAL,
188 |                     sre_constants.IN,
189 |                 ):
190 |                     negative_lookahead = self.use_negative_lookahead()
191 |                     not_assertion = self.parse_op(*character_op)
192 |                     if not_assertion and (assertion := not_assertion.negate()):
193 |                         self.negative_lookahead = assertion
194 |                         if negative_lookahead is not None:
195 |                             self.negative_lookahead &= negative_lookahead
196 |                     else:
197 |                         self.negative_lookahead = negative_lookahead
198 | 
199 |         return None  # No intention to implement this fully
200 | 
201 |     def use_negative_lookahead(self) -> Optional[Character]:
202 |         if self.negative_lookahead is not None:
203 |             negative_lookahead = self.negative_lookahead
204 |             self.negative_lookahead = None
205 |             return negative_lookahead
206 | 


--------------------------------------------------------------------------------
/regexploit/bin/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/bin/__init__.py


--------------------------------------------------------------------------------
/regexploit/bin/files.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import os.path
 3 | from glob import iglob
 4 | from typing import List, Optional
 5 | 
 6 | 
 7 | def _file_generator(
 8 |     files_argument: List[str], is_glob: bool, filename_globs: List[str]
 9 | ):
10 |     if is_glob:
11 |         for fglob in files_argument:
12 |             yield from iglob(fglob, recursive=True)
13 |     else:
14 |         for f in files_argument:
15 |             if os.path.isdir(f):
16 |                 for g in filename_globs:
17 |                     yield from iglob(os.path.join(f, "**", g), recursive=True)
18 |             else:
19 |                 yield f
20 | 
21 | 
def file_generator(
    files_argument: List[str],
    is_glob: bool,
    filename_globs: List[str],
    ignore: Optional[List[str]] = None,
):
    """Yield the selected file paths, minus the ignored ones.

    A path is skipped when any string in ``ignore`` occurs anywhere inside it
    (plain substring match). With no ``ignore`` list every path is yielded.
    """
    candidates = _file_generator(files_argument, is_glob, filename_globs)
    if not ignore:
        yield from candidates
        return
    for path in candidates:
        if not any(fragment in path for fragment in ignore):
            yield path
36 | 


--------------------------------------------------------------------------------
/regexploit/bin/javascript/.eslintrc.yml:
--------------------------------------------------------------------------------
  1 | env:
  2 |   node: true
  3 |   commonjs: true
  4 |   es2021: true
  5 | extends: 'eslint:recommended'
  6 | parserOptions:
  7 |   ecmaVersion: 12
  8 | rules:
  9 |   accessor-pairs: error
 10 |   array-bracket-newline: error
 11 |   array-bracket-spacing: error
 12 |   array-callback-return: error
 13 |   array-element-newline: error
 14 |   arrow-body-style: error
 15 |   arrow-parens: error
 16 |   arrow-spacing: error
 17 |   block-scoped-var: error
 18 |   block-spacing: error
 19 |   brace-style:
 20 |     - error
 21 |     - 1tbs
 22 |   callback-return: error
 23 |   camelcase: error
 24 |   capitalized-comments:
 25 |     - error
 26 |     - never
 27 |   class-methods-use-this: error
 28 |   comma-dangle: 'off'
 29 |   comma-spacing:
 30 |     - error
 31 |     - after: true
 32 |       before: false
 33 |   comma-style:
 34 |     - error
 35 |     - last
 36 |   complexity: error
 37 |   computed-property-spacing:
 38 |     - error
 39 |     - never
 40 |   consistent-return: error
 41 |   consistent-this: error
 42 |   curly: error
 43 |   default-case: error
 44 |   default-case-last: error
 45 |   default-param-last: error
 46 |   dot-location: error
 47 |   dot-notation: error
 48 |   eol-last: error
 49 |   eqeqeq: 'off'
 50 |   func-call-spacing: error
 51 |   func-name-matching: error
 52 |   func-names: error
 53 |   func-style: error
 54 |   function-paren-newline: error
 55 |   generator-star-spacing: 'off'
 56 |   global-require: error
 57 |   grouped-accessor-pairs: error
 58 |   guard-for-in: error
 59 |   handle-callback-err: error
 60 |   id-blacklist: error
 61 |   id-denylist: error
 62 |   id-length: error
 63 |   id-match: error
 64 |   implicit-arrow-linebreak: error
 65 |   indent: 'off'
 66 |   indent-legacy: 'off'
 67 |   init-declarations: error
 68 |   jsx-quotes: error
 69 |   key-spacing: error
 70 |   keyword-spacing:
 71 |     - error
 72 |     - after: true
 73 |       before: true
 74 |   line-comment-position: error
 75 |   linebreak-style:
 76 |     - error
 77 |     - unix
 78 |   lines-around-comment: error
 79 |   lines-around-directive: error
 80 |   lines-between-class-members: error
 81 |   max-classes-per-file: error
 82 |   max-depth: error
 83 |   max-len: 'off'
 84 |   max-lines: error
 85 |   max-lines-per-function: error
 86 |   max-nested-callbacks: error
 87 |   max-params: error
 88 |   max-statements: 'off'
 89 |   max-statements-per-line: error
 90 |   multiline-comment-style: error
 91 |   new-cap: error
 92 |   new-parens: error
 93 |   newline-after-var: 'off'
 94 |   newline-before-return: 'off'
 95 |   newline-per-chained-call: error
 96 |   no-alert: error
 97 |   no-array-constructor: error
 98 |   no-await-in-loop: 'off'
 99 |   no-bitwise: error
100 |   no-buffer-constructor: error
101 |   no-caller: error
102 |   no-catch-shadow: error
103 |   no-confusing-arrow: error
104 |   no-console: 'off'
105 |   no-constructor-return: error
106 |   no-continue: error
107 |   no-div-regex: error
108 |   no-duplicate-imports: error
109 |   no-else-return: error
110 |   no-empty-function: error
111 |   no-eq-null: error
112 |   no-eval: error
113 |   no-extend-native: error
114 |   no-extra-bind: error
115 |   no-extra-label: error
116 |   no-extra-parens: 'off'
117 |   no-floating-decimal: error
118 |   no-implicit-coercion: error
119 |   no-implicit-globals: 'off'
120 |   no-implied-eval: error
121 |   no-inline-comments: error
122 |   no-invalid-this: error
123 |   no-iterator: error
124 |   no-label-var: error
125 |   no-labels: error
126 |   no-lone-blocks: error
127 |   no-lonely-if: error
128 |   no-loop-func: error
129 |   no-loss-of-precision: error
130 |   no-magic-numbers: 'off'
131 |   no-mixed-operators: error
132 |   no-mixed-requires: error
133 |   no-multi-assign: error
134 |   no-multi-spaces: error
135 |   no-multi-str: error
136 |   no-multiple-empty-lines: error
137 |   no-native-reassign: error
138 |   no-negated-condition: error
139 |   no-negated-in-lhs: error
140 |   no-nested-ternary: error
141 |   no-new: error
142 |   no-new-func: error
143 |   no-new-object: error
144 |   no-new-require: error
145 |   no-new-wrappers: error
146 |   no-nonoctal-decimal-escape: error
147 |   no-octal-escape: error
148 |   no-param-reassign: error
149 |   no-path-concat: error
150 |   no-plusplus: error
151 |   no-process-env: error
152 |   no-process-exit: error
153 |   no-promise-executor-return: error
154 |   no-proto: error
155 |   no-restricted-exports: error
156 |   no-restricted-globals: error
157 |   no-restricted-imports: error
158 |   no-restricted-modules: error
159 |   no-restricted-properties: error
160 |   no-restricted-syntax: error
161 |   no-return-assign:
162 |     - error
163 |     - except-parens
164 |   no-return-await: error
165 |   no-script-url: error
166 |   no-self-compare: error
167 |   no-sequences: error
168 |   no-shadow: error
169 |   no-spaced-func: error
170 |   no-sync: error
171 |   no-tabs: error
172 |   no-template-curly-in-string: error
173 |   no-ternary: 'off'
174 |   no-throw-literal: error
175 |   no-trailing-spaces: error
176 |   no-undef-init: error
177 |   no-undefined: error
178 |   no-underscore-dangle: error
179 |   no-unmodified-loop-condition: error
180 |   no-unneeded-ternary: error
181 |   no-unreachable-loop: error
182 |   no-unused-expressions: error
183 |   no-use-before-define: error
184 |   no-useless-backreference: error
185 |   no-useless-call: error
186 |   no-useless-computed-key: error
187 |   no-useless-concat: error
188 |   no-useless-constructor: error
189 |   no-useless-rename: error
190 |   no-useless-return: error
191 |   no-var: error
192 |   no-void: error
193 |   no-warning-comments: error
194 |   no-whitespace-before-property: error
195 |   nonblock-statement-body-position: error
196 |   object-curly-newline: error
197 |   object-curly-spacing:
198 |     - error
199 |     - always
200 |   object-shorthand: error
201 |   one-var: off
202 |   one-var-declaration-per-line: error
203 |   operator-assignment: error
204 |   operator-linebreak: error
205 |   padded-blocks: 'off'
206 |   padding-line-between-statements: error
207 |   prefer-arrow-callback: error
208 |   prefer-const: error
209 |   prefer-destructuring: error
210 |   prefer-exponentiation-operator: error
211 |   prefer-named-capture-group: error
212 |   prefer-numeric-literals: error
213 |   prefer-object-spread: error
214 |   prefer-promise-reject-errors: error
215 |   prefer-reflect: error
216 |   prefer-regex-literals: error
217 |   prefer-rest-params: error
218 |   prefer-spread: error
219 |   prefer-template: error
220 |   quote-props: 'off'
221 |   quotes:
222 |     - error
223 |     - single
224 |   radix: error
225 |   require-atomic-updates: error
226 |   require-await: error
227 |   require-jsdoc: error
228 |   require-unicode-regexp: error
229 |   rest-spread-spacing:
230 |     - error
231 |     - never
232 |   semi: 'off'
233 |   semi-spacing: error
234 |   semi-style:
235 |     - error
236 |     - last
237 |   sort-imports: error
238 |   sort-keys: 'off'
239 |   sort-vars: error
240 |   space-before-blocks: error
241 |   space-before-function-paren: 'off'
242 |   space-in-parens:
243 |     - error
244 |     - never
245 |   space-infix-ops: error
246 |   space-unary-ops:
247 |     - error
248 |     - nonwords: false
249 |       words: true
250 |   spaced-comment:
251 |     - error
252 |     - always
253 |   strict:
254 |     - error
255 |     - never
256 |   switch-colon-spacing: error
257 |   symbol-description: error
258 |   template-curly-spacing: error
259 |   template-tag-spacing: error
260 |   unicode-bom:
261 |     - error
262 |     - never
263 |   valid-jsdoc: error
264 |   vars-on-top: error
265 |   wrap-iife: error
266 |   wrap-regex: error
267 |   yield-star-spacing: error
268 |   yoda:
269 |     - error
270 |     - never
271 | 


--------------------------------------------------------------------------------
/regexploit/bin/javascript/cli.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs').promises;
 2 | const findRegex = require('./find');
 3 | 
 4 | module.exports = {
 5 |   async * parseFile(filename) {
 6 |     try {
 7 |       const code = await fs.readFile(filename)
 8 |       yield* this.parseCode(code, filename);
 9 |     } catch (error) {
10 |       yield JSON.stringify({ error, filename });
11 |     }
12 |   },
13 | 
14 |   * parseCode(code, filename) {
15 |     try {
16 |       for (const regex of findRegex.extractRegexesFromSource(code, filename)) {
17 |         yield JSON.stringify({
18 |           ...regex,
19 |           filename,
20 |         });
21 |       }
22 |     } catch (error) {
23 |       yield JSON.stringify({ error, filename });
24 |     }
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/regexploit/bin/javascript/find.js:
--------------------------------------------------------------------------------
 1 | const parser = require('@typescript-eslint/parser');
 2 | 
 3 | module.exports = {
 4 |     * extractRegexesFromSource(content, filename) {
 5 |         // options https://github.com/typescript-eslint/typescript-eslint/blob/master/packages/types/src/parser-options.ts
 6 |         const tree = parser.parse(content, {
 7 |             ecmaFeatures: {
 8 |                 jsx: true
 9 |             },
10 |             comment: false,
11 |             ecmaVersion: 2020,
12 |             errorOnTypeScriptSyntacticAndSemanticIssues: false,
13 |             errorOnUnknownASTType: false,
14 |             range: true,
15 |             loc: true,
16 |             filename,
17 |         });
18 |         yield* this.walkASTForRegexes(tree);
19 |     },
20 | 
21 |     * walkASTForRegexes(tree) {
22 |         if (!tree) {
23 |             return;
24 |         }
25 |         if (tree.regex) {
26 |             yield {
27 |                 'pattern': tree.regex.pattern,
28 |                 'flags': tree.regex.flags,
29 |                 'lineno': tree.loc.start.line,
30 |             }
31 |             return;
32 |         }
33 |         if (
34 |             (tree.type == 'NewExpression' || tree.type == 'CallExpression') &&
35 |             tree.callee && tree.callee.name == 'RegExp' && tree.arguments && tree.arguments[0].type == 'Literal'
36 |         ) {
37 |             yield {
38 |                 'pattern': tree.arguments[0].value,
39 |                 'flags': tree.arguments.length > 1 && tree.arguments[1].type == 'Literal' ? tree.arguments[1].value : '',
40 |                 'lineno': tree.loc.start.line,
41 |             }
42 |             return;
43 |         }
44 |         for (const element of Object.values(tree)) {
45 |             if (element && typeof element == 'object') {
46 |                 yield* this.walkASTForRegexes(element);
47 |             }
48 |         }
49 |     }
50 | }
51 | 


--------------------------------------------------------------------------------
/regexploit/bin/javascript/index.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | const cli = require('./cli'),
 3 |  readline = require('readline');
 4 | 
 5 | const args = process.argv.slice(2);
 6 | 
 7 | 
 8 | if (args.length == 1 && args[0] == '-') {
 9 |   process.stdin.setEncoding('utf-8');
10 |   let data = '';
11 |   readline.createInterface({ input: process.stdin }).
12 |     on('line', (line) => (data += line)).
13 |     on('close', () => {
14 |       for (const output of cli.parseCode(data)) {
15 |         console.log(output);
16 |       }
17 |     })
18 | } else {
19 |   (async () => {
20 |     for (const filename of args) {
21 |       for await (const output of cli.parseFile(filename)) {
22 |         console.log(output);
23 |       }
24 |     }
25 |   })()
26 | }
27 | 


--------------------------------------------------------------------------------
/regexploit/bin/javascript/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "extract-regexes-from-ast",
 3 |   "version": "0.0.1",
 4 |   "description": "Parse a javascript or typescript file and output most regular expressions",
 5 |   "main": "index.js",
 6 |   "engines" : {
 7 |     "node" : ">=12"
 8 |   },
 9 |   "scripts": {
10 |     "lint": "eslint *.js",
11 |     "test": "mocha"
12 |   },
13 |   "author": "",
14 |   "dependencies": {
15 |     "@typescript-eslint/parser": "^4.0.0",
16 |     "typescript": "^4.0.0"
17 |   },
18 |   "devDependencies": {
19 |     "better-assert": "^1.0.0",
20 |     "eslint": "^7.0.0",
21 |     "mocha": "^8.0.0"
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/regexploit/bin/javascript/test/test.js:
--------------------------------------------------------------------------------
 1 | const assert = require('better-assert');
 2 | const cli = require('../cli');
 3 | const findRegex = require('../find');
 4 | 
 5 | describe('findRegex', function() {
 6 |   describe('extractRegexesFromSource()', function() {
 7 |     it('should be able to return nothing', function() {
 8 |       assert([...findRegex.extractRegexesFromSource('abc')].length == 0);
 9 |     });
10 |     it('should find a literal regex', function() {
11 |       let found = [...findRegex.extractRegexesFromSource('const a = /ab+c/g')];
12 |       assert(found.length == 1);
13 |       assert(found[0].pattern == 'ab+c');
14 |       assert(found[0].flags == 'g');
15 |       assert(found[0].lineno == 1);
16 |     });
17 |     it('should find the RegExp constructor', function() {
18 |       let found = [...findRegex.extractRegexesFromSource('const a = [\nnew RegExp("one"),\nRegExp("two", "flags")]')];
19 |       assert(found.length == 2);
20 |       assert(found[0].pattern == 'one');
21 |       assert(found[0].flags == '');
22 |       assert(found[0].lineno == 2);
23 |       assert(found[1].pattern == 'two');
24 |       assert(found[1].flags == 'flags');
25 |       assert(found[1].lineno == 3);
26 |     });
27 |     var burriedTests = [
28 |       "var a = {b: /abc/}",
29 |       "function x() { return function* () { yield /abc/ } }",
30 |       "function x(y = /abc/) { return y; }",
31 |       "a ? /abc/ : null",
32 |       "if(/abc/){}", // a bit stupid
33 |       "[12, abc, /abc/, ...ghi]",
34 |       "for (const a of x.match(/abc/)) {}"
35 |     ]
36 |     burriedTests.forEach(function (code) {
37 |       it('should find burried regex ' + code, function() {
38 |         let found = [...findRegex.extractRegexesFromSource(code)];
39 |         assert(found.length == 1);
40 |         assert(found[0].pattern == 'abc');
41 |         assert(found[0].flags == '');
42 |       });
43 |     });
44 |   });
45 | });
46 | 
describe("cli", () => {
  describe("parseCode()", () => {
    it('should find a literal regex', () => {
      const lines = [...cli.parseCode('/a(((b)+c))/im', 'fname')];
      assert(lines.length == 1);
      // Each yielded item is a JSON string describing one regex.
      const record = JSON.parse(lines[0]);
      assert(record.pattern == 'a(((b)+c))');
      assert(record.flags == 'im');
      assert(record.lineno == '1');
      assert(record.filename == 'fname');
      assert(!record.error);
    });

    it('should return errors if necessary', () => {
      const lines = [...cli.parseCode('/!#~')];
      assert(lines.length == 1);
      // Unparseable input is reported as an error record, not a pattern.
      assert(JSON.parse(lines[0]).error);
      assert(!JSON.parse(lines[0]).pattern);
    });
  });
});
67 | 


--------------------------------------------------------------------------------
/regexploit/bin/regexploit-python-env:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import regexploit.hook
 3 | 
 4 | regexploit.hook.install()
 5 | 
 6 | import importlib
 7 | import pkgutil
 8 | import sys
 9 | 
10 | from regexploit.ast.sre import SreOpParser
11 | from regexploit.redos import find
12 | from regexploit.output.text import TextOutput
13 | 
14 | # Load python modules and process regexes which are compiled on import by hooking re.compile
15 | 
16 | 
def main():
    """Import installed modules and scan the regexes they compile on import.

    Command-line arguments, if given, are module-name prefixes: only modules
    whose name starts with one of them are imported. With no arguments every
    module found on ``sys.path`` is imported, apart from a deny-list of
    disruptive or irrelevant ones. Findings with starriness above 2 are
    recorded together with the traceback location captured by the hook.
    """

    def onerror(name):
        print("Cannot load", name)

    names = tuple(sys.argv[1:]) or None
    sys.argv = sys.argv[:1]
    if names:
        regexploit.hook.regexes.clear()

    def should_import(module_name):
        if names:
            return module_name.startswith(names)
        # Importing some modules is disruptive https://xkcd.com/353/
        return (
            module_name not in ("antigravity", "rstpep2html", "setup")
            and not module_name.startswith(
                ("test", "pip", "setuptools", "idlelib", "rst2")
            )
            and not module_name.endswith(("__main__", ".main", ".conftest"))
            and ".test" not in module_name
        )

    output = TextOutput()
    for p in pkgutil.walk_packages(sys.path, onerror=onerror):
        if not should_import(p.name):
            continue
        print(f"Importing {p.name}")
        try:
            importlib.import_module(p.name)
            for hooked_regex in regexploit.hook.get_and_clear_regexes():
                output.next()
                parsed = SreOpParser().parse_sre(
                    hooked_regex.pattern, hooked_regex.flags
                )
                for redos in find(parsed):
                    if redos.starriness <= 2:
                        continue
                    output.record(
                        redos,
                        hooked_regex.pattern,
                        filename=hooked_regex.last_tb.filename,
                        lineno=hooked_regex.last_tb.lineno,
                        context=hooked_regex.last_tb.line,
                    )

        except Exception as e:
            print("Cannot load", p, e)
    print(f"Processed {output.regexes} regexes")
58 | 
59 | 
60 | if __name__ == "__main__":
61 |     main()
62 | 


--------------------------------------------------------------------------------
/regexploit/bin/regexploit.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import argparse
  3 | import fileinput
  4 | import logging
  5 | import re
  6 | import sys
  7 | import traceback
  8 | 
  9 | from regexploit.ast.sre import SreOpParser
 10 | from regexploit.languages.javascript import fix_js_regex
 11 | from regexploit.output.text import TextOutput
 12 | from regexploit.redos import find
 13 | 
 14 | 
 15 | def find_redos(pattern: str, flags: int, output: TextOutput, parser):
 16 |     try:
 17 |         parsed = parser(pattern, flags)
 18 |     except Exception as e:
 19 |         print(f"Error parsing: {pattern}", e)
 20 |         return
 21 |     output.next()
 22 |     for redos in find(parsed):
 23 |         if redos.starriness > 2:
 24 |             output.record(redos, pattern)
 25 |             yield redos
 26 | 
 27 | 
 28 | def python(pattern: str, flags: int):
 29 |     return SreOpParser().parse_sre(pattern, flags)
 30 | 
 31 | 
 32 | def javascript(pattern: str, flags: int):
 33 |     try:
 34 |         return SreOpParser().parse_sre(pattern)
 35 |     except:
 36 |         try:
 37 |             fixed = fix_js_regex(pattern)
 38 |             re.compile(fixed)
 39 |         except:
 40 |             raise
 41 | 
 42 |         try:
 43 |             return SreOpParser().parse_sre(fixed)
 44 |         except:
 45 |             print(traceback.format_exc())
 46 |             raise
 47 | 
 48 | 
 49 | def main():
 50 |     parser = argparse.ArgumentParser(
 51 |         description="Parse regexes from stdin and scan them for ReDoS"
 52 |     )
 53 |     parser.add_argument(
 54 |         "-f",
 55 |         "--flavour",
 56 |         "--flavor",
 57 |         choices=["python", "js"],
 58 |         default="python",
 59 |         help="Regex language",
 60 |     )
 61 |     parser.add_argument(
 62 |         "-v", "--verbose", action="count", default=0, help="Verbose logging"
 63 |     )
 64 |     parser.add_argument(
 65 |         "-u",
 66 |         "--unescape",
 67 |         action="store_true",
 68 |         help="Unescape the regular expressions before parsing them (e.g. double backslashes)",
 69 |     )
 70 |     args = parser.parse_args()
 71 |     sys.argv = sys.argv[:1]
 72 |     if args.verbose == 1:
 73 |         logging.basicConfig(level=logging.INFO)
 74 |     elif args.verbose > 1:
 75 |         logging.basicConfig(level=logging.DEBUG)
 76 | 
 77 |     isatty = sys.stdin.isatty()
 78 |     if isatty:
 79 |         print("Welcome to Regexploit. Enter your regexes:")
 80 |     output = TextOutput()
 81 |     try:
 82 |         for line in fileinput.input():
 83 |             found = False
 84 |             line = line.rstrip("\n")
 85 |             if args.unescape:
 86 |                 # \\d -> \d
 87 |                 line = line.encode().decode("unicode_escape")
 88 |             for _ in find_redos(
 89 |                 line, 0, output, javascript if args.flavour == "js" else python
 90 |             ):
 91 |                 found = True
 92 |             if isatty and not found:
 93 |                 print("No ReDoS found.")
 94 |     except KeyboardInterrupt:
 95 |         pass
 96 | 
 97 | 
 98 | if __name__ == "__main__":
 99 |     main()
100 | 


--------------------------------------------------------------------------------
/regexploit/bin/regexploit_csharp.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import argparse
 3 | import logging
 4 | import re
 5 | import traceback
 6 | import warnings
 7 | 
 8 | from regexploit.ast.sre import SreOpParser
 9 | from regexploit.bin.files import file_generator
10 | from regexploit.languages.csharp_string_extractor import find_regexes
11 | from regexploit.languages.javascript import fix_js_regex
12 | from regexploit.output.text import TextOutput
13 | from regexploit.redos import find
14 | 
15 | 
def handle_file(filename: str, output: TextOutput):
    """Scan one C# file and record the ReDoS findings on ``output``.

    Each string found by ``find_regexes`` is skipped when it is too short or
    has too few repetition operators; otherwise it is parsed (with a
    ``fix_js_regex`` rewrite as a second attempt). Findings with starriness
    above 2 are recorded with the source line as context when available.
    Parse and analysis errors are printed and the scan moves on to the next
    candidate.
    """
    with open(filename, "rb") as f:
        code = f.read()
    # Split once per file instead of once per finding.
    source_lines = code.splitlines()
    for regex in find_regexes(code):
        pattern = regex.pattern
        if len(pattern) < 5:
            continue  # (.+)+
        if pattern.count("*") + pattern.count("+") + pattern.count(",}") < 2:
            continue  # no ReDoS possible
        # The handlers below catch Exception rather than everything, so that
        # Ctrl-C stops the scan instead of being reported as a parse error.
        try:
            logging.debug("%s#%s: %s", filename, regex.lineno, pattern)
            parsed = SreOpParser().parse_sre(pattern, regex.flags)
        except Exception:
            try:
                fixed = fix_js_regex(pattern)
                re.compile(fixed, regex.flags)
            except Exception:
                # Only complain about strings known to be regexes; the rest
                # are ordinary strings that merely looked like one.
                if regex.definitely_regex:
                    print(
                        f"Error parsing: {pattern} from {filename} line {regex.lineno}\n"
                    )
                continue
            try:
                parsed = SreOpParser().parse_sre(fixed, regex.flags)
            except Exception:
                print(f"Error in regexploit parsing: {pattern} from {filename}")
                print(traceback.format_exc())
                continue
        try:
            output.next()
            for redos in find(parsed):
                if redos.starriness > 2:
                    context = None
                    try:
                        context = source_lines[regex.lineno - 1].decode().strip()
                    except (IndexError, UnicodeDecodeError):
                        # Context is optional: still report the finding.
                        pass
                    output.record(
                        redos,
                        pattern,
                        filename=filename,
                        lineno=regex.lineno,
                        context=context,
                    )
        except Exception:
            print(f"Error finding ReDoS: {pattern} from {filename} #{regex.lineno}")
            print(traceback.format_exc())
63 | 
64 | 
def main():
    """Command-line entry point: scan C# files for ReDoS-prone regexes.

    Positional arguments are files, directories (searched for ``*.cs``) or,
    with ``--glob``, glob patterns. ``--ignore`` may be repeated to skip
    paths containing a string, and ``--verbose`` enables debug logging.
    """
    with warnings.catch_warnings():
        # Some csharp/js regexes are weird
        warnings.simplefilter("ignore", category=FutureWarning)

        cli = argparse.ArgumentParser(
            description="Parse regexes out of C# files and scan them for ReDoS"
        )
        cli.add_argument("files", nargs="+", help="C# files")
        cli.add_argument(
            "--glob", action="store_true", help="Glob the input filenames (**/*)"
        )
        cli.add_argument("--verbose", action="store_true", help="Verbose logging")
        cli.add_argument(
            "--ignore", action="append", help="Paths containing this string are ignored"
        )
        options = cli.parse_args()

        if options.verbose:
            logging.basicConfig(level=logging.DEBUG)

        output = TextOutput(js_flavour=True)
        for filename in file_generator(
            options.files, options.glob, ["*.cs"], options.ignore
        ):
            logging.debug(filename)
            handle_file(filename, output)
        print(f"Processed {output.regexes} regexes")
92 | 
93 | 
94 | if __name__ == "__main__":
95 |     main()
96 | 


--------------------------------------------------------------------------------
/regexploit/bin/regexploit_js.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import argparse
  3 | import io
  4 | import json
  5 | import logging
  6 | import os.path
  7 | import re
  8 | import subprocess
  9 | import sys
 10 | import traceback
 11 | import warnings
 12 | 
 13 | from regexploit.ast.sre import SreOpParser
 14 | from regexploit.bin.files import file_generator
 15 | from regexploit.languages.javascript import fix_js_regex
 16 | from regexploit.output.text import TextOutput
 17 | from regexploit.redos import find
 18 | 
 19 | 
 20 | def handle_line_from_node(line: str, output: TextOutput):
 21 |     regex = json.loads(line)
 22 |     if pattern := regex.get("pattern"):
 23 |         if (pattern_len := len(pattern)) < 5:
 24 |             return  # (.+)+
 25 |         if pattern_len == 8059 and pattern.startswith("\\u{1F3F4}(?:\\u{E0067"):
 26 |             return  # annoying emoji regex
 27 |         if pattern.count("*") + pattern.count("+") + pattern.count(",}") < 2:
 28 |             return  # no ReDoS possible
 29 |         filename = regex["filename"]
 30 |         lineno = regex["lineno"]
 31 |         try:
 32 |             logging.debug("%s#%s: %s", filename, lineno, pattern)
 33 |             parsed = SreOpParser().parse_sre(pattern)
 34 |         except:
 35 |             try:
 36 |                 fixed = fix_js_regex(pattern)
 37 |                 re.compile(fixed)
 38 |             except:
 39 |                 print(f"Error parsing: {pattern} from {filename}\n")
 40 |                 return
 41 |             try:
 42 |                 parsed = SreOpParser().parse_sre(fixed)
 43 |             except:
 44 |                 print(f"Error in regexploit parsing: {pattern} from {filename}")
 45 |                 print(traceback.format_exc())
 46 |                 return
 47 |         output.next()
 48 |         try:
 49 |             for redos in find(parsed):
 50 |                 if redos.starriness > 2:
 51 |                     output.record(redos, pattern, filename=filename, lineno=lineno)
 52 |         except Exception:
 53 |             print(f"Error finding ReDoS: {pattern} from {filename}")
 54 |             print(traceback.format_exc())
 55 |     elif error := regex.get("error"):
 56 |         print("ERR", error, regex.get("filename"))
 57 | 
 58 | 
 59 | def process_files(filenames, nodejs_executable, output):
 60 |     args = [
 61 |         os.path.join(os.path.split(__file__)[0], "javascript", "index.js"),
 62 |         *filenames,
 63 |     ]
 64 |     if nodejs_executable:
 65 |         args = [nodejs_executable] + args
 66 |     logging.debug("Processing batch: %s", args[2:])
 67 |     node = subprocess.Popen(args, stdout=subprocess.PIPE)
 68 |     for line in io.TextIOWrapper(node.stdout, encoding="utf-8"):
 69 |         handle_line_from_node(line, output)
 70 |     rc = node.poll()
 71 |     return rc
 72 | 
 73 | 
 74 | def main():
 75 |     if not os.path.isdir(
 76 |         os.path.join(os.path.split(__file__)[0], "javascript", "node_modules")
 77 |     ):
 78 |         path = os.path.join(os.path.split(__file__)[0], "javascript")
 79 |         print("The JavaScript & TypeScript parsers require some node modules.\n")
 80 |         print(f"Run (cd {path}; npm install)")
 81 |         sys.exit(1)
 82 |     with warnings.catch_warnings():
 83 |         warnings.simplefilter(
 84 |             "ignore", category=FutureWarning
 85 |         )  # Some js regexes are weird
 86 |         parser = argparse.ArgumentParser(
 87 |             description="Parse regexes out of javascript files and scan them for ReDoS"
 88 |         )
 89 |         parser.add_argument("files", nargs="+", help="Javascript or typescript files")
 90 |         parser.add_argument(
 91 |             "--node",
 92 |             help="Location of nodejs executable (rather than using node from PATH)",
 93 |         )
 94 |         parser.add_argument(
 95 |             "--glob", action="store_true", help="Glob the input filenames (**/*)"
 96 |         )
 97 |         parser.add_argument("--verbose", action="store_true", help="Verbose logging")
 98 |         parser.add_argument(
 99 |             "--ignore", action="append", help="Paths containing this string are ignored"
100 |         )
101 |         args = parser.parse_args()
102 | 
103 |         if args.verbose:
104 |             logging.basicConfig(level=logging.DEBUG)
105 | 
106 |         output = TextOutput(js_flavour=True)
107 |         files = file_generator(args.files, args.glob, ["*.js", "*.ts"], args.ignore)
108 |         while True:
109 |             batch = []
110 |             for _ in range(50):
111 |                 try:
112 |                     batch.append(next(files))
113 |                 except StopIteration:
114 |                     if batch:
115 |                         process_files(batch, args.node, output)
116 |                     return
117 |             process_files(batch, args.node, output)
118 |         print(f"Processed {output.regexes} regexes")
119 | 
120 | 
# Allow running this file directly as a script.
if __name__ == "__main__":
    main()
123 | 


--------------------------------------------------------------------------------
/regexploit/bin/regexploit_python_ast.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import argparse
 3 | import ast
 4 | import logging
 5 | import re
 6 | import traceback
 7 | import warnings
 8 | 
 9 | from regexploit.ast.sre import SreOpParser
10 | from regexploit.bin.files import file_generator
11 | from regexploit.languages.python_node_visitor import PythonNodeVisitor
12 | from regexploit.output.text import TextOutput
13 | from regexploit.redos import find
14 | 
15 | 
def handle_file(filename: str, output: TextOutput):
    """
    Extract candidate regexes from one Python file and report any ReDoS.

    The file is read as bytes and parsed with `ast`; files which cannot be
    parsed are reported and skipped. Strings which are not valid regexes are
    silently ignored.

    :param filename: Path of the Python source file.
    :param output: Collector which counts regexes and prints findings.
    """
    with open(filename, "rb") as f:
        code = f.read()
    try:
        code_ast = ast.parse(code)
        pnv = PythonNodeVisitor()
        pnv.visit(code_ast)
    except RecursionError:
        print(f"RecursionError parsing AST for {filename}")
        return
    except (SyntaxError, ValueError) as e:
        # ValueError: source containing null bytes on Python < 3.12
        print(f"Bad Python3 syntax in {filename}: {e}")
        return
    source_lines = None  # Split lazily and only once: most files have no findings
    for regex in pnv.patterns:
        try:
            parsed = SreOpParser().parse_sre(regex.pattern, regex.flags)
        except re.error:
            continue  # We will have many strings which aren't actually regexes
        try:
            output.next()
            for redos in find(parsed):
                if redos.starriness > 2:
                    if source_lines is None:
                        source_lines = code.splitlines()
                    context = None
                    try:
                        context = source_lines[regex.lineno - 1].decode().strip()
                    except UnicodeDecodeError:
                        pass  # Report the finding without a context line
                    output.record(
                        redos,
                        regex.pattern,
                        filename=filename,
                        lineno=regex.lineno,
                        context=context,
                    )
        except Exception:
            print(
                f"Error finding ReDoS: {regex.pattern} from {filename} #{regex.lineno}"
            )
            print(traceback.format_exc())
55 | 
56 | 
def main():
    """
    Command line entry point: scan Python files for regexes vulnerable to ReDoS.

    Parses the command line, walks the selected files and prints a summary
    line with the number of regexes which were processed.
    """
    with warnings.catch_warnings():
        # Some weird regexes emit warnings
        for noisy_category in (FutureWarning, DeprecationWarning):
            warnings.simplefilter("ignore", category=noisy_category)

        cli = argparse.ArgumentParser(
            description="Parse regexes out of python files and scan them for ReDoS"
        )
        cli.add_argument("files", nargs="+", help="Python files or directories")
        optional_flags = (
            ("--glob", "store_true", "Glob the input filenames (**/*)"),
            ("--verbose", "store_true", "Verbose logging"),
            ("--ignore", "append", "Paths containing this string are ignored"),
        )
        for flag, action, help_text in optional_flags:
            cli.add_argument(flag, action=action, help=help_text)
        options = cli.parse_args()

        if options.verbose:
            logging.basicConfig(level=logging.DEBUG)

        paths = file_generator(options.files, options.glob, ["*.py"], options.ignore)
        report = TextOutput()
        for path in paths:
            logging.debug(path)
            handle_file(path, report)
        print(f"Processed {report.regexes} regexes")
84 | 
85 | 
# Allow running this file directly as a script.
if __name__ == "__main__":
    main()
88 | 


--------------------------------------------------------------------------------
/regexploit/bin/regexploit_yaml.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import argparse
  3 | import json
  4 | import logging
  5 | import re
  6 | import traceback
  7 | import warnings
  8 | 
  9 | from regexploit.ast.sre import SreOpParser
 10 | from regexploit.bin.files import file_generator
 11 | from regexploit.output.text import TextOutput
 12 | from regexploit.redos import find
 13 | 
 14 | 
 15 | def get_json(filename: str):
 16 |     with open(filename, "rb") as f:
 17 |         try:
 18 |             return json.load(f)
 19 |         except json.decoder.JSONDecodeError:
 20 |             print(f"Error parsing JSON from {filename}")
 21 |             return
 22 | 
 23 | 
 24 | def handle_file(yamljson, filename: str, output: TextOutput):
 25 |     if isinstance(yamljson, (list, dict)):
 26 |         YamlJsonWalker(filename, output).handle(yamljson)
 27 | 
 28 | 
 29 | class YamlJsonWalker:
 30 |     def __init__(self, filename: str, output: TextOutput):
 31 |         self.filename = filename
 32 |         self.output = output
 33 | 
 34 |     def handle(self, elem):
 35 |         if isinstance(elem, str) and len(elem) > 5:
 36 |             try:
 37 |                 parsed = SreOpParser().parse_sre(elem)
 38 |             except re.error:
 39 |                 return  # We will have many strings which aren't actually regexes
 40 |             try:
 41 |                 self.output.next()
 42 |                 for redos in find(parsed):
 43 |                     if redos.starriness > 2:
 44 |                         self.output.record(
 45 |                             redos,
 46 |                             elem,
 47 |                             filename=self.filename,
 48 |                         )
 49 |             except Exception:
 50 |                 print(f"Error finding ReDoS: {elem} from {self.filename}")
 51 |                 print(traceback.format_exc())
 52 |         elif isinstance(elem, list):
 53 |             for _elem in elem:
 54 |                 self.handle(_elem)
 55 |         elif isinstance(elem, dict):
 56 |             for _elem in elem.values():
 57 |                 self.handle(_elem)
 58 | 
 59 | 
 60 | def main(get_object=get_json):
 61 |     with warnings.catch_warnings():
 62 |         # Some weird regexes emit warnings
 63 |         warnings.simplefilter("ignore", category=FutureWarning)
 64 |         warnings.simplefilter("ignore", category=DeprecationWarning)
 65 |         parser = argparse.ArgumentParser(
 66 |             description="Parse regexes out of YAML files (strings, lists and dictionary values) and scan them for ReDoS"
 67 |         )
 68 |         parser.add_argument("files", nargs="+", help="YAML files")
 69 |         parser.add_argument(
 70 |             "--glob", action="store_true", help="Glob the input filenames (**/*)"
 71 |         )
 72 |         parser.add_argument("--verbose", action="store_true", help="Verbose logging")
 73 |         parser.add_argument(
 74 |             "--ignore", action="append", help="Paths containing this string are ignored"
 75 |         )
 76 |         args = parser.parse_args()
 77 | 
 78 |         if args.verbose:
 79 |             logging.basicConfig(level=logging.DEBUG)
 80 | 
 81 |         files = file_generator(
 82 |             args.files,
 83 |             args.glob,
 84 |             ["*.json"] if get_object is get_json else ["*.yaml", "*.yml", "*.json"],
 85 |             args.ignore,
 86 |         )
 87 |         output = TextOutput()
 88 |         for filename in files:
 89 |             logging.debug(filename)
 90 |             handle_file(get_object(filename), filename, output)
 91 |         print(f"Processed {output.regexes} regexes")
 92 | 
 93 | 
 94 | def main_yaml():
 95 |     try:
 96 |         from yaml import safe_load, YAMLError
 97 | 
 98 |         def get_yaml(filename: str):
 99 |             with open(filename, "rb") as f:
100 |                 try:
101 |                     return safe_load(f.read())
102 |                 except YAMLError:
103 |                     print(f"Error parsing YAML from {filename}")
104 |                     return
105 | 
106 |         main(get_object=get_yaml)
107 |     except ImportError:
108 |         print(
109 |             "Pyyaml extra required: Install regexploit with 'pip install regexploit[yaml]' or run 'pip install pyyaml'"
110 |         )
111 |         raise
112 | 
113 | 
# Running this file directly uses main()'s default loader (get_json), i.e. it
# scans JSON files; YAML scanning is started through main_yaml().
if __name__ == "__main__":
    main()
116 | 


--------------------------------------------------------------------------------
/regexploit/found_regex.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | 
@dataclass(frozen=True)
class FoundRegex:
    """A string extracted from source code which is, or may be, a regular expression."""

    # Line number in the source file at which the string was found.
    lineno: int
    # Text of the (candidate) regular expression.
    pattern: str
    # Bitmask of `re` module flags (e.g. re.I | re.X); 0 when none are known.
    flags: int
    # True when the string was seen being passed to a regex API, False when it
    # is merely a string literal which might be a regex.
    definitely_regex: bool
10 | 


--------------------------------------------------------------------------------
/regexploit/hook.py:
--------------------------------------------------------------------------------
 1 | # The module records any regexes used by python code for later inspection.
 2 | # Import this hook and install() before loading other modules or start python with `python -i /path/to/hook.py`
 3 | import re
 4 | import traceback
 5 | 
 6 | # By default, the re and traceback modules will not be hooked
 7 | 
# CompiledRegex records collected by the hooked re.compile; drained by
# get_and_clear_regexes().
regexes = set()
 9 | 
10 | 
class CompiledRegex:
    """
    Record of one `re.compile` call seen by the hook.

    Two records are equal (and hash alike) when their pattern and flags match;
    the traceback is ignored, so a set keeps one record per distinct regex.
    """

    def __init__(self, pattern, flags, traceback):
        self.pattern = pattern
        self.flags = flags
        # Stack summary of the caller, as given to the constructor.
        self.traceback = traceback

    def __eq__(self, other) -> bool:
        # Must agree with __hash__, otherwise sets never de-duplicate records.
        if not isinstance(other, CompiledRegex):
            return NotImplemented
        return (self.pattern, self.flags) == (other.pattern, other.flags)

    def __hash__(self) -> int:
        # ignore the traceback for now
        return hash((self.pattern, self.flags))

    def __repr__(self) -> str:
        return f"({self.pattern} at {self.last_tb.filename})"

    @property
    def last_tb(self):
        """The last entry of the recorded traceback."""
        return self.traceback[-1]
27 | 
28 | 
class WrappedRegex:
    """
    Stand-in for a compiled regex which logs every matching call.

    The matching methods print the pattern, the call and a slice of the
    current stack before delegating. Every other attribute (`pattern`,
    `flags`, `groups`, `groupindex`, `scanner`, ...) is forwarded to the
    wrapped regex so that hooked code reading them keeps working.
    """

    def __init__(self, regex):
        self.regex = regex

    def __getattr__(self, name):
        # Only reached for attributes not found on the wrapper itself.
        # Refusing `regex` and dunder names avoids infinite recursion on a
        # half-initialised instance and keeps copy/pickle protocols from being
        # answered by the wrapped object.
        if name == "regex" or name.startswith("__"):
            raise AttributeError(name)
        return getattr(self.regex, name)

    def run_and_log(self, method, args, kwargs):
        """Print the pattern, the call and its origin, then run `method` on the wrapped regex."""
        print("Pattern:", repr(self.regex.pattern[:200]))
        print(f"{method}()", *(repr(a) for a in args))
        print(*traceback.format_stack()[2:-4])
        return getattr(self.regex, method)(*args, **kwargs)

    def search(self, *args, **kwargs):
        return self.run_and_log("search", args, kwargs)

    def match(self, *args, **kwargs):
        return self.run_and_log("match", args, kwargs)

    def fullmatch(self, *args, **kwargs):
        return self.run_and_log("fullmatch", args, kwargs)

    def sub(self, *args, **kwargs):
        return self.run_and_log("sub", args, kwargs)

    def subn(self, *args, **kwargs):
        return self.run_and_log("subn", args, kwargs)

    def split(self, *args, **kwargs):
        return self.run_and_log("split", args, kwargs)

    def findall(self, *args, **kwargs):
        return self.run_and_log("findall", args, kwargs)

    def finditer(self, *args, **kwargs):
        return self.run_and_log("finditer", args, kwargs)
63 | 
def get_and_clear_regexes():
    """
    Yield every recorded regex, removing each one from the `regexes` set as it
    is handed out. The generator finishes once the set is empty.
    """
    while regexes:
        yield regexes.pop()
73 | 
74 | 
def install(log_all_uses: bool = False):
    """
    Activate the hook by replacing `re.compile`.

    The replacement records each compiled pattern (with its flags and the
    caller's stack) in `regexes`. Installing twice has no further effect.

    :param log_all_uses: When true, `re.compile` returns a WrappedRegex which
        logs every use of the regex instead of the plain compiled pattern.
    """
    if hasattr(re.compile, "_is_hook"):
        return  # Already hooked: nothing to do

    original_compile = re.compile

    def compile(pattern, flags=0):
        caller_stack = traceback.extract_stack()[:-1]  # Ignore our hook
        regexes.add(CompiledRegex(pattern, flags, caller_stack))
        compiled = original_compile(pattern, flags)
        return WrappedRegex(compiled) if log_all_uses else compiled

    compile._is_hook = True
    re.compile = compile
93 | 
94 | 
# Running this file as a script (e.g. `python -i hook.py`) installs the hook
# with its default settings.
if __name__ == "__main__":
    install()
97 | 


--------------------------------------------------------------------------------
/regexploit/languages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/languages/__init__.py


--------------------------------------------------------------------------------
/regexploit/languages/csharp_string_extractor.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import re
  3 | 
  4 | from regexploit.found_regex import FoundRegex
  5 | 
  6 | 
  7 | def make_token_regex(*token_specification):
  8 |     return re.compile("|".join("(?P<%s>%s)" % pair for pair in token_specification))
  9 | 
 10 | 
# Tokenizer tables used by find_regexes, one per lexer mode.

# Default mode: ordinary code, outside of comments and string literals.
TOKENS_BASE = make_token_regex(
    ("LINE_COMMENT", r"//"),
    ("MULTILINE_COMMENT", r"/\*"),
    # Start of a line plus leading whitespace, optionally ending in `#`
    ("INDENT_OR_PREPROCESSOR", r"(?:^|\n)\s*#?"),
    ("SEMI_COLON", r";"),
    # The char literal '"' or '\"', matched so that its quote does not open a string
    ("DOUBLE_QUOTE_CHAR_LITERAL", r"'\\?\"'"),
    # `new Regex(`, optionally with a dotted prefix before `Regex`
    ("NEW_REGEX", r"new\s+[\w.]*?Regex\("),
    # Opening quote of a verbatim string: @"  $@"  @$"
    ("BEGIN_VERBATIM_STRING", r'(\$@|@\$?)"'),
    # Opening quote of a regular string: "  $"
    ("BEGIN_STRING", r'\$?"'),
)
# Inside a `//` comment (also used for preprocessor lines): ends at the newline.
TOKENS_LINE_COMMENT = make_token_regex(("END_COMMENT", "\n"))
# Inside a `/* ... */` comment.
TOKENS_MULTILINE_COMMENT = make_token_regex(("END_COMMENT", r"\*/"))
# Inside a verbatim string: a doubled quote is a literal quote, a single one ends it.
TOKENS_VERBATIM_STRING = make_token_regex(
    ("LITERAL_QUOTE", r'""'),
    ("END_VERBATIM_STRING", r'"'),
)
# Inside a regular string: skip escaped backslashes and escaped quotes.
TOKENS_STRING = make_token_regex(
    ("LITERAL_BACKSLASH", r"\\\\"),
    ("LITERAL_QUOTE", r'\\"'),
    ("END_STRING", '"'),
)
# After the string inside `new Regex(`: everything up to the next `;` is skipped.
TOKENS_END_NEW_REGEX = make_token_regex(("SEMI_COLON", ";"))
 33 | 
 34 | 
def find_regexes(code):
    """
    Yield a FoundRegex for every string literal found in C# source code.

    The source is scanned with a small mode-switching lexer: the active token
    table (`mode`) decides what is recognised at the current position, and
    every mode change restarts the scan from `cursor` with the new table.

    A string which follows `new ...Regex(` is yielded with
    definitely_regex=True once the terminating `;` is reached; the text between
    the end of the string and that `;` is searched for RegexOptions names.
    Every other string is yielded with definitely_regex=False.

    :param code: Source code as bytes (decoded as UTF-8, replacing invalid bytes).
    """
    code = code.decode("utf-8", "replace")
    cursor: int = 0
    mode: re.Pattern = TOKENS_BASE
    reached_end: bool = False
    inside_new_regex: bool = False
    # (index just after the string, line number, string) of the string seen
    # inside `new Regex(`, held until the closing `;` is found
    buffered_regex = None
    interpolated: bool = False  # TODO: interpolated $ strings
    newline_positions = make_lines(code)
    # Lower bound handed to line_of so that it can skip lines already passed
    seen_line = 0

    while not reached_end:
        # Every `break` below restarts finditer from `cursor`, possibly with a
        # different token table. The loop's `else` only runs when the current
        # table finds no further tokens.
        for mo in mode.finditer(code, cursor):
            kind = mo.lastgroup
            value = mo.group()
            # print(kind, value.replace('\n', '\\n'), code[cursor:mo.start()].replace('\n', '\\n'), code[mo.start():mo.end()].replace('\n', '\\n'))
            if kind == "END_COMMENT":
                mode = TOKENS_BASE
                cursor = mo.end()
                break
            elif kind == "LINE_COMMENT":
                mode = TOKENS_LINE_COMMENT
                cursor = mo.end()
                break
            elif kind == "INDENT_OR_PREPROCESSOR":
                if value and value[-1] == "#":  # Preprocessor
                    # Skip the rest of the line, like a line comment
                    mode = TOKENS_LINE_COMMENT
                    cursor = mo.end()
                    break
            elif kind == "MULTILINE_COMMENT":
                mode = TOKENS_MULTILINE_COMMENT
                cursor = mo.end()
                break
            elif kind == "SEMI_COLON":
                if inside_new_regex and buffered_regex is not None:
                    # End of a `new Regex("...", options);` statement
                    char_index, line, string = buffered_regex
                    # Whatever follows the pattern string, up to the `;`
                    flag_string = code[char_index : mo.start()]
                    flags = 0
                    # TODO: https://docs.microsoft.com/en-us/dotnet/api/system.text.regularexpressions.regexoptions
                    if "IgnoreCase" in flag_string:
                        flags |= re.I
                    if "Multiline" in flag_string:
                        flags |= re.M
                    if "IgnorePatternWhitespace" in flag_string:
                        flags |= re.X
                    yield FoundRegex(line, string, flags, True)
                    mode = TOKENS_BASE
                    cursor = mo.end()
                    inside_new_regex = False
                    buffered_regex = None
                    break
                # A `;` without a buffered string ends any pending `new Regex(`
                inside_new_regex = False
                buffered_regex = None
            elif kind == "NEW_REGEX":
                inside_new_regex = True
                buffered_regex = None
            elif kind == "BEGIN_VERBATIM_STRING":
                interpolated = "$" in value
                mode = TOKENS_VERBATIM_STRING
                cursor = mo.end()
                break
            elif kind == "BEGIN_STRING":
                interpolated = "$" in value  # noqa: F841
                mode = TOKENS_STRING
                cursor = mo.end()
                break
            elif kind in ["END_VERBATIM_STRING", "END_STRING"]:
                # `cursor` still points just after the opening quote
                string = code[cursor : mo.start()]
                if kind == "END_STRING":
                    # Resolve backslash escapes of a regular string
                    try:
                        string = string.encode().decode("unicode_escape")
                    except UnicodeDecodeError:
                        logging.warning(f"Unable to process: {string}")
                        string = string.encode().decode("utf-8", "replace")
                else:
                    # Verbatim strings only escape the quote, by doubling it
                    string = string.replace('""', '"')
                line = line_of(cursor, newline_positions, seen_line)
                seen_line = line - 1
                cursor = mo.end()
                if inside_new_regex:
                    # Hold the string until the `;` so the options can be read
                    buffered_regex = (cursor, line, string)
                    mode = TOKENS_END_NEW_REGEX
                else:
                    # A multi-line verbatim string is assumed to use verbose syntax
                    flags = (
                        re.X if kind == "END_VERBATIM_STRING" and "\n" in string else 0
                    )
                    yield FoundRegex(line, string, flags, False)
                    mode = TOKENS_BASE
                break
        else:
            reached_end = True
126 | 
127 | 
def make_lines(code):
    """Return the index of every newline character in `code`, in ascending order."""
    return [index for index, char in enumerate(code) if char == "\n"]
130 | 
131 | 
def line_of(character_index: int, newline_positions, seen_line: int):
    """
    Return the 1-based line number of a character index.

    :param character_index: Index into the source text.
    :param newline_positions: Ascending indexes of the newline characters in
        the text (see make_lines).
    :param seen_line: Number of leading newlines already known to lie before
        `character_index`; the search starts from there.
    :return: The line number. Text after the final newline is on line
        len(newline_positions) + 1.
    """
    line = seen_line
    for newline_position in newline_positions[seen_line:]:
        if character_index < newline_position:
            break
        line += 1
    # `line` now counts the newlines before the character, so the character is
    # on the following line. This also covers an index beyond the last newline
    # and a `seen_line` at or past the end of the list.
    return line + 1
139 | 


--------------------------------------------------------------------------------
/regexploit/languages/javascript.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
# There's quite a lot wrong here, but it'll do for now.
# Wow, looking back on this, this is still horrific.
# Patterns used by fix_js_regex. All of them require that the construct is not
# preceded by a backslash.
# A `[^]` character class
CARAT_FIX = re.compile(r"(?<!\\)\[\^\]")
# A JavaScript named group `(?<name>`
NAMED_GROUP_FIX = re.compile(r"(?<!\\)\(\?<(\w+)>")
# Inside `[...`: a \w \s \d \W \S \D shorthand directly followed by `-`
HYPHEN_FIX_1 = re.compile(r"(?<!\\)(\[[^\]]*(?<!\\)\\[wsdWSD])-")
# Inside `[...`: a `-` directly followed by one of those shorthands
HYPHEN_FIX_2 = re.compile(r"(?<!\\)(\[[^\]]*(?<!\\))-(\\[wsdWSD])")
10 | 
11 | 
def fix_js_regex(pattern: str) -> str:
    """Rewrite JavaScript-only regex syntax so that python can parse the pattern. May accidentally alter meaning."""
    rewrites = (
        (CARAT_FIX, r"\^"),  # `[^]` becomes an escaped caret
        (NAMED_GROUP_FIX, r"(?P<\1>"),  # `(?<name>` becomes `(?P<name>`
        (HYPHEN_FIX_1, r"\1\-"),  # escape `-` after a class shorthand
        (HYPHEN_FIX_2, r"\1\-\2"),  # escape `-` before a class shorthand
    )
    for js_construct, replacement in rewrites:
        pattern = js_construct.sub(replacement, pattern)
    return pattern
19 | 


--------------------------------------------------------------------------------
/regexploit/languages/python_node_visitor.py:
--------------------------------------------------------------------------------
  1 | import ast  # The python library not regexploit.ast
  2 | import re
  3 | from typing import List, Union
  4 | 
  5 | from regexploit.found_regex import FoundRegex
  6 | 
  7 | 
# For each supported `re` module function: the 0-based index of its `flags`
# parameter when flags are passed positionally, e.g. re.compile(pattern, flags).
RE_FUNC_TO_FLAGS_POS = {
    "compile": 1,
    "search": 2,
    "match": 2,
    "fullmatch": 2,
    "findall": 2,
    "finditer": 2,
    "split": 3,
    "sub": 4,
    "subn": 4,
}
 19 | 
 20 | 
 21 | class PythonNodeVisitor(ast.NodeVisitor):
 22 |     """
 23 |     Try to extract regular expressions from python code by walking the AST.
 24 |     """
 25 | 
 26 |     def __init__(self):
 27 |         self.patterns: List[FoundRegex] = []
 28 | 
 29 |     def maybe_pattern(self, lineno: int, pattern: str):
 30 |         """Check if the pattern could possibly have ReDoS: if so, add it."""
 31 |         if pattern.count("*") + pattern.count("+") + pattern.count(",}") >= 2:
 32 |             # Could have ReDoS
 33 |             # Now check if it still looks like a docstring
 34 |             if " * * *" in pattern:
 35 |                 return  # Looks like cron (of course could just be really silly regex)
 36 |             if pattern.count("\n") < 5 or "?" in pattern or "\\" in pattern:
 37 |                 self.patterns.append(FoundRegex(lineno, pattern, 0, False))
 38 | 
 39 |     def visit_Constant(self, constant: ast.Constant):
 40 |         if isinstance(constant.value, bytes):
 41 |             try:
 42 |                 self.maybe_pattern(constant.lineno, constant.value.decode())
 43 |             except UnicodeDecodeError:
 44 |                 pass  # TODO Parse unicode patterns
 45 |         elif isinstance(constant.value, str):
 46 |             self.maybe_pattern(constant.lineno, constant.value)
 47 | 
 48 |     def visit_Assign(self, node: ast.Assign):
 49 |         if (
 50 |             len(node.targets) != 1
 51 |             or not isinstance(node.targets[0], ast.Name)
 52 |             or node.targets[0].id != "__doc__"
 53 |         ):
 54 |             self.generic_visit(node)
 55 | 
 56 |     def visit_body_without_docstring(
 57 |         self,
 58 |         node: Union[ast.FunctionDef, ast.AsyncFunctionDef, ast.Module, ast.ClassDef],
 59 |     ):
 60 |         if node.body:
 61 |             body = node.body
 62 |             if isinstance(body[0], ast.Expr):
 63 |                 potential_docstring = body[0].value
 64 |                 if isinstance(potential_docstring, ast.Constant):
 65 |                     node.body = node.body[1:]  # Ignore docstring
 66 | 
 67 |         self.generic_visit(node)
 68 | 
 69 |     def visit_FunctionDef(self, node: ast.FunctionDef):
 70 |         self.visit_body_without_docstring(node)
 71 | 
 72 |     def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
 73 |         self.visit_body_without_docstring(node)
 74 | 
 75 |     def visit_ClassDef(self, node: ast.ClassDef):
 76 |         self.visit_body_without_docstring(node)
 77 | 
 78 |     def visit_Module(self, node: ast.Module):
 79 |         self.visit_body_without_docstring(node)
 80 | 
 81 |     def visit_Call(self, node: ast.Call):
 82 |         if isinstance(node.func, ast.Attribute):
 83 |             attr: ast.Attribute = node.func
 84 |             if (
 85 |                 isinstance(attr.value, ast.Name)
 86 |                 and attr.value.id == "re"
 87 |                 and len(node.args)
 88 |                 and isinstance(node.args[0], ast.Constant)
 89 |             ):
 90 |                 flags = 0
 91 |                 pattern = node.args[0].value
 92 |                 flags_pos = RE_FUNC_TO_FLAGS_POS.get(attr.attr)
 93 |                 if flags_pos is not None:
 94 |                     # re.compile, re.sub, re.match etc
 95 |                     if len(node.args) == flags_pos + 1:
 96 |                         flags = RegexFlagVisitor.get_flags(node.args[flags_pos])
 97 |                     else:
 98 |                         for kw in node.keywords:
 99 |                             if kw.arg == "flags":
100 |                                 flags = RegexFlagVisitor.get_flags(kw.value)
101 |                                 break
102 |                     if isinstance(pattern, bytes):
103 |                         try:
104 |                             pattern = pattern.decode()
105 |                         except UnicodeDecodeError:
106 |                             return  # TODO unicode
107 |                     if isinstance(pattern, str):
108 |                         self.patterns.append(
109 |                             FoundRegex(node.lineno, pattern, flags, True)
110 |                         )
111 |                         return
112 | 
113 |         self.generic_visit(node)
114 | 
115 | 
class RegexFlagVisitor(ast.NodeVisitor):
    """Work out regex flags from an expression such as the `re.X | re.M` in re.compile("abc", re.X | re.M)"""

    def __init__(self):
        self.flags: int = 0

    def visit_Attribute(self, node: ast.Attribute):
        owner = node.value
        if not (isinstance(owner, ast.Name) and owner.id == "re"):
            return
        name = node.attr
        # Only upper-case attributes which the re module really has are used
        if name == name.upper() and hasattr(re, name):
            self.flags |= getattr(re, name)

    @staticmethod
    def get_flags(node) -> int:
        """Return the OR of every `re.<UPPERCASE>` attribute found in the expression."""
        visitor = RegexFlagVisitor()
        visitor.visit(node)
        return visitor.flags
135 | 


--------------------------------------------------------------------------------
/regexploit/output/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doyensec/regexploit/e0ad915be5c8b041bbc5d5f6e66998ffbe60fec6/regexploit/output/__init__.py


--------------------------------------------------------------------------------
/regexploit/output/text.py:
--------------------------------------------------------------------------------
# Names of polynomial degrees 1 to 10; index with (starriness - 1).
POLYNOMIAL_DEGREES = [
    "linear",
    "quadratic",
    "cubic",
    "quartic",
    "quintic",
    "sextic",
    "septic",
    "octic",
    "nonic",
    "decic",
]
13 | 
14 | 
class TextOutput:
    """Prints findings as plain text and counts the regexes which were processed."""

    def __init__(self, js_flavour: bool = False):
        self.js_flavour = js_flavour
        self.regexes = 0
        self.first_for_regex = True

    def next(self):
        """Next regex being processed."""
        self.regexes += 1
        self.first_for_regex = True

    def record(self, redos, pattern, *, filename=None, lineno=None, context=None):
        """
        Print one finding.

        The location / pattern / context header is printed only for the first
        finding recorded since the last call to `next`.
        """
        if self.first_for_regex:
            if filename:
                location = filename if lineno is None else f"{filename} #{lineno}"
                print(f"Vulnerable regex in {location}")
            print(f"Pattern: {pattern}")
            if context:
                print(f"Context: {context}")
            print("---")
            self.first_for_regex = False

        print(redos)
        starriness = redos.starriness
        if starriness > 10:
            degree = "exponential"
        elif starriness > 0:
            degree = POLYNOMIAL_DEGREES[starriness - 1]
        else:
            degree = "?"
        stars = "\u2b50" * min(10, starriness)  # At most ten stars are shown
        print(f"Worst-case complexity: {starriness} {stars} ({degree})")
        print(f"Repeated character: {redos.repeated_character}")
        if redos.killer:
            print(f"Final character to cause backtracking: {redos.killer}")
        print(f"Example: {redos.example(self.js_flavour)}\n")
52 | 


--------------------------------------------------------------------------------
/regexploit/redos.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from dataclasses import dataclass
  3 | from typing import Iterator, List, Optional
  4 | 
  5 | from regexploit.ast.at import EndOfString
  6 | from regexploit.ast.branch import Branch
  7 | from regexploit.ast.char import Character
  8 | from regexploit.ast.repeat import InfiniteRepeat, Repeat
  9 | from regexploit.ast.sequence import Sequence
 10 | 
 11 | 
 12 | @dataclass(frozen=True)
 13 | class Redos:
 14 |     starriness: int
 15 |     prefix_sequence: Sequence
 16 |     redos_sequence: Sequence
 17 |     repeated_character: Character
 18 |     killer: Optional[Character]
 19 | 
 20 |     @property
 21 |     def example_prefix(self) -> str:
 22 |         return self.prefix_sequence.example()
 23 | 
 24 |     def example(self, js_flavour: bool = False) -> str:
 25 |         repeated_char = self.repeated_character
 26 |         killer = self.killer
 27 |         # Try to find a repeating character which is also a killer
 28 |         if killer and (killing_repeat := repeated_char & killer):
 29 |             repeated_char = killing_repeat
 30 |             killer = None
 31 | 
 32 |         prefix = (
 33 |             self.example_prefix.encode("unicode_escape").decode().replace("'", "\\'")
 34 |         )
 35 |         repeated_char_s = (
 36 |             repeated_char.example()
 37 |             .encode("unicode_escape")
 38 |             .decode()
 39 |             .replace("'", "\\'")
 40 |         )
 41 |         e = f"'{prefix}' + " if prefix else ""
 42 |         if js_flavour:
 43 |             e += f"'{repeated_char_s}'.repeat(3456)"
 44 |         else:
 45 |             e += f"'{repeated_char_s}' * 3456"
 46 | 
 47 |         if killer:
 48 |             killer_s = (
 49 |                 killer.example().encode("unicode_escape").decode().replace("'", "\\'")
 50 |             )
 51 |             return e + f" + '{killer_s}'"
 52 |         return e
 53 | 
 54 | 
 55 | def find(sequence, flags: int = 0) -> List[Redos]:
 56 |     """
 57 |     Returns Redos objects sorted by severity (most starry first), then sorted by example_prefix (shortest first).
 58 |     """
 59 |     redos = []
 60 |     for r in find_redos(sequence):
 61 |         if r not in redos:
 62 |             redos.append(r)
 63 |     return sorted(redos, key=lambda r: -r.starriness * 1000 + len(r.example_prefix))
 64 | 
 65 | 
 66 | def expand_branches(seq: Sequence) -> Iterator[Sequence]:
 67 |     """
 68 |     Yield copies of seq with each Branch replaced by one of its options. This could blow up exponentially, but it's nicer for now to expand branches.
 69 |     """
 70 |     head = []  # Elements seen so far which are kept as they are
 71 |     for i, elem in enumerate(seq.elements):
 72 |         if isinstance(elem, Branch):
 73 |             for b in elem.get_branches():
 74 |                 head_plus_branch = head + (  # A falsy branch adds nothing; a Sequence branch is spliced in
 75 |                     [] if not b else [b] if not isinstance(b, Sequence) else b.elements
 76 |                 )
 77 |                 for tail in expand_branches(Sequence(seq.elements[i + 1 :])):  # Expand what follows recursively
 78 |                     yield Sequence(head_plus_branch + tail.elements)
 79 |             return  # All processing in yields
 80 |         elif isinstance(elem, Repeat) and elem.starriness > 10:
 81 |             logging.debug("Exponential: %s", elem)
 82 |             if isinstance(elem.repeat, (Sequence, Branch)):
 83 |                 for tail in expand_branches(Sequence(seq.elements[i + 1 :])):
 84 |                     yield Sequence(head + [elem] + tail.elements)  # Variant keeping elem as it is
 85 |                     for pseudo_repeat in elem.repeat.matching_repeats():
 86 |                         logging.debug("Pseudo repeat %s", pseudo_repeat)
 87 |                         yield Sequence(  # Variant using elem.alter_repeat(pseudo_repeat) in place of elem
 88 |                             head + [elem.alter_repeat(pseudo_repeat)] + tail.elements
 89 |                         )
 90 |             else:  # NOTE(review): the branch above neither returns nor appends elem to head, so later yields omit elem - confirm this is intended
 91 |                 head.append(elem)
 92 |         else:
 93 |             head.append(elem)
 94 |     yield Sequence(head)
 95 | 
 96 | 
 97 | def find_redos(sequence_with_branches) -> Iterator[Redos]:
 98 |     logging.debug(sequence_with_branches)
 99 |     if not isinstance(
100 |         sequence_with_branches, Sequence
101 |     ):  # singleton like Branch (ab|cd)
102 |         sequence_with_branches = Sequence([sequence_with_branches])
103 |     for seq in expand_branches(sequence_with_branches):
104 |         yield from find_redos_in_branchless_sequence(seq)
105 | 
106 | 
107 | def find_redos_in_branchless_sequence(seq: Sequence) -> Iterator[Redos]:
108 |     logging.debug(seq)
109 |     for i, elem in enumerate(seq.elements):
110 |         # TODO branches
111 |         if isinstance(elem, InfiniteRepeat) and (c := elem.overall_character_class()):
112 |             yield from make_redos(seq, i, i + 1, c, elem.starriness)
113 | 
114 | 
115 | def make_redos(
116 |     seq: Sequence,
117 |     sequence_start: int,
118 |     continue_from: int,
119 |     repeated_character: Character,
120 |     starriness: int,
121 | ) -> Iterator[Redos]:
122 |     # TODO branches
123 |     character_history = [repeated_character]
124 |     logging.debug(
125 |         "Make ReDoS %d %d %s %d",
126 |         sequence_start,
127 |         continue_from,
128 |         repeated_character,
129 |         starriness,
130 |     )
131 |     for current_index in range(continue_from, len(seq)):
132 |         elem = seq.elements[current_index]
133 | 
134 |         if isinstance(elem, EndOfString):
135 |             # May need to go back before the matching sequence to calculate $
136 |             elem.set_character(seq.elements[:current_index])
137 | 
138 |         eoc = elem.overall_character_class()
139 |         new_c = repeated_character & eoc
140 |         logging.debug("%s & %s = %s (for %s)", repeated_character, eoc, new_c, elem)
141 | 
142 |         # Handle optional elements
143 |         if elem.minimum_length == 0:
144 |             if elem.starriness:
145 |                 # If we have a*, we branch and try with and without it
146 |                 if new_c != repeated_character:
147 |                     # Only branch if we have [ab]a* : if we have aa* or a[ab]* then the character class doesn't change
148 |                     # Try without this element
149 |                     yield from make_redos(
150 |                         seq,
151 |                         sequence_start,
152 |                         current_index + 1,
153 |                         repeated_character,
154 |                         starriness,
155 |                     )
156 |             else:
157 |                 continue  # Don't care about finite repeats (abc)? or a{,4}
158 | 
159 |         # print(repeated_character, "+", elem.overall_character_class(), "->", new_c)
160 |         if new_c is None:
161 |             # This element will force backtracking as it's incompatible with `repeated_character`
162 |             if elem.minimum_length and starriness > 2:
163 |                 yield redos_found(
164 |                     seq,
165 |                     sequence_start,
166 |                     current_index,
167 |                     repeated_character,
168 |                     starriness,
169 |                     None,
170 |                 )
171 |             return
172 | 
173 |         starriness += elem.starriness
174 |         repeated_character = new_c
175 |         character_history.append(new_c)
176 | 
177 |     # Everything matched! We need to work backwards and find a 'killer' to cause backtracking if we want ReDoS
178 |     logging.debug("Backtracking: %s", character_history)
179 |     for current_index in reversed(range(continue_from, len(seq))):
180 |         elem = seq.elements[current_index]
181 |         character_history.pop()
182 |         starriness -= elem.starriness
183 |         if starriness <= 2:
184 |             return
185 |         # Can't get backtracking by not matching optional groups
186 |         if elem.minimum_length > 0:
187 |             # Find a character which matches the sequence and then fails on the killer
188 |             if (match := elem.overall_character_class()) and (killer := match.negate()):
189 |                 old_repeat = character_history.pop()
190 |                 logging.debug(
191 |                     "%s (for %s): killer=%s, repeat=%s",
192 |                     match,
193 |                     elem,
194 |                     killer,
195 |                     old_repeat,
196 |                 )
197 |                 yield redos_found(
198 |                     seq,
199 |                     sequence_start,
200 |                     current_index,
201 |                     old_repeat,
202 |                     starriness,
203 |                     killer,
204 |                 )
205 |                 return
206 |     logging.debug("Backtracking: FAIL")
207 | 
208 | 
209 | def redos_found(
210 |     seq: Sequence,
211 |     start: int,
212 |     backtrack_at: int,
213 |     repeated_character: Character,
214 |     starriness: int,
215 |     killer: Optional[Character],
216 | ) -> Redos:
217 |     # TODO: Try to include some skipped optional parts (like `?`) just to make it nicer
218 |     logging.debug("ReDoS found")
219 |     return Redos(
220 |         starriness,
221 |         Sequence(seq.elements[:start]),
222 |         Sequence(seq.elements[start : backtrack_at + 1]),
223 |         repeated_character,
224 |         killer,
225 |     )
226 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | black
2 | flake8
3 | flake8-import-order
4 | pytest
5 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="regexploit",
 8 |     version="1.0.0",
 9 |     author="Ben Caller :: Doyensec",
10 |     author_email="REMOVETHISPREFIX.ben@doyensec.com",
11 |     description="Find regular expressions vulnerable to ReDoS",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/doyensec/regexploit",
15 |     packages=setuptools.find_packages(),
16 |     include_package_data=True,
17 |     classifiers=[
18 |         "Programming Language :: Python :: 3",
19 |         'License :: OSI Approved :: Apache Software License',
20 |         "Operating System :: OS Independent",
21 |     ],
22 |     python_requires=">=3.8",
23 |     extras_require={
24 |         "yaml": ['pyyaml>=5.3.1']
25 |     },
26 |     scripts=[
27 |         # Easy-install uses imports, so can miss findings
28 |         "regexploit/bin/regexploit-python-env",
29 |     ],
30 |     entry_points={
31 |         "console_scripts": [
32 |             "regexploit=regexploit.bin.regexploit:main",
33 |             "regexploit-js=regexploit.bin.regexploit_js:main",
34 |             "regexploit-py=regexploit.bin.regexploit_python_ast:main",
35 |             "regexploit-yaml=regexploit.bin.regexploit_yaml:main_yaml",
36 |             "regexploit-json=regexploit.bin.regexploit_yaml:main",
37 |             "regexploit-csharp=regexploit.bin.regexploit_csharp:main",
38 |         ],
39 |     },
40 | )
41 | 


--------------------------------------------------------------------------------
/tests/test.cs:
--------------------------------------------------------------------------------
 1 | using System;
 2 | using System.Text.RegularExpressions;
 3 | 
 4 | public class Example
 5 | {
 6 |    #line 1 "C:\Users\test"
 7 |    public static void Main()
 8 |    {
 9 |       /****"@"
10 |       ; " @ '"\
11 |       */
12 |       string input = "Not a regex*****";
13 |       string regex = "\\w+_[\\w\"]+_\\w+w";
14 |       /**/
15 |       string pattern = @"x""\d+.\d+.\d+!";
16 |       char c = '"';
17 |       char d = '\"';
18 |       Regex r = new Regex(@"\b(?<word>\w+)\s+x\b", RegexOptions.IgnoreCase);
19 |       Regex r = new Regex(
20 |           "\\b(?<word>\\w+)\\s+\\b",
21 |           // What?
22 |           /**/
23 |           RegexOptions.IgnoreCase
24 |       );
25 |       Something(@"
26 |          (a              # An a
27 |            *   # starred
28 |          )  # bracket
29 |          *  # starred again
30 |       x", x);
31 |    }
32 | }
33 | 


--------------------------------------------------------------------------------
/tests/test_at.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from regexploit.ast.at import EndOfString
 4 | from regexploit.ast.sre import SreOpParser
 5 | 
 6 | 
 7 | def from_regex(pattern: str):
 8 |     return SreOpParser().parse_sre(pattern)
 9 | 
10 | 
11 | @pytest.mark.parametrize(
12 |     "r",
13 |     [
14 |         r".*b*",
15 |         r".*\w*b*",
16 |         r".+b*",
17 |     ],
18 | )
19 | def test_cannot_backtrack(r):
20 |     dollar = EndOfString()
21 |     dollar.set_character(from_regex(r).elements)
22 |     assert dollar.character.is_any
23 | 
24 | 
25 | @pytest.mark.parametrize(
26 |     "r",
27 |     [
28 |         r"x[ab]*b*",
29 |         r"x+[ab]*",
30 |         r"x+a*[ab]*a*b*",
31 |     ],
32 | )
33 | def test_dollar_simple(r):
34 |     dollar = EndOfString()
35 |     dollar.set_character(from_regex(r).elements)
36 |     assert dollar.character == from_regex("[ab]")
37 | 
38 | 
39 | @pytest.mark.parametrize(
40 |     "r",
41 |     [
42 |         r"\w*b*",
43 |         r"x\w*\w*b*",
44 |         r"\w+b*",
45 |     ],
46 | )
47 | def test_dollar_optionals_contained_by_mandatory(r):
48 |     dollar = EndOfString()
49 |     dollar.set_character(from_regex(r).elements)
50 |     assert dollar.character == from_regex(r"[\w]").expand_categories()
51 | 
52 | 
53 | def test_whole_string():
54 |     dollar = EndOfString()
55 |     dollar.set_character(from_regex(r"a*a*").elements)
56 |     assert dollar.character == from_regex(r"[a]")
57 | 
58 | 
59 | def test_real():
60 |     dollar = EndOfString()
61 |     dollar.set_character(from_regex(r"-\d+(\s*\s*\s*)").elements)
62 |     assert dollar.character == from_regex(r"[\s]")
63 | 


--------------------------------------------------------------------------------
/tests/test_character.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from sre_parse import parse as sre_parse
  3 | 
  4 | from regexploit.ast.categories import Category
  5 | from regexploit.ast.char import Character
  6 | from regexploit.ast.sre import SreOpParser
  7 | 
  8 | 
  9 | def from_regex(pattern: str) -> Character:
 10 |     (parsed_char,) = sre_parse(pattern)
 11 |     char = SreOpParser().parse_op(*parsed_char)
 12 |     assert isinstance(char, Character)
 13 |     return char
 14 | 
 15 | 
 16 | def test_literal_and():
 17 |     assert from_regex("[abc]") & from_regex("[bcd]") == from_regex("[bc]")
 18 | 
 19 | 
 20 | def test_literal_negated_and():
 21 |     assert from_regex("[^abc]") & from_regex("[^bcd]") == from_regex("[^a-d]")
 22 | 
 23 | 
 24 | def test_literal_mixed_and():
 25 |     assert from_regex("[abcz]") & from_regex("[^bcd]") == from_regex("[az]")
 26 |     assert from_regex("[^bcd]") & from_regex("[abcz]") == from_regex("[az]")
 27 | 
 28 | 
 29 | def test_category_and():
 30 |     assert from_regex(r"[\s\d]") & from_regex(r"[\d\w]") == from_regex(r"\d")
 31 | 
 32 | 
 33 | def test_category_negated_and():
 34 |     assert from_regex(r"[^\s\d]") & from_regex(r"[^\d\w]") == from_regex(r"[^\s\d\w]")
 35 | 
 36 | 
 37 | def test_category_negated_and_simplifies_to_nothing():
 38 |     assert (from_regex(r"[^\s\d]") & from_regex(r"[^\D]")) is None
 39 | 
 40 | 
 41 | def test_mixed_and():
 42 |     assert from_regex(r"[abc123\s]") & from_regex(r"[^\d\s]") == from_regex("[abc]")
 43 | 
 44 | 
 45 | def test_mixed_and_none():
 46 |     c = from_regex(r"[123]") & from_regex(r"[^\d\s]")
 47 |     assert c is None
 48 | 
 49 | 
 50 | @pytest.mark.parametrize(
 51 |     "r",
 52 |     [
 53 |         r"a",
 54 |         r"\s",
 55 |         r"[a\s\S\d]",
 56 |         r"[A-z]",
 57 |         r"[^A-z\d]",
 58 |     ],
 59 | )
 60 | def test_and_any_none(r):
 61 |     any = Character.ANY()
 62 |     other = from_regex(r)
 63 |     assert (any & other) == other
 64 |     assert (other & any) == other
 65 |     assert (any & None) is None
 66 |     assert (None & any) is None
 67 | 
 68 | 
 69 | def test_class():
 70 |     assert from_regex("[abc]").exact_character_class() == from_regex("[cba]")
 71 | 
 72 | 
 73 | def test_negate_simple():
 74 |     assert from_regex("a").negate() == from_regex("[^a]")
 75 |     assert from_regex(r"\w").negate() == from_regex(r"[^\w]")
 76 |     assert from_regex("[^ab]").negate() == from_regex("[ab]")
 77 |     assert from_regex(r"[^\s]").negate() == from_regex(r"\s")
 78 | 
 79 | 
 80 | def test_negate_mixed():
 81 |     assert from_regex(r"[a\s\w]").negate() == from_regex(r"[^a\s\w]")
 82 | 
 83 | 
 84 | def test_or():
 85 |     assert from_regex("a") | from_regex("a") == from_regex("a")
 86 |     assert from_regex("a") | from_regex("b") == from_regex("[ab]")
 87 |     assert from_regex(r"\w") | from_regex("b") == from_regex(r"\w").expand_categories()
 88 |     assert (
 89 |         from_regex(r"\w") | from_regex("9") == from_regex(r"[9\w]").expand_categories()
 90 |     )
 91 |     assert from_regex("[^a]") | from_regex("[^b]") == from_regex(".")
 92 | 
 93 | 
 94 | def test_category_category_covers_all():
 95 |     assert from_regex(r"[\s\S]").is_any is True
 96 |     assert from_regex(r"[\Dd\d]").is_any is True
 97 | 
 98 | 
 99 | def test_negative_lookahead():
100 |     assert SreOpParser().parse_sre(r"(?![0248])(?!6)(?!a)(?!xyz123)\d") == from_regex(
101 |         r"[13579]"
102 |     )
103 | 
104 | 
105 | def test_category_category_covers_none():
106 |     assert SreOpParser().parse_sre(r"[^x0-9\w\W]") is None
107 | 
108 | 
109 | @pytest.mark.parametrize(
110 |     "category_identifier,category_enum,character",
111 |     [
112 |         ("w", Category.WORD, "b"),
113 |         ("w", Category.WORD, "C"),
114 |         ("w", Category.WORD, "_"),
115 |         ("w", Category.WORD, "3"),
116 |         ("W", Category.NOT_WORD, "-"),
117 |         ("W", Category.NOT_WORD, "."),
118 |         ("s", Category.SPACE, "\xa0"),
119 |         ("s", Category.SPACE, "\v"),
120 |     ],
121 | )
122 | def test_categories(category_identifier: str, category_enum: Category, character: str):
123 |     # \w ~= [a-zA-Z0-9_], \s ~= [ \t\n\r\f\v]
124 |     category_characters = from_regex("\\" + category_identifier).expand_categories()
125 |     char = Character.LITERAL(ord(character))
126 |     assert category_characters | char == category_characters
127 |     assert category_characters & char == char
128 |     assert category_enum.contains(ord(character))
129 | 
130 | 
131 | @pytest.mark.parametrize(
132 |     "category_identifier,category_enum,not_character",
133 |     [
134 |         ("w", Category.WORD, "-"),
135 |         ("W", Category.NOT_WORD, "_"),
136 |         ("W", Category.NOT_WORD, "9"),
137 |         ("s", Category.SPACE, "\x00"),
138 |         ("S", Category.NOT_SPACE, "\f"),
139 |     ],
140 | )
141 | def test_not_categories(
142 |     category_identifier: str, category_enum: Category, not_character: str
143 | ):
144 |     category_characters = from_regex("\\" + category_identifier).expand_categories()
145 |     char = Character.LITERAL(ord(not_character))
146 |     assert category_characters & char is None
147 |     assert not category_enum.contains(ord(not_character))
148 | 


--------------------------------------------------------------------------------
/tests/test_csharp.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from unittest.mock import Mock
 3 | 
 4 | from regexploit.bin.regexploit_csharp import handle_file
 5 | from regexploit.languages.csharp_string_extractor import find_regexes
 6 | 
 7 | 
 8 | def test_csharp():
 9 |     with open("tests/test.cs", "rb") as f:
10 |         code = f.read()
11 |     found = list(find_regexes(code))
12 |     assert len(found) == 6
13 |     assert found[0].pattern == "Not a regex*****"
14 |     assert found[1].pattern == '\\w+_[\\w"]+_\\w+w'
15 |     assert found[2].pattern == r'x"\d+.\d+.\d+!'
16 |     assert found[2].lineno == 15
17 |     assert not found[2].definitely_regex
18 |     assert found[3].definitely_regex
19 |     assert found[4].flags == re.I
20 |     assert found[5].flags == re.X
21 | 
22 | 
23 | def test_handle_file():
24 |     output = Mock(spec=["next", "record"])
25 |     handle_file("tests/test.cs", output)
26 |     assert output.next.call_count == 5
27 |     assert output.record.call_count == 3
28 | 


--------------------------------------------------------------------------------
/tests/test_javascript.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from json import dumps
 3 | from unittest.mock import Mock
 4 | 
 5 | import pytest
 6 | 
 7 | from regexploit.bin.regexploit_js import handle_line_from_node
 8 | from regexploit.languages.javascript import fix_js_regex
 9 | 
10 | 
11 | @pytest.mark.parametrize(
12 |     "r,f",
13 |     [
14 |         # Carat
15 |         (r"[^]", r"\^"),
16 |         (r"[^][^][^]", r"\^\^\^"),
17 |         (r"([^])+([^])+([^])+", r"(\^)+(\^)+(\^)+"),
18 |         (r"[^][^][^]([\[^])+", r"\^\^\^([\[^])+"),
19 |         # Named groups
20 |         (r"(?<x>y>)+(?<ab_cD0>abc)\(?<a>", r"(?P<x>y>)+(?P<ab_cD0>abc)\(?<a>"),
21 |         # Hyphen in character class
22 |         (r"[\w-:]", r"[\w\-:]"),
23 |         (r"[!-\w]", r"[!\-\w]"),
24 |     ],
25 | )
26 | def test_fixes(r, f):
27 |     with pytest.raises(re.error):
28 |         re.compile(r)
29 |     fixed = fix_js_regex(r)
30 |     assert fixed == f
31 |     re.compile(fixed)
32 | 
33 | 
34 | @pytest.mark.parametrize(
35 |     "pat,next_called,recorded",
36 |     [
37 |         ("ab*cdef", False, False),  # too few stars
38 |         ("ab+c+def", True, False),
39 |         ("ab*b+b*c", True, True),
40 |         ("a[^](?<xyz>c*)*d", True, True),
41 |         ("a[^](?<xyz>c*)d*", True, False),
42 |     ],
43 | )
44 | def test_handle_line_from_node(pat, next_called, recorded):
45 |     output = Mock(spec=["next", "record"])
46 |     line_json = dict(pattern=pat, lineno=1, filename="testfile")
47 |     handle_line_from_node(dumps(line_json), output)
48 |     if next_called:
49 |         output.next.assert_called_once()
50 |     else:
51 |         output.next.assert_not_called()
52 |     if recorded:
53 |         output.record.assert_called_once()
54 |     else:
55 |         output.record.assert_not_called()
56 | 


--------------------------------------------------------------------------------
/tests/test_python_ast.py:
--------------------------------------------------------------------------------
 1 | import ast
 2 | import re
 3 | import textwrap
 4 | from unittest.mock import Mock
 5 | 
 6 | from regexploit.bin.regexploit_python_ast import handle_file
 7 | from regexploit.found_regex import FoundRegex
 8 | from regexploit.languages.python_node_visitor import PythonNodeVisitor
 9 | 
10 | 
11 | def patterns_from_code(code: str):
12 |     pnv = PythonNodeVisitor()
13 |     code = textwrap.dedent(code)
14 |     pnv.visit(ast.parse(code))
15 |     return pnv.patterns
16 | 
17 | 
18 | def test_basic():
19 |     code = """
20 |     MY_RE = "abc+d+"
21 |     def x():
22 |         '''Just*a*docstring*'''
23 |         a = "nostarsorpluses"
24 |         b = "(" + re.sub("aregex", "*****", "notaregex", flags=re.A) + ")"
25 |         return re.compile(b"x*y*z", re.X | re.MULTILINE)
26 |     """
27 |     patterns = patterns_from_code(code)
28 |     assert len(patterns) == 3
29 |     assert patterns[0] == FoundRegex(2, "abc+d+", 0, False)
30 |     assert patterns[1] == FoundRegex(6, "aregex", re.A, True)
31 |     assert patterns[2] == FoundRegex(7, "x*y*z", re.X | re.MULTILINE, True)
32 | 
33 | 
34 | def test_file():
35 |     output = Mock(spec=["next"])
36 |     handle_file(__file__, output)
37 |     assert output.next.call_count == 2  # abc+d+, x*y*z, code string errors
38 | 


--------------------------------------------------------------------------------
/tests/test_redos.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | import pytest
  4 | 
  5 | from regexploit.ast.sequence import Sequence
  6 | from regexploit.ast.sre import SreOpParser
  7 | from regexploit.redos import find
  8 | 
  9 | 
 10 | def from_regex(pattern: str, flags: int = 0) -> Sequence:
 11 |     return SreOpParser().parse_sre(pattern, flags)
 12 | 
 13 | 
 14 | def find_redos(pattern: str, flags: int = 0):
 15 |     return find(from_regex(pattern, flags))
 16 | 
 17 | 
 18 | def test_no_repeats():
 19 |     assert len(find_redos(r"aaaaa[abc](\w[\wz]){1,7}X[^x]")) == 0
 20 | 
 21 | 
 22 | def test_simple_repeat1():
 23 |     (r,) = find_redos(r"abd\w*[def]+\w+[de]!")
 24 |     assert r.starriness == 3
 25 |     assert r.repeated_character == from_regex("[de]")
 26 |     assert r.example_prefix == "abd"
 27 |     assert r.killer is None
 28 | 
 29 | 
 30 | def test_simple_repeat2():
 31 |     rs = find_redos(r"\w*x0*\d*\.?\d\.?\d+4")
 32 |     assert len(rs)
 33 |     r = rs[0]
 34 |     assert r.starriness == 3
 35 |     assert r.repeated_character == from_regex("0")
 36 |     assert r.example_prefix == "x"
 37 |     assert r.killer is None
 38 | 
 39 | 
 40 | def test_simple_best_repeat():
 41 |     rs = find_redos(r"\d*0*\d*x?\dx?\d+4")
 42 |     assert len(rs) > 1
 43 |     a = rs[0]
 44 |     assert a.starriness == 4
 45 |     assert a.repeated_character == from_regex("0")
 46 |     assert a.example_prefix == ""
 47 |     assert a.killer is None
 48 |     assert rs[1].starriness == 3
 49 | 
 50 | 
 51 | def test_backtrack():
 52 |     rs = find_redos(r"[abc]+\w+[ab]+a")
 53 |     r = rs[0]
 54 |     assert r.starriness == 3
 55 |     assert len(r.redos_sequence) == 4
 56 |     assert r.killer == from_regex("[^a]")
 57 |     assert r.example() == "'b' * 3456"
 58 | 
 59 | 
 60 | def test_real_hbbtv():
 61 |     rs = find_redos(
 62 |         r"(HbbTV)/[0-9]+\.[0-9]+\.[0-9]+ \([^;]*; *(LG)E *; *([^;]*) *;[^;]*;[^;]*;\)"
 63 |     )
 64 |     r = rs[0]
 65 |     assert r.starriness == 3
 66 |     assert len(r.redos_sequence) == 4
 67 |     assert r.repeated_character == from_regex(" ")
 68 |     assert r.example_prefix.startswith("HbbTV/")
 69 |     assert r.example_prefix.endswith("(;LGE;")
 70 | 
 71 | 
 72 | def test_real_branching():
 73 |     rs = [
 74 |         redos
 75 |         for redos in find_redos(
 76 |             r"(HbbTV)/[0-9]+\.[0-9]+\.[0-9]+ \([^;]*; *(?:CUS:([^;]*)|([^;]+)) *; *([^;]*) *;.*;"
 77 |         )
 78 |         if redos.starriness >= 3
 79 |     ]
 80 |     assert all(r.starriness == 3 for r in rs)
 81 |     assert all(r.killer is None for r in rs)
 82 |     assert all(r.repeated_character == from_regex(" ") for r in rs)
 83 |     assert {r.example_prefix for r in rs} == {
 84 |         "HbbTV/0.0.0 (;CUS:;",
 85 |         "HbbTV/0.0.0 (;",
 86 |         "HbbTV/0.0.0 (;0;",
 87 |     }
 88 | 
 89 | 
 90 | def test_dollar():
 91 |     rs = find_redos(r"^a+(b*b*b*)$")
 92 |     r = rs[0]
 93 |     assert r.starriness == 3
 94 |     assert r.repeated_character == from_regex(r"b")
 95 |     assert r.killer == from_regex(r"[^b]")
 96 | 
 97 | 
 98 | def test_real_cpython_cookielib():
 99 |     # We don't support the (?!) assertions, but can still find ReDoS
100 |     LOOSE_HTTP_DATE_RE = r"""^
101 |         (\d\d?)            # day
102 |            (?:\s+|[-\/])
103 |         (\w+)              # month
104 |             (?:\s+|[-\/])
105 |         (\d+)              # year
106 |         (?:
107 |               (?:\s+|:)    # separator before clock
108 |            (\d\d?):(\d\d)  # hour:min
109 |            (?::(\d\d))?    # optional seconds
110 |         )?                 # optional clock
111 |            \s*
112 |         ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
113 |            \s*
114 |         (?:\(\w+\))?       # ASCII representation of timezone in parens.
115 |            \s*$"""
116 |     rs = find_redos(LOOSE_HTTP_DATE_RE, re.X)
117 |     r = rs[0]
118 |     assert r.starriness == 3
119 |     assert r.repeated_character == from_regex(r"\s")
120 |     assert r.killer == from_regex(r"[^\s]")
121 | 
122 | 
123 | def test_real_cpython_cve():
124 |     rs = find_redos(r"(,*,)*(,+)[ \t]")
125 |     r = rs[0]
126 |     assert r.starriness == 12  # exponential
127 | 
128 | 
129 | def test_real_ssri():
130 |     rs = find_redos(r"^([A-Za-z0-9+/=]{4})(\?[\x21-\x7E]*)*$")
131 |     r = rs[0]
132 |     assert r.starriness > 10
133 |     assert r.repeated_character == from_regex(r"\?")
134 |     assert r.example_prefix == "0000"
135 | 
136 | 
137 | def test_real_pdf():
138 |     # \012 == \n == \x0a
139 |     rs = find_redos(
140 |         r"t[\011\012\015\040]*\<\<(.*?\>\>)[\011\012\015\040]*[\r\n]+[\011\012\015\040]*s"
141 |     )
142 |     r = rs[0]
143 |     assert r.starriness == 3
144 |     assert r.repeated_character == from_regex(r"[\n\r]")
145 |     assert r.example_prefix == "t<<>>"
146 |     assert not r.killer
147 | 
148 | 
149 | def test_real_markdown():
150 |     # \s\S == .
151 |     rs = find_redos(r"\(\s*(<)?([\s\S]*?)(?(2)>)(?:\s+'([\s\S]*?)')?\s*\)")
152 |     r = rs[0]
153 |     assert r.starriness == 3
154 |     assert r.repeated_character == from_regex(r"\s")
155 |     assert r.example_prefix == "("
156 |     assert not r.killer
157 | 
158 | 
159 | def test_backtrack_repeated_char():
160 |     # (' ' * 3456 + '\t') won't backtrack because of the .* after
161 |     rs = find_redos(r"#\s*\s*\s*([^ \t]+)(.*)$")
162 |     r = rs[0]
163 |     assert r.starriness == 3
164 |     assert r.repeated_character == from_regex(r"\s")
165 |     assert r.example_prefix == "#"
166 |     assert r.killer == from_regex(r"[ \t]")
167 |     assert (
168 |         r.example() == "'#' + ' ' * 3456"
169 |     ), "Merge repeated character and killer in example if possible"
170 | 
171 | 
172 | @pytest.mark.parametrize(
173 |     "r",
174 |     [
175 |         r"a+",
176 |         r"a(aa)+a",
177 |         r"a*",
178 |         r"\w*b?c*(def|gh+i|$|\b||)+",
179 |     ],
180 | )
181 | def test_groupref(r):
182 |     rs = find_redos(fr"({r})(a+)\1(a+)b")
183 |     r = rs[0]
184 |     assert r.starriness == 3
185 |     assert r.repeated_character == from_regex(r"a")
186 | 
187 | 
188 | def test_groupref_not_starry_itself():
189 |     rs = find_redos(r"(a+)(a+)\1b")
190 |     assert not rs
191 | 
192 | 
193 | def test_groupref_false_positive():
194 |     # from codemirror
195 |     rs = find_redos(r"^([*\-_])(?:\s*\1){2,}\s*$")
196 |     assert not rs
197 | 
198 | 
199 | def test_optional_starry():
200 |     # ua-parser CFNetwork
201 |     rs = find_redos(r"(\d+).?(\d+)?.?(\d+)?.?(\d+)?C")
202 |     r = rs[0]
203 |     assert r.starriness == 4
204 |     assert r.repeated_character == from_regex(r"\d")
205 | 
206 | 
207 | def test_negative_lookahead():
208 |     # The final (?!c) isn't actually doing anything yet
209 |     rs = find_redos(r"[abc]+(?!c)[abc]+(?!b)([abc]+[abc])(?!c)[abc]*x")
210 |     r = rs[0]
211 |     assert r.starriness == 4
212 |     assert r.repeated_character == from_regex(r"a")
213 | 
214 | 
215 | @pytest.mark.parametrize(
216 |     "r",
217 |     [
218 |         r"(a?b+)+c",
219 |         r"(x*[ab]*x?[bc]*x?)*c",
220 |         r"(x?[ab]+x?[bc]+\w*x?)*c",
221 |     ],
222 | )
223 | def test_regexlib_sequence_exponential(r):
224 |     rs = find_redos(r)
225 |     r = rs[0]
226 |     assert r.starriness > 10
227 |     assert r.repeated_character == from_regex(r"b")
228 |     assert r.killer is None
229 | 
230 | 
231 | def test_dt_branch_exponential():
232 |     rs = find_redos(r"a(z|\w*b)*d")
233 |     r = rs[0]
234 |     assert r.starriness == 11
235 |     assert r.repeated_character == from_regex(r"b")
236 |     assert r.killer is None
237 | 
238 | 
239 | def test_node_forge_false_positive():
240 |     rs = find_redos(r"\s*([^=]*)=?([^;]*)(;|$)")
241 |     assert not rs
242 | 
243 | 
244 | def test_ruby_maruku_false_positive():
245 |     rs = find_redos(r"(\S.*\S)*\s*")
246 |     assert not rs
247 | 
248 | 
249 | def test_real_httplib2():
250 |     rs = find_redos(
251 |         r"^(?:\s*(?:,\s*)?([^ \t\r\n=]+)\s*=\s*\"?((?<=\")(?:[^\\\"]|\\.)*?(?=\")|(?<!\")[^ \t\r\n,]+(?!\"))\"?)(.*)$"
252 |     )
253 |     r = rs[0]
254 |     assert r.starriness == 3
255 |     assert r.repeated_character == from_regex(r"[\f\v\xa0]")
256 |     assert r.killer is None
257 | 
258 | 
259 | def test_real_markdown2():
260 |     rs = find_redos(r"\s*?([\w+-]+)?\s*?\n(.*?)^```")
261 |     r = rs[0]
262 |     assert r.starriness == 3
263 |     assert r.repeated_character == from_regex(r"\n")
264 |     assert r.killer is None
265 | 
266 | 
267 | def test_editorconfig_false_positive():
268 |     rs = find_redos(r"\s*(.*?)\s*([#;].*)?$")
269 |     assert not rs
270 | 


--------------------------------------------------------------------------------
/tests/test_repeat.py:
--------------------------------------------------------------------------------
 1 | from sre_parse import parse as sre_parse
 2 | from typing import Union  # noqa: I100, I201
 3 | 
 4 | from regexploit.ast.char import Character
 5 | from regexploit.ast.repeat import Repeat
 6 | from regexploit.ast.sre import SreOpParser
 7 | 
 8 | 
 9 | def from_regex(pattern: str) -> Union[Repeat, Character]:
10 |     (parsed_char,) = sre_parse(pattern)
11 |     repeat = SreOpParser().parse_op(*parsed_char)
12 |     return repeat
13 | 
14 | 
def test_star():
    """A starred group has starriness 1, minimum length 0 and no exact character class."""
    star = from_regex(r"(abc)*")
    assert star.starriness == 1
    assert star.minimum_length == 0
    assert star.exact_character_class() is None
20 | 
21 | 
def test_question():
    """An optional group has starriness 0, minimum length 0 and at most one repeat."""
    optional = from_regex(r"(abc)?")
    assert optional.starriness == 0
    assert optional.minimum_length == 0
    assert optional.maximum_repeats == 1
    assert optional.exact_character_class() is None
28 | 
29 | 
def test_plus():
    """A plus-quantified three-character group has starriness 1 and minimum length 3."""
    plus = from_regex(r"(?:abc)+")
    assert plus.starriness == 1
    assert plus.minimum_length == 3
    assert plus.exact_character_class() is None
35 | 
36 | 
def test_character_class():
    """An open-ended repeat of one character exposes that character as its exact class."""
    repeated_a = from_regex(r"a{4,}")
    assert repeated_a.starriness == 1
    assert repeated_a.minimum_length == 4
    assert repeated_a.exact_character_class() == from_regex(r"a")
42 | 
43 | 
def test_subsequence_character_class():
    """A starred sequence has no character class of its own, but its inner repeat does.

    The outer repeat reports starriness 11 and minimum length 0; the single
    matching inner repeat has 'b' as its overall character class.
    """
    outer = from_regex(r"(a?b+)*")
    assert outer.starriness == 11
    assert outer.minimum_length == 0
    assert outer.exact_character_class() is None
    assert outer.overall_character_class() is None

    matching = list(outer.repeat.matching_repeats())
    assert len(matching) == 1
    only_inner = matching[0]
    assert only_inner.overall_character_class() == from_regex(r"b")
53 | 
54 | 
def test_negative_lookahead_infinite():
    """A negative lookahead before an unbounded repeat parses equal to the hand-expanded form."""
    actual = SreOpParser().parse_sre(r"(?!b)[a-d]+")
    expected = SreOpParser().parse_sre(r"[acd][a-d]*")
    assert actual == expected
58 | 
59 | 
def test_negative_lookahead_finite():
    """A negative lookahead before a bounded repeat parses equal to the hand-expanded form."""
    actual = SreOpParser().parse_sre(r"(?!b)[a-d]{1,3}")
    expected = SreOpParser().parse_sre(r"[acd][a-d]{0,2}")
    assert actual == expected
63 | 
64 | 
def test_exponential_starriness():
    """Two nested unbounded repeats around ``a{4,}`` give starriness 111.

    The asserted values here and in the single-level tests (1, then 11) are
    consistent with each unbounded repeat level contributing
    ``inner * 10 + 1`` -- NOTE(review): confirm against Repeat.starriness.
    """
    r = from_regex(r"(?:(?:a{4,})*)+")
    assert r.starriness == 111  # ((1 * 10 + 1) * 10) + 1
    assert r.minimum_length == 0
    assert r.exact_character_class() == from_regex(r"a")
70 | 
71 | 
def test_exponential_starriness2():
    """Two nested unbounded repeats around ``a{4,}bc+`` give starriness 211.

    The inner sequence holds two unbounded repeats (``a{4,}`` and ``c+``);
    the asserted value is consistent with each enclosing unbounded repeat
    contributing ``inner * 10 + 1`` -- NOTE(review): confirm against
    Repeat.starriness.
    """
    r = from_regex(r"(?:(?:a{4,}bc+)*)+")
    assert r.starriness == 211  # ((2 * 10 + 1) * 10) + 1
    assert r.minimum_length == 0
    assert r.exact_character_class() is None
77 | 


--------------------------------------------------------------------------------