├── .editorconfig
├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── python-build.yml
    │   ├── release.yml
    │   └── rust-build.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── README_PYPI.md
├── RELEASE_NOTES.md
├── benches
    ├── benchmark.rs
    └── testcases.txt
├── demo.gif
├── demo.tape
├── grex.pyi
├── logo.png
├── pyproject.toml
├── requirements.txt
├── src
    ├── builder.rs
    ├── cluster.rs
    ├── component.rs
    ├── config.rs
    ├── dfa.rs
    ├── expression.rs
    ├── format.rs
    ├── grapheme.rs
    ├── lib.rs
    ├── macros.rs
    ├── main.rs
    ├── python.rs
    ├── quantifier.rs
    ├── regexp.rs
    ├── substring.rs
    ├── unicode_tables
    │   ├── decimal.rs
    │   ├── mod.rs
    │   ├── space.rs
    │   └── word.rs
    └── wasm.rs
├── tests
    ├── cli_integration_tests.rs
    ├── lib_integration_tests.rs
    ├── property_tests.rs
    ├── python
    │   └── test_grex.py
    ├── wasm_browser_tests.rs
    └── wasm_node_tests.rs
└── website.jpg


/.editorconfig:
--------------------------------------------------------------------------------
 1 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Editor configuration, see http://editorconfig.org
16 | root = true
17 | 
18 | [*.rs]
19 | charset = utf-8
20 | indent_style = space
21 | indent_size = 4
22 | insert_final_newline = true
23 | trim_trailing_whitespace = false
24 | max_line_length = 100
25 | 
26 | [*.md]
27 | max_line_length = off
28 | trim_trailing_whitespace = false
29 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "cargo"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "daily"
 7 | 
 8 |   - package-ecosystem: "github-actions"
 9 |     directory: "/"
10 |     schedule:
11 |       interval: "daily"
12 | 


--------------------------------------------------------------------------------
/.github/workflows/python-build.yml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | name: Python Build
17 | 
18 | on:
19 |   push:
20 |     branches:
21 |       - main
22 |     paths:
23 |       - 'Cargo.lock'
24 |       - 'Cargo.toml'
25 |       - 'pyproject.toml'
26 |       - 'requirements.txt'
27 |       - 'src/**'
28 |       - 'tests/**'
29 |       - '**.yml'
30 |   pull_request:
31 |     branches:
32 |       - main
33 |     paths:
34 |       - 'Cargo.lock'
35 |       - 'Cargo.toml'
36 |       - 'pyproject.toml'
37 |       - 'requirements.txt'
38 |       - 'src/**'
39 |       - 'tests/**'
40 |       - '**.yml'
41 | 
42 | jobs:
43 |   python-build:
44 |       name: Python ${{ matrix.python-version }} on ${{ matrix.name }}
45 | 
46 |       runs-on: ${{ matrix.os }}
47 | 
48 |       strategy:
49 |         fail-fast: false
50 |         matrix:
51 |           os: [ ubuntu-latest, macos-latest, windows-latest ]
52 |           python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ]
53 |           include:
54 |             - os: ubuntu-latest
55 |               name: Linux 64-Bit
56 | 
57 |             - os: macos-latest
58 |               name: MacOS 64-Bit
59 | 
60 |             - os: windows-latest
61 |               name: Windows 64-Bit
62 | 
63 |       steps:
64 |         - name: Check out repository
65 |           uses: actions/checkout@v4
66 | 
67 |         - name: Set up Python
68 |           uses: actions/setup-python@v5
69 |           with:
70 |             python-version: ${{ matrix.python-version }}
71 |             cache: 'pip'
72 | 
73 |         - name: Install maturin and pytest
74 |           run: pip install -r requirements.txt
75 | 
76 |         - name: Build Python extension
77 |           run: maturin build
78 | 
79 |         - name: Install Python extension
80 |           run: pip install --find-links=target/wheels grex
81 | 
82 |         - name: Run Python unit tests
83 |           run: pytest tests/python/test_grex.py
84 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | name: Release
 17 | 
 18 | on:
 19 |   push:
 20 |     tags:
 21 |       - v1.*
 22 | 
 23 | jobs:
 24 |   rust-release-build:
 25 |     name: ${{ matrix.name }}
 26 | 
 27 |     runs-on: ${{ matrix.os }}
 28 | 
 29 |     strategy:
 30 |       matrix:
 31 |         os: [ubuntu-latest, macos-latest, windows-latest]
 32 |         include:
 33 |           - os: ubuntu-latest
 34 |             name: Rust Release Build on Linux
 35 |             x86_64-target: x86_64-unknown-linux-musl
 36 |             aarch64-target: aarch64-unknown-linux-musl
 37 | 
 38 |           - os: macos-latest
 39 |             name: Rust Release Build on MacOS
 40 |             x86_64-target: x86_64-apple-darwin
 41 |             aarch64-target: aarch64-apple-darwin
 42 | 
 43 |           - os: windows-latest
 44 |             name: Rust Release Build on Windows
 45 |             x86_64-target: x86_64-pc-windows-msvc
 46 |             aarch64-target: aarch64-pc-windows-msvc
 47 | 
 48 |     steps:
 49 |       - name: Check out repository
 50 |         uses: actions/checkout@v4
 51 | 
 52 |       - name: Build x86_64 target in release mode
 53 |         uses: houseabsolute/actions-rust-cross@v0
 54 |         with:
 55 |           target: ${{ matrix.x86_64-target }}
 56 |           args: '--release --locked'
 57 | 
 58 |       - name: Build aarch64 target in release mode
 59 |         uses: houseabsolute/actions-rust-cross@v0
 60 |         with:
 61 |           target: ${{ matrix.aarch64-target }}
 62 |           args: '--release --locked'
 63 | 
 64 |       - name: Get latest release version number
 65 |         id: get_version
 66 |         uses: battila7/get-version-action@v2
 67 | 
 68 |       - name: Create x86_64 zip file on Windows
 69 |         if: ${{ matrix.os == 'windows-latest' }}
 70 |         run: |
 71 |           choco install zip
 72 |           cd target/${{ matrix.x86_64-target }}/release
 73 |           zip grex-${{ steps.get_version.outputs.version }}-${{ matrix.x86_64-target }}.zip grex.exe
 74 |           cd ../../..
 75 | 
 76 |       - name: Create aarch64 zip file on Windows
 77 |         if: ${{ matrix.os == 'windows-latest' }}
 78 |         run: |
 79 |           cd target/${{ matrix.aarch64-target }}/release
 80 |           zip grex-${{ steps.get_version.outputs.version }}-${{ matrix.aarch64-target }}.zip grex.exe
 81 |           cd ../../..
 82 | 
 83 |       - name: Create x86_64 tar.gz file on Linux and macOS
 84 |         if: ${{ matrix.os != 'windows-latest' }}
 85 |         run: |
 86 |           chmod +x target/${{ matrix.x86_64-target }}/release/grex
 87 |           tar -zcf target/${{ matrix.x86_64-target }}/release/grex-${{ steps.get_version.outputs.version }}-${{ matrix.x86_64-target }}.tar.gz -C target/${{ matrix.x86_64-target }}/release grex
 88 | 
 89 |       - name: Create aarch64 tar.gz file on Linux and macOS
 90 |         if: ${{ matrix.os != 'windows-latest' }}
 91 |         run: |
 92 |           chmod +x target/${{ matrix.aarch64-target }}/release/grex
 93 |           tar -zcf target/${{ matrix.aarch64-target }}/release/grex-${{ steps.get_version.outputs.version }}-${{ matrix.aarch64-target }}.tar.gz -C target/${{ matrix.aarch64-target }}/release grex
 94 | 
 95 |       - name: Upload release and assets to GitHub
 96 |         uses: svenstaro/upload-release-action@v2
 97 |         with:
 98 |           repo_token: ${{ secrets.GITHUB_TOKEN }}
 99 |           tag: ${{ github.ref }}
100 |           release_name: grex ${{ steps.get_version.outputs.version-without-v }}
101 |           file_glob: true
102 |           file: target/*/release/grex-${{ steps.get_version.outputs.version }}-*.{zip,tar.gz}
103 | 
104 |   python-linux-release-build:
105 |     name: Python Release Build on Linux and target ${{ matrix.target }}
106 |     needs: rust-release-build
107 | 
108 |     runs-on: ubuntu-latest
109 | 
110 |     strategy:
111 |       matrix:
112 |         target: [ x86_64, x86, aarch64 ]
113 |         linux: [ auto, musllinux_1_2 ]
114 | 
115 |     steps:
116 |       - name: Check out repository
117 |         uses: actions/checkout@v4
118 | 
119 |       - name: Build wheels
120 |         uses: PyO3/maturin-action@v1
121 |         with:
122 |           target: ${{ matrix.target }}
123 |           args: --release --out dist -i 3.8 3.9 3.10 3.11 3.12 pypy3.8 pypy3.9 pypy3.10
124 |           sccache: 'true'
125 |           manylinux: ${{ matrix.linux }}
126 | 
127 |       - name: Upload wheels
128 |         uses: actions/upload-artifact@v4
129 |         with:
130 |           name: linux-${{ matrix.linux }}-${{ matrix.target }}-wheels
131 |           path: dist
132 | 
133 |   python-windows-release-build:
134 |     name: Python Release Build on Windows and target ${{ matrix.target }}
135 |     needs: rust-release-build
136 | 
137 |     runs-on: windows-latest
138 | 
139 |     strategy:
140 |       matrix:
141 |         target: [ x64, x86 ]
142 | 
143 |     steps:
144 |       - name: Check out repository
145 |         uses: actions/checkout@v4
146 | 
147 |       - name: Build wheels
148 |         uses: PyO3/maturin-action@v1
149 |         with:
150 |           target: ${{ matrix.target }}
151 |           args: --release --out dist -i 3.8 3.9 3.10 3.11 3.12
152 |           sccache: 'true'
153 | 
154 |       - name: Upload wheels
155 |         uses: actions/upload-artifact@v4
156 |         with:
157 |           name: windows-${{ matrix.target }}-wheels
158 |           path: dist
159 | 
160 |   python-macos-release-build:
161 |     name: Python Release Build on MacOS and target ${{ matrix.target }}
162 |     needs: rust-release-build
163 | 
164 |     runs-on: macos-latest
165 | 
166 |     strategy:
167 |       matrix:
168 |         target: [ x86_64, aarch64 ]
169 | 
170 |     steps:
171 |       - name: Check out repository
172 |         uses: actions/checkout@v4
173 | 
174 |       - name: Build wheels
175 |         uses: PyO3/maturin-action@v1
176 |         with:
177 |           target: ${{ matrix.target }}
178 |           args: --release --out dist -i 3.8 3.9 3.10 3.11 3.12 pypy3.8 pypy3.9 pypy3.10
179 |           sccache: 'true'
180 | 
181 |       - name: Upload wheels
182 |         uses: actions/upload-artifact@v4
183 |         with:
184 |           name: macos-${{ matrix.target }}-wheels
185 |           path: dist
186 | 
187 |   python-release-upload:
188 |     name: Publish wheels to PyPI
189 |     needs: [ python-linux-release-build, python-windows-release-build, python-macos-release-build ]
190 | 
191 |     runs-on: ubuntu-latest
192 | 
193 |     steps:
194 |       - name: Download wheels from previous jobs
195 |         uses: actions/download-artifact@v4
196 |         with:
197 |           path: wheels
198 |           merge-multiple: true
199 | 
200 |       - name: Upload to PyPI
201 |         uses: PyO3/maturin-action@v1
202 |         env:
203 |           MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
204 |         with:
205 |           command: upload
206 |           args: --skip-existing wheels/*.whl
207 | 
208 |   rust-release-upload:
209 |     name: Upload to crates.io
210 |     needs: [ python-linux-release-build, python-windows-release-build, python-macos-release-build ]
211 | 
212 |     runs-on: ubuntu-latest
213 | 
214 |     steps:
215 |       - name: Check out repository
216 |         uses: actions/checkout@v4
217 | 
218 |       - name: Upload release to crates.io
219 |         uses: katyo/publish-crates@v2
220 |         with:
221 |           registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }}
222 | 


--------------------------------------------------------------------------------
/.github/workflows/rust-build.yml:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | name: Rust Build
 17 | 
 18 | on:
 19 |   push:
 20 |     branches:
 21 |       - main
 22 |     paths:
 23 |       - 'Cargo.lock'
 24 |       - 'Cargo.toml'
 25 |       - 'src/**'
 26 |       - 'tests/**'
 27 |       - '**.yml'
 28 |   pull_request:
 29 |     branches:
 30 |       - main
 31 |     paths:
 32 |       - 'Cargo.lock'
 33 |       - 'Cargo.toml'
 34 |       - 'src/**'
 35 |       - 'tests/**'
 36 |       - '**.yml'
 37 | 
 38 | jobs:
 39 |   rust-build:
 40 |     name: Rust on ${{ matrix.name }}
 41 | 
 42 |     runs-on: ${{ matrix.os }}
 43 | 
 44 |     strategy:
 45 |       fail-fast: false
 46 |       matrix:
 47 |         os: [ubuntu-latest, macos-latest, windows-latest]
 48 |         include:
 49 |           - os: ubuntu-latest
 50 |             name: Linux 64-Bit
 51 |             target: x86_64-unknown-linux-musl
 52 | 
 53 |           - os: macos-latest
 54 |             name: MacOS 64-Bit
 55 |             target: x86_64-apple-darwin
 56 |             env:
 57 |               MACOSX_DEPLOYMENT_TARGET: 10.7
 58 | 
 59 |           - os: windows-latest
 60 |             name: Windows 64-Bit
 61 |             target: x86_64-pc-windows-msvc
 62 | 
 63 |     steps:
 64 |       - name: Check out repository
 65 |         uses: actions/checkout@v4
 66 | 
 67 |       - name: Add rustup target
 68 |         run: rustup target add ${{ matrix.target }}
 69 | 
 70 |       - name: Store or retrieve cargo caches
 71 |         uses: actions/cache@v4
 72 |         with:
 73 |           path: |
 74 |             ~/.cargo/bin/
 75 |             ~/.cargo/registry/index/
 76 |             ~/.cargo/registry/cache/
 77 |             ~/.cargo/git/db/
 78 |             target/
 79 |           key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
 80 | 
 81 |       - name: Build target in debug mode
 82 |         run: cargo build --target ${{ matrix.target }} --locked
 83 | 
 84 |       - name: Test target in debug mode
 85 |         run: cargo test --target ${{ matrix.target }}
 86 | 
 87 |   wasm-build:
 88 |     name: WASM Build
 89 |     needs: rust-build
 90 | 
 91 |     runs-on: macos-latest
 92 | 
 93 |     steps:
 94 |       - name: Check out repository
 95 |         uses: actions/checkout@v4
 96 | 
 97 |       - name: Install wasm-pack
 98 |         run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
 99 | 
100 |       - name: Install Firefox and Geckodriver # no longer preinstalled on the macos-latest runner image
101 |         run: |
102 |           brew install --cask firefox
103 |           brew install geckodriver
104 | 
105 |       #- name: Enable Safari web driver
106 |       #  run: sudo safaridriver --enable
107 | 
108 |       - name: Run WASM integration tests on NodeJS
109 |         run: wasm-pack test --node -- --no-default-features
110 | 
111 |       - name: Run WASM integration tests in Chrome
112 |         run: wasm-pack test --headless --chrome -- --no-default-features
113 | 
114 |       - name: Run WASM integration tests in Firefox
115 |         run: wasm-pack test --headless --firefox -- --no-default-features
116 | 
117 |       # Safari WASM tests not working, reason unclear
118 |       # https://github.com/pemistahl/grex/actions/runs/12146729784/job/33871544034#step:9:30
119 |       #- name: Run WASM integration tests in Safari
120 |       #  run: wasm-pack test --headless --safari -- --no-default-features
121 | 
122 |   coverage-report:
123 |     name: Coverage Report
124 |     needs: rust-build
125 |     if: ${{ github.event_name == 'push' }}
126 | 
127 |     runs-on: ubuntu-latest
128 | 
129 |     container:
130 |       image: xd009642/tarpaulin:develop-nightly
131 |       options: --security-opt seccomp=unconfined
132 | 
133 |     steps:
134 |       - name: Check out repository
135 |         uses: actions/checkout@v4
136 | 
137 |       - name: Generate coverage report
138 |         run: cargo +nightly tarpaulin --ignore-config --ignore-panics --ignore-tests --exclude-files src/python.rs src/main.rs src/wasm.rs --verbose --timeout 900 --out xml
139 | 
140 |       - name: Workaround for codecov/feedback#263
141 |         run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
142 | 
143 |       - name: Upload coverage report
144 |         uses: codecov/codecov-action@v4
145 |         with:
146 |           token: ${{ secrets.CODECOV_TOKEN }}
147 |           fail_ci_if_error: true
148 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | /pkg/
16 | /target/
17 | **/*.rs.bk
18 | 
19 | .idea
20 | .project
21 | .c9/
22 | *.launch
23 | .settings/
24 | .metadata/
25 | .venv
26 | *.sublime-workspace
27 | bin/
28 | tmp/
29 | out/
30 | *.iml
31 | *.ipr
32 | *.iws
33 | *.bak
34 | *.tmp
35 | *.class
36 | *.html
37 | .buildpath
38 | .classpath
39 | .vscode/*
40 | !.vscode/settings.json
41 | !.vscode/tasks.json
42 | !.vscode/launch.json
43 | !.vscode/extensions.json
44 | 
45 | .DS_Store
46 | Thumbs.db
47 | $RECYCLE.BIN/
48 | ._*
49 | .AppleDouble
50 | .LSOverride
51 | *.lnk
52 | Desktop.ini
53 | ehthumbs.db
54 | 
55 | *.proptest-regressions
56 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | [package]
17 | name = "grex"
18 | version = "1.4.5"
19 | authors = ["Peter M. Stahl <pemistahl@gmail.com>"]
20 | description = """
21 | grex generates regular expressions from user-provided test cases.
22 | """
23 | homepage = "https://github.com/pemistahl/grex"
24 | repository = "https://github.com/pemistahl/grex"
25 | documentation = "https://docs.rs/grex"
26 | license = "Apache-2.0"
27 | readme = "README.md"
28 | edition = "2021"
29 | categories = ["command-line-utilities", "parsing"]
30 | keywords = ["pattern", "regex", "regexp"]
31 | 
32 | [lib]
33 | crate-type = ["cdylib", "rlib"]
34 | 
35 | [dependencies]
36 | itertools = "0.13.0"
37 | lazy_static = "1.5.0"
38 | ndarray = "0.16.1"
39 | petgraph = {version = "0.6.5", default-features = false, features = ["stable_graph"]}
40 | regex = "1.10.6"
41 | unic-char-range = "0.9.0"
42 | unic-ucd-category = "0.9.0"
43 | unicode-segmentation = "1.12.0"
44 | 
45 | [target.'cfg(not(target_family = "wasm"))'.dependencies]
46 | clap = {version = "4.5.22", features = ["derive", "wrap_help"], optional = true}
47 | pyo3 = {version = "0.23.3", optional = true}
48 | 
49 | [target.'cfg(target_family = "wasm")'.dependencies]
50 | wasm-bindgen = "0.2.97"
51 | 
52 | [dev-dependencies]
53 | indoc = "2.0.5"
54 | rstest = "0.23.0"
55 | 
56 | [target.'cfg(not(target_family = "wasm"))'.dev-dependencies]
57 | assert_cmd = "2.0.16"
58 | criterion = "0.5.1"
59 | predicates = "3.1.2"
60 | proptest = "1.5.0"
61 | tempfile = "3.14.0"
62 | 
63 | [target.'cfg(target_family = "wasm")'.dev-dependencies]
64 | wasm-bindgen-test = "0.3.47"
65 | 
66 | [features]
67 | default = ["cli"]
68 | cli = ["clap"]
69 | python = ["pyo3"]
70 | 
71 | [[bench]]
72 | name = "benchmark"
73 | harness = false
74 | 
75 | [profile.bench]
76 | debug = true
77 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README_PYPI.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | 
  3 | ![grex](https://raw.githubusercontent.com/pemistahl/grex/main/logo.png)
  4 | 
  5 | <br>
  6 | 
  7 | [![build status](https://github.com/pemistahl/grex/actions/workflows/python-build.yml/badge.svg)](https://github.com/pemistahl/grex/actions/workflows/python-build.yml)
  8 | [![codecov](https://codecov.io/gh/pemistahl/grex/branch/main/graph/badge.svg)](https://codecov.io/gh/pemistahl/grex)
  9 | [![demo](https://img.shields.io/badge/-Demo%20Website-orange?logo=HTML5&labelColor=white)](https://pemistahl.github.io/grex-js/)
 10 | ![supported Python versions](https://img.shields.io/badge/Python-%3E%3D%203.8-blue?logo=Python&logoColor=yellow)
 11 | [![pypi](https://img.shields.io/badge/PYPI-v1.0.1-blue?logo=PyPI&logoColor=yellow)](https://pypi.org/project/grex)
 12 | [![license](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0)
 13 | </div>
 14 | 
 15 | <br>
 16 | 
 17 | ## 1. What does this library do?
 18 | 
 19 | *grex* is a library that is meant to simplify the often complicated and tedious
 20 | task of creating regular expressions. It does so by automatically generating a
 21 | single regular expression from user-provided test cases. The resulting
 22 | expression is guaranteed to match the test cases which it was generated from.
 23 | 
 24 | This project started as a [Rust port](https://github.com/pemistahl/grex) of
 25 | the JavaScript tool [*regexgen*](https://github.com/devongovett/regexgen)
 26 | written by [Devon Govett](https://github.com/devongovett). Although a lot of
 27 | further useful features could be added to it, its development apparently
 28 | ceased several years ago. The Rust library offers new features and extended
 29 | Unicode support. With the help of [PyO3](https://github.com/PyO3/pyo3) and 
 30 | [Maturin](https://github.com/PyO3/maturin), the library has been compiled to a 
 31 | Python extension module so that it can be used within any Python software as well.
 32 | 
 33 | The philosophy of this project is to generate the most specific regular expression
 34 | possible by default which exactly matches the given input only and nothing else.
 35 | With the use of preprocessing methods, more generalized expressions can be created.
 36 | 
 37 | The produced expressions are [Perl-compatible regular expressions](https://www.pcre.org) which are also
 38 | compatible with the [regular expression module](https://docs.python.org/3/library/re.html) in Python's 
 39 | standard library.
 40 | 
 41 | There is a [demo website](https://pemistahl.github.io/grex-js/) available where you can give grex a try.
 42 | 
 43 | ![demo website](https://raw.githubusercontent.com/pemistahl/grex/main/website.jpg)
 44 | 
 45 | ## 2. Do I still need to learn to write regexes then?
 46 | 
 47 | **Definitely, yes!** Using the standard settings, *grex* produces a regular expression that is guaranteed
 48 | to match only the test cases given as input and nothing else. However, if the conversion to shorthand
 49 | character classes such as `\w` is enabled, the resulting regex matches a much wider scope of test cases.
 50 | Knowledge about the consequences of this conversion is essential for finding a correct regular expression
 51 | for your business domain.
 52 | 
 53 | *grex* uses an algorithm that tries to find the shortest possible regex for the given test cases.
 54 | Very often though, the resulting expression is still longer or more complex than it needs to be.
 55 | In such cases, a more compact or elegant regex can be created only by hand.
 56 | Also, every regular expression engine has different built-in optimizations. *grex* does not know anything
 57 | about those and therefore cannot optimize its regexes for a specific engine.
 58 | 
 59 | **So, please learn how to write regular expressions!** The currently best use case for *grex* is to find
 60 | an initial correct regex which should be inspected by hand if further optimizations are possible.
 61 | 
 62 | ## 3. Current Features
 63 | 
 64 | - literals
 65 | - character classes
 66 | - detection of common prefixes and suffixes
 67 | - detection of repeated substrings and conversion to `{min,max}` quantifier notation
 68 | - alternation using `|` operator
 69 | - optionality using `?` quantifier
 70 | - escaping of non-ascii characters, with optional conversion of astral code points to surrogate pairs
 71 | - case-sensitive or case-insensitive matching
 72 | - capturing or non-capturing groups
 73 | - optional anchors `^` and `$`
 74 | - fully compliant with [Unicode Standard 15.0](https://unicode.org/versions/Unicode15.0.0)
 75 | - correctly handles graphemes consisting of multiple Unicode symbols
 76 | - produces more readable expressions indented on multiple lines using optional verbose mode
 77 | - optional syntax highlighting for nicer output in supported terminals
 78 | 
 79 | ## 4. How to install?
 80 | 
 81 | *grex* is available in the [Python Package Index](https://pypi.org/project/grex) and can be installed with:
 82 | 
 83 | ```
 84 | pip install grex
 85 | ```
 86 | 
 87 | The current version 1.0.1 corresponds to the latest version 1.4.5 of the Rust
 88 | library and command-line tool.
 89 | 
 90 | ## 5. How to use?
 91 | 
 92 | This library contains a single class named `RegExpBuilder` that can be imported like so:
 93 | 
 94 | ```python
 95 | from grex import RegExpBuilder
 96 | ```
 97 | 
 98 | ### 5.1 Default settings
 99 | 
100 | ```python
101 | pattern = RegExpBuilder.from_test_cases(["a", "aa", "aaa"]).build()
102 | assert pattern == "^a(?:aa?)?$"
103 | ```
104 | 
105 | ### 5.2 Convert to character classes
106 | 
107 | ```python
108 | pattern = (RegExpBuilder.from_test_cases(["a", "aa", "123"])
109 |     .with_conversion_of_digits()
110 |     .with_conversion_of_words()
111 |     .build())
112 | assert pattern == "^(?:\\d\\d\\d|\\w(?:\\w)?)$"
113 | ```
114 | 
115 | ### 5.3 Convert repeated substrings
116 | 
117 | ```python
118 | pattern = (RegExpBuilder.from_test_cases(["aa", "bcbc", "defdefdef"])
119 |     .with_conversion_of_repetitions()
120 |     .build())
121 | assert pattern == "^(?:a{2}|(?:bc){2}|(?:def){3})$"
122 | ```
123 | 
124 | By default, *grex* converts every substring that is at least a single character long
125 | and that is subsequently repeated at least once. You can customize these two parameters if you like.
126 | 
127 | In the following example, the test case `aa` is not converted to `a{2}` because the repeated substring
128 | `a` has a length of 1, but the minimum substring length has been set to 2.
129 | 
130 | ```python
131 | pattern = (RegExpBuilder.from_test_cases(["aa", "bcbc", "defdefdef"])
132 |     .with_conversion_of_repetitions()
133 |     .with_minimum_substring_length(2)
134 |     .build())
135 | assert pattern == "^(?:aa|(?:bc){2}|(?:def){3})$"
136 | ```
137 | 
138 | When a minimum of 2 repetitions is set, as in the next example, only the test case `defdefdef` is
139 | converted because it is the only one that is repeated twice.
140 | 
141 | ```python
142 | pattern = (RegExpBuilder.from_test_cases(["aa", "bcbc", "defdefdef"])
143 |     .with_conversion_of_repetitions()
144 |     .with_minimum_repetitions(2)
145 |     .build())
146 | assert pattern == "^(?:bcbc|aa|(?:def){3})$"
147 | ```
148 | 
149 | ### 5.4 Escape non-ascii characters
150 | 
151 | ```python
152 | pattern = (RegExpBuilder.from_test_cases(["You smell like 💩."])
153 |     .with_escaping_of_non_ascii_chars(use_surrogate_pairs=False)
154 |     .build())
155 | assert pattern == "^You smell like \\U0001f4a9\\.$"
156 | ```
157 | 
158 | Old versions of JavaScript do not support Unicode escape sequences for the astral code planes
159 | (range `U+010000` to `U+10FFFF`). In order to support these symbols in JavaScript regular
160 | expressions, the conversion to surrogate pairs is necessary. More information on that matter
161 | can be found [here](https://mathiasbynens.be/notes/javascript-unicode).
162 | 
163 | ```python
164 | pattern = (RegExpBuilder.from_test_cases(["You smell like 💩."])
165 |     .with_escaping_of_non_ascii_chars(use_surrogate_pairs=True)
166 |     .build())
167 | assert pattern == "^You smell like \\ud83d\\udca9\\.$"
168 | ```
169 | 
170 | ### 5.5 Case-insensitive matching
171 | 
172 | The regular expressions that *grex* generates are case-sensitive by default.
173 | Case-insensitive matching can be enabled like so:
174 | 
175 | ```python
176 | pattern = (RegExpBuilder.from_test_cases(["big", "BIGGER"])
177 |     .with_case_insensitive_matching()
178 |     .build())
179 | assert pattern == "(?i)^big(?:ger)?$"
180 | ```
181 | 
182 | ### 5.6 Capturing Groups
183 | 
184 | Non-capturing groups are used by default.
185 | Extending the previous example, you can switch to capturing groups instead.
186 | 
187 | ```python
188 | pattern = (RegExpBuilder.from_test_cases(["big", "BIGGER"])
189 |     .with_case_insensitive_matching()
190 |     .with_capturing_groups()
191 |     .build())
192 | assert pattern == "(?i)^big(ger)?$"
193 | ```
194 | 
195 | ### 5.7 Verbose mode
196 | 
197 | If you find the generated regular expression hard to read, you can enable verbose mode.
198 | The expression is then put on multiple lines and indented to make it more pleasant to the eyes.
199 | 
200 | ```python
201 | import inspect
202 | 
203 | pattern = (RegExpBuilder.from_test_cases(["a", "b", "bcd"])
204 |     .with_verbose_mode()
205 |     .build())
206 | 
207 | assert pattern == inspect.cleandoc("""
208 |     (?x)
209 |     ^
210 |       (?:
211 |         b
212 |         (?:
213 |           cd
214 |         )?
215 |         |
216 |         a
217 |       )
218 |     $
219 |     """
220 | )
221 | ```
222 | 
223 | ### 5.8 Disable anchors
224 | 
225 | By default, the anchors `^` and `$` are put around every generated regular expression in order
226 | to ensure that it matches only the test cases given as input. Often enough, however, it is
227 | desired to use the generated pattern as part of a larger one. For this purpose, the anchors
228 | can be disabled, either separately or both of them.
229 | 
230 | ```python
231 | pattern = (RegExpBuilder.from_test_cases(["a", "aa", "aaa"])
232 |     .without_anchors()
233 |     .build())
234 | assert pattern == "a(?:aa?)?"
235 | ```
236 | 
237 | ## 6. How to build?
238 | 
239 | In order to build the source code yourself, you need the
240 | [stable Rust toolchain](https://www.rust-lang.org/tools/install) installed on your machine
241 | so that [*cargo*](https://doc.rust-lang.org/cargo/), the Rust package manager, is available.
242 | 
243 | ```shell
244 | git clone https://github.com/pemistahl/grex.git
245 | cd grex
246 | cargo build
247 | ```
248 | 
249 | To build the Python extension module, create a virtual environment and install [Maturin](https://github.com/PyO3/maturin).
250 | 
251 | ```shell
252 | python -m venv /path/to/virtual/environment
253 | source /path/to/virtual/environment/bin/activate
254 | pip install maturin
255 | maturin build
256 | ```
257 | 
258 | The Rust source code is accompanied by an extensive test suite consisting of unit tests, integration
259 | tests and property tests. For running them, simply say:
260 | 
261 | ```shell
262 | cargo test
263 | ```
264 | 
265 | Additional Python tests can be run after installing pytest which is an optional dependency:
266 | 
267 | ```shell
268 | maturin develop --extras=test
269 | pytest tests/python/test_grex.py
270 | ```
271 | 


--------------------------------------------------------------------------------
/RELEASE_NOTES.md:
--------------------------------------------------------------------------------
  1 | ## grex 1.4.5 (released on 06 Mar 2024)
  2 | 
  3 | ### Improvements
  4 | 
  5 | - Type stubs for the Python bindings are now available, allowing better static code 
  6 |   analysis, better code completion in supported IDEs and easier understanding of the library's API.
  7 | - The code for creating regular expressions in verbose mode has been simplified and is more performant now.
  8 | - ARM64 binaries are now provided for every major platform (Linux, macOS, Windows).
  9 | 
 10 | ### Bug Fixes
 11 | 
 12 | - For a small set of special characters, *grex* produced incorrect regular expressions when
 13 |   the case-insensitivity feature was enabled. This has been fixed.
 14 | 
 15 | ### Changes
 16 | - All dependencies have been updated to their latest versions.
 17 | 
 18 | ## grex 1.4.4 (released on 24 Aug 2023)
 19 | 
 20 | ### Bug Fixes
 21 | - The Python release workflow was incorrect as it produced too many wheels for upload.
 22 |   This has been fixed.
 23 | 
 24 | ## grex 1.4.3 (released on 24 Aug 2023)
 25 | 
 26 | ### Features
 27 | - Python bindings are now available for the library. Use grex within any Python software. (#172)
 28 | 
 29 | ### Changes
 30 | - All dependencies have been updated to their latest versions.
 31 | 
 32 | ## grex 1.4.2 (released on 26 Jul 2023)
 33 | 
 34 | ### Improvements
 35 | - All characters from the current Unicode standard 15.0 are now fully supported. (#128)
 36 | - A proper exit code is now returned if the provided user input cannot be handled by the CLI.
 37 |   Big thanks to @spenserblack for the respective pull request. (#165)
 38 | 
 39 | ### Changes
 40 | - It is not possible anymore to call `RegExpBuilder.with_syntax_highlighting()` in the library
 41 |   as it only makes sense for the CLI.
 42 | - The dependency `atty` has been removed in favor of `std::io::IsTerminal` in Rust >= 1.70.0.
 43 |   As a result, Rust >= 1.70.0 is now needed to compile the CLI. 
 44 | - All remaining dependencies have been updated to their latest versions.
 45 | 
 46 | ### Bug Fixes
 47 | - Several bugs have been fixed that caused incorrect expressions to be generated in rare cases.
 48 | 
 49 | ## grex 1.4.1 (released on 21 Oct 2022)
 50 | 
 51 | ### Changes
 52 | - `clap` has been updated to version 4.0. The help output by `grex -h` now looks a little different.
 53 | 
 54 | ### Bug Fixes
 55 | - A bug in the grapheme segmentation was fixed that caused test cases which contain backslashes to produce
 56 |   incorrect regular expressions.
 57 | 
 58 | ## grex 1.4.0 (released on 26 Jul 2022)
 59 | 
 60 | ### Features
 61 | - The library can now be compiled to WebAssembly and be used in any JavaScript project. (#82)
 62 | - The supported character set for regular expression generation has been updated to the current Unicode Standard 14.0.
 63 | - `structopt` has been replaced with `clap` providing much nicer help output for the command-line tool.
 64 | 
 65 | ### Improvements
 66 | - The regular expression generation performance has been significantly improved, especially for generating very long
 67 |   expressions from a large set of test cases. This has been accomplished by reducing the number of memory allocations,
 68 |   removing deprecated code and applying several minor optimizations.
 69 | 
 70 | ### Bug Fixes
 71 | - Several bugs have been fixed that caused incorrect expressions to be generated in rare cases.
 72 | 
 73 | ## grex 1.3.0 (released on 15 Sep 2021)
 74 | 
 75 | ### Features
 76 | - anchors can now be disabled so that the generated expression can be used as part of a larger one (#30)
 77 | - the command-line tool can now be used within Unix pipelines (#45)
 78 | 
 79 | ### Changes
 80 | - Additional methods have been added to `RegExpBuilder` in order to replace the enum `Feature` and make the library API more consistent. (#47)
 81 | 
 82 | ### Bug Fixes
 83 | - Under rare circumstances, the conversion of repetitions did not work. This has been fixed. (#36)
 84 | 
 85 | ## grex 1.2.0 (released on 28 Mar 2021)
 86 | 
 87 | ### Features
 88 | - verbose mode is now supported with the `--verbose` flag to produce regular expressions which are easier to read (#17)
 89 | 
 90 | ## grex 1.1.0 (released on 17 Apr 2020)
 91 | 
 92 | ### Features
 93 | - case-insensitive matching regexes are now supported with the `--ignore-case` command-line flag or with `Feature::CaseInsensitivity` in the library (#23)
 94 | - non-capturing groups are now the default; capturing groups can be enabled with the `--capture-groups` command-line flag or with `Feature::CapturingGroup` in the library (#15)
 95 | - a lower bound for the conversion of repeated substrings can now be set by specifying `--min-repetitions` and `--min-substring-length` or using the library methods `RegExpBuilder.with_minimum_repetitions()` and `RegExpBuilder.with_minimum_substring_length()` (#10)
 96 | - test cases can now be passed from a file within the library as well using `RegExpBuilder::from_file()` (#13)
 97 | 
 98 | ### Changes
 99 | 
100 | - the rules for the conversion of test cases to shorthand character classes have been updated to be compliant with the newest Unicode Standard 13.0 (#21)
101 | - the dependency on the unmaintained linked-list crate has been removed (#24)
102 | 
103 | ### Bug Fixes
104 | 
105 | - test cases starting with a hyphen are now correctly parsed on the command-line (#12)
106 | - the common substring detection algorithm now uses optionality expressions where possible instead of redundant union operations (#22)
107 | 
108 | ### Test Coverage
109 | - new unit tests, integration tests and property tests have been added
110 | 
111 | ## grex 1.0.0 (released on 02 Feb 2020)
112 | 
113 | ### Features
114 | - conversion to character classes `\d`, `\D`, `\s`, `\S`, `\w`, `\W` is now supported
115 | - repetition detection now works with arbitrarily nested expressions. Input strings such as `aaabaaab` which were previously converted to `^(aaab){2}$` are now converted to `^(a{3}b){2}$`.
116 | - optional syntax highlighting for the produced regular expressions can now be enabled using the `--colorize` command-line flag or with the library method `RegExpBuilder.with_syntax_highlighting()`
117 | 
118 | ### Test Coverage
119 | - new unit tests, integration tests and property tests have been added
120 | 
121 | ## grex 0.3.2 (released on 12 Jan 2020)
122 | 
123 | ### Test Coverage
124 | - new property tests have been added that revealed new bugs
125 | 
126 | ### Bug Fixes
127 | - entire rewrite of the repetition detection algorithm
128 | - the former algorithm produced wrong regular expressions or even panicked for certain test cases
129 | 
130 | ## grex 0.3.1 (released on 06 Jan 2020)
131 | 
132 | ### Test Coverage
133 | - property tests have been added using the [proptest](https://crates.io/crates/proptest) crate 
134 | - big thanks go to [Christophe Biocca](https://github.com/christophebiocca) for pointing me to the concept of property tests in the first place and for writing an initial implementation of these tests
135 | 
136 | ### Bug Fixes
137 | - some regular expression specific characters were not escaped correctly in the generated expression
138 | - expressions consisting of a single alternation such as `^(abc|xyz)$` were missing the outer parentheses. This caused an erroneous match of strings such as `abc123` or `456xyz` because of precedence rules.
139 | - the created DFA was wrong for repetition conversion in some corner cases. The input `a, aa, aaa, aaaa, aaab` previously returned the expression `^a{1,4}b?$` which erroneously matches `aaaab`. Now the correct expression `^(a{3}b|a{1,4})$` is returned.
140 | 
141 | ### Documentation
142 | - some minor documentation updates
143 | 
144 | ## grex 0.3.0 (released on 24 Dec 2019)
145 | 
146 | ### Features
147 | - *grex* is now also available as a library
148 | - escaping of non-ascii characters is now supported with the `-e` flag
149 | - astral code points can be converted to surrogate pairs with the `--with-surrogates` flag
150 | - repeated non-overlapping substrings can be converted to `{min,max}` quantifier notation using the `-r` flag
151 | 
152 | ### Bug Fixes
153 | - many many many bug fixes :-O
154 | 
155 | ## grex 0.2.0 (released on 20 Oct 2019)
156 | 
157 | ### Features
158 | - character classes are now supported
159 | - input strings can now be read from a text file
160 | 
161 | ### Changes
162 | - unicode characters are not escaped anymore by default
163 | - the performance of the DFA minimization algorithm has been improved for large DFAs
164 | - regular expressions are now always surrounded by anchors `^` and `$`
165 | 
166 | ### Bug Fixes
167 | - fixed a bug that caused a panic when giving an empty string as input
168 | 
169 | ## grex 0.1.0 (released on 06 Oct 2019)
170 | 
171 | This is the very first release of *grex*. It aims at simplifying the construction of regular expressions based on matching example input.
172 | 
173 | ### Features
174 | - literals
175 | - detection of common prefixes and suffixes
176 | - alternation using `|` operator
177 | - optionality using `?` quantifier
178 | - concatenation of all of the former
179 | 


--------------------------------------------------------------------------------
/benches/benchmark.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use criterion::{criterion_group, criterion_main, Criterion};
 18 | use grex::RegExpBuilder;
 19 | use itertools::Itertools;
 20 | use std::fs::File;
 21 | use std::io::Read;
 22 | 
 23 | /// Loads one test case per fixture line; `lines()` avoids a trailing empty case and `\r`.
 24 | fn load_test_cases() -> Vec<String> {
 25 |     let mut file = File::open("./benches/testcases.txt").expect("Test cases could not be loaded");
 26 |     let mut content = String::new();
 27 |     file.read_to_string(&mut content)
 28 |         .expect("Test cases could not be read");
 29 |     content.lines().map(str::to_string).collect_vec()
 30 | }
 31 | 
 32 | /// Benchmarks expression building from the fixture with the default builder settings.
 33 | fn benchmark_grex_with_default_settings(c: &mut Criterion) {
 34 |     let inputs = load_test_cases();
 35 |     let generate = || RegExpBuilder::from(&inputs).build();
 36 |     c.bench_function("grex with default settings", |b| b.iter(&generate));
 37 | }
 38 | 
 39 | /// Benchmarks expression building with conversion of repetitions enabled.
 40 | fn benchmark_grex_with_conversion_of_repetitions(c: &mut Criterion) {
 41 |     let inputs = load_test_cases();
 42 |     let generate = || {
 43 |         RegExpBuilder::from(&inputs)
 44 |             .with_conversion_of_repetitions()
 45 |             .build()
 46 |     };
 47 |     c.bench_function("grex with conversion of repetitions", |b| b.iter(&generate));
 48 | }
 49 | 
 50 | /// Benchmarks expression building with conversion of digits enabled.
 51 | fn benchmark_grex_with_conversion_of_digits(c: &mut Criterion) {
 52 |     let inputs = load_test_cases();
 53 |     let generate = || {
 54 |         RegExpBuilder::from(&inputs)
 55 |             .with_conversion_of_digits()
 56 |             .build()
 57 |     };
 58 |     c.bench_function("grex with conversion of digits", |b| b.iter(&generate));
 59 | }
 60 | 
 61 | /// Benchmarks expression building with conversion of non-digits enabled.
 62 | fn benchmark_grex_with_conversion_of_non_digits(c: &mut Criterion) {
 63 |     let inputs = load_test_cases();
 64 |     let generate = || {
 65 |         RegExpBuilder::from(&inputs)
 66 |             .with_conversion_of_non_digits()
 67 |             .build()
 68 |     };
 69 |     c.bench_function("grex with conversion of non-digits", |b| b.iter(&generate));
 70 | }
 71 | 
 72 | /// Benchmarks expression building with conversion of words enabled.
 73 | fn benchmark_grex_with_conversion_of_words(c: &mut Criterion) {
 74 |     let inputs = load_test_cases();
 75 |     let generate = || {
 76 |         RegExpBuilder::from(&inputs)
 77 |             .with_conversion_of_words()
 78 |             .build()
 79 |     };
 80 |     c.bench_function("grex with conversion of words", |b| b.iter(&generate));
 81 | }
 82 | 
 83 | /// Benchmarks expression building with conversion of non-words enabled.
 84 | fn benchmark_grex_with_conversion_of_non_words(c: &mut Criterion) {
 85 |     let inputs = load_test_cases();
 86 |     let generate = || {
 87 |         RegExpBuilder::from(&inputs)
 88 |             .with_conversion_of_non_words()
 89 |             .build()
 90 |     };
 91 |     c.bench_function("grex with conversion of non-words", |b| b.iter(&generate));
 92 | }
 93 | 
 94 | /// Benchmarks expression building with conversion of whitespace enabled.
 95 | fn benchmark_grex_with_conversion_of_whitespace(c: &mut Criterion) {
 96 |     let inputs = load_test_cases();
 97 |     let generate = || {
 98 |         RegExpBuilder::from(&inputs)
 99 |             .with_conversion_of_whitespace()
100 |             .build()
101 |     };
102 |     c.bench_function("grex with conversion of whitespace", |b| b.iter(&generate));
103 | }
104 | 
105 | /// Benchmarks expression building with conversion of non-whitespace enabled.
106 | fn benchmark_grex_with_conversion_of_non_whitespace(c: &mut Criterion) {
107 |     let inputs = load_test_cases();
108 |     let generate = || {
109 |         RegExpBuilder::from(&inputs)
110 |             .with_conversion_of_non_whitespace()
111 |             .build()
112 |     };
113 |     c.bench_function("grex with conversion of non-whitespace", |b| b.iter(&generate));
114 | }
115 | 
116 | /// Benchmarks expression building with case-insensitive matching enabled.
117 | fn benchmark_grex_with_case_insensitive_matching(c: &mut Criterion) {
118 |     let inputs = load_test_cases();
119 |     let generate = || {
120 |         RegExpBuilder::from(&inputs)
121 |             .with_case_insensitive_matching()
122 |             .build()
123 |     };
124 |     c.bench_function("grex with case-insensitive matching", |b| b.iter(&generate));
125 | }
126 | 
127 | /// Benchmarks expression building from the fixture with verbose mode enabled.
128 | fn benchmark_grex_with_verbose_mode(c: &mut Criterion) {
129 |     let inputs = load_test_cases();
130 |     let generate = || RegExpBuilder::from(&inputs).with_verbose_mode().build();
131 |     c.bench_function("grex with verbose mode", |b| b.iter(&generate));
132 | }
133 | 
134 | criterion_group!(
135 |     benches, // name of the benchmark group; every following entry is one of its targets
136 |     benchmark_grex_with_default_settings,
137 |     benchmark_grex_with_conversion_of_repetitions,
138 |     benchmark_grex_with_conversion_of_digits,
139 |     benchmark_grex_with_conversion_of_non_digits,
140 |     benchmark_grex_with_conversion_of_words,
141 |     benchmark_grex_with_conversion_of_non_words,
142 |     benchmark_grex_with_conversion_of_whitespace,
143 |     benchmark_grex_with_conversion_of_non_whitespace,
144 |     benchmark_grex_with_case_insensitive_matching,
145 |     benchmark_grex_with_verbose_mode
146 | );
147 | // Generates the benchmark binary's `main`, which runs the `benches` group declared above.
148 | criterion_main!(benches);
149 | 


--------------------------------------------------------------------------------
/benches/testcases.txt:
--------------------------------------------------------------------------------
 1 | Rocket Sled
 2 | Elysian Heirloom
 3 | Kaleb's Favor
 4 | Blazing Renegade
 5 | Flash Fire
 6 | Silence
 7 | Talir's Favored
 8 | Timekeeper
 9 | Oasis Sanctuary
10 | Rolant's Favor
11 | Mantle of Justice
12 | Eilyn's Favor
13 | Thunderbird
14 | Primal Incarnation
15 | Vampire Bat
16 | Vara's Favor
17 | Devouring Shadow
18 | Seat of Order
19 | Seat of Fury
20 | Seat of Impulse
21 | Seat of Vengeance
22 | Seat of Glory
23 | Seat of Progress
24 | Seat of Chaos
25 | Seat of Mystery
26 | Seat of Cunning
27 | Seat of Wisdom
28 | Firebomb
29 | Grenadin
30 | Iron Sword
31 | Magmahound
32 | Wisp
33 | Rhinarc
34 | Sentinel
35 | Owl
36 | Gemblade
37 | Frog
38 | Snowball
39 | Pig
40 | Serpent Hatchling
41 | Carnosaur
42 | Stormdancer
43 | Illusionary Dragon
44 | Spiteling
45 | Vengeful Gargoyle
46 | Muertis, Pale Rider
47 | Occi, Pale Rider
48 | Sangu, Pale Rider
49 | Volan, Pale Rider
50 | Direwood Beast
51 | 


--------------------------------------------------------------------------------
/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pemistahl/grex/cb71c10815e2216f4941f0b52154fb5d1fc0a01c/demo.gif


--------------------------------------------------------------------------------
/demo.tape:
--------------------------------------------------------------------------------
 1 | # demo.gif created with https://github.com/charmbracelet/vhs on macOS 13 (Ventura)
 2 | 
 3 | Require grex
 4 | Output demo.gif
 5 | 
 6 | Set Shell zsh
 7 | Set Theme "Whimsy"
 8 | Set Width 1200
 9 | Set Height 850
10 | Set TypingSpeed 150ms
11 | 
12 | Type "grex -c 'regexes are awesome' 'regexes are awful'"
13 | Sleep 3s
14 | Enter
15 | Sleep 10s
16 | 
17 | Up
18 | Left 42
19 | Type " --verbose"
20 | Sleep 3s
21 | Enter
22 | Sleep 15s
23 | Type "clear"
24 | Enter
25 | 
26 | Type "grex -c haha HAHAHA"
27 | Sleep 3s
28 | Enter
29 | Sleep 10s
30 | 
31 | Up
32 | Left 12
33 | Type " --repetitions"
34 | Sleep 3s
35 | Enter
36 | Sleep 10s
37 | 
38 | Up
39 | Left 12
40 | Type " --verbose"
41 | Sleep 3s
42 | Enter
43 | Sleep 15s
44 | 
45 | Up
46 | Left 12
47 | Type " --ignore-case"
48 | Sleep 3s
49 | Enter
50 | Sleep 15s
51 | 


--------------------------------------------------------------------------------
/grex.pyi:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | from typing import List
 17 | 
 18 | 
 19 | class RegExpBuilder:
 20 |     """Build regular expressions from user-provided test cases; the `with_*`/`without_*` methods return a `RegExpBuilder`, `build` returns the pattern as `str`."""
 21 | 
 22 |     @classmethod
 23 |     def from_test_cases(cls, test_cases: List[str]) -> "RegExpBuilder":
 24 |         """Specify the test cases to build the regular expression from.
 25 | 
 26 |         The test cases need not be sorted because `RegExpBuilder` sorts them internally.
 27 | 
 28 |         Args:
 29 |             test_cases (list[str]): The list of test cases
 30 | 
 31 |         Raises:
 32 |             ValueError: if `test_cases` is empty
 33 |         """
 34 | 
 35 |     def with_conversion_of_digits(self) -> "RegExpBuilder":
 36 |         r"""Convert any Unicode decimal digit to character class `\d`.
 37 | 
 38 |         This method takes precedence over `with_conversion_of_words` if both are set.
 39 |         Decimal digits are converted to `\d`, the remaining word characters to `\w`.
 40 | 
 41 |         This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
 42 |         Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
 43 |         """
 44 | 
 45 |     def with_conversion_of_non_digits(self) -> "RegExpBuilder":
 46 |         r"""Convert any character which is not a Unicode decimal digit to character class `\D`.
 47 | 
 48 |         This method takes precedence over `with_conversion_of_non_words` if both are set.
 49 |         Non-digits which are also non-word characters are converted to `\D`.
 50 | 
 51 |         This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
 52 |         Non-digits which are also non-space characters are converted to `\D`.
 53 |         """
 54 | 
 55 |     def with_conversion_of_whitespace(self) -> "RegExpBuilder":
 56 |         r"""Convert any Unicode whitespace character to character class `\s`.
 57 | 
 58 |         This method takes precedence over `with_conversion_of_non_digits` if both are set.
 59 |         Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
 60 | 
 61 |         This method takes precedence over `with_conversion_of_non_words` if both are set.
 62 |         Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
 63 |         """
 64 | 
 65 |     def with_conversion_of_non_whitespace(self) -> "RegExpBuilder":
 66 |         r"""Convert any character which is not a Unicode whitespace character to character class `\S`."""
 67 | 
 68 |     def with_conversion_of_words(self) -> "RegExpBuilder":
 69 |         r"""Convert any Unicode word character to character class `\w`.
 70 | 
 71 |         This method takes precedence over `with_conversion_of_non_digits` if both are set.
 72 |         Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
 73 | 
 74 |         This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
 75 |         Word characters are converted to `\w`, the remaining non-space characters to `\S`.
 76 |         """
 77 | 
 78 |     def with_conversion_of_non_words(self) -> "RegExpBuilder":
 79 |         r"""Convert any character which is not a Unicode word character to character class `\W`.
 80 | 
 81 |         This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
 82 |         Non-words which are also non-space characters are converted to `\W`.
 83 |         """
 84 | 
 85 |     def with_conversion_of_repetitions(self) -> "RegExpBuilder":
 86 |         """Detect repeated non-overlapping substrings and convert them to `{min,max}` quantifier notation."""
 87 | 
 88 |     def with_case_insensitive_matching(self) -> "RegExpBuilder":
 89 |         """Enable case-insensitive matching of test cases so that letters match both upper and lower case."""
 90 | 
 91 |     def with_capturing_groups(self) -> "RegExpBuilder":
 92 |         """Replace non-capturing groups with capturing ones."""
 93 | 
 94 |     def with_minimum_repetitions(self, quantity: int) -> "RegExpBuilder":
 95 |         """Specify the minimum quantity of substring repetitions to be converted
 96 |         if `with_conversion_of_repetitions` is set.
 97 | 
 98 |         If the quantity is not explicitly set with this method, a default value of 1 will be used.
 99 | 
100 |         Args:
101 |             quantity (int): The minimum quantity of substring repetitions
102 | 
103 |         Raises:
104 |             ValueError: if `quantity` is zero
105 |         """
106 | 
107 |     def with_minimum_substring_length(self, length: int) -> "RegExpBuilder":
108 |         """Specify the minimum length a repeated substring must have in order
109 |         to be converted if `with_conversion_of_repetitions` is set.
110 | 
111 |         If the length is not explicitly set with this method, a default value of 1 will be used.
112 | 
113 |         Args:
114 |             length (int): The minimum substring length
115 | 
116 |         Raises:
117 |             ValueError: if `length` is zero
118 |         """
119 | 
120 |     def with_escaping_of_non_ascii_chars(self, use_surrogate_pairs: bool) -> "RegExpBuilder":
121 |         """Convert non-ASCII characters to unicode escape sequences.
122 | 
123 |         The parameter `use_surrogate_pairs` specifies whether to convert astral
124 |         code planes (range `U+010000` to `U+10FFFF`) to surrogate pairs.
125 | 
126 |         Args:
127 |             use_surrogate_pairs (bool): Whether to convert astral code planes to surrogate pairs
128 |         """
129 | 
130 |     def with_verbose_mode(self) -> "RegExpBuilder":
131 |         """Produce a nicer looking regular expression in verbose mode."""
132 | 
133 |     def without_start_anchor(self) -> "RegExpBuilder":
134 |         """Remove the caret anchor '^' from the resulting regular expression,
135 |         thereby allowing to match the test cases also when they do not occur
136 |         at the start of a string.
137 |         """
138 | 
139 |     def without_end_anchor(self) -> "RegExpBuilder":
140 |         """Remove the dollar sign anchor '$' from the resulting regular expression,
141 |         thereby allowing to match the test cases also when they do not occur
142 |         at the end of a string.
143 |         """
144 | 
145 |     def without_anchors(self) -> "RegExpBuilder":
146 |         """Remove the caret and dollar sign anchors from the resulting regular expression,
147 |         thereby allowing to match the test cases also when they occur within a larger
148 |         string that contains other content as well.
149 |         """
150 | 
151 |     def build(self) -> str:
152 |         """Build the actual regular expression using the previously given settings."""
153 | 


--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pemistahl/grex/cb71c10815e2216f4941f0b52154fb5d1fc0a01c/logo.png


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "grex"
 3 | version = "1.0.1"
 4 | authors = [{name = "Peter M. Stahl", email = "pemistahl@gmail.com"}]
 5 | description = "grex generates regular expressions from user-provided test cases."
 6 | readme = "README_PYPI.md"
 7 | requires-python = ">=3.8"
 8 | license = {file = "LICENSE"}
 9 | keywords = ["pattern", "regex", "regexp"]
10 | classifiers = [
11 |     "Development Status :: 5 - Production/Stable",
12 |     "Intended Audience :: Developers",
13 |     "Intended Audience :: Information Technology",
14 |     "Intended Audience :: Science/Research",
15 |     "License :: OSI Approved :: Apache Software License",
16 |     "Programming Language :: Python :: 3.8",
17 |     "Programming Language :: Python :: 3.9",
18 |     "Programming Language :: Python :: 3.10",
19 |     "Programming Language :: Python :: 3.11",
20 |     "Programming Language :: Python :: 3.12",
21 |     "Programming Language :: Rust",
22 |     "Topic :: Software Development :: Libraries :: Python Modules",
23 |     "Topic :: Text Processing"
24 | ]
25 | 
26 | [project.urls]
27 | homepage = "https://github.com/pemistahl/grex"
28 | repository = "https://github.com/pemistahl/grex"
29 | 
30 | [project.optional-dependencies]
31 | test = ["pytest == 8.0.2"]
32 | 
33 | [tool.maturin]
34 | no-default-features = true
35 | features = ["pyo3/extension-module", "python"]
36 | 
37 | [build-system]
38 | requires = ["maturin>=1.1,<2.0"]
39 | build-backend = "maturin"
40 | 
41 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | maturin == 1.6.0
2 | pytest == 8.2.2
3 | 


--------------------------------------------------------------------------------
/src/builder.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::config::RegExpConfig;
 18 | use crate::regexp::RegExp;
 19 | use itertools::Itertools;
 20 | use std::io::ErrorKind;
 21 | use std::path::PathBuf;
 22 | 
 23 | pub(crate) const MISSING_TEST_CASES_MESSAGE: &str =
 24 |     "No test cases have been provided for regular expression generation"; // panic message of RegExpBuilder::from for an empty slice
 25 | 
 26 | pub(crate) const MINIMUM_REPETITIONS_MESSAGE: &str =
 27 |     "Quantity of minimum repetitions must be greater than zero"; // panic message of with_minimum_repetitions for a zero quantity
 28 | 
 29 | pub(crate) const MINIMUM_SUBSTRING_LENGTH_MESSAGE: &str =
 30 |     "Minimum substring length must be greater than zero"; // panic message of with_minimum_substring_length for a zero length
 31 | 
 32 | /// Builds regular expressions from user-provided test cases; settings are chained on the builder and applied by `build`.
 33 | #[derive(Clone)]
 34 | #[cfg_attr(feature = "python", pyo3::prelude::pyclass)] // marked as a pyo3 class only when the "python" feature is enabled
 35 | pub struct RegExpBuilder {
 36 |     pub(crate) test_cases: Vec<String>, // the input strings the expression is generated from
 37 |     pub(crate) config: RegExpConfig, // flags and thresholds set through the builder methods
 38 | }
 39 | 
 40 | impl RegExpBuilder {
 41 |     /// Specifies the test cases to build the regular expression from.
 42 |     ///
 43 |     /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
 44 |     ///
 45 |     /// ⚠ Panics if `test_cases` is empty.
 46 |     pub fn from<T: Clone + Into<String>>(test_cases: &[T]) -> Self {
 47 |         if test_cases.is_empty() {
 48 |             panic!("{}", MISSING_TEST_CASES_MESSAGE);
 49 |         }
 50 |         Self {
 51 |             test_cases: test_cases.iter().cloned().map(|it| it.into()).collect_vec(), // every case is cloned and converted into an owned String
 52 |             config: RegExpConfig::new(),
 53 |         }
 54 |     }
 55 | 
 56 |     /// Specifies a text file containing test cases to build the regular expression from.
 57 |     ///
 58 |     /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
 59 |     ///
 60 |     /// Each test case needs to be on a separate line.
 61 |     /// Lines may be ended with either a newline (`\n`) or
 62 |     /// a carriage return with a line feed (`\r\n`).
 63 |     /// The final line ending is optional.
 64 |     ///
 65 |     /// ⚠ Panics if:
 66 |     /// - the file cannot be found
 67 |     /// - the file's encoding is not valid UTF-8 data
 68 |     /// - the file cannot be opened because of conflicting permissions
 69 |     pub fn from_file<T: Into<PathBuf>>(file_path: T) -> Self {
 70 |         match std::fs::read_to_string(file_path.into()) {
 71 |             Ok(file_content) => Self {
 72 |                 test_cases: file_content.lines().map(|it| it.to_string()).collect_vec(), // one test case per line of the file
 73 |                 config: RegExpConfig::new(), // NOTE(review): unlike from(), an empty file is not rejected here - confirm this is intended
 74 |             },
 75 |             Err(error) => match error.kind() {
 76 |                 ErrorKind::NotFound => panic!("The specified file could not be found"),
 77 |                 ErrorKind::InvalidData => {
 78 |                     panic!("The specified file's encoding is not valid UTF-8")
 79 |                 }
 80 |                 ErrorKind::PermissionDenied => {
 81 |                     panic!("Permission denied: The specified file could not be opened")
 82 |                 }
 83 |                 _ => panic!("{}", error), // any other I/O error panics with the error's own message
 84 |             },
 85 |         }
 86 |     }
 87 | 
 88 |     /// Converts any Unicode decimal digit to character class `\d`.
 89 |     ///
 90 |     /// This method takes precedence over
 91 |     /// [`with_conversion_of_words`](Self::with_conversion_of_words) if both are set.
 92 |     /// Decimal digits are converted to `\d`, the remaining word characters to `\w`.
 93 |     ///
 94 |     /// This method takes precedence over
 95 |     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
 96 |     /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
 97 |     pub fn with_conversion_of_digits(&mut self) -> &mut Self {
 98 |         self.config.is_digit_converted = true;
 99 |         self
100 |     }
101 | 
102 |     /// Converts any character which is not a Unicode decimal digit to character class `\D`.
103 |     ///
104 |     /// This method takes precedence over
105 |     /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set.
106 |     /// Non-digits which are also non-word characters are converted to `\D`.
107 |     ///
108 |     /// This method takes precedence over
109 |     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
110 |     /// Non-digits which are also non-space characters are converted to `\D`.
111 |     pub fn with_conversion_of_non_digits(&mut self) -> &mut Self {
112 |         self.config.is_non_digit_converted = true;
113 |         self
114 |     }
115 | 
116 |     /// Converts any Unicode whitespace character to character class `\s`.
117 |     ///
118 |     /// This method takes precedence over
119 |     /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set.
120 |     /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
121 |     ///
122 |     /// This method takes precedence over
123 |     /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set.
124 |     /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
125 |     pub fn with_conversion_of_whitespace(&mut self) -> &mut Self {
126 |         self.config.is_space_converted = true;
127 |         self
128 |     }
129 | 
130 |     /// Converts any character which is not a Unicode whitespace character to character class `\S`.
131 |     pub fn with_conversion_of_non_whitespace(&mut self) -> &mut Self {
132 |         self.config.is_non_space_converted = true;
133 |         self
134 |     }
135 | 
136 |     /// Converts any Unicode word character to character class `\w`.
137 |     ///
138 |     /// This method takes precedence over
139 |     /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set.
140 |     /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
141 |     ///
142 |     /// This method takes precedence over
143 |     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
144 |     /// Word characters are converted to `\w`, the remaining non-space characters to `\S`.
145 |     pub fn with_conversion_of_words(&mut self) -> &mut Self {
146 |         self.config.is_word_converted = true;
147 |         self
148 |     }
149 | 
150 |     /// Converts any character which is not a Unicode word character to character class `\W`.
151 |     ///
152 |     /// This method takes precedence over
153 |     /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set.
154 |     /// Non-words which are also non-space characters are converted to `\W`.
155 |     pub fn with_conversion_of_non_words(&mut self) -> &mut Self {
156 |         self.config.is_non_word_converted = true;
157 |         self
158 |     }
159 | 
160 |     /// Detects repeated non-overlapping substrings and
161 |     /// converts them to `{min,max}` quantifier notation.
162 |     pub fn with_conversion_of_repetitions(&mut self) -> &mut Self {
163 |         self.config.is_repetition_converted = true;
164 |         self
165 |     }
166 | 
167 |     /// Enables case-insensitive matching of test cases
168 |     /// so that letters match both upper and lower case.
169 |     pub fn with_case_insensitive_matching(&mut self) -> &mut Self {
170 |         self.config.is_case_insensitive_matching = true;
171 |         self
172 |     }
173 | 
174 |     /// Replaces non-capturing groups with capturing ones.
175 |     pub fn with_capturing_groups(&mut self) -> &mut Self {
176 |         self.config.is_capturing_group_enabled = true;
177 |         self
178 |     }
179 | 
180 |     /// Specifies the minimum quantity of substring repetitions to be converted if
181 |     /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set.
182 |     ///
183 |     /// If the quantity is not explicitly set with this method, a default value of 1 will be used.
184 |     ///
185 |     /// ⚠ Panics if `quantity` is zero.
186 |     pub fn with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self {
187 |         if quantity == 0 {
188 |             panic!("{}", MINIMUM_REPETITIONS_MESSAGE);
189 |         }
190 |         self.config.minimum_repetitions = quantity;
191 |         self
192 |     }
193 | 
194 |     /// Specifies the minimum length a repeated substring must have in order to be converted if
195 |     /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set.
196 |     ///
197 |     /// If the length is not explicitly set with this method, a default value of 1 will be used.
198 |     ///
199 |     /// ⚠ Panics if `length` is zero.
200 |     pub fn with_minimum_substring_length(&mut self, length: u32) -> &mut Self {
201 |         if length == 0 {
202 |             panic!("{}", MINIMUM_SUBSTRING_LENGTH_MESSAGE);
203 |         }
204 |         self.config.minimum_substring_length = length;
205 |         self
206 |     }
207 | 
208 |     /// Converts non-ASCII characters to unicode escape sequences.
209 |     /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes
210 |     /// (range `U+010000` to `U+10FFFF`) to surrogate pairs.
211 |     pub fn with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self {
212 |         self.config.is_non_ascii_char_escaped = true;
213 |         self.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs;
214 |         self
215 |     }
216 | 
217 |     /// Produces a nicer looking regular expression in verbose mode.
218 |     pub fn with_verbose_mode(&mut self) -> &mut Self {
219 |         self.config.is_verbose_mode_enabled = true;
220 |         self
221 |     }
222 | 
223 |     /// Removes the caret anchor '^' from the resulting regular
224 |     /// expression, thereby allowing to match the test cases also when they do not occur
225 |     /// at the start of a string.
226 |     pub fn without_start_anchor(&mut self) -> &mut Self {
227 |         self.config.is_start_anchor_disabled = true;
228 |         self
229 |     }
230 | 
231 |     /// Removes the dollar sign anchor '$' from the resulting regular
232 |     /// expression, thereby allowing to match the test cases also when they do not occur
233 |     /// at the end of a string.
234 |     pub fn without_end_anchor(&mut self) -> &mut Self {
235 |         self.config.is_end_anchor_disabled = true;
236 |         self
237 |     }
238 | 
239 |     /// Removes the caret and dollar sign anchors from the resulting
240 |     /// regular expression, thereby allowing to match the test cases also when they occur
241 |     /// within a larger string that contains other content as well.
242 |     pub fn without_anchors(&mut self) -> &mut Self {
243 |         self.config.is_start_anchor_disabled = true;
244 |         self.config.is_end_anchor_disabled = true;
245 |         self
246 |     }
247 | 
248 |     /// Provides syntax highlighting for the resulting regular expression.
249 |     ///
250 |     /// ⚠ This method may only be used if the resulting regular expression is meant to
251 |     /// be printed to the console. The regex string representation returned from enabling
252 |     /// this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate.
253 |     #[cfg(feature = "cli")]
254 |     #[doc(hidden)]
255 |     pub fn with_syntax_highlighting(&mut self) -> &mut Self {
256 |         self.config.is_output_colorized = true;
257 |         self
258 |     }
259 | 
260 |     /// Builds the actual regular expression using the previously given settings.
261 |     pub fn build(&mut self) -> String {
262 |         RegExp::from(&mut self.test_cases, &self.config).to_string() // test cases are handed over mutably, which is why build takes &mut self
263 |     }
264 | }
265 | 


--------------------------------------------------------------------------------
/src/cluster.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::config::RegExpConfig;
 18 | use crate::grapheme::Grapheme;
 19 | use crate::unicode_tables::{DECIMAL_NUMBER, WHITE_SPACE, WORD};
 20 | use itertools::Itertools;
 21 | use lazy_static::lazy_static;
 22 | use std::cmp::Ordering;
 23 | use std::collections::HashMap;
 24 | use std::ops::Range;
 25 | use unic_char_range::CharRange;
 26 | use unic_ucd_category::GeneralCategory;
 27 | use unicode_segmentation::UnicodeSegmentation;
 28 | 
 29 | #[derive(Clone, Debug, Eq, PartialEq)]
 30 | pub struct GraphemeCluster<'a> { // A sequence of graphemes together with the configuration it is processed under.
 31 |     graphemes: Vec<Grapheme>, // the graphemes, in order
 32 |     config: &'a RegExpConfig, // borrowed settings; the cluster cannot outlive them
 33 | }
 34 | 
 35 | impl<'a> GraphemeCluster<'a> { // Constructors, in-place conversions and accessors.
 36 |     pub(crate) fn from(s: &str, config: &'a RegExpConfig) -> Self { // builds a cluster by segmenting `s` into graphemes
 37 |         Self {
 38 |             graphemes: UnicodeSegmentation::graphemes(s, true) // `true` asks unicode-segmentation for extended grapheme clusters
 39 |                 .flat_map(|it| {
 40 |                     let contains_backslash = it.chars().count() == 2 && it.contains('\\'); // a two-char segment that contains a backslash
 41 |                     let contains_combining_mark_or_unassigned_chars = it.chars().any(|c| {
 42 |                         let category = GeneralCategory::of(c);
 43 |                         category.is_mark() || category.is_other() // general category is a mark or one of the "other" categories
 44 |                     });
 45 | 
 46 |                     if contains_backslash || contains_combining_mark_or_unassigned_chars { // such segments are split into one Grapheme per char
 47 |                         it.chars()
 48 |                             .map(|c| {
 49 |                                 Grapheme::from(
 50 |                                     &c.to_string(),
 51 |                                     config.is_capturing_group_enabled,
 52 |                                     config.is_output_colorized,
 53 |                                     config.is_verbose_mode_enabled,
 54 |                                 )
 55 |                             })
 56 |                             .collect_vec()
 57 |                     } else { // every other segment becomes a single Grapheme
 58 |                         vec![Grapheme::from(
 59 |                             it,
 60 |                             config.is_capturing_group_enabled,
 61 |                             config.is_output_colorized,
 62 |                             config.is_verbose_mode_enabled,
 63 |                         )]
 64 |                     }
 65 |                 })
 66 |                 .collect_vec(),
 67 |             config,
 68 |         }
 69 |     }
 70 | 
 71 |     pub(crate) fn from_graphemes(graphemes: Vec<Grapheme>, config: &'a RegExpConfig) -> Self { // wraps already-built graphemes without further processing
 72 |         Self { graphemes, config }
 73 |     }
 74 | 
 75 |     pub(crate) fn new(grapheme: Grapheme, config: &'a RegExpConfig) -> Self { // cluster holding exactly one grapheme
 76 |         Self {
 77 |             graphemes: vec![grapheme],
 78 |             config,
 79 |         }
 80 |     }
 81 | 
 82 |     pub(crate) fn convert_to_char_classes(&mut self) { // rewrites chars to shorthand classes according to the config flags
 83 |         let is_digit_converted = self.config.is_digit_converted;
 84 |         let is_non_digit_converted = self.config.is_non_digit_converted;
 85 |         let is_space_converted = self.config.is_space_converted;
 86 |         let is_non_space_converted = self.config.is_non_space_converted;
 87 |         let is_word_converted = self.config.is_word_converted;
 88 |         let is_non_word_converted = self.config.is_non_word_converted;
 89 | 
 90 |         for grapheme in self.graphemes.iter_mut() {
 91 |             grapheme.chars = grapheme
 92 |                 .chars
 93 |                 .iter()
 94 |                 .map(|it| {
 95 |                     it.chars()
 96 |                         .map(|c| {
 97 |                             if is_digit_converted && is_digit(c) { // branch order fixes the precedence: \d, \w, \s, then \D, \W, \S
 98 |                                 "\\d".to_string()
 99 |                             } else if is_word_converted && is_word(c) {
100 |                                 "\\w".to_string()
101 |                             } else if is_space_converted && is_space(c) {
102 |                                 "\\s".to_string()
103 |                             } else if is_non_digit_converted && !is_digit(c) {
104 |                                 "\\D".to_string()
105 |                             } else if is_non_word_converted && !is_word(c) {
106 |                                 "\\W".to_string()
107 |                             } else if is_non_space_converted && !is_space(c) {
108 |                                 "\\S".to_string()
109 |                             } else { // no enabled conversion matched: the char is kept as is
110 |                                 c.to_string()
111 |                             }
112 |                         })
113 |                         .join("")
114 |                 })
115 |                 .collect_vec();
116 |         }
117 |     }
118 | 
119 |     pub(crate) fn convert_repetitions(&mut self) { // replaces the graphemes with their repetition-converted form, if any
120 |         let mut repetitions = vec![];
121 |         convert_repetitions(self.graphemes(), repetitions.as_mut(), self.config);
122 |         if !repetitions.is_empty() { // an empty result leaves the cluster unchanged
123 |             self.graphemes = repetitions;
124 |         }
125 |     }
126 | 
127 |     pub(crate) fn merge( // new cluster made of `first`'s graphemes followed by `second`'s
128 |         first: &GraphemeCluster,
129 |         second: &GraphemeCluster,
130 |         config: &'a RegExpConfig,
131 |     ) -> Self {
132 |         let mut graphemes = vec![];
133 |         graphemes.extend_from_slice(&first.graphemes);
134 |         graphemes.extend_from_slice(&second.graphemes);
135 |         Self { graphemes, config }
136 |     }
137 | 
138 |     pub(crate) fn graphemes(&self) -> &Vec<Grapheme> { // read-only access to the graphemes
139 |         &self.graphemes
140 |     }
141 | 
142 |     pub(crate) fn graphemes_mut(&mut self) -> &mut Vec<Grapheme> { // mutable access to the graphemes
143 |         &mut self.graphemes
144 |     }
145 | 
146 |     pub(crate) fn size(&self) -> usize { // number of graphemes in the cluster
147 |         self.graphemes.len()
148 |     }
149 | 
150 |     pub(crate) fn char_count(&self, is_non_ascii_char_escaped: bool) -> usize { // sum of the per-grapheme char counts
151 |         self.graphemes
152 |             .iter()
153 |             .map(|it| it.char_count(is_non_ascii_char_escaped))
154 |             .sum()
155 |     }
156 | 
157 |     pub(crate) fn is_empty(&self) -> bool { // true when the cluster holds no graphemes
158 |         self.graphemes.is_empty()
159 |     }
160 | }
161 | 
162 | fn is_digit(c: char) -> bool { // true when `c` lies in one of the ranges built from DECIMAL_NUMBER
163 |     lazy_static! {
164 |         static ref VALID_NUMERIC_CHARS: Vec<CharRange> = convert_chars_to_range(DECIMAL_NUMBER); // built lazily, then reused across calls
165 |     }
166 |     VALID_NUMERIC_CHARS.iter().any(|range| range.contains(c)) // linear scan over the ranges
167 | }
168 | 
169 | fn is_word(c: char) -> bool { // true when `c` lies in one of the ranges built from WORD
170 |     lazy_static! {
171 |         static ref VALID_ALPHANUMERIC_CHARS: Vec<CharRange> = convert_chars_to_range(WORD); // built lazily, then reused across calls
172 |     }
173 |     VALID_ALPHANUMERIC_CHARS
174 |         .iter()
175 |         .any(|range| range.contains(c)) // linear scan over the ranges
176 | }
177 | 
178 | fn is_space(c: char) -> bool { // true when `c` lies in one of the ranges built from WHITE_SPACE
179 |     lazy_static! {
180 |         static ref VALID_SPACE_CHARS: Vec<CharRange> = convert_chars_to_range(WHITE_SPACE); // built lazily, then reused across calls
181 |     }
182 |     VALID_SPACE_CHARS.iter().any(|range| range.contains(c)) // linear scan over the ranges
183 | }
184 | 
185 | fn convert_repetitions( // Chains the four repetition steps below over `graphemes`.
186 |     graphemes: &[Grapheme],
187 |     repetitions: &mut Vec<Grapheme>, // output buffer; presumably filled by the last step - confirm in replace_graphemes_with_repetitions
188 |     config: &RegExpConfig,
189 | ) {
190 |     let repeated_substrings = collect_repeated_substrings(graphemes); // substring -> start indices
191 |     let ranges_of_repetitions = create_ranges_of_repetitions(repeated_substrings, config); // (index range, substring) pairs
192 |     let coalesced_repetitions = coalesce_repetitions(ranges_of_repetitions);
193 |     replace_graphemes_with_repetitions(coalesced_repetitions, graphemes, repetitions, config)
194 | }
195 | 
196 | fn collect_repeated_substrings(graphemes: &[Grapheme]) -> HashMap<Vec<String>, Vec<usize>> {
197 |     let mut map = HashMap::new();
198 | 
199 |     for i in 0..graphemes.len() {
200 |         let suffix = &graphemes[i..];
201 |         for j in 1..=graphemes.len() / 2 {
202 |             if suffix.len() >= j {
203 |                 let prefix = suffix[..j].iter().map(|it| it.value()).collect_vec();
204 |                 let indices = map.entry(prefix).or_insert_with(Vec::new);
205 |                 indices.push(i);
206 |             }
207 |         }
208 |     }
209 |     map
210 | }
211 | 
/// Turns the substring occurrence map into index ranges that each cover a run
/// of directly consecutive repetitions of one substring.
///
/// `repeated_substrings` maps a substring (as grapheme values) to the start
/// indices of its occurrences. The result pairs every qualifying range with
/// the substring that is repeated inside it. A run qualifies only if its
/// repetition count is strictly greater than `config.minimum_repetitions`.
fn create_ranges_of_repetitions(
    repeated_substrings: HashMap<Vec<String>, Vec<usize>>,
    config: &RegExpConfig,
) -> Vec<(Range<usize>, Vec<String>)> {
    let mut repetitions = Vec::<(Range<usize>, Vec<String>)>::new();

    for (prefix_length, group) in &repeated_substrings
        .iter()
        // Keep only substrings whose occurrences never overlap each other:
        // neighbouring start indices must be at least one substring length apart.
        .filter(|&(prefix, indices)| {
            indices
                .iter()
                .tuple_windows()
                .all(|(first, second)| (second - first) >= prefix.len())
        })
        // Longest substrings first, grouped by their length.
        .sorted_by_key(|&(prefix, _)| prefix.len())
        .rev()
        .chunk_by(|&(prefix, _)| prefix.len())
    {
        // Within one length, handle substrings in order of their first occurrence.
        for (prefix, indices) in group.sorted_by_key(|&(_, indices)| indices[0]) {
            indices
                .iter()
                // Each occurrence covers `prefix_length` graphemes from its start index.
                .map(|it| *it..it + prefix_length)
                // Fuse occurrences that directly follow each other into one run.
                .coalesce(|x, y| {
                    if x.end == y.start {
                        Ok(x.start..y.end)
                    } else {
                        Err((x, y))
                    }
                })
                // Discard runs that do not repeat often enough.
                .filter(|range| {
                    let count = ((range.end - range.start) / prefix_length) as u32;
                    count > config.minimum_repetitions
                })
                .for_each(|range| repetitions.push((range, prefix.clone())));
        }
    }
    repetitions
}
250 | 
/// Removes repetition ranges that overlap a range which has already been kept.
///
/// The ranges are first ordered by descending end index; ties are broken by
/// ascending start index, so the widest range of a tie comes first. Walking
/// that order, a range is dropped when it overlaps the most recently kept
/// range. Two ranges that merely touch (the later one ends exactly where the
/// kept one starts) do not count as overlapping and are both kept.
///
/// The input vector is owned, so it is sorted in place and its elements are
/// moved into the result instead of being cloned.
///
/// Returns the surviving `(range, substring)` pairs in the sorted order.
fn coalesce_repetitions(
    mut ranges_of_repetitions: Vec<(Range<usize>, Vec<String>)>,
) -> Vec<(Range<usize>, Vec<String>)> {
    // `sort_by` is stable, so entries that compare equal keep their input order.
    ranges_of_repetitions.sort_by(|(first_range, _), (second_range, _)| {
        match second_range.end.cmp(&first_range.end) {
            Ordering::Equal => first_range.start.cmp(&second_range.start),
            other => other,
        }
    });

    let mut coalesced = Vec::with_capacity(ranges_of_repetitions.len());

    for (range, substr) in ranges_of_repetitions {
        let is_absorbed = match coalesced.last() {
            Some((kept, _)) => {
                let kept: &Range<usize> = kept;
                (kept.contains(&range.start) || kept.contains(&range.end))
                    // Half-open ranges that only share a boundary do not overlap.
                    && range.end != kept.start
            }
            None => false,
        };

        if !is_absorbed {
            coalesced.push((range, substr));
        }
    }
    coalesced
}
278 | 
279 | fn replace_graphemes_with_repetitions(
280 |     coalesced_repetitions: Vec<(Range<usize>, Vec<String>)>,
281 |     graphemes: &[Grapheme],
282 |     repetitions: &mut Vec<Grapheme>,
283 |     config: &RegExpConfig,
284 | ) {
285 |     if coalesced_repetitions.is_empty() {
286 |         return;
287 |     }
288 | 
289 |     for grapheme in graphemes {
290 |         repetitions.push(grapheme.clone());
291 |     }
292 | 
293 |     for (range, substr) in coalesced_repetitions.iter() {
294 |         if range.end > repetitions.len() {
295 |             break;
296 |         }
297 | 
298 |         let count = ((range.end - range.start) / substr.len()) as u32;
299 | 
300 |         if substr.len() < config.minimum_substring_length as usize {
301 |             continue;
302 |         }
303 | 
304 |         repetitions.splice(
305 |             range.clone(),
306 |             [Grapheme::new(
307 |                 substr.clone(),
308 |                 count,
309 |                 count,
310 |                 config.is_capturing_group_enabled,
311 |                 config.is_output_colorized,
312 |                 config.is_verbose_mode_enabled,
313 |             )]
314 |             .iter()
315 |             .cloned(),
316 |         );
317 |     }
318 | 
319 |     for new_grapheme in repetitions.iter_mut() {
320 |         convert_repetitions(
321 |             &new_grapheme
322 |                 .chars
323 |                 .iter()
324 |                 .map(|it| {
325 |                     Grapheme::from(
326 |                         it,
327 |                         config.is_capturing_group_enabled,
328 |                         config.is_output_colorized,
329 |                         config.is_verbose_mode_enabled,
330 |                     )
331 |                 })
332 |                 .collect_vec(),
333 |             new_grapheme.repetitions.as_mut(),
334 |             config,
335 |         );
336 |     }
337 | }
338 | 
339 | fn convert_chars_to_range(chars: &[(char, char)]) -> Vec<CharRange> {
340 |     chars
341 |         .iter()
342 |         .map(|&(start, end)| CharRange::closed(start, end))
343 |         .collect_vec()
344 | }
345 | 


--------------------------------------------------------------------------------
/src/component.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::quantifier::Quantifier;
 18 | use std::fmt::{Display, Formatter, Result};
 19 | 
/// The syntactic building blocks a regular expression is rendered from.
///
/// Each variant has a plain textual form (see the `Display` implementation)
/// and a colored form (see `to_colored_string`). Wherever a variant carries a
/// `bool` named "verbose mode" below, `true` makes the rendering add line
/// breaks around the component.
pub(crate) enum Component {
    /// Opening parenthesis of a capturing group, rendered as `(`.
    CapturedLeftParenthesis,
    /// An expression wrapped in a capturing group. Fields: the expression,
    /// the verbose mode flag, and whether a final line break is appended
    /// after the closing parenthesis in verbose mode.
    CapturedParenthesizedExpression(String, bool, bool),
    /// `^`; in verbose mode a line break follows it.
    Caret(bool),
    /// A character class; the contained string is rendered verbatim.
    CharClass(String),
    /// `$`; in verbose mode a line break precedes it.
    DollarSign(bool),
    /// `-`.
    Hyphen,
    /// The flag `(?i)`.
    IgnoreCaseFlag,
    /// The combined flag `(?ix)`, followed by a line break.
    IgnoreCaseAndVerboseModeFlag,
    /// `[`.
    LeftBracket,
    /// `|`.
    Pipe,
    /// A quantifier, rendered through its own `Display` implementation; in
    /// verbose mode a line break follows it.
    Quantifier(Quantifier, bool),
    /// An exact repetition `{n}`; in verbose mode a line break follows it.
    /// A count of `0` renders as the pattern text `{\d+\}` instead.
    Repetition(u32, bool),
    /// A repetition range `{min,max}`; in verbose mode a line break follows
    /// it. If both bounds are `0` it renders as the pattern text
    /// `{\d+,\d+\}` instead.
    RepetitionRange(u32, u32, bool),
    /// `]`.
    RightBracket,
    /// `)`.
    RightParenthesis,
    /// Opening parenthesis of a non-capturing group, rendered as `(?:`.
    UncapturedLeftParenthesis,
    /// An expression wrapped in a non-capturing group. Fields: the
    /// expression, the verbose mode flag, and whether a final line break is
    /// appended after the closing parenthesis in verbose mode.
    UncapturedParenthesizedExpression(String, bool, bool),
    /// The flag `(?x)`, followed by a line break.
    VerboseModeFlag,
}
 40 | 
 41 | impl Component {
 42 |     pub(crate) fn to_repr(&self, is_output_colorized: bool) -> String {
 43 |         match is_output_colorized {
 44 |             true => self.to_colored_string(false),
 45 |             false => self.to_string(),
 46 |         }
 47 |     }
 48 | 
 49 |     pub(crate) fn to_colored_string(&self, is_escaped: bool) -> String {
 50 |         match self {
 51 |             Component::CapturedLeftParenthesis => Self::green_bold(&self.to_string(), is_escaped),
 52 |             Component::CapturedParenthesizedExpression(
 53 |                 expr,
 54 |                 is_verbose_mode_enabled,
 55 |                 has_final_line_break,
 56 |             ) => {
 57 |                 if *is_verbose_mode_enabled {
 58 |                     if *has_final_line_break {
 59 |                         format!(
 60 |                             "\n{}\n{}\n{}\n",
 61 |                             Component::CapturedLeftParenthesis.to_colored_string(is_escaped),
 62 |                             expr,
 63 |                             Component::RightParenthesis.to_colored_string(is_escaped)
 64 |                         )
 65 |                     } else {
 66 |                         format!(
 67 |                             "\n{}\n{}\n{}",
 68 |                             Component::CapturedLeftParenthesis.to_colored_string(is_escaped),
 69 |                             expr,
 70 |                             Component::RightParenthesis.to_colored_string(is_escaped)
 71 |                         )
 72 |                     }
 73 |                 } else {
 74 |                     format!(
 75 |                         "{}{}{}",
 76 |                         Component::CapturedLeftParenthesis.to_colored_string(is_escaped),
 77 |                         expr,
 78 |                         Component::RightParenthesis.to_colored_string(is_escaped)
 79 |                     )
 80 |                 }
 81 |             }
 82 |             Component::Caret(is_verbose_mode_enabled) => {
 83 |                 if *is_verbose_mode_enabled {
 84 |                     format!(
 85 |                         "{}\n",
 86 |                         Self::yellow_bold(&Component::Caret(false).to_string(), is_escaped)
 87 |                     )
 88 |                 } else {
 89 |                     Self::yellow_bold(&self.to_string(), is_escaped)
 90 |                 }
 91 |             }
 92 |             Component::CharClass(value) => Self::black_on_bright_yellow(value, is_escaped),
 93 |             Component::DollarSign(is_verbose_mode_enabled) => {
 94 |                 if *is_verbose_mode_enabled {
 95 |                     format!(
 96 |                         "\n{}",
 97 |                         Self::yellow_bold(&Component::DollarSign(false).to_string(), is_escaped)
 98 |                     )
 99 |                 } else {
100 |                     Self::yellow_bold(&self.to_string(), is_escaped)
101 |                 }
102 |             }
103 |             Component::Hyphen => Self::cyan_bold(&self.to_string(), is_escaped),
104 |             Component::IgnoreCaseFlag => {
105 |                 Self::bright_yellow_on_black(&self.to_string(), is_escaped)
106 |             }
107 |             Component::IgnoreCaseAndVerboseModeFlag => {
108 |                 format!("{}\n", Self::bright_yellow_on_black("(?ix)", is_escaped))
109 |             }
110 |             Component::LeftBracket => Self::cyan_bold(&self.to_string(), is_escaped),
111 |             Component::Pipe => Self::red_bold(&self.to_string(), is_escaped),
112 |             Component::Quantifier(quantifier, is_verbose_mode_enabled) => {
113 |                 if *is_verbose_mode_enabled {
114 |                     format!(
115 |                         "{}\n",
116 |                         Self::purple_bold(&quantifier.to_string(), is_escaped)
117 |                     )
118 |                 } else {
119 |                     Self::purple_bold(&self.to_string(), is_escaped)
120 |                 }
121 |             }
122 |             Component::Repetition(num, is_verbose_mode_enabled) => {
123 |                 if *is_verbose_mode_enabled {
124 |                     format!(
125 |                         "{}\n",
126 |                         Self::white_on_bright_blue(
127 |                             &Component::Repetition(*num, false).to_string(),
128 |                             is_escaped
129 |                         )
130 |                     )
131 |                 } else {
132 |                     Self::white_on_bright_blue(&self.to_string(), is_escaped)
133 |                 }
134 |             }
135 |             Component::RepetitionRange(min, max, is_verbose_mode_enabled) => {
136 |                 if *is_verbose_mode_enabled {
137 |                     format!(
138 |                         "{}\n",
139 |                         Self::white_on_bright_blue(
140 |                             &Component::RepetitionRange(*min, *max, false).to_string(),
141 |                             is_escaped
142 |                         )
143 |                     )
144 |                 } else {
145 |                     Self::white_on_bright_blue(&self.to_string(), is_escaped)
146 |                 }
147 |             }
148 |             Component::RightBracket => Self::cyan_bold(&self.to_string(), is_escaped),
149 |             Component::RightParenthesis => Self::green_bold(&self.to_string(), is_escaped),
150 |             Component::UncapturedLeftParenthesis => Self::green_bold(&self.to_string(), is_escaped),
151 |             Component::UncapturedParenthesizedExpression(
152 |                 expr,
153 |                 is_verbose_mode_enabled,
154 |                 has_final_line_break,
155 |             ) => {
156 |                 if *is_verbose_mode_enabled {
157 |                     if *has_final_line_break {
158 |                         format!(
159 |                             "\n{}\n{}\n{}\n",
160 |                             Component::UncapturedLeftParenthesis.to_colored_string(is_escaped),
161 |                             expr,
162 |                             Component::RightParenthesis.to_colored_string(is_escaped)
163 |                         )
164 |                     } else {
165 |                         format!(
166 |                             "\n{}\n{}\n{}",
167 |                             Component::UncapturedLeftParenthesis.to_colored_string(is_escaped),
168 |                             expr,
169 |                             Component::RightParenthesis.to_colored_string(is_escaped)
170 |                         )
171 |                     }
172 |                 } else {
173 |                     format!(
174 |                         "{}{}{}",
175 |                         Component::UncapturedLeftParenthesis.to_colored_string(is_escaped),
176 |                         expr,
177 |                         Component::RightParenthesis.to_colored_string(is_escaped)
178 |                     )
179 |                 }
180 |             }
181 |             Component::VerboseModeFlag => {
182 |                 format!("{}\n", Self::bright_yellow_on_black("(?x)", is_escaped))
183 |             }
184 |         }
185 |     }
186 | 
187 |     fn black_on_bright_yellow(value: &str, is_escaped: bool) -> String {
188 |         Self::color_code("103;30", value, is_escaped)
189 |     }
190 | 
191 |     fn bright_yellow_on_black(value: &str, is_escaped: bool) -> String {
192 |         Self::color_code("40;93", value, is_escaped)
193 |     }
194 | 
195 |     fn cyan_bold(value: &str, is_escaped: bool) -> String {
196 |         Self::color_code("1;36", value, is_escaped)
197 |     }
198 | 
199 |     fn green_bold(value: &str, is_escaped: bool) -> String {
200 |         Self::color_code("1;32", value, is_escaped)
201 |     }
202 | 
203 |     fn purple_bold(value: &str, is_escaped: bool) -> String {
204 |         Self::color_code("1;35", value, is_escaped)
205 |     }
206 | 
207 |     fn red_bold(value: &str, is_escaped: bool) -> String {
208 |         Self::color_code("1;31", value, is_escaped)
209 |     }
210 | 
211 |     fn white_on_bright_blue(value: &str, is_escaped: bool) -> String {
212 |         Self::color_code("104;37", value, is_escaped)
213 |     }
214 | 
215 |     fn yellow_bold(value: &str, is_escaped: bool) -> String {
216 |         Self::color_code("1;33", value, is_escaped)
217 |     }
218 | 
219 |     fn color_code(code: &str, value: &str, is_escaped: bool) -> String {
220 |         if is_escaped {
221 |             format!("\u{1b}\\[{}m\\{}\u{1b}\\[0m", code, value)
222 |         } else {
223 |             format!("\u{1b}[{}m{}\u{1b}[0m", code, value)
224 |         }
225 |     }
226 | }
227 | 
228 | impl Display for Component {
229 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
230 |         write!(
231 |             f,
232 |             "{}",
233 |             match self {
234 |                 Component::CapturedLeftParenthesis => "(".to_string(),
235 |                 Component::CapturedParenthesizedExpression(
236 |                     expr,
237 |                     is_verbose_mode_enabled,
238 |                     has_final_line_break,
239 |                 ) =>
240 |                     if *is_verbose_mode_enabled {
241 |                         if *has_final_line_break {
242 |                             format!(
243 |                                 "\n{}\n{}\n{}\n",
244 |                                 Component::CapturedLeftParenthesis,
245 |                                 expr,
246 |                                 Component::RightParenthesis
247 |                             )
248 |                         } else {
249 |                             format!(
250 |                                 "\n{}\n{}\n{}",
251 |                                 Component::CapturedLeftParenthesis,
252 |                                 expr,
253 |                                 Component::RightParenthesis
254 |                             )
255 |                         }
256 |                     } else {
257 |                         format!(
258 |                             "{}{}{}",
259 |                             Component::CapturedLeftParenthesis,
260 |                             expr,
261 |                             Component::RightParenthesis
262 |                         )
263 |                     },
264 |                 Component::Caret(is_verbose_mode_enabled) =>
265 |                     if *is_verbose_mode_enabled {
266 |                         "^\n".to_string()
267 |                     } else {
268 |                         "^".to_string()
269 |                     },
270 |                 Component::CharClass(value) => value.clone(),
271 |                 Component::DollarSign(is_verbose_mode_enabled) =>
272 |                     if *is_verbose_mode_enabled {
273 |                         "\n$".to_string()
274 |                     } else {
275 |                         "$".to_string()
276 |                     },
277 |                 Component::Hyphen => "-".to_string(),
278 |                 Component::IgnoreCaseFlag => "(?i)".to_string(),
279 |                 Component::IgnoreCaseAndVerboseModeFlag => "(?ix)\n".to_string(),
280 |                 Component::LeftBracket => "[".to_string(),
281 |                 Component::Pipe => "|".to_string(),
282 |                 Component::Quantifier(quantifier, is_verbose_mode_enabled) =>
283 |                     if *is_verbose_mode_enabled {
284 |                         format!("{}\n", quantifier)
285 |                     } else {
286 |                         quantifier.to_string()
287 |                     },
288 |                 Component::Repetition(num, is_verbose_mode_enabled) => {
289 |                     if *num == 0 && *is_verbose_mode_enabled {
290 |                         "{\\d+\\}\n".to_string()
291 |                     } else if *num == 0 {
292 |                         "{\\d+\\}".to_string()
293 |                     } else if *is_verbose_mode_enabled {
294 |                         format!("{{{}}}\n", num)
295 |                     } else {
296 |                         format!("{{{}}}", num)
297 |                     }
298 |                 }
299 |                 Component::RepetitionRange(min, max, is_verbose_mode_enabled) => {
300 |                     if *min == 0 && *max == 0 && *is_verbose_mode_enabled {
301 |                         "{\\d+,\\d+\\}\n".to_string()
302 |                     } else if *min == 0 && *max == 0 {
303 |                         "{\\d+,\\d+\\}".to_string()
304 |                     } else if *is_verbose_mode_enabled {
305 |                         format!("{{{},{}}}\n", min, max)
306 |                     } else {
307 |                         format!("{{{},{}}}", min, max)
308 |                     }
309 |                 }
310 |                 Component::RightBracket => "]".to_string(),
311 |                 Component::RightParenthesis => ")".to_string(),
312 |                 Component::UncapturedLeftParenthesis => "(?:".to_string(),
313 |                 Component::UncapturedParenthesizedExpression(
314 |                     expr,
315 |                     is_verbose_mode_enabled,
316 |                     has_final_line_break,
317 |                 ) => {
318 |                     if *is_verbose_mode_enabled {
319 |                         if *has_final_line_break {
320 |                             format!(
321 |                                 "\n{}\n{}\n{}\n",
322 |                                 Component::UncapturedLeftParenthesis,
323 |                                 expr,
324 |                                 Component::RightParenthesis
325 |                             )
326 |                         } else {
327 |                             format!(
328 |                                 "\n{}\n{}\n{}",
329 |                                 Component::UncapturedLeftParenthesis,
330 |                                 expr,
331 |                                 Component::RightParenthesis
332 |                             )
333 |                         }
334 |                     } else {
335 |                         format!(
336 |                             "{}{}{}",
337 |                             Component::UncapturedLeftParenthesis,
338 |                             expr,
339 |                             Component::RightParenthesis
340 |                         )
341 |                     }
342 |                 }
343 |                 Component::VerboseModeFlag => "(?x)\n".to_string(),
344 |             }
345 |         )
346 |     }
347 | }
348 | 


--------------------------------------------------------------------------------
/src/config.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
/// The settings that control how a regular expression is generated.
///
/// All fields are crate-internal; a fresh instance with every feature
/// switched off is obtained through `RegExpConfig::new`.
#[derive(Clone, Debug, Hash, Ord, PartialOrd, Eq, PartialEq)]
pub struct RegExpConfig {
    /// Threshold that the number of repetitions of a substring is compared
    /// against when repetitions are detected.
    pub(crate) minimum_repetitions: u32,
    /// Threshold that the length of a repeated substring is compared against
    /// when repetitions are detected.
    pub(crate) minimum_substring_length: u32,
    // Character class conversion flags.
    pub(crate) is_digit_converted: bool,
    pub(crate) is_non_digit_converted: bool,
    pub(crate) is_space_converted: bool,
    pub(crate) is_non_space_converted: bool,
    pub(crate) is_word_converted: bool,
    pub(crate) is_non_word_converted: bool,
    // Remaining feature flags; all of them are off by default.
    pub(crate) is_repetition_converted: bool,
    pub(crate) is_case_insensitive_matching: bool,
    pub(crate) is_capturing_group_enabled: bool,
    pub(crate) is_non_ascii_char_escaped: bool,
    pub(crate) is_astral_code_point_converted_to_surrogate: bool,
    pub(crate) is_verbose_mode_enabled: bool,
    pub(crate) is_start_anchor_disabled: bool,
    pub(crate) is_end_anchor_disabled: bool,
    pub(crate) is_output_colorized: bool,
}

impl RegExpConfig {
    /// Creates the default configuration: both minimum thresholds are `1`
    /// and every boolean flag is `false`.
    pub(crate) fn new() -> Self {
        Self {
            minimum_repetitions: 1,
            minimum_substring_length: 1,
            is_digit_converted: false,
            is_non_digit_converted: false,
            is_space_converted: false,
            is_non_space_converted: false,
            is_word_converted: false,
            is_non_word_converted: false,
            is_repetition_converted: false,
            is_case_insensitive_matching: false,
            is_capturing_group_enabled: false,
            is_non_ascii_char_escaped: false,
            is_astral_code_point_converted_to_surrogate: false,
            is_verbose_mode_enabled: false,
            is_start_anchor_disabled: false,
            is_end_anchor_disabled: false,
            is_output_colorized: false,
        }
    }

    /// Returns `true` if at least one of the six character class conversion
    /// flags, case-insensitive matching, or capturing groups is switched on.
    pub(crate) fn is_char_class_feature_enabled(&self) -> bool {
        let relevant_flags = [
            self.is_digit_converted,
            self.is_non_digit_converted,
            self.is_space_converted,
            self.is_non_space_converted,
            self.is_word_converted,
            self.is_non_word_converted,
            self.is_case_insensitive_matching,
            self.is_capturing_group_enabled,
        ];
        relevant_flags.contains(&true)
    }
}
72 | 


--------------------------------------------------------------------------------
/src/dfa.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::cluster::GraphemeCluster;
 18 | use crate::config::RegExpConfig;
 19 | use crate::grapheme::Grapheme;
 20 | use itertools::Itertools;
 21 | use petgraph::graph::NodeIndex;
 22 | use petgraph::stable_graph::{Edges, StableGraph};
 23 | use petgraph::visit::Dfs;
 24 | use petgraph::{Directed, Direction};
 25 | use std::cmp::{max, min};
 26 | use std::collections::{BTreeSet, HashMap, HashSet};
 27 | 
/// Handle of a state: the index of a node in the automaton's graph.
type State = NodeIndex<u32>;
/// Weight stored on every node of the graph.
type StateLabel = String;
/// Weight stored on every edge of the graph: the grapheme that labels the
/// transition.
type EdgeLabel = Grapheme;
 31 | 
/// A finite automaton built from grapheme clusters, stored as a directed
/// graph whose edges are labelled with graphemes.
pub struct Dfa<'a> {
    /// Every grapheme that has been inserted as an edge label so far.
    alphabet: BTreeSet<Grapheme>,
    /// The transition graph: nodes are states, edges are transitions.
    graph: StableGraph<StateLabel, EdgeLabel>,
    /// The state every inserted cluster starts from.
    initial_state: State,
    /// Node indices of the states at which an inserted cluster ends.
    final_state_indices: HashSet<usize>,
    /// Settings used when new graphemes have to be created.
    config: &'a RegExpConfig,
}
 39 | 
 40 | impl<'a> Dfa<'a> {
 41 |     pub(crate) fn from(
 42 |         grapheme_clusters: &[GraphemeCluster],
 43 |         is_minimized: bool,
 44 |         config: &'a RegExpConfig,
 45 |     ) -> Self {
 46 |         let mut dfa = Self::new(config);
 47 |         for cluster in grapheme_clusters {
 48 |             dfa.insert(cluster);
 49 |         }
 50 |         if is_minimized {
 51 |             dfa.minimize();
 52 |         }
 53 |         dfa
 54 |     }
 55 | 
    /// Returns the number of states, i.e. the number of nodes in the graph.
    pub(crate) fn state_count(&self) -> usize {
        self.graph.node_count()
    }
 59 | 
 60 |     pub(crate) fn states_in_depth_first_order(&self) -> Vec<State> {
 61 |         let mut depth_first_search = Dfs::new(&self.graph, self.initial_state);
 62 |         let mut states = vec![];
 63 |         while let Some(state) = depth_first_search.next(&self.graph) {
 64 |             states.push(state);
 65 |         }
 66 |         states
 67 |     }
 68 | 
    /// Returns an iterator over the edges that leave `state`.
    pub(crate) fn outgoing_edges(&self, state: State) -> Edges<Grapheme, Directed> {
        self.graph.edges_directed(state, Direction::Outgoing)
    }
 72 | 
    /// Returns `true` if the index of `state` has been recorded as the end of
    /// an inserted cluster.
    pub(crate) fn is_final_state(&self, state: State) -> bool {
        self.final_state_indices.contains(&state.index())
    }
 76 | 
 77 |     fn new(config: &'a RegExpConfig) -> Self {
 78 |         let mut graph = StableGraph::new();
 79 |         let initial_state = graph.add_node("".to_string());
 80 |         Self {
 81 |             alphabet: BTreeSet::new(),
 82 |             graph,
 83 |             initial_state,
 84 |             final_state_indices: HashSet::new(),
 85 |             config,
 86 |         }
 87 |     }
 88 | 
 89 |     fn insert(&mut self, cluster: &GraphemeCluster) {
 90 |         let mut current_state = self.initial_state;
 91 | 
 92 |         for grapheme in cluster.graphemes() {
 93 |             self.alphabet.insert(grapheme.clone());
 94 |             current_state = self.return_next_state(current_state, grapheme);
 95 |         }
 96 |         self.final_state_indices.insert(current_state.index());
 97 |     }
 98 | 
 99 |     fn return_next_state(&mut self, current_state: State, edge_label: &Grapheme) -> State {
100 |         match self.find_next_state(current_state, edge_label) {
101 |             Some(next_state) => next_state,
102 |             None => self.add_new_state(current_state, edge_label),
103 |         }
104 |     }
105 | 
    /// Searches the successors of `current_state` for one that is reachable
    /// through an edge carrying the same grapheme value as `grapheme`.
    ///
    /// Two situations yield a match:
    /// * the existing edge's maximum is exactly one less than `grapheme`'s
    ///   maximum: the edge is replaced by a grapheme that spans the smaller
    ///   minimum and the larger maximum of both, and the successor is returned;
    /// * both maxima are equal: the successor is returned unchanged.
    ///
    /// Returns `None` if no successor matches.
    fn find_next_state(&mut self, current_state: State, grapheme: &Grapheme) -> Option<State> {
        for next_state in self.graph.neighbors(current_state) {
            // `next_state` is a neighbor, so an edge to it exists.
            let edge_idx = self.graph.find_edge(current_state, next_state).unwrap();
            let current_grapheme = self.graph.edge_weight(edge_idx).unwrap();

            if current_grapheme.value() != grapheme.value() {
                continue;
            }

            // NOTE(review): the subtraction is unchecked - this assumes
            // `grapheme.maximum()` is never 0; confirm against `Grapheme`.
            if current_grapheme.maximum() == grapheme.maximum() - 1 {
                let min = min(current_grapheme.minimum(), grapheme.minimum());
                let max = max(current_grapheme.maximum(), grapheme.maximum());
                let new_grapheme = Grapheme::new(
                    grapheme.chars().clone(),
                    min,
                    max,
                    self.config.is_capturing_group_enabled,
                    self.config.is_output_colorized,
                    self.config.is_verbose_mode_enabled,
                );
                // Widen the existing transition instead of adding a second one.
                self.graph
                    .update_edge(current_state, next_state, new_grapheme);
                return Some(next_state);
            } else if current_grapheme.maximum() == grapheme.maximum() {
                return Some(next_state);
            }
        }
        None
    }
135 | 
136 |     fn add_new_state(&mut self, current_state: State, edge_label: &Grapheme) -> State {
137 |         let next_state = self.graph.add_node("".to_string());
138 |         self.graph
139 |             .add_edge(current_state, next_state, edge_label.clone());
140 |         next_state
141 |     }
142 | 
    /// Minimizes the automaton by refining a partition of its states and then
    /// rebuilding the graph from the resulting partition.
    ///
    /// `p` is the current partition and `w` the work list of state sets that
    /// still have to be processed.
    // NOTE(review): the structure resembles Hopcroft's partition refinement
    // algorithm, which would explain the single-letter names - confirm.
    #[allow(clippy::many_single_char_names)]
    fn minimize(&mut self) {
        let mut p = self.get_initial_partition();
        let mut w = p.iter().cloned().collect_vec();

        while !w.is_empty() {
            // Take the first set off the work list.
            let a = w.drain(0..1).next().unwrap();

            for edge_label in self.alphabet.iter() {
                // Presumably the states that reach a state of `a` via
                // `edge_label` - verify against `get_parent_states`.
                let x = self.get_parent_states(&a, edge_label);
                let mut replacements = vec![];
                let mut is_replacement_needed = true;
                let mut start_idx = 0;

                // Split sets of `p` one at a time until `x` splits none of them.
                while is_replacement_needed {
                    for (idx, y) in p.iter().enumerate().skip(start_idx) {
                        // `x` splits `y` only if `y` has states both inside and
                        // outside of `x`.
                        if x.intersection(y).count() == 0 || y.difference(&x).count() == 0 {
                            is_replacement_needed = false;
                            continue;
                        }

                        let i = x.intersection(y).copied().collect::<HashSet<State>>();
                        let d = y.difference(&x).copied().collect::<HashSet<State>>();

                        is_replacement_needed = true;
                        start_idx = idx;

                        replacements.push((y.clone(), i, d));

                        // `p` is modified below, so the scan has to be restarted.
                        break;
                    }

                    if is_replacement_needed {
                        let (_, i, d) = replacements.last().unwrap();

                        // Replace `y` in `p` by its two parts.
                        p.remove(start_idx);
                        p.insert(start_idx, i.clone());
                        p.insert(start_idx + 1, d.clone());
                    }
                }

                // Keep the work list in sync with the splits made above.
                for (y, i, d) in replacements {
                    if w.contains(&y) {
                        let idx = w.iter().position(|it| it == &y).unwrap();
                        w.remove(idx);
                        w.push(i);
                        w.push(d);
                    } else if i.len() <= d.len() {
                        // Only the smaller part has to be processed.
                        w.push(i);
                    } else {
                        w.push(d);
                    }
                }
            }
        }

        self.recreate_graph(p.iter().filter(|&it| !it.is_empty()).collect_vec());
    }
201 | 
202 |     fn get_initial_partition(&self) -> Vec<HashSet<State>> {
203 |         let (final_states, non_final_states): (HashSet<State>, HashSet<State>) = self
204 |             .graph
205 |             .node_indices()
206 |             .partition(|&state| !self.final_state_indices.contains(&state.index()));
207 | 
208 |         vec![final_states, non_final_states]
209 |     }
210 | 
211 |     fn get_parent_states(&self, a: &HashSet<State>, label: &Grapheme) -> HashSet<State> {
212 |         let mut x = HashSet::new();
213 | 
214 |         for &state in a {
215 |             let direct_parent_states = self.graph.neighbors_directed(state, Direction::Incoming);
216 |             for parent_state in direct_parent_states {
217 |                 let edge = self.graph.find_edge(parent_state, state).unwrap();
218 |                 let grapheme = self.graph.edge_weight(edge).unwrap();
219 |                 if grapheme.value() == label.value()
220 |                     && (grapheme.maximum() == label.maximum()
221 |                         || grapheme.minimum() == label.minimum())
222 |                 {
223 |                     x.insert(parent_state);
224 |                     break;
225 |                 }
226 |             }
227 |         }
228 |         x
229 |     }
230 | 
231 |     fn recreate_graph(&mut self, p: Vec<&HashSet<State>>) {
232 |         let mut graph = StableGraph::<StateLabel, EdgeLabel>::new();
233 |         let mut final_state_indices = HashSet::new();
234 |         let mut state_mappings = HashMap::new();
235 |         let mut new_initial_state: Option<NodeIndex> = None;
236 | 
237 |         for equivalence_class in p.iter() {
238 |             let new_state = graph.add_node("".to_string());
239 | 
240 |             for old_state in equivalence_class.iter() {
241 |                 if self.initial_state == *old_state {
242 |                     new_initial_state = Some(new_state);
243 |                 }
244 |                 state_mappings.insert(*old_state, new_state);
245 |             }
246 |         }
247 | 
248 |         for equivalence_class in p.iter() {
249 |             let old_source_state = *equivalence_class.iter().next().unwrap();
250 |             let new_source_state = state_mappings.get(&old_source_state).unwrap();
251 | 
252 |             for old_target_state in self.graph.neighbors(old_source_state) {
253 |                 let edge = self
254 |                     .graph
255 |                     .find_edge(old_source_state, old_target_state)
256 |                     .unwrap();
257 | 
258 |                 let grapheme = self.graph.edge_weight(edge).unwrap().clone();
259 |                 let new_target_state = state_mappings.get(&old_target_state).unwrap();
260 | 
261 |                 graph.add_edge(*new_source_state, *new_target_state, grapheme.clone());
262 | 
263 |                 if self.final_state_indices.contains(&old_target_state.index()) {
264 |                     final_state_indices.insert(new_target_state.index());
265 |                 }
266 |             }
267 |         }
268 |         self.initial_state = new_initial_state.unwrap();
269 |         self.final_state_indices = final_state_indices;
270 |         self.graph = graph;
271 |     }
272 | }
273 | 
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the plain grapheme (no repetition, no flags) expected as an
    /// edge label in these tests.
    fn label(s: &str) -> Grapheme {
        Grapheme::from(s, false, false, false)
    }

    /// Asserts that the outgoing edges of `state` carry exactly the labels in
    /// `expected`, in iteration order, and that there are no further edges.
    fn assert_outgoing_labels(dfa: &Dfa, state: State, expected: &[&str]) {
        let mut edges = dfa.outgoing_edges(state);
        for &expected_label in expected {
            assert_eq!(edges.next().unwrap().weight(), &label(expected_label));
        }
        assert!(edges.next().is_none());
    }

    #[test]
    fn test_state_count() {
        let config = RegExpConfig::new();
        let mut dfa = Dfa::new(&config);
        assert_eq!(dfa.state_count(), 1);

        dfa.insert(&GraphemeCluster::from("abcd", &config));
        assert_eq!(dfa.state_count(), 5);
    }

    #[test]
    fn test_is_final_state() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(&[GraphemeCluster::from("abcd", &config)], true, &config);

        let intermediate_state = State::new(3);
        assert!(!dfa.is_final_state(intermediate_state));

        let final_state = State::new(4);
        assert!(dfa.is_final_state(final_state));
    }

    #[test]
    fn test_outgoing_edges() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[
                GraphemeCluster::from("abcd", &config),
                GraphemeCluster::from("abxd", &config),
            ],
            true,
            &config,
        );

        assert_outgoing_labels(&dfa, State::new(2), &["c", "x"]);
    }

    #[test]
    fn test_states_in_depth_first_order() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[
                GraphemeCluster::from("abcd", &config),
                GraphemeCluster::from("axyz", &config),
            ],
            true,
            &config,
        );
        let states = dfa.states_in_depth_first_order();
        assert_eq!(states.len(), 7);

        // Expected edge labels per state, in depth-first visiting order.
        let expected_labels: [&[&str]; 7] = [
            &["a"],
            &["b", "x"],
            &["y"],
            &["z"],
            &[],
            &["c"],
            &["d"],
        ];

        for (state, labels) in states.iter().zip(expected_labels.iter()) {
            assert_outgoing_labels(&dfa, *state, labels);
        }
    }

    #[test]
    fn test_minimization_algorithm() {
        let config = RegExpConfig::new();
        let mut dfa = Dfa::new(&config);
        assert_eq!(dfa.graph.node_count(), 1);
        assert_eq!(dfa.graph.edge_count(), 0);

        dfa.insert(&GraphemeCluster::from("abcd", &config));
        assert_eq!(dfa.graph.node_count(), 5);
        assert_eq!(dfa.graph.edge_count(), 4);

        dfa.insert(&GraphemeCluster::from("abxd", &config));
        assert_eq!(dfa.graph.node_count(), 7);
        assert_eq!(dfa.graph.edge_count(), 6);

        dfa.minimize();
        assert_eq!(dfa.graph.node_count(), 5);
        assert_eq!(dfa.graph.edge_count(), 5);
    }

    #[test]
    fn test_dfa_constructor() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[
                GraphemeCluster::from("abcd", &config),
                GraphemeCluster::from("abxd", &config),
            ],
            true,
            &config,
        );
        assert_eq!(dfa.graph.node_count(), 5);
        assert_eq!(dfa.graph.edge_count(), 5);
    }
}
442 | 


--------------------------------------------------------------------------------
/src/format.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::cluster::GraphemeCluster;
 18 | use crate::component::Component;
 19 | use crate::expression::Expression;
 20 | use crate::quantifier::Quantifier;
 21 | use itertools::Itertools;
 22 | use std::collections::BTreeSet;
 23 | use std::fmt::{Display, Formatter, Result};
 24 | use unic_char_range::CharRange;
 25 | 
 26 | impl Display for Expression<'_> {
 27 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
 28 |         match self {
 29 |             Expression::Alternation(
 30 |                 options,
 31 |                 is_capturing_group_enabled,
 32 |                 is_output_colorized,
 33 |                 is_verbose_mode_enabled,
 34 |             ) => format_alternation(
 35 |                 f,
 36 |                 self,
 37 |                 options,
 38 |                 *is_capturing_group_enabled,
 39 |                 *is_output_colorized,
 40 |                 *is_verbose_mode_enabled,
 41 |             ),
 42 |             Expression::CharacterClass(char_set, is_output_colorized) => {
 43 |                 format_character_class(f, char_set, *is_output_colorized)
 44 |             }
 45 |             Expression::Concatenation(
 46 |                 expr1,
 47 |                 expr2,
 48 |                 is_capturing_group_enabled,
 49 |                 is_output_colorized,
 50 |                 is_verbose_mode_enabled,
 51 |             ) => format_concatenation(
 52 |                 f,
 53 |                 self,
 54 |                 expr1,
 55 |                 expr2,
 56 |                 *is_capturing_group_enabled,
 57 |                 *is_output_colorized,
 58 |                 *is_verbose_mode_enabled,
 59 |             ),
 60 |             Expression::Literal(
 61 |                 cluster,
 62 |                 is_non_ascii_char_escaped,
 63 |                 is_astral_code_point_converted_to_surrogate,
 64 |             ) => format_literal(
 65 |                 f,
 66 |                 cluster,
 67 |                 *is_non_ascii_char_escaped,
 68 |                 *is_astral_code_point_converted_to_surrogate,
 69 |             ),
 70 |             Expression::Repetition(
 71 |                 expr,
 72 |                 quantifier,
 73 |                 is_capturing_group_enabled,
 74 |                 is_output_colorized,
 75 |                 is_verbose_mode_enabled,
 76 |             ) => format_repetition(
 77 |                 f,
 78 |                 self,
 79 |                 expr,
 80 |                 quantifier,
 81 |                 *is_capturing_group_enabled,
 82 |                 *is_output_colorized,
 83 |                 *is_verbose_mode_enabled,
 84 |             ),
 85 |         }
 86 |     }
 87 | }
 88 | 
 89 | fn get_codepoint_position(c: char) -> usize {
 90 |     CharRange::all().iter().position(|it| it == c).unwrap()
 91 | }
 92 | 
 93 | fn format_alternation(
 94 |     f: &mut Formatter<'_>,
 95 |     expr: &Expression,
 96 |     options: &[Expression],
 97 |     is_capturing_group_enabled: bool,
 98 |     is_output_colorized: bool,
 99 |     is_verbose_mode_enabled: bool,
100 | ) -> Result {
101 |     let pipe_component = Component::Pipe.to_repr(is_output_colorized);
102 |     let disjunction_operator = if is_verbose_mode_enabled {
103 |         format!("\n{}\n", pipe_component)
104 |     } else {
105 |         pipe_component
106 |     };
107 |     let alternation_str = options
108 |         .iter()
109 |         .map(|option| {
110 |             if option.precedence() < expr.precedence() && !option.is_single_codepoint() {
111 |                 if is_capturing_group_enabled {
112 |                     Component::CapturedParenthesizedExpression(
113 |                         option.to_string(),
114 |                         is_verbose_mode_enabled,
115 |                         true,
116 |                     )
117 |                     .to_repr(is_output_colorized)
118 |                 } else {
119 |                     Component::UncapturedParenthesizedExpression(
120 |                         option.to_string(),
121 |                         is_verbose_mode_enabled,
122 |                         true,
123 |                     )
124 |                     .to_repr(is_output_colorized)
125 |                 }
126 |             } else {
127 |                 format!("{}", option)
128 |             }
129 |         })
130 |         .join(&disjunction_operator);
131 | 
132 |     write!(f, "{}", alternation_str)
133 | }
134 | 
135 | fn format_character_class(
136 |     f: &mut Formatter<'_>,
137 |     char_set: &BTreeSet<char>,
138 |     is_output_colorized: bool,
139 | ) -> Result {
140 |     let chars_to_escape = ['[', ']', '\\', '-', '^', '$'];
141 |     let escaped_char_set = char_set
142 |         .iter()
143 |         .map(|c| {
144 |             if chars_to_escape.contains(c) {
145 |                 format!("{}{}", "\\", c)
146 |             } else if c == &'\n' {
147 |                 "\\n".to_string()
148 |             } else if c == &'\r' {
149 |                 "\\r".to_string()
150 |             } else if c == &'\t' {
151 |                 "\\t".to_string()
152 |             } else {
153 |                 c.to_string()
154 |             }
155 |         })
156 |         .collect_vec();
157 |     let char_positions = char_set
158 |         .iter()
159 |         .map(|&it| get_codepoint_position(it))
160 |         .collect_vec();
161 | 
162 |     let mut subsets = vec![];
163 |     let mut subset = vec![];
164 | 
165 |     for ((first_c, first_pos), (second_c, second_pos)) in
166 |         escaped_char_set.iter().zip(char_positions).tuple_windows()
167 |     {
168 |         if subset.is_empty() {
169 |             subset.push(first_c);
170 |         }
171 |         if second_pos == first_pos + 1 {
172 |             subset.push(second_c);
173 |         } else {
174 |             subsets.push(subset);
175 |             subset = vec![second_c];
176 |         }
177 |     }
178 | 
179 |     subsets.push(subset);
180 | 
181 |     let mut char_class_strs = vec![];
182 | 
183 |     for subset in subsets.iter() {
184 |         if subset.len() <= 2 {
185 |             for c in subset.iter() {
186 |                 char_class_strs.push((*c).to_string());
187 |             }
188 |         } else {
189 |             char_class_strs.push(format!(
190 |                 "{}{}{}",
191 |                 subset.first().unwrap(),
192 |                 Component::Hyphen.to_repr(is_output_colorized),
193 |                 subset.last().unwrap()
194 |             ));
195 |         }
196 |     }
197 | 
198 |     write!(
199 |         f,
200 |         "{}{}{}",
201 |         Component::LeftBracket.to_repr(is_output_colorized),
202 |         char_class_strs.join(""),
203 |         Component::RightBracket.to_repr(is_output_colorized)
204 |     )
205 | }
206 | 
207 | fn format_concatenation(
208 |     f: &mut Formatter<'_>,
209 |     expr: &Expression,
210 |     expr1: &Expression,
211 |     expr2: &Expression,
212 |     is_capturing_group_enabled: bool,
213 |     is_output_colorized: bool,
214 |     is_verbose_mode_enabled: bool,
215 | ) -> Result {
216 |     let expr_strs = [expr1, expr2]
217 |         .iter()
218 |         .map(|&it| {
219 |             if it.precedence() < expr.precedence() && !it.is_single_codepoint() {
220 |                 if is_capturing_group_enabled {
221 |                     Component::CapturedParenthesizedExpression(
222 |                         it.to_string(),
223 |                         is_verbose_mode_enabled,
224 |                         true,
225 |                     )
226 |                     .to_repr(is_output_colorized)
227 |                 } else {
228 |                     Component::UncapturedParenthesizedExpression(
229 |                         it.to_string(),
230 |                         is_verbose_mode_enabled,
231 |                         true,
232 |                     )
233 |                     .to_repr(is_output_colorized)
234 |                 }
235 |             } else {
236 |                 format!("{}", it)
237 |             }
238 |         })
239 |         .collect_vec();
240 | 
241 |     write!(
242 |         f,
243 |         "{}{}",
244 |         expr_strs.first().unwrap(),
245 |         expr_strs.last().unwrap()
246 |     )
247 | }
248 | 
249 | fn format_literal(
250 |     f: &mut Formatter<'_>,
251 |     cluster: &GraphemeCluster,
252 |     is_non_ascii_char_escaped: bool,
253 |     is_astral_code_point_converted_to_surrogate: bool,
254 | ) -> Result {
255 |     let literal_str = cluster
256 |         .graphemes()
257 |         .iter()
258 |         .cloned()
259 |         .map(|mut grapheme| {
260 |             if grapheme.has_repetitions() {
261 |                 grapheme
262 |                     .repetitions_mut()
263 |                     .iter_mut()
264 |                     .for_each(|repeated_grapheme| {
265 |                         repeated_grapheme.escape_regexp_symbols(
266 |                             is_non_ascii_char_escaped,
267 |                             is_astral_code_point_converted_to_surrogate,
268 |                         );
269 |                     });
270 |             } else {
271 |                 grapheme.escape_regexp_symbols(
272 |                     is_non_ascii_char_escaped,
273 |                     is_astral_code_point_converted_to_surrogate,
274 |                 );
275 |             }
276 |             grapheme.to_string()
277 |         })
278 |         .join("");
279 | 
280 |     write!(f, "{}", literal_str)
281 | }
282 | 
283 | fn format_repetition(
284 |     f: &mut Formatter<'_>,
285 |     expr: &Expression,
286 |     expr1: &Expression,
287 |     quantifier: &Quantifier,
288 |     is_capturing_group_enabled: bool,
289 |     is_output_colorized: bool,
290 |     is_verbose_mode_enabled: bool,
291 | ) -> Result {
292 |     if expr1.precedence() < expr.precedence() && !expr1.is_single_codepoint() {
293 |         if is_capturing_group_enabled {
294 |             write!(
295 |                 f,
296 |                 "{}{}",
297 |                 Component::CapturedParenthesizedExpression(
298 |                     expr1.to_string(),
299 |                     is_verbose_mode_enabled,
300 |                     false
301 |                 )
302 |                 .to_repr(is_output_colorized),
303 |                 Component::Quantifier(quantifier.clone(), is_verbose_mode_enabled)
304 |                     .to_repr(is_output_colorized)
305 |             )
306 |         } else {
307 |             write!(
308 |                 f,
309 |                 "{}{}",
310 |                 Component::UncapturedParenthesizedExpression(
311 |                     expr1.to_string(),
312 |                     is_verbose_mode_enabled,
313 |                     false
314 |                 )
315 |                 .to_repr(is_output_colorized),
316 |                 Component::Quantifier(quantifier.clone(), is_verbose_mode_enabled)
317 |                     .to_repr(is_output_colorized)
318 |             )
319 |         }
320 |     } else {
321 |         write!(
322 |             f,
323 |             "{}{}",
324 |             expr1,
325 |             Component::Quantifier(quantifier.clone(), is_verbose_mode_enabled)
326 |                 .to_repr(is_output_colorized)
327 |         )
328 |     }
329 | }
330 | 


--------------------------------------------------------------------------------
/src/grapheme.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::component::Component;
 18 | use itertools::Itertools;
 19 | use std::fmt::{Display, Formatter, Result};
 20 | 
 21 | const CHARS_TO_ESCAPE: [&str; 14] = [
 22 |     "(", ")", "[", "]", "{", "}", "+", "*", "-", ".", "?", "|", "^", "$",
 23 | ];
 24 | 
 25 | const CHAR_CLASSES: [&str; 6] = ["\\d", "\\s", "\\w", "\\D", "\\S", "\\W"];
 26 | 
 27 | #[derive(Clone, Debug, Hash, Ord, PartialOrd, Eq, PartialEq)]
 28 | pub struct Grapheme {
 29 |     pub(crate) chars: Vec<String>,
 30 |     pub(crate) repetitions: Vec<Grapheme>,
 31 |     min: u32,
 32 |     max: u32,
 33 |     is_capturing_group_enabled: bool,
 34 |     is_output_colorized: bool,
 35 |     is_verbose_mode_enabled: bool,
 36 | }
 37 | 
 38 | impl Grapheme {
 39 |     pub(crate) fn from(
 40 |         s: &str,
 41 |         is_capturing_group_enabled: bool,
 42 |         is_output_colorized: bool,
 43 |         is_verbose_mode_enabled: bool,
 44 |     ) -> Self {
 45 |         Self {
 46 |             chars: vec![s.to_string()],
 47 |             repetitions: vec![],
 48 |             min: 1,
 49 |             max: 1,
 50 |             is_capturing_group_enabled,
 51 |             is_output_colorized,
 52 |             is_verbose_mode_enabled,
 53 |         }
 54 |     }
 55 | 
 56 |     pub(crate) fn new(
 57 |         chars: Vec<String>,
 58 |         min: u32,
 59 |         max: u32,
 60 |         is_capturing_group_enabled: bool,
 61 |         is_output_colorized: bool,
 62 |         is_verbose_mode_enabled: bool,
 63 |     ) -> Self {
 64 |         Self {
 65 |             chars,
 66 |             repetitions: vec![],
 67 |             min,
 68 |             max,
 69 |             is_capturing_group_enabled,
 70 |             is_output_colorized,
 71 |             is_verbose_mode_enabled,
 72 |         }
 73 |     }
 74 | 
 75 |     pub(crate) fn value(&self) -> String {
 76 |         self.chars.join("")
 77 |     }
 78 | 
 79 |     pub(crate) fn chars(&self) -> &Vec<String> {
 80 |         &self.chars
 81 |     }
 82 | 
 83 |     pub(crate) fn chars_mut(&mut self) -> &mut Vec<String> {
 84 |         &mut self.chars
 85 |     }
 86 | 
 87 |     pub(crate) fn has_repetitions(&self) -> bool {
 88 |         !self.repetitions.is_empty()
 89 |     }
 90 | 
 91 |     pub(crate) fn repetitions_mut(&mut self) -> &mut Vec<Grapheme> {
 92 |         &mut self.repetitions
 93 |     }
 94 | 
 95 |     pub(crate) fn minimum(&self) -> u32 {
 96 |         self.min
 97 |     }
 98 | 
 99 |     pub(crate) fn maximum(&self) -> u32 {
100 |         self.max
101 |     }
102 | 
103 |     pub(crate) fn char_count(&self, is_non_ascii_char_escaped: bool) -> usize {
104 |         if is_non_ascii_char_escaped {
105 |             self.chars
106 |                 .iter()
107 |                 .map(|it| it.chars().map(|c| self.escape(c, false)).join(""))
108 |                 .join("")
109 |                 .chars()
110 |                 .count()
111 |         } else {
112 |             self.chars.iter().map(|it| it.chars().count()).sum()
113 |         }
114 |     }
115 | 
116 |     pub(crate) fn escape_non_ascii_chars(&mut self, use_surrogate_pairs: bool) {
117 |         self.chars = self
118 |             .chars
119 |             .iter()
120 |             .map(|it| {
121 |                 it.chars()
122 |                     .map(|c| self.escape(c, use_surrogate_pairs))
123 |                     .join("")
124 |             })
125 |             .collect_vec();
126 |     }
127 | 
128 |     pub(crate) fn escape_regexp_symbols(
129 |         &mut self,
130 |         is_non_ascii_char_escaped: bool,
131 |         is_astral_code_point_converted_to_surrogate: bool,
132 |     ) {
133 |         let characters = self.chars_mut();
134 | 
135 |         #[allow(clippy::needless_range_loop)]
136 |         for i in 0..characters.len() {
137 |             let mut character = characters[i].clone();
138 | 
139 |             for char_to_escape in CHARS_TO_ESCAPE.iter() {
140 |                 character =
141 |                     character.replace(char_to_escape, &format!("{}{}", "\\", char_to_escape));
142 |             }
143 | 
144 |             character = character
145 |                 .replace('\n', "\\n")
146 |                 .replace('\r', "\\r")
147 |                 .replace('\t', "\\t");
148 | 
149 |             if character == "\\" {
150 |                 character = "\\\\".to_string();
151 |             }
152 | 
153 |             characters[i] = character;
154 |         }
155 | 
156 |         if is_non_ascii_char_escaped {
157 |             self.escape_non_ascii_chars(is_astral_code_point_converted_to_surrogate);
158 |         }
159 |     }
160 | 
161 |     fn escape(&self, c: char, use_surrogate_pairs: bool) -> String {
162 |         if c.is_ascii() {
163 |             c.to_string()
164 |         } else if use_surrogate_pairs && ('\u{10000}'..'\u{10ffff}').contains(&c) {
165 |             self.convert_to_surrogate_pair(c)
166 |         } else {
167 |             c.escape_unicode().to_string()
168 |         }
169 |     }
170 | 
171 |     fn convert_to_surrogate_pair(&self, c: char) -> String {
172 |         c.encode_utf16(&mut [0; 2])
173 |             .iter()
174 |             .map(|it| format!("\\u{{{:x}}}", it))
175 |             .join("")
176 |     }
177 | }
178 | 
179 | impl Display for Grapheme {
180 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
181 |         let is_single_char = self.char_count(false) == 1
182 |             || (self.chars.len() == 1 && self.chars[0].matches('\\').count() == 1);
183 |         let is_range = self.min < self.max;
184 |         let is_repetition = self.min > 1;
185 |         let mut value = if self.repetitions.is_empty() {
186 |             self.value()
187 |         } else {
188 |             self.repetitions.iter().map(|it| it.to_string()).join("")
189 |         };
190 |         value = Component::CharClass(value.clone())
191 |             .to_repr(self.is_output_colorized && CHAR_CLASSES.contains(&&*value));
192 | 
193 |         if !is_range && is_repetition && is_single_char {
194 |             write!(
195 |                 f,
196 |                 "{}{}",
197 |                 value,
198 |                 Component::Repetition(self.min, false).to_repr(self.is_output_colorized)
199 |             )
200 |         } else if !is_range && is_repetition && !is_single_char {
201 |             write!(
202 |                 f,
203 |                 "{}{}",
204 |                 if self.is_capturing_group_enabled {
205 |                     Component::CapturedParenthesizedExpression(
206 |                         value,
207 |                         self.is_verbose_mode_enabled,
208 |                         false,
209 |                     )
210 |                     .to_repr(self.is_output_colorized)
211 |                 } else {
212 |                     Component::UncapturedParenthesizedExpression(
213 |                         value,
214 |                         self.is_verbose_mode_enabled,
215 |                         false,
216 |                     )
217 |                     .to_repr(self.is_output_colorized)
218 |                 },
219 |                 Component::Repetition(self.min, self.is_verbose_mode_enabled)
220 |                     .to_repr(self.is_output_colorized)
221 |             )
222 |         } else if is_range && is_single_char {
223 |             write!(
224 |                 f,
225 |                 "{}{}",
226 |                 value,
227 |                 Component::RepetitionRange(self.min, self.max, false)
228 |                     .to_repr(self.is_output_colorized)
229 |             )
230 |         } else if is_range && !is_single_char {
231 |             write!(
232 |                 f,
233 |                 "{}{}",
234 |                 if self.is_capturing_group_enabled {
235 |                     Component::CapturedParenthesizedExpression(
236 |                         value,
237 |                         self.is_verbose_mode_enabled,
238 |                         false,
239 |                     )
240 |                     .to_repr(self.is_output_colorized)
241 |                 } else {
242 |                     Component::UncapturedParenthesizedExpression(
243 |                         value,
244 |                         self.is_verbose_mode_enabled,
245 |                         false,
246 |                     )
247 |                     .to_repr(self.is_output_colorized)
248 |                 },
249 |                 Component::RepetitionRange(self.min, self.max, self.is_verbose_mode_enabled)
250 |                     .to_repr(self.is_output_colorized)
251 |             )
252 |         } else {
253 |             write!(f, "{}", value)
254 |         }
255 |     }
256 | }
257 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | //! ## 1. What does this tool do?
 18 | //!
 19 | //! *grex* is a library as well as a command-line utility that is meant to simplify the often
 20 | //! complicated and tedious task of creating regular expressions. It does so by automatically
 21 | //! generating a single regular expression from user-provided test cases. The resulting
 22 | //! expression is guaranteed to match the test cases which it was generated from.
 23 | //!
 24 | //! This project has started as a Rust port of the JavaScript tool
 25 | //! [*regexgen*](https://github.com/devongovett/regexgen) written by
 26 | //! [Devon Govett](https://github.com/devongovett). Although a lot of further useful features
 27 | //! could be added to it, its development was apparently ceased several years ago. The plan
 28 | //! is now to add these new features to *grex* as Rust really shines when it comes to
 29 | //! command-line tools. *grex* offers all features that *regexgen* provides, and more.
 30 | //!
 31 | //! The philosophy of this project is to generate the most specific regular expression
 32 | //! possible by default which exactly matches the given input only and nothing else.
 33 | //! With the use of command-line flags (in the CLI tool) or preprocessing methods
 34 | //! (in the library), more generalized expressions can be created.
 35 | //!
 36 | //! The produced expressions are [Perl-compatible regular expressions](https://www.pcre.org)
 37 | //! which are also compatible with the regular expression parser in Rust's
 38 | //! [*regex crate*](https://crates.io/crates/regex).
 39 | //! Other regular expression parsers or respective libraries from other programming languages
 40 | //! have not been tested so far, but they ought to be mostly compatible as well.
 41 | //!
 42 | //! ## 2. Do I still need to learn to write regexes then?
 43 | //!
 44 | //! **Definitely, yes!** Using the standard settings, *grex* produces a regular expression that
 45 | //! is guaranteed to match only the test cases given as input and nothing else. This has been
 46 | //! verified by [property tests](https://github.com/pemistahl/grex/blob/main/tests/property_tests.rs).
 47 | //! However, if the conversion to shorthand character classes such as `\w` is enabled, the
 48 | //! resulting regex matches a much wider scope of test cases. Knowledge about the consequences of
 49 | //! this conversion is essential for finding a correct regular expression for your business domain.
 50 | //!
 51 | //! *grex* uses an algorithm that tries to find the shortest possible regex for the given test cases.
 52 | //! Very often though, the resulting expression is still longer or more complex than it needs to be.
 53 | //! In such cases, a more compact or elegant regex can be created only by hand.
 54 | //! Also, every regular expression engine has different built-in optimizations.
 55 | //! *grex* does not know anything about those and therefore cannot optimize its regexes
 56 | //! for a specific engine.
 57 | //!
 58 | //! **So, please learn how to write regular expressions!** The currently best use case for *grex*
 59 | //! is to find an initial correct regex which should be inspected by hand if further optimizations
 60 | //! are possible.
 61 | //!
 62 | //! ## 3. Current features
 63 | //!
 64 | //! - literals
 65 | //! - character classes
 66 | //! - detection of common prefixes and suffixes
 67 | //! - detection of repeated substrings and conversion to `{min,max}` quantifier notation
 68 | //! - alternation using `|` operator
 69 | //! - optionality using `?` quantifier
 70 | //! - escaping of non-ascii characters, with optional conversion of astral code points to surrogate pairs
 71 | //! - case-sensitive or case-insensitive matching
 72 | //! - capturing or non-capturing groups
 73 | //! - optional anchors `^` and `$`
 74 | //! - fully compliant with [Unicode Standard 15.0](https://unicode.org/versions/Unicode15.0.0)
 75 | //! - fully compatible with [*regex* crate 1.9.0+](https://crates.io/crates/regex)
 76 | //! - correctly handles graphemes consisting of multiple Unicode symbols
 77 | //! - reads input strings from the command-line or from a file
 78 | //! - produces more readable expressions indented on multiple lines using optional verbose mode
 79 | //!
 80 | //! ## 4. How to use?
 81 | //!
 82 | //! The code snippets below show how to use the public api.
 83 | //!
 84 | //! For [more detailed examples](https://github.com/pemistahl/grex/tree/main#53-examples), please
 85 | //! take a look at the project's readme file on GitHub.
 86 | //!
 87 | //! ### 4.1 Default settings
 88 | //!
 89 | //! Test cases are passed either from a collection via [`RegExpBuilder::from()`]
 90 | //! or from a file via [`RegExpBuilder::from_file()`].
 91 | //!
 92 | //! ```
 93 | //! use grex::RegExpBuilder;
 94 | //!
 95 | //! let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]).build();
 96 | //! assert_eq!(regexp, "^a(?:aa?)?$");
 97 | //! ```
 98 | //!
 99 | //! ### 4.2 Convert to character classes
100 | //!
101 | //! ```
102 | //! use grex::RegExpBuilder;
103 | //!
104 | //! let regexp = RegExpBuilder::from(&["a", "aa", "123"])
105 | //!     .with_conversion_of_digits()
106 | //!     .with_conversion_of_words()
107 | //!     .build();
108 | //! assert_eq!(regexp, "^(?:\\d\\d\\d|\\w(?:\\w)?)$");
109 | //! ```
110 | //!
111 | //! ### 4.3 Convert repeated substrings
112 | //!
113 | //! ```
114 | //! use grex::RegExpBuilder;
115 | //!
116 | //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"])
117 | //!     .with_conversion_of_repetitions()
118 | //!     .build();
119 | //! assert_eq!(regexp, "^(?:a{2}|(?:bc){2}|(?:def){3})$");
120 | //! ```
121 | //!
122 | //! By default, *grex* converts each substring this way which is at least a single character long
123 | //! and which is subsequently repeated at least once. You can customize these two parameters
124 | //! if you like.
125 | //!
126 | //! In the following example, the test case `aa` is not converted to `a{2}` because the repeated
127 | //! substring `a` has a length of 1, but the minimum substring length has been set to 2.
128 | //!
129 | //! ```
130 | //! use grex::RegExpBuilder;
131 | //!
132 | //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"])
133 | //!     .with_conversion_of_repetitions()
134 | //!     .with_minimum_substring_length(2)
135 | //!     .build();
136 | //! assert_eq!(regexp, "^(?:aa|(?:bc){2}|(?:def){3})$");
137 | //! ```
138 | //!
139 | //! Setting a minimum number of 2 repetitions in the next example, only the test case `defdefdef`
140 | //! will be converted because it is the only one that is repeated twice.
141 | //!
142 | //! ```
143 | //! use grex::RegExpBuilder;
144 | //!
145 | //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"])
146 | //!     .with_conversion_of_repetitions()
147 | //!     .with_minimum_repetitions(2)
148 | //!     .build();
149 | //! assert_eq!(regexp, "^(?:bcbc|aa|(?:def){3})$");
150 | //! ```
151 | //!
152 | //! ### 4.4 Escape non-ascii characters
153 | //!
154 | //! ```
155 | //! use grex::RegExpBuilder;
156 | //!
157 | //! let regexp = RegExpBuilder::from(&["You smell like 💩."])
158 | //!     .with_escaping_of_non_ascii_chars(false)
159 | //!     .build();
160 | //! assert_eq!(regexp, "^You smell like \\u{1f4a9}\\.$");
161 | //! ```
162 | //!
163 | //! Old versions of JavaScript do not support unicode escape sequences for
164 | //! the astral code planes (range `U+010000` to `U+10FFFF`). In order to
165 | //! support these symbols in JavaScript regular expressions, the conversion
166 | //! to surrogate pairs is necessary. More information on that matter can be
167 | //! found [here](https://mathiasbynens.be/notes/javascript-unicode).
168 | //!
169 | //! ```
170 | //! use grex::RegExpBuilder;
171 | //!
172 | //! let regexp = RegExpBuilder::from(&["You smell like 💩."])
173 | //!     .with_escaping_of_non_ascii_chars(true)
174 | //!     .build();
175 | //! assert_eq!(regexp, "^You smell like \\u{d83d}\\u{dca9}\\.$");
176 | //! ```
177 | //!
178 | //! ### 4.5 Case-insensitive matching
179 | //!
180 | //! The regular expressions that *grex* generates are case-sensitive by default.
181 | //! Case-insensitive matching can be enabled like so:
182 | //!
183 | //! ```
184 | //! use grex::RegExpBuilder;
185 | //!
186 | //! let regexp = RegExpBuilder::from(&["big", "BIGGER"])
187 | //!     .with_case_insensitive_matching()
188 | //!     .build();
189 | //! assert_eq!(regexp, "(?i)^big(?:ger)?$");
190 | //! ```
191 | //!
192 | //! ### 4.6 Capturing Groups
193 | //!
194 | //! Non-capturing groups are used by default.
195 | //! Extending the previous example, you can switch to capturing groups instead.
196 | //!
197 | //! ```
198 | //! use grex::RegExpBuilder;
199 | //!
200 | //! let regexp = RegExpBuilder::from(&["big", "BIGGER"])
201 | //!     .with_case_insensitive_matching()
202 | //!     .with_capturing_groups()
203 | //!     .build();
204 | //! assert_eq!(regexp, "(?i)^big(ger)?$");
205 | //! ```
206 | //!
207 | //! ### 4.7 Verbose mode
208 | //!
209 | //! If you find the generated regular expression hard to read, you can enable verbose mode.
210 | //! The expression is then put on multiple lines and indented to make it more pleasant to the eyes.
211 | //!
212 | //! ```
213 | //! use grex::RegExpBuilder;
214 | //! use indoc::indoc;
215 | //!
216 | //! let regexp = RegExpBuilder::from(&["a", "b", "bcd"])
217 | //!     .with_verbose_mode()
218 | //!     .build();
219 | //!
220 | //! assert_eq!(regexp, indoc!(
221 | //!     r#"
222 | //!     (?x)
223 | //!     ^
224 | //!       (?:
225 | //!         b
226 | //!         (?:
227 | //!           cd
228 | //!         )?
229 | //!         |
230 | //!         a
231 | //!       )
232 | //!     $"#
233 | //! ));
234 | //! ```
235 | //!
236 | //! ### 4.8 Disable anchors
237 | //!
238 | //! By default, the anchors `^` and `$` are put around every generated regular expression in order
239 | //! to ensure that it matches only the test cases given as input. Often enough, however, it is
240 | //! desired to use the generated pattern as part of a larger one. For this purpose, the anchors
241 | //! can be disabled, either separately or both of them.
242 | //!
243 | //! ```
244 | //! use grex::RegExpBuilder;
245 | //!
246 | //! let regexp = RegExpBuilder::from(&["a", "aa", "aaa"])
247 | //!     .without_anchors()
248 | //!     .build();
249 | //! assert_eq!(regexp, "a(?:aa?)?");
250 | //! ```
251 | //!
252 | //! ## 5. How does it work?
253 | //!
254 | //! 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA)
255 | //! is created from the input strings.
256 | //!
257 | //! 2. The number of states and transitions between states in the DFA is reduced by applying
258 | //! [Hopcroft's DFA minimization algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm).
259 | //!
260 | //! 3. The minimized DFA is expressed as a system of linear equations which are solved with
261 | //! [Brzozowski's algebraic method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392),
262 | //! resulting in the final regular expression.
263 | 
264 | #[macro_use]
265 | mod macros;
266 | 
267 | mod builder;
268 | mod cluster;
269 | mod component;
270 | mod config;
271 | mod dfa;
272 | mod expression;
273 | mod format;
274 | mod grapheme;
275 | mod quantifier;
276 | mod regexp;
277 | mod substring;
278 | mod unicode_tables;
279 | 
280 | #[cfg(feature = "python")]
281 | mod python;
282 | 
283 | #[cfg(target_family = "wasm")]
284 | mod wasm;
285 | 
286 | pub use builder::RegExpBuilder;
287 | 
288 | #[cfg(target_family = "wasm")]
289 | pub use wasm::RegExpBuilder as WasmRegExpBuilder;
290 | 


--------------------------------------------------------------------------------
/src/macros.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | macro_rules! btreeset { // Builds a `std::collections::BTreeSet` from comma-separated expressions.
18 |     ( $( $value: expr ),* ) => {{ // zero or more expressions; a trailing comma is not accepted
19 |         let mut set = std::collections::BTreeSet::new();
20 |         $( set.insert($value); )* // expands to one `insert` call per expression, in written order
21 |         set
22 |     }};
23 | }
24 | 


--------------------------------------------------------------------------------
/src/python.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::builder::{
 18 |     RegExpBuilder, MINIMUM_REPETITIONS_MESSAGE, MINIMUM_SUBSTRING_LENGTH_MESSAGE,
 19 |     MISSING_TEST_CASES_MESSAGE,
 20 | };
 21 | use crate::config::RegExpConfig;
 22 | use lazy_static::lazy_static;
 23 | use pyo3::exceptions::PyValueError;
 24 | use pyo3::prelude::*;
 25 | use pyo3::types::PyType;
 26 | use regex::{Captures, Regex};
 27 | 
 28 | // Module initializer: its only job is to expose `RegExpBuilder` to Python.
 29 | #[pymodule]
 30 | fn grex(m: &Bound<'_, PyModule>) -> PyResult<()> {
 31 |     m.add_class::<RegExpBuilder>()
 32 | }
 33 | 
 34 | #[pymethods]
 35 | impl RegExpBuilder {
 36 |     #[new]
 37 |     fn new(test_cases: Vec<String>) -> PyResult<Self> { // constructor exposed to Python via `#[new]`; an empty list yields a `ValueError`
 38 |         if test_cases.is_empty() {
 39 |             Err(PyValueError::new_err(MISSING_TEST_CASES_MESSAGE))
 40 |         } else {
 41 |             Ok(Self {
 42 |                 test_cases,
 43 |                 config: RegExpConfig::new(),
 44 |             })
 45 |         }
 46 |     }
 47 | 
 48 |     /// Specify the test cases to build the regular expression from.
 49 |     ///
 50 |     /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
 51 |     ///
 52 |     /// Args:
 53 |     ///     test_cases (list[str]): The list of test cases
 54 |     ///
 55 |     /// Raises:
 56 |     ///     ValueError: if `test_cases` is empty
 57 |     #[classmethod]
 58 |     fn from_test_cases(_cls: &Bound<PyType>, test_cases: Vec<String>) -> PyResult<Self> {
 59 |         Self::new(test_cases)
 60 |     }
 61 | 
 62 |     /// Convert any Unicode decimal digit to character class `\d`.
 63 |     ///
 64 |     /// This method takes precedence over `with_conversion_of_words` if both are set.
 65 |     /// Decimal digits are converted to `\d`, the remaining word characters to `\w`.
 66 |     ///
 67 |     /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
 68 |     /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
 69 |     #[pyo3(name = "with_conversion_of_digits")]
 70 |     fn py_with_conversion_of_digits(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
 71 |         self_.config.is_digit_converted = true;
 72 |         self_
 73 |     }
 74 | 
 75 |     /// Convert any character which is not a Unicode decimal digit to character class `\D`.
 76 |     ///
 77 |     /// This method takes precedence over `with_conversion_of_non_words` if both are set.
 78 |     /// Non-digits which are also non-word characters are converted to `\D`.
 79 |     ///
 80 |     /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
 81 |     /// Non-digits which are also non-space characters are converted to `\D`.
 82 |     #[pyo3(name = "with_conversion_of_non_digits")]
 83 |     fn py_with_conversion_of_non_digits(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
 84 |         self_.config.is_non_digit_converted = true;
 85 |         self_
 86 |     }
 87 | 
 88 |     /// Convert any Unicode whitespace character to character class `\s`.
 89 |     ///
 90 |     /// This method takes precedence over `with_conversion_of_non_digits` if both are set.
 91 |     /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
 92 |     ///
 93 |     /// This method takes precedence over `with_conversion_of_non_words` if both are set.
 94 |     /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
 95 |     #[pyo3(name = "with_conversion_of_whitespace")]
 96 |     fn py_with_conversion_of_whitespace(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
 97 |         self_.config.is_space_converted = true;
 98 |         self_
 99 |     }
100 | 
101 |     /// Convert any character which is not a Unicode whitespace character to character class `\S`.
102 |     #[pyo3(name = "with_conversion_of_non_whitespace")]
103 |     fn py_with_conversion_of_non_whitespace(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
104 |         self_.config.is_non_space_converted = true;
105 |         self_
106 |     }
107 | 
108 |     /// Convert any Unicode word character to character class `\w`.
109 |     ///
110 |     /// This method takes precedence over `with_conversion_of_non_digits` if both are set.
111 |     /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
112 |     ///
113 |     /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
114 |     /// Word characters are converted to `\w`, the remaining non-space characters to `\S`.
115 |     #[pyo3(name = "with_conversion_of_words")]
116 |     fn py_with_conversion_of_words(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
117 |         self_.config.is_word_converted = true;
118 |         self_
119 |     }
120 | 
121 |     /// Convert any character which is not a Unicode word character to character class `\W`.
122 |     ///
123 |     /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set.
124 |     /// Non-words which are also non-space characters are converted to `\W`.
125 |     #[pyo3(name = "with_conversion_of_non_words")]
126 |     fn py_with_conversion_of_non_words(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
127 |         self_.config.is_non_word_converted = true;
128 |         self_
129 |     }
130 | 
131 |     /// Detect repeated non-overlapping substrings and convert them to `{min,max}` quantifier notation.
132 |     #[pyo3(name = "with_conversion_of_repetitions")]
133 |     fn py_with_conversion_of_repetitions(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
134 |         self_.config.is_repetition_converted = true;
135 |         self_
136 |     }
137 | 
138 |     /// Enable case-insensitive matching of test cases so that letters match both upper and lower case.
139 |     #[pyo3(name = "with_case_insensitive_matching")]
140 |     fn py_with_case_insensitive_matching(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
141 |         self_.config.is_case_insensitive_matching = true;
142 |         self_
143 |     }
144 | 
145 |     /// Replace non-capturing groups by capturing ones.
146 |     #[pyo3(name = "with_capturing_groups")]
147 |     fn py_with_capturing_groups(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
148 |         self_.config.is_capturing_group_enabled = true;
149 |         self_
150 |     }
151 | 
152 |     /// Specify the minimum quantity of substring repetitions to be converted if `with_conversion_of_repetitions` is set.
153 |     ///
154 |     /// If the quantity is not explicitly set with this method, a default value of 1 will be used.
155 |     ///
156 |     /// Args:
157 |     ///     quantity (int): The minimum quantity of substring repetitions
158 |     ///
159 |     /// Raises:
160 |     ///     ValueError: if `quantity` is zero or negative
161 |     #[pyo3(name = "with_minimum_repetitions")]
162 |     fn py_with_minimum_repetitions(
163 |         mut self_: PyRefMut<Self>,
164 |         quantity: i32,
165 |     ) -> PyResult<PyRefMut<Self>> {
166 |         if quantity <= 0 { // negative values are rejected too, so the cast to `u32` below cannot wrap
167 |             Err(PyValueError::new_err(MINIMUM_REPETITIONS_MESSAGE))
168 |         } else {
169 |             self_.config.minimum_repetitions = quantity as u32;
170 |             Ok(self_)
171 |         }
172 |     }
173 | 
174 |     /// Specify the minimum length a repeated substring must have in order to be converted if `with_conversion_of_repetitions` is set.
175 |     ///
176 |     /// If the length is not explicitly set with this method, a default value of 1 will be used.
177 |     ///
178 |     /// Args:
179 |     ///     length (int): The minimum substring length
180 |     ///
181 |     /// Raises:
182 |     ///     ValueError: if `length` is zero or negative
183 |     #[pyo3(name = "with_minimum_substring_length")]
184 |     fn py_with_minimum_substring_length(
185 |         mut self_: PyRefMut<Self>,
186 |         length: i32,
187 |     ) -> PyResult<PyRefMut<Self>> {
188 |         if length <= 0 { // negative values are rejected too, so the cast to `u32` below cannot wrap
189 |             Err(PyValueError::new_err(MINIMUM_SUBSTRING_LENGTH_MESSAGE))
190 |         } else {
191 |             self_.config.minimum_substring_length = length as u32;
192 |             Ok(self_)
193 |         }
194 |     }
195 | 
196 |     /// Convert non-ASCII characters to unicode escape sequences.
197 |     ///
198 |     /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes
199 |     /// (range `U+010000` to `U+10FFFF`) to surrogate pairs.
200 |     ///
201 |     /// Args:
202 |     ///     use_surrogate_pairs (bool): Whether to convert astral code planes to surrogate pairs
203 |     #[pyo3(name = "with_escaping_of_non_ascii_chars")]
204 |     fn py_with_escaping_of_non_ascii_chars(
205 |         mut self_: PyRefMut<Self>,
206 |         use_surrogate_pairs: bool,
207 |     ) -> PyRefMut<Self> {
208 |         self_.config.is_non_ascii_char_escaped = true;
209 |         self_.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs;
210 |         self_
211 |     }
212 | 
213 |     /// Produce a nicer looking regular expression in verbose mode.
214 |     #[pyo3(name = "with_verbose_mode")]
215 |     fn py_with_verbose_mode(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
216 |         self_.config.is_verbose_mode_enabled = true;
217 |         self_
218 |     }
219 | 
220 |     /// Remove the caret anchor '^' from the resulting regular expression, thereby allowing to
221 |     /// match the test cases also when they do not occur at the start of a string.
222 |     #[pyo3(name = "without_start_anchor")]
223 |     fn py_without_start_anchor(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
224 |         self_.config.is_start_anchor_disabled = true;
225 |         self_
226 |     }
227 | 
228 |     /// Remove the dollar sign anchor '$' from the resulting regular expression, thereby allowing
229 |     /// to match the test cases also when they do not occur at the end of a string.
230 |     #[pyo3(name = "without_end_anchor")]
231 |     fn py_without_end_anchor(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
232 |         self_.config.is_end_anchor_disabled = true;
233 |         self_
234 |     }
235 | 
236 |     /// Remove the caret and dollar sign anchors from the resulting regular expression, thereby
237 |     /// allowing to match the test cases also when they occur within a larger string that contains
238 |     /// other content as well.
239 |     #[pyo3(name = "without_anchors")]
240 |     fn py_without_anchors(mut self_: PyRefMut<Self>) -> PyRefMut<Self> {
241 |         self_.config.is_start_anchor_disabled = true;
242 |         self_.config.is_end_anchor_disabled = true;
243 |         self_
244 |     }
245 | 
246 |     /// Build the actual regular expression using the previously given settings.
247 |     #[pyo3(name = "build")]
248 |     fn py_build(&mut self) -> String {
249 |         let regexp = self.build();
250 |         if self.config.is_non_ascii_char_escaped { // Rust-style `\u{...}` escapes are rewritten into Python escape syntax
251 |             replace_unicode_escape_sequences(regexp)
252 |         } else {
253 |             regexp
254 |         }
255 |     }
256 | }
257 | 
258 | /// Replaces Rust Unicode escape sequences with Python Unicode escape sequences.
259 | ///
260 | /// Rust's `\u{...}` holds 1 to 6 hex digits; Python needs `\uXXXX` or `\UXXXXXXXX` (zero-padded).
261 | fn replace_unicode_escape_sequences(regexp: String) -> String {
262 |     lazy_static! {
263 |         static ref ESCAPE_SEQUENCE: Regex = Regex::new(r"\\u\{([0-9a-f]{1,6})\}").unwrap();
264 |     }
265 |     ESCAPE_SEQUENCE
266 |         .replace_all(&regexp, |caps: &Captures| {
267 |             let digits = &caps[1];
268 |             if digits.len() <= 4 {
269 |                 format!("\\u{:0>4}", digits)
270 |             } else {
271 |                 format!("\\U{:0>8}", digits)
272 |             }
273 |         })
274 |         .to_string()
275 | }
276 | 


--------------------------------------------------------------------------------
/src/quantifier.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | use std::fmt::{Display, Formatter, Result};
18 | 
19 | /// Quantifiers that are displayed as their regular expression symbol.
20 | #[derive(Clone, Debug, Eq, PartialEq)]
21 | pub enum Quantifier {
22 |     /// Displayed as `*`.
23 |     KleeneStar,
24 |     /// Displayed as `?`.
25 |     QuestionMark,
26 | }
27 | 
28 | impl Display for Quantifier {
29 |     /// Writes the one-character symbol of this quantifier to `f`.
30 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
31 |         f.write_str(match self {
32 |             Quantifier::KleeneStar => "*",
33 |             Quantifier::QuestionMark => "?",
34 |         })
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/regexp.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | use crate::cluster::GraphemeCluster;
 18 | use crate::component::Component;
 19 | use crate::config::RegExpConfig;
 20 | use crate::dfa::Dfa;
 21 | use crate::expression::Expression;
 22 | use itertools::Itertools;
 23 | use regex::Regex;
 24 | use std::cmp::Ordering;
 25 | use std::fmt::{Display, Formatter, Result};
 26 | 
 27 | pub struct RegExp<'a> { // Pairs an expression tree with the configuration it was built with.
 28 |     ast: Expression<'a>, // expression assembled from the test cases in `RegExp::from`
 29 |     config: &'a RegExpConfig, // borrowed settings handed to `RegExp::from`
 30 | }
 31 | 
 32 | impl<'a> RegExp<'a> {
 33 |     pub(crate) fn from(test_cases: &'a mut Vec<String>, config: &'a RegExpConfig) -> Self { // Builds the expression tree for `test_cases`.
 34 |         if config.is_case_insensitive_matching {
 35 |             Self::convert_for_case_insensitive_matching(test_cases);
 36 |         }
 37 |         Self::sort(test_cases);
 38 |         let grapheme_clusters = Self::grapheme_clusters(test_cases, config);
 39 |         let mut dfa = Dfa::from(&grapheme_clusters, true, config);
 40 |         let mut ast = Expression::from(dfa, config);
 41 |         // The regex is only re-checked against the test cases when both anchors are disabled.
 42 |         if config.is_start_anchor_disabled && config.is_end_anchor_disabled {
 43 |             let mut regex = Self::convert_expr_to_regex(&ast, config);
 44 | 
 45 |             if config.is_verbose_mode_enabled {
 46 |                 // Remove line breaks before checking matches, otherwise check will be incorrect.
 47 |                 regex = Regex::new(&regex.to_string().replace('\n', "")).unwrap();
 48 |             }
 49 |             // Retry with the DFA flag switched to `false` if the rotated-alternations check fails.
 50 |             if !Self::is_each_test_case_matched_after_rotating_alternations(
 51 |                 &regex, &mut ast, test_cases,
 52 |             ) {
 53 |                 dfa = Dfa::from(&grapheme_clusters, false, config);
 54 |                 ast = Expression::from(dfa, config);
 55 |                 regex = Self::convert_expr_to_regex(&ast, config);
 56 |                 // Last resort: an alternation holding one literal per grapheme cluster.
 57 |                 if !Self::regex_matches_all_test_cases(&regex, test_cases) {
 58 |                     let mut exprs = vec![];
 59 |                     for cluster in grapheme_clusters {
 60 |                         let literal = Expression::new_literal(cluster, config);
 61 |                         exprs.push(literal);
 62 |                     }
 63 |                     ast = Expression::new_alternation(exprs, config);
 64 |                 }
 65 |             }
 66 |         }
 67 | 
 68 |         Self { ast, config }
 69 |     }
 70 | 
 71 |     fn convert_for_case_insensitive_matching(test_cases: &mut Vec<String>) {
 72 |         // Convert only those test cases to lowercase if
 73 |         // they keep their original number of characters.
 74 |         // Otherwise, "İ" -> "i\u{307}" would not match "İ".
 75 |         *test_cases = test_cases
 76 |             .iter()
 77 |             .map(|it| {
 78 |                 let lower_test_case = it.to_lowercase();
 79 |                 if lower_test_case.chars().count() == it.chars().count() {
 80 |                     lower_test_case
 81 |                 } else {
 82 |                     it.to_string()
 83 |                 }
 84 |             })
 85 |             .collect_vec();
 86 |     }
 87 | 
 88 |     fn convert_expr_to_regex(expr: &Expression, config: &RegExpConfig) -> Regex {
 89 |         if config.is_output_colorized {
 90 |             let color_replace_regex = Regex::new("\u{1b}\\[(?:\\d+;\\d+|0)m").unwrap();
 91 |             Regex::new(&color_replace_regex.replace_all(&expr.to_string(), "")).unwrap()
 92 |         } else {
 93 |             Regex::new(&expr.to_string()).unwrap()
 94 |         }
 95 |     }
 96 | 
 97 |     fn regex_matches_all_test_cases(regex: &Regex, test_cases: &[String]) -> bool {
 98 |         test_cases
 99 |             .iter()
100 |             .all(|test_case| regex.find_iter(test_case).count() == 1)
101 |     }
102 | 
103 |     fn sort(test_cases: &mut Vec<String>) {
104 |         test_cases.sort();
105 |         test_cases.dedup();
106 |         test_cases.sort_by(|a, b| match a.len().cmp(&b.len()) {
107 |             Ordering::Equal => a.cmp(b),
108 |             other => other,
109 |         });
110 |     }
111 | 
112 |     fn grapheme_clusters(
113 |         test_cases: &'a [String],
114 |         config: &'a RegExpConfig,
115 |     ) -> Vec<GraphemeCluster<'a>> {
116 |         let mut clusters = test_cases
117 |             .iter()
118 |             .map(|it| GraphemeCluster::from(it, config))
119 |             .collect_vec();
120 | 
121 |         if config.is_char_class_feature_enabled() {
122 |             for cluster in clusters.iter_mut() {
123 |                 cluster.convert_to_char_classes();
124 |             }
125 |         }
126 | 
127 |         if config.is_repetition_converted {
128 |             for cluster in clusters.iter_mut() {
129 |                 cluster.convert_repetitions();
130 |             }
131 |         }
132 | 
133 |         clusters
134 |     }
135 | 
136 |     fn is_each_test_case_matched_after_rotating_alternations(
137 |         regex: &Regex,
138 |         expr: &mut Expression,
139 |         test_cases: &[String],
140 |     ) -> bool {
141 |         for _ in 1..test_cases.len() {
142 |             if Self::regex_matches_all_test_cases(regex, test_cases) {
143 |                 return true;
144 |             } else if let Expression::Alternation(options, _, _, _) = expr {
145 |                 options.rotate_right(1);
146 |             } else if let Expression::Concatenation(first, second, _, _, _) = expr {
147 |                 let a: &mut Expression = first;
148 |                 let b: &mut Expression = second;
149 | 
150 |                 if let Expression::Alternation(options, _, _, _) = a {
151 |                     options.rotate_right(1);
152 |                 } else if let Expression::Alternation(options, _, _, _) = b {
153 |                     options.rotate_right(1);
154 |                 }
155 |             }
156 |         }
157 |         false
158 |     }
159 | }
160 | 
161 | impl Display for RegExp<'_> {
162 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
163 |         let flag =
164 |             if self.config.is_case_insensitive_matching && self.config.is_verbose_mode_enabled {
165 |                 Component::IgnoreCaseAndVerboseModeFlag.to_repr(self.config.is_output_colorized)
166 |             } else if self.config.is_case_insensitive_matching {
167 |                 Component::IgnoreCaseFlag.to_repr(self.config.is_output_colorized)
168 |             } else if self.config.is_verbose_mode_enabled {
169 |                 Component::VerboseModeFlag.to_repr(self.config.is_output_colorized)
170 |             } else {
171 |                 String::new()
172 |             };
173 | 
174 |         let caret = if self.config.is_start_anchor_disabled {
175 |             String::new()
176 |         } else {
177 |             Component::Caret(self.config.is_verbose_mode_enabled)
178 |                 .to_repr(self.config.is_output_colorized)
179 |         };
180 | 
181 |         let dollar_sign = if self.config.is_end_anchor_disabled {
182 |             String::new()
183 |         } else {
184 |             Component::DollarSign(self.config.is_verbose_mode_enabled)
185 |                 .to_repr(self.config.is_output_colorized)
186 |         };
187 | 
188 |         let mut regexp = match self.ast {
189 |             Expression::Alternation(_, _, _, _) => {
190 |                 format!(
191 |                     "{}{}{}{}",
192 |                     flag,
193 |                     caret,
194 |                     if self.config.is_capturing_group_enabled {
195 |                         Component::CapturedParenthesizedExpression(
196 |                             self.ast.to_string(),
197 |                             self.config.is_verbose_mode_enabled,
198 |                             false,
199 |                         )
200 |                         .to_repr(self.config.is_output_colorized)
201 |                     } else {
202 |                         Component::UncapturedParenthesizedExpression(
203 |                             self.ast.to_string(),
204 |                             self.config.is_verbose_mode_enabled,
205 |                             false,
206 |                         )
207 |                         .to_repr(self.config.is_output_colorized)
208 |                     },
209 |                     dollar_sign
210 |                 )
211 |             }
212 |             _ => {
213 |                 format!("{}{}{}{}", flag, caret, self.ast, dollar_sign)
214 |             }
215 |         };
216 | 
217 |         regexp = regexp
218 |             .replace('\u{b}', "\\v") // U+000B Line Tabulation
219 |             .replace('\u{c}', "\\f"); // U+000C Form Feed
220 | 
221 |         if self.config.is_verbose_mode_enabled {
222 |             regexp = regexp
223 |                 .replace('#', "\\#")
224 |                 .replace(
225 |                     [
226 |                         ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u{85}', '\u{a0}', '\u{1680}',
227 |                         '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}',
228 |                         '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200a}', '\u{2028}',
229 |                         '\u{2029}', '\u{202f}', '\u{205f}', '\u{3000}',
230 |                     ],
231 |                     "\\s",
232 |                 )
233 |                 .replace(' ', "\\ ");
234 |         }
235 | 
236 |         write!(
237 |             f,
238 |             "{}",
239 |             if self.config.is_verbose_mode_enabled {
240 |                 indent_regexp(regexp, self.config)
241 |             } else {
242 |                 regexp
243 |             }
244 |         )
245 |     }
246 | }
247 | 
248 | fn indent_regexp(regexp: String, config: &RegExpConfig) -> String {
249 |     let mut indented_regexp = vec![];
250 |     let mut nesting_level = 0;
251 | 
252 |     for (i, line) in regexp.lines().enumerate() {
253 |         if i == 1 && config.is_start_anchor_disabled {
254 |             nesting_level += 1;
255 |         }
256 |         if line.is_empty() {
257 |             continue;
258 |         }
259 | 
260 |         let is_colored_line = line.starts_with("\u{1b}[");
261 | 
262 |         if nesting_level > 0
263 |             && ((is_colored_line && (line.contains('$') || line.contains(')')))
264 |                 || (line == "$" || line.starts_with(')')))
265 |         {
266 |             nesting_level -= 1;
267 |         }
268 | 
269 |         let indentation = "  ".repeat(nesting_level);
270 |         indented_regexp.push(format!("{indentation}{line}"));
271 | 
272 |         if (is_colored_line && (line.contains('^') || (i > 0 && line.contains('('))))
273 |             || (line == "^" || (i > 0 && line.starts_with('(')))
274 |         {
275 |             nesting_level += 1;
276 |         }
277 |     }
278 | 
279 |     indented_regexp.join("\n")
280 | }
281 | 


--------------------------------------------------------------------------------
/src/substring.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
/// Names the two ends of a string at which a substring can be located.
///
/// NOTE(review): presumably used when factoring out common prefixes and
/// suffixes — the usage is not visible from this file; confirm against callers.
pub enum Substring {
    /// A substring at the start.
    Prefix,
    /// A substring at the end.
    Suffix,
}
21 | 


--------------------------------------------------------------------------------
/src/unicode_tables/decimal.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
18 | //
19 | //   ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber
20 | //
21 | // Unicode version: 15.0.0.
22 | //
23 | // ucd-generate 0.3.0 is available on crates.io.
24 | 
25 | pub const DECIMAL_NUMBER: &[(char, char)] = &[
26 |     ('0', '9'),
27 |     ('٠', '٩'),
28 |     ('۰', '۹'),
29 |     ('߀', '߉'),
30 |     ('०', '९'),
31 |     ('০', '৯'),
32 |     ('੦', '੯'),
33 |     ('૦', '૯'),
34 |     ('୦', '୯'),
35 |     ('௦', '௯'),
36 |     ('౦', '౯'),
37 |     ('೦', '೯'),
38 |     ('൦', '൯'),
39 |     ('෦', '෯'),
40 |     ('๐', '๙'),
41 |     ('໐', '໙'),
42 |     ('༠', '༩'),
43 |     ('၀', '၉'),
44 |     ('႐', '႙'),
45 |     ('០', '៩'),
46 |     ('᠐', '᠙'),
47 |     ('᥆', '᥏'),
48 |     ('᧐', '᧙'),
49 |     ('᪀', '᪉'),
50 |     ('᪐', '᪙'),
51 |     ('᭐', '᭙'),
52 |     ('᮰', '᮹'),
53 |     ('᱀', '᱉'),
54 |     ('᱐', '᱙'),
55 |     ('꘠', '꘩'),
56 |     ('꣐', '꣙'),
57 |     ('꤀', '꤉'),
58 |     ('꧐', '꧙'),
59 |     ('꧰', '꧹'),
60 |     ('꩐', '꩙'),
61 |     ('꯰', '꯹'),
62 |     ('０', '９'),
63 |     ('𐒠', '𐒩'),
64 |     ('𐴰', '𐴹'),
65 |     ('𑁦', '𑁯'),
66 |     ('𑃰', '𑃹'),
67 |     ('𑄶', '𑄿'),
68 |     ('𑇐', '𑇙'),
69 |     ('𑋰', '𑋹'),
70 |     ('𑑐', '𑑙'),
71 |     ('𑓐', '𑓙'),
72 |     ('𑙐', '𑙙'),
73 |     ('𑛀', '𑛉'),
74 |     ('𑜰', '𑜹'),
75 |     ('𑣠', '𑣩'),
76 |     ('𑥐', '𑥙'),
77 |     ('𑱐', '𑱙'),
78 |     ('𑵐', '𑵙'),
79 |     ('𑶠', '𑶩'),
80 |     ('𑽐', '𑽙'),
81 |     ('𖩠', '𖩩'),
82 |     ('𖫀', '𖫉'),
83 |     ('𖭐', '𖭙'),
84 |     ('𝟎', '𝟿'),
85 |     ('𞅀', '𞅉'),
86 |     ('𞋰', '𞋹'),
87 |     ('𞓰', '𞓹'),
88 |     ('𞥐', '𞥙'),
89 |     ('🯰', '🯹'),
90 | ];
91 | 


--------------------------------------------------------------------------------
/src/unicode_tables/mod.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | mod decimal;
18 | mod space;
19 | mod word;
20 | 
21 | pub use decimal::DECIMAL_NUMBER;
22 | pub use space::WHITE_SPACE;
23 | pub use word::WORD;
24 | 


--------------------------------------------------------------------------------
/src/unicode_tables/space.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | // DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
18 | //
19 | //   ucd-generate property-bool ucd-15.0.0 --chars --include whitespace
20 | //
21 | // Unicode version: 15.0.0.
22 | //
23 | // ucd-generate 0.3.0 is available on crates.io.
24 | 
25 | pub const WHITE_SPACE: &[(char, char)] = &[
26 |     ('\t', '\r'),
27 |     (' ', ' '),
28 |     ('\u{85}', '\u{85}'),
29 |     ('\u{a0}', '\u{a0}'),
30 |     ('\u{1680}', '\u{1680}'),
31 |     ('\u{2000}', '\u{200a}'),
32 |     ('\u{2028}', '\u{2029}'),
33 |     ('\u{202f}', '\u{202f}'),
34 |     ('\u{205f}', '\u{205f}'),
35 |     ('\u{3000}', '\u{3000}'),
36 | ];
37 | 


--------------------------------------------------------------------------------
/src/wasm.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | #![allow(non_snake_case)]
 18 | 
 19 | use crate::builder::{
 20 |     RegExpBuilder as Builder, MINIMUM_REPETITIONS_MESSAGE, MINIMUM_SUBSTRING_LENGTH_MESSAGE,
 21 |     MISSING_TEST_CASES_MESSAGE,
 22 | };
 23 | use itertools::Itertools;
 24 | use wasm_bindgen::prelude::*;
 25 | 
 26 | /// This class builds regular expressions from user-provided test cases.
 27 | #[wasm_bindgen]
 28 | #[derive(Clone)]
 29 | pub struct RegExpBuilder {
 30 |     builder: Builder,
 31 | }
 32 | 
 33 | #[wasm_bindgen]
 34 | impl RegExpBuilder {
 35 |     /// Specifies the test cases to build the regular expression from.
 36 |     ///
 37 |     /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
 38 |     ///
 39 |     /// ⚠ Throws an error if `testCases` is empty.
 40 |     pub fn from(testCases: Box<[JsValue]>) -> Result<RegExpBuilder, JsValue> {
 41 |         let strs = testCases
 42 |             .iter()
 43 |             .filter_map(|it| it.as_string())
 44 |             .collect_vec();
 45 | 
 46 |         if strs.is_empty() {
 47 |             return Err(JsValue::from(MISSING_TEST_CASES_MESSAGE));
 48 |         }
 49 |         Ok(RegExpBuilder {
 50 |             builder: Builder::from(&strs),
 51 |         })
 52 |     }
 53 | 
 54 |     /// Tells `RegExpBuilder` to convert any Unicode decimal digit to character class `\d`.
 55 |     ///
 56 |     /// This method takes precedence over `withConversionOfWords` if both are set.
 57 |     /// Decimal digits are converted to `\d`, the remaining word characters to `\w`.
 58 |     ///
 59 |     /// This method takes precedence over `withConversionOfWhitespace` if both are set.
 60 |     /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
 61 |     pub fn withConversionOfDigits(&mut self) -> RegExpBuilder {
 62 |         self.builder.config.is_digit_converted = true;
 63 |         self.clone()
 64 |     }
 65 | 
 66 |     /// Tells `RegExpBuilder` to convert any character which is not
 67 |     /// a Unicode decimal digit to character class `\D`.
 68 |     ///
 69 |     /// This method takes precedence over `withConversionOfNonWords` if both are set.
 70 |     /// Non-digits which are also non-word characters are converted to `\D`.
 71 |     ///
 72 |     /// This method takes precedence over `withConversionOfNonWhitespace` if both are set.
 73 |     /// Non-digits which are also non-space characters are converted to `\D`.
 74 |     pub fn withConversionOfNonDigits(&mut self) -> RegExpBuilder {
 75 |         self.builder.config.is_non_digit_converted = true;
 76 |         self.clone()
 77 |     }
 78 | 
 79 |     /// Tells `RegExpBuilder` to convert any Unicode whitespace character to character class `\s`.
 80 |     ///
 81 |     /// This method takes precedence over `withConversionOfNonDigits` if both are set.
 82 |     /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
 83 |     ///
 84 |     /// This method takes precedence over `withConversionOfNonWords` if both are set.
 85 |     /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
 86 |     pub fn withConversionOfWhitespace(&mut self) -> RegExpBuilder {
 87 |         self.builder.config.is_space_converted = true;
 88 |         self.clone()
 89 |     }
 90 | 
 91 |     /// Tells `RegExpBuilder` to convert any character which is not
 92 |     /// a Unicode whitespace character to character class `\S`.
 93 |     pub fn withConversionOfNonWhitespace(&mut self) -> RegExpBuilder {
 94 |         self.builder.config.is_non_space_converted = true;
 95 |         self.clone()
 96 |     }
 97 | 
 98 |     /// Tells `RegExpBuilder` to convert any Unicode word character to character class `\w`.
 99 |     ///
100 |     /// This method takes precedence over `withConversionOfNonDigits` if both are set.
101 |     /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
102 |     ///
103 |     /// This method takes precedence over `withConversionOfNonWhitespace` if both are set.
104 |     /// Word characters are converted to `\w`, the remaining non-space characters to `\S`.
105 |     pub fn withConversionOfWords(&mut self) -> RegExpBuilder {
106 |         self.builder.config.is_word_converted = true;
107 |         self.clone()
108 |     }
109 | 
110 |     /// Tells `RegExpBuilder` to convert any character which is not
111 |     /// a Unicode word character to character class `\W`.
112 |     ///
113 |     /// This method takes precedence over `withConversionOfNonWhitespace` if both are set.
114 |     /// Non-words which are also non-space characters are converted to `\W`.
115 |     pub fn withConversionOfNonWords(&mut self) -> RegExpBuilder {
116 |         self.builder.config.is_non_word_converted = true;
117 |         self.clone()
118 |     }
119 | 
120 |     /// Tells `RegExpBuilder` to detect repeated non-overlapping substrings and
121 |     /// to convert them to `{min,max}` quantifier notation.
122 |     pub fn withConversionOfRepetitions(&mut self) -> RegExpBuilder {
123 |         self.builder.config.is_repetition_converted = true;
124 |         self.clone()
125 |     }
126 | 
127 |     /// Tells `RegExpBuilder` to enable case-insensitive matching of test cases
128 |     /// so that letters match both upper and lower case.
129 |     pub fn withCaseInsensitiveMatching(&mut self) -> RegExpBuilder {
130 |         self.builder.config.is_case_insensitive_matching = true;
131 |         self.clone()
132 |     }
133 | 
134 |     /// Tells `RegExpBuilder` to replace non-capturing groups by capturing ones.
135 |     pub fn withCapturingGroups(&mut self) -> RegExpBuilder {
136 |         self.builder.config.is_capturing_group_enabled = true;
137 |         self.clone()
138 |     }
139 | 
140 |     /// Tells `RegExpBuilder` to convert non-ASCII characters to unicode escape sequences.
141 |     /// The parameter `useSurrogatePairs` specifies whether to convert astral code planes
142 |     /// (range `U+010000` to `U+10FFFF`) to surrogate pairs.
143 |     pub fn withEscapingOfNonAsciiChars(&mut self, useSurrogatePairs: bool) -> RegExpBuilder {
144 |         self.builder.config.is_non_ascii_char_escaped = true;
145 |         self.builder
146 |             .config
147 |             .is_astral_code_point_converted_to_surrogate = useSurrogatePairs;
148 |         self.clone()
149 |     }
150 | 
151 |     /// Tells `RegExpBuilder` to produce a nicer looking regular expression in verbose mode.
152 |     pub fn withVerboseMode(&mut self) -> RegExpBuilder {
153 |         self.builder.config.is_verbose_mode_enabled = true;
154 |         self.clone()
155 |     }
156 | 
157 |     /// Tells `RegExpBuilder` to remove the caret anchor '^' from the resulting regular
158 |     /// expression, thereby allowing to match the test cases also when they do not occur
159 |     /// at the start of a string.
160 |     pub fn withoutStartAnchor(&mut self) -> RegExpBuilder {
161 |         self.builder.config.is_start_anchor_disabled = true;
162 |         self.clone()
163 |     }
164 | 
165 |     /// Tells `RegExpBuilder` to remove the dollar sign anchor '$' from the resulting regular
166 |     /// expression, thereby allowing to match the test cases also when they do not occur
167 |     /// at the end of a string.
168 |     pub fn withoutEndAnchor(&mut self) -> RegExpBuilder {
169 |         self.builder.config.is_end_anchor_disabled = true;
170 |         self.clone()
171 |     }
172 | 
173 |     /// Tells `RegExpBuilder` to remove the caret and dollar sign anchors from the resulting
174 |     /// regular expression, thereby allowing to match the test cases also when they occur
175 |     /// within a larger string that contains other content as well.
176 |     pub fn withoutAnchors(&mut self) -> RegExpBuilder {
177 |         self.builder.config.is_start_anchor_disabled = true;
178 |         self.builder.config.is_end_anchor_disabled = true;
179 |         self.clone()
180 |     }
181 | 
182 |     /// Specifies the minimum quantity of substring repetitions to be converted
183 |     /// if `withConversionOfRepetitions` is set.
184 |     ///
185 |     /// If the quantity is not explicitly set with this method, a default value of 1 will be used.
186 |     ///
187 |     /// ⚠ Throws an error if `quantity` is zero.
188 |     pub fn withMinimumRepetitions(&mut self, quantity: u32) -> Result<RegExpBuilder, JsValue> {
189 |         if quantity < 1 {
190 |             return Err(JsValue::from(MINIMUM_REPETITIONS_MESSAGE));
191 |         }
192 |         self.builder.config.minimum_repetitions = quantity;
193 |         Ok(self.clone())
194 |     }
195 | 
196 |     /// Specifies the minimum length a repeated substring must have in order to be converted
197 |     /// if `withConversionOfRepetitions` is set.
198 |     ///
199 |     /// If the length is not explicitly set with this method, a default value of 1 will be used.
200 |     ///
201 |     /// ⚠ Throws an error if `length` is zero.
202 |     pub fn withMinimumSubstringLength(&mut self, length: u32) -> Result<RegExpBuilder, JsValue> {
203 |         if length < 1 {
204 |             return Err(JsValue::from(MINIMUM_SUBSTRING_LENGTH_MESSAGE));
205 |         }
206 |         self.builder.config.minimum_substring_length = length;
207 |         Ok(self.clone())
208 |     }
209 | 
210 |     /// Builds the actual regular expression using the previously given settings.
211 |     pub fn build(&mut self) -> String {
212 |         self.builder.build()
213 |     }
214 | }
215 | 


--------------------------------------------------------------------------------
/tests/python/test_grex.py:
--------------------------------------------------------------------------------
  1 | #
  2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import inspect
 17 | import pytest
 18 | import re
 19 | 
 20 | from grex import RegExpBuilder
 21 | 
 22 | 
 23 | @pytest.mark.parametrize(
 24 |     "test_cases,expected_pattern",
 25 |     [
 26 |         pytest.param(["abc", "abd", "abe"], "^ab[c-e]$"),
 27 |     ]
 28 | )
 29 | def test_default_settings(test_cases, expected_pattern):
 30 |     pattern = RegExpBuilder.from_test_cases(test_cases).build()
 31 |     assert pattern == expected_pattern
 32 |     for test_case in test_cases:
 33 |         assert re.match(pattern, test_case)
 34 | 
 35 | 
 36 | @pytest.mark.parametrize(
 37 |     "test_cases,expected_pattern",
 38 |     [
 39 |         pytest.param(["My ♥ and 💩 is yours."], "^My \\u2665 and \\U0001f4a9 is yours\\.$"),
 40 |     ]
 41 | )
 42 | def test_escaping(test_cases, expected_pattern):
 43 |     pattern = (RegExpBuilder.from_test_cases(test_cases)
 44 |                .with_escaping_of_non_ascii_chars(use_surrogate_pairs=False)
 45 |                .build())
 46 |     assert pattern == expected_pattern
 47 |     for test_case in test_cases:
 48 |         assert re.match(pattern, test_case)
 49 | 
 50 | 
 51 | @pytest.mark.parametrize(
 52 |     "test_cases,expected_pattern",
 53 |     [
 54 |         pytest.param(["My ♥ and 💩 is yours."], "^My \\u2665 and \\ud83d\\udca9 is yours\\.$"),
 55 |     ]
 56 | )
 57 | def test_escaping_with_surrogate_pairs(test_cases, expected_pattern):
 58 |     pattern = (RegExpBuilder.from_test_cases(test_cases)
 59 |                .with_escaping_of_non_ascii_chars(use_surrogate_pairs=True)
 60 |                .build())
 61 |     assert pattern == expected_pattern
 62 |     # module re does not support matching surrogate pairs
 63 | 
 64 | 
 65 | @pytest.mark.parametrize(
 66 |     "test_cases,expected_pattern",
 67 |     [
 68 |         pytest.param(["efgh", "abcxy", "abcw"], "^(abc(xy|w)|efgh)$"),
 69 |     ]
 70 | )
 71 | def test_capturing_groups(test_cases, expected_pattern):
 72 |     pattern = (RegExpBuilder.from_test_cases(test_cases)
 73 |                .with_capturing_groups()
 74 |                .build())
 75 |     assert pattern == expected_pattern
 76 |     for test_case in test_cases:
 77 |         assert re.match(pattern, test_case)
 78 | 
 79 | 
 80 | @pytest.mark.parametrize(
 81 |     "test_cases,expected_pattern",
 82 |     [
 83 |         pytest.param(["efgh", "abcxy", "abcw"], "(?:abc(?:xy|w)|efgh)"),
 84 |     ]
 85 | )
 86 | def test_without_anchors(test_cases, expected_pattern):
 87 |     pattern = (RegExpBuilder.from_test_cases(test_cases)
 88 |                .without_anchors()
 89 |                .build())
 90 |     assert pattern == expected_pattern
 91 |     for test_case in test_cases:
 92 |         assert re.match(pattern, test_case)
 93 | 
 94 | 
 95 | @pytest.mark.parametrize(
 96 |     "test_cases,expected_pattern",
 97 |     [
 98 |         pytest.param(["ABC", "zBC", "abc", "AbC", "aBc"], "(?i)^[az]bc$"),
 99 |     ]
100 | )
101 | def test_case_insensitive_matching(test_cases, expected_pattern):
102 |     pattern = (RegExpBuilder.from_test_cases(test_cases)
103 |                .with_case_insensitive_matching()
104 |                .build())
105 |     assert pattern == expected_pattern
106 |     for test_case in test_cases:
107 |         assert re.match(pattern, test_case)
108 | 
109 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(
            ["[a-z]", "(d,e,f)"],
            inspect.cleandoc("""
                (?x)
                ^
                  (?:
                    \\(d,e,f\\)
                    |
                    \\[a\\-z\\]
                  )
                $
                """)
        ),
    ]
)
def test_verbose_mode(test_cases, expected_pattern):
    """Verbose mode yields an indented multi-line pattern that matches each test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    builder = builder.with_verbose_mode()
    pattern = builder.build()
    assert pattern == expected_pattern
    compiled = re.compile(pattern)
    for test_case in test_cases:
        assert compiled.match(test_case)
135 | 
136 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(
            ["Ä@Ö€Ü", "ä@ö€ü", "Ä@ö€Ü", "ä@Ö€ü"],
            inspect.cleandoc("""
                (?ix)
                ^
                  ä@ö€ü
                $
                """)
        )
    ]
)
def test_case_insensitive_matching_and_verbose_mode(test_cases, expected_pattern):
    """Case-insensitive matching and verbose mode combine into a single `(?ix)` pattern."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    builder = builder.with_case_insensitive_matching()
    builder = builder.with_verbose_mode()
    pattern = builder.build()
    assert pattern == expected_pattern
    compiled = re.compile(pattern)
    for test_case in test_cases:
        assert compiled.match(test_case)
159 | 
160 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["a", "b\nx\nx", "c"], "^(?:b(?:\\nx){2}|[ac])$"),
    ]
)
def test_conversion_of_repetitions(test_cases, expected_pattern):
    """Repeated substrings are converted to quantifier notation and the pattern still matches."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    builder = builder.with_conversion_of_repetitions()
    pattern = builder.build()
    assert pattern == expected_pattern
    compiled = re.compile(pattern)
    for test_case in test_cases:
        assert compiled.match(test_case)
174 | 
175 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (
            ["My ♥♥♥ and 💩💩 is yours."],
            r"^My \u2665{3} and \U0001f4a9{2} is yours\.$",
        ),
    ]
)
def test_escaping_and_conversion_of_repetitions(test_cases, expected_pattern):
    """Non-ASCII escaping (without surrogate pairs) combined with repetition
    conversion must yield the expected pattern, which must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    builder = builder.with_escaping_of_non_ascii_chars(use_surrogate_pairs=False)
    pattern = builder.with_conversion_of_repetitions().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
190 | 
191 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["a1b2c3"], r"^a\db\dc\d$"),
    ]
)
def test_conversion_of_digits(test_cases, expected_pattern):
    """Digits must be replaced by `\\d`, and the pattern must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    pattern = builder.with_conversion_of_digits().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
205 | 
206 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["a1b2c3"], r"^\D1\D2\D3$"),
    ]
)
def test_conversion_of_non_digits(test_cases, expected_pattern):
    """Non-digits must be replaced by `\\D`, and the pattern must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    pattern = builder.with_conversion_of_non_digits().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
220 | 
221 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["\n\t", "\r"], r"^\s(?:\s)?$"),
    ]
)
def test_conversion_of_whitespace(test_cases, expected_pattern):
    """Whitespace must be replaced by `\\s`, and the pattern must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    pattern = builder.with_conversion_of_whitespace().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
235 | 
236 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["a1 b2 c3"], r"^\S\S \S\S \S\S$"),
    ]
)
def test_conversion_of_non_whitespace(test_cases, expected_pattern):
    """Non-whitespace must be replaced by `\\S`, and the pattern must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    pattern = builder.with_conversion_of_non_whitespace().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
250 | 
251 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["abc", "1234"], r"^\w\w\w(?:\w)?$"),
    ]
)
def test_conversion_of_words(test_cases, expected_pattern):
    """Word characters must be replaced by `\\w`, and the pattern must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    pattern = builder.with_conversion_of_words().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
265 | 
266 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["abc 1234"], r"^abc\W1234$"),
    ]
)
def test_conversion_of_non_words(test_cases, expected_pattern):
    """Non-word characters must be replaced by `\\W`, and the pattern must match every test case."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    pattern = builder.with_conversion_of_non_words().build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
280 | 
281 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["aababab"], "^aababab$"),
        (["aabababab"], "^a(?:ab){4}$"),
    ]
)
def test_minimum_repetitions(test_cases, expected_pattern):
    """With a minimum of 3 repetitions configured, the first case is expected to
    stay literal while the second is expected to be converted to a quantifier."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    builder = builder.with_conversion_of_repetitions()
    pattern = builder.with_minimum_repetitions(3).build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
297 | 
298 | 
@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        (["ababab"], "^ababab$"),
        (["abcabcabc"], "^(?:abc){3}$"),
    ]
)
def test_minimum_substring_length(test_cases, expected_pattern):
    """With a minimum substring length of 3 configured, the first case is expected
    to stay literal while the second is expected to be converted to a quantifier."""
    builder = RegExpBuilder.from_test_cases(test_cases)
    builder = builder.with_conversion_of_repetitions()
    pattern = builder.with_minimum_substring_length(3).build()
    assert pattern == expected_pattern
    for sample in test_cases:
        assert re.match(pattern, sample) is not None
314 | 
315 | 
def test_error_for_empty_test_cases():
    """An empty list of test cases must be rejected with a ValueError."""
    expected_message = "No test cases have been provided for regular expression generation"
    with pytest.raises(ValueError) as raised:
        RegExpBuilder.from_test_cases([])
    assert raised.value.args[0] == expected_message
323 | 
324 | 
def test_error_for_invalid_minimum_repetitions():
    """A negative minimum repetition count must be rejected with a ValueError."""
    expected_message = "Quantity of minimum repetitions must be greater than zero"
    with pytest.raises(ValueError) as raised:
        RegExpBuilder.from_test_cases(["abcd"]).with_minimum_repetitions(-4)
    assert raised.value.args[0] == expected_message
332 | 
333 | 
def test_error_for_invalid_minimum_substring_length():
    """A negative minimum substring length must be rejected with a ValueError."""
    expected_message = "Minimum substring length must be greater than zero"
    with pytest.raises(ValueError) as raised:
        RegExpBuilder.from_test_cases(["abcd"]).with_minimum_substring_length(-2)
    assert raised.value.args[0] == expected_message
341 | 


--------------------------------------------------------------------------------
/tests/wasm_browser_tests.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | #![cfg(target_family = "wasm")]
 18 | 
 19 | use grex::WasmRegExpBuilder;
 20 | use indoc::indoc;
 21 | use wasm_bindgen::JsValue;
 22 | use wasm_bindgen_test::*;
 23 | 
// Configure the tests in this file with the `run_in_browser` option.
wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
 25 | 
 26 | #[wasm_bindgen_test]
 27 | fn assert_regexpbuilder_succeeds() {
 28 |     let test_cases = Box::new([JsValue::from("hello"), JsValue::from("world")]);
 29 |     let builder = WasmRegExpBuilder::from(test_cases);
 30 |     assert!(builder.is_ok());
 31 |     let regexp = builder.unwrap().build();
 32 |     assert_eq!(regexp, "^(?:hello|world)$");
 33 | }
 34 | 
 35 | #[wasm_bindgen_test]
 36 | fn assert_regexpbuilder_fails() {
 37 |     let builder = WasmRegExpBuilder::from(Box::new([]));
 38 |     assert_eq!(
 39 |         builder.err(),
 40 |         Some(JsValue::from(
 41 |             "No test cases have been provided for regular expression generation"
 42 |         ))
 43 |     );
 44 | }
 45 | 
 46 | #[wasm_bindgen_test]
 47 | fn test_conversion_of_digits() {
 48 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 49 |     let regexp = WasmRegExpBuilder::from(test_cases)
 50 |         .unwrap()
 51 |         .withConversionOfDigits()
 52 |         .build();
 53 |     assert_eq!(regexp, "^(?:abc  |\\d\\d\\d)$");
 54 | }
 55 | 
 56 | #[wasm_bindgen_test]
 57 | fn test_conversion_of_non_digits() {
 58 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 59 |     let regexp = WasmRegExpBuilder::from(test_cases)
 60 |         .unwrap()
 61 |         .withConversionOfNonDigits()
 62 |         .build();
 63 |     assert_eq!(regexp, "^(?:\\D\\D\\D\\D\\D|123)$");
 64 | }
 65 | 
 66 | #[wasm_bindgen_test]
 67 | fn test_conversion_of_whitespace() {
 68 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 69 |     let regexp = WasmRegExpBuilder::from(test_cases)
 70 |         .unwrap()
 71 |         .withConversionOfWhitespace()
 72 |         .build();
 73 |     assert_eq!(regexp, "^(?:abc\\s\\s|123)$");
 74 | }
 75 | 
 76 | #[wasm_bindgen_test]
 77 | fn test_conversion_of_non_whitespace() {
 78 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 79 |     let regexp = WasmRegExpBuilder::from(test_cases)
 80 |         .unwrap()
 81 |         .withConversionOfNonWhitespace()
 82 |         .build();
 83 |     assert_eq!(regexp, "^\\S\\S\\S(?:  )?$");
 84 | }
 85 | 
 86 | #[wasm_bindgen_test]
 87 | fn test_conversion_of_words() {
 88 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 89 |     let regexp = WasmRegExpBuilder::from(test_cases)
 90 |         .unwrap()
 91 |         .withConversionOfWords()
 92 |         .build();
 93 |     assert_eq!(regexp, "^\\w\\w\\w(?:  )?$");
 94 | }
 95 | 
 96 | #[wasm_bindgen_test]
 97 | fn test_conversion_of_non_words() {
 98 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 99 |     let regexp = WasmRegExpBuilder::from(test_cases)
100 |         .unwrap()
101 |         .withConversionOfNonWords()
102 |         .build();
103 |     assert_eq!(regexp, "^(?:abc\\W\\W|123)$");
104 | }
105 | 
106 | #[wasm_bindgen_test]
107 | fn test_conversion_of_repetitions() {
108 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
109 |     let regexp = WasmRegExpBuilder::from(test_cases)
110 |         .unwrap()
111 |         .withConversionOfRepetitions()
112 |         .build();
113 |     assert_eq!(regexp, "^(?:abc {2}|123)$");
114 | }
115 | 
116 | #[wasm_bindgen_test]
117 | fn test_case_insensitive_matching() {
118 |     let test_cases = Box::new([
119 |         JsValue::from("ABC"),
120 |         JsValue::from("abc  "),
121 |         JsValue::from("123"),
122 |     ]);
123 |     let regexp = WasmRegExpBuilder::from(test_cases)
124 |         .unwrap()
125 |         .withCaseInsensitiveMatching()
126 |         .build();
127 |     assert_eq!(regexp, "(?i)^(?:abc(?:  )?|123)$");
128 | }
129 | 
130 | #[wasm_bindgen_test]
131 | fn test_capturing_groups() {
132 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
133 |     let regexp = WasmRegExpBuilder::from(test_cases)
134 |         .unwrap()
135 |         .withCapturingGroups()
136 |         .build();
137 |     assert_eq!(regexp, "^(abc  |123)$");
138 | }
139 | 
140 | #[wasm_bindgen_test]
141 | fn test_escaping_of_non_ascii_chars() {
142 |     let test_cases = Box::new([
143 |         JsValue::from("abc  "),
144 |         JsValue::from("123"),
145 |         JsValue::from("♥"),
146 |     ]);
147 |     let regexp = WasmRegExpBuilder::from(test_cases)
148 |         .unwrap()
149 |         .withEscapingOfNonAsciiChars(false)
150 |         .build();
151 |     assert_eq!(regexp, "^(?:abc  |123|\\u{2665})$");
152 | }
153 | 
154 | #[wasm_bindgen_test]
155 | fn test_verbose_mode() {
156 |     let test_cases = Box::new([
157 |         JsValue::from("abc  "),
158 |         JsValue::from("123"),
159 |         JsValue::from("♥"),
160 |     ]);
161 |     let regexp = WasmRegExpBuilder::from(test_cases)
162 |         .unwrap()
163 |         .withVerboseMode()
164 |         .build();
165 |     assert_eq!(
166 |         regexp,
167 |         indoc!(
168 |             r#"
169 |             (?x)
170 |             ^
171 |               (?:
172 |                 abc\ \ 
173 |                 |
174 |                 123
175 |                 |
176 |                 ♥
177 |               )
178 |             $"#
179 |         )
180 |     );
181 | }
182 | 
183 | #[wasm_bindgen_test]
184 | fn test_without_start_anchor() {
185 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
186 |     let regexp = WasmRegExpBuilder::from(test_cases)
187 |         .unwrap()
188 |         .withoutStartAnchor()
189 |         .build();
190 |     assert_eq!(regexp, "(?:abc  |123)$");
191 | }
192 | 
193 | #[wasm_bindgen_test]
194 | fn test_without_end_anchor() {
195 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
196 |     let regexp = WasmRegExpBuilder::from(test_cases)
197 |         .unwrap()
198 |         .withoutEndAnchor()
199 |         .build();
200 |     assert_eq!(regexp, "^(?:abc  |123)");
201 | }
202 | 
203 | #[wasm_bindgen_test]
204 | fn test_without_anchors() {
205 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
206 |     let regexp = WasmRegExpBuilder::from(test_cases)
207 |         .unwrap()
208 |         .withoutAnchors()
209 |         .build();
210 |     assert_eq!(regexp, "(?:abc  |123)");
211 | }
212 | 
213 | #[wasm_bindgen_test]
214 | fn test_minimum_repetitions() {
215 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
216 |     let builder = WasmRegExpBuilder::from(test_cases)
217 |         .unwrap()
218 |         .withMinimumRepetitions(0);
219 |     assert_eq!(
220 |         builder.err(),
221 |         Some(JsValue::from(
222 |             "Quantity of minimum repetitions must be greater than zero"
223 |         ))
224 |     );
225 | }
226 | 
227 | #[wasm_bindgen_test]
228 | fn test_minimum_substring_length() {
229 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
230 |     let builder = WasmRegExpBuilder::from(test_cases)
231 |         .unwrap()
232 |         .withMinimumSubstringLength(0);
233 |     assert_eq!(
234 |         builder.err(),
235 |         Some(JsValue::from(
236 |             "Minimum substring length must be greater than zero"
237 |         ))
238 |     );
239 | }
240 | 


--------------------------------------------------------------------------------
/tests/wasm_node_tests.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | #![cfg(target_family = "wasm")]
 18 | 
 19 | use grex::WasmRegExpBuilder;
 20 | use indoc::indoc;
 21 | use wasm_bindgen::JsValue;
 22 | use wasm_bindgen_test::*;
 23 | 
 24 | #[wasm_bindgen_test]
 25 | fn assert_regexpbuilder_succeeds() {
 26 |     let test_cases = Box::new([JsValue::from("hello"), JsValue::from("world")]);
 27 |     let builder = WasmRegExpBuilder::from(test_cases);
 28 |     assert!(builder.is_ok());
 29 |     let regexp = builder.unwrap().build();
 30 |     assert_eq!(regexp, "^(?:hello|world)$");
 31 | }
 32 | 
 33 | #[wasm_bindgen_test]
 34 | fn assert_regexpbuilder_fails() {
 35 |     let builder = WasmRegExpBuilder::from(Box::new([]));
 36 |     assert_eq!(
 37 |         builder.err(),
 38 |         Some(JsValue::from(
 39 |             "No test cases have been provided for regular expression generation"
 40 |         ))
 41 |     );
 42 | }
 43 | 
 44 | #[wasm_bindgen_test]
 45 | fn test_conversion_of_digits() {
 46 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 47 |     let regexp = WasmRegExpBuilder::from(test_cases)
 48 |         .unwrap()
 49 |         .withConversionOfDigits()
 50 |         .build();
 51 |     assert_eq!(regexp, "^(?:abc  |\\d\\d\\d)$");
 52 | }
 53 | 
 54 | #[wasm_bindgen_test]
 55 | fn test_conversion_of_non_digits() {
 56 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 57 |     let regexp = WasmRegExpBuilder::from(test_cases)
 58 |         .unwrap()
 59 |         .withConversionOfNonDigits()
 60 |         .build();
 61 |     assert_eq!(regexp, "^(?:\\D\\D\\D\\D\\D|123)$");
 62 | }
 63 | 
 64 | #[wasm_bindgen_test]
 65 | fn test_conversion_of_whitespace() {
 66 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 67 |     let regexp = WasmRegExpBuilder::from(test_cases)
 68 |         .unwrap()
 69 |         .withConversionOfWhitespace()
 70 |         .build();
 71 |     assert_eq!(regexp, "^(?:abc\\s\\s|123)$");
 72 | }
 73 | 
 74 | #[wasm_bindgen_test]
 75 | fn test_conversion_of_non_whitespace() {
 76 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 77 |     let regexp = WasmRegExpBuilder::from(test_cases)
 78 |         .unwrap()
 79 |         .withConversionOfNonWhitespace()
 80 |         .build();
 81 |     assert_eq!(regexp, "^\\S\\S\\S(?:  )?$");
 82 | }
 83 | 
 84 | #[wasm_bindgen_test]
 85 | fn test_conversion_of_words() {
 86 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 87 |     let regexp = WasmRegExpBuilder::from(test_cases)
 88 |         .unwrap()
 89 |         .withConversionOfWords()
 90 |         .build();
 91 |     assert_eq!(regexp, "^\\w\\w\\w(?:  )?$");
 92 | }
 93 | 
 94 | #[wasm_bindgen_test]
 95 | fn test_conversion_of_non_words() {
 96 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
 97 |     let regexp = WasmRegExpBuilder::from(test_cases)
 98 |         .unwrap()
 99 |         .withConversionOfNonWords()
100 |         .build();
101 |     assert_eq!(regexp, "^(?:abc\\W\\W|123)$");
102 | }
103 | 
104 | #[wasm_bindgen_test]
105 | fn test_conversion_of_repetitions() {
106 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
107 |     let regexp = WasmRegExpBuilder::from(test_cases)
108 |         .unwrap()
109 |         .withConversionOfRepetitions()
110 |         .build();
111 |     assert_eq!(regexp, "^(?:abc {2}|123)$");
112 | }
113 | 
114 | #[wasm_bindgen_test]
115 | fn test_case_insensitive_matching() {
116 |     let test_cases = Box::new([
117 |         JsValue::from("ABC"),
118 |         JsValue::from("abc  "),
119 |         JsValue::from("123"),
120 |     ]);
121 |     let regexp = WasmRegExpBuilder::from(test_cases)
122 |         .unwrap()
123 |         .withCaseInsensitiveMatching()
124 |         .build();
125 |     assert_eq!(regexp, "(?i)^(?:abc(?:  )?|123)$");
126 | }
127 | 
128 | #[wasm_bindgen_test]
129 | fn test_capturing_groups() {
130 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
131 |     let regexp = WasmRegExpBuilder::from(test_cases)
132 |         .unwrap()
133 |         .withCapturingGroups()
134 |         .build();
135 |     assert_eq!(regexp, "^(abc  |123)$");
136 | }
137 | 
138 | #[wasm_bindgen_test]
139 | fn test_escaping_of_non_ascii_chars() {
140 |     let test_cases = Box::new([
141 |         JsValue::from("abc  "),
142 |         JsValue::from("123"),
143 |         JsValue::from("♥"),
144 |     ]);
145 |     let regexp = WasmRegExpBuilder::from(test_cases)
146 |         .unwrap()
147 |         .withEscapingOfNonAsciiChars(false)
148 |         .build();
149 |     assert_eq!(regexp, "^(?:abc  |123|\\u{2665})$");
150 | }
151 | 
152 | #[wasm_bindgen_test]
153 | fn test_verbose_mode() {
154 |     let test_cases = Box::new([
155 |         JsValue::from("abc  "),
156 |         JsValue::from("123"),
157 |         JsValue::from("♥"),
158 |     ]);
159 |     let regexp = WasmRegExpBuilder::from(test_cases)
160 |         .unwrap()
161 |         .withVerboseMode()
162 |         .build();
163 |     assert_eq!(
164 |         regexp,
165 |         indoc!(
166 |             r#"
167 |             (?x)
168 |             ^
169 |               (?:
170 |                 abc\ \ 
171 |                 |
172 |                 123
173 |                 |
174 |                 ♥
175 |               )
176 |             $"#
177 |         )
178 |     );
179 | }
180 | 
181 | #[wasm_bindgen_test]
182 | fn test_without_start_anchor() {
183 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
184 |     let regexp = WasmRegExpBuilder::from(test_cases)
185 |         .unwrap()
186 |         .withoutStartAnchor()
187 |         .build();
188 |     assert_eq!(regexp, "(?:abc  |123)$");
189 | }
190 | 
191 | #[wasm_bindgen_test]
192 | fn test_without_end_anchor() {
193 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
194 |     let regexp = WasmRegExpBuilder::from(test_cases)
195 |         .unwrap()
196 |         .withoutEndAnchor()
197 |         .build();
198 |     assert_eq!(regexp, "^(?:abc  |123)");
199 | }
200 | 
201 | #[wasm_bindgen_test]
202 | fn test_without_anchors() {
203 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
204 |     let regexp = WasmRegExpBuilder::from(test_cases)
205 |         .unwrap()
206 |         .withoutAnchors()
207 |         .build();
208 |     assert_eq!(regexp, "(?:abc  |123)");
209 | }
210 | 
211 | #[wasm_bindgen_test]
212 | fn test_minimum_repetitions() {
213 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
214 |     let builder = WasmRegExpBuilder::from(test_cases)
215 |         .unwrap()
216 |         .withMinimumRepetitions(0);
217 |     assert_eq!(
218 |         builder.err(),
219 |         Some(JsValue::from(
220 |             "Quantity of minimum repetitions must be greater than zero"
221 |         ))
222 |     );
223 | }
224 | 
225 | #[wasm_bindgen_test]
226 | fn test_minimum_substring_length() {
227 |     let test_cases = Box::new([JsValue::from("abc  "), JsValue::from("123")]);
228 |     let builder = WasmRegExpBuilder::from(test_cases)
229 |         .unwrap()
230 |         .withMinimumSubstringLength(0);
231 |     assert_eq!(
232 |         builder.err(),
233 |         Some(JsValue::from(
234 |             "Minimum substring length must be greater than zero"
235 |         ))
236 |     );
237 | }
238 | 


--------------------------------------------------------------------------------
/website.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pemistahl/grex/cb71c10815e2216f4941f0b52154fb5d1fc0a01c/website.jpg


--------------------------------------------------------------------------------