├── .editorconfig ├── .github ├── dependabot.yml └── workflows │ ├── python-build.yml │ ├── release.yml │ └── rust-build.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── README_PYPI.md ├── RELEASE_NOTES.md ├── benches ├── benchmark.rs └── testcases.txt ├── demo.gif ├── demo.tape ├── grex.pyi ├── logo.png ├── pyproject.toml ├── requirements.txt ├── src ├── builder.rs ├── cluster.rs ├── component.rs ├── config.rs ├── dfa.rs ├── expression.rs ├── format.rs ├── grapheme.rs ├── lib.rs ├── macros.rs ├── main.rs ├── python.rs ├── quantifier.rs ├── regexp.rs ├── substring.rs ├── unicode_tables │ ├── decimal.rs │ ├── mod.rs │ ├── space.rs │ └── word.rs └── wasm.rs ├── tests ├── cli_integration_tests.rs ├── lib_integration_tests.rs ├── property_tests.rs ├── python │ └── test_grex.py ├── wasm_browser_tests.rs └── wasm_node_tests.rs └── website.jpg /.editorconfig: -------------------------------------------------------------------------------- 1 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Editor configuration, see http://editorconfig.org 16 | root = true 17 | 18 | [*.rs] 19 | charset = utf-8 20 | indent_style = space 21 | indent_size = 4 22 | insert_final_newline = true 23 | trim_trailing_whitespace = false 24 | max_line_length = 100 25 | 26 | [*.md] 27 | max_line_length = off 28 | trim_trailing_whitespace = false 29 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "cargo" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/python-build.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | name: Python Build 17 | 18 | on: 19 | push: 20 | branches: 21 | - main 22 | paths: 23 | - 'Cargo.lock' 24 | - 'Cargo.toml' 25 | - 'pyproject.toml' 26 | - 'requirements.txt' 27 | - 'src/**' 28 | - 'tests/**' 29 | - '**.yml' 30 | pull_request: 31 | branches: 32 | - main 33 | paths: 34 | - 'Cargo.lock' 35 | - 'Cargo.toml' 36 | - 'pyproject.toml' 37 | - 'requirements.txt' 38 | - 'src/**' 39 | - 'tests/**' 40 | - '**.yml' 41 | 42 | jobs: 43 | python-build: 44 | name: Python ${{ matrix.python-version }} on ${{ matrix.name }} 45 | 46 | runs-on: ${{ matrix.os }} 47 | 48 | strategy: 49 | fail-fast: false 50 | matrix: 51 | os: [ ubuntu-latest, macos-latest, windows-latest ] 52 | python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ] 53 | include: 54 | - os: ubuntu-latest 55 | name: Linux 64-Bit 56 | 57 | - os: macos-latest 58 | name: MacOS 64-Bit 59 | 60 | - os: windows-latest 61 | name: Windows 64-Bit 62 | 63 | steps: 64 | - name: Check out repository 65 | uses: actions/checkout@v4 66 | 67 | - name: Set up Python 68 | uses: actions/setup-python@v5 69 | with: 70 | python-version: ${{ matrix.python-version }} 71 | cache: 'pip' 72 | 73 | - name: Install maturin and pytest 74 | run: pip install -r requirements.txt 75 | 76 | - name: Build Python extension 77 | run: maturin build 78 | 79 | - name: Install Python extension 80 | run: pip install --find-links=target/wheels grex 81 | 82 | - name: Run Python unit tests 83 | run: pytest tests/python/test_grex.py 84 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | name: Release 17 | 18 | on: 19 | push: 20 | tags: 21 | - v1.* 22 | 23 | jobs: 24 | rust-release-build: 25 | name: ${{ matrix.name }} 26 | 27 | runs-on: ${{ matrix.os }} 28 | 29 | strategy: 30 | matrix: 31 | os: [ubuntu-latest, macos-latest, windows-latest] 32 | include: 33 | - os: ubuntu-latest 34 | name: Rust Release Build on Linux 35 | x86_64-target: x86_64-unknown-linux-musl 36 | aarch64-target: aarch64-unknown-linux-musl 37 | 38 | - os: macos-latest 39 | name: Rust Release Build on MacOS 40 | x86_64-target: x86_64-apple-darwin 41 | aarch64-target: aarch64-apple-darwin 42 | 43 | - os: windows-latest 44 | name: Rust Release Build on Windows 45 | x86_64-target: x86_64-pc-windows-msvc 46 | aarch64-target: aarch64-pc-windows-msvc 47 | 48 | steps: 49 | - name: Check out repository 50 | uses: actions/checkout@v4 51 | 52 | - name: Build x86_64 target in release mode 53 | uses: houseabsolute/actions-rust-cross@v0 54 | with: 55 | target: ${{ matrix.x86_64-target }} 56 | args: '--release --locked' 57 | 58 | - name: Build aarch64 target in release mode 59 | uses: houseabsolute/actions-rust-cross@v0 60 | with: 61 | target: ${{ matrix.aarch64-target }} 62 | args: '--release --locked' 63 | 64 | - name: Get latest release version number 65 | id: get_version 66 | uses: battila7/get-version-action@v2 67 | 68 | - name: Create x86_64 zip file on Windows 69 | if: ${{ matrix.os == 'windows-latest' }} 70 | run: | 71 | choco install zip 72 | cd target/${{ matrix.x86_64-target }}/release 73 | zip grex-${{ 
steps.get_version.outputs.version }}-${{ matrix.x86_64-target }}.zip grex.exe 74 | cd ../../.. 75 | 76 | - name: Create aarch64 zip file on Windows 77 | if: ${{ matrix.os == 'windows-latest' }} 78 | run: | 79 | cd target/${{ matrix.aarch64-target }}/release 80 | zip grex-${{ steps.get_version.outputs.version }}-${{ matrix.aarch64-target }}.zip grex.exe 81 | cd ../../.. 82 | 83 | - name: Create x86_64 tar.gz file on Linux and macOS 84 | if: ${{ matrix.os != 'windows-latest' }} 85 | run: | 86 | chmod +x target/${{ matrix.x86_64-target }}/release/grex 87 | tar -zcf target/${{ matrix.x86_64-target }}/release/grex-${{ steps.get_version.outputs.version }}-${{ matrix.x86_64-target }}.tar.gz -C target/${{ matrix.x86_64-target }}/release grex 88 | 89 | - name: Create aarch64 tar.gz file on Linux and macOS 90 | if: ${{ matrix.os != 'windows-latest' }} 91 | run: | 92 | chmod +x target/${{ matrix.aarch64-target }}/release/grex 93 | tar -zcf target/${{ matrix.aarch64-target }}/release/grex-${{ steps.get_version.outputs.version }}-${{ matrix.aarch64-target }}.tar.gz -C target/${{ matrix.aarch64-target }}/release grex 94 | 95 | - name: Upload release and assets to GitHub 96 | uses: svenstaro/upload-release-action@v2 97 | with: 98 | repo_token: ${{ secrets.GITHUB_TOKEN }} 99 | tag: ${{ github.ref }} 100 | release_name: grex ${{ steps.get_version.outputs.version-without-v }} 101 | file_glob: true 102 | file: target/*/release/grex-${{ steps.get_version.outputs.version }}-*.{zip,tar.gz} 103 | 104 | python-linux-release-build: 105 | name: Python Release Build on Linux and target ${{ matrix.target }} 106 | needs: rust-release-build 107 | 108 | runs-on: ubuntu-latest 109 | 110 | strategy: 111 | matrix: 112 | target: [ x86_64, x86, aarch64 ] 113 | linux: [ auto, musllinux_1_2 ] 114 | 115 | steps: 116 | - name: Check out repository 117 | uses: actions/checkout@v4 118 | 119 | - name: Build wheels 120 | uses: PyO3/maturin-action@v1 121 | with: 122 | target: ${{ matrix.target }} 123 | args: 
--release --out dist -i 3.8 3.9 3.10 3.11 3.12 pypy3.8 pypy3.9 pypy3.10 124 | sccache: 'true' 125 | manylinux: ${{ matrix.linux }} 126 | 127 | - name: Upload wheels 128 | uses: actions/upload-artifact@v4 129 | with: 130 | name: linux-${{ matrix.linux }}-${{ matrix.target }}-wheels 131 | path: dist 132 | 133 | python-windows-release-build: 134 | name: Python Release Build on Windows and target ${{ matrix.target }} 135 | needs: rust-release-build 136 | 137 | runs-on: windows-latest 138 | 139 | strategy: 140 | matrix: 141 | target: [ x64, x86 ] 142 | 143 | steps: 144 | - name: Check out repository 145 | uses: actions/checkout@v4 146 | 147 | - name: Build wheels 148 | uses: PyO3/maturin-action@v1 149 | with: 150 | target: ${{ matrix.target }} 151 | args: --release --out dist -i 3.8 3.9 3.10 3.11 3.12 152 | sccache: 'true' 153 | 154 | - name: Upload wheels 155 | uses: actions/upload-artifact@v4 156 | with: 157 | name: windows-${{ matrix.target }}-wheels 158 | path: dist 159 | 160 | python-macos-release-build: 161 | name: Python Release Build on MacOS and target ${{ matrix.target }} 162 | needs: rust-release-build 163 | 164 | runs-on: macos-latest 165 | 166 | strategy: 167 | matrix: 168 | target: [ x86_64, aarch64 ] 169 | 170 | steps: 171 | - name: Check out repository 172 | uses: actions/checkout@v4 173 | 174 | - name: Build wheels 175 | uses: PyO3/maturin-action@v1 176 | with: 177 | target: ${{ matrix.target }} 178 | args: --release --out dist -i 3.8 3.9 3.10 3.11 3.12 pypy3.8 pypy3.9 pypy3.10 179 | sccache: 'true' 180 | 181 | - name: Upload wheels 182 | uses: actions/upload-artifact@v4 183 | with: 184 | name: macos-${{ matrix.target }}-wheels 185 | path: dist 186 | 187 | python-release-upload: 188 | name: Publish wheels to PyPI 189 | needs: [ python-linux-release-build, python-windows-release-build, python-macos-release-build ] 190 | 191 | runs-on: ubuntu-latest 192 | 193 | steps: 194 | - name: Download wheels from previous jobs 195 | uses: 
actions/download-artifact@v4 196 | with: 197 | path: wheels 198 | merge-multiple: true 199 | 200 | - name: Upload to PyPI 201 | uses: PyO3/maturin-action@v1 202 | env: 203 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 204 | with: 205 | command: upload 206 | args: --skip-existing wheels/*.whl 207 | 208 | rust-release-upload: 209 | name: Upload to crates.io 210 | needs: [ python-linux-release-build, python-windows-release-build, python-macos-release-build ] 211 | 212 | runs-on: ubuntu-latest 213 | 214 | steps: 215 | - name: Check out repository 216 | uses: actions/checkout@v4 217 | 218 | - name: Upload release to crates.io 219 | uses: katyo/publish-crates@v2 220 | with: 221 | registry-token: ${{ secrets.CARGO_REGISTRY_TOKEN }} 222 | -------------------------------------------------------------------------------- /.github/workflows/rust-build.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | name: Rust Build 17 | 18 | on: 19 | push: 20 | branches: 21 | - main 22 | paths: 23 | - 'Cargo.lock' 24 | - 'Cargo.toml' 25 | - 'src/**' 26 | - 'tests/**' 27 | - '**.yml' 28 | pull_request: 29 | branches: 30 | - main 31 | paths: 32 | - 'Cargo.lock' 33 | - 'Cargo.toml' 34 | - 'src/**' 35 | - 'tests/**' 36 | - '**.yml' 37 | 38 | jobs: 39 | rust-build: 40 | name: Rust on ${{ matrix.name }} 41 | 42 | runs-on: ${{ matrix.os }} 43 | 44 | strategy: 45 | fail-fast: false 46 | matrix: 47 | os: [ubuntu-latest, macos-latest, windows-latest] 48 | include: 49 | - os: ubuntu-latest 50 | name: Linux 64-Bit 51 | target: x86_64-unknown-linux-musl 52 | 53 | - os: macos-latest 54 | name: MacOS 64-Bit 55 | target: x86_64-apple-darwin 56 | env: 57 | MACOSX_DEPLOYMENT_TARGET: 10.7 58 | 59 | - os: windows-latest 60 | name: Windows 64-Bit 61 | target: x86_64-pc-windows-msvc 62 | 63 | steps: 64 | - name: Check out repository 65 | uses: actions/checkout@v4 66 | 67 | - name: Add rustup target 68 | run: rustup target add ${{ matrix.target }} 69 | 70 | - name: Store or retrieve cargo caches 71 | uses: actions/cache@v4 72 | with: 73 | path: | 74 | ~/.cargo/bin/ 75 | ~/.cargo/registry/index/ 76 | ~/.cargo/registry/cache/ 77 | ~/.cargo/git/db/ 78 | target/ 79 | key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }} 80 | 81 | - name: Build target in debug mode 82 | run: cargo build --target ${{ matrix.target }} --locked 83 | 84 | - name: Test target in debug mode 85 | run: cargo test --target ${{ matrix.target }} 86 | 87 | wasm-build: 88 | name: WASM Build 89 | needs: rust-build 90 | 91 | runs-on: macos-latest 92 | 93 | steps: 94 | - name: Check out repository 95 | uses: actions/checkout@v4 96 | 97 | - name: Install wasm-pack 98 | run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh 99 | 100 | - name: Install Firefox and Geckodriver # not available anymore in macos-latest 101 | run: | 102 | brew install --cask firefox 103 | brew install geckodriver 104 | 105 
| #- name: Enable Safari web driver 106 | # run: sudo safaridriver --enable 107 | 108 | - name: Run WASM integration tests on NodeJS 109 | run: wasm-pack test --node -- --no-default-features 110 | 111 | - name: Run WASM integration tests in Chrome 112 | run: wasm-pack test --headless --chrome -- --no-default-features 113 | 114 | - name: Run WASM integration tests in Firefox 115 | run: wasm-pack test --headless --firefox -- --no-default-features 116 | 117 | # Safari WASM tests not working, reason unclear 118 | # https://github.com/pemistahl/grex/actions/runs/12146729784/job/33871544034#step:9:30 119 | #- name: Run WASM integration tests in Safari 120 | # run: wasm-pack test --headless --safari -- --no-default-features 121 | 122 | coverage-report: 123 | name: Coverage Report 124 | needs: rust-build 125 | if: ${{ github.event_name == 'push' }} 126 | 127 | runs-on: ubuntu-latest 128 | 129 | container: 130 | image: xd009642/tarpaulin:develop-nightly 131 | options: --security-opt seccomp=unconfined 132 | 133 | steps: 134 | - name: Check out repository 135 | uses: actions/checkout@v4 136 | 137 | - name: Generate coverage report 138 | run: cargo +nightly tarpaulin --ignore-config --ignore-panics --ignore-tests --exclude-files src/python.rs src/main.rs src/wasm.rs --verbose --timeout 900 --out xml 139 | 140 | - name: Workaround for codecov/feedback#263 141 | run: git config --global --add safe.directory "$GITHUB_WORKSPACE" 142 | 143 | - name: Upload coverage report 144 | uses: codecov/codecov-action@v4 145 | with: 146 | token: ${{ secrets.CODECOV_TOKEN }} 147 | fail_ci_if_error: true 148 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | /pkg/ 16 | /target/ 17 | **/*.rs.bk 18 | 19 | .idea 20 | .project 21 | .c9/ 22 | *.launch 23 | .settings/ 24 | .metadata/ 25 | .venv 26 | *.sublime-workspace 27 | bin/ 28 | tmp/ 29 | out/ 30 | *.iml 31 | *.ipr 32 | *.iws 33 | *.bak 34 | *.tmp 35 | *.class 36 | *.html 37 | .buildpath 38 | .classpath 39 | .vscode/* 40 | !.vscode/settings.json 41 | !.vscode/tasks.json 42 | !.vscode/launch.json 43 | !.vscode/extensions.json 44 | 45 | .DS_Store 46 | Thumbs.db 47 | $RECYCLE.BIN/ 48 | ._* 49 | .AppleDouble 50 | .LSOverride 51 | *.lnk 52 | Desktop.ini 53 | ehthumbs.db 54 | 55 | *.proptest-regressions 56 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | [package] 17 | name = "grex" 18 | version = "1.4.5" 19 | authors = ["Peter M. 
Stahl <pemistahl@gmail.com>"] 20 | description = """ 21 | grex generates regular expressions from user-provided test cases. 22 | """ 23 | homepage = "https://github.com/pemistahl/grex" 24 | repository = "https://github.com/pemistahl/grex" 25 | documentation = "https://docs.rs/grex" 26 | license = "Apache-2.0" 27 | readme = "README.md" 28 | edition = "2021" 29 | categories = ["command-line-utilities", "parsing"] 30 | keywords = ["pattern", "regex", "regexp"] 31 | 32 | [lib] 33 | crate-type = ["cdylib", "rlib"] 34 | 35 | [dependencies] 36 | itertools = "0.13.0" 37 | lazy_static = "1.5.0" 38 | ndarray = "0.16.1" 39 | petgraph = {version = "0.6.5", default-features = false, features = ["stable_graph"]} 40 | regex = "1.10.6" 41 | unic-char-range = "0.9.0" 42 | unic-ucd-category = "0.9.0" 43 | unicode-segmentation = "1.12.0" 44 | 45 | [target.'cfg(not(target_family = "wasm"))'.dependencies] 46 | clap = {version = "4.5.22", features = ["derive", "wrap_help"], optional = true} 47 | pyo3 = {version = "0.23.3", optional = true} 48 | 49 | [target.'cfg(target_family = "wasm")'.dependencies] 50 | wasm-bindgen = "0.2.97" 51 | 52 | [dev-dependencies] 53 | indoc = "2.0.5" 54 | rstest = "0.23.0" 55 | 56 | [target.'cfg(not(target_family = "wasm"))'.dev-dependencies] 57 | assert_cmd = "2.0.16" 58 | criterion = "0.5.1" 59 | predicates = "3.1.2" 60 | proptest = "1.5.0" 61 | tempfile = "3.14.0" 62 | 63 | [target.'cfg(target_family = "wasm")'.dev-dependencies] 64 | wasm-bindgen-test = "0.3.47" 65 | 66 | [features] 67 | default = ["cli"] 68 | cli = ["clap"] 69 | python = ["pyo3"] 70 | 71 | [[bench]] 72 | name = "benchmark" 73 | harness = false 74 | 75 | [profile.bench] 76 | debug = true 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 
DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README_PYPI.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ![grex](https://raw.githubusercontent.com/pemistahl/grex/main/logo.png) 4 | 5 |
6 | 7 | [![build status](https://github.com/pemistahl/grex/actions/workflows/python-build.yml/badge.svg)](https://github.com/pemistahl/grex/actions/workflows/python-build.yml) 8 | [![codecov](https://codecov.io/gh/pemistahl/grex/branch/main/graph/badge.svg)](https://codecov.io/gh/pemistahl/grex) 9 | [![demo](https://img.shields.io/badge/-Demo%20Website-orange?logo=HTML5&labelColor=white)](https://pemistahl.github.io/grex-js/) 10 | ![supported Python versions](https://img.shields.io/badge/Python-%3E%3D%203.8-blue?logo=Python&logoColor=yellow) 11 | [![pypi](https://img.shields.io/badge/PYPI-v1.0.1-blue?logo=PyPI&logoColor=yellow)](https://pypi.org/project/grex) 12 | [![license](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://www.apache.org/licenses/LICENSE-2.0) 13 |
14 | 15 |
16 | 17 | ## 1. What does this library do? 18 | 19 | *grex* is a library that is meant to simplify the often complicated and tedious 20 | task of creating regular expressions. It does so by automatically generating a 21 | single regular expression from user-provided test cases. The resulting 22 | expression is guaranteed to match the test cases which it was generated from. 23 | 24 | This project started as a [Rust port](https://github.com/pemistahl/grex) of 25 | the JavaScript tool [*regexgen*](https://github.com/devongovett/regexgen) 26 | written by [Devon Govett](https://github.com/devongovett). Although a lot of 27 | further useful features could be added to it, its development apparently 28 | ceased several years ago. The Rust library offers new features and extended 29 | Unicode support. With the help of [PyO3](https://github.com/PyO3/pyo3) and 30 | [Maturin](https://github.com/PyO3/maturin), the library has been compiled to a 31 | Python extension module so that it can be used within any Python software as well. 32 | 33 | The philosophy of this project is to generate the most specific regular expression 34 | possible by default which exactly matches the given input only and nothing else. 35 | With the use of preprocessing methods, more generalized expressions can be created. 36 | 37 | The produced expressions are [Perl-compatible regular expressions](https://www.pcre.org) which are also 38 | compatible with the [regular expression module](https://docs.python.org/3/library/re.html) in Python's 39 | standard library. 40 | 41 | There is a [demo website](https://pemistahl.github.io/grex-js/) available where you can give grex a try. 42 | 43 | ![demo website](https://raw.githubusercontent.com/pemistahl/grex/main/website.jpg) 44 | 45 | ## 2. Do I still need to learn to write regexes then? 
46 | 47 | **Definitely, yes!** Using the standard settings, *grex* produces a regular expression that is guaranteed 48 | to match only the test cases given as input and nothing else. However, if the conversion to shorthand 49 | character classes such as `\w` is enabled, the resulting regex matches a much wider scope of test cases. 50 | Knowledge about the consequences of this conversion is essential for finding a correct regular expression 51 | for your business domain. 52 | 53 | *grex* uses an algorithm that tries to find the shortest possible regex for the given test cases. 54 | Very often though, the resulting expression is still longer or more complex than it needs to be. 55 | In such cases, a more compact or elegant regex can be created only by hand. 56 | Also, every regular expression engine has different built-in optimizations. *grex* does not know anything 57 | about those and therefore cannot optimize its regexes for a specific engine. 58 | 59 | **So, please learn how to write regular expressions!** The currently best use case for *grex* is to find 60 | an initial correct regex which should be inspected by hand if further optimizations are possible. 61 | 62 | ## 3. 
Current Features 63 | 64 | - literals 65 | - character classes 66 | - detection of common prefixes and suffixes 67 | - detection of repeated substrings and conversion to `{min,max}` quantifier notation 68 | - alternation using `|` operator 69 | - optionality using `?` quantifier 70 | - escaping of non-ascii characters, with optional conversion of astral code points to surrogate pairs 71 | - case-sensitive or case-insensitive matching 72 | - capturing or non-capturing groups 73 | - optional anchors `^` and `$` 74 | - fully compliant with [Unicode Standard 15.0](https://unicode.org/versions/Unicode15.0.0) 75 | - correctly handles graphemes consisting of multiple Unicode symbols 76 | - produces more readable expressions indented on multiple lines using optional verbose mode 77 | - optional syntax highlighting for nicer output in supported terminals 78 | 79 | ## 4. How to install? 80 | 81 | *grex* is available in the [Python Package Index](https://pypi.org/project/grex) and can be installed with: 82 | 83 | ``` 84 | pip install grex 85 | ``` 86 | 87 | The current version 1.0.1 corresponds to the latest version 1.4.5 of the Rust 88 | library and command-line tool. 89 | 90 | ## 5. How to use? 
91 | 92 | This library contains a single class named `RegExpBuilder` that can be imported like so: 93 | 94 | ```python 95 | from grex import RegExpBuilder 96 | ``` 97 | 98 | ### 5.1 Default settings 99 | 100 | ```python 101 | pattern = RegExpBuilder.from_test_cases(["a", "aa", "aaa"]).build() 102 | assert pattern == "^a(?:aa?)?$" 103 | ``` 104 | 105 | ### 5.2 Convert to character classes 106 | 107 | ```python 108 | pattern = (RegExpBuilder.from_test_cases(["a", "aa", "123"]) 109 | .with_conversion_of_digits() 110 | .with_conversion_of_words() 111 | .build()) 112 | assert pattern == "^(?:\\d\\d\\d|\\w(?:\\w)?)$" 113 | ``` 114 | 115 | ### 5.3 Convert repeated substrings 116 | 117 | ```python 118 | pattern = (RegExpBuilder.from_test_cases(["aa", "bcbc", "defdefdef"]) 119 | .with_conversion_of_repetitions() 120 | .build()) 121 | assert pattern == "^(?:a{2}|(?:bc){2}|(?:def){3})$" 122 | ``` 123 | 124 | By default, *grex* converts each substring this way which is at least a single character long 125 | and which is subsequently repeated at least once. You can customize these two parameters if you like. 126 | 127 | In the following example, the test case `aa` is not converted to `a{2}` because the repeated substring 128 | `a` has a length of 1, but the minimum substring length has been set to 2. 129 | 130 | ```python 131 | pattern = (RegExpBuilder.from_test_cases(["aa", "bcbc", "defdefdef"]) 132 | .with_conversion_of_repetitions() 133 | .with_minimum_substring_length(2) 134 | .build()) 135 | assert pattern == "^(?:aa|(?:bc){2}|(?:def){3})$" 136 | ``` 137 | 138 | Setting a minimum number of 2 repetitions in the next example, only the test case `defdefdef` will be 139 | converted because it is the only one that is repeated twice. 
140 | 141 | ```python 142 | pattern = (RegExpBuilder.from_test_cases(["aa", "bcbc", "defdefdef"]) 143 | .with_conversion_of_repetitions() 144 | .with_minimum_repetitions(2) 145 | .build()) 146 | assert pattern == "^(?:bcbc|aa|(?:def){3})$" 147 | ``` 148 | 149 | ### 5.4 Escape non-ascii characters 150 | 151 | ```python 152 | pattern = (RegExpBuilder.from_test_cases(["You smell like 💩."]) 153 | .with_escaping_of_non_ascii_chars(use_surrogate_pairs=False) 154 | .build()) 155 | assert pattern == "^You smell like \\U0001f4a9\\.$" 156 | ``` 157 | 158 | Old versions of JavaScript do not support unicode escape sequences for the astral code planes 159 | (range `U+010000` to `U+10FFFF`). In order to support these symbols in JavaScript regular 160 | expressions, the conversion to surrogate pairs is necessary. More information on that matter 161 | can be found [here](https://mathiasbynens.be/notes/javascript-unicode). 162 | 163 | ```python 164 | pattern = (RegExpBuilder.from_test_cases(["You smell like 💩."]) 165 | .with_escaping_of_non_ascii_chars(use_surrogate_pairs=True) 166 | .build()) 167 | assert pattern == "^You smell like \\ud83d\\udca9\\.$" 168 | ``` 169 | 170 | ### 5.5 Case-insensitive matching 171 | 172 | The regular expressions that *grex* generates are case-sensitive by default. 173 | Case-insensitive matching can be enabled like so: 174 | 175 | ```python 176 | pattern = (RegExpBuilder.from_test_cases(["big", "BIGGER"]) 177 | .with_case_insensitive_matching() 178 | .build()) 179 | assert pattern == "(?i)^big(?:ger)?$" 180 | ``` 181 | 182 | ### 5.6 Capturing Groups 183 | 184 | Non-capturing groups are used by default. 185 | Extending the previous example, you can switch to capturing groups instead. 
186 | 187 | ```python 188 | pattern = (RegExpBuilder.from_test_cases(["big", "BIGGER"]) 189 | .with_case_insensitive_matching() 190 | .with_capturing_groups() 191 | .build()) 192 | assert pattern == "(?i)^big(ger)?$" 193 | ``` 194 | 195 | ### 5.7 Verbose mode 196 | 197 | If you find the generated regular expression hard to read, you can enable verbose mode. 198 | The expression is then put on multiple lines and indented to make it more pleasant to the eyes. 199 | 200 | ```python 201 | import inspect 202 | 203 | pattern = (RegExpBuilder.from_test_cases(["a", "b", "bcd"]) 204 | .with_verbose_mode() 205 | .build()) 206 | 207 | assert pattern == inspect.cleandoc(""" 208 | (?x) 209 | ^ 210 | (?: 211 | b 212 | (?: 213 | cd 214 | )? 215 | | 216 | a 217 | ) 218 | $ 219 | """ 220 | ) 221 | ``` 222 | 223 | ### 5.8 Disable anchors 224 | 225 | By default, the anchors `^` and `$` are put around every generated regular expression in order 226 | to ensure that it matches only the test cases given as input. Often enough, however, it is 227 | desired to use the generated pattern as part of a larger one. For this purpose, the anchors 228 | can be disabled, either separately or both of them. 229 | 230 | ```python 231 | pattern = (RegExpBuilder.from_test_cases(["a", "aa", "aaa"]) 232 | .without_anchors() 233 | .build()) 234 | assert pattern == "a(?:aa?)?" 235 | ``` 236 | 237 | ## 6. How to build? 238 | 239 | In order to build the source code yourself, you need the 240 | [stable Rust toolchain](https://www.rust-lang.org/tools/install) installed on your machine 241 | so that [*cargo*](https://doc.rust-lang.org/cargo/), the Rust package manager is available. 242 | 243 | ```shell 244 | git clone https://github.com/pemistahl/grex.git 245 | cd grex 246 | cargo build 247 | ``` 248 | 249 | To build the Python extension module, create a virtual environment and install [Maturin](https://github.com/PyO3/maturin). 
250 | 251 | ```shell 252 | python -m venv /path/to/virtual/environment 253 | source /path/to/virtual/environment/bin/activate 254 | pip install maturin 255 | maturin build 256 | ``` 257 | 258 | The Rust source code is accompanied by an extensive test suite consisting of unit tests, integration 259 | tests and property tests. For running them, simply say: 260 | 261 | ```shell 262 | cargo test 263 | ``` 264 | 265 | Additional Python tests can be run after installing pytest which is an optional dependency: 266 | 267 | ```shell 268 | maturin develop --extras=test 269 | pytest tests/python/test_grex.py 270 | ``` 271 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | ## grex 1.4.5 (released on 06 Mar 2024) 2 | 3 | ### Improvements 4 | 5 | - Type stubs for the Python bindings are now available, allowing better static code 6 | analysis, better code completion in supported IDEs and easier understanding of the library's API. 7 | - The code for creating regular expressions in verbose mode has been simplified and is more performant now. 8 | - ARM64 binaries are now provided for every major platform (Linux, macOS, Windows). 9 | 10 | ### Bug Fixes 11 | 12 | - For a small set of special characters, *grex* produced incorrect regular expressions when 13 | the case-insensitivity feature was enabled. This has been fixed. 14 | 15 | ### Changes 16 | - All dependencies have been updated to their latest versions. 17 | 18 | ## grex 1.4.4 (released on 24 Aug 2023) 19 | 20 | ### Bug Fixes 21 | - The Python release workflow was incorrect as it produced too many wheels for upload. 22 | This has been fixed. 23 | 24 | ## grex 1.4.3 (released on 24 Aug 2023) 25 | 26 | ### Features 27 | - Python bindings are now available for the library. Use grex within any Python software. 
(#172) 28 | 29 | ### Changes 30 | - All dependencies have been updated to their latest versions. 31 | 32 | ## grex 1.4.2 (released on 26 Jul 2023) 33 | 34 | ### Improvements 35 | - All characters from the current Unicode standard 15.0 are now fully supported. (#128) 36 | - A proper exit code is now returned if the provided user input cannot be handled by the CLI. 37 | Big thanks to @spenserblack for the respective pull request. (#165) 38 | 39 | ### Changes 40 | - It is not possible anymore to call `RegExpBuilder.with_syntax_highlighting()` in the library 41 | as it only makes sense for the CLI. 42 | - The dependency `atty` has been removed in favor of `std::io::IsTerminal` in Rust >= 1.70.0. 43 | As a result, Rust >= 1.70.0 is now needed to compile the CLI. 44 | - All remaining dependencies have been updated to their latest versions. 45 | 46 | ### Bug Fixes 47 | - Several bugs have been fixed that caused incorrect expressions to be generated in rare cases. 48 | 49 | ## grex 1.4.1 (released on 21 Oct 2022) 50 | 51 | ### Changes 52 | - `clap` has been updated to version 4.0. The help output by `grex -h` now looks a little different. 53 | 54 | ### Bug Fixes 55 | - A bug in the grapheme segmentation was fixed that caused test cases which contain backslashes to produce 56 | incorrect regular expressions. 57 | 58 | ## grex 1.4.0 (released on 26 Jul 2022) 59 | 60 | ### Features 61 | - The library can now be compiled to WebAssembly and be used in any JavaScript project. (#82) 62 | - The supported character set for regular expression generation has been updated to the current Unicode Standard 14.0. 63 | - `structopt` has been replaced with `clap` providing much nicer help output for the command-line tool. 64 | 65 | ### Improvements 66 | - The regular expression generation performance has been significantly improved, especially for generating very long 67 | expressions from a large set of test cases. 
This has been accomplished by reducing the number of memory allocations, 68 | removing deprecated code and applying several minor optimizations. 69 | 70 | ### Bug Fixes 71 | - Several bugs have been fixed that caused incorrect expressions to be generated in rare cases. 72 | 73 | ## grex 1.3.0 (released on 15 Sep 2021) 74 | 75 | ### Features 76 | - anchors can now be disabled so that the generated expression can be used as part of a larger one (#30) 77 | - the command-line tool can now be used within Unix pipelines (#45) 78 | 79 | ### Changes 80 | - Additional methods have been added to `RegExpBuilder` in order to replace the enum `Feature` and make the library API more consistent. (#47) 81 | 82 | ### Bug Fixes 83 | - Under rare circumstances, the conversion of repetitions did not work. This has been fixed. (#36) 84 | 85 | ## grex 1.2.0 (released on 28 Mar 2021) 86 | 87 | ### Features 88 | - verbose mode is now supported with the `--verbose` flag to produce regular expressions which are easier to read (#17) 89 | 90 | ## grex 1.1.0 (released on 17 Apr 2020) 91 | 92 | ### Features 93 | - case-insensitive matching regexes are now supported with the `--ignore-case` command-line flag or with `Feature::CaseInsensitivity` in the library (#23) 94 | - non-capturing groups are now the default; capturing groups can be enabled with the `--capture-groups` command-line flag or with `Feature::CapturingGroup` in the library (#15) 95 | - a lower bound for the conversion of repeated substrings can now be set by specifying `--min-repetitions` and `--min-substring-length` or using the library methods `RegExpBuilder.with_minimum_repetitions()` and `RegExpBuilder.with_minimum_substring_length()` (#10) 96 | - test cases can now be passed from a file within the library as well using `RegExpBuilder::from_file()` (#13) 97 | 98 | ### Changes 99 | 100 | - the rules for the conversion of test cases to shorthand character classes have been updated to be compliant to the newest Unicode Standard 
13.0 (#21) 101 | - the dependency on the unmaintained linked-list crate has been removed (#24) 102 | 103 | ### Bug Fixes 104 | 105 | - test cases starting with a hyphen are now correctly parsed on the command-line (#12) 106 | - the common substring detection algorithm now uses optionality expressions where possible instead of redundant union operations (#22) 107 | 108 | ### Test Coverage 109 | - new unit tests, integration tests and property tests have been added 110 | 111 | ## grex 1.0.0 (released on 02 Feb 2020) 112 | 113 | ### Features 114 | - conversion to character classes `\d`, `\D`, `\s`, `\S`, `\w`, `\W` is now supported 115 | - repetition detection now works with arbitrarily nested expressions. Input strings such as `aaabaaab` which were previously converted to `^(aaab){2}$` are now converted to `^(a{3}b){2}$`. 116 | - optional syntax highlighting for the produced regular expressions can now be enabled using the `--colorize` command-line flag or with the library method `RegExpBuilder.with_syntax_highlighting()` 117 | 118 | ### Test Coverage 119 | - new unit tests, integration tests and property tests have been added 120 | 121 | ## grex 0.3.2 (released on 12 Jan 2020) 122 | 123 | ### Test Coverage 124 | - new property tests have been added that revealed new bugs 125 | 126 | ### Bug Fixes 127 | - entire rewrite of the repetition detection algorithm 128 | - the former algorithm produced wrong regular expressions or even panicked for certain test cases 129 | 130 | ## grex 0.3.1 (released on 06 Jan 2020) 131 | 132 | ### Test Coverage 133 | - property tests have been added using the [proptest](https://crates.io/crates/proptest) crate 134 | - big thanks go to [Christophe Biocca](https://github.com/christophebiocca) for pointing me to the concept of property tests in the first place and for writing an initial implementation of these tests 135 | 136 | ### Bug Fixes 137 | - some regular expression specific characters were not escaped correctly in the generated 
expression 138 | - expressions consisting of a single alternation such as `^(abc|xyz)$` were missing the outer parentheses. This caused an erroneous match of strings such as `abc123` or `456xyz` because of precedence rules. 139 | - the created DFA was wrong for repetition conversion in some corner cases. The input `a, aa, aaa, aaaa, aaab` previously returned the expression `^a{1,4}b?$` which erroneously matches `aaaab`. Now the correct expression `^(a{3}b|a{1,4})$` is returned. 140 | 141 | ### Documentation 142 | - some minor documentation updates 143 | 144 | ## grex 0.3.0 (released on 24 Dec 2019) 145 | 146 | ### Features 147 | - *grex* is now also available as a library 148 | - escaping of non-ascii characters is now supported with the `-e` flag 149 | - astral code points can be converted to surrogate pairs with the `--with-surrogates` flag 150 | - repeated non-overlapping substrings can be converted to `{min,max}` quantifier notation using the `-r` flag 151 | 152 | ### Bug Fixes 153 | - many many many bug fixes :-O 154 | 155 | ## grex 0.2.0 (released on 20 Oct 2019) 156 | 157 | ### Features 158 | - character classes are now supported 159 | - input strings can now be read from a text file 160 | 161 | ### Changes 162 | - unicode characters are not escaped anymore by default 163 | - the performance of the DFA minimization algorithm has been improved for large DFAs 164 | - regular expressions are now always surrounded by anchors `^` and `$` 165 | 166 | ### Bug Fixes 167 | - fixed a bug that caused a panic when giving an empty string as input 168 | 169 | ## grex 0.1.0 (released on 06 Oct 2019) 170 | 171 | This is the very first release of *grex*. It aims at simplifying the construction of regular expressions based on matching example input. 
172 | 173 | ### Features 174 | - literals 175 | - detection of common prefixes and suffixes 176 | - alternation using `|` operator 177 | - optionality using `?` quantifier 178 | - concatenation of all of the former 179 | -------------------------------------------------------------------------------- /benches/benchmark.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | use criterion::{criterion_group, criterion_main, Criterion}; 18 | use grex::RegExpBuilder; 19 | use itertools::Itertools; 20 | use std::fs::File; 21 | use std::io::Read; 22 | 23 | fn load_test_cases() -> Vec<String> { 24 | let mut f = File::open("./benches/testcases.txt").expect("Test cases could not be loaded"); 25 | let mut s = String::new(); 26 | f.read_to_string(&mut s).unwrap(); 27 | s.split("\n") 28 | .map(|test_case| test_case.to_string()) 29 | .collect_vec() 30 | } 31 | 32 | fn benchmark_grex_with_default_settings(c: &mut Criterion) { 33 | let test_cases = load_test_cases(); 34 | c.bench_function("grex with default settings", |bencher| { 35 | bencher.iter(|| RegExpBuilder::from(&test_cases).build()) 36 | }); 37 | } 38 | 39 | fn benchmark_grex_with_conversion_of_repetitions(c: &mut Criterion) { 40 | let test_cases = load_test_cases(); 41 | c.bench_function("grex with conversion of repetitions", |bencher| { 42 | bencher.iter(|| { 43 | RegExpBuilder::from(&test_cases) 44 | .with_conversion_of_repetitions() 45 | .build() 46 | }) 47 | }); 48 | } 49 | 50 | fn benchmark_grex_with_conversion_of_digits(c: &mut Criterion) { 51 | let test_cases = load_test_cases(); 52 | c.bench_function("grex with conversion of digits", |bencher| { 53 | bencher.iter(|| { 54 | RegExpBuilder::from(&test_cases) 55 | .with_conversion_of_digits() 56 | .build() 57 | }) 58 | }); 59 | } 60 | 61 | fn benchmark_grex_with_conversion_of_non_digits(c: &mut Criterion) { 62 | let test_cases = load_test_cases(); 63 | c.bench_function("grex with conversion of non-digits", |bencher| { 64 | bencher.iter(|| { 65 | RegExpBuilder::from(&test_cases) 66 | .with_conversion_of_non_digits() 67 | .build() 68 | }) 69 | }); 70 | } 71 | 72 | fn benchmark_grex_with_conversion_of_words(c: &mut Criterion) { 73 | let test_cases = load_test_cases(); 74 | c.bench_function("grex with conversion of words", |bencher| { 75 | bencher.iter(|| { 76 | RegExpBuilder::from(&test_cases) 77 | .with_conversion_of_words() 78 
| .build() 79 | }) 80 | }); 81 | } 82 | 83 | fn benchmark_grex_with_conversion_of_non_words(c: &mut Criterion) { 84 | let test_cases = load_test_cases(); 85 | c.bench_function("grex with conversion of non-words", |bencher| { 86 | bencher.iter(|| { 87 | RegExpBuilder::from(&test_cases) 88 | .with_conversion_of_non_words() 89 | .build() 90 | }) 91 | }); 92 | } 93 | 94 | fn benchmark_grex_with_conversion_of_whitespace(c: &mut Criterion) { 95 | let test_cases = load_test_cases(); 96 | c.bench_function("grex with conversion of whitespace", |bencher| { 97 | bencher.iter(|| { 98 | RegExpBuilder::from(&test_cases) 99 | .with_conversion_of_whitespace() 100 | .build() 101 | }) 102 | }); 103 | } 104 | 105 | fn benchmark_grex_with_conversion_of_non_whitespace(c: &mut Criterion) { 106 | let test_cases = load_test_cases(); 107 | c.bench_function("grex with conversion of non-whitespace", |bencher| { 108 | bencher.iter(|| { 109 | RegExpBuilder::from(&test_cases) 110 | .with_conversion_of_non_whitespace() 111 | .build() 112 | }) 113 | }); 114 | } 115 | 116 | fn benchmark_grex_with_case_insensitive_matching(c: &mut Criterion) { 117 | let test_cases = load_test_cases(); 118 | c.bench_function("grex with case-insensitive matching", |bencher| { 119 | bencher.iter(|| { 120 | RegExpBuilder::from(&test_cases) 121 | .with_case_insensitive_matching() 122 | .build() 123 | }) 124 | }); 125 | } 126 | 127 | fn benchmark_grex_with_verbose_mode(c: &mut Criterion) { 128 | let test_cases = load_test_cases(); 129 | c.bench_function("grex with verbose mode", |bencher| { 130 | bencher.iter(|| RegExpBuilder::from(&test_cases).with_verbose_mode().build()) 131 | }); 132 | } 133 | 134 | criterion_group!( 135 | benches, 136 | benchmark_grex_with_default_settings, 137 | benchmark_grex_with_conversion_of_repetitions, 138 | benchmark_grex_with_conversion_of_digits, 139 | benchmark_grex_with_conversion_of_non_digits, 140 | benchmark_grex_with_conversion_of_words, 141 | 
benchmark_grex_with_conversion_of_non_words, 142 | benchmark_grex_with_conversion_of_whitespace, 143 | benchmark_grex_with_conversion_of_non_whitespace, 144 | benchmark_grex_with_case_insensitive_matching, 145 | benchmark_grex_with_verbose_mode 146 | ); 147 | 148 | criterion_main!(benches); 149 | -------------------------------------------------------------------------------- /benches/testcases.txt: -------------------------------------------------------------------------------- 1 | Rocket Sled 2 | Elysian Heirloom 3 | Kaleb's Favor 4 | Blazing Renegade 5 | Flash Fire 6 | Silence 7 | Talir's Favored 8 | Timekeeper 9 | Oasis Sanctuary 10 | Rolant's Favor 11 | Mantle of Justice 12 | Eilyn's Favor 13 | Thunderbird 14 | Primal Incarnation 15 | Vampire Bat 16 | Vara's Favor 17 | Devouring Shadow 18 | Seat of Order 19 | Seat of Fury 20 | Seat of Impulse 21 | Seat of Vengeance 22 | Seat of Glory 23 | Seat of Progress 24 | Seat of Chaos 25 | Seat of Mystery 26 | Seat of Cunning 27 | Seat of Wisdom 28 | Firebomb 29 | Grenadin 30 | Iron Sword 31 | Magmahound 32 | Wisp 33 | Rhinarc 34 | Sentinel 35 | Owl 36 | Gemblade 37 | Frog 38 | Snowball 39 | Pig 40 | Serpent Hatchling 41 | Carnosaur 42 | Stormdancer 43 | Illusionary Dragon 44 | Spiteling 45 | Vengeful Gargoyle 46 | Muertis, Pale Rider 47 | Occi, Pale Rider 48 | Sangu, Pale Rider 49 | Volan, Pale Rider 50 | Direwood Beast 51 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pemistahl/grex/cb71c10815e2216f4941f0b52154fb5d1fc0a01c/demo.gif -------------------------------------------------------------------------------- /demo.tape: -------------------------------------------------------------------------------- 1 | # demo.gif created with https://github.com/charmbracelet/vhs on macOS 13 (Ventura) 2 | 3 | Require grex 4 | Output demo.gif 5 | 6 | Set Shell zsh 7 | Set 
Theme "Whimsy" 8 | Set Width 1200 9 | Set Height 850 10 | Set TypingSpeed 150ms 11 | 12 | Type "grex -c 'regexes are awesome' 'regexes are awful'" 13 | Sleep 3s 14 | Enter 15 | Sleep 10s 16 | 17 | Up 18 | Left 42 19 | Type " --verbose" 20 | Sleep 3s 21 | Enter 22 | Sleep 15s 23 | Type "clear" 24 | Enter 25 | 26 | Type "grex -c haha HAHAHA" 27 | Sleep 3s 28 | Enter 29 | Sleep 10s 30 | 31 | Up 32 | Left 12 33 | Type " --repetitions" 34 | Sleep 3s 35 | Enter 36 | Sleep 10s 37 | 38 | Up 39 | Left 12 40 | Type " --verbose" 41 | Sleep 3s 42 | Enter 43 | Sleep 15s 44 | 45 | Up 46 | Left 12 47 | Type " --ignore-case" 48 | Sleep 3s 49 | Enter 50 | Sleep 15s 51 | -------------------------------------------------------------------------------- /grex.pyi: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from typing import List 17 | 18 | 19 | class RegExpBuilder: 20 | """This class builds regular expressions from user-provided test cases.""" 21 | 22 | @classmethod 23 | def from_test_cases(cls, test_cases: List[str]) -> "RegExpBuilder": 24 | """Specify the test cases to build the regular expression from. 25 | 26 | The test cases need not be sorted because `RegExpBuilder` sorts them internally. 
27 | 28 | Args: 29 | test_cases (list[str]): The list of test cases 30 | 31 | Raises: 32 | ValueError: if `test_cases` is empty 33 | """ 34 | 35 | def with_conversion_of_digits(self) -> "RegExpBuilder": 36 | """Convert any Unicode decimal digit to character class `\d`. 37 | 38 | This method takes precedence over `with_conversion_of_words` if both are set. 39 | Decimal digits are converted to `\d`, the remaining word characters to `\w`. 40 | 41 | This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 42 | Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`. 43 | """ 44 | 45 | def with_conversion_of_non_digits(self) -> "RegExpBuilder": 46 | """Convert any character which is not a Unicode decimal digit to character class `\D`. 47 | 48 | This method takes precedence over `with_conversion_of_non_words` if both are set. 49 | Non-digits which are also non-word characters are converted to `\D`. 50 | 51 | This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 52 | Non-digits which are also non-space characters are converted to `\D`. 53 | """ 54 | 55 | def with_conversion_of_whitespace(self) -> "RegExpBuilder": 56 | """Convert any Unicode whitespace character to character class `\s`. 57 | 58 | This method takes precedence over `with_conversion_of_non_digits` if both are set. 59 | Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`. 60 | 61 | This method takes precedence over `with_conversion_of_non_words` if both are set. 62 | Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`. 63 | """ 64 | 65 | def with_conversion_of_non_whitespace(self) -> "RegExpBuilder": 66 | """Convert any character which is not a Unicode whitespace character to character class `\S`.""" 67 | 68 | def with_conversion_of_words(self) -> "RegExpBuilder": 69 | """Convert any Unicode word character to character class `\w`. 
70 | 71 | This method takes precedence over `with_conversion_of_non_digits` if both are set. 72 | Word characters are converted to `\w`, the remaining non-digit characters to `\D`. 73 | 74 | This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 75 | Word characters are converted to `\w`, the remaining non-space characters to `\S`. 76 | """ 77 | 78 | def with_conversion_of_non_words(self) -> "RegExpBuilder": 79 | """Convert any character which is not a Unicode word character to character class `\W`. 80 | 81 | This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 82 | Non-words which are also non-space characters are converted to `\W`. 83 | """ 84 | 85 | def with_conversion_of_repetitions(self) -> "RegExpBuilder": 86 | """Detect repeated non-overlapping substrings and convert them to `{min,max}` quantifier notation.""" 87 | 88 | def with_case_insensitive_matching(self) -> "RegExpBuilder": 89 | """Enable case-insensitive matching of test cases so that letters match both upper and lower case.""" 90 | 91 | def with_capturing_groups(self) -> "RegExpBuilder": 92 | """Replace non-capturing groups with capturing ones.""" 93 | 94 | def with_minimum_repetitions(self, quantity: int) -> "RegExpBuilder": 95 | """Specify the minimum quantity of substring repetitions to be converted 96 | if `with_conversion_of_repetitions` is set. 97 | 98 | If the quantity is not explicitly set with this method, a default value of 1 will be used. 99 | 100 | Args: 101 | quantity (int): The minimum quantity of substring repetitions 102 | 103 | Raises: 104 | ValueError: if `quantity` is zero 105 | """ 106 | 107 | def with_minimum_substring_length(self, length: int) -> "RegExpBuilder": 108 | """Specify the minimum length a repeated substring must have in order 109 | to be converted if `with_conversion_of_repetitions` is set. 110 | 111 | If the length is not explicitly set with this method, a default value of 1 will be used. 
112 | 113 | Args: 114 | length (int): The minimum substring length 115 | 116 | Raises: 117 | ValueError: if `length` is zero 118 | """ 119 | 120 | def with_escaping_of_non_ascii_chars(self, use_surrogate_pairs: bool) -> "RegExpBuilder": 121 | """Convert non-ASCII characters to Unicode escape sequences. 122 | 123 | The parameter `use_surrogate_pairs` specifies whether to convert astral 124 | code planes (range `U+010000` to `U+10FFFF`) to surrogate pairs. 125 | 126 | Args: 127 | use_surrogate_pairs (bool): Whether to convert astral code planes to surrogate pairs 128 | """ 129 | 130 | def with_verbose_mode(self) -> "RegExpBuilder": 131 | """Produce a nicer-looking regular expression in verbose mode.""" 132 | 133 | def without_start_anchor(self) -> "RegExpBuilder": 134 | """Remove the caret anchor '^' from the resulting regular expression, 135 | thereby allowing the test cases to be matched even when they do not occur 136 | at the start of a string. 137 | """ 138 | 139 | def without_end_anchor(self) -> "RegExpBuilder": 140 | """Remove the dollar sign anchor '$' from the resulting regular expression, 141 | thereby allowing the test cases to be matched even when they do not occur 142 | at the end of a string. 143 | """ 144 | 145 | def without_anchors(self) -> "RegExpBuilder": 146 | """Remove the caret and dollar sign anchors from the resulting regular expression, 147 | thereby allowing the test cases to be matched even when they occur within a larger 148 | string that contains other content as well.
149 | """ 150 | 151 | def build(self) -> str: 152 | """Build the actual regular expression using the previously given settings.""" 153 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pemistahl/grex/cb71c10815e2216f4941f0b52154fb5d1fc0a01c/logo.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "grex" 3 | version = "1.0.1" 4 | authors = [{name = "Peter M. Stahl", email = "pemistahl@gmail.com"}] 5 | description = "grex generates regular expressions from user-provided test cases." 6 | readme = "README_PYPI.md" 7 | requires-python = ">=3.8" 8 | license = {file = "LICENSE"} 9 | keywords = ["pattern", "regex", "regexp"] 10 | classifiers = [ 11 | "Development Status :: 5 - Production/Stable", 12 | "Intended Audience :: Developers", 13 | "Intended Audience :: Information Technology", 14 | "Intended Audience :: Science/Research", 15 | "License :: OSI Approved :: Apache Software License", 16 | "Programming Language :: Python :: 3.8", 17 | "Programming Language :: Python :: 3.9", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12", 21 | "Programming Language :: Rust", 22 | "Topic :: Software Development :: Libraries :: Python Modules", 23 | "Topic :: Text Processing" 24 | ] 25 | 26 | [project.urls] 27 | homepage = "https://github.com/pemistahl/grex" 28 | repository = "https://github.com/pemistahl/grex" 29 | 30 | [project.optional-dependencies] 31 | test = ["pytest == 8.0.2"] 32 | 33 | [tool.maturin] 34 | no-default-features = true 35 | features = ["pyo3/extension-module", "python"] 36 | 37 | [build-system] 38 | requires = ["maturin>=1.1,<2.0"] 39 | build-backend = "maturin" 40 
| 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | maturin == 1.6.0 2 | pytest == 8.2.2 3 | -------------------------------------------------------------------------------- /src/builder.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | use crate::config::RegExpConfig; 18 | use crate::regexp::RegExp; 19 | use itertools::Itertools; 20 | use std::io::ErrorKind; 21 | use std::path::PathBuf; 22 | 23 | pub(crate) const MISSING_TEST_CASES_MESSAGE: &str = 24 | "No test cases have been provided for regular expression generation"; 25 | 26 | pub(crate) const MINIMUM_REPETITIONS_MESSAGE: &str = 27 | "Quantity of minimum repetitions must be greater than zero"; 28 | 29 | pub(crate) const MINIMUM_SUBSTRING_LENGTH_MESSAGE: &str = 30 | "Minimum substring length must be greater than zero"; 31 | 32 | /// This struct builds regular expressions from user-provided test cases. 
33 | #[derive(Clone)] 34 | #[cfg_attr(feature = "python", pyo3::prelude::pyclass)] 35 | pub struct RegExpBuilder { 36 | pub(crate) test_cases: Vec, 37 | pub(crate) config: RegExpConfig, 38 | } 39 | 40 | impl RegExpBuilder { 41 | /// Specifies the test cases to build the regular expression from. 42 | /// 43 | /// The test cases need not be sorted because `RegExpBuilder` sorts them internally. 44 | /// 45 | /// ⚠ Panics if `test_cases` is empty. 46 | pub fn from>(test_cases: &[T]) -> Self { 47 | if test_cases.is_empty() { 48 | panic!("{}", MISSING_TEST_CASES_MESSAGE); 49 | } 50 | Self { 51 | test_cases: test_cases.iter().cloned().map(|it| it.into()).collect_vec(), 52 | config: RegExpConfig::new(), 53 | } 54 | } 55 | 56 | /// Specifies a text file containing test cases to build the regular expression from. 57 | /// 58 | /// The test cases need not be sorted because `RegExpBuilder` sorts them internally. 59 | /// 60 | /// Each test case needs to be on a separate line. 61 | /// Lines may be ended with either a newline (`\n`) or 62 | /// a carriage return with a line feed (`\r\n`). 63 | /// The final line ending is optional. 
64 | /// 65 | /// ⚠ Panics if: 66 | /// - the file cannot be found 67 | /// - the file's encoding is not valid UTF-8 data 68 | /// - the file cannot be opened because of conflicting permissions 69 | pub fn from_file>(file_path: T) -> Self { 70 | match std::fs::read_to_string(file_path.into()) { 71 | Ok(file_content) => Self { 72 | test_cases: file_content.lines().map(|it| it.to_string()).collect_vec(), 73 | config: RegExpConfig::new(), 74 | }, 75 | Err(error) => match error.kind() { 76 | ErrorKind::NotFound => panic!("The specified file could not be found"), 77 | ErrorKind::InvalidData => { 78 | panic!("The specified file's encoding is not valid UTF-8") 79 | } 80 | ErrorKind::PermissionDenied => { 81 | panic!("Permission denied: The specified file could not be opened") 82 | } 83 | _ => panic!("{}", error), 84 | }, 85 | } 86 | } 87 | 88 | /// Converts any Unicode decimal digit to character class `\d`. 89 | /// 90 | /// This method takes precedence over 91 | /// [`with_conversion_of_words`](Self::with_conversion_of_words) if both are set. 92 | /// Decimal digits are converted to `\d`, the remaining word characters to `\w`. 93 | /// 94 | /// This method takes precedence over 95 | /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 96 | /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`. 97 | pub fn with_conversion_of_digits(&mut self) -> &mut Self { 98 | self.config.is_digit_converted = true; 99 | self 100 | } 101 | 102 | /// Converts any character which is not a Unicode decimal digit to character class `\D`. 103 | /// 104 | /// This method takes precedence over 105 | /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set. 106 | /// Non-digits which are also non-word characters are converted to `\D`. 
107 | /// 108 | /// This method takes precedence over 109 | /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 110 | /// Non-digits which are also non-space characters are converted to `\D`. 111 | pub fn with_conversion_of_non_digits(&mut self) -> &mut Self { 112 | self.config.is_non_digit_converted = true; 113 | self 114 | } 115 | 116 | /// Converts any Unicode whitespace character to character class `\s`. 117 | /// 118 | /// This method takes precedence over 119 | /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set. 120 | /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`. 121 | /// 122 | /// This method takes precedence over 123 | /// [`with_conversion_of_non_words`](Self::with_conversion_of_non_words) if both are set. 124 | /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`. 125 | pub fn with_conversion_of_whitespace(&mut self) -> &mut Self { 126 | self.config.is_space_converted = true; 127 | self 128 | } 129 | 130 | /// Converts any character which is not a Unicode whitespace character to character class `\S`. 131 | pub fn with_conversion_of_non_whitespace(&mut self) -> &mut Self { 132 | self.config.is_non_space_converted = true; 133 | self 134 | } 135 | 136 | /// Converts any Unicode word character to character class `\w`. 137 | /// 138 | /// This method takes precedence over 139 | /// [`with_conversion_of_non_digits`](Self::with_conversion_of_non_digits) if both are set. 140 | /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`. 141 | /// 142 | /// This method takes precedence over 143 | /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 144 | /// Word characters are converted to `\w`, the remaining non-space characters to `\S`. 
145 | pub fn with_conversion_of_words(&mut self) -> &mut Self { 146 | self.config.is_word_converted = true; 147 | self 148 | } 149 | 150 | /// Converts any character which is not a Unicode word character to character class `\W`. 151 | /// 152 | /// This method takes precedence over 153 | /// [`with_conversion_of_non_whitespace`](Self::with_conversion_of_non_whitespace) if both are set. 154 | /// Non-words which are also non-space characters are converted to `\W`. 155 | pub fn with_conversion_of_non_words(&mut self) -> &mut Self { 156 | self.config.is_non_word_converted = true; 157 | self 158 | } 159 | 160 | /// Detects repeated non-overlapping substrings and 161 | /// converts them to `{min,max}` quantifier notation. 162 | pub fn with_conversion_of_repetitions(&mut self) -> &mut Self { 163 | self.config.is_repetition_converted = true; 164 | self 165 | } 166 | 167 | /// Enables case-insensitive matching of test cases 168 | /// so that letters match both upper and lower case. 169 | pub fn with_case_insensitive_matching(&mut self) -> &mut Self { 170 | self.config.is_case_insensitive_matching = true; 171 | self 172 | } 173 | 174 | /// Replaces non-capturing groups with capturing ones. 175 | pub fn with_capturing_groups(&mut self) -> &mut Self { 176 | self.config.is_capturing_group_enabled = true; 177 | self 178 | } 179 | 180 | /// Specifies the minimum quantity of substring repetitions to be converted if 181 | /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set. 182 | /// 183 | /// If the quantity is not explicitly set with this method, a default value of 1 will be used. 184 | /// 185 | /// ⚠ Panics if `quantity` is zero.
186 | pub fn with_minimum_repetitions(&mut self, quantity: u32) -> &mut Self { 187 | if quantity == 0 { 188 | panic!("{}", MINIMUM_REPETITIONS_MESSAGE); 189 | } 190 | self.config.minimum_repetitions = quantity; 191 | self 192 | } 193 | 194 | /// Specifies the minimum length a repeated substring must have in order to be converted if 195 | /// [`with_conversion_of_repetitions`](Self::with_conversion_of_repetitions) is set. 196 | /// 197 | /// If the length is not explicitly set with this method, a default value of 1 will be used. 198 | /// 199 | /// ⚠ Panics if `length` is zero. 200 | pub fn with_minimum_substring_length(&mut self, length: u32) -> &mut Self { 201 | if length == 0 { 202 | panic!("{}", MINIMUM_SUBSTRING_LENGTH_MESSAGE); 203 | } 204 | self.config.minimum_substring_length = length; 205 | self 206 | } 207 | 208 | /// Converts non-ASCII characters to Unicode escape sequences. 209 | /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes 210 | /// (range `U+010000` to `U+10FFFF`) to surrogate pairs. 211 | pub fn with_escaping_of_non_ascii_chars(&mut self, use_surrogate_pairs: bool) -> &mut Self { 212 | self.config.is_non_ascii_char_escaped = true; 213 | self.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs; 214 | self 215 | } 216 | 217 | /// Produces a nicer-looking regular expression in verbose mode. 218 | pub fn with_verbose_mode(&mut self) -> &mut Self { 219 | self.config.is_verbose_mode_enabled = true; 220 | self 221 | } 222 | 223 | /// Removes the caret anchor '^' from the resulting regular 224 | /// expression, thereby allowing the test cases to be matched even when they do not occur 225 | /// at the start of a string.
226 | pub fn without_start_anchor(&mut self) -> &mut Self { 227 | self.config.is_start_anchor_disabled = true; 228 | self 229 | } 230 | 231 | /// Removes the dollar sign anchor '$' from the resulting regular 232 | /// expression, thereby allowing the test cases to be matched even when they do not occur 233 | /// at the end of a string. 234 | pub fn without_end_anchor(&mut self) -> &mut Self { 235 | self.config.is_end_anchor_disabled = true; 236 | self 237 | } 238 | 239 | /// Removes the caret and dollar sign anchors from the resulting 240 | /// regular expression, thereby allowing the test cases to be matched even when they occur 241 | /// within a larger string that contains other content as well. 242 | pub fn without_anchors(&mut self) -> &mut Self { 243 | self.config.is_start_anchor_disabled = true; 244 | self.config.is_end_anchor_disabled = true; 245 | self 246 | } 247 | 248 | /// Provides syntax highlighting for the resulting regular expression. 249 | /// 250 | /// ⚠ This method may only be used if the resulting regular expression is meant to 251 | /// be printed to the console. The regex string representation returned from enabling 252 | /// this setting cannot be fed into the [*regex*](https://crates.io/crates/regex) crate. 253 | #[cfg(feature = "cli")] 254 | #[doc(hidden)] 255 | pub fn with_syntax_highlighting(&mut self) -> &mut Self { 256 | self.config.is_output_colorized = true; 257 | self 258 | } 259 | 260 | /// Builds the actual regular expression using the previously given settings. 261 | pub fn build(&mut self) -> String { 262 | RegExp::from(&mut self.test_cases, &self.config).to_string() 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /src/cluster.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M.
Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | use crate::config::RegExpConfig; 18 | use crate::grapheme::Grapheme; 19 | use crate::unicode_tables::{DECIMAL_NUMBER, WHITE_SPACE, WORD}; 20 | use itertools::Itertools; 21 | use lazy_static::lazy_static; 22 | use std::cmp::Ordering; 23 | use std::collections::HashMap; 24 | use std::ops::Range; 25 | use unic_char_range::CharRange; 26 | use unic_ucd_category::GeneralCategory; 27 | use unicode_segmentation::UnicodeSegmentation; 28 | 29 | #[derive(Clone, Debug, Eq, PartialEq)] 30 | pub struct GraphemeCluster<'a> { 31 | graphemes: Vec, 32 | config: &'a RegExpConfig, 33 | } 34 | 35 | impl<'a> GraphemeCluster<'a> { 36 | pub(crate) fn from(s: &str, config: &'a RegExpConfig) -> Self { 37 | Self { 38 | graphemes: UnicodeSegmentation::graphemes(s, true) 39 | .flat_map(|it| { 40 | let contains_backslash = it.chars().count() == 2 && it.contains('\\'); 41 | let contains_combining_mark_or_unassigned_chars = it.chars().any(|c| { 42 | let category = GeneralCategory::of(c); 43 | category.is_mark() || category.is_other() 44 | }); 45 | 46 | if contains_backslash || contains_combining_mark_or_unassigned_chars { 47 | it.chars() 48 | .map(|c| { 49 | Grapheme::from( 50 | &c.to_string(), 51 | config.is_capturing_group_enabled, 52 | config.is_output_colorized, 53 | config.is_verbose_mode_enabled, 54 | ) 55 | }) 56 | .collect_vec() 57 | } else { 58 | 
vec![Grapheme::from( 59 | it, 60 | config.is_capturing_group_enabled, 61 | config.is_output_colorized, 62 | config.is_verbose_mode_enabled, 63 | )] 64 | } 65 | }) 66 | .collect_vec(), 67 | config, 68 | } 69 | } 70 | 71 | pub(crate) fn from_graphemes(graphemes: Vec, config: &'a RegExpConfig) -> Self { 72 | Self { graphemes, config } 73 | } 74 | 75 | pub(crate) fn new(grapheme: Grapheme, config: &'a RegExpConfig) -> Self { 76 | Self { 77 | graphemes: vec![grapheme], 78 | config, 79 | } 80 | } 81 | 82 | pub(crate) fn convert_to_char_classes(&mut self) { 83 | let is_digit_converted = self.config.is_digit_converted; 84 | let is_non_digit_converted = self.config.is_non_digit_converted; 85 | let is_space_converted = self.config.is_space_converted; 86 | let is_non_space_converted = self.config.is_non_space_converted; 87 | let is_word_converted = self.config.is_word_converted; 88 | let is_non_word_converted = self.config.is_non_word_converted; 89 | 90 | for grapheme in self.graphemes.iter_mut() { 91 | grapheme.chars = grapheme 92 | .chars 93 | .iter() 94 | .map(|it| { 95 | it.chars() 96 | .map(|c| { 97 | if is_digit_converted && is_digit(c) { 98 | "\\d".to_string() 99 | } else if is_word_converted && is_word(c) { 100 | "\\w".to_string() 101 | } else if is_space_converted && is_space(c) { 102 | "\\s".to_string() 103 | } else if is_non_digit_converted && !is_digit(c) { 104 | "\\D".to_string() 105 | } else if is_non_word_converted && !is_word(c) { 106 | "\\W".to_string() 107 | } else if is_non_space_converted && !is_space(c) { 108 | "\\S".to_string() 109 | } else { 110 | c.to_string() 111 | } 112 | }) 113 | .join("") 114 | }) 115 | .collect_vec(); 116 | } 117 | } 118 | 119 | pub(crate) fn convert_repetitions(&mut self) { 120 | let mut repetitions = vec![]; 121 | convert_repetitions(self.graphemes(), repetitions.as_mut(), self.config); 122 | if !repetitions.is_empty() { 123 | self.graphemes = repetitions; 124 | } 125 | } 126 | 127 | pub(crate) fn merge( 128 | first: 
&GraphemeCluster, 129 | second: &GraphemeCluster, 130 | config: &'a RegExpConfig, 131 | ) -> Self { 132 | let mut graphemes = vec![]; 133 | graphemes.extend_from_slice(&first.graphemes); 134 | graphemes.extend_from_slice(&second.graphemes); 135 | Self { graphemes, config } 136 | } 137 | 138 | pub(crate) fn graphemes(&self) -> &Vec { 139 | &self.graphemes 140 | } 141 | 142 | pub(crate) fn graphemes_mut(&mut self) -> &mut Vec { 143 | &mut self.graphemes 144 | } 145 | 146 | pub(crate) fn size(&self) -> usize { 147 | self.graphemes.len() 148 | } 149 | 150 | pub(crate) fn char_count(&self, is_non_ascii_char_escaped: bool) -> usize { 151 | self.graphemes 152 | .iter() 153 | .map(|it| it.char_count(is_non_ascii_char_escaped)) 154 | .sum() 155 | } 156 | 157 | pub(crate) fn is_empty(&self) -> bool { 158 | self.graphemes.is_empty() 159 | } 160 | } 161 | 162 | fn is_digit(c: char) -> bool { 163 | lazy_static! { 164 | static ref VALID_NUMERIC_CHARS: Vec = convert_chars_to_range(DECIMAL_NUMBER); 165 | } 166 | VALID_NUMERIC_CHARS.iter().any(|range| range.contains(c)) 167 | } 168 | 169 | fn is_word(c: char) -> bool { 170 | lazy_static! { 171 | static ref VALID_ALPHANUMERIC_CHARS: Vec = convert_chars_to_range(WORD); 172 | } 173 | VALID_ALPHANUMERIC_CHARS 174 | .iter() 175 | .any(|range| range.contains(c)) 176 | } 177 | 178 | fn is_space(c: char) -> bool { 179 | lazy_static! 
{ 180 | static ref VALID_SPACE_CHARS: Vec = convert_chars_to_range(WHITE_SPACE); 181 | } 182 | VALID_SPACE_CHARS.iter().any(|range| range.contains(c)) 183 | } 184 | 185 | fn convert_repetitions( 186 | graphemes: &[Grapheme], 187 | repetitions: &mut Vec, 188 | config: &RegExpConfig, 189 | ) { 190 | let repeated_substrings = collect_repeated_substrings(graphemes); 191 | let ranges_of_repetitions = create_ranges_of_repetitions(repeated_substrings, config); 192 | let coalesced_repetitions = coalesce_repetitions(ranges_of_repetitions); 193 | replace_graphemes_with_repetitions(coalesced_repetitions, graphemes, repetitions, config) 194 | } 195 | 196 | fn collect_repeated_substrings(graphemes: &[Grapheme]) -> HashMap, Vec> { 197 | let mut map = HashMap::new(); 198 | 199 | for i in 0..graphemes.len() { 200 | let suffix = &graphemes[i..]; 201 | for j in 1..=graphemes.len() / 2 { 202 | if suffix.len() >= j { 203 | let prefix = suffix[..j].iter().map(|it| it.value()).collect_vec(); 204 | let indices = map.entry(prefix).or_insert_with(Vec::new); 205 | indices.push(i); 206 | } 207 | } 208 | } 209 | map 210 | } 211 | 212 | fn create_ranges_of_repetitions( 213 | repeated_substrings: HashMap, Vec>, 214 | config: &RegExpConfig, 215 | ) -> Vec<(Range, Vec)> { 216 | let mut repetitions = Vec::<(Range, Vec)>::new(); 217 | 218 | for (prefix_length, group) in &repeated_substrings 219 | .iter() 220 | .filter(|&(prefix, indices)| { 221 | indices 222 | .iter() 223 | .tuple_windows() 224 | .all(|(first, second)| (second - first) >= prefix.len()) 225 | }) 226 | .sorted_by_key(|&(prefix, _)| prefix.len()) 227 | .rev() 228 | .chunk_by(|&(prefix, _)| prefix.len()) 229 | { 230 | for (prefix, indices) in group.sorted_by_key(|&(_, indices)| indices[0]) { 231 | indices 232 | .iter() 233 | .map(|it| *it..it + prefix_length) 234 | .coalesce(|x, y| { 235 | if x.end == y.start { 236 | Ok(x.start..y.end) 237 | } else { 238 | Err((x, y)) 239 | } 240 | }) 241 | .filter(|range| { 242 | let count = 
((range.end - range.start) / prefix_length) as u32; 243 | count > config.minimum_repetitions 244 | }) 245 | .for_each(|range| repetitions.push((range, prefix.clone()))); 246 | } 247 | } 248 | repetitions 249 | } 250 | 251 | fn coalesce_repetitions( 252 | ranges_of_repetitions: Vec<(Range, Vec)>, 253 | ) -> Vec<(Range, Vec)> { 254 | ranges_of_repetitions 255 | .iter() 256 | .sorted_by(|&(first_range, _), &(second_range, _)| { 257 | match second_range.end.cmp(&first_range.end) { 258 | Ordering::Equal => first_range.start.cmp(&second_range.start), 259 | other => other, 260 | } 261 | }) 262 | .coalesce(|first_tup, second_tup| { 263 | let first_range = &first_tup.0; 264 | let second_range = &second_tup.0; 265 | 266 | if (first_range.contains(&second_range.start) 267 | || first_range.contains(&second_range.end)) 268 | && second_range.end != first_range.start 269 | { 270 | Ok(first_tup) 271 | } else { 272 | Err((first_tup, second_tup)) 273 | } 274 | }) 275 | .map(|(range, substr)| (range.clone(), substr.clone())) 276 | .collect_vec() 277 | } 278 | 279 | fn replace_graphemes_with_repetitions( 280 | coalesced_repetitions: Vec<(Range, Vec)>, 281 | graphemes: &[Grapheme], 282 | repetitions: &mut Vec, 283 | config: &RegExpConfig, 284 | ) { 285 | if coalesced_repetitions.is_empty() { 286 | return; 287 | } 288 | 289 | for grapheme in graphemes { 290 | repetitions.push(grapheme.clone()); 291 | } 292 | 293 | for (range, substr) in coalesced_repetitions.iter() { 294 | if range.end > repetitions.len() { 295 | break; 296 | } 297 | 298 | let count = ((range.end - range.start) / substr.len()) as u32; 299 | 300 | if substr.len() < config.minimum_substring_length as usize { 301 | continue; 302 | } 303 | 304 | repetitions.splice( 305 | range.clone(), 306 | [Grapheme::new( 307 | substr.clone(), 308 | count, 309 | count, 310 | config.is_capturing_group_enabled, 311 | config.is_output_colorized, 312 | config.is_verbose_mode_enabled, 313 | )] 314 | .iter() 315 | .cloned(), 316 | ); 317 | } 
318 | 319 | for new_grapheme in repetitions.iter_mut() { 320 | convert_repetitions( 321 | &new_grapheme 322 | .chars 323 | .iter() 324 | .map(|it| { 325 | Grapheme::from( 326 | it, 327 | config.is_capturing_group_enabled, 328 | config.is_output_colorized, 329 | config.is_verbose_mode_enabled, 330 | ) 331 | }) 332 | .collect_vec(), 333 | new_grapheme.repetitions.as_mut(), 334 | config, 335 | ); 336 | } 337 | } 338 | 339 | fn convert_chars_to_range(chars: &[(char, char)]) -> Vec { 340 | chars 341 | .iter() 342 | .map(|&(start, end)| CharRange::closed(start, end)) 343 | .collect_vec() 344 | } 345 | -------------------------------------------------------------------------------- /src/component.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | use crate::quantifier::Quantifier; 18 | use std::fmt::{Display, Formatter, Result}; 19 | 20 | pub(crate) enum Component { 21 | CapturedLeftParenthesis, 22 | CapturedParenthesizedExpression(String, bool, bool), 23 | Caret(bool), 24 | CharClass(String), 25 | DollarSign(bool), 26 | Hyphen, 27 | IgnoreCaseFlag, 28 | IgnoreCaseAndVerboseModeFlag, 29 | LeftBracket, 30 | Pipe, 31 | Quantifier(Quantifier, bool), 32 | Repetition(u32, bool), 33 | RepetitionRange(u32, u32, bool), 34 | RightBracket, 35 | RightParenthesis, 36 | UncapturedLeftParenthesis, 37 | UncapturedParenthesizedExpression(String, bool, bool), 38 | VerboseModeFlag, 39 | } 40 | 41 | impl Component { 42 | pub(crate) fn to_repr(&self, is_output_colorized: bool) -> String { 43 | match is_output_colorized { 44 | true => self.to_colored_string(false), 45 | false => self.to_string(), 46 | } 47 | } 48 | 49 | pub(crate) fn to_colored_string(&self, is_escaped: bool) -> String { 50 | match self { 51 | Component::CapturedLeftParenthesis => Self::green_bold(&self.to_string(), is_escaped), 52 | Component::CapturedParenthesizedExpression( 53 | expr, 54 | is_verbose_mode_enabled, 55 | has_final_line_break, 56 | ) => { 57 | if *is_verbose_mode_enabled { 58 | if *has_final_line_break { 59 | format!( 60 | "\n{}\n{}\n{}\n", 61 | Component::CapturedLeftParenthesis.to_colored_string(is_escaped), 62 | expr, 63 | Component::RightParenthesis.to_colored_string(is_escaped) 64 | ) 65 | } else { 66 | format!( 67 | "\n{}\n{}\n{}", 68 | Component::CapturedLeftParenthesis.to_colored_string(is_escaped), 69 | expr, 70 | Component::RightParenthesis.to_colored_string(is_escaped) 71 | ) 72 | } 73 | } else { 74 | format!( 75 | "{}{}{}", 76 | Component::CapturedLeftParenthesis.to_colored_string(is_escaped), 77 | expr, 78 | Component::RightParenthesis.to_colored_string(is_escaped) 79 | ) 80 | } 81 | } 82 | Component::Caret(is_verbose_mode_enabled) => { 83 | if *is_verbose_mode_enabled { 84 | format!( 85 | "{}\n", 86 | 
Self::yellow_bold(&Component::Caret(false).to_string(), is_escaped) 87 | ) 88 | } else { 89 | Self::yellow_bold(&self.to_string(), is_escaped) 90 | } 91 | } 92 | Component::CharClass(value) => Self::black_on_bright_yellow(value, is_escaped), 93 | Component::DollarSign(is_verbose_mode_enabled) => { 94 | if *is_verbose_mode_enabled { 95 | format!( 96 | "\n{}", 97 | Self::yellow_bold(&Component::DollarSign(false).to_string(), is_escaped) 98 | ) 99 | } else { 100 | Self::yellow_bold(&self.to_string(), is_escaped) 101 | } 102 | } 103 | Component::Hyphen => Self::cyan_bold(&self.to_string(), is_escaped), 104 | Component::IgnoreCaseFlag => { 105 | Self::bright_yellow_on_black(&self.to_string(), is_escaped) 106 | } 107 | Component::IgnoreCaseAndVerboseModeFlag => { 108 | format!("{}\n", Self::bright_yellow_on_black("(?ix)", is_escaped)) 109 | } 110 | Component::LeftBracket => Self::cyan_bold(&self.to_string(), is_escaped), 111 | Component::Pipe => Self::red_bold(&self.to_string(), is_escaped), 112 | Component::Quantifier(quantifier, is_verbose_mode_enabled) => { 113 | if *is_verbose_mode_enabled { 114 | format!( 115 | "{}\n", 116 | Self::purple_bold(&quantifier.to_string(), is_escaped) 117 | ) 118 | } else { 119 | Self::purple_bold(&self.to_string(), is_escaped) 120 | } 121 | } 122 | Component::Repetition(num, is_verbose_mode_enabled) => { 123 | if *is_verbose_mode_enabled { 124 | format!( 125 | "{}\n", 126 | Self::white_on_bright_blue( 127 | &Component::Repetition(*num, false).to_string(), 128 | is_escaped 129 | ) 130 | ) 131 | } else { 132 | Self::white_on_bright_blue(&self.to_string(), is_escaped) 133 | } 134 | } 135 | Component::RepetitionRange(min, max, is_verbose_mode_enabled) => { 136 | if *is_verbose_mode_enabled { 137 | format!( 138 | "{}\n", 139 | Self::white_on_bright_blue( 140 | &Component::RepetitionRange(*min, *max, false).to_string(), 141 | is_escaped 142 | ) 143 | ) 144 | } else { 145 | Self::white_on_bright_blue(&self.to_string(), is_escaped) 146 | } 147 | 
} 148 | Component::RightBracket => Self::cyan_bold(&self.to_string(), is_escaped), 149 | Component::RightParenthesis => Self::green_bold(&self.to_string(), is_escaped), 150 | Component::UncapturedLeftParenthesis => Self::green_bold(&self.to_string(), is_escaped), 151 | Component::UncapturedParenthesizedExpression( 152 | expr, 153 | is_verbose_mode_enabled, 154 | has_final_line_break, 155 | ) => { 156 | if *is_verbose_mode_enabled { 157 | if *has_final_line_break { 158 | format!( 159 | "\n{}\n{}\n{}\n", 160 | Component::UncapturedLeftParenthesis.to_colored_string(is_escaped), 161 | expr, 162 | Component::RightParenthesis.to_colored_string(is_escaped) 163 | ) 164 | } else { 165 | format!( 166 | "\n{}\n{}\n{}", 167 | Component::UncapturedLeftParenthesis.to_colored_string(is_escaped), 168 | expr, 169 | Component::RightParenthesis.to_colored_string(is_escaped) 170 | ) 171 | } 172 | } else { 173 | format!( 174 | "{}{}{}", 175 | Component::UncapturedLeftParenthesis.to_colored_string(is_escaped), 176 | expr, 177 | Component::RightParenthesis.to_colored_string(is_escaped) 178 | ) 179 | } 180 | } 181 | Component::VerboseModeFlag => { 182 | format!("{}\n", Self::bright_yellow_on_black("(?x)", is_escaped)) 183 | } 184 | } 185 | } 186 | 187 | fn black_on_bright_yellow(value: &str, is_escaped: bool) -> String { 188 | Self::color_code("103;30", value, is_escaped) 189 | } 190 | 191 | fn bright_yellow_on_black(value: &str, is_escaped: bool) -> String { 192 | Self::color_code("40;93", value, is_escaped) 193 | } 194 | 195 | fn cyan_bold(value: &str, is_escaped: bool) -> String { 196 | Self::color_code("1;36", value, is_escaped) 197 | } 198 | 199 | fn green_bold(value: &str, is_escaped: bool) -> String { 200 | Self::color_code("1;32", value, is_escaped) 201 | } 202 | 203 | fn purple_bold(value: &str, is_escaped: bool) -> String { 204 | Self::color_code("1;35", value, is_escaped) 205 | } 206 | 207 | fn red_bold(value: &str, is_escaped: bool) -> String { 208 | Self::color_code("1;31", 
value, is_escaped) 209 | } 210 | 211 | fn white_on_bright_blue(value: &str, is_escaped: bool) -> String { 212 | Self::color_code("104;37", value, is_escaped) 213 | } 214 | 215 | fn yellow_bold(value: &str, is_escaped: bool) -> String { 216 | Self::color_code("1;33", value, is_escaped) 217 | } 218 | 219 | fn color_code(code: &str, value: &str, is_escaped: bool) -> String { 220 | if is_escaped { 221 | format!("\u{1b}\\[{}m\\{}\u{1b}\\[0m", code, value) 222 | } else { 223 | format!("\u{1b}[{}m{}\u{1b}[0m", code, value) 224 | } 225 | } 226 | } 227 | 228 | impl Display for Component { 229 | fn fmt(&self, f: &mut Formatter<'_>) -> Result { 230 | write!( 231 | f, 232 | "{}", 233 | match self { 234 | Component::CapturedLeftParenthesis => "(".to_string(), 235 | Component::CapturedParenthesizedExpression( 236 | expr, 237 | is_verbose_mode_enabled, 238 | has_final_line_break, 239 | ) => 240 | if *is_verbose_mode_enabled { 241 | if *has_final_line_break { 242 | format!( 243 | "\n{}\n{}\n{}\n", 244 | Component::CapturedLeftParenthesis, 245 | expr, 246 | Component::RightParenthesis 247 | ) 248 | } else { 249 | format!( 250 | "\n{}\n{}\n{}", 251 | Component::CapturedLeftParenthesis, 252 | expr, 253 | Component::RightParenthesis 254 | ) 255 | } 256 | } else { 257 | format!( 258 | "{}{}{}", 259 | Component::CapturedLeftParenthesis, 260 | expr, 261 | Component::RightParenthesis 262 | ) 263 | }, 264 | Component::Caret(is_verbose_mode_enabled) => 265 | if *is_verbose_mode_enabled { 266 | "^\n".to_string() 267 | } else { 268 | "^".to_string() 269 | }, 270 | Component::CharClass(value) => value.clone(), 271 | Component::DollarSign(is_verbose_mode_enabled) => 272 | if *is_verbose_mode_enabled { 273 | "\n$".to_string() 274 | } else { 275 | "$".to_string() 276 | }, 277 | Component::Hyphen => "-".to_string(), 278 | Component::IgnoreCaseFlag => "(?i)".to_string(), 279 | Component::IgnoreCaseAndVerboseModeFlag => "(?ix)\n".to_string(), 280 | Component::LeftBracket => "[".to_string(), 281 | 
Component::Pipe => "|".to_string(), 282 | Component::Quantifier(quantifier, is_verbose_mode_enabled) => 283 | if *is_verbose_mode_enabled { 284 | format!("{}\n", quantifier) 285 | } else { 286 | quantifier.to_string() 287 | }, 288 | Component::Repetition(num, is_verbose_mode_enabled) => { 289 | if *num == 0 && *is_verbose_mode_enabled { 290 | "{\\d+\\}\n".to_string() 291 | } else if *num == 0 { 292 | "{\\d+\\}".to_string() 293 | } else if *is_verbose_mode_enabled { 294 | format!("{{{}}}\n", num) 295 | } else { 296 | format!("{{{}}}", num) 297 | } 298 | } 299 | Component::RepetitionRange(min, max, is_verbose_mode_enabled) => { 300 | if *min == 0 && *max == 0 && *is_verbose_mode_enabled { 301 | "{\\d+,\\d+\\}\n".to_string() 302 | } else if *min == 0 && *max == 0 { 303 | "{\\d+,\\d+\\}".to_string() 304 | } else if *is_verbose_mode_enabled { 305 | format!("{{{},{}}}\n", min, max) 306 | } else { 307 | format!("{{{},{}}}", min, max) 308 | } 309 | } 310 | Component::RightBracket => "]".to_string(), 311 | Component::RightParenthesis => ")".to_string(), 312 | Component::UncapturedLeftParenthesis => "(?:".to_string(), 313 | Component::UncapturedParenthesizedExpression( 314 | expr, 315 | is_verbose_mode_enabled, 316 | has_final_line_break, 317 | ) => { 318 | if *is_verbose_mode_enabled { 319 | if *has_final_line_break { 320 | format!( 321 | "\n{}\n{}\n{}\n", 322 | Component::UncapturedLeftParenthesis, 323 | expr, 324 | Component::RightParenthesis 325 | ) 326 | } else { 327 | format!( 328 | "\n{}\n{}\n{}", 329 | Component::UncapturedLeftParenthesis, 330 | expr, 331 | Component::RightParenthesis 332 | ) 333 | } 334 | } else { 335 | format!( 336 | "{}{}{}", 337 | Component::UncapturedLeftParenthesis, 338 | expr, 339 | Component::RightParenthesis 340 | ) 341 | } 342 | } 343 | Component::VerboseModeFlag => "(?x)\n".to_string(), 344 | } 345 | ) 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /src/config.rs: 
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/// Crate-internal bundle of the switches and thresholds that steer regular
/// expression generation.
///
/// The derived `Hash`, `Ord` and `PartialOrd` implementations depend on the
/// declaration order of the fields, so that order must not be changed casually.
#[derive(Clone, Debug, Hash, Ord, PartialOrd, Eq, PartialEq)]
pub struct RegExpConfig {
    pub(crate) minimum_repetitions: u32,
    pub(crate) minimum_substring_length: u32,
    pub(crate) is_digit_converted: bool,
    pub(crate) is_non_digit_converted: bool,
    pub(crate) is_space_converted: bool,
    pub(crate) is_non_space_converted: bool,
    pub(crate) is_word_converted: bool,
    pub(crate) is_non_word_converted: bool,
    pub(crate) is_repetition_converted: bool,
    pub(crate) is_case_insensitive_matching: bool,
    pub(crate) is_capturing_group_enabled: bool,
    pub(crate) is_non_ascii_char_escaped: bool,
    pub(crate) is_astral_code_point_converted_to_surrogate: bool,
    pub(crate) is_verbose_mode_enabled: bool,
    pub(crate) is_start_anchor_disabled: bool,
    pub(crate) is_end_anchor_disabled: bool,
    pub(crate) is_output_colorized: bool,
}

impl RegExpConfig {
    /// Returns the baseline configuration: both minimums are `1` and every
    /// boolean switch is turned off.
    pub(crate) fn new() -> Self {
        Self {
            minimum_repetitions: 1,
            minimum_substring_length: 1,
            is_digit_converted: false,
            is_non_digit_converted: false,
            is_space_converted: false,
            is_non_space_converted: false,
            is_word_converted: false,
            is_non_word_converted: false,
            is_repetition_converted: false,
            is_case_insensitive_matching: false,
            is_capturing_group_enabled: false,
            is_non_ascii_char_escaped: false,
            is_astral_code_point_converted_to_surrogate: false,
            is_verbose_mode_enabled: false,
            is_start_anchor_disabled: false,
            is_end_anchor_disabled: false,
            is_output_colorized: false,
        }
    }

    /// Reports whether at least one of the eight switches listed below is on:
    /// the six character-class conversions, case-insensitive matching, or
    /// capturing groups.
    pub(crate) fn is_char_class_feature_enabled(&self) -> bool {
        let relevant_switches = [
            self.is_digit_converted,
            self.is_non_digit_converted,
            self.is_space_converted,
            self.is_non_space_converted,
            self.is_word_converted,
            self.is_non_word_converted,
            self.is_case_insensitive_matching,
            self.is_capturing_group_enabled,
        ];
        relevant_switches.iter().any(|&is_on| is_on)
    }
}

--------------------------------------------------------------------------------
/src/dfa.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | use crate::cluster::GraphemeCluster; 18 | use crate::config::RegExpConfig; 19 | use crate::grapheme::Grapheme; 20 | use itertools::Itertools; 21 | use petgraph::graph::NodeIndex; 22 | use petgraph::stable_graph::{Edges, StableGraph}; 23 | use petgraph::visit::Dfs; 24 | use petgraph::{Directed, Direction}; 25 | use std::cmp::{max, min}; 26 | use std::collections::{BTreeSet, HashMap, HashSet}; 27 | 28 | type State = NodeIndex; 29 | type StateLabel = String; 30 | type EdgeLabel = Grapheme; 31 | 32 | pub struct Dfa<'a> { 33 | alphabet: BTreeSet, 34 | graph: StableGraph, 35 | initial_state: State, 36 | final_state_indices: HashSet, 37 | config: &'a RegExpConfig, 38 | } 39 | 40 | impl<'a> Dfa<'a> { 41 | pub(crate) fn from( 42 | grapheme_clusters: &[GraphemeCluster], 43 | is_minimized: bool, 44 | config: &'a RegExpConfig, 45 | ) -> Self { 46 | let mut dfa = Self::new(config); 47 | for cluster in grapheme_clusters { 48 | dfa.insert(cluster); 49 | } 50 | if is_minimized { 51 | dfa.minimize(); 52 | } 53 | dfa 54 | } 55 | 56 | pub(crate) fn state_count(&self) -> usize { 57 | self.graph.node_count() 58 | } 59 | 60 | pub(crate) fn states_in_depth_first_order(&self) -> Vec { 61 | let mut depth_first_search = Dfs::new(&self.graph, self.initial_state); 62 | let mut states = vec![]; 63 | while let Some(state) = depth_first_search.next(&self.graph) { 64 | states.push(state); 65 | } 66 | states 67 | } 68 | 69 | pub(crate) fn outgoing_edges(&self, state: State) -> Edges { 70 | self.graph.edges_directed(state, Direction::Outgoing) 71 | } 72 | 73 | pub(crate) fn is_final_state(&self, state: State) -> bool { 74 | self.final_state_indices.contains(&state.index()) 75 | } 76 | 77 | fn new(config: &'a RegExpConfig) -> Self { 78 | let mut graph = StableGraph::new(); 79 | let initial_state = graph.add_node("".to_string()); 80 | Self { 81 | alphabet: BTreeSet::new(), 82 | graph, 83 | initial_state, 84 | final_state_indices: HashSet::new(), 85 | config, 86 | } 87 | } 88 | 89 | 
fn insert(&mut self, cluster: &GraphemeCluster) { 90 | let mut current_state = self.initial_state; 91 | 92 | for grapheme in cluster.graphemes() { 93 | self.alphabet.insert(grapheme.clone()); 94 | current_state = self.return_next_state(current_state, grapheme); 95 | } 96 | self.final_state_indices.insert(current_state.index()); 97 | } 98 | 99 | fn return_next_state(&mut self, current_state: State, edge_label: &Grapheme) -> State { 100 | match self.find_next_state(current_state, edge_label) { 101 | Some(next_state) => next_state, 102 | None => self.add_new_state(current_state, edge_label), 103 | } 104 | } 105 | 106 | fn find_next_state(&mut self, current_state: State, grapheme: &Grapheme) -> Option { 107 | for next_state in self.graph.neighbors(current_state) { 108 | let edge_idx = self.graph.find_edge(current_state, next_state).unwrap(); 109 | let current_grapheme = self.graph.edge_weight(edge_idx).unwrap(); 110 | 111 | if current_grapheme.value() != grapheme.value() { 112 | continue; 113 | } 114 | 115 | if current_grapheme.maximum() == grapheme.maximum() - 1 { 116 | let min = min(current_grapheme.minimum(), grapheme.minimum()); 117 | let max = max(current_grapheme.maximum(), grapheme.maximum()); 118 | let new_grapheme = Grapheme::new( 119 | grapheme.chars().clone(), 120 | min, 121 | max, 122 | self.config.is_capturing_group_enabled, 123 | self.config.is_output_colorized, 124 | self.config.is_verbose_mode_enabled, 125 | ); 126 | self.graph 127 | .update_edge(current_state, next_state, new_grapheme); 128 | return Some(next_state); 129 | } else if current_grapheme.maximum() == grapheme.maximum() { 130 | return Some(next_state); 131 | } 132 | } 133 | None 134 | } 135 | 136 | fn add_new_state(&mut self, current_state: State, edge_label: &Grapheme) -> State { 137 | let next_state = self.graph.add_node("".to_string()); 138 | self.graph 139 | .add_edge(current_state, next_state, edge_label.clone()); 140 | next_state 141 | } 142 | 143 | 
/// Merges equivalent states by partition refinement and rebuilds the graph
/// from the resulting equivalence classes.
///
/// `p` is the current partition of the states and `w` the work list of sets
/// that still have to be used as splitters. The single-letter names are kept
/// because they are the conventional ones for this algorithm, hence the lint
/// exemption below.
#[allow(clippy::many_single_char_names)]
fn minimize(&mut self) {
    let mut p = self.get_initial_partition();
    let mut w = p.iter().cloned().collect_vec();

    while !w.is_empty() {
        let a = w.remove(0);

        for edge_label in self.alphabet.iter() {
            let x = self.get_parent_states(&a, edge_label);
            let mut replacements = vec![];
            let mut is_replacement_needed = true;
            let mut start_idx = 0;

            while is_replacement_needed {
                for (idx, y) in p.iter().enumerate().skip(start_idx) {
                    // `y` is only split when `x` cuts it into two non-empty parts.
                    if x.is_disjoint(y) || y.is_subset(&x) {
                        is_replacement_needed = false;
                        continue;
                    }

                    let i = x.intersection(y).copied().collect::<HashSet<State>>();
                    let d = y.difference(&x).copied().collect::<HashSet<State>>();

                    is_replacement_needed = true;
                    start_idx = idx;

                    replacements.push((y.clone(), i, d));

                    break;
                }

                if is_replacement_needed {
                    let (_, i, d) = replacements.last().unwrap();

                    // Replace `y` in place by its two parts and rescan from there.
                    p.remove(start_idx);
                    p.insert(start_idx, i.clone());
                    p.insert(start_idx + 1, d.clone());
                }
            }

            for (y, i, d) in replacements {
                if let Some(idx) = w.iter().position(|it| it == &y) {
                    w.remove(idx);
                    w.push(i);
                    w.push(d);
                } else if i.len() <= d.len() {
                    w.push(i);
                } else {
                    w.push(d);
                }
            }
        }
    }

    self.recreate_graph(p.iter().filter(|&it| !it.is_empty()).collect_vec());
}

/// Splits all states into two sets and returns them as
/// `[non-final states, final states]`.
///
/// The order matters: it determines the order in which the states of the
/// rebuilt graph are created.
fn get_initial_partition(&self) -> Vec<HashSet<State>> {
    let (non_final_states, final_states): (HashSet<State>, HashSet<State>) = self
        .graph
        .node_indices()
        .partition(|&state| !self.final_state_indices.contains(&state.index()));

    vec![non_final_states, final_states]
}

/// Collects, for every state in `a`, at most one direct predecessor whose
/// connecting grapheme has the same `value()` as `label` and shares either its
/// maximum or its minimum with `label`.
fn get_parent_states(&self, a: &HashSet<State>, label: &Grapheme) -> HashSet<State> {
    let mut x = HashSet::new();

    for &state in a {
        let direct_parent_states = self.graph.neighbors_directed(state, Direction::Incoming);
        for parent_state in direct_parent_states {
            let edge = self.graph.find_edge(parent_state, state).unwrap();
            let grapheme = self.graph.edge_weight(edge).unwrap();
            if grapheme.value() == label.value()
                && (grapheme.maximum() == label.maximum()
                    || grapheme.minimum() == label.minimum())
            {
                x.insert(parent_state);
                break;
            }
        }
    }
    x
}

/// Replaces the graph by one that has a single state per equivalence class in
/// `p`.
///
/// The outgoing transitions of each new state are copied from one
/// representative of its class. A new state is marked final when it is the
/// target of a copied transition whose old target was final.
// NOTE(review): a final state without any incoming transition would not be
// marked final here — confirm whether that situation can occur.
fn recreate_graph(&mut self, p: Vec<&HashSet<State>>) {
    let mut graph = StableGraph::<StateLabel, EdgeLabel>::new();
    let mut final_state_indices = HashSet::new();
    let mut state_mappings = HashMap::new();
    let mut new_initial_state: Option<NodeIndex> = None;

    for equivalence_class in p.iter() {
        let new_state = graph.add_node(String::new());

        for old_state in equivalence_class.iter() {
            if self.initial_state == *old_state {
                new_initial_state = Some(new_state);
            }
            state_mappings.insert(*old_state, new_state);
        }
    }

    for equivalence_class in p.iter() {
        let old_source_state = *equivalence_class.iter().next().unwrap();
        let new_source_state = state_mappings.get(&old_source_state).unwrap();

        for old_target_state in self.graph.neighbors(old_source_state) {
            let edge = self
                .graph
                .find_edge(old_source_state, old_target_state)
                .unwrap();

            let grapheme = self.graph.edge_weight(edge).unwrap().clone();
            let new_target_state = state_mappings.get(&old_target_state).unwrap();

            graph.add_edge(*new_source_state, *new_target_state, grapheme);

            if self.final_state_indices.contains(&old_target_state.index()) {
                final_state_indices.insert(new_target_state.index());
            }
        }
    }
    self.initial_state = new_initial_state.unwrap();
    self.final_state_indices = final_state_indices;
    self.graph = graph;
}
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_state_count() {
        let config = RegExpConfig::new();
        let mut dfa = Dfa::new(&config);
        assert_eq!(dfa.state_count(), 1);

        dfa.insert(&GraphemeCluster::from("abcd", &RegExpConfig::new()));
        assert_eq!(dfa.state_count(), 5);
    }

    #[test]
    fn test_is_final_state() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[GraphemeCluster::from("abcd", &RegExpConfig::new())],
            true,
            &config,
        );

        let intermediate_state = State::new(3);
        assert_eq!(dfa.is_final_state(intermediate_state), false);

        let final_state = State::new(4);
        assert_eq!(dfa.is_final_state(final_state), true);
    }

    #[test]
    fn test_outgoing_edges() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[
                GraphemeCluster::from("abcd", &RegExpConfig::new()),
                GraphemeCluster::from("abxd", &RegExpConfig::new()),
            ],
            true,
            &config,
        );
        let state = State::new(2);
        let mut edges = dfa.outgoing_edges(state);

        let first_edge = edges.next();
        assert!(first_edge.is_some());
        assert_eq!(
            first_edge.unwrap().weight(),
            &Grapheme::from("c", false, false, false)
        );

        let second_edge = edges.next();
        assert!(second_edge.is_some());
        assert_eq!(
            second_edge.unwrap().weight(),
            &Grapheme::from("x", false, false, false)
        );

        let third_edge = edges.next();
        assert!(third_edge.is_none());
    }

    #[test]
    fn test_states_in_depth_first_order() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[
                GraphemeCluster::from("abcd", &RegExpConfig::new()),
                GraphemeCluster::from("axyz", &RegExpConfig::new()),
            ],
            true,
            &config,
        );
        let states = dfa.states_in_depth_first_order();
        assert_eq!(states.len(), 7);

        let first_state = states.get(0).unwrap();
        let mut edges = dfa.outgoing_edges(*first_state);
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("a", false, false, false)
        );
        assert!(edges.next().is_none());

        let second_state = states.get(1).unwrap();
        edges = dfa.outgoing_edges(*second_state);
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("b", false, false, false)
        );
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("x", false, false, false)
        );
        assert!(edges.next().is_none());

        let third_state = states.get(2).unwrap();
        edges = dfa.outgoing_edges(*third_state);
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("y", false, false, false)
        );
        assert!(edges.next().is_none());

        let fourth_state = states.get(3).unwrap();
        edges = dfa.outgoing_edges(*fourth_state);
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("z", false, false, false)
        );
        assert!(edges.next().is_none());

        let fifth_state = states.get(4).unwrap();
        edges = dfa.outgoing_edges(*fifth_state);
        assert!(edges.next().is_none());

        let sixth_state = states.get(5).unwrap();
        edges = dfa.outgoing_edges(*sixth_state);
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("c", false, false, false)
        );
        assert!(edges.next().is_none());

        let seventh_state = states.get(6).unwrap();
        edges = dfa.outgoing_edges(*seventh_state);
        assert_eq!(
            edges.next().unwrap().weight(),
            &Grapheme::from("d", false, false, false)
        );
        assert!(edges.next().is_none());
    }

    #[test]
    fn test_minimization_algorithm() {
        let config = RegExpConfig::new();
        let mut dfa = Dfa::new(&config);
        assert_eq!(dfa.graph.node_count(), 1);
        assert_eq!(dfa.graph.edge_count(), 0);

        dfa.insert(&GraphemeCluster::from("abcd", &RegExpConfig::new()));
        assert_eq!(dfa.graph.node_count(), 5);
        assert_eq!(dfa.graph.edge_count(), 4);

        dfa.insert(&GraphemeCluster::from("abxd", &RegExpConfig::new()));
        assert_eq!(dfa.graph.node_count(), 7);
        assert_eq!(dfa.graph.edge_count(), 6);

        dfa.minimize();
        assert_eq!(dfa.graph.node_count(), 5);
        assert_eq!(dfa.graph.edge_count(), 5);
    }

    #[test]
    fn test_dfa_constructor() {
        let config = RegExpConfig::new();
        let dfa = Dfa::from(
            &[
                GraphemeCluster::from("abcd", &RegExpConfig::new()),
                GraphemeCluster::from("abxd", &RegExpConfig::new()),
            ],
            true,
            &config,
        );
        assert_eq!(dfa.graph.node_count(), 5);
        assert_eq!(dfa.graph.edge_count(), 5);
    }
}

--------------------------------------------------------------------------------
/src/format.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | use crate::cluster::GraphemeCluster; 18 | use crate::component::Component; 19 | use crate::expression::Expression; 20 | use crate::quantifier::Quantifier; 21 | use itertools::Itertools; 22 | use std::collections::BTreeSet; 23 | use std::fmt::{Display, Formatter, Result}; 24 | use unic_char_range::CharRange; 25 | 26 | impl Display for Expression<'_> { 27 | fn fmt(&self, f: &mut Formatter<'_>) -> Result { 28 | match self { 29 | Expression::Alternation( 30 | options, 31 | is_capturing_group_enabled, 32 | is_output_colorized, 33 | is_verbose_mode_enabled, 34 | ) => format_alternation( 35 | f, 36 | self, 37 | options, 38 | *is_capturing_group_enabled, 39 | *is_output_colorized, 40 | *is_verbose_mode_enabled, 41 | ), 42 | Expression::CharacterClass(char_set, is_output_colorized) => { 43 | format_character_class(f, char_set, *is_output_colorized) 44 | } 45 | Expression::Concatenation( 46 | expr1, 47 | expr2, 48 | is_capturing_group_enabled, 49 | is_output_colorized, 50 | is_verbose_mode_enabled, 51 | ) => format_concatenation( 52 | f, 53 | self, 54 | expr1, 55 | expr2, 56 | *is_capturing_group_enabled, 57 | *is_output_colorized, 58 | *is_verbose_mode_enabled, 59 | ), 60 | Expression::Literal( 61 | cluster, 62 | is_non_ascii_char_escaped, 63 | is_astral_code_point_converted_to_surrogate, 64 | ) => format_literal( 65 | f, 66 | cluster, 67 | *is_non_ascii_char_escaped, 68 | *is_astral_code_point_converted_to_surrogate, 69 | ), 70 | Expression::Repetition( 71 | expr, 72 | quantifier, 73 | is_capturing_group_enabled, 74 | is_output_colorized, 75 | is_verbose_mode_enabled, 76 | ) => format_repetition( 77 | f, 78 | self, 79 | expr, 80 | quantifier, 81 | *is_capturing_group_enabled, 82 | *is_output_colorized, 83 | *is_verbose_mode_enabled, 84 | ), 85 | } 86 | } 87 | } 88 | 89 | fn get_codepoint_position(c: char) -> usize { 90 | CharRange::all().iter().position(|it| it == c).unwrap() 91 | } 92 | 93 | fn format_alternation( 94 | f: &mut Formatter<'_>, 95 | expr: 
&Expression, 96 | options: &[Expression], 97 | is_capturing_group_enabled: bool, 98 | is_output_colorized: bool, 99 | is_verbose_mode_enabled: bool, 100 | ) -> Result { 101 | let pipe_component = Component::Pipe.to_repr(is_output_colorized); 102 | let disjunction_operator = if is_verbose_mode_enabled { 103 | format!("\n{}\n", pipe_component) 104 | } else { 105 | pipe_component 106 | }; 107 | let alternation_str = options 108 | .iter() 109 | .map(|option| { 110 | if option.precedence() < expr.precedence() && !option.is_single_codepoint() { 111 | if is_capturing_group_enabled { 112 | Component::CapturedParenthesizedExpression( 113 | option.to_string(), 114 | is_verbose_mode_enabled, 115 | true, 116 | ) 117 | .to_repr(is_output_colorized) 118 | } else { 119 | Component::UncapturedParenthesizedExpression( 120 | option.to_string(), 121 | is_verbose_mode_enabled, 122 | true, 123 | ) 124 | .to_repr(is_output_colorized) 125 | } 126 | } else { 127 | format!("{}", option) 128 | } 129 | }) 130 | .join(&disjunction_operator); 131 | 132 | write!(f, "{}", alternation_str) 133 | } 134 | 135 | fn format_character_class( 136 | f: &mut Formatter<'_>, 137 | char_set: &BTreeSet, 138 | is_output_colorized: bool, 139 | ) -> Result { 140 | let chars_to_escape = ['[', ']', '\\', '-', '^', '$']; 141 | let escaped_char_set = char_set 142 | .iter() 143 | .map(|c| { 144 | if chars_to_escape.contains(c) { 145 | format!("{}{}", "\\", c) 146 | } else if c == &'\n' { 147 | "\\n".to_string() 148 | } else if c == &'\r' { 149 | "\\r".to_string() 150 | } else if c == &'\t' { 151 | "\\t".to_string() 152 | } else { 153 | c.to_string() 154 | } 155 | }) 156 | .collect_vec(); 157 | let char_positions = char_set 158 | .iter() 159 | .map(|&it| get_codepoint_position(it)) 160 | .collect_vec(); 161 | 162 | let mut subsets = vec![]; 163 | let mut subset = vec![]; 164 | 165 | for ((first_c, first_pos), (second_c, second_pos)) in 166 | escaped_char_set.iter().zip(char_positions).tuple_windows() 167 | { 168 | if 
subset.is_empty() { 169 | subset.push(first_c); 170 | } 171 | if second_pos == first_pos + 1 { 172 | subset.push(second_c); 173 | } else { 174 | subsets.push(subset); 175 | subset = vec![second_c]; 176 | } 177 | } 178 | 179 | subsets.push(subset); 180 | 181 | let mut char_class_strs = vec![]; 182 | 183 | for subset in subsets.iter() { 184 | if subset.len() <= 2 { 185 | for c in subset.iter() { 186 | char_class_strs.push((*c).to_string()); 187 | } 188 | } else { 189 | char_class_strs.push(format!( 190 | "{}{}{}", 191 | subset.first().unwrap(), 192 | Component::Hyphen.to_repr(is_output_colorized), 193 | subset.last().unwrap() 194 | )); 195 | } 196 | } 197 | 198 | write!( 199 | f, 200 | "{}{}{}", 201 | Component::LeftBracket.to_repr(is_output_colorized), 202 | char_class_strs.join(""), 203 | Component::RightBracket.to_repr(is_output_colorized) 204 | ) 205 | } 206 | 207 | fn format_concatenation( 208 | f: &mut Formatter<'_>, 209 | expr: &Expression, 210 | expr1: &Expression, 211 | expr2: &Expression, 212 | is_capturing_group_enabled: bool, 213 | is_output_colorized: bool, 214 | is_verbose_mode_enabled: bool, 215 | ) -> Result { 216 | let expr_strs = [expr1, expr2] 217 | .iter() 218 | .map(|&it| { 219 | if it.precedence() < expr.precedence() && !it.is_single_codepoint() { 220 | if is_capturing_group_enabled { 221 | Component::CapturedParenthesizedExpression( 222 | it.to_string(), 223 | is_verbose_mode_enabled, 224 | true, 225 | ) 226 | .to_repr(is_output_colorized) 227 | } else { 228 | Component::UncapturedParenthesizedExpression( 229 | it.to_string(), 230 | is_verbose_mode_enabled, 231 | true, 232 | ) 233 | .to_repr(is_output_colorized) 234 | } 235 | } else { 236 | format!("{}", it) 237 | } 238 | }) 239 | .collect_vec(); 240 | 241 | write!( 242 | f, 243 | "{}{}", 244 | expr_strs.first().unwrap(), 245 | expr_strs.last().unwrap() 246 | ) 247 | } 248 | 249 | fn format_literal( 250 | f: &mut Formatter<'_>, 251 | cluster: &GraphemeCluster, 252 | is_non_ascii_char_escaped: 
bool, 253 | is_astral_code_point_converted_to_surrogate: bool, 254 | ) -> Result { 255 | let literal_str = cluster 256 | .graphemes() 257 | .iter() 258 | .cloned() 259 | .map(|mut grapheme| { 260 | if grapheme.has_repetitions() { 261 | grapheme 262 | .repetitions_mut() 263 | .iter_mut() 264 | .for_each(|repeated_grapheme| { 265 | repeated_grapheme.escape_regexp_symbols( 266 | is_non_ascii_char_escaped, 267 | is_astral_code_point_converted_to_surrogate, 268 | ); 269 | }); 270 | } else { 271 | grapheme.escape_regexp_symbols( 272 | is_non_ascii_char_escaped, 273 | is_astral_code_point_converted_to_surrogate, 274 | ); 275 | } 276 | grapheme.to_string() 277 | }) 278 | .join(""); 279 | 280 | write!(f, "{}", literal_str) 281 | } 282 | 283 | fn format_repetition( 284 | f: &mut Formatter<'_>, 285 | expr: &Expression, 286 | expr1: &Expression, 287 | quantifier: &Quantifier, 288 | is_capturing_group_enabled: bool, 289 | is_output_colorized: bool, 290 | is_verbose_mode_enabled: bool, 291 | ) -> Result { 292 | if expr1.precedence() < expr.precedence() && !expr1.is_single_codepoint() { 293 | if is_capturing_group_enabled { 294 | write!( 295 | f, 296 | "{}{}", 297 | Component::CapturedParenthesizedExpression( 298 | expr1.to_string(), 299 | is_verbose_mode_enabled, 300 | false 301 | ) 302 | .to_repr(is_output_colorized), 303 | Component::Quantifier(quantifier.clone(), is_verbose_mode_enabled) 304 | .to_repr(is_output_colorized) 305 | ) 306 | } else { 307 | write!( 308 | f, 309 | "{}{}", 310 | Component::UncapturedParenthesizedExpression( 311 | expr1.to_string(), 312 | is_verbose_mode_enabled, 313 | false 314 | ) 315 | .to_repr(is_output_colorized), 316 | Component::Quantifier(quantifier.clone(), is_verbose_mode_enabled) 317 | .to_repr(is_output_colorized) 318 | ) 319 | } 320 | } else { 321 | write!( 322 | f, 323 | "{}{}", 324 | expr1, 325 | Component::Quantifier(quantifier.clone(), is_verbose_mode_enabled) 326 | .to_repr(is_output_colorized) 327 | ) 328 | } 329 | } 330 | 
-------------------------------------------------------------------------------- /src/grapheme.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | use crate::component::Component; 18 | use itertools::Itertools; 19 | use std::fmt::{Display, Formatter, Result}; 20 | 21 | const CHARS_TO_ESCAPE: [&str; 14] = [ 22 | "(", ")", "[", "]", "{", "}", "+", "*", "-", ".", "?", "|", "^", "$", 23 | ]; 24 | 25 | const CHAR_CLASSES: [&str; 6] = ["\\d", "\\s", "\\w", "\\D", "\\S", "\\W"]; 26 | 27 | #[derive(Clone, Debug, Hash, Ord, PartialOrd, Eq, PartialEq)] 28 | pub struct Grapheme { 29 | pub(crate) chars: Vec, 30 | pub(crate) repetitions: Vec, 31 | min: u32, 32 | max: u32, 33 | is_capturing_group_enabled: bool, 34 | is_output_colorized: bool, 35 | is_verbose_mode_enabled: bool, 36 | } 37 | 38 | impl Grapheme { 39 | pub(crate) fn from( 40 | s: &str, 41 | is_capturing_group_enabled: bool, 42 | is_output_colorized: bool, 43 | is_verbose_mode_enabled: bool, 44 | ) -> Self { 45 | Self { 46 | chars: vec![s.to_string()], 47 | repetitions: vec![], 48 | min: 1, 49 | max: 1, 50 | is_capturing_group_enabled, 51 | is_output_colorized, 52 | is_verbose_mode_enabled, 53 | } 54 | } 55 | 56 | pub(crate) fn new( 57 | chars: Vec, 58 | min: u32, 59 | max: u32, 60 | is_capturing_group_enabled: 
bool, 61 | is_output_colorized: bool, 62 | is_verbose_mode_enabled: bool, 63 | ) -> Self { 64 | Self { 65 | chars, 66 | repetitions: vec![], 67 | min, 68 | max, 69 | is_capturing_group_enabled, 70 | is_output_colorized, 71 | is_verbose_mode_enabled, 72 | } 73 | } 74 | 75 | pub(crate) fn value(&self) -> String { 76 | self.chars.join("") 77 | } 78 | 79 | pub(crate) fn chars(&self) -> &Vec { 80 | &self.chars 81 | } 82 | 83 | pub(crate) fn chars_mut(&mut self) -> &mut Vec { 84 | &mut self.chars 85 | } 86 | 87 | pub(crate) fn has_repetitions(&self) -> bool { 88 | !self.repetitions.is_empty() 89 | } 90 | 91 | pub(crate) fn repetitions_mut(&mut self) -> &mut Vec { 92 | &mut self.repetitions 93 | } 94 | 95 | pub(crate) fn minimum(&self) -> u32 { 96 | self.min 97 | } 98 | 99 | pub(crate) fn maximum(&self) -> u32 { 100 | self.max 101 | } 102 | 103 | pub(crate) fn char_count(&self, is_non_ascii_char_escaped: bool) -> usize { 104 | if is_non_ascii_char_escaped { 105 | self.chars 106 | .iter() 107 | .map(|it| it.chars().map(|c| self.escape(c, false)).join("")) 108 | .join("") 109 | .chars() 110 | .count() 111 | } else { 112 | self.chars.iter().map(|it| it.chars().count()).sum() 113 | } 114 | } 115 | 116 | pub(crate) fn escape_non_ascii_chars(&mut self, use_surrogate_pairs: bool) { 117 | self.chars = self 118 | .chars 119 | .iter() 120 | .map(|it| { 121 | it.chars() 122 | .map(|c| self.escape(c, use_surrogate_pairs)) 123 | .join("") 124 | }) 125 | .collect_vec(); 126 | } 127 | 128 | pub(crate) fn escape_regexp_symbols( 129 | &mut self, 130 | is_non_ascii_char_escaped: bool, 131 | is_astral_code_point_converted_to_surrogate: bool, 132 | ) { 133 | let characters = self.chars_mut(); 134 | 135 | #[allow(clippy::needless_range_loop)] 136 | for i in 0..characters.len() { 137 | let mut character = characters[i].clone(); 138 | 139 | for char_to_escape in CHARS_TO_ESCAPE.iter() { 140 | character = 141 | character.replace(char_to_escape, &format!("{}{}", "\\", char_to_escape)); 142 | } 
143 | 144 | character = character 145 | .replace('\n', "\\n") 146 | .replace('\r', "\\r") 147 | .replace('\t', "\\t"); 148 | 149 | if character == "\\" { 150 | character = "\\\\".to_string(); 151 | } 152 | 153 | characters[i] = character; 154 | } 155 | 156 | if is_non_ascii_char_escaped { 157 | self.escape_non_ascii_chars(is_astral_code_point_converted_to_surrogate); 158 | } 159 | } 160 | 161 | fn escape(&self, c: char, use_surrogate_pairs: bool) -> String { 162 | if c.is_ascii() { 163 | c.to_string() 164 | } else if use_surrogate_pairs && ('\u{10000}'..'\u{10ffff}').contains(&c) { 165 | self.convert_to_surrogate_pair(c) 166 | } else { 167 | c.escape_unicode().to_string() 168 | } 169 | } 170 | 171 | fn convert_to_surrogate_pair(&self, c: char) -> String { 172 | c.encode_utf16(&mut [0; 2]) 173 | .iter() 174 | .map(|it| format!("\\u{{{:x}}}", it)) 175 | .join("") 176 | } 177 | } 178 | 179 | impl Display for Grapheme { 180 | fn fmt(&self, f: &mut Formatter<'_>) -> Result { 181 | let is_single_char = self.char_count(false) == 1 182 | || (self.chars.len() == 1 && self.chars[0].matches('\\').count() == 1); 183 | let is_range = self.min < self.max; 184 | let is_repetition = self.min > 1; 185 | let mut value = if self.repetitions.is_empty() { 186 | self.value() 187 | } else { 188 | self.repetitions.iter().map(|it| it.to_string()).join("") 189 | }; 190 | value = Component::CharClass(value.clone()) 191 | .to_repr(self.is_output_colorized && CHAR_CLASSES.contains(&&*value)); 192 | 193 | if !is_range && is_repetition && is_single_char { 194 | write!( 195 | f, 196 | "{}{}", 197 | value, 198 | Component::Repetition(self.min, false).to_repr(self.is_output_colorized) 199 | ) 200 | } else if !is_range && is_repetition && !is_single_char { 201 | write!( 202 | f, 203 | "{}{}", 204 | if self.is_capturing_group_enabled { 205 | Component::CapturedParenthesizedExpression( 206 | value, 207 | self.is_verbose_mode_enabled, 208 | false, 209 | ) 210 | .to_repr(self.is_output_colorized) 211 | 
} else { 212 | Component::UncapturedParenthesizedExpression( 213 | value, 214 | self.is_verbose_mode_enabled, 215 | false, 216 | ) 217 | .to_repr(self.is_output_colorized) 218 | }, 219 | Component::Repetition(self.min, self.is_verbose_mode_enabled) 220 | .to_repr(self.is_output_colorized) 221 | ) 222 | } else if is_range && is_single_char { 223 | write!( 224 | f, 225 | "{}{}", 226 | value, 227 | Component::RepetitionRange(self.min, self.max, false) 228 | .to_repr(self.is_output_colorized) 229 | ) 230 | } else if is_range && !is_single_char { 231 | write!( 232 | f, 233 | "{}{}", 234 | if self.is_capturing_group_enabled { 235 | Component::CapturedParenthesizedExpression( 236 | value, 237 | self.is_verbose_mode_enabled, 238 | false, 239 | ) 240 | .to_repr(self.is_output_colorized) 241 | } else { 242 | Component::UncapturedParenthesizedExpression( 243 | value, 244 | self.is_verbose_mode_enabled, 245 | false, 246 | ) 247 | .to_repr(self.is_output_colorized) 248 | }, 249 | Component::RepetitionRange(self.min, self.max, self.is_verbose_mode_enabled) 250 | .to_repr(self.is_output_colorized) 251 | ) 252 | } else { 253 | write!(f, "{}", value) 254 | } 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | //! ## 1. What does this tool do? 18 | //! 19 | //! *grex* is a library as well as a command-line utility that is meant to simplify the often 20 | //! complicated and tedious task of creating regular expressions. It does so by automatically 21 | //! generating a single regular expression from user-provided test cases. The resulting 22 | //! expression is guaranteed to match the test cases which it was generated from. 23 | //! 24 | //! This project has started as a Rust port of the JavaScript tool 25 | //! [*regexgen*](https://github.com/devongovett/regexgen) written by 26 | //! [Devon Govett](https://github.com/devongovett). Although a lot of further useful features 27 | //! could be added to it, its development was apparently ceased several years ago. The plan 28 | //! is now to add these new features to *grex* as Rust really shines when it comes to 29 | //! command-line tools. *grex* offers all features that *regexgen* provides, and more. 30 | //! 31 | //! The philosophy of this project is to generate the most specific regular expression 32 | //! possible by default which exactly matches the given input only and nothing else. 33 | //! With the use of command-line flags (in the CLI tool) or preprocessing methods 34 | //! (in the library), more generalized expressions can be created. 35 | //! 36 | //! The produced expressions are [Perl-compatible regular expressions](https://www.pcre.org) 37 | //! which are also compatible with the regular expression parser in Rust's 38 | //! [*regex crate*](https://crates.io/crates/regex). 39 | //! Other regular expression parsers or respective libraries from other programming languages 40 | //! have not been tested so far, but they ought to be mostly compatible as well. 41 | //! 42 | //! ## 2. Do I still need to learn to write regexes then? 43 | //! 44 | //! 
**Definitely, yes!** Using the standard settings, *grex* produces a regular expression that 45 | //! is guaranteed to match only the test cases given as input and nothing else. This has been 46 | //! verified by [property tests](https://github.com/pemistahl/grex/blob/main/tests/property_tests.rs). 47 | //! However, if the conversion to shorthand character classes such as `\w` is enabled, the 48 | //! resulting regex matches a much wider scope of test cases. Knowledge about the consequences of 49 | //! this conversion is essential for finding a correct regular expression for your business domain. 50 | //! 51 | //! *grex* uses an algorithm that tries to find the shortest possible regex for the given test cases. 52 | //! Very often though, the resulting expression is still longer or more complex than it needs to be. 53 | //! In such cases, a more compact or elegant regex can be created only by hand. 54 | //! Also, every regular expression engine has different built-in optimizations. 55 | //! *grex* does not know anything about those and therefore cannot optimize its regexes 56 | //! for a specific engine. 57 | //! 58 | //! **So, please learn how to write regular expressions!** The currently best use case for *grex* 59 | //! is to find an initial correct regex which should be inspected by hand if further optimizations 60 | //! are possible. 61 | //! 62 | //! ## 3. Current features 63 | //! 64 | //! - literals 65 | //! - character classes 66 | //! - detection of common prefixes and suffixes 67 | //! - detection of repeated substrings and conversion to `{min,max}` quantifier notation 68 | //! - alternation using `|` operator 69 | //! - optionality using `?` quantifier 70 | //! - escaping of non-ascii characters, with optional conversion of astral code points to surrogate pairs 71 | //! - case-sensitive or case-insensitive matching 72 | //! - capturing or non-capturing groups 73 | //! - optional anchors `^` and `$` 74 | //! 
- fully compliant with [Unicode Standard 15.0](https://unicode.org/versions/Unicode15.0.0) 75 | //! - fully compatible with [*regex* crate 1.9.0+](https://crates.io/crates/regex) 76 | //! - correctly handles graphemes consisting of multiple Unicode symbols 77 | //! - reads input strings from the command-line or from a file 78 | //! - produces more readable expressions indented on multiple lines using optional verbose mode 79 | //! 80 | //! ## 4. How to use? 81 | //! 82 | //! The code snippets below show how to use the public API. 83 | //! 84 | //! For [more detailed examples](https://github.com/pemistahl/grex/tree/main#53-examples), please 85 | //! take a look at the project's readme file on GitHub. 86 | //! 87 | //! ### 4.1 Default settings 88 | //! 89 | //! Test cases are passed either from a collection via [`RegExpBuilder::from()`] 90 | //! or from a file via [`RegExpBuilder::from_file()`]. 91 | //! 92 | //! ``` 93 | //! use grex::RegExpBuilder; 94 | //! 95 | //! let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]).build(); 96 | //! assert_eq!(regexp, "^a(?:aa?)?$"); 97 | //! ``` 98 | //! 99 | //! ### 4.2 Convert to character classes 100 | //! 101 | //! ``` 102 | //! use grex::RegExpBuilder; 103 | //! 104 | //! let regexp = RegExpBuilder::from(&["a", "aa", "123"]) 105 | //! .with_conversion_of_digits() 106 | //! .with_conversion_of_words() 107 | //! .build(); 108 | //! assert_eq!(regexp, "^(?:\\d\\d\\d|\\w(?:\\w)?)$"); 109 | //! ``` 110 | //! 111 | //! ### 4.3 Convert repeated substrings 112 | //! 113 | //! ``` 114 | //! use grex::RegExpBuilder; 115 | //! 116 | //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) 117 | //! .with_conversion_of_repetitions() 118 | //! .build(); 119 | //! assert_eq!(regexp, "^(?:a{2}|(?:bc){2}|(?:def){3})$"); 120 | //! ``` 121 | //! 122 | //! By default, *grex* converts each substring this way which is at least a single character long 123 | //! and which is subsequently repeated at least once.
You can customize these two parameters 124 | //! if you like. 125 | //! 126 | //! In the following example, the test case `aa` is not converted to `a{2}` because the repeated 127 | //! substring `a` has a length of 1, but the minimum substring length has been set to 2. 128 | //! 129 | //! ``` 130 | //! use grex::RegExpBuilder; 131 | //! 132 | //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) 133 | //! .with_conversion_of_repetitions() 134 | //! .with_minimum_substring_length(2) 135 | //! .build(); 136 | //! assert_eq!(regexp, "^(?:aa|(?:bc){2}|(?:def){3})$"); 137 | //! ``` 138 | //! 139 | //! Setting a minimum number of 2 repetitions in the next example, only the test case `defdefdef` 140 | //! will be converted because it is the only one that is repeated twice. 141 | //! 142 | //! ``` 143 | //! use grex::RegExpBuilder; 144 | //! 145 | //! let regexp = RegExpBuilder::from(&["aa", "bcbc", "defdefdef"]) 146 | //! .with_conversion_of_repetitions() 147 | //! .with_minimum_repetitions(2) 148 | //! .build(); 149 | //! assert_eq!(regexp, "^(?:bcbc|aa|(?:def){3})$"); 150 | //! ``` 151 | //! 152 | //! ### 4.4 Escape non-ascii characters 153 | //! 154 | //! ``` 155 | //! use grex::RegExpBuilder; 156 | //! 157 | //! let regexp = RegExpBuilder::from(&["You smell like 💩."]) 158 | //! .with_escaping_of_non_ascii_chars(false) 159 | //! .build(); 160 | //! assert_eq!(regexp, "^You smell like \\u{1f4a9}\\.$"); 161 | //! ``` 162 | //! 163 | //! Old versions of JavaScript do not support unicode escape sequences for 164 | //! the astral code planes (range `U+010000` to `U+10FFFF`). In order to 165 | //! support these symbols in JavaScript regular expressions, the conversion 166 | //! to surrogate pairs is necessary. More information on that matter can be 167 | //! found [here](https://mathiasbynens.be/notes/javascript-unicode). 168 | //! 169 | //! ``` 170 | //! use grex::RegExpBuilder; 171 | //! 172 | //! 
let regexp = RegExpBuilder::from(&["You smell like 💩."]) 173 | //! .with_escaping_of_non_ascii_chars(true) 174 | //! .build(); 175 | //! assert_eq!(regexp, "^You smell like \\u{d83d}\\u{dca9}\\.$"); 176 | //! ``` 177 | //! 178 | //! ### 4.5 Case-insensitive matching 179 | //! 180 | //! The regular expressions that *grex* generates are case-sensitive by default. 181 | //! Case-insensitive matching can be enabled like so: 182 | //! 183 | //! ``` 184 | //! use grex::RegExpBuilder; 185 | //! 186 | //! let regexp = RegExpBuilder::from(&["big", "BIGGER"]) 187 | //! .with_case_insensitive_matching() 188 | //! .build(); 189 | //! assert_eq!(regexp, "(?i)^big(?:ger)?$"); 190 | //! ``` 191 | //! 192 | //! ### 4.6 Capturing Groups 193 | //! 194 | //! Non-capturing groups are used by default. 195 | //! Extending the previous example, you can switch to capturing groups instead. 196 | //! 197 | //! ``` 198 | //! use grex::RegExpBuilder; 199 | //! 200 | //! let regexp = RegExpBuilder::from(&["big", "BIGGER"]) 201 | //! .with_case_insensitive_matching() 202 | //! .with_capturing_groups() 203 | //! .build(); 204 | //! assert_eq!(regexp, "(?i)^big(ger)?$"); 205 | //! ``` 206 | //! 207 | //! ### 4.7 Verbose mode 208 | //! 209 | //! If you find the generated regular expression hard to read, you can enable verbose mode. 210 | //! The expression is then put on multiple lines and indented to make it more pleasant to the eyes. 211 | //! 212 | //! ``` 213 | //! use grex::RegExpBuilder; 214 | //! use indoc::indoc; 215 | //! 216 | //! let regexp = RegExpBuilder::from(&["a", "b", "bcd"]) 217 | //! .with_verbose_mode() 218 | //! .build(); 219 | //! 220 | //! assert_eq!(regexp, indoc!( 221 | //! r#" 222 | //! (?x) 223 | //! ^ 224 | //! (?: 225 | //! b 226 | //! (?: 227 | //! cd 228 | //! )? 229 | //! | 230 | //! a 231 | //! ) 232 | //! $"# 233 | //! )); 234 | //! ``` 235 | //! 236 | //! ### 4.8 Disable anchors 237 | //! 238 | //! 
By default, the anchors `^` and `$` are put around every generated regular expression in order 239 | //! to ensure that it matches only the test cases given as input. Often enough, however, it is 240 | //! desired to use the generated pattern as part of a larger one. For this purpose, the anchors 241 | //! can be disabled, either separately or both of them. 242 | //! 243 | //! ``` 244 | //! use grex::RegExpBuilder; 245 | //! 246 | //! let regexp = RegExpBuilder::from(&["a", "aa", "aaa"]) 247 | //! .without_anchors() 248 | //! .build(); 249 | //! assert_eq!(regexp, "a(?:aa?)?"); 250 | //! ``` 251 | //! 252 | //! ### 5. How does it work? 253 | //! 254 | //! 1. A [deterministic finite automaton](https://en.wikipedia.org/wiki/Deterministic_finite_automaton) (DFA) 255 | //! is created from the input strings. 256 | //! 257 | //! 2. The number of states and transitions between states in the DFA is reduced by applying 258 | //! [Hopcroft's DFA minimization algorithm](https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft.27s_algorithm). 259 | //! 260 | //! 3. The minimized DFA is expressed as a system of linear equations which are solved with 261 | //! [Brzozowski's algebraic method](http://cs.stackexchange.com/questions/2016/how-to-convert-finite-automata-to-regular-expressions#2392), 262 | //! resulting in the final regular expression. 
/// Builds a `std::collections::BTreeSet` from a comma-separated list of expressions.
///
/// `btreeset![]` yields an empty set; duplicate values collapse into one entry.
macro_rules! btreeset {
    ( $( $value: expr ),* ) => {{
        std::collections::BTreeSet::from([ $( $value ),* ])
    }};
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | use crate::builder::{ 18 | RegExpBuilder, MINIMUM_REPETITIONS_MESSAGE, MINIMUM_SUBSTRING_LENGTH_MESSAGE, 19 | MISSING_TEST_CASES_MESSAGE, 20 | }; 21 | use crate::config::RegExpConfig; 22 | use lazy_static::lazy_static; 23 | use pyo3::exceptions::PyValueError; 24 | use pyo3::prelude::*; 25 | use pyo3::types::PyType; 26 | use regex::{Captures, Regex}; 27 | 28 | #[pymodule] 29 | fn grex(m: &Bound<'_, PyModule>) -> PyResult<()> { 30 | m.add_class::()?; 31 | Ok(()) 32 | } 33 | 34 | #[pymethods] 35 | impl RegExpBuilder { 36 | #[new] 37 | fn new(test_cases: Vec) -> PyResult { 38 | if test_cases.is_empty() { 39 | Err(PyValueError::new_err(MISSING_TEST_CASES_MESSAGE)) 40 | } else { 41 | Ok(Self { 42 | test_cases, 43 | config: RegExpConfig::new(), 44 | }) 45 | } 46 | } 47 | 48 | /// Specify the test cases to build the regular expression from. 49 | /// 50 | /// The test cases need not be sorted because `RegExpBuilder` sorts them internally. 51 | /// 52 | /// Args: 53 | /// test_cases (list[str]): The list of test cases 54 | /// 55 | /// Raises: 56 | /// ValueError: if `test_cases` is empty 57 | #[classmethod] 58 | fn from_test_cases(_cls: &Bound, test_cases: Vec) -> PyResult { 59 | Self::new(test_cases) 60 | } 61 | 62 | /// Convert any Unicode decimal digit to character class `\d`. 63 | /// 64 | /// This method takes precedence over `with_conversion_of_words` if both are set. 65 | /// Decimal digits are converted to `\d`, the remaining word characters to `\w`. 
66 | /// 67 | /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 68 | /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`. 69 | #[pyo3(name = "with_conversion_of_digits")] 70 | fn py_with_conversion_of_digits(mut self_: PyRefMut) -> PyRefMut { 71 | self_.config.is_digit_converted = true; 72 | self_ 73 | } 74 | 75 | /// Convert any character which is not a Unicode decimal digit to character class `\D`. 76 | /// 77 | /// This method takes precedence over `with_conversion_of_non_words` if both are set. 78 | /// Non-digits which are also non-word characters are converted to `\D`. 79 | /// 80 | /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 81 | /// Non-digits which are also non-space characters are converted to `\D`. 82 | #[pyo3(name = "with_conversion_of_non_digits")] 83 | fn py_with_conversion_of_non_digits(mut self_: PyRefMut) -> PyRefMut { 84 | self_.config.is_non_digit_converted = true; 85 | self_ 86 | } 87 | 88 | /// Convert any Unicode whitespace character to character class `\s`. 89 | /// 90 | /// This method takes precedence over `with_conversion_of_non_digits` if both are set. 91 | /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`. 92 | /// 93 | /// This method takes precedence over `with_conversion_of_non_words` if both are set. 94 | /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`. 95 | #[pyo3(name = "with_conversion_of_whitespace")] 96 | fn py_with_conversion_of_whitespace(mut self_: PyRefMut) -> PyRefMut { 97 | self_.config.is_space_converted = true; 98 | self_ 99 | } 100 | 101 | /// Convert any character which is not a Unicode whitespace character to character class `\S`. 
102 | #[pyo3(name = "with_conversion_of_non_whitespace")] 103 | fn py_with_conversion_of_non_whitespace(mut self_: PyRefMut) -> PyRefMut { 104 | self_.config.is_non_space_converted = true; 105 | self_ 106 | } 107 | 108 | /// Convert any Unicode word character to character class `\w`. 109 | /// 110 | /// This method takes precedence over `with_conversion_of_non_digits` if both are set. 111 | /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`. 112 | /// 113 | /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 114 | /// Word characters are converted to `\w`, the remaining non-space characters to `\S`. 115 | #[pyo3(name = "with_conversion_of_words")] 116 | fn py_with_conversion_of_words(mut self_: PyRefMut) -> PyRefMut { 117 | self_.config.is_word_converted = true; 118 | self_ 119 | } 120 | 121 | /// Convert any character which is not a Unicode word character to character class `\W`. 122 | /// 123 | /// This method takes precedence over `with_conversion_of_non_whitespace` if both are set. 124 | /// Non-words which are also non-space characters are converted to `\W`. 125 | #[pyo3(name = "with_conversion_of_non_words")] 126 | fn py_with_conversion_of_non_words(mut self_: PyRefMut) -> PyRefMut { 127 | self_.config.is_non_word_converted = true; 128 | self_ 129 | } 130 | 131 | /// Detect repeated non-overlapping substrings and convert them to `{min,max}` quantifier notation. 132 | #[pyo3(name = "with_conversion_of_repetitions")] 133 | fn py_with_conversion_of_repetitions(mut self_: PyRefMut) -> PyRefMut { 134 | self_.config.is_repetition_converted = true; 135 | self_ 136 | } 137 | 138 | /// Enable case-insensitive matching of test cases so that letters match both upper and lower case. 
139 | #[pyo3(name = "with_case_insensitive_matching")] 140 | fn py_with_case_insensitive_matching(mut self_: PyRefMut) -> PyRefMut { 141 | self_.config.is_case_insensitive_matching = true; 142 | self_ 143 | } 144 | 145 | /// Replace non-capturing groups by capturing ones. 146 | #[pyo3(name = "with_capturing_groups")] 147 | fn py_with_capturing_groups(mut self_: PyRefMut) -> PyRefMut { 148 | self_.config.is_capturing_group_enabled = true; 149 | self_ 150 | } 151 | 152 | /// Specify the minimum quantity of substring repetitions to be converted if `with_conversion_of_repetitions` is set. 153 | /// 154 | /// If the quantity is not explicitly set with this method, a default value of 1 will be used. 155 | /// 156 | /// Args: 157 | /// quantity (int): The minimum quantity of substring repetitions 158 | /// 159 | /// Raises: 160 | /// ValueError: if `quantity` is zero 161 | #[pyo3(name = "with_minimum_repetitions")] 162 | fn py_with_minimum_repetitions( 163 | mut self_: PyRefMut, 164 | quantity: i32, 165 | ) -> PyResult> { 166 | if quantity <= 0 { 167 | Err(PyValueError::new_err(MINIMUM_REPETITIONS_MESSAGE)) 168 | } else { 169 | self_.config.minimum_repetitions = quantity as u32; 170 | Ok(self_) 171 | } 172 | } 173 | 174 | /// Specify the minimum length a repeated substring must have in order to be converted if `with_conversion_of_repetitions` is set. 175 | /// 176 | /// If the length is not explicitly set with this method, a default value of 1 will be used. 
177 | /// 178 | /// Args: 179 | /// length (int): The minimum substring length 180 | /// 181 | /// Raises: 182 | /// ValueError: if `length` is zero 183 | #[pyo3(name = "with_minimum_substring_length")] 184 | fn py_with_minimum_substring_length( 185 | mut self_: PyRefMut, 186 | length: i32, 187 | ) -> PyResult> { 188 | if length <= 0 { 189 | Err(PyValueError::new_err(MINIMUM_SUBSTRING_LENGTH_MESSAGE)) 190 | } else { 191 | self_.config.minimum_substring_length = length as u32; 192 | Ok(self_) 193 | } 194 | } 195 | 196 | /// Convert non-ASCII characters to unicode escape sequences. 197 | /// 198 | /// The parameter `use_surrogate_pairs` specifies whether to convert astral code planes 199 | /// (range `U+010000` to `U+10FFFF`) to surrogate pairs. 200 | /// 201 | /// Args: 202 | /// use_surrogate_pairs (bool): Whether to convert astral code planes to surrogate pairs 203 | #[pyo3(name = "with_escaping_of_non_ascii_chars")] 204 | fn py_with_escaping_of_non_ascii_chars( 205 | mut self_: PyRefMut, 206 | use_surrogate_pairs: bool, 207 | ) -> PyRefMut { 208 | self_.config.is_non_ascii_char_escaped = true; 209 | self_.config.is_astral_code_point_converted_to_surrogate = use_surrogate_pairs; 210 | self_ 211 | } 212 | 213 | /// Produce a nicer looking regular expression in verbose mode. 214 | #[pyo3(name = "with_verbose_mode")] 215 | fn py_with_verbose_mode(mut self_: PyRefMut) -> PyRefMut { 216 | self_.config.is_verbose_mode_enabled = true; 217 | self_ 218 | } 219 | 220 | /// Remove the caret anchor '^' from the resulting regular expression, thereby allowing to 221 | /// match the test cases also when they do not occur at the start of a string. 
222 | #[pyo3(name = "without_start_anchor")] 223 | fn py_without_start_anchor(mut self_: PyRefMut) -> PyRefMut { 224 | self_.config.is_start_anchor_disabled = true; 225 | self_ 226 | } 227 | 228 | /// Remove the dollar sign anchor '$' from the resulting regular expression, thereby allowing 229 | /// to match the test cases also when they do not occur at the end of a string. 230 | #[pyo3(name = "without_end_anchor")] 231 | fn py_without_end_anchor(mut self_: PyRefMut) -> PyRefMut { 232 | self_.config.is_end_anchor_disabled = true; 233 | self_ 234 | } 235 | 236 | /// Remove the caret and dollar sign anchors from the resulting regular expression, thereby 237 | /// allowing to match the test cases also when they occur within a larger string that contains 238 | /// other content as well. 239 | #[pyo3(name = "without_anchors")] 240 | fn py_without_anchors(mut self_: PyRefMut) -> PyRefMut { 241 | self_.config.is_start_anchor_disabled = true; 242 | self_.config.is_end_anchor_disabled = true; 243 | self_ 244 | } 245 | 246 | /// Build the actual regular expression using the previously given settings. 247 | #[pyo3(name = "build")] 248 | fn py_build(&mut self) -> String { 249 | let regexp = self.build(); 250 | if self.config.is_non_ascii_char_escaped { 251 | replace_unicode_escape_sequences(regexp) 252 | } else { 253 | regexp 254 | } 255 | } 256 | } 257 | 258 | /// Replaces Rust Unicode escape sequences to Python Unicode escape sequences. 259 | fn replace_unicode_escape_sequences(regexp: String) -> String { 260 | lazy_static! 
{ 261 | static ref FOUR_CHARS_ESCAPE_SEQUENCE: Regex = Regex::new(r"\\u\{([0-9a-f]{4})\}").unwrap(); 262 | static ref FIVE_CHARS_ESCAPE_SEQUENCE: Regex = Regex::new(r"\\u\{([0-9a-f]{5})\}").unwrap(); 263 | } 264 | let mut replacement = FOUR_CHARS_ESCAPE_SEQUENCE 265 | .replace_all(®exp, |caps: &Captures| format!("\\u{}", &caps[1])) 266 | .to_string(); 267 | 268 | replacement = FIVE_CHARS_ESCAPE_SEQUENCE 269 | .replace_all(&replacement, |caps: &Captures| { 270 | format!("\\U000{}", &caps[1]) 271 | }) 272 | .to_string(); 273 | 274 | replacement 275 | } 276 | -------------------------------------------------------------------------------- /src/quantifier.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/// Quantifier symbols that can follow a sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Quantifier {
    /// Rendered as `*`.
    KleeneStar,
    /// Rendered as `?`.
    QuestionMark,
}

impl Display for Quantifier {
    /// Writes the single-character regex symbol of the quantifier.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let symbol = match self {
            Quantifier::KleeneStar => "*",
            Quantifier::QuestionMark => "?",
        };
        f.write_str(symbol)
    }
}
/// A regular expression built from a set of test cases, kept as an expression tree
/// (`ast`) together with the configuration that controls how it is rendered.
pub struct RegExp<'a> {
    ast: Expression<'a>,
    config: &'a RegExpConfig,
}

impl<'a> RegExp<'a> {
    /// Builds the expression tree for `test_cases` under `config`.
    ///
    /// The test cases are normalized in place (optionally lowercased, then sorted and
    /// deduplicated), turned into grapheme clusters, and converted to an expression
    /// via a DFA. When both anchors are disabled, the result is additionally checked
    /// against the test cases and replaced by progressively simpler fallbacks if the
    /// check fails.
    pub(crate) fn from(test_cases: &'a mut Vec<String>, config: &'a RegExpConfig) -> Self {
        if config.is_case_insensitive_matching {
            Self::convert_for_case_insensitive_matching(test_cases);
        }
        Self::sort(test_cases);
        let grapheme_clusters = Self::grapheme_clusters(test_cases, config);
        // NOTE(review): the boolean presumably toggles DFA minimization — confirm in dfa.rs.
        let mut dfa = Dfa::from(&grapheme_clusters, true, config);
        let mut ast = Expression::from(dfa, config);

        // The verification below only runs for fully unanchored expressions.
        if config.is_start_anchor_disabled && config.is_end_anchor_disabled {
            let mut regex = Self::convert_expr_to_regex(&ast, config);

            if config.is_verbose_mode_enabled {
                // Remove line breaks before checking matches, otherwise check will be incorrect.
                regex = Regex::new(&regex.to_string().replace('\n', "")).unwrap();
            }

            if !Self::is_each_test_case_matched_after_rotating_alternations(
                &regex, &mut ast, test_cases,
            ) {
                // First fallback: rebuild the DFA with the flag turned off.
                dfa = Dfa::from(&grapheme_clusters, false, config);
                ast = Expression::from(dfa, config);
                regex = Self::convert_expr_to_regex(&ast, config);

                // Last resort: a plain alternation of one literal per test case.
                if !Self::regex_matches_all_test_cases(&regex, test_cases) {
                    let mut exprs = vec![];
                    for cluster in grapheme_clusters {
                        let literal = Expression::new_literal(cluster, config);
                        exprs.push(literal);
                    }
                    ast = Expression::new_alternation(exprs, config);
                }
            }
        }

        Self { ast, config }
    }

    /// Lowercases the test cases in place where that is safe.
    fn convert_for_case_insensitive_matching(test_cases: &mut Vec<String>) {
        // Convert only those test cases to lowercase if
        // they keep their original number of characters.
        // Otherwise, "İ" -> "i\u{307}" would not match "İ".
        *test_cases = test_cases
            .iter()
            .map(|it| {
                let lower_test_case = it.to_lowercase();
                if lower_test_case.chars().count() == it.chars().count() {
                    lower_test_case
                } else {
                    it.to_string()
                }
            })
            .collect_vec();
    }

    /// Compiles `expr` into a `Regex`, stripping ANSI color sequences first when the
    /// output is colorized. Panics if the rendered expression is not a valid regex.
    fn convert_expr_to_regex(expr: &Expression, config: &RegExpConfig) -> Regex {
        if config.is_output_colorized {
            // Matches escape sequences of the form ESC[<n>;<m>m and ESC[0m.
            let color_replace_regex = Regex::new("\u{1b}\\[(?:\\d+;\\d+|0)m").unwrap();
            Regex::new(&color_replace_regex.replace_all(&expr.to_string(), "")).unwrap()
        } else {
            Regex::new(&expr.to_string()).unwrap()
        }
    }

    /// Returns `true` if `regex` finds exactly one match in every test case.
    fn regex_matches_all_test_cases(regex: &Regex, test_cases: &[String]) -> bool {
        test_cases
            .iter()
            .all(|test_case| regex.find_iter(test_case).count() == 1)
    }

    /// Deduplicates the test cases and orders them by byte length, ties broken
    /// lexicographically.
    fn sort(test_cases: &mut Vec<String>) {
        // `dedup` only removes adjacent duplicates, hence the plain sort first.
        test_cases.sort();
        test_cases.dedup();
        test_cases.sort_by(|a, b| match a.len().cmp(&b.len()) {
            Ordering::Equal => a.cmp(b),
            other => other,
        });
    }

    /// Converts every test case into a grapheme cluster and applies the character
    /// class and repetition conversions that are enabled in `config`.
    fn grapheme_clusters(
        test_cases: &'a [String],
        config: &'a RegExpConfig,
    ) -> Vec<GraphemeCluster<'a>> {
        let mut clusters = test_cases
            .iter()
            .map(|it| GraphemeCluster::from(it, config))
            .collect_vec();

        if config.is_char_class_feature_enabled() {
            for cluster in clusters.iter_mut() {
                cluster.convert_to_char_classes();
            }
        }

        if config.is_repetition_converted {
            for cluster in clusters.iter_mut() {
                cluster.convert_repetitions();
            }
        }

        clusters
    }

    /// Checks whether `regex` matches all test cases, rotating the alternation
    /// options of `expr` by one position after every failed check.
    ///
    /// Only a top-level alternation, or an alternation that is the first or second
    /// operand of a top-level concatenation, is rotated.
    ///
    /// NOTE(review): `regex` is never recompiled from the rotated `expr`, so every
    /// iteration tests the same pattern; the rotation only affects the `expr` the
    /// caller keeps. Confirm whether that is intended.
    /// NOTE(review): the loop runs `test_cases.len() - 1` times, so with a single test
    /// case it does not run at all and the function returns `false` without checking.
    fn is_each_test_case_matched_after_rotating_alternations(
        regex: &Regex,
        expr: &mut Expression,
        test_cases: &[String],
    ) -> bool {
        for _ in 1..test_cases.len() {
            if Self::regex_matches_all_test_cases(regex, test_cases) {
                return true;
            } else if let Expression::Alternation(options, _, _, _) = expr {
                options.rotate_right(1);
            } else if let Expression::Concatenation(first, second, _, _, _) = expr {
                let a: &mut Expression = first;
                let b: &mut Expression = second;

                if let Expression::Alternation(options, _, _, _) = a {
                    options.rotate_right(1);
                } else if let Expression::Alternation(options, _, _, _) = b {
                    options.rotate_right(1);
                }
            }
        }
        false
    }
}

impl Display for RegExp<'_> {
    /// Renders the final regular expression: optional flags, optional `^`, the
    /// expression itself (wrapped in a group if it is a top-level alternation),
    /// and optional `$`. In verbose mode the result is additionally escaped for
    /// `(?x)` and indented.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        // Inline flag prefix for case-insensitive and/or verbose mode.
        let flag =
            if self.config.is_case_insensitive_matching && self.config.is_verbose_mode_enabled {
                Component::IgnoreCaseAndVerboseModeFlag.to_repr(self.config.is_output_colorized)
            } else if self.config.is_case_insensitive_matching {
                Component::IgnoreCaseFlag.to_repr(self.config.is_output_colorized)
            } else if self.config.is_verbose_mode_enabled {
                Component::VerboseModeFlag.to_repr(self.config.is_output_colorized)
            } else {
                String::new()
            };

        let caret = if self.config.is_start_anchor_disabled {
            String::new()
        } else {
            Component::Caret(self.config.is_verbose_mode_enabled)
                .to_repr(self.config.is_output_colorized)
        };

        let dollar_sign = if self.config.is_end_anchor_disabled {
            String::new()
        } else {
            Component::DollarSign(self.config.is_verbose_mode_enabled)
                .to_repr(self.config.is_output_colorized)
        };

        let mut regexp = match self.ast {
            // A top-level alternation is always grouped, whether or not anchors follow.
            Expression::Alternation(_, _, _, _) => {
                format!(
                    "{}{}{}{}",
                    flag,
                    caret,
                    if self.config.is_capturing_group_enabled {
                        Component::CapturedParenthesizedExpression(
                            self.ast.to_string(),
                            self.config.is_verbose_mode_enabled,
                            false,
                        )
                        .to_repr(self.config.is_output_colorized)
                    } else {
                        Component::UncapturedParenthesizedExpression(
                            self.ast.to_string(),
                            self.config.is_verbose_mode_enabled,
                            false,
                        )
                        .to_repr(self.config.is_output_colorized)
                    },
                    dollar_sign
                )
            }
            _ => {
                format!("{}{}{}{}", flag, caret, self.ast, dollar_sign)
            }
        };

        regexp = regexp
            .replace('\u{b}', "\\v") // U+000B Line Tabulation
            .replace('\u{c}', "\\f"); // U+000C Form Feed

        if self.config.is_verbose_mode_enabled {
            // In verbose mode `#` and literal whitespace have to be written as escapes.
            // NOTE(review): the first seven entries of the list render as plain spaces in
            // this view; presumably they are distinct Unicode whitespace characters in the
            // real file. If they were U+0020, the final `.replace(' ', "\\ ")` could never
            // match — confirm against the repository.
            regexp = regexp
                .replace('#', "\\#")
                .replace(
                    [
                        ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u{85}', '\u{a0}', '\u{1680}',
                        '\u{2000}', '\u{2001}', '\u{2002}', '\u{2003}', '\u{2004}', '\u{2005}',
                        '\u{2006}', '\u{2007}', '\u{2008}', '\u{2009}', '\u{200a}', '\u{2028}',
                        '\u{2029}', '\u{202f}', '\u{205f}', '\u{3000}',
                    ],
                    "\\s",
                )
                .replace(' ', "\\ ");
        }

        write!(
            f,
            "{}",
            if self.config.is_verbose_mode_enabled {
                indent_regexp(regexp, self.config)
            } else {
                regexp
            }
        )
    }
}
config: &RegExpConfig) -> String { 249 | let mut indented_regexp = vec![]; 250 | let mut nesting_level = 0; 251 | 252 | for (i, line) in regexp.lines().enumerate() { 253 | if i == 1 && config.is_start_anchor_disabled { 254 | nesting_level += 1; 255 | } 256 | if line.is_empty() { 257 | continue; 258 | } 259 | 260 | let is_colored_line = line.starts_with("\u{1b}["); 261 | 262 | if nesting_level > 0 263 | && ((is_colored_line && (line.contains('$') || line.contains(')'))) 264 | || (line == "$" || line.starts_with(')'))) 265 | { 266 | nesting_level -= 1; 267 | } 268 | 269 | let indentation = " ".repeat(nesting_level); 270 | indented_regexp.push(format!("{indentation}{line}")); 271 | 272 | if (is_colored_line && (line.contains('^') || (i > 0 && line.contains('(')))) 273 | || (line == "^" || (i > 0 && line.starts_with('('))) 274 | { 275 | nesting_level += 1; 276 | } 277 | } 278 | 279 | indented_regexp.join("\n") 280 | } 281 | -------------------------------------------------------------------------------- /src/substring.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
 */

/// Two-valued marker with the variants `Prefix` and `Suffix`.
///
/// NOTE(review): presumably selects the leading or the trailing part of a
/// sequence; the code that consumes it is outside this file — confirm there.
pub enum Substring {
    /// The leading part (assumed from the name).
    Prefix,
    /// The trailing part (assumed from the name).
    Suffix,
}

--------------------------------------------------------------------------------
/src/unicode_tables/decimal.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
//   ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.3.0 is available on crates.io.

/// Table of `(first, last)` character pairs, one pair per run of decimal digits
/// (e.g. `('0', '9')`).
///
/// NOTE(review): the pairs look like inclusive ranges in ascending code point
/// order — confirm against the generator's output format.
pub const DECIMAL_NUMBER: &[(char, char)] = &[
    ('0', '9'),
    ('٠', '٩'),
    ('۰', '۹'),
    ('߀', '߉'),
    ('०', '९'),
    ('০', '৯'),
    ('੦', '੯'),
    ('૦', '૯'),
    ('୦', '୯'),
    ('௦', '௯'),
    ('౦', '౯'),
    ('೦', '೯'),
    ('൦', '൯'),
    ('෦', '෯'),
    ('๐', '๙'),
    ('໐', '໙'),
    ('༠', '༩'),
    ('၀', '၉'),
    ('႐', '႙'),
    ('០', '៩'),
    ('᠐', '᠙'),
    ('᥆', '᥏'),
    ('᧐', '᧙'),
    ('᪀', '᪉'),
    ('᪐', '᪙'),
    ('᭐', '᭙'),
    ('᮰', '᮹'),
    ('᱀', '᱉'),
    ('᱐', '᱙'),
    ('꘠', '꘩'),
    ('꣐', '꣙'),
    ('꤀', '꤉'),
    ('꧐', '꧙'),
    ('꧰', '꧹'),
    ('꩐', '꩙'),
    ('꯰', '꯹'),
    // NOTE(review): this entry repeats the first row as rendered here; given its
    // position in the table it was presumably the fullwidth digits
    // U+FF10..U+FF19 before extraction — confirm against the generator output.
    ('0', '9'),
    ('𐒠', '𐒩'),
    ('𐴰', '𐴹'),
    ('𑁦', '𑁯'),
    ('𑃰', '𑃹'),
    ('𑄶', '𑄿'),
    ('𑇐', '𑇙'),
    ('𑋰', '𑋹'),
    ('𑑐', '𑑙'),
    ('𑓐', '𑓙'),
    ('𑙐', '𑙙'),
    ('𑛀', '𑛉'),
    ('𑜰', '𑜹'),
    ('𑣠', '𑣩'),
    ('𑥐', '𑥙'),
    ('𑱐', '𑱙'),
    ('𑵐', '𑵙'),
    ('𑶠', '𑶩'),
    ('𑽐', '𑽙'),
    ('𖩠', '𖩩'),
    ('𖫀', '𖫉'),
    ('𖭐', '𖭙'),
    ('𝟎', '𝟿'),
    ('𞅀', '𞅉'),
    ('𞋰', '𞋹'),
    ('𞓰', '𞓹'),
    ('𞥐', '𞥙'),
    ('🯰', '🯹'),
];

--------------------------------------------------------------------------------
/src/unicode_tables/mod.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// One private submodule per character table; each table is re-exported below
// so that callers can name it directly from `unicode_tables`.
mod decimal;
mod space;
mod word;

pub use decimal::DECIMAL_NUMBER;
pub use space::WHITE_SPACE;
pub use word::WORD;

--------------------------------------------------------------------------------
/src/unicode_tables/space.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
//
//   ucd-generate property-bool ucd-15.0.0 --chars --include whitespace
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.3.0 is available on crates.io.

/// Table of `(first, last)` character pairs for whitespace characters; a single
/// character is stored as a pair whose two members are equal.
///
/// NOTE(review): the pairs look like inclusive ranges (`('\t', '\r')` spans
/// U+0009..U+000D) — confirm against the generator's output format.
pub const WHITE_SPACE: &[(char, char)] = &[
    ('\t', '\r'),
    (' ', ' '),
    ('\u{85}', '\u{85}'),
    ('\u{a0}', '\u{a0}'),
    ('\u{1680}', '\u{1680}'),
    ('\u{2000}', '\u{200a}'),
    ('\u{2028}', '\u{2029}'),
    ('\u{202f}', '\u{202f}'),
    ('\u{205f}', '\u{205f}'),
    ('\u{3000}', '\u{3000}'),
];

--------------------------------------------------------------------------------
/src/wasm.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M.
Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Method and parameter names are camelCase on purpose: they are the names
// exposed through `#[wasm_bindgen]`.
#![allow(non_snake_case)]

use crate::builder::{
    RegExpBuilder as Builder, MINIMUM_REPETITIONS_MESSAGE, MINIMUM_SUBSTRING_LENGTH_MESSAGE,
    MISSING_TEST_CASES_MESSAGE,
};
use itertools::Itertools;
use wasm_bindgen::prelude::*;

/// This class builds regular expressions from user-provided test cases.
///
/// It wraps the crate's own builder; every setter flips the matching switch in
/// the wrapped builder's configuration and returns a clone of the wrapper so
/// that calls can be chained.
#[wasm_bindgen]
#[derive(Clone)]
pub struct RegExpBuilder {
    builder: Builder,
}

#[wasm_bindgen]
impl RegExpBuilder {
    /// Specifies the test cases to build the regular expression from.
    ///
    /// The test cases need not be sorted because `RegExpBuilder` sorts them internally.
    /// Values in `testCases` that are not strings are ignored.
    ///
    /// ⚠ Throws an error if `testCases` is empty.
    pub fn from(testCases: Box<[JsValue]>) -> Result<RegExpBuilder, JsValue> {
        let strs = testCases
            .iter()
            .filter_map(|it| it.as_string())
            .collect_vec();

        if strs.is_empty() {
            return Err(JsValue::from(MISSING_TEST_CASES_MESSAGE));
        }
        Ok(RegExpBuilder {
            builder: Builder::from(&strs),
        })
    }

    /// Tells `RegExpBuilder` to convert any Unicode decimal digit to character class `\d`.
    ///
    /// This method takes precedence over `withConversionOfWords` if both are set.
    /// Decimal digits are converted to `\d`, the remaining word characters to `\w`.
    ///
    /// This method takes precedence over `withConversionOfWhitespace` if both are set.
    /// Decimal digits are converted to `\d`, the remaining non-whitespace characters to `\S`.
    pub fn withConversionOfDigits(&mut self) -> RegExpBuilder {
        self.builder.config.is_digit_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to convert any character which is not
    /// a Unicode decimal digit to character class `\D`.
    ///
    /// This method takes precedence over `withConversionOfNonWords` if both are set.
    /// Non-digits which are also non-word characters are converted to `\D`.
    ///
    /// This method takes precedence over `withConversionOfNonWhitespace` if both are set.
    /// Non-digits which are also non-space characters are converted to `\D`.
    pub fn withConversionOfNonDigits(&mut self) -> RegExpBuilder {
        self.builder.config.is_non_digit_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to convert any Unicode whitespace character to character class `\s`.
    ///
    /// This method takes precedence over `withConversionOfNonDigits` if both are set.
    /// Whitespace characters are converted to `\s`, the remaining non-digit characters to `\D`.
    ///
    /// This method takes precedence over `withConversionOfNonWords` if both are set.
    /// Whitespace characters are converted to `\s`, the remaining non-word characters to `\W`.
    pub fn withConversionOfWhitespace(&mut self) -> RegExpBuilder {
        self.builder.config.is_space_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to convert any character which is not
    /// a Unicode whitespace character to character class `\S`.
    pub fn withConversionOfNonWhitespace(&mut self) -> RegExpBuilder {
        self.builder.config.is_non_space_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to convert any Unicode word character to character class `\w`.
    ///
    /// This method takes precedence over `withConversionOfNonDigits` if both are set.
    /// Word characters are converted to `\w`, the remaining non-digit characters to `\D`.
    ///
    /// This method takes precedence over `withConversionOfNonWhitespace` if both are set.
    /// Word characters are converted to `\w`, the remaining non-space characters to `\S`.
    pub fn withConversionOfWords(&mut self) -> RegExpBuilder {
        self.builder.config.is_word_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to convert any character which is not
    /// a Unicode word character to character class `\W`.
    ///
    /// This method takes precedence over `withConversionOfNonWhitespace` if both are set.
    /// Non-words which are also non-space characters are converted to `\W`.
    pub fn withConversionOfNonWords(&mut self) -> RegExpBuilder {
        self.builder.config.is_non_word_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to detect repeated non-overlapping substrings and
    /// to convert them to `{min,max}` quantifier notation.
    pub fn withConversionOfRepetitions(&mut self) -> RegExpBuilder {
        self.builder.config.is_repetition_converted = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to enable case-insensitive matching of test cases
    /// so that letters match both upper and lower case.
    pub fn withCaseInsensitiveMatching(&mut self) -> RegExpBuilder {
        self.builder.config.is_case_insensitive_matching = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to replace non-capturing groups by capturing ones.
    pub fn withCapturingGroups(&mut self) -> RegExpBuilder {
        self.builder.config.is_capturing_group_enabled = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to convert non-ASCII characters to unicode escape sequences.
    /// The parameter `useSurrogatePairs` specifies whether to convert astral code planes
    /// (range `U+010000` to `U+10FFFF`) to surrogate pairs.
    pub fn withEscapingOfNonAsciiChars(&mut self, useSurrogatePairs: bool) -> RegExpBuilder {
        self.builder.config.is_non_ascii_char_escaped = true;
        self.builder
            .config
            .is_astral_code_point_converted_to_surrogate = useSurrogatePairs;
        self.clone()
    }

    /// Tells `RegExpBuilder` to produce a nicer looking regular expression in verbose mode.
    pub fn withVerboseMode(&mut self) -> RegExpBuilder {
        self.builder.config.is_verbose_mode_enabled = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to remove the caret anchor '^' from the resulting regular
    /// expression, thereby allowing to match the test cases also when they do not occur
    /// at the start of a string.
    pub fn withoutStartAnchor(&mut self) -> RegExpBuilder {
        self.builder.config.is_start_anchor_disabled = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to remove the dollar sign anchor '$' from the resulting regular
    /// expression, thereby allowing to match the test cases also when they do not occur
    /// at the end of a string.
    pub fn withoutEndAnchor(&mut self) -> RegExpBuilder {
        self.builder.config.is_end_anchor_disabled = true;
        self.clone()
    }

    /// Tells `RegExpBuilder` to remove the caret and dollar sign anchors from the resulting
    /// regular expression, thereby allowing to match the test cases also when they occur
    /// within a larger string that contains other content as well.
    pub fn withoutAnchors(&mut self) -> RegExpBuilder {
        self.builder.config.is_start_anchor_disabled = true;
        self.builder.config.is_end_anchor_disabled = true;
        self.clone()
    }

    /// Specifies the minimum quantity of substring repetitions to be converted
    /// if `withConversionOfRepetitions` is set.
    ///
    /// If the quantity is not explicitly set with this method, a default value of 1 will be used.
    ///
    /// ⚠ Throws an error if `quantity` is zero.
    pub fn withMinimumRepetitions(&mut self, quantity: u32) -> Result<RegExpBuilder, JsValue> {
        if quantity < 1 {
            return Err(JsValue::from(MINIMUM_REPETITIONS_MESSAGE));
        }
        self.builder.config.minimum_repetitions = quantity;
        Ok(self.clone())
    }

    /// Specifies the minimum length a repeated substring must have in order to be converted
    /// if `withConversionOfRepetitions` is set.
    ///
    /// If the length is not explicitly set with this method, a default value of 1 will be used.
    ///
    /// ⚠ Throws an error if `length` is zero.
    pub fn withMinimumSubstringLength(&mut self, length: u32) -> Result<RegExpBuilder, JsValue> {
        if length < 1 {
            return Err(JsValue::from(MINIMUM_SUBSTRING_LENGTH_MESSAGE));
        }
        self.builder.config.minimum_substring_length = length;
        Ok(self.clone())
    }

    /// Builds the actual regular expression using the previously given settings.
    pub fn build(&mut self) -> String {
        self.builder.build()
    }
}

--------------------------------------------------------------------------------
/tests/python/test_grex.py:
--------------------------------------------------------------------------------
#
# Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
import pytest
import re

from grex import RegExpBuilder


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["abc", "abd", "abe"], "^ab[c-e]$"),
    ]
)
def test_default_settings(test_cases, expected_pattern):
    """Builds with no options set, compares the pattern and matches each input."""
    pattern = RegExpBuilder.from_test_cases(test_cases).build()
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["My ♥ and 💩 is yours."], "^My \\u2665 and \\U0001f4a9 is yours\\.$"),
    ]
)
def test_escaping(test_cases, expected_pattern):
    """Escapes non-ASCII characters without surrogate pairs and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_escaping_of_non_ascii_chars(use_surrogate_pairs=False)
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["My ♥ and 💩 is yours."], "^My \\u2665 and \\ud83d\\udca9 is yours\\.$"),
    ]
)
def test_escaping_with_surrogate_pairs(test_cases, expected_pattern):
    """Escapes non-ASCII characters with surrogate pairs; only the pattern text is checked."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_escaping_of_non_ascii_chars(use_surrogate_pairs=True)
               .build())
    assert pattern == expected_pattern
    # module re does not support matching surrogate pairs


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["efgh", "abcxy", "abcw"], "^(abc(xy|w)|efgh)$"),
    ]
)
def test_capturing_groups(test_cases, expected_pattern):
    """Enables capturing groups, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_capturing_groups()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["efgh", "abcxy", "abcw"], "(?:abc(?:xy|w)|efgh)"),
    ]
)
def test_without_anchors(test_cases, expected_pattern):
    """Drops both anchors, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .without_anchors()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["ABC", "zBC", "abc", "AbC", "aBc"], "(?i)^[az]bc$"),
    ]
)
def test_case_insensitive_matching(test_cases, expected_pattern):
    """Enables case-insensitive matching, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_case_insensitive_matching()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        # NOTE(review): the relative indentation inside this multi-line literal was
        # reconstructed (two spaces per nesting level is assumed) — confirm against
        # the builder's actual verbose-mode output.
        pytest.param(
            ["[a-z]", "(d,e,f)"],
            inspect.cleandoc("""
                (?x)
                ^
                  (?:
                    \\(d,e,f\\)
                    |
                    \\[a\\-z\\]
                  )
                $
                """)
        ),
    ]
)
def test_verbose_mode(test_cases, expected_pattern):
    """Enables verbose mode, compares the multi-line pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_verbose_mode()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        # NOTE(review): the relative indentation inside this multi-line literal was
        # reconstructed (two spaces per nesting level is assumed) — confirm against
        # the builder's actual verbose-mode output.
        pytest.param(
            ["Ä@Ö€Ü", "ä@ö€ü", "Ä@ö€Ü", "ä@Ö€ü"],
            inspect.cleandoc("""
                (?ix)
                ^
                  ä@ö€ü
                $
                """)
        )
    ]
)
def test_case_insensitive_matching_and_verbose_mode(test_cases, expected_pattern):
    """Combines case-insensitive matching with verbose mode and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_case_insensitive_matching()
               .with_verbose_mode()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["a", "b\nx\nx", "c"], "^(?:b(?:\\nx){2}|[ac])$"),
    ]
)
def test_conversion_of_repetitions(test_cases, expected_pattern):
    """Enables repetition conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_repetitions()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["My ♥♥♥ and 💩💩 is yours."], "^My \\u2665{3} and \\U0001f4a9{2} is yours\\.$"),
    ]
)
def test_escaping_and_conversion_of_repetitions(test_cases, expected_pattern):
    """Combines non-ASCII escaping with repetition conversion and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_escaping_of_non_ascii_chars(use_surrogate_pairs=False)
               .with_conversion_of_repetitions()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["a1b2c3"], "^a\\db\\dc\\d$"),
    ]
)
def test_conversion_of_digits(test_cases, expected_pattern):
    """Enables digit conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_digits()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["a1b2c3"], "^\\D1\\D2\\D3$"),
    ]
)
def test_conversion_of_non_digits(test_cases, expected_pattern):
    """Enables non-digit conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_non_digits()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["\n\t", "\r"], "^\\s(?:\\s)?$"),
    ]
)
def test_conversion_of_whitespace(test_cases, expected_pattern):
    """Enables whitespace conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_whitespace()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["a1 b2 c3"], "^\\S\\S \\S\\S \\S\\S$"),
    ]
)
def test_conversion_of_non_whitespace(test_cases, expected_pattern):
    """Enables non-whitespace conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_non_whitespace()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["abc", "1234"], "^\\w\\w\\w(?:\\w)?$"),
    ]
)
def test_conversion_of_words(test_cases, expected_pattern):
    """Enables word conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_words()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["abc 1234"], "^abc\\W1234$"),
    ]
)
def test_conversion_of_non_words(test_cases, expected_pattern):
    """Enables non-word conversion, compares the pattern and matches each input."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_non_words()
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["aababab"], "^aababab$"),
        pytest.param(["aabababab"], "^a(?:ab){4}$")
    ]
)
def test_minimum_repetitions(test_cases, expected_pattern):
    """Converts repetitions with a minimum of 3; the first case stays unconverted."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_repetitions()
               .with_minimum_repetitions(3)
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


@pytest.mark.parametrize(
    "test_cases,expected_pattern",
    [
        pytest.param(["ababab"], "^ababab$"),
        pytest.param(["abcabcabc"], "^(?:abc){3}$")
    ]
)
def test_minimum_substring_length(test_cases, expected_pattern):
    """Converts repetitions with a minimum substring length of 3; the first case stays unconverted."""
    pattern = (RegExpBuilder.from_test_cases(test_cases)
               .with_conversion_of_repetitions()
               .with_minimum_substring_length(3)
               .build())
    assert pattern == expected_pattern
    for test_case in test_cases:
        assert re.match(pattern, test_case)


def test_error_for_empty_test_cases():
    """An empty list of test cases raises ValueError with the expected message."""
    with pytest.raises(ValueError) as exception_info:
        RegExpBuilder.from_test_cases([])
    assert (
        exception_info.value.args[0] ==
        "No test cases have been provided for regular expression generation"
    )


def test_error_for_invalid_minimum_repetitions():
    """A negative minimum repetition count raises ValueError with the expected message."""
    with pytest.raises(ValueError) as exception_info:
        RegExpBuilder.from_test_cases(["abcd"]).with_minimum_repetitions(-4)
    assert (
        exception_info.value.args[0] ==
        "Quantity of minimum repetitions must be greater than zero"
    )


def test_error_for_invalid_minimum_substring_length():
    """A negative minimum substring length raises ValueError with the expected message."""
    with pytest.raises(ValueError) as exception_info:
        RegExpBuilder.from_test_cases(["abcd"]).with_minimum_substring_length(-2)
    assert (
        exception_info.value.args[0] ==
        "Minimum substring length must be greater than zero"
    )

--------------------------------------------------------------------------------
/tests/wasm_browser_tests.rs:
--------------------------------------------------------------------------------
/*
 * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | #![cfg(target_family = "wasm")] 18 | 19 | use grex::WasmRegExpBuilder; 20 | use indoc::indoc; 21 | use wasm_bindgen::JsValue; 22 | use wasm_bindgen_test::*; 23 | 24 | wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser); 25 | 26 | #[wasm_bindgen_test] 27 | fn assert_regexpbuilder_succeeds() { 28 | let test_cases = Box::new([JsValue::from("hello"), JsValue::from("world")]); 29 | let builder = WasmRegExpBuilder::from(test_cases); 30 | assert!(builder.is_ok()); 31 | let regexp = builder.unwrap().build(); 32 | assert_eq!(regexp, "^(?:hello|world)$"); 33 | } 34 | 35 | #[wasm_bindgen_test] 36 | fn assert_regexpbuilder_fails() { 37 | let builder = WasmRegExpBuilder::from(Box::new([])); 38 | assert_eq!( 39 | builder.err(), 40 | Some(JsValue::from( 41 | "No test cases have been provided for regular expression generation" 42 | )) 43 | ); 44 | } 45 | 46 | #[wasm_bindgen_test] 47 | fn test_conversion_of_digits() { 48 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 49 | let regexp = WasmRegExpBuilder::from(test_cases) 50 | .unwrap() 51 | .withConversionOfDigits() 52 | .build(); 53 | assert_eq!(regexp, "^(?:abc |\\d\\d\\d)$"); 54 | } 55 | 56 | #[wasm_bindgen_test] 57 | fn test_conversion_of_non_digits() { 58 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 59 | let regexp = WasmRegExpBuilder::from(test_cases) 60 | .unwrap() 61 | .withConversionOfNonDigits() 62 | .build(); 63 | assert_eq!(regexp, "^(?:\\D\\D\\D\\D\\D|123)$"); 64 | } 65 | 66 | #[wasm_bindgen_test] 67 | fn test_conversion_of_whitespace() { 68 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 69 | let regexp = WasmRegExpBuilder::from(test_cases) 70 | .unwrap() 71 | .withConversionOfWhitespace() 72 | .build(); 73 | assert_eq!(regexp, "^(?:abc\\s\\s|123)$"); 74 | } 75 | 76 | #[wasm_bindgen_test] 77 | fn test_conversion_of_non_whitespace() { 78 | let test_cases = Box::new([JsValue::from("abc "), 
JsValue::from("123")]); 79 | let regexp = WasmRegExpBuilder::from(test_cases) 80 | .unwrap() 81 | .withConversionOfNonWhitespace() 82 | .build(); 83 | assert_eq!(regexp, "^\\S\\S\\S(?: )?$"); 84 | } 85 | 86 | #[wasm_bindgen_test] 87 | fn test_conversion_of_words() { 88 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 89 | let regexp = WasmRegExpBuilder::from(test_cases) 90 | .unwrap() 91 | .withConversionOfWords() 92 | .build(); 93 | assert_eq!(regexp, "^\\w\\w\\w(?: )?$"); 94 | } 95 | 96 | #[wasm_bindgen_test] 97 | fn test_conversion_of_non_words() { 98 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 99 | let regexp = WasmRegExpBuilder::from(test_cases) 100 | .unwrap() 101 | .withConversionOfNonWords() 102 | .build(); 103 | assert_eq!(regexp, "^(?:abc\\W\\W|123)$"); 104 | } 105 | 106 | #[wasm_bindgen_test] 107 | fn test_conversion_of_repetitions() { 108 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 109 | let regexp = WasmRegExpBuilder::from(test_cases) 110 | .unwrap() 111 | .withConversionOfRepetitions() 112 | .build(); 113 | assert_eq!(regexp, "^(?:abc {2}|123)$"); 114 | } 115 | 116 | #[wasm_bindgen_test] 117 | fn test_case_insensitive_matching() { 118 | let test_cases = Box::new([ 119 | JsValue::from("ABC"), 120 | JsValue::from("abc "), 121 | JsValue::from("123"), 122 | ]); 123 | let regexp = WasmRegExpBuilder::from(test_cases) 124 | .unwrap() 125 | .withCaseInsensitiveMatching() 126 | .build(); 127 | assert_eq!(regexp, "(?i)^(?:abc(?: )?|123)$"); 128 | } 129 | 130 | #[wasm_bindgen_test] 131 | fn test_capturing_groups() { 132 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 133 | let regexp = WasmRegExpBuilder::from(test_cases) 134 | .unwrap() 135 | .withCapturingGroups() 136 | .build(); 137 | assert_eq!(regexp, "^(abc |123)$"); 138 | } 139 | 140 | #[wasm_bindgen_test] 141 | fn test_escaping_of_non_ascii_chars() { 142 | let test_cases = Box::new([ 143 
| JsValue::from("abc "), 144 | JsValue::from("123"), 145 | JsValue::from("♥"), 146 | ]); 147 | let regexp = WasmRegExpBuilder::from(test_cases) 148 | .unwrap() 149 | .withEscapingOfNonAsciiChars(false) 150 | .build(); 151 | assert_eq!(regexp, "^(?:abc |123|\\u{2665})$"); 152 | } 153 | 154 | #[wasm_bindgen_test] 155 | fn test_verbose_mode() { 156 | let test_cases = Box::new([ 157 | JsValue::from("abc "), 158 | JsValue::from("123"), 159 | JsValue::from("♥"), 160 | ]); 161 | let regexp = WasmRegExpBuilder::from(test_cases) 162 | .unwrap() 163 | .withVerboseMode() 164 | .build(); 165 | assert_eq!( 166 | regexp, 167 | indoc!( 168 | r#" 169 | (?x) 170 | ^ 171 | (?: 172 | abc\ \ 173 | | 174 | 123 175 | | 176 | ♥ 177 | ) 178 | $"# 179 | ) 180 | ); 181 | } 182 | 183 | #[wasm_bindgen_test] 184 | fn test_without_start_anchor() { 185 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 186 | let regexp = WasmRegExpBuilder::from(test_cases) 187 | .unwrap() 188 | .withoutStartAnchor() 189 | .build(); 190 | assert_eq!(regexp, "(?:abc |123)$"); 191 | } 192 | 193 | #[wasm_bindgen_test] 194 | fn test_without_end_anchor() { 195 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 196 | let regexp = WasmRegExpBuilder::from(test_cases) 197 | .unwrap() 198 | .withoutEndAnchor() 199 | .build(); 200 | assert_eq!(regexp, "^(?:abc |123)"); 201 | } 202 | 203 | #[wasm_bindgen_test] 204 | fn test_without_anchors() { 205 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 206 | let regexp = WasmRegExpBuilder::from(test_cases) 207 | .unwrap() 208 | .withoutAnchors() 209 | .build(); 210 | assert_eq!(regexp, "(?:abc |123)"); 211 | } 212 | 213 | #[wasm_bindgen_test] 214 | fn test_minimum_repetitions() { 215 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 216 | let builder = WasmRegExpBuilder::from(test_cases) 217 | .unwrap() 218 | .withMinimumRepetitions(0); 219 | assert_eq!( 220 | builder.err(), 221 | 
Some(JsValue::from( 222 | "Quantity of minimum repetitions must be greater than zero" 223 | )) 224 | ); 225 | } 226 | 227 | #[wasm_bindgen_test] 228 | fn test_minimum_substring_length() { 229 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 230 | let builder = WasmRegExpBuilder::from(test_cases) 231 | .unwrap() 232 | .withMinimumSubstringLength(0); 233 | assert_eq!( 234 | builder.err(), 235 | Some(JsValue::from( 236 | "Minimum substring length must be greater than zero" 237 | )) 238 | ); 239 | } 240 | -------------------------------------------------------------------------------- /tests/wasm_node_tests.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright © 2019-today Peter M. Stahl pemistahl@gmail.com 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #![cfg(target_family = "wasm")] 18 | 19 | use grex::WasmRegExpBuilder; 20 | use indoc::indoc; 21 | use wasm_bindgen::JsValue; 22 | use wasm_bindgen_test::*; 23 | 24 | #[wasm_bindgen_test] 25 | fn assert_regexpbuilder_succeeds() { 26 | let test_cases = Box::new([JsValue::from("hello"), JsValue::from("world")]); 27 | let builder = WasmRegExpBuilder::from(test_cases); 28 | assert!(builder.is_ok()); 29 | let regexp = builder.unwrap().build(); 30 | assert_eq!(regexp, "^(?:hello|world)$"); 31 | } 32 | 33 | #[wasm_bindgen_test] 34 | fn assert_regexpbuilder_fails() { 35 | let builder = WasmRegExpBuilder::from(Box::new([])); 36 | assert_eq!( 37 | builder.err(), 38 | Some(JsValue::from( 39 | "No test cases have been provided for regular expression generation" 40 | )) 41 | ); 42 | } 43 | 44 | #[wasm_bindgen_test] 45 | fn test_conversion_of_digits() { 46 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 47 | let regexp = WasmRegExpBuilder::from(test_cases) 48 | .unwrap() 49 | .withConversionOfDigits() 50 | .build(); 51 | assert_eq!(regexp, "^(?:abc |\\d\\d\\d)$"); 52 | } 53 | 54 | #[wasm_bindgen_test] 55 | fn test_conversion_of_non_digits() { 56 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 57 | let regexp = WasmRegExpBuilder::from(test_cases) 58 | .unwrap() 59 | .withConversionOfNonDigits() 60 | .build(); 61 | assert_eq!(regexp, "^(?:\\D\\D\\D\\D\\D|123)$"); 62 | } 63 | 64 | #[wasm_bindgen_test] 65 | fn test_conversion_of_whitespace() { 66 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 67 | let regexp = WasmRegExpBuilder::from(test_cases) 68 | .unwrap() 69 | .withConversionOfWhitespace() 70 | .build(); 71 | assert_eq!(regexp, "^(?:abc\\s\\s|123)$"); 72 | } 73 | 74 | #[wasm_bindgen_test] 75 | fn test_conversion_of_non_whitespace() { 76 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 77 | let regexp = WasmRegExpBuilder::from(test_cases) 78 | 
.unwrap() 79 | .withConversionOfNonWhitespace() 80 | .build(); 81 | assert_eq!(regexp, "^\\S\\S\\S(?: )?$"); 82 | } 83 | 84 | #[wasm_bindgen_test] 85 | fn test_conversion_of_words() { 86 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 87 | let regexp = WasmRegExpBuilder::from(test_cases) 88 | .unwrap() 89 | .withConversionOfWords() 90 | .build(); 91 | assert_eq!(regexp, "^\\w\\w\\w(?: )?$"); 92 | } 93 | 94 | #[wasm_bindgen_test] 95 | fn test_conversion_of_non_words() { 96 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 97 | let regexp = WasmRegExpBuilder::from(test_cases) 98 | .unwrap() 99 | .withConversionOfNonWords() 100 | .build(); 101 | assert_eq!(regexp, "^(?:abc\\W\\W|123)$"); 102 | } 103 | 104 | #[wasm_bindgen_test] 105 | fn test_conversion_of_repetitions() { 106 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 107 | let regexp = WasmRegExpBuilder::from(test_cases) 108 | .unwrap() 109 | .withConversionOfRepetitions() 110 | .build(); 111 | assert_eq!(regexp, "^(?:abc {2}|123)$"); 112 | } 113 | 114 | #[wasm_bindgen_test] 115 | fn test_case_insensitive_matching() { 116 | let test_cases = Box::new([ 117 | JsValue::from("ABC"), 118 | JsValue::from("abc "), 119 | JsValue::from("123"), 120 | ]); 121 | let regexp = WasmRegExpBuilder::from(test_cases) 122 | .unwrap() 123 | .withCaseInsensitiveMatching() 124 | .build(); 125 | assert_eq!(regexp, "(?i)^(?:abc(?: )?|123)$"); 126 | } 127 | 128 | #[wasm_bindgen_test] 129 | fn test_capturing_groups() { 130 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 131 | let regexp = WasmRegExpBuilder::from(test_cases) 132 | .unwrap() 133 | .withCapturingGroups() 134 | .build(); 135 | assert_eq!(regexp, "^(abc |123)$"); 136 | } 137 | 138 | #[wasm_bindgen_test] 139 | fn test_escaping_of_non_ascii_chars() { 140 | let test_cases = Box::new([ 141 | JsValue::from("abc "), 142 | JsValue::from("123"), 143 | JsValue::from("♥"), 144 | 
]); 145 | let regexp = WasmRegExpBuilder::from(test_cases) 146 | .unwrap() 147 | .withEscapingOfNonAsciiChars(false) 148 | .build(); 149 | assert_eq!(regexp, "^(?:abc |123|\\u{2665})$"); 150 | } 151 | 152 | #[wasm_bindgen_test] 153 | fn test_verbose_mode() { 154 | let test_cases = Box::new([ 155 | JsValue::from("abc "), 156 | JsValue::from("123"), 157 | JsValue::from("♥"), 158 | ]); 159 | let regexp = WasmRegExpBuilder::from(test_cases) 160 | .unwrap() 161 | .withVerboseMode() 162 | .build(); 163 | assert_eq!( 164 | regexp, 165 | indoc!( 166 | r#" 167 | (?x) 168 | ^ 169 | (?: 170 | abc\ \ 171 | | 172 | 123 173 | | 174 | ♥ 175 | ) 176 | $"# 177 | ) 178 | ); 179 | } 180 | 181 | #[wasm_bindgen_test] 182 | fn test_without_start_anchor() { 183 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 184 | let regexp = WasmRegExpBuilder::from(test_cases) 185 | .unwrap() 186 | .withoutStartAnchor() 187 | .build(); 188 | assert_eq!(regexp, "(?:abc |123)$"); 189 | } 190 | 191 | #[wasm_bindgen_test] 192 | fn test_without_end_anchor() { 193 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 194 | let regexp = WasmRegExpBuilder::from(test_cases) 195 | .unwrap() 196 | .withoutEndAnchor() 197 | .build(); 198 | assert_eq!(regexp, "^(?:abc |123)"); 199 | } 200 | 201 | #[wasm_bindgen_test] 202 | fn test_without_anchors() { 203 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 204 | let regexp = WasmRegExpBuilder::from(test_cases) 205 | .unwrap() 206 | .withoutAnchors() 207 | .build(); 208 | assert_eq!(regexp, "(?:abc |123)"); 209 | } 210 | 211 | #[wasm_bindgen_test] 212 | fn test_minimum_repetitions() { 213 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 214 | let builder = WasmRegExpBuilder::from(test_cases) 215 | .unwrap() 216 | .withMinimumRepetitions(0); 217 | assert_eq!( 218 | builder.err(), 219 | Some(JsValue::from( 220 | "Quantity of minimum repetitions must be greater than 
zero" 221 | )) 222 | ); 223 | } 224 | 225 | #[wasm_bindgen_test] 226 | fn test_minimum_substring_length() { 227 | let test_cases = Box::new([JsValue::from("abc "), JsValue::from("123")]); 228 | let builder = WasmRegExpBuilder::from(test_cases) 229 | .unwrap() 230 | .withMinimumSubstringLength(0); 231 | assert_eq!( 232 | builder.err(), 233 | Some(JsValue::from( 234 | "Minimum substring length must be greater than zero" 235 | )) 236 | ); 237 | } 238 | -------------------------------------------------------------------------------- /website.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pemistahl/grex/cb71c10815e2216f4941f0b52154fb5d1fc0a01c/website.jpg --------------------------------------------------------------------------------