├── .cargo └── config.toml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── python.yml │ ├── release.yml │ ├── rust.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── DESIGN.md ├── LICENSE-APACHE ├── LICENSE-MIT ├── Makefile ├── README.md ├── README_CN.md ├── data ├── str_conv │ ├── DerivedGeneralCategory.txt │ ├── DerivedNumericValues.txt │ ├── EquivalentUnifiedIdeograph.txt │ ├── NormalizationTest.txt │ ├── PropList.txt │ ├── Unihan_Readings.txt │ └── Unihan_Variants.txt ├── text │ ├── cn │ │ ├── 三体.txt │ │ └── 西游记.txt │ └── en │ │ ├── bible_kjv.txt │ │ └── sherlock.txt └── word_list │ ├── cn │ ├── cn_words_100.txt │ ├── cn_words_100000.txt │ ├── cn_words_15000.txt │ ├── cn_words_30000.txt │ ├── cn_words_5000.txt │ └── cn_words_50000.txt │ └── en │ ├── en_words_100.txt │ ├── en_words_100000.txt │ ├── en_words_15000.txt │ ├── en_words_30000.txt │ ├── en_words_5000.txt │ └── en_words_50000.txt ├── matcher_c ├── Cargo.toml ├── README.md ├── extension_types.py ├── matcher_c.h └── src │ └── lib.rs ├── matcher_java ├── README.md ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── matcher_java │ │ ├── MatcherJava.java │ │ └── extension_types │ │ ├── MatchResult.java │ │ ├── MatchTable.java │ │ ├── MatchTableType.java │ │ ├── ProcessType.java │ │ ├── ProcessTypeSerializer.java │ │ ├── Regex.java │ │ ├── RegexMatchType.java │ │ ├── SimMatchType.java │ │ ├── Similar.java │ │ ├── Simple.java │ │ └── SimpleResult.java │ └── test │ └── java │ └── com │ └── matcher_java │ └── MatcherJavaExample.java ├── matcher_py ├── Cargo.lock ├── Cargo.toml ├── README.md ├── build.rs ├── pyproject.toml ├── python │ └── matcher_py │ │ ├── __init__.py │ │ ├── extension_types.py │ │ ├── matcher_py.pyi │ │ └── py.typed ├── src │ └── lib.rs ├── test │ ├── __init__.py │ ├── test_matcher.py │ └── test_simple_matcher.py └── uv.lock └── matcher_rs ├── Cargo.toml ├── README.md ├── benches └── bench.rs ├── build.rs ├── process_map ├── FANJIAN.txt ├── NORM.txt ├── NUM-NORM.txt ├── PINYIN.txt └── TEXT-DELETE.txt ├── src ├── lib.rs ├── matcher.rs ├── process │ ├── constants.rs │ ├── mod.rs │ └── process_matcher.rs ├── regex_matcher.rs ├── sim_matcher.rs ├── simple_matcher.rs └── util │ ├── mod.rs │ ├── serde.rs │ └── word.rs └── tests └── test.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | rustdocflags = ["-C", "target-cpu=native", "--document-private-items"] 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 
28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: python 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | linux: 13 | runs-on: ${{ matrix.platform.runner }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | platform: 18 | - runner: ubuntu-latest 19 | target: x86_64 20 | - runner: ubuntu-latest 21 | target: aarch64 22 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Install Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: "${{ matrix.python }}" 29 | - name: Build wheels 30 | uses: PyO3/maturin-action@v1 31 | env: 32 | RUSTFLAGS: "-Z threads=2 -D warnings" 33 | with: 34 | target: ${{ matrix.platform.target }} 35 | args: --release --out dist -i python${{ matrix.python }} 36 | sccache: 'true' 37 | manylinux: auto 38 | rust-toolchain: nightly 39 | working-directory: matcher_py 40 | - name: Upload wheels 41 | uses: actions/upload-artifact@v4 42 | with: 43 | name: wheels-linux-${{ matrix.platform.target }}-${{ matrix.python }} 44 | path: matcher_py/dist 45 | overwrite: true 46 | 47 | musllinux: 48 | runs-on: ${{ matrix.platform.runner }} 49 | strategy: 50 | fail-fast: false 51 | matrix: 52 | platform: 53 | - runner: ubuntu-latest 54 | target: x86_64 55 | - runner: ubuntu-latest 56 | target: aarch64 57 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 58 | steps: 59 | - uses: actions/checkout@v4 60 | - name: Install Python 61 | uses: actions/setup-python@v5 62 | with: 63 | python-version: "${{ matrix.python }}" 64 | - name: Build wheels 65 | uses: PyO3/maturin-action@v1 66 | env: 67 | RUSTFLAGS: "-Z threads=2 -D warnings" 68 | with: 69 | target: ${{ matrix.platform.target }} 70 | args: --release --out dist -i python${{ matrix.python }} 71 | sccache: 'true' 72 | manylinux: musllinux_1_2 73 | rust-toolchain: nightly 74 | working-directory: matcher_py 75 | - name: Upload wheels 76 | uses: actions/upload-artifact@v4 77 | with: 78 | name: wheels-musllinux-${{ matrix.platform.target }}-${{ matrix.python }} 79 | path: matcher_py/dist 80 | overwrite: true 81 | 82 | windows: 83 | runs-on: ${{ matrix.platform.runner }} 84 | strategy: 85 | fail-fast: false 86 | matrix: 87 | platform: 88 | - runner: windows-latest 89 | target: x64 90 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 91 | steps: 92 | - uses: actions/checkout@v4 93 | - name: Install Python 94 | uses: actions/setup-python@v5 95 | with: 96 | 
python-version: "${{ matrix.python }}" 97 | - name: Build wheels 98 | uses: PyO3/maturin-action@v1 99 | env: 100 | RUSTFLAGS: "-Z threads=2 -D warnings" 101 | with: 102 | target: ${{ matrix.platform.target }} 103 | args: --release --out dist -i python${{ matrix.python }} 104 | sccache: 'true' 105 | rust-toolchain: nightly 106 | working-directory: matcher_py 107 | - name: Upload wheels 108 | uses: actions/upload-artifact@v4 109 | with: 110 | name: wheels-windows-${{ matrix.platform.target }}-${{ matrix.python }} 111 | path: matcher_py/dist 112 | overwrite: true 113 | 114 | macos: 115 | runs-on: ${{ matrix.platform.runner }} 116 | strategy: 117 | fail-fast: false 118 | matrix: 119 | platform: 120 | - runner: macos-15 121 | target: aarch64 122 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 123 | steps: 124 | - uses: actions/checkout@v4 125 | - name: Install Python 126 | uses: actions/setup-python@v5 127 | with: 128 | python-version: "${{ matrix.python }}" 129 | - name: Build wheels 130 | uses: PyO3/maturin-action@v1 131 | env: 132 | RUSTFLAGS: "-Z threads=2 -D warnings" 133 | with: 134 | target: ${{ matrix.platform.target }} 135 | args: --release --out dist -i python${{ matrix.python }} 136 | sccache: 'true' 137 | rust-toolchain: nightly 138 | working-directory: matcher_py 139 | - name: Upload wheels 140 | uses: actions/upload-artifact@v4 141 | with: 142 | name: wheels-macos-${{ matrix.platform.target }}-${{ matrix.python }} 143 | path: matcher_py/dist 144 | overwrite: true 145 | 146 | sdist: 147 | runs-on: ubuntu-latest 148 | steps: 149 | - uses: actions/checkout@v4 150 | - name: Install Python 151 | uses: actions/setup-python@v5 152 | with: 153 | python-version: 3.13 154 | - name: Build sdist 155 | uses: PyO3/maturin-action@v1 156 | with: 157 | command: sdist 158 | args: --out dist 159 | rust-toolchain: nightly 160 | working-directory: matcher_py 161 | - name: Upload sdist 162 | uses: actions/upload-artifact@v4 163 | with: 164 | name: wheels-sdist 165 | path: matcher_py/dist 166 | overwrite: true 167 | 168 | release: 169 | name: Release 170 | runs-on: ubuntu-latest 171 | needs: [linux, musllinux, windows, macos, sdist] 172 | steps: 173 | - name: Download artifact 174 | uses: actions/download-artifact@v4 175 | with: 176 | pattern: wheels-* 177 | merge-multiple: true 178 | path: dist 179 | - uses: actions/setup-python@v5 180 | with: 181 | python-version: 3.13 182 | - run: pip install --upgrade pip twine 183 | - name: Publish to pypi 184 | env: 185 | TWINE_USERNAME: __token__ 186 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 187 | run: twine upload --skip-existing dist/* 188 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.platform.runner }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | platform: 18 | - runner: ubuntu-latest 19 | target: x86_64-unknown-linux-gnu 20 | suffix: so 21 | # - runner: ubuntu-latest 22 | # target: aarch64-unknown-linux-gnu 23 | # suffix: so 24 | - runner: macos-15 25 | target: aarch64-apple-darwin 26 | suffix: dylib 27 | - runner: windows-latest 28 | target: x86_64-pc-windows-gnu 29 | suffix: dll 30 | - runner: windows-latest 31 | target: x86_64-pc-windows-msvc 32 | suffix: dll 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Install Rust 36 
| uses: dtolnay/rust-toolchain@master 37 | with: 38 | toolchain: nightly 39 | targets: ${{ matrix.platform.target }} 40 | - name: Install dependencies 41 | if: matrix.platform.runner == 'ubuntu-latest' && matrix.platform.target == 'aarch64-unknown-linux-gnu' 42 | run: sudo apt-get install gcc-aarch64-linux-gnu 43 | - name: Build 44 | run: cargo build --release --target ${{ matrix.platform.target }} 45 | - name: List files 46 | run: ls ./target/${{ matrix.platform.target }}/release/ 47 | - name: Rename 48 | shell: bash 49 | run: | 50 | mkdir libmatcher 51 | mv ./target/${{ matrix.platform.target }}/release/*matcher_c.${{ matrix.platform.suffix }} libmatcher/${{ matrix.platform.target }}-libmatcher_c.${{ matrix.platform.suffix }} 52 | mv ./target/${{ matrix.platform.target }}/release/*matcher_py.${{ matrix.platform.suffix }} libmatcher/${{ matrix.platform.target }}-libmatcher_py.${{ matrix.platform.suffix }} 53 | - name: Upload release 54 | uses: actions/upload-artifact@v4 55 | with: 56 | name: libmatcher-${{ matrix.platform.target }} 57 | path: libmatcher 58 | 59 | release: 60 | name: Release 61 | runs-on: ubuntu-latest 62 | needs: [build] 63 | steps: 64 | - name: Download artifact 65 | uses: actions/download-artifact@v4 66 | with: 67 | pattern: libmatcher-* 68 | merge-multiple: true 69 | path: artifact 70 | - name: Make release 71 | uses: softprops/action-gh-release@v2 72 | with: 73 | draft: true 74 | prerelease: false 75 | generate_release_notes: true 76 | files: artifact/* 77 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: rust 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | 11 | env: 12 | CARGO_TERM_COLOR: always 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Install rust 20 | uses: dtolnay/rust-toolchain@master 21 | with: 22 | toolchain: nightly 23 | - name: Build 24 | run: cargo build --release --verbose 25 | - name: Test 26 | run: cargo test -p matcher_rs --verbose --no-default-features 27 | - name: Test dfa 28 | run: cargo test -p matcher_rs --verbose --no-default-features --features "dfa" 29 | - name: Test runtime_build and dfa 30 | run: cargo test -p matcher_rs --verbose --no-default-features --features "runtime_build,dfa" 31 | - name: Test serde and dfa 32 | run: cargo test -p matcher_rs --verbose --no-default-features --features "serde,dfa" 33 | - name: Run doc 34 | run: cargo doc 35 | - name: Release 36 | env: 37 | CARGO_REGISTRY_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} 38 | run: | 39 | cargo publish -p matcher_rs 40 | cargo publish -p matcher_py 41 | cargo publish -p matcher_c 42 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'matcher_py/src/**' 7 | - 'matcher_rs/src/**' 8 | - 'matcher_c/src/**' 9 | - '.github/workflows/test.yml' 10 | - '.cargo/config.toml' 11 | pull_request: 12 | paths: 13 | - 'matcher_py/src/**' 14 | - 'matcher_rs/src/**' 15 | - 'matcher_c/src/**' 16 | - '.github/workflows/test.yml' 17 | - '.cargo/config.toml' 18 | 19 | permissions: 20 | contents: read 21 | 22 | jobs: 23 | build: 24 | runs-on: ${{ matrix.platform.runner }} 25 | strategy: 26 | fail-fast: false 27 | matrix: 28 | platform: 29 | - runner: 
ubuntu-latest 30 | target: x86_64-unknown-linux-gnu 31 | suffix: so 32 | # - runner: ubuntu-latest 33 | # target: aarch64-unknown-linux-gnu 34 | - runner: macos-15 35 | target: aarch64-apple-darwin 36 | suffix: dylib 37 | - runner: windows-latest 38 | target: x86_64-pc-windows-gnu 39 | suffix: dll 40 | - runner: windows-latest 41 | target: x86_64-pc-windows-msvc 42 | suffix: dll 43 | steps: 44 | - uses: actions/checkout@v4 45 | - name: Install dependencies 46 | if: matrix.platform.runner == 'ubuntu-latest' && matrix.platform.target == 'aarch64-unknown-linux-gnu' 47 | run: sudo apt-get install gcc-aarch64-linux-gnu 48 | - name: Install Rust 49 | uses: dtolnay/rust-toolchain@master 50 | with: 51 | toolchain: nightly 52 | target: ${{ matrix.platform.target }} 53 | - name: Build 54 | run: cargo build --release --target ${{ matrix.platform.target }} 55 | - name: Test 56 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features 57 | - name: Test dfa 58 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features --features "dfa" 59 | - name: Test runtime_build and dfa 60 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features --features "runtime_build,dfa" 61 | - name: Test serde and dfa 62 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features --features "serde,dfa" 63 | - name: Run doc 64 | run: cargo doc 65 | - name: Rename & move 66 | shell: bash 67 | run: | 68 | cp ./target/${{ matrix.platform.target }}/release/*matcher_c.${{ matrix.platform.suffix }} matcher_c/matcher_c.so 69 | cp ./target/${{ matrix.platform.target }}/release/*matcher_py.${{ matrix.platform.suffix }} matcher_py/python/matcher_py/matcher_py.so 70 | - name: Install Python 71 | uses: actions/setup-python@v5 72 | with: 73 | python-version: 3.13 74 | - name: Build wheels 75 | uses: PyO3/maturin-action@v1 76 | env: 77 | RUSTFLAGS: "-Z threads=2 -D warnings" 78 | with: 79 | target: ${{ matrix.platform.target }} 80 | args: --release -i python3.13 81 | sccache: 'true' 82 | rust-toolchain: nightly 83 | working-directory: matcher_py 84 | - name: Python Test 85 | shell: bash 86 | if: matrix.platform.runner == 'ubuntu-latest' 87 | run: | 88 | pip install -U pytest typing_extensions 89 | pip install ./target/wheels/*.whl 90 | pytest matcher_py/test 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # custom files 142 | **/.DS_Store 143 | .idea 144 | .metals 145 | .vscode 146 | *.zip 147 | 148 | # Added by cargo 149 | /target 150 | 151 | .ruff_cache 152 | test.ipynb 153 | profile.json -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.5.7 - 2025-03-17 4 | 5 | ### Flexibility 6 | - Update dependencies. 7 | 8 | ## 0.5.6 - 2024-11-18 9 | 10 | ### Performance 11 | - Fix `build_process_type_tree` function, use set instead of list. 12 | - Update several dependencies. 13 | 14 | ## 0.5.5 - 2024-10-14 15 | 16 | ### Bug fixes 17 | - Change `XXX(Enum)` to `XXX(str, Enum)` in extension_types.py to fix json dumps issue. 18 | 19 | ### Flexibility 20 | - Add Python 3.13 support. 21 | - Remove msgspec, only use json in README.md. 22 | 23 | ## 0.5.4 - 2024-08-23 24 | 25 | ### Readability 26 | - Fix typo and cargo clippy warnings. 27 | - Add single line benchmark. 28 | 29 | ## 0.5.3 - 2024-07-26 30 | 31 | ### Bug fixes 32 | - Fix simple matcher is_match function. 33 | 34 | ## 0.5.2 - 2024-07-22 35 | 36 | ### Flexibility 37 | - Remove msgpack, now non-rust users should use json to serialize input of Matcher and SimpleMatcher. 38 | - Refactor Java code. 
39 | 40 | ## 0.5.1 - 2024-07-19 41 | 42 | ### Performance 43 | - Use FxHash to speed up the simple matcher process. 44 | 45 | ### Flexibility 46 | - Remove unnecessary dependencies. 47 | 48 | ## 0.5.0 - 2024-07-18 49 | 50 | ### Changed 51 | - Numerous changes across the codebase; see the commit history for details. 52 | 53 | ## 0.4.6 - 2024-07-15 54 | 55 | ### Performance 56 | - Optimize performance. 57 | 58 | ## 0.4.5 - 2024-07-12 59 | 60 | ### Changed 61 | - Optimize the Simple Matcher `process` function when multiple `simple_match_type`s are used. 62 | - Add `dfa` feature to matcher_rs. 63 | - Shrink `FANJIAN` conversion map. 64 | 65 | ## 0.4.4 - 2024-07-09 66 | 67 | ### Changed 68 | - Merge PINYIN and PINYINCHAR process matcher build. 69 | - Add `process` function to matcher_py/c/java. 70 | - Fix simple matcher process function issue. 71 | - Refactor matcher_py file structure and use `rye` to manage matcher_py. 72 | - Delete `println!` in matcher_c. 73 | 74 | ## 0.4.3 - 2024-07-08 75 | 76 | ### Changed 77 | - Fix exemption word list wrongly rejecting the entire match instead of a single table. 78 | - Add match_id to MatchResult. 79 | - Reverse DFA structure to AhoCorasick structure. 80 | - matcher_c now uses `from_utf8_unchecked` instead of `from_utf8`. 81 | - Build multiple wheels for different Python versions. 82 | - Update FANJIAN.txt and NORM.txt. 83 | - Fix issues with `runtime_build` feature. 84 | 85 | ## 0.4.2 - 2024-07-07 86 | 87 | ### Changed 88 | - Optimize performance. 89 | 90 | ## 0.4.1 - 2024-07-06 91 | 92 | ### Changed 93 | - Rebuild transformation rules based on the Unicode Standard. 94 | 95 | ## 0.4.0 - 2024-07-03 96 | 97 | ### Changed 98 | - Implement word-wise NOT logic inside SimpleMatcher; you can now use the `&` (and) and `~` (not) separators to configure simple words, e.g. `hello&world~helo`. 99 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = ["matcher_rs", "matcher_py", "matcher_c"] 4 | 5 | [workspace.package] 6 | authors = ["Foster Guo "] 7 | categories = ["text-processing"] 8 | description = "A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust." 9 | edition = "2021" 10 | homepage = "https://github.com/Lips7/Matcher" 11 | readme = "README.md" 12 | keywords = ["text", "string", "search", "pattern", "multi"] 13 | license = "Apache-2.0 OR MIT" 14 | repository = "https://github.com/Lips7/Matcher" 15 | version = "0.5.7" 16 | 17 | [profile.release] 18 | strip = true 19 | opt-level = 3 20 | lto = true 21 | codegen-units = 1 22 | incremental = false 23 | debug = false 24 | debug-assertions = false 25 | overflow-checks = false 26 | 27 | [profile.bench] 28 | strip = "none" 29 | opt-level = 3 30 | lto = true 31 | codegen-units = 1 32 | incremental = false 33 | debug = true 34 | debug-assertions = false 35 | overflow-checks = false 36 | -------------------------------------------------------------------------------- /DESIGN.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Transformation 4 | 5 | * `FANJIAN`: built from [Unihan_Variants.txt](./data/str_conv/Unihan_Variants.txt) and [EquivalentUnifiedIdeograph.txt](./data/str_conv/EquivalentUnifiedIdeograph.txt). 6 | * `NUM-NORM`: built from [DerivedNumericValues.txt](./data/str_conv/DerivedNumericValues.txt).
7 | * `TEXT-DELETE` and `SYMBOL-NORM`: built from [DerivedGeneralCategory.txt](./data/str_conv/DerivedGeneralCategory.txt). 8 | * `WHITE-SPACE`: built from [PropList.txt](./data/str_conv/PropList.txt). 9 | * `PINYIN` and `PINYIN-CHAR`: built from [Unihan_Readings.txt](./data/str_conv/Unihan_Readings.txt). 10 | * `NORM`: built from [NormalizationTest.txt](./data/str_conv/NormalizationTest.txt). 11 | 12 | ## Matcher 13 | 14 | ### Overview 15 | 16 | The `Matcher` is a powerful and complex system designed to identify sentence matches using multiple methods. Despite its complexity, it offers significant flexibility and power when used correctly. The main components of the `Matcher` are `MatchID` and `TableID`. 17 | 18 | ### Key Concepts 19 | 20 | 1. **MatchID**: Represents a unique identifier for a match. 21 | 2. **TableID**: Represents a unique identifier for a table within a match. 22 | 23 | ### Structure 24 | 25 | The `Matcher` utilizes a JSON structure to define matches and tables. Below is an example of its configuration: 26 | 27 | ```json 28 | { 29 | "777": [ 30 | { 31 | "table_id": 45, 32 | "match_table_type": {"process_type": "MatchNone"}, 33 | "word_list": ["hello", "world"], 34 | "exemption_process_type": "MatchNone", 35 | "exemption_word_list": [] 36 | } 37 | // other tables 38 | ] 39 | // other matches 40 | } 41 | ``` 42 | 43 | - `777`: This is the `MatchID`. 44 | - `45`: This is the `TableID`. 45 | 46 | #### Table 47 | 48 | Each `Table` represents a collection of words related to a specific topic (e.g., politics, music, math). The table also includes a list of exemption words to exclude certain sentences. The logical operations within a table are as follows: 49 | 50 | - **OR Logic (within `word_list`)**: The table matches if any word in the `word_list` is matched. 51 | - **NOT Logic (between `word_list` and `exemption_word_list`)**: If any word in the `exemption_word_list` is matched, the table will not be considered matched. 52 | 53 | #### Match 54 | 55 | A `Match` consists of multiple tables. Each match can specify a list of tables to perform the matching. This allows users to experiment with different combinations of tables to find the best configuration for their use case. The logical operation between matches is: 56 | 57 | - **OR Logic (between matches)**: Every match that contains at least one matched table is reported. 58 | 59 | ### Usage Cases 60 | 61 | #### Table1 AND Table2 match 62 | ```json 63 | Input: 64 | { 65 | "1": [ 66 | { 67 | "table_id": 1, 68 | "match_table_type": {"process_type": "MatchNone"}, 69 | "word_list": ["hello", "world"], 70 | "exemption_process_type": "MatchNone", 71 | "exemption_word_list": [] 72 | } 73 | ], 74 | "2": [ 75 | { 76 | "table_id": 2, 77 | "match_table_type": {"process_type": "MatchNone"}, 78 | "word_list": ["你", "好"], 79 | "exemption_process_type": "MatchNone", 80 | "exemption_word_list": [] 81 | } 82 | ] 83 | } 84 | 85 | Output: Check if `match_id` 1 and 2 are both matched. 86 | ```
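Since only OR logic between matches is built in, the AND check above is performed by the caller. Here is a minimal sketch in Python, assuming the `matcher_py` binding exposes a `Matcher` constructed from JSON bytes with a `word_match` method that mirrors the C bindings in [matcher_c/README.md](./matcher_c/README.md); the exact API is documented in [matcher_py/README.md](./matcher_py/README.md).

```python
import json

from matcher_py import Matcher  # assumed Python binding; see matcher_py/README.md

# The same configuration as the Input above: two MatchIDs, one table each.
matcher = Matcher(
    json.dumps({
        "1": [{
            "table_id": 1,
            "match_table_type": {"process_type": "MatchNone"},
            "word_list": ["hello", "world"],
            "exemption_process_type": "MatchNone",
            "exemption_word_list": [],
        }],
        "2": [{
            "table_id": 2,
            "match_table_type": {"process_type": "MatchNone"},
            "word_list": ["你", "好"],
            "exemption_process_type": "MatchNone",
            "exemption_word_list": [],
        }],
    }).encode()
)

# `word_match` reports every matched MatchID (the built-in OR logic);
# AND across MatchIDs is then checked by the caller. MatchID keys are
# assumed to come back as strings, as in the JSON output of the C bindings.
result = matcher.word_match("hello你好")
both_matched = all(match_id in result for match_id in ("1", "2"))
```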
87 | 88 | #### Table1 OR Table2 match 89 | ```json 90 | Input: 91 | { 92 | "1": [ 93 | { 94 | "table_id": 1, 95 | "match_table_type": {"process_type": "MatchNone"}, 96 | "word_list": ["hello", "world"], 97 | "exemption_process_type": "MatchNone", 98 | "exemption_word_list": [] 99 | }, 100 | { 101 | "table_id": 2, 102 | "match_table_type": {"process_type": "MatchNone"}, 103 | "word_list": ["你", "好"], 104 | "exemption_process_type": "MatchNone", 105 | "exemption_word_list": [] 106 | } 107 | ] 108 | } 109 | 110 | Output: Check if `match_id` 1 or 2 is matched. 111 | ``` 112 | 113 | #### Table1 NOT Table2 match 114 | ```json 115 | Input: 116 | { 117 | "1": [ 118 | { 119 | "table_id": 1, 120 | "match_table_type": {"process_type": "MatchNone"}, 121 | "word_list": ["hello", "world"], 122 | "exemption_process_type": "MatchNone", 123 | "exemption_word_list": [] 124 | } 125 | ], 126 | "2": [ 127 | { 128 | "table_id": 2, 129 | "match_table_type": {"process_type": "MatchNone"}, 130 | "word_list": ["你", "好"], 131 | "exemption_process_type": "MatchNone", 132 | "exemption_word_list": [] 133 | } 134 | ] 135 | } 136 | 137 | Output: Check if `match_id` 1 is matched and 2 is not matched. 138 | ``` 139 | 140 | ## SimpleMatcher 141 | 142 | ### Overview 143 | 144 | The `SimpleMatcher` is the core component, designed to be fast, efficient, and easy to use. It handles large amounts of data and identifies words based on predefined types. 145 | 146 | ### Key Concepts 147 | 148 | 1. **WordID**: Represents a unique identifier for a word in the `SimpleMatcher`. 149 | 150 | ### Structure 151 | 152 | The `SimpleMatcher` uses a mapping structure to define words and their IDs based on different match types. Below is an example configuration: 153 | 154 | ```json 155 | { 156 | "ProcessType.None": { 157 | "1": "hello&world", 158 | "2": "你好" 159 | // other words 160 | } 161 | // other simple match type word maps 162 | } 163 | ``` 164 | 165 | - `1` and `2`: These are `WordID`s used to identify words in the `SimpleMatcher`. 166 | 167 | ### Real-world Application 168 | 169 | In real-world scenarios, `word_id` is used to uniquely identify a word in the database, allowing for easy updates to the word and its variants. 170 | 171 | ### Logical Operations 172 | 173 | - **OR Logic (between different `process_type`s and between words in the same `process_type`)**: The `simple_matcher` is considered matched if any word in the map is matched. 174 | - **AND Logic (between words separated by `&` within a `WordID`)**: All words separated by `&` must be matched for the entry to be considered matched. 175 | - **NOT Logic (between words separated by `~` within a `WordID`)**: The first segment must be matched and none of the words following a `~` may appear; e.g. `word1~word2` matches only when `word1` is present and `word2` is absent. 176 | 177 | ### Usage Cases 178 | 179 | #### Word1 AND Word2 match 180 | ```json 181 | Input: 182 | { 183 | "ProcessType.None": { 184 | "1": "word1&word2" 185 | } 186 | } 187 | 188 | Output: Check if `word_id` 1 is matched. 189 | ``` 190 | 191 | #### Word1 OR Word2 match 192 | ```json 193 | Input: 194 | { 195 | "ProcessType.None": { 196 | "1": "word1", 197 | "2": "word2" 198 | } 199 | } 200 | 201 | Output: Check if `word_id` 1 or 2 is matched. 202 | ``` 203 | 204 | #### Word1 NOT Word2 match 205 | ```json 206 | Input: 207 | { 208 | "ProcessType.None": { 209 | "1": "word1~word2" 210 | } 211 | } 212 | 213 | Output: Check if `word_id` 1 is matched. 214 | ```
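The word-level logic above can be exercised directly through the bindings. Below is a minimal sketch, assuming `matcher_py` exposes a `SimpleMatcher` built from JSON bytes whose top-level keys are the numeric `ProcessType` flag values (as in the C example in [matcher_c/README.md](./matcher_c/README.md)); `ProcessType.MatchNone` has the flag value 1.

```python
import json

from matcher_py import SimpleMatcher  # assumed Python binding; see matcher_py/README.md

# WordID 1 requires word1 AND word2 to appear, and word3 to be absent.
# The outer key 1 is ProcessType.MatchNone (0b00000001).
simple_matcher = SimpleMatcher(json.dumps({1: {1: "word1&word2~word3"}}).encode())

print(simple_matcher.is_match("word2 ... word1"))    # True: both & segments appear
print(simple_matcher.is_match("word1 word2 word3"))  # False: the ~word3 segment appears
```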
215 | 216 | ## Summary 217 | 218 | The `Matcher` and `SimpleMatcher` systems are designed to provide a robust and flexible solution for word matching tasks. By understanding the logical operations and structures of `MatchID`, `TableID`, and `WordID`, users can effectively leverage these tools for complex matching requirements. -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2024] [Foster Guo] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | cargo update 3 | cargo build --release 4 | cp ./target/release/libmatcher_c.dylib ./matcher_c/matcher_c.so 5 | cp ./target/release/libmatcher_c.dylib ./matcher_java/src/main/resources/matcher_c.so 6 | 7 | test: 8 | cargo fmt 9 | cargo clippy --all-targets -- -D warnings 10 | cargo doc 11 | 12 | cd matcher_rs && \ 13 | cargo test --no-default-features && \ 14 | cargo test --no-default-features --features "dfa" && \ 15 | cargo test --no-default-features --features "runtime_build" && \ 16 | cargo test --no-default-features --features "runtime_build,dfa" && \ 17 | cargo test --no-default-features --features "dfa,serde" && \ 18 | cd .. 19 | 20 | cd matcher_py && \ 21 | ruff format . && \ 22 | uv sync && \ 23 | pytest && \ 24 | cd ..
25 | 26 | update: 27 | cargo update --verbose --recursive --breaking -Z unstable-options 28 | cargo upgrade --verbose --recursive -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matcher 2 | 3 | ![Rust](https://img.shields.io/badge/rust-%23000000.svg?style=for-the-badge&logo=rust&logoColor=white)![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)![Java](https://img.shields.io/badge/java-%23ED8B00.svg?style=for-the-badge&logo=openjdk&logoColor=white)![C](https://img.shields.io/badge/c-%2300599C.svg?style=for-the-badge&logo=c&logoColor=white) 4 | 5 | ![PyPI - License](https://img.shields.io/pypi/l/matcher_py) 6 | 7 | ![Crates.io Version](https://img.shields.io/crates/v/matcher_rs)![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/lips7/Matcher/test.yml)![docs.rs](https://img.shields.io/docsrs/matcher_rs)![Crates.io Total Downloads](https://img.shields.io/crates/d/matcher_rs) 8 | 9 | ![PyPI - Version](https://img.shields.io/pypi/v/matcher_py)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/matcher_py)![PyPI - Downloads](https://img.shields.io/pypi/dm/matcher_py) 10 | 11 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 12 | 13 | It's helpful for 14 | - **Precision and Recall**: Word matching is a retrieval process; LOGICAL matching improves precision, while TEXT VARIATIONS matching improves recall. 15 | - **Content Filtering**: Detecting and filtering out offensive or sensitive words. 16 | - **Search Engines**: Improving search results by identifying relevant keywords. 17 | - **Text Analysis**: Extracting specific information from large volumes of text. 18 | - **Spam Detection**: Identifying spam content in emails or messages. 19 | - ··· 20 | 21 | ## Features 22 | 23 | For detailed implementation, see the [Design Document](./DESIGN.md). 24 | 25 | - **Multiple Matching Methods**: 26 | - Simple Word Matching 27 | - Regex-Based Matching 28 | - Similarity-Based Matching 29 | - **Text Transformation**: 30 | - **Fanjian**: Convert traditional Chinese characters to simplified ones. 31 | Example: `蟲艸` -> `虫草` 32 | - **Delete**: Remove specific characters. 33 | Example: `*Fu&*iii&^%%*&kkkk` -> `Fuiiikkkk` 34 | - **Normalize**: Normalize special characters to identifiable characters. 35 | Example: `𝜢𝕰𝕃𝙻𝝧 𝙒ⓞᵣℒ𝒟!` -> `hello world!` 36 | - **PinYin**: Convert Chinese characters to Pinyin for fuzzy matching. 37 | Example: `西安` -> ` xi an `, matches `洗按` -> ` xi an `, but not `先` -> ` xian ` 38 | - **PinYinChar**: Convert Chinese characters to Pinyin. 39 | Example: `西安` -> `xian`, matches `洗按` and `先` -> `xian` 40 | - **AND OR NOT Word Matching**: 41 | - Takes into account the number of repetitions of words. 42 | - Example: `hello&world` matches `hello world` and `world,hello` 43 | - Example: `无&法&无&天` matches `无无法天` (because `无` is repeated twice), but not `无法天` 44 | - Example: `hello~helloo~hhello` matches `hello` but not `helloo` and `hhello` 45 | - **Customizable Exemption Lists**: Exclude specific words from matching. 46 | - **Efficient Handling of Large Word Lists**: Optimized for performance. 47 | 48 | ### Rust Users 49 | 50 | See the [Rust README](./matcher_rs/README.md). 51 | 52 | ### Python Users 53 | 54 | See the [Python README](./matcher_py/README.md).
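For a quick taste of the features above, here is a minimal, illustrative snippet. It assumes the `matcher_py` wheel is installed and that `SimpleMatcher` accepts the JSON configuration shown in the C and Java READMEs; the exact API is documented in the [Python README](./matcher_py/README.md).

```python
import json

from matcher_py import SimpleMatcher  # assumed top-level export of the Python binding
from matcher_py.extension_types import ProcessType

# Combine transformations (fanjian + delete + normalize, plus pinyin-char)
# so that "妳好&世界" also matches latinized text such as "nihaoshijie".
simple_matcher = SimpleMatcher(
    json.dumps({
        ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar: {
            1: "妳好&世界",
            2: "hello",
        }
    }).encode()
)

print(simple_matcher.is_match("你好世界"))             # True
print(simple_matcher.process("nihaoshijie!hello!"))  # hits for word_id 1 and 2
```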
55 | 56 | ### C, Java and Other Users 57 | 58 | We provide a dynamic library to link against. See the [C README](./matcher_c/README.md) and [Java README](./matcher_java/README.md). 59 | 60 | #### Build from source 61 | 62 | ```shell 63 | git clone https://github.com/Lips7/Matcher.git 64 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 65 | cargo build --release 66 | ``` 67 | 68 | Then you should find the `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll` in the `target/release` directory. 69 | 70 | #### Pre-built binary 71 | 72 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 73 | 74 | ## Benchmarks 75 | 76 | Please refer to [benchmarks](./matcher_rs/README.md#benchmarks) for details. 77 | 78 | ## Roadmap 79 | 80 | ### Performance 81 | - [x] ~~Cache intermediate results across different ProcessType `reduce_process_text` calls. (failed, too slow)~~ 82 | - [x] Try more aho-corasick libraries to improve performance and reduce memory usage. 83 | - [x] ~~https://github.com/daac-tools/crawdad (produces a char-wise index, not a byte-wise index; not acceptable)~~ 84 | - [x] https://github.com/daac-tools/daachorse (use it when Fanjian, PinYin or PinYinChar transformation is performed) 85 | - [x] ~~Test char-wise HashMap transformation for Chinese Characters. (Too slow)~~ 86 | - [x] Make aho-corasick unsafe. 87 | - [x] See https://github.com/Lips7/aho-corasick. 88 | - [ ] Optimize NOT logic word-wise. 89 | - [x] Optimize `RegexMatcher` using `RegexSet`. 90 | - [x] Optimize `SimpleMatcher` when multiple `ProcessType`s are used. 91 | 1. Consider the case where there are multiple `ProcessType`s: 92 | * None 93 | * Fanjian 94 | * FanjianDelete 95 | * FanjianDeleteNormalize 96 | * FanjianNormalize 97 | 2. We can construct a chain of transformations: 98 | * None -> Fanjian -> Delete -> Normalize 99 | *                  \ -> Normalize. 100 | 3. Calculate all possible transformations and cache the results, so that instead of calculating 8 times (Fanjian, Fanjian + Delete, Fanjian + Delete + Normalize, Fanjian + Normalize), we only need to calculate 4 times (Fanjian, Delete, Normalize, Normalize). 101 | - [x] ~~Optimize process matcher when performing reduce text processing.~~ 102 | 1. Suppose we have to perform FanjianDeleteNormalize: we need to perform Fanjian first, then Delete, then Normalize, so 3 kinds of Process Matcher are needed to perform replacement or deletion, and the text has to be scanned 3 times. 103 | 2. What if we construct only 1 Process Matcher whose patterns contain all 3 kinds of patterns (Fanjian, Delete and Normalize)? We could scan the text only once to get all the positions where replacement or deletion should be performed. 104 | 3. Byte indices change after replacement or deletion, so we need to take the offset changes into account. 105 | - [x] Merge multiple aho-corasick matchers into one when multiple `ProcessType`s are used. 106 | - [x] When the `dfa` feature is disabled, use daachorse to perform text processing. 107 | - [x] Do not use it for the simple process function; it is too slow to build. 108 | - [ ] Use more regex sets to optimize the regex matcher. 109 | 110 | ### Flexibility 111 | - [x] Cache `get_process_matcher` results globally, instead of caching results inside SimpleMatcher. 112 | - [x] Expose `reduce_process_text` to Python. 113 | - [x] Add a new function that can handle a single simple match type. 114 | - [x] `text_process` is now available.
115 | - [x] Add fuzzy matcher, https://github.com/lotabout/fuzzy-matcher. 116 | - [x] Use `rapidfuzz` instead. 117 | - [x] Make `SimpleMatcher` and `Matcher` serializable. 118 | - [x] Make aho-corasick serializable. 119 | - [x] See https://github.com/Lips7/aho-corasick. 120 | - [x] Implement NOT logic word-wise. 121 | - [x] Support stable Rust. 122 | - [ ] Support iterator. 123 | - [ ] A real Java package. 124 | - [x] Multiple Python version wheel build. 125 | - [ ] Customize str conversion map. 126 | - [x] Add Matcher process function to py, c and java. 127 | - [x] ~~For simple matcher, is it possible to use regex-automata to replace aho-corasick? and support regex. (Keep it simple and efficient)~~ 128 | - [x] Add simple match type to `RegexMatcher` and `SimMatcher` to pre-process a text. 129 | - [x] Try to replace msgpack. 130 | 131 | ### Readability 132 | - [x] More precise and convenient MatchTable. 133 | - [x] More detailed and rigorous benchmarks. 134 | - [x] More detailed and rigorous tests. 135 | - [x] More detailed simple match type explanation. 136 | - [ ] More detailed [DESIGN](./DESIGN.md). 137 | - [x] Write a Chinese README. -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # Matcher 2 | 3 | ![Rust](https://img.shields.io/badge/rust-%23000000.svg?style=for-the-badge&logo=rust&logoColor=white)![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)![Java](https://img.shields.io/badge/java-%23ED8B00.svg?style=for-the-badge&logo=openjdk&logoColor=white)![C](https://img.shields.io/badge/c-%2300599C.svg?style=for-the-badge&logo=c&logoColor=white) 4 | 5 | ![PyPI - License](https://img.shields.io/pypi/l/matcher_py) 6 | 7 | ![Crates.io Version](https://img.shields.io/crates/v/matcher_rs)![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/lips7/Matcher/test.yml)![docs.rs](https://img.shields.io/docsrs/matcher_rs)![Crates.io Total Downloads](https://img.shields.io/crates/d/matcher_rs) 8 | 9 | ![PyPI - Version](https://img.shields.io/pypi/v/matcher_py)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/matcher_py)![PyPI - Downloads](https://img.shields.io/pypi/dm/matcher_py) 10 | 11 | 一个高性能文本匹配器,旨在解决**逻辑**和**文本变体**的词匹配问题,以Rust实现。 12 | 13 | 它对以下方面非常有帮助: 14 | - **精确率与召回率**:文本匹配是一个召回过程,逻辑匹配提高精确率,文本变体匹配提高召回率。 15 | - **内容过滤**:检测和过滤攻击性或敏感词语。 16 | - **搜索引擎**:通过识别相关关键词来改进搜索结果。 17 | - **文本分析**:从大量文本中提取特定信息。 18 | - **垃圾邮件检测**:识别电子邮件或消息中的垃圾内容。 19 | - ··· 20 | 21 | ## 特性 22 | 23 | 有关详细的实现,请参见[Design Document](./DESIGN.md)。 24 | 25 | - **多种匹配方法**: 26 | - 简单词匹配 27 | - 基于正则表达式的匹配 28 | - 基于相似度的匹配 29 | - **文本转换**: 30 | - **繁简转换**:将繁体字转换为简体字。例如:`蟲艸` -> `虫草` 31 | - **删除特定字符**:移除特定字符。例如:`*Fu&*iii&^%%*&kkkk` -> `Fuiiikkkk` 32 | - **规范化**:将特殊字符规范化为可识别字符。例如:`𝜢𝕰𝕃𝙻𝝧 𝙒ⓞᵣℒ𝒟!` -> `hello world!` 33 | - **拼音转换**:将汉字转换为拼音以进行模糊匹配。例如:`西安` -> ` xi an `, 匹配 `洗按` -> ` xi an `, 但不匹配 `先` -> ` xian ` 34 | - **拼音字符转换**:将汉字转换为拼音。例如:`西安` -> `xian`, 匹配 `洗按` 和 `先` -> `xian` 35 | - **与或非词匹配**: 36 | - 考虑单词的重复次数。 37 | - 例如:`hello&world` 匹配 `hello world` 和 `world,hello` 38 | - 例如:`无&法&无&天` 匹配 `无无法天`(因为 `无` 重复两次),但不匹配 `无法天` 39 | - 例如:`hello~helloo~hhello` 匹配 `hello` 但不匹配 `helloo` 和 `hhello` 40 | - **可定制的豁免列表**:排除特定单词的匹配。 41 | - **高效处理大型词列表**:针对性能进行了优化。 42 | 43 | ### Rust 用户 44 | 45 | 请参阅 [Rust README](./matcher_rs/README.md)。 46 | 47 | ### Python 用户 48 | 49 | 请参阅 [Python README](./matcher_py/README.md)。 50 | 51 | ### 
C, Java 和其他用户 52 | 53 | 我们提供动态链接库,请参阅 [C README](./matcher_c/README.md) 和 [Java README](./matcher_java/README.md)。 54 | 55 | #### 或从源构建 56 | 57 | ```shell 58 | git clone https://github.com/Lips7/Matcher.git 59 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 60 | cargo build --release 61 | ``` 62 | 63 | 在 `target/release` 文件夹底下找到 `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll`。 64 | 65 | #### 预构建的包 66 | 67 | 访问 [release page](https://github.com/Lips7/Matcher/releases) 来下载预构建的动态链接库. 68 | 69 | ## 性能测试 70 | 71 | 请参阅 [benchmarks](./matcher_rs/README.md#benchmarks) 查看更多细节。 -------------------------------------------------------------------------------- /data/word_list/cn/cn_words_100.txt: -------------------------------------------------------------------------------- 1 | 的 2 | 了 3 | 在 4 | 和 5 | 是 6 | 有 7 | 个 8 | 上 9 | 中 10 | 为 11 | 年 12 | 这 13 | 他 14 | 日 15 | 对 16 | 也 17 | 要 18 | 我 19 | 地 20 | 到 21 | 说 22 | 我们 23 | 就 24 | 人 25 | 不 26 | 等 27 | 工作 28 | 月 29 | 将 30 | 与 31 | 着 32 | 他们 33 | 以 34 | 人民 35 | 都 36 | 发展 37 | 大 38 | 把 39 | 后 40 | 从 41 | 来 42 | 还 43 | 两 44 | 元 45 | 而 46 | 进行 47 | 时 48 | 生产 49 | 新 50 | 中国 51 | 下 52 | 并 53 | 又 54 | 国家 55 | 问题 56 | 会 57 | 已 58 | 建设 59 | 好 60 | 向 61 | 被 62 | 企业 63 | 经济 64 | 但 65 | 出 66 | 自己 67 | 群众 68 | 使 69 | 市 70 | 没有 71 | 革命 72 | 里 73 | 做 74 | 用 75 | 领导 76 | 政府 77 | 名 78 | 她 79 | 这个 80 | 给 81 | 由 82 | 省 83 | 得 84 | 所 85 | 各 86 | 美国 87 | 前 88 | 次 89 | 该 90 | 于 91 | 今年 92 | 去 93 | 本 94 | 党 95 | 之 96 | 组织 97 | 据 98 | 提高 99 | 家 100 | 干部 101 | -------------------------------------------------------------------------------- /data/word_list/en/en_words_100.txt: -------------------------------------------------------------------------------- 1 | stampeding 2 | commendable 3 | adrenaline 4 | exobiology 5 | indifference 6 | avuncular 7 | prevailed 8 | foreparts 9 | legalistically 10 | intermarries 11 | desideratum 12 | evaluating 13 | lavishing 14 | attractable 15 | philippics 16 | antiabortionist 17 | lascivious 18 | breathable 19 | histogram 20 | rattlings 21 | interdict 22 | summarized 23 | relieving 24 | congresspeople 25 | fitfulness 26 | percolation 27 | upperclasswoman 28 | epistemic 29 | Chantilly 30 | stonemasons 31 | nonferrous 32 | emulsions 33 | charitably 34 | barracudas 35 | integrity 36 | knockdowns 37 | roadworks 38 | abortionists 39 | Salvadoran 40 | chanceries 41 | misinform 42 | caretaker 43 | extricated 44 | mandolins 45 | steeliest 46 | transpiration 47 | weirdness 48 | audiologists 49 | baronetcies 50 | performing 51 | publishing 52 | suspending 53 | dermatological 54 | contemplate 55 | spiritless 56 | nightwatchman 57 | paradisaical 58 | implicating 59 | timpanists 60 | Leavenworth 61 | amorality 62 | strangulated 63 | cellophane 64 | waterboard 65 | astrophysicists 66 | aerospace 67 | passphrase 68 | engendered 69 | spotlighting 70 | misapplication 71 | barterers 72 | poetesses 73 | dollhouse 74 | laparoscopic 75 | Dubrovnik 76 | rerecords 77 | shielding 78 | orthographically 79 | thicknesses 80 | Bendictus 81 | congealed 82 | cooperative 83 | encompass 84 | grouching 85 | shipowners 86 | jealously 87 | generational 88 | antecedents 89 | persecutes 90 | exemplified 91 | admirable 92 | squeakiest 93 | absconding 94 | extirpated 95 | exoskeletons 96 | earthworms 97 | chaotically 98 | shipbuilder 99 | equidistantly 100 | overprint -------------------------------------------------------------------------------- /matcher_c/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "matcher_c" 3 | authors.workspace = true 4 | categories.workspace = true 5 | description.workspace = true 6 | edition.workspace = true 7 | homepage.workspace = true 8 | keywords.workspace = true 9 | license.workspace = true 10 | repository.workspace = true 11 | version.workspace = true 12 | readme = "README.md" 13 | documentation = "https://docs.rs/crate/matcher_c/latest" 14 | 15 | [lib] 16 | name = "matcher_c" 17 | crate-type = ["cdylib", "rlib"] 18 | 19 | [dependencies] 20 | matcher_rs = { path = "../matcher_rs", version = "0.5.7" } 21 | sonic-rs = "0.5.1" 22 | -------------------------------------------------------------------------------- /matcher_c/README.md: -------------------------------------------------------------------------------- 1 | # Matcher Rust Implementation with C FFI Bindings 2 | 3 | ## Overview 4 | 5 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 6 | 7 | ## Installation 8 | 9 | ### Build from source 10 | 11 | ```shell 12 | git clone https://github.com/Lips7/Matcher.git 13 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 14 | cargo build --release 15 | ``` 16 | 17 | Then you should find `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll` in the `target/release` directory. 18 | 19 | ### Install pre-built binary 20 | 21 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 22 | 23 | ## Python usage example 24 | 25 | ```Python 26 | import json 27 | 28 | from cffi import FFI 29 | 30 | from extension_types import MatchTableType, ProcessType, MatchTable 31 | 32 | # define ffi 33 | ffi = FFI() 34 | ffi.cdef(open("./matcher_c.h", "r", encoding="utf-8").read()) 35 | lib = ffi.dlopen("./matcher_c.so") 36 | 37 | # init matcher 38 | matcher = lib.init_matcher( 39 | json.dumps({ 40 | 1: [ 41 | MatchTable( 42 | table_id=1, 43 | match_table_type=MatchTableType.Simple( 44 | process_type=ProcessType.MatchNone 45 | ), 46 | word_list=["hello,world", "hello", "world"], 47 | exemption_process_type=ProcessType.MatchNone, 48 | exemption_word_list=[], 49 | ) 50 | ] 51 | }).encode() 52 | ) 53 | 54 | # check for a match 55 | lib.matcher_is_match(matcher, "hello".encode("utf-8")) # True 56 | 57 | # match as list 58 | res = lib.matcher_process_as_string(matcher, "hello,world".encode("utf-8")) 59 | print(ffi.string(res).decode("utf-8")) 60 | # [{"match_id":1,"table_id":1,"word_id":0,"word":"hello,world","similarity":1.0},{"match_id":1,"table_id":1,"word_id":1,"word":"hello","similarity":1.0},{"match_id":1,"table_id":1,"word_id":2,"word":"world","similarity":1.0}] 61 | lib.drop_string(res) 62 | 63 | # match as dict 64 | res = lib.matcher_word_match_as_string(matcher, "hello,world".encode("utf-8")) 65 | print(ffi.string(res).decode("utf-8")) 66 | # {"1":[{"match_id":1,"table_id":1,"word_id":0,"word":"hello,world","similarity":1.0},{"match_id":1,"table_id":1,"word_id":1,"word":"hello","similarity":1.0},{"match_id":1,"table_id":1,"word_id":2,"word":"world","similarity":1.0}]} 67 | lib.drop_string(res) 68 | 69 | # drop matcher 70 | lib.drop_matcher(matcher) 71 | 72 | # init simple matcher 73 | simple_matcher = lib.init_simple_matcher( 74 | json.dumps({ 75 | ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar: { 76 | 1: "妳好&世界", 77 | 2: "hello", 78 | } 79 | }).encode() 80 | ) 81 | 82 | # check for a 
match 83 | lib.simple_matcher_is_match(simple_matcher, "你好世界".encode("utf-8")) # True 84 | 85 | # match as list 86 | res = lib.simple_matcher_process_as_string( 87 | simple_matcher, "nihaoshijie!hello!world!".encode("utf-8") 88 | ) 89 | print(ffi.string(res).decode("utf-8")) 90 | # [{"word_id":1,"word":"妳好&世界"},{"word_id":2,"word":"hello"}] 91 | lib.drop_string(res) 92 | 93 | # drop simple matcher 94 | lib.drop_simple_matcher(simple_matcher) 95 | ``` 96 | 97 | ## Important Notes 98 | 99 | 1. [extension_types.py](./extension_types.py) is not required; you can use the dynamic library directly. 100 | 2. Always call `drop_matcher`, `drop_simple_matcher`, and `drop_string` after initializing and processing to avoid memory leaks. -------------------------------------------------------------------------------- /matcher_c/extension_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, IntFlag 2 | from typing import Dict, List, TypedDict, Union 3 | 4 | 5 | class ProcessType(IntFlag): 6 | """ 7 | An enumeration representing various types of text processing operations. 8 | 9 | Attributes: 10 | MatchNone (IntFlag): No text transformation (binary 00000001). 11 | MatchFanjian (IntFlag): Convert traditional Chinese characters to simplified ones (binary 00000010). 12 | MatchDelete (IntFlag): Delete punctuation, special characters and white spaces (binary 00000100). 13 | MatchNormalize (IntFlag): Normalize English character and number variations to basic characters (binary 00001000). 14 | MatchDeleteNormalize (IntFlag): A combined operation that deletes and normalizes characters (binary 00001100). 15 | MatchFanjianDeleteNormalize (IntFlag): A combined operation that converts traditional Chinese characters to simplified ones, 16 | then deletes and normalizes (binary 00001110). 17 | MatchPinYin (IntFlag): Convert Chinese characters to Pinyin with word boundaries (binary 00010000). 18 | MatchPinYinChar (IntFlag): Convert Chinese characters to Pinyin without word boundaries (binary 00100000). 19 | """ 20 | 21 | MatchNone = 0b00000001 22 | MatchFanjian = 0b00000010 23 | MatchDelete = 0b00000100 24 | MatchNormalize = 0b00001000 25 | MatchDeleteNormalize = 0b00001100 26 | MatchFanjianDeleteNormalize = 0b00001110 27 | MatchPinYin = 0b00010000 28 | MatchPinYinChar = 0b00100000 29 | 30 | 31 | class RegexMatchType(str, Enum): 32 | """ 33 | An enumeration representing various types of regex matching operations. 34 | 35 | Attributes: 36 | MatchSimilarChar (str): An operation that matches characters that are similar in some way. 37 | MatchAcrostic (str): An operation that matches acrostic patterns. 38 | MatchRegex (str): An operation that matches using standard regular expressions. 39 | """ 40 | 41 | MatchSimilarChar = "similar_char" 42 | MatchAcrostic = "acrostic" 43 | MatchRegex = "regex" 44 | 45 | 46 | class SimMatchType(str, Enum): 47 | """ 48 | An enumeration representing various types of similarity matching operations. 49 | 50 | Attributes: 51 | MatchLevenshtein (str): An operation that matches using the Levenshtein distance metric. 52 | """ 53 | 54 | MatchLevenshtein = "levenshtein" 55 | 56 | 57 | class Simple(TypedDict): 58 | """ 59 | A TypedDict representing a simple text processing operation. 60 | 61 | Attributes: 62 | process_type (ProcessType): The type of processing operation to be performed. 
63 | """ 64 | 65 | process_type: ProcessType 66 | 67 | 68 | class Regex(TypedDict): 69 | """ 70 | A TypedDict representing a regex-based text processing operation. 71 | 72 | Attributes: 73 | process_type (ProcessType): The type of processing operation to be performed. 74 | regex_match_type (RegexMatchType): The type of regex matching operation to be used. 75 | """ 76 | 77 | process_type: ProcessType 78 | regex_match_type: RegexMatchType 79 | 80 | 81 | class Similar(TypedDict): 82 | """ 83 | A TypedDict representing a similarity-based text processing operation. 84 | 85 | Attributes: 86 | process_type (ProcessType): The type of processing operation to be performed. 87 | sim_match_type (SimMatchType): The type of similarity matching operation to be used. 88 | threshold (float): The threshold value for the similarity matching operation. 89 | """ 90 | 91 | process_type: ProcessType 92 | sim_match_type: SimMatchType 93 | threshold: float 94 | 95 | 96 | class MatchTableType: 97 | def Simple(process_type: ProcessType) -> Dict[str, Simple]: 98 | """ 99 | Create a dictionary representing a simple text processing operation. 100 | 101 | Args: 102 | process_type (ProcessType): The type of processing operation to be performed. 103 | 104 | Returns: 105 | Dict[str, Simple]: A dictionary with one key "simple" mapping to a Simple TypedDict 106 | containing the provided process_type. 107 | """ 108 | return {"simple": Simple(process_type=process_type)} 109 | 110 | def Regex( 111 | process_type: ProcessType, regex_match_type: RegexMatchType 112 | ) -> Dict[str, Regex]: 113 | """ 114 | Create a dictionary representing a regex-based text processing operation. 115 | 116 | Args: 117 | process_type (ProcessType): The type of processing operation to be performed. 118 | regex_match_type (RegexMatchType): The type of regex matching operation to be used. 119 | 120 | Returns: 121 | Dict[str, Regex]: A dictionary with one key "regex" mapping to a Regex TypedDict 122 | containing the provided process_type and regex_match_type. 123 | """ 124 | return { 125 | "regex": Regex(process_type=process_type, regex_match_type=regex_match_type) 126 | } 127 | 128 | def Similar( 129 | process_type: ProcessType, sim_match_type: SimMatchType, threshold: float 130 | ) -> Dict[str, Similar]: 131 | """ 132 | Create a dictionary representing a similarity-based text processing operation. 133 | Args: 134 | process_type (ProcessType): The type of processing operation to be performed. 135 | sim_match_type (SimMatchType): The type of similarity matching operation to be used. 136 | threshold (float): The threshold value for the similarity matching operation. 137 | 138 | Returns: 139 | Dict[str, Similar]: A dictionary with one key "similar" mapping to a Similar TypedDict 140 | containing the provided process_type, sim_match_type, and threshold. 141 | """ 142 | return { 143 | "similar": Similar( 144 | process_type=process_type, 145 | sim_match_type=sim_match_type, 146 | threshold=threshold, 147 | ) 148 | } 149 | 150 | 151 | class MatchTable(TypedDict): 152 | """ 153 | A TypedDict representing a table for matching operations. 154 | 155 | Attributes: 156 | table_id (int): A unique identifier for the match table. 157 | match_table_type (Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]]): 158 | A dictionary that specifies the type of match operation to be performed. The key is a string indicating 159 | the match type ('simple', 'regex', 'similar'), and the value is a corresponding TypedDict describing 160 | the operation. 
161 | word_list (List[str]): A list of words that are subject to the matching operations. 162 | exemption_process_type (ProcessType): The type of process for which certain words are exempt from matching. 163 | exemption_word_list (List[str]): A list of words that are exempt from the matching process. 164 | """ 165 | 166 | table_id: int 167 | match_table_type: Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]] 168 | word_list: List[str] 169 | exemption_process_type: ProcessType 170 | exemption_word_list: List[str] 171 | 172 | 173 | MatchTableMap = Dict[int, List[MatchTable]] 174 | """ 175 | A type alias for mapping table identifiers to lists of MatchTable objects. 176 | 177 | Type: 178 | Dict[int, List[MatchTable]] 179 | 180 | This dictionary maps an integer table ID to a list of MatchTable objects that correspond to the ID. It is used to 181 | organize and retrieve match tables based on their unique identifiers. 182 | """ 183 | 184 | 185 | class MatchResult(TypedDict): 186 | """ 187 | A TypedDict representing the result of a matching operation. 188 | 189 | Attributes: 190 | match_id (int): A unique identifier for the match result. 191 | table_id (int): The identifier of the match table where the matching operation was performed. 192 | word_id (int): The identifier of the matched word within the word list. 193 | word (str): The matched word. 194 | similarity (float): The similarity score of the match operation. 195 | """ 196 | 197 | match_id: int 198 | table_id: int 199 | word_id: int 200 | word: str 201 | similarity: float 202 | 203 | 204 | SimpleTable = Dict[ProcessType, Dict[int, str]] 205 | """ 206 | A type alias for representing a simple table structure for text processing. 207 | 208 | This dictionary maps a `ProcessType` to another dictionary that maps an integer ID to a string. 209 | The outer dictionary's keys represent different types of processing operations, while the inner 210 | dictionary's keys represent unique identifiers corresponding to specific strings related to the 211 | operations. 212 | 213 | Type: 214 | Dict[ProcessType, Dict[int, str]] 215 | """ 216 | 217 | 218 | class SimpleResult(TypedDict): 219 | """ 220 | A TypedDict representing a simplified result of a text processing operation. 221 | 222 | Attributes: 223 | word_id (int): The identifier of the word within the word list. 224 | word (str): The word corresponding to the word_id. 
225 | """ 226 | 227 | word_id: int 228 | word: str 229 | -------------------------------------------------------------------------------- /matcher_c/matcher_c.h: -------------------------------------------------------------------------------- 1 | void* init_matcher(char* match_table_map_bytes); 2 | bool matcher_is_match(void* matcher, char* text); 3 | char* matcher_process_as_string(void* matcher, char* text); 4 | char* matcher_word_match_as_string(void* matcher, char* text); 5 | void drop_matcher(void* matcher); 6 | 7 | void* init_simple_matcher(char* simple_table_bytes); 8 | bool simple_matcher_is_match(void* simple_matcher, char* text); 9 | char* simple_matcher_process_as_string(void* simple_matcher, char* text); 10 | void drop_simple_matcher(void* simple_matcher); 11 | 12 | void drop_string(char* ptr); -------------------------------------------------------------------------------- /matcher_c/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | ffi::{c_char, CStr, CString}, 3 | str, 4 | }; 5 | 6 | use matcher_rs::{ 7 | MatchTableMapSerde as MatchTableMap, Matcher, SimpleMatcher, SimpleTableSerde as SimpleTable, 8 | TextMatcherTrait, 9 | }; 10 | 11 | /// Initializes a `Matcher` from a serialized `MatchTableMap` in MessagePack format. 12 | /// 13 | /// # Safety 14 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 15 | /// that `match_table_map_bytes` points to a valid null-terminated C string containing a 16 | /// serialized `MatchTableMap`, and that the string remains valid for the duration of the call. 17 | /// 18 | /// # Parameters 19 | /// - `match_table_map_bytes`: A pointer to a C string containing the serialized `MatchTableMap`. 20 | /// 21 | /// # Returns 22 | /// A raw pointer to the newly created `Matcher`. The caller is responsible for managing the 23 | /// lifetime of this pointer and must eventually call `drop` on it to free the memory. 24 | /// 25 | /// # Panics 26 | /// This function will panic if the input data cannot be deserialized into a `MatchTableMap`. 27 | #[no_mangle] 28 | pub unsafe extern "C" fn init_matcher(match_table_map_bytes: *const c_char) -> *mut Matcher { 29 | unsafe { 30 | let match_table_map: MatchTableMap = match sonic_rs::from_slice( 31 | CStr::from_ptr(match_table_map_bytes).to_bytes(), 32 | ) { 33 | Ok(match_table_map) => match_table_map, 34 | Err(e) => { 35 | panic!("Deserialize match_table_map_bytes failed, Please check the input data.\nErr: {}", e) 36 | } 37 | }; 38 | 39 | Box::into_raw(Box::new(Matcher::new(&match_table_map))) 40 | } 41 | } 42 | 43 | /// Checks if the given text matches any pattern in the Matcher. 44 | /// 45 | /// # Safety 46 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 47 | /// that `matcher` points to a valid `Matcher` instance and that `text` points to a valid 48 | /// null-terminated C string. Both the `matcher` and the `text` must remain valid for the 49 | /// duration of the call. 50 | /// 51 | /// # Parameters 52 | /// - `matcher`: A pointer to the `Matcher` instance. 53 | /// - `text`: A pointer to a C string containing the text to be checked for matches. 54 | /// 55 | /// # Returns 56 | /// - `true` if the text matches any pattern in the `Matcher`. 57 | /// - `false` otherwise. 58 | /// 59 | /// # Panics 60 | /// This function will panic if the input `text` is not a valid UTF-8 string. 
61 | #[no_mangle] 62 | pub unsafe extern "C" fn matcher_is_match(matcher: *mut Matcher, text: *const c_char) -> bool { 63 | unsafe { 64 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 65 | match text { 66 | Ok(text) => matcher.as_ref().unwrap().is_match(text), 67 | Err(_) => { 68 | panic!("Input is not a valid utf-8 string"); 69 | } 70 | } 71 | } 72 | } 73 | 74 | /// Processes the input text through the Matcher and returns the result as a C string. 75 | /// 76 | /// # Safety 77 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 78 | /// that `matcher` points to a valid `Matcher` instance and that `text` points to a valid 79 | /// null-terminated C string. Both the `matcher` and the `text` must remain valid for the 80 | /// duration of the call. 81 | /// 82 | /// # Parameters 83 | /// - `matcher`: A pointer to the `Matcher` instance. 84 | /// - `text`: A pointer to a C string containing the text to be processed. 85 | /// 86 | /// # Returns 87 | /// A pointer to a newly allocated C string containing the processing result. The caller is 88 | /// responsible for managing the lifetime of this pointer and must eventually call `drop_string` 89 | /// on it to free the memory. 90 | /// 91 | /// # Panics 92 | /// This function will panic if the input `text` is not a valid UTF-8 string or if the 93 | /// serialization of the result fails. 94 | #[no_mangle] 95 | pub unsafe extern "C" fn matcher_process_as_string( 96 | matcher: *mut Matcher, 97 | text: *const c_char, 98 | ) -> *mut c_char { 99 | unsafe { 100 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 101 | let res = match text { 102 | Ok(text) => matcher.as_ref().unwrap().process(text), 103 | Err(_) => { 104 | panic!("Input is not a valid utf-8 string"); 105 | } 106 | }; 107 | let res_cstring = CString::new(sonic_rs::to_vec(&res).unwrap_unchecked()).unwrap(); 108 | res_cstring.into_raw() 109 | } 110 | } 111 | 112 | /// Processes the input text through the `Matcher` and returns the word match result as a C string. 113 | /// 114 | /// # Safety 115 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 116 | /// that `matcher` points to a valid `Matcher` instance and that `text` points to a valid 117 | /// null-terminated C string. Both the `matcher` and the `text` must remain valid for the 118 | /// duration of the call. 119 | /// 120 | /// # Parameters 121 | /// - `matcher`: A pointer to the `Matcher` instance. 122 | /// - `text`: A pointer to a C string containing the text to be processed. 123 | /// 124 | /// # Returns 125 | /// A pointer to a newly allocated C string containing the word match processing result. 126 | /// The caller is responsible for managing the lifetime of this pointer and must eventually 127 | /// call `drop_string` on it to free the memory. 128 | /// 129 | /// # Panics 130 | /// This function will panic if the input `text` is not a valid UTF-8 string. 
131 | #[no_mangle] 132 | pub unsafe extern "C" fn matcher_word_match_as_string( 133 | matcher: *mut Matcher, 134 | text: *const c_char, 135 | ) -> *mut c_char { 136 | unsafe { 137 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 138 | let res = match text { 139 | Ok(text) => { 140 | sonic_rs::to_string(&matcher.as_ref().unwrap().word_match(text)).unwrap_unchecked() 141 | } 142 | Err(_) => { 143 | panic!("Input is not a valid utf-8 string"); 144 | } 145 | }; 146 | let res_cstring = CString::new(res).unwrap(); 147 | res_cstring.into_raw() 148 | } 149 | } 150 | 151 | /// Frees the memory allocated for the `Matcher` instance. 152 | /// 153 | /// # Safety 154 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 155 | /// that `matcher` points to a valid `Matcher` instance. This function transfers ownership 156 | /// of the raw pointer and deallocates the memory, so the caller must not use the `matcher` 157 | /// pointer after calling this function. 158 | /// 159 | /// # Parameters 160 | /// - `matcher`: A pointer to the `Matcher` instance to be deallocated. 161 | #[no_mangle] 162 | pub unsafe extern "C" fn drop_matcher(matcher: *mut Matcher) { 163 | unsafe { drop(Box::from_raw(matcher)) } 164 | } 165 | 166 | /// Initializes a `SimpleMatcher` instance from serialized table bytes. 167 | /// 168 | /// # Safety 169 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 170 | /// that `simple_table_bytes` points to a valid null-terminated C string. The returned 171 | /// `SimpleMatcher` pointer must be properly managed and eventually deallocated by calling 172 | /// `drop_simple_matcher`. 173 | /// 174 | /// # Parameters 175 | /// - `simple_table_bytes`: A pointer to a C string containing the serialized table bytes. 176 | /// 177 | /// # Returns 178 | /// A pointer to a newly allocated `SimpleMatcher` instance. The caller is responsible for managing 179 | /// the lifetime of this pointer and must eventually call `drop_simple_matcher` to free the memory. 180 | /// 181 | /// # Panics 182 | /// This function will panic if the deserialization of `simple_table_bytes` fails. 183 | #[no_mangle] 184 | pub unsafe extern "C" fn init_simple_matcher( 185 | simple_table_bytes: *const c_char, 186 | ) -> *mut SimpleMatcher { 187 | unsafe { 188 | let simple_table: SimpleTable = 189 | match sonic_rs::from_slice(CStr::from_ptr(simple_table_bytes).to_bytes()) { 190 | Ok(simple_table) => simple_table, 191 | Err(e) => { 192 | panic!( 193 | "Deserialize simple_table_bytes failed, Please check the input data.\nErr: {}", 194 | e, 195 | ) 196 | } 197 | }; 198 | 199 | Box::into_raw(Box::new(SimpleMatcher::new(&simple_table))) 200 | } 201 | } 202 | 203 | /// Determines if the input text matches using the `SimpleMatcher`. 204 | /// 205 | /// # Safety 206 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 207 | /// that `simple_matcher` points to a valid `SimpleMatcher` instance and that `text` points to a 208 | /// valid null-terminated C string. Both the `simple_matcher` and the `text` must remain valid for 209 | /// the duration of the call. 210 | /// 211 | /// # Parameters 212 | /// - `simple_matcher`: A pointer to the `SimpleMatcher` instance. 213 | /// - `text`: A pointer to a C string containing the text to be processed. 214 | /// 215 | /// # Returns 216 | /// A boolean indicating whether the text matches based on the `SimpleMatcher`. 
217 | /// 218 | /// # Panics 219 | /// This function will panic if the input `text` is not a valid UTF-8 string. 220 | #[no_mangle] 221 | pub unsafe extern "C" fn simple_matcher_is_match( 222 | simple_matcher: *mut SimpleMatcher, 223 | text: *const c_char, 224 | ) -> bool { 225 | unsafe { 226 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 227 | match text { 228 | Ok(text) => simple_matcher.as_ref().unwrap().is_match(text), 229 | Err(_) => { 230 | panic!("Input is not a valid utf-8 string"); 231 | } 232 | } 233 | } 234 | } 235 | 236 | /// Processes the input text using the `SimpleMatcher` and returns the result as a C string. 237 | /// 238 | /// # Safety 239 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 240 | /// that `simple_matcher` points to a valid `SimpleMatcher` instance and that `text` points to a 241 | /// valid null-terminated C string. Both `simple_matcher` and `text` must remain valid for the 242 | /// duration of the call. 243 | /// 244 | /// # Parameters 245 | /// - `simple_matcher`: A pointer to the `SimpleMatcher` instance. 246 | /// - `text`: A pointer to a C string containing the text to be processed. 247 | /// 248 | /// # Returns 249 | /// A pointer to a newly allocated C string containing the processing result. The caller is 250 | /// responsible for managing the lifetime of this pointer and must eventually call 251 | /// `drop_string` on it to free the memory. 252 | /// 253 | /// # Panics 254 | /// This function will panic if the input `text` is not a valid UTF-8 string. 255 | #[no_mangle] 256 | pub unsafe extern "C" fn simple_matcher_process_as_string( 257 | simple_matcher: *mut SimpleMatcher, 258 | text: *const c_char, 259 | ) -> *mut c_char { 260 | unsafe { 261 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 262 | let res = match text { 263 | Ok(text) => simple_matcher.as_ref().unwrap().process(text), 264 | Err(_) => { 265 | panic!("Input is not a valid utf-8 string"); 266 | } 267 | }; 268 | let res_cstring = CString::new(sonic_rs::to_vec(&res).unwrap_unchecked()).unwrap(); 269 | res_cstring.into_raw() 270 | } 271 | } 272 | 273 | /// Deallocates a `SimpleMatcher` instance. 274 | /// 275 | /// # Safety 276 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 277 | /// that `simple_matcher` points to a valid `SimpleMatcher` instance that was previously allocated 278 | /// by `init_simple_matcher`. After calling this function, the `simple_matcher` pointer must not be 279 | /// used again as it points to deallocated memory. 280 | /// 281 | /// # Parameters 282 | /// - `simple_matcher`: A pointer to the `SimpleMatcher` instance to be deallocated. 283 | #[no_mangle] 284 | pub unsafe extern "C" fn drop_simple_matcher(simple_matcher: *mut SimpleMatcher) { 285 | unsafe { drop(Box::from_raw(simple_matcher)) } 286 | } 287 | 288 | /// Deallocates a C string that was previously allocated by the Rust code and passed to C. 289 | /// 290 | /// # Safety 291 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 292 | /// that `ptr` points to a valid C string that was previously allocated by Rust code using 293 | /// `CString::into_raw` or a similar method. After calling this function, the `ptr` pointer must 294 | /// not be used again as it points to deallocated memory. 295 | /// 296 | /// # Parameters 297 | /// - `ptr`: A pointer to the C string to be deallocated. 
298 | #[no_mangle] 299 | pub unsafe extern "C" fn drop_string(ptr: *mut c_char) { 300 | unsafe { drop(CString::from_raw(ptr)) } 301 | } 302 | -------------------------------------------------------------------------------- /matcher_java/README.md: -------------------------------------------------------------------------------- 1 | # Matcher Rust Implementation with Java FFI Bindings 2 | 3 | ## Overview 4 | 5 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 6 | 7 | ## Installation 8 | 9 | ### Build from source 10 | 11 | ```shell 12 | git clone https://github.com/Lips7/Matcher.git 13 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 14 | cargo build --release 15 | ``` 16 | 17 | Then you should find `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll` in the `target/release` directory. 18 | 19 | ### Install pre-built binary 20 | 21 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 22 | 23 | ## Java usage example 24 | 25 | Put the `matcher_c` dynamic library under the `src/main/resources` directory. 26 | 27 | Copy the code below or refer to [MatcherJavaExample.java](./src/test/java/com/matcher_java/MatcherJavaExample.java). 28 | 29 | ```java 30 | package com.matcher_java; 31 | 32 | import com.alibaba.fastjson.JSON; 33 | import com.alibaba.fastjson.serializer.SerializeConfig; 34 | import com.matcher_java.extension_types.MatchTable; 35 | import com.matcher_java.extension_types.MatchTableType; 36 | import com.matcher_java.extension_types.ProcessType; 37 | import com.matcher_java.extension_types.ProcessTypeSerializer; 38 | import com.sun.jna.Pointer; 39 | 40 | import java.io.IOException; 41 | import java.util.ArrayList; 42 | import java.util.HashMap; 43 | import java.util.List; 44 | import java.util.Map; 45 | 46 | public class MatcherJavaExample { 47 | public static void main(String[] args) throws IOException { 48 | System.out.println("Simple Matcher Test"); 49 | simple_matcher_process_demo(); 50 | 51 | System.out.println("\n"); 52 | 53 | System.out.println("Matcher Test"); 54 | matcher_process_demo(); 55 | } 56 | 57 | public static void simple_matcher_process_demo() throws IOException { 58 | SerializeConfig serializeConfig = new SerializeConfig(); 59 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 60 | 61 | Map<ProcessType, Map<String, String>> simpleTable = new HashMap<>(); 62 | Map<String, String> wordMap = new HashMap<>(); 63 | wordMap.put("1", "hello&world"); 64 | simpleTable.put(ProcessType.MatchNone, wordMap); 65 | 66 | String simpleTableStr = JSON.toJSONString(simpleTable, serializeConfig); 67 | System.out.printf("simple_table: %s\n", simpleTableStr); 68 | 69 | byte[] simpleTableBytes = JSON.toJSONBytes(simpleTable, serializeConfig); 70 | 71 | MatcherJava instance = MatcherJava.INSTANCE; 72 | 73 | Pointer simpleMatcher = instance.init_simple_matcher(simpleTableBytes); 74 | 75 | byte[] strBytes = "hello,world".getBytes("utf-8"); 76 | byte[] cStrBytes = new byte[strBytes.length + 1]; 77 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 78 | 79 | boolean isMatch = instance.simple_matcher_is_match(simpleMatcher, cStrBytes); 80 | System.out.printf("isMatch: %s\n", isMatch); 81 | 82 | Pointer matchResPtr = instance.simple_matcher_process_as_string(simpleMatcher, cStrBytes); 83 | String matchRes = matchResPtr.getString(0, "utf-8"); 84 | System.out.printf("matchRes: %s\n", matchRes); 85 | instance.drop_string(matchResPtr); 86 | 
87 | instance.drop_simple_matcher(simpleMatcher); 88 | } 89 | 90 | public static void matcher_process_demo() throws IOException { 91 | SerializeConfig serializeConfig = new SerializeConfig(); 92 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 93 | 94 | Map<String, List<MatchTable>> matchTableMap = new HashMap<>(); 95 | List<MatchTable> matchTableList = new ArrayList<>(); 96 | MatchTable matchTable = new MatchTable(1, MatchTableType.Simple(ProcessType.MatchNone), List.of("hello&world"), ProcessType.MatchNone, List.of()); 97 | matchTableList.add(matchTable); 98 | matchTableMap.put("1", matchTableList); 99 | 100 | String matchTableMapStr = JSON.toJSONString(matchTableMap, serializeConfig); 101 | System.out.printf("match_table_map: %s\n", matchTableMapStr); 102 | 103 | byte[] matchTableMapBytes = JSON.toJSONBytes(matchTableMap, serializeConfig); 104 | 105 | MatcherJava instance = MatcherJava.INSTANCE; 106 | 107 | Pointer matcher = instance.init_matcher(matchTableMapBytes); 108 | 109 | byte[] strBytes = "hello,world".getBytes("utf-8"); 110 | byte[] cStrBytes = new byte[strBytes.length + 1]; 111 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 112 | 113 | boolean isMatch = instance.matcher_is_match(matcher, cStrBytes); 114 | System.out.printf("isMatch: %s\n", isMatch); 115 | 116 | Pointer matchResPtr1 = instance.matcher_process_as_string(matcher, cStrBytes); 117 | String matchRes1 = matchResPtr1.getString(0, "utf-8"); 118 | System.out.printf("matchRes: %s\n", matchRes1); 119 | instance.drop_string(matchResPtr1); 120 | 121 | Pointer matchResPtr2 = instance.matcher_word_match_as_string(matcher, cStrBytes); 122 | String matchRes2 = matchResPtr2.getString(0, "utf-8"); 123 | System.out.printf("matchRes: %s\n", matchRes2); 124 | instance.drop_string(matchResPtr2); 125 | 126 | instance.drop_matcher(matcher); 127 | } 128 | } 129 | ``` 130 | 131 | ## Important Notes 132 | 133 | Always call `drop_matcher`, `drop_simple_matcher`, and `drop_string` after initializing and processing to avoid memory leaks.
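A defensive pattern for this is to pair each `init_*` / `*_as_string` call with its matching `drop_*` call in a `try`/`finally` block, so the native memory is released even when an exception is thrown in between. A minimal sketch, reusing the `MatcherJava` interface and the `matchTableMapBytes`/`cStrBytes` variables from the example above:

```java
MatcherJava lib = MatcherJava.INSTANCE;
Pointer matcher = lib.init_matcher(matchTableMapBytes);
try {
    Pointer resPtr = lib.matcher_process_as_string(matcher, cStrBytes);
    try {
        System.out.println(resPtr.getString(0, "utf-8"));
    } finally {
        lib.drop_string(resPtr); // always release the Rust-allocated string
    }
} finally {
    lib.drop_matcher(matcher); // always release the matcher itself
}
```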
-------------------------------------------------------------------------------- /matcher_java/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>com.matcher_java</groupId> 6 | <artifactId>matcher_java</artifactId> 7 | <version>0.5.7</version> 8 | 9 | <name>matcher_java</name> 10 | <url>https://github.com/Lips7/Matcher</url> 11 | 12 | <properties> 13 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 14 | <maven.compiler.source>23</maven.compiler.source> 15 | <maven.compiler.target>23</maven.compiler.target> 16 | </properties> 17 | 18 | <dependencies> 19 | <dependency> 20 | <groupId>junit</groupId> 21 | <artifactId>junit</artifactId> 22 | <version>4.11</version> 23 | <scope>test</scope> 24 | </dependency> 25 | <dependency> 26 | <groupId>com.alibaba</groupId> 27 | <artifactId>fastjson</artifactId> 28 | <version>2.0.28</version> 29 | </dependency> 30 | <dependency> 31 | <groupId>net.java.dev.jna</groupId> 32 | <artifactId>jna</artifactId> 33 | <version>5.14.0</version> 34 | </dependency> 35 | </dependencies> 36 | 37 | <build> 38 | <pluginManagement> 39 | <plugins> 40 | <plugin><artifactId>maven-clean-plugin</artifactId><version>3.1.0</version></plugin> 41 | <plugin><artifactId>maven-resources-plugin</artifactId><version>3.0.2</version></plugin> 42 | <plugin><artifactId>maven-compiler-plugin</artifactId><version>3.8.0</version></plugin> 43 | <plugin><artifactId>maven-surefire-plugin</artifactId><version>2.22.1</version></plugin> 44 | <plugin><artifactId>maven-jar-plugin</artifactId><version>3.0.2</version></plugin> 45 | <plugin><artifactId>maven-install-plugin</artifactId><version>2.5.2</version></plugin> 46 | <plugin><artifactId>maven-deploy-plugin</artifactId><version>2.8.2</version></plugin> 47 | <plugin><artifactId>maven-site-plugin</artifactId><version>3.7.1</version></plugin> 48 | <plugin><artifactId>maven-project-info-reports-plugin</artifactId><version>3.0.0</version></plugin> 49 | </plugins> 50 | </pluginManagement> 51 | </build> 52 | </project> -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/MatcherJava.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java; 2 | 3 | import com.sun.jna.Library; 4 | import com.sun.jna.Native; 5 | import com.sun.jna.Pointer; 6 | 7 | interface MatcherJava extends Library { 8 | MatcherJava INSTANCE = (MatcherJava) Native.load( 9 | MatcherJava.class.getResource("/matcher_c.so").getPath(), 10 | MatcherJava.class); 11 | 12 | Pointer init_matcher(byte[] match_table_map_bytes); 13 | 14 | boolean matcher_is_match(Pointer matcher, byte[] text_bytes); 15 | 16 | Pointer matcher_process_as_string(Pointer matcher, byte[] text_bytes); 17 | 18 | Pointer matcher_word_match_as_string(Pointer matcher, byte[] text_bytes); 19 | 20 | void drop_matcher(Pointer matcher); 21 | 22 | Pointer init_simple_matcher(byte[] simple_table_bytes); 23 | 24 | boolean simple_matcher_is_match(Pointer simple_matcher, byte[] text_bytes); 25 | 26 | Pointer simple_matcher_process_as_string(Pointer simple_matcher, byte[] text_bytes); 27 | 28 | void drop_simple_matcher(Pointer simple_matcher); 29 | 30 | void drop_string(Pointer ptr); 31 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/MatchResult.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | public class MatchResult { 4 | private int match_id; 5 | private int table_id; 6 | private int word_id; 7 | private String word; 8 | private float similarity; 9 | 10 | public MatchResult(int match_id, int table_id, int word_id, String word, float similarity) { 11 | this.match_id = match_id; 12 | this.table_id = table_id; 13 | this.word_id = word_id; 14 | this.word = word; 15 | this.similarity = similarity; 16 | } 17 | 18 | public int getMatchId() { 19 | return match_id; 20 | } 21 | 22 | public void setMatchId(int match_id) { 23 | this.match_id = match_id; 24 | } 25 | 26 | public int getTableId() { 27 | return table_id; 28 | } 29 | 30 | public void setTableId(int table_id) { 31 | this.table_id = table_id; 32 | } 33 | 34 | public int getWordId() { 35 | return word_id; 36 | } 37 | 38 | public void setWordId(int word_id) { 39 | this.word_id = word_id; 40 | } 41 | 42 | public String getWord() { 43 | return word; 44 | } 45 | 46 | public void setWord(String word) { 
this.word = word; 48 | } 49 | 50 | public float getSimilarity() { 51 | return similarity; 52 | } 53 | 54 | public void setSimilarity(float similarity) { 55 | this.similarity = similarity; 56 | } 57 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/MatchTable.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import com.alibaba.fastjson.PropertyNamingStrategy; 7 | import com.alibaba.fastjson.annotation.JSONType; 8 | 9 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 10 | public class MatchTable { 11 | private int table_id; 12 | private Map<String, Object> match_table_type; 13 | private List<String> word_List; 14 | private ProcessType exemption_process_type; 15 | private List<String> exemption_word_list; 16 | 17 | public MatchTable(int table_id, Map<String, Object> match_table_type, List<String> word_List, 18 | ProcessType exemption_process_type, List<String> exemption_word_list) { 19 | this.table_id = table_id; 20 | this.match_table_type = match_table_type; 21 | this.word_List = word_List; 22 | this.exemption_process_type = exemption_process_type; 23 | this.exemption_word_list = exemption_word_list; 24 | } 25 | 26 | public int getTableId() { 27 | return table_id; 28 | } 29 | 30 | public void setTableId(int table_id) { 31 | this.table_id = table_id; 32 | } 33 | 34 | public Map<String, Object> getMatchTableType() { 35 | return match_table_type; 36 | } 37 | 38 | public void setMatchTableType(Map<String, Object> match_table_type) { 39 | this.match_table_type = match_table_type; 40 | } 41 | 42 | public List<String> getWordList() { 43 | return word_List; 44 | } 45 | 46 | public void setWordList(List<String> word_List) { 47 | this.word_List = word_List; 48 | } 49 | 50 | public ProcessType getExemptionProcessType() { 51 | return exemption_process_type; 52 | } 53 | 54 | public void setExemptionProcessType(ProcessType exemption_process_type) { 55 | this.exemption_process_type = exemption_process_type; 56 | } 57 | 58 | public List<String> getExemptionWordList() { 59 | return exemption_word_list; 60 | } 61 | 62 | public void setExemptionWordList(List<String> exemption_word_list) { 63 | this.exemption_word_list = exemption_word_list; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/MatchTableType.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | 6 | public class MatchTableType { 7 | public static Map<String, Object> Simple(ProcessType processType) { 8 | Map<String, Object> map = new HashMap<>(); 9 | map.put("simple", new Simple(processType)); 10 | return map; 11 | } 12 | 13 | public static Map<String, Object> Regex(ProcessType processType, RegexMatchType regexMatchType) { 14 | Map<String, Object> map = new HashMap<>(); 15 | map.put("regex", new Regex(processType, regexMatchType)); 16 | return map; 17 | } 18 | 19 | public static Map<String, Object> Similar(ProcessType processType, SimMatchType simMatchType, 20 | float threshold) { 21 | Map<String, Object> map = new HashMap<>(); 22 | map.put("similar", new Similar(processType, simMatchType, threshold)); 23 | return map; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/ProcessType.java: -------------------------------------------------------------------------------- 1 
| package com.matcher_java.extension_types; 2 | 3 | public enum ProcessType { 4 | MatchNone(0b00000001), 5 | MatchFanjian(0b00000010), 6 | MatchDelete(0b00000100), 7 | MatchNormalize(0b00001000), 8 | MatchDeleteNormalize(0b00001100), 9 | MatchFanjianDeleteNormalize(0b00001110), 10 | MatchPinYin(0b00010000), 11 | MatchPinYinChar(0b00100000); 12 | 13 | private final int value; 14 | 15 | ProcessType(int value) { 16 | this.value = value; 17 | } 18 | 19 | public int getValue() { 20 | return value; 21 | } 22 | 23 | public String toString() { 24 | return String.valueOf(value); 25 | } 26 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/ProcessTypeSerializer.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.Type; 5 | 6 | import com.alibaba.fastjson.serializer.JSONSerializer; 7 | import com.alibaba.fastjson.serializer.ObjectSerializer; 8 | 9 | public class ProcessTypeSerializer implements ObjectSerializer { 10 | @Override 11 | public void write(JSONSerializer serializer, Object object, Object fieldName, Type fieldType, int features) 12 | throws IOException { 13 | ProcessType processType = (ProcessType) object; 14 | if (fieldName != null) { 15 | serializer.write(processType.getValue()); 16 | } else { 17 | serializer.write(processType.toString()); 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/Regex.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.PropertyNamingStrategy; 4 | import com.alibaba.fastjson.annotation.JSONType; 5 | 6 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 7 | public class Regex { 8 | private ProcessType process_type; 9 | private RegexMatchType regex_match_type; 10 | 11 | public Regex(ProcessType process_type, RegexMatchType regexMatchType) { 12 | this.process_type = process_type; 13 | this.regex_match_type = regexMatchType; 14 | } 15 | 16 | public ProcessType getProcessType() { 17 | return process_type; 18 | } 19 | 20 | public RegexMatchType getRegexMatchType() { 21 | return regex_match_type; 22 | } 23 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/RegexMatchType.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.annotation.JSONField; 4 | 5 | public enum RegexMatchType { 6 | MatchSimilarChar("similar_char"), 7 | MatchAcrostic("acrostic"), 8 | MatchRegex("regex"); 9 | 10 | private final String value; 11 | 12 | RegexMatchType(String value) { 13 | this.value = value; 14 | } 15 | 16 | @JSONField 17 | public String getValue() { 18 | return value; 19 | } 20 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/SimMatchType.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.annotation.JSONField; 4 | 5 | public enum SimMatchType { 6 | MatchLevenshtein("levenshtein"); 7 | 8 | private final 
String value; 9 | 10 | SimMatchType(String value) { 11 | this.value = value; 12 | } 13 | 14 | @JSONField 15 | public String getValue() { 16 | return value; 17 | } 18 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/Similar.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.PropertyNamingStrategy; 4 | import com.alibaba.fastjson.annotation.JSONType; 5 | 6 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 7 | public class Similar { 8 | private ProcessType process_type; 9 | private SimMatchType sim_match_type; 10 | private float threshold; 11 | 12 | public Similar(ProcessType process_type, SimMatchType sim_match_type, float threshold) { 13 | this.process_type = process_type; 14 | this.sim_match_type = sim_match_type; 15 | this.threshold = threshold; 16 | } 17 | 18 | public ProcessType getProcessType() { 19 | return process_type; 20 | } 21 | 22 | public SimMatchType getSimMatchType() { 23 | return sim_match_type; 24 | } 25 | 26 | public float getThreshold() { 27 | return threshold; 28 | } 29 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/Simple.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.PropertyNamingStrategy; 4 | import com.alibaba.fastjson.annotation.JSONType; 5 | 6 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 7 | public class Simple { 8 | private ProcessType process_type; 9 | 10 | public Simple(ProcessType process_type) { 11 | this.process_type = process_type; 12 | } 13 | 14 | public ProcessType getProcessType() { 15 | return process_type; 16 | } 17 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/SimpleResult.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | public class SimpleResult { 4 | private int word_id; 5 | private String word; 6 | 7 | public SimpleResult(int word_id, String word) { 8 | this.word_id = word_id; 9 | this.word = word; 10 | } 11 | 12 | public int getWordId() { 13 | return word_id; 14 | } 15 | 16 | public void setWordId(int word_id) { 17 | this.word_id = word_id; 18 | } 19 | 20 | public String getWord() { 21 | return word; 22 | } 23 | 24 | public void setWord(String word) { 25 | this.word = word; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /matcher_java/src/test/java/com/matcher_java/MatcherJavaExample.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.serializer.SerializeConfig; 5 | import com.matcher_java.extension_types.MatchTable; 6 | import com.matcher_java.extension_types.MatchTableType; 7 | import com.matcher_java.extension_types.ProcessType; 8 | import com.matcher_java.extension_types.ProcessTypeSerializer; 9 | import com.sun.jna.Pointer; 10 | 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.List; 15 | import java.util.Map; 16 | 17 | public class MatcherJavaExample { 18 | 
public static void main(String[] args) throws IOException { 19 | System.out.println("Simple Matcher Test"); 20 | simple_matcher_process_demo(); 21 | 22 | System.out.println("\n"); 23 | 24 | System.out.println("Matcher Test"); 25 | matcher_process_demo(); 26 | } 27 | 28 | public static void simple_matcher_process_demo() throws IOException { 29 | SerializeConfig serializeConfig = new SerializeConfig(); 30 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 31 | 32 | Map<ProcessType, Map<String, String>> simpleTable = new HashMap<>(); 33 | Map<String, String> wordMap = new HashMap<>(); 34 | wordMap.put("1", "hello&world"); 35 | simpleTable.put(ProcessType.MatchNone, wordMap); 36 | 37 | String simpleTableStr = JSON.toJSONString(simpleTable, serializeConfig); 38 | System.out.printf("simple_table: %s\n", simpleTableStr); 39 | 40 | byte[] simpleTableBytes = JSON.toJSONBytes(simpleTable, serializeConfig); 41 | 42 | MatcherJava instance = MatcherJava.INSTANCE; 43 | 44 | Pointer simpleMatcher = instance.init_simple_matcher(simpleTableBytes); 45 | 46 | byte[] strBytes = "hello,world".getBytes("utf-8"); 47 | byte[] cStrBytes = new byte[strBytes.length + 1]; 48 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 49 | 50 | boolean isMatch = instance.simple_matcher_is_match(simpleMatcher, cStrBytes); 51 | System.out.printf("isMatch: %s\n", isMatch); 52 | 53 | Pointer matchResPtr = instance.simple_matcher_process_as_string(simpleMatcher, cStrBytes); 54 | String matchRes = matchResPtr.getString(0, "utf-8"); 55 | System.out.printf("matchRes: %s\n", matchRes); 56 | instance.drop_string(matchResPtr); 57 | 58 | instance.drop_simple_matcher(simpleMatcher); 59 | } 60 | 61 | public static void matcher_process_demo() throws IOException { 62 | SerializeConfig serializeConfig = new SerializeConfig(); 63 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 64 | 65 | Map<String, List<MatchTable>> matchTableMap = new HashMap<>(); 66 | List<MatchTable> matchTableList = new ArrayList<>(); 67 | MatchTable matchTable = new MatchTable(1, MatchTableType.Simple(ProcessType.MatchNone), List.of("hello&world"), ProcessType.MatchNone, List.of()); 68 | matchTableList.add(matchTable); 69 | matchTableMap.put("1", matchTableList); 70 | 71 | String matchTableMapStr = JSON.toJSONString(matchTableMap, serializeConfig); 72 | System.out.printf("match_table_map: %s\n", matchTableMapStr); 73 | 74 | byte[] matchTableMapBytes = JSON.toJSONBytes(matchTableMap, serializeConfig); 75 | 76 | MatcherJava instance = MatcherJava.INSTANCE; 77 | 78 | Pointer matcher = instance.init_matcher(matchTableMapBytes); 79 | 80 | byte[] strBytes = "hello,world".getBytes("utf-8"); 81 | byte[] cStrBytes = new byte[strBytes.length + 1]; 82 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 83 | 84 | boolean isMatch = instance.matcher_is_match(matcher, cStrBytes); 85 | System.out.printf("isMatch: %s\n", isMatch); 86 | 87 | Pointer matchResPtr1 = instance.matcher_process_as_string(matcher, cStrBytes); 88 | String matchRes1 = matchResPtr1.getString(0, "utf-8"); 89 | System.out.printf("matchRes: %s\n", matchRes1); 90 | instance.drop_string(matchResPtr1); 91 | 92 | Pointer matchResPtr2 = instance.matcher_word_match_as_string(matcher, cStrBytes); 93 | String matchRes2 = matchResPtr2.getString(0, "utf-8"); 94 | System.out.printf("matchRes: %s\n", matchRes2); 95 | instance.drop_string(matchResPtr2); 96 | 97 | instance.drop_matcher(matcher); 98 | } 99 | } -------------------------------------------------------------------------------- /matcher_py/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "matcher_py" 3 | authors.workspace = true 4 | categories.workspace = true 5 | description.workspace = true 6 | edition.workspace = true 7 | homepage.workspace = true 8 | keywords.workspace = true 9 | license.workspace = true 10 | repository.workspace = true 11 | version.workspace = true 12 | readme = "README.md" 13 | documentation = "https://docs.rs/crate/matcher_py/latest" 14 | build = "build.rs" 15 | 16 | [lib] 17 | name = "matcher_py" 18 | crate-type = ["cdylib"] 19 | 20 | [dependencies] 21 | matcher_rs = { path = "../matcher_rs", version = "0.5.7" } 22 | pyo3 = { version = "0.25.0", features = ["extension-module"] } 23 | sonic-rs = "0.5.1" 24 | 25 | [build-dependencies] 26 | pyo3-build-config = "0.25.0" 27 | -------------------------------------------------------------------------------- /matcher_py/README.md: -------------------------------------------------------------------------------- 1 | # Matcher Rust Implementation with PyO3 Binding 2 | 3 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 4 | 5 | For detailed implementation, see the [Design Document](../DESIGN.md). 6 | 7 | ## Features 8 | 9 | - **Multiple Matching Methods**: 10 | - Simple Word Matching 11 | - Regex-Based Matching 12 | - Similarity-Based Matching 13 | - **Text Normalization**: 14 | - **Fanjian**: Simplify traditional Chinese characters to simplified ones. 15 | Example: `蟲艸` -> `虫艹` 16 | - **Delete**: Remove specific characters. 17 | Example: `*Fu&*iii&^%%*&kkkk` -> `Fuiiikkkk` 18 | - **Normalize**: Normalize special characters to identifiable characters. 19 | Example: `𝜢𝕰𝕃𝙻𝝧 𝙒ⓞᵣℒ𝒟!` -> `hello world!` 20 | - **PinYin**: Convert Chinese characters to Pinyin for fuzzy matching. 21 | Example: `西安` -> ` xi an `, matches `洗按` -> ` xi an `, but not `先` -> ` xian ` 22 | - **PinYinChar**: Convert Chinese characters to Pinyin. 23 | Example: `西安` -> `xian`, matches `洗按` and `先` -> `xian` 24 | - **AND OR NOT Word Matching**: 25 | - Takes into account the number of repetitions of words. 26 | - Example: `hello&world` matches `hello world` and `world,hello` 27 | - Example: `无&法&无&天` matches `无无法天` (because `无` is repeated twice), but not `无法天` 28 | - Example: `hello~helloo~hhello` matches `hello` but not `helloo` and `hhello` 29 | - **Customizable Exemption Lists**: Exclude specific words from matching. 30 | - **Efficient Handling of Large Word Lists**: Optimized for performance. 31 | 32 | ## Installation 33 | 34 | ### Use pip 35 | 36 | ```shell 37 | pip install matcher_py 38 | ``` 39 | 40 | ### Install pre-built binary 41 | 42 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 43 | 44 | ## Usage 45 | 46 | All relevant types are defined in [extension_types.py](./python/matcher_py/extension_types.py). 47 | 48 | ### Explanation of the configuration 49 | 50 | * `Matcher`'s configuration is defined by the `MatchTableMap = Dict[int, List[MatchTable]]` type, the key of `MatchTableMap` is called `match_id`, **for each `match_id`, the `table_id` inside is required to be unique**. 51 | * `SimpleMatcher`'s configuration is defined by the `SimpleTable = Dict[ProcessType, Dict[int, str]]` type, the value `Dict[int, str]`'s key is called `word_id`, **`word_id` is required to be globally unique**. 52 | 53 | #### MatchTable 54 | 55 | * `table_id`: The unique ID of the match table. 
56 | * `match_table_type`: The type of the match table. 57 | * `word_list`: The word list of the match table. 58 | * `exemption_process_type`: The process type used for exemption word matching. 59 | * `exemption_word_list`: The exemption word list of the match table. 60 | 61 | For each match table, word matching is performed over the `word_list`, and exemption word matching is performed over the `exemption_word_list`. If the exemption word matching result is True, the word matching result will be False. 62 | 63 | #### MatchTableType 64 | 65 | * `Simple`: Supports simple multi-pattern matching with text normalization defined by `process_type`. 66 | * It can handle combination patterns and repetition-sensitive matching, delimited by `&` and `~`; for example, `hello&world&hello` will match `hellohelloworld` and `worldhellohello`, but not `helloworld`, because `hello` must appear twice. 67 | * `Regex`: Supports regex-based pattern matching. 68 | * `SimilarChar`: Supports similar character matching using regex. 69 | * `["hello,hallo,hollo,hi", "word,world,wrd,🌍", "!,?,~"]` will match `helloworld!`, `hollowrd?`, `hi🌍~` ··· any combinations of the words split by `,` in the list. 70 | * `Acrostic`: Supports acrostic matching using regex **(currently only supports Chinese and simple English sentences)**. 71 | * `["h,e,l,l,o", "你,好"]` will match `hope, endures, love, lasts, onward.` and `你的笑容温暖, 好心情常伴。`. 72 | * `Regex`: Supports regex matching. 73 | * `["h[aeiou]llo", "w[aeiou]rd"]` will match `hello`, `world`, `hillo`, `wurld` ··· any text that matches the regex in the list. 74 | * `Similar`: Supports similar text matching based on distance and threshold. 75 | * `Levenshtein`: Supports similar text matching based on Levenshtein distance. 76 | 77 | #### ProcessType 78 | 79 | * `None`: No transformation. 80 | * `Fanjian`: Traditional Chinese to simplified Chinese transformation. Based on [FANJIAN](../matcher_rs/process_map/FANJIAN.txt). 81 | * `妳好` -> `你好` 82 | * `現⾝` -> `现身` 83 | * `Delete`: Delete all punctuation, special characters and white spaces. Based on [TEXT_DELETE](../matcher_rs/process_map/TEXT-DELETE.txt) and `WHITE_SPACE`. 84 | * `hello, world!` -> `helloworld` 85 | * `《你∷好》` -> `你好` 86 | * `Normalize`: Normalize all English character variations and number variations to basic characters. Based on [NORM](../matcher_rs/process_map/NORM.txt) and [NUM_NORM](../matcher_rs/process_map/NUM-NORM.txt). 87 | * `ℋЀ⒈㈠Õ` -> `he11o` 88 | * `⒈Ƨ㊂` -> `123` 89 | * `PinYin`: Convert all unicode Chinese characters to pinyin with boundaries. Based on [PINYIN](../matcher_rs/process_map/PINYIN.txt). 90 | * `你好` -> ` ni hao ` 91 | * `西安` -> ` xi an ` 92 | * `PinYinChar`: Convert all unicode Chinese characters to pinyin without boundaries. Based on [PINYIN](../matcher_rs/process_map/PINYIN.txt). 93 | * `你好` -> `nihao` 94 | * `西安` -> `xian` 95 | 96 | You can combine these transformations as needed, as sketched below. Pre-defined combinations like `DeleteNormalize` and `FanjianDeleteNormalize` are provided for convenience. 97 | 98 | Avoid combining `PinYin` and `PinYinChar`, because `PinYin` is a more limited version of `PinYinChar`: in some cases, such as `xian`, the text can be treated either as the two words `xi` and `an`, or as the single word `xian`.
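For instance, flags can be combined with the bitwise OR operator. A minimal sketch (the same combined flag appears in the C binding example in this repository; `SimpleMatcher` and its JSON-bytes constructor are shown later in this README):

```python
import json

from matcher_py import SimpleMatcher
from matcher_py.extension_types import ProcessType

# Combine transformations with bitwise OR; MatchFanjianDeleteNormalize is itself
# a pre-defined combination of MatchFanjian, MatchDelete and MatchNormalize.
process_type = ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar

simple_matcher = SimpleMatcher(
    json.dumps({process_type: {1: "妳好&世界"}}).encode()
)

# `妳` is simplified to `你`, punctuation is deleted, and both sides are reduced
# to pinyin characters, so the variant text below still matches.
assert simple_matcher.is_match("你好,世界!")
```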
99 | 100 | ### Text Process Usage 101 | 102 | Here’s an example of how to use the `reduce_text_process` and `text_process` functions: 103 | 104 | ```python 105 | from matcher_py import reduce_text_process, text_process 106 | from matcher_py.extension_types import ProcessType 107 | 108 | print(reduce_text_process(ProcessType.MatchDeleteNormalize, "hello, world!")) 109 | print(text_process(ProcessType.MatchDelete, "hello, world!")) 110 | ``` 111 | 112 | ### Matcher Basic Usage 113 | 114 | Here’s an example of how to use the `Matcher`: 115 | 116 | ```python 117 | import json 118 | 119 | from matcher_py import Matcher 120 | from matcher_py.extension_types import MatchTable, MatchTableType, ProcessType, RegexMatchType, SimMatchType 121 | 122 | matcher = Matcher( 123 | json.dumps({ 124 | 1: [ 125 | MatchTable( 126 | table_id=1, 127 | match_table_type=MatchTableType.Simple(process_type = ProcessType.MatchFanjianDeleteNormalize), 128 | word_list=["hello", "world"], 129 | exemption_process_type=ProcessType.MatchNone, 130 | exemption_word_list=["word"], 131 | ), 132 | MatchTable( 133 | table_id=2, 134 | match_table_type=MatchTableType.Regex( 135 | process_type = ProcessType.MatchFanjianDeleteNormalize, 136 | regex_match_type=RegexMatchType.Regex 137 | ), 138 | word_list=["h[aeiou]llo"], 139 | exemption_process_type=ProcessType.MatchNone, 140 | exemption_word_list=[], 141 | ) 142 | ], 143 | 2: [ 144 | MatchTable( 145 | table_id=3, 146 | match_table_type=MatchTableType.Similar( 147 | process_type = ProcessType.MatchFanjianDeleteNormalize, 148 | sim_match_type=SimMatchType.MatchLevenshtein, 149 | threshold=0.5 150 | ), 151 | word_list=["halxo"], 152 | exemption_process_type=ProcessType.MatchNone, 153 | exemption_word_list=[], 154 | ) 155 | ] 156 | }).encode() 157 | ) 158 | # Check if a text matches 159 | assert matcher.is_match("hello") 160 | assert not matcher.is_match("word") 161 | # Perform process as a list 162 | result = matcher.process("hello") 163 | assert result == [{'match_id': 1, 164 | 'table_id': 2, 165 | 'word_id': 0, 166 | 'word': 'h[aeiou]llo', 167 | 'similarity': 1.0}, 168 | {'match_id': 1, 169 | 'table_id': 1, 170 | 'word_id': 0, 171 | 'word': 'hello', 172 | 'similarity': 1.0}, 173 | {'match_id': 2, 174 | 'table_id': 3, 175 | 'word_id': 0, 176 | 'word': 'halxo', 177 | 'similarity': 0.6}] 178 | # Perform word matching as a dict 179 | assert matcher.word_match(r"hello, world")[1] == [{'match_id': 1, 180 | 'table_id': 2, 181 | 'word_id': 0, 182 | 'word': 'h[aeiou]llo', 183 | 'similarity': 1.0}, 184 | {'match_id': 1, 185 | 'table_id': 1, 186 | 'word_id': 0, 187 | 'word': 'hello', 188 | 'similarity': 1.0}, 189 | {'match_id': 1, 190 | 'table_id': 1, 191 | 'word_id': 1, 192 | 'word': 'world', 193 | 'similarity': 1.0}] 194 | # Perform word matching as a string 195 | result = matcher.word_match_as_string("hello") 196 | assert result == """{"2":[{"match_id":2,"table_id":3,"word_id":0,"word":"halxo","similarity":0.6}],"1":[{"match_id":1,"table_id":2,"word_id":0,"word":"h[aeiou]llo","similarity":1.0},{"match_id":1,"table_id":1,"word_id":0,"word":"hello","similarity":1.0}]}""" 197 | ``` 198 | 199 | ### Simple Matcher Basic Usage 200 | 201 | Here’s an example of how to use the `SimpleMatcher`: 202 | 203 | ```python 204 | import json 205 | 206 | from matcher_py import SimpleMatcher 207 | from matcher_py.extension_types import ProcessType 208 | 209 | simple_matcher = SimpleMatcher( 210 | json.dumps( 211 | { 212 | ProcessType.MatchNone: { 213 | 1: "hello&world", 214 | 2: "word&word~hello" 215 | }, 216 | 
203 | ### Simple Matcher Basic Usage
204 | 
205 | Here’s an example of how to use the `SimpleMatcher`:
206 | 
207 | ```python
208 | import json
209 | 
210 | from matcher_py import SimpleMatcher
211 | from matcher_py.extension_types import ProcessType
212 | 
213 | simple_matcher = SimpleMatcher(
214 |     json.dumps(
215 |         {
216 |             ProcessType.MatchNone: {
217 |                 1: "hello&world",
218 |                 2: "word&word~hello"
219 |             },
220 |             ProcessType.MatchDelete: {
221 |                 3: "hallo"
222 |             }
223 |         }
224 |     ).encode()
225 | )
226 | # Check if a text matches
227 | assert simple_matcher.is_match("hello^&!#*#&!^#*()world")
228 | # Perform simple processing
229 | result = simple_matcher.process("hello,world,word,word,hallo")
230 | assert result == [{'word_id': 1, 'word': 'hello&world'}, {'word_id': 3, 'word': 'hallo'}]
231 | ```
232 | 
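233 | ### Pickling
234 | 
235 | Both `Matcher` and `SimpleMatcher` expose `__getstate__`, `__setstate__` and `__getnewargs__` (see `matcher_py.pyi`), so instances should round-trip through the standard `pickle` module. A minimal sketch, reusing the `simple_matcher` built above:
236 | 
237 | ```python
238 | import pickle
239 | 
240 | simple_matcher_copy = pickle.loads(pickle.dumps(simple_matcher))
241 | assert simple_matcher_copy.is_match("hello^&!#*#&!^#*()world")
242 | ```
243 | 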
| "SimMatchType", 13 | ] 14 | -------------------------------------------------------------------------------- /matcher_py/python/matcher_py/extension_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, IntFlag 2 | from typing import Dict, List, TypedDict, Union 3 | 4 | 5 | class ProcessType(IntFlag): 6 | """ 7 | An enumeration representing various types of text processing operations. 8 | 9 | Attributes: 10 | MatchNone (IntFlag): An operation that performs no matching (binary 00000001). 11 | MatchFanjian (IntFlag): An operation that matches traditional and simplified Chinese characters (binary 00000010). 12 | MatchDelete (IntFlag): An operation that matches deleted characters (binary 00000100). 13 | MatchNormalize (IntFlag): An operation that normalizes characters (binary 00001000). 14 | MatchDeleteNormalize (IntFlag): A combined operation that deletes and normalizes characters (binary 00001100). 15 | MatchFanjianDeleteNormalize (IntFlag): A combined operation that matches traditional and simplified Chinese characters, 16 | deletes, and normalizes (binary 00001110). 17 | MatchPinYin (IntFlag): An operation that matches Pinyin representations of Chinese characters (binary 00010000). 18 | MatchPinYinChar (IntFlag): An operation that matches individual characters in the Pinyin representation (binary 00100000). 19 | """ 20 | 21 | MatchNone = 0b00000001 22 | MatchFanjian = 0b00000010 23 | MatchDelete = 0b00000100 24 | MatchNormalize = 0b00001000 25 | MatchDeleteNormalize = 0b00001100 26 | MatchFanjianDeleteNormalize = 0b00001110 27 | MatchPinYin = 0b00010000 28 | MatchPinYinChar = 0b00100000 29 | 30 | 31 | class RegexMatchType(str, Enum): 32 | """ 33 | An enumeration representing various types of regex matching operations. 34 | 35 | Attributes: 36 | MatchSimilarChar (str): An operation that matches characters that are similar in some way. 37 | MatchAcrostic (str): An operation that matches acrostic patterns. 38 | MatchRegex (str): An operation that matches using standard regular expressions. 39 | """ 40 | 41 | MatchSimilarChar = "similar_char" 42 | MatchAcrostic = "acrostic" 43 | MatchRegex = "regex" 44 | 45 | 46 | class SimMatchType(str, Enum): 47 | """ 48 | An enumeration representing various types of similarity matching operations. 49 | 50 | Attributes: 51 | MatchLevenshtein (str): An operation that matches using the Levenshtein distance metric. 52 | """ 53 | 54 | MatchLevenshtein = "levenshtein" 55 | 56 | 57 | class Simple(TypedDict): 58 | """ 59 | A TypedDict representing a simple text processing operation. 60 | 61 | Attributes: 62 | process_type (ProcessType): The type of processing operation to be performed. 63 | """ 64 | 65 | process_type: ProcessType 66 | 67 | 68 | class Regex(TypedDict): 69 | """ 70 | A TypedDict representing a regex-based text processing operation. 71 | 72 | Attributes: 73 | process_type (ProcessType): The type of processing operation to be performed. 74 | regex_match_type (RegexMatchType): The type of regex matching operation to be used. 75 | """ 76 | 77 | process_type: ProcessType 78 | regex_match_type: RegexMatchType 79 | 80 | 81 | class Similar(TypedDict): 82 | """ 83 | A TypedDict representing a similarity-based text processing operation. 84 | 85 | Attributes: 86 | process_type (ProcessType): The type of processing operation to be performed. 87 | sim_match_type (SimMatchType): The type of similarity matching operation to be used. 
102 | class MatchTableType:
103 |     @staticmethod
104 |     def Simple(process_type: ProcessType) -> Dict[str, Simple]:
105 |         """
106 |         Create a dictionary representing a simple text processing operation.
107 | 
108 |         Args:
109 |             process_type (ProcessType): The type of processing operation to be performed.
110 | 
111 |         Returns:
112 |             Dict[str, Simple]: A dictionary with one key "simple" mapping to a Simple TypedDict
113 |                 containing the provided process_type.
114 |         """
115 |         return {"simple": Simple(process_type=process_type)}
116 | 
117 |     @staticmethod
118 |     def Regex(
119 |         process_type: ProcessType, regex_match_type: RegexMatchType
120 |     ) -> Dict[str, Regex]:
121 |         """
122 |         Create a dictionary representing a regex-based text processing operation.
123 | 
124 |         Args:
125 |             process_type (ProcessType): The type of processing operation to be performed.
126 |             regex_match_type (RegexMatchType): The type of regex matching operation to be used.
127 | 
128 |         Returns:
129 |             Dict[str, Regex]: A dictionary with one key "regex" mapping to a Regex TypedDict
130 |                 containing the provided process_type and regex_match_type.
131 |         """
132 |         return {
133 |             "regex": Regex(process_type=process_type, regex_match_type=regex_match_type)
134 |         }
135 | 
136 |     @staticmethod
137 |     def Similar(
138 |         process_type: ProcessType, sim_match_type: SimMatchType, threshold: float
139 |     ) -> Dict[str, Similar]:
140 |         """
141 |         Create a dictionary representing a similarity-based text processing operation.
142 | 
143 |         Args:
144 |             process_type (ProcessType): The type of processing operation to be performed.
145 |             sim_match_type (SimMatchType): The type of similarity matching operation to be used.
146 |             threshold (float): The threshold value for the similarity matching operation.
147 | 
148 |         Returns:
149 |             Dict[str, Similar]: A dictionary with one key "similar" mapping to a Similar TypedDict
150 |                 containing the provided process_type, sim_match_type, and threshold.
151 |         """
152 |         return {
153 |             "similar": Similar(
154 |                 process_type=process_type,
155 |                 sim_match_type=sim_match_type,
156 |                 threshold=threshold,
157 |             )
158 |         }
159 | 
160 | 
161 | class MatchTable(TypedDict):
162 |     """
163 |     A TypedDict representing a table for matching operations.
164 | 
165 |     Attributes:
166 |         table_id (int): A unique identifier for the match table.
167 |         match_table_type (Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]]):
168 |             A dictionary that specifies the type of match operation to be performed. The key is a string indicating
169 |             the match type ('simple', 'regex', 'similar'), and the value is a corresponding TypedDict describing
170 |             the operation.
171 |         word_list (List[str]): A list of words that are subject to the matching operations.
172 |         exemption_process_type (ProcessType): The type of process for which certain words are exempt from matching.
173 |         exemption_word_list (List[str]): A list of words that are exempt from the matching process.
174 |     """
175 | 
176 |     table_id: int
177 |     match_table_type: Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]]
178 |     word_list: List[str]
179 |     exemption_process_type: ProcessType
180 |     exemption_word_list: List[str]
181 | 
182 | 
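183 | # Note (illustrative, not part of the original source): the integer keys of a
184 | # MatchTableMap are the match_id values reported back in each MatchResult, as in
185 | # the README example where tables registered under keys 1 and 2 produce results
186 | # with match_id 1 and 2 respectively.
187 | 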
188 | MatchTableMap = Dict[int, List[MatchTable]]
189 | """
190 | A type alias for mapping match identifiers to lists of MatchTable objects.
191 | 
192 | Type:
193 |     Dict[int, List[MatchTable]]
194 | 
195 | This dictionary maps an integer match ID to the list of MatchTable objects grouped under it. It is used to
196 | organize and retrieve match tables based on the match identifier reported in each MatchResult.
197 | """
198 | 
199 | 
200 | class MatchResult(TypedDict):
201 |     """
202 |     A TypedDict representing the result of a matching operation.
203 | 
204 |     Attributes:
205 |         match_id (int): The identifier of the match group (the MatchTableMap key) that produced this result.
206 |         table_id (int): The identifier of the match table where the matching operation was performed.
207 |         word_id (int): The identifier of the matched word within the word list.
208 |         word (str): The matched word.
209 |         similarity (float): The similarity score of the match operation.
210 |     """
211 | 
212 |     match_id: int
213 |     table_id: int
214 |     word_id: int
215 |     word: str
216 |     similarity: float
217 | 
218 | 
219 | SimpleTable = Dict[ProcessType, Dict[int, str]]
220 | """
221 | A type alias for representing a simple table structure for text processing.
222 | 
223 | This dictionary maps a `ProcessType` to another dictionary that maps an integer ID to a string.
224 | The outer dictionary's keys represent different types of processing operations, while the inner
225 | dictionary's keys represent unique identifiers corresponding to specific strings related to the
226 | operations.
227 | 
228 | Type:
229 |     Dict[ProcessType, Dict[int, str]]
230 | """
231 | 
232 | 
233 | class SimpleResult(TypedDict):
234 |     """
235 |     A TypedDict representing a simplified result of a text processing operation.
236 | 
237 |     Attributes:
238 |         word_id (int): The identifier of the word within the word list.
239 |         word (str): The word corresponding to the word_id.
240 |     """
241 | 
242 |     word_id: int
243 |     word: str
244 | 
-------------------------------------------------------------------------------- /matcher_py/python/matcher_py/matcher_py.pyi: --------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple
2 | from .extension_types import SimpleResult, MatchResult
3 | 
4 | def text_process(process_type: int, text: str) -> str:
5 |     """
6 |     Processes the given text based on the specified process type.
7 | 
8 |     Parameters:
9 |     - process_type (int): An integer indicating the type of process to be applied to the text.
10 |     - text (str): The text string that is to be processed.
11 | 
12 |     Returns:
13 |     - str: The text string after processing.
14 |     """
15 |     ...
16 | 
17 | def reduce_text_process(process_type: int, text: str) -> List[str]:
18 |     """
19 |     Reduces the given text based on the specified process type and returns a list of strings.
20 | 
21 |     Parameters:
22 |     - process_type (int): An integer indicating the type of process to be applied to the text.
23 |     - text (str): The text string that is to be reduced.
24 | 
25 |     Returns:
26 |     - List[str]: A list of strings after the reduction process.
27 |     """
28 |     ...
29 | 
30 | class Matcher:
31 |     """
32 |     A class used to perform various matching operations using a given set of match table map bytes.
33 | 
34 |     Methods:
35 |     - __init__(self, match_table_map_bytes: bytes) -> None:
36 |         Initializes the Matcher with the provided match table map bytes.
37 |     - __getnewargs__(self) -> Tuple[bytes]:
38 |         Returns the constructor arguments, as a one-element tuple, needed to create a new instance of the Matcher.
39 |     - __getstate__(self) -> bytes:
40 |         Returns the state of the Matcher, typically used for pickling.
41 |     - __setstate__(self, match_table_map_bytes: bytes):
42 |         Sets the state of the Matcher from the provided match table map bytes, typically used for unpickling.
43 |     - is_match(self, text: str) -> bool:
44 |         Checks whether the given text matches any patterns in the match table map.
45 |     - process(self, text: str) -> List[MatchResult]:
46 |         Processes the given text and returns a list of MatchResult objects corresponding to the matches found.
47 |     - word_match(self, text: str) -> Dict[int, List[MatchResult]]:
48 |         Performs a word-level match on the given text and returns a dictionary where the keys are match IDs and the values are lists of MatchResult objects.
49 |     - word_match_as_string(self, text: str) -> str:
50 |         Performs a word-level match on the given text and returns a string representation of the matches found.
51 |     """
52 |     def __init__(self, match_table_map_bytes: bytes) -> None: ...
53 |     def __getnewargs__(self) -> Tuple[bytes]: ...
54 |     def __getstate__(self) -> bytes: ...
55 |     def __setstate__(self, match_table_map_bytes: bytes): ...
56 |     def is_match(self, text: str) -> bool: ...
57 |     def process(self, text: str) -> List[MatchResult]: ...
58 |     def word_match(self, text: str) -> Dict[int, List[MatchResult]]: ...
59 |     def word_match_as_string(self, text: str) -> str: ...
60 | 
61 | class SimpleMatcher:
62 |     """
63 |     A class used to perform simplified matching operations using a provided set of simple table bytes.
64 | 
65 |     Methods:
66 |     - __init__(self, simple_table_bytes: bytes) -> None:
67 |         Initializes the SimpleMatcher with the provided simple table bytes.
68 |     - __getnewargs__(self) -> Tuple[bytes]:
69 |         Returns the constructor arguments, as a one-element tuple, needed to create a new instance of the SimpleMatcher.
70 |     - __getstate__(self) -> bytes:
71 |         Returns the state of the SimpleMatcher, typically used for pickling.
72 |     - __setstate__(self, simple_table_bytes: bytes):
73 |         Sets the state of the SimpleMatcher from the provided simple table bytes, typically used for unpickling.
74 |     - is_match(self, text: str) -> bool:
75 |         Checks whether the given text matches any patterns in the simple table.
76 |     - process(self, text: str) -> List[SimpleResult]:
77 |         Processes the given text and returns a list of SimpleResult objects corresponding to the matches found.
78 |     """
79 |     def __init__(self, simple_table_bytes: bytes) -> None: ...
80 |     def __getnewargs__(self) -> Tuple[bytes]: ...
81 |     def __getstate__(self) -> bytes: ...
82 |     def __setstate__(self, simple_table_bytes: bytes): ...
83 |     def is_match(self, text: str) -> bool: ...
84 |     def process(self, text: str) -> List[SimpleResult]: ...
85 | 
-------------------------------------------------------------------------------- /matcher_py/python/matcher_py/py.typed: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lips7/Matcher/1371557dcf89de31003afedf2d85de5db87faa8d/matcher_py/python/matcher_py/py.typed
-------------------------------------------------------------------------------- /matcher_py/test/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lips7/Matcher/1371557dcf89de31003afedf2d85de5db87faa8d/matcher_py/test/__init__.py
-------------------------------------------------------------------------------- /matcher_py/test/test_matcher.py: --------------------------------------------------------------------------------
1 | import json
2 | import pytest
3 | 
4 | from matcher_py.matcher_py import Matcher
5 | from matcher_py.extension_types import (
6 |     ProcessType,
7 |     MatchTable,
8 |     MatchTableType,
9 |     RegexMatchType,
10 |     SimMatchType,
11 | )
12 | 
13 | 
14 | def test_init_with_non_bytes():
15 |     # Each payload must raise on its own; a single `with` block would only
16 |     # exercise the first call.
17 |     for non_bytes in (1, "", [], {}):
18 |         with pytest.raises(TypeError):
19 |             Matcher(non_bytes)
20 | 
21 | 
22 | def test_init_with_invalid_bytes():
23 |     # Every payload below should raise ValueError on its own. b"{}" is
24 |     # deliberately absent: an empty top-level map is valid input, as
25 |     # test_init_with_empty_map below verifies.
26 |     for invalid_bytes in (b"", b"123", b"invalid", b"[]"):
27 |         with pytest.raises(ValueError):
28 |             Matcher(invalid_bytes)
29 | 
30 | 
31 | def test_init_with_empty_map():
32 |     Matcher(json.dumps({}).encode())
33 |     Matcher(json.dumps({1: []}).encode())
34 |     Matcher(
35 |         json.dumps(
36 |             {
37 |                 1: [
38 |                     MatchTable(
39 |                         table_id=1,
40 |                         match_table_type=MatchTableType.Simple(
41 |                             process_type=ProcessType.MatchNone
42 |                         ),
43 |                         word_list=[],
44 |                         exemption_process_type=ProcessType.MatchNone,
45 |                         exemption_word_list=[],
46 |                     )
47 |                 ]
48 |             }
49 |         ).encode()
50 |     )
51 | 
52 | 
53 | def test_init_with_invalid_map():
54 |     # Non-integer keys cannot be deserialized into a MatchTableMap.
55 |     for invalid_map in ({"a": 1}, {"a": {"b": 1}}, {"c": {}}):
56 |         with pytest.raises(ValueError):
57 |             Matcher(json.dumps(invalid_map).encode())
58 | 
59 | 
60 | def test_regex():
61 |     matcher = Matcher(
62 |         json.dumps(
63 |             {
64 |                 1: [
65 |                     MatchTable(
66 |                         table_id=1,
67 |                         match_table_type=MatchTableType.Regex(
68 |                             process_type=ProcessType.MatchNone,
69 |                             regex_match_type=RegexMatchType.MatchRegex,
70 |                         ),
71 |                         word_list=["h[aeiou]llo", "w[aeiou]rd"],
72 |                         exemption_process_type=ProcessType.MatchNone,
73 |                         exemption_word_list=[],
74 |                     )
75 |                 ]
76 |             }
77 |         ).encode()
78 |     )
79 |     assert matcher.is_match("hallo")
80 |     assert matcher.is_match("ward")
81 |     assert matcher.word_match("hallo")[1][0]["table_id"] == 1
82 |     assert matcher.word_match("hallo")[1][0]["word"] == "h[aeiou]llo"
83 | 
84 | 
85 | def test_similar_char():
86 |     matcher = Matcher(
87 |         json.dumps(
88 |             {
89 |                 1: [
90 |                     MatchTable(
91 |                         table_id=1,
92 |                         match_table_type=MatchTableType.Regex(
93 |                             process_type=ProcessType.MatchNone,
94 |                             regex_match_type=RegexMatchType.MatchSimilarChar,
95 |                         ),
96 |                         word_list=["hello,hi,H,你好", "world,word,🌍,世界"],
97 |                         exemption_process_type=ProcessType.MatchNone,
98 |                         exemption_word_list=[],
99 |                     )
100 |                 ]
101 |             }
102 |         ).encode()
103 |     )
104 |     assert matcher.is_match("helloworld")
105 |     assert matcher.is_match("hi世界")
106 |     assert matcher.word_match("helloworld")[1][0]["table_id"] == 1
107 |     assert matcher.word_match("helloworld")[1][0]["word"] == "helloworld"
108 | 
109 | 
110 | def test_similar_text_levenshtein():
111 |     matcher = Matcher(
112 |         json.dumps(
113 |             {
114 |                 1: [
115 |                     MatchTable(
116 |                         table_id=1,
117 |                         match_table_type=MatchTableType.Similar(
118 |                             process_type=ProcessType.MatchNone,
119 |                             sim_match_type=SimMatchType.MatchLevenshtein,
120 |                             threshold=0.8,
121 |                         ),
122 |                         word_list=["helloworld"],
123 |                         exemption_process_type=ProcessType.MatchNone,
124 |                         exemption_word_list=[],
125 |                     )
126 |                 ]
127 |             }
128 |         ).encode()
129 |     )
130 |     assert matcher.is_match("helloworl")
131 |     assert matcher.is_match("halloworld")
132 |     assert matcher.is_match("ha1loworld")
133 |     assert not matcher.is_match("ha1loworld1")
134 |     assert matcher.word_match("helloworl")[1][0]["table_id"] == 1
135 |     assert matcher.word_match("helloworl")[1][0]["word"] == "helloworld"
136 | 
137 | 
138 | def test_acrostic():
139 |     matcher = Matcher(
140 |         json.dumps(
141 |             {
142 |                 1: [
143 |                     MatchTable(
144 |                         table_id=1,
145 |                         match_table_type=MatchTableType.Regex(
146 |                             process_type=ProcessType.MatchNone,
147 |                             regex_match_type=RegexMatchType.MatchAcrostic,
148 |                         ),
149 |                         word_list=["h,e,l,l,o", "你,好"],
150 |                         exemption_process_type=ProcessType.MatchNone,
151 |                         exemption_word_list=[],
152 |                     )
153 |                 ]
154 |             }
155 |         ).encode()
156 |     )
157 |     assert matcher.is_match("hope, endures, love, lasts, onward.")
158 |     assert matcher.is_match(
159 |         "Happy moments shared, Every smile and laugh, Love in every word, Lighting up our paths, Open hearts we show."
160 |     )
161 |     assert matcher.is_match("你的笑容温暖, 好心情常伴。")
162 |     assert not matcher.is_match("你好")
163 |     assert (
164 |         matcher.word_match("hope, endures, love, lasts, onward.")[1][0]["word"]
165 |         == "h,e,l,l,o"
166 |     )
167 |     assert matcher.word_match("你的笑容温暖, 好心情常伴。")[1][0]["word"] == "你,好"
168 | 
169 | 
170 | def test_exemption():
171 |     matcher = Matcher(
172 |         json.dumps(
173 |             {
174 |                 1: [
175 |                     MatchTable(
176 |                         table_id=1,
177 |                         match_table_type=MatchTableType.Simple(
178 |                             process_type=ProcessType.MatchNone
179 |                         ),
180 |                         word_list=["helloworld"],
181 |                         exemption_process_type=ProcessType.MatchNone,
182 |                         exemption_word_list=["worldwide"],
183 |                     )
184 |                 ]
185 |             }
186 |         ).encode()
187 |     )
188 |     assert matcher.is_match("helloworld")
189 |     assert not matcher.is_match("helloworldwide")
190 | 
191 |     matcher = Matcher(
192 |         json.dumps(
193 |             {
194 |                 1: [
195 |                     MatchTable(
196 |                         table_id=1,
197 |                         match_table_type=MatchTableType.Simple(
198 |                             process_type=ProcessType.MatchNone
199 |                         ),
200 |                         word_list=["helloworld"],
201 |                         exemption_process_type=ProcessType.MatchNone,
202 |                         exemption_word_list=["worldwide"],
203 |                     ),
204 |                     MatchTable(
205 |                         table_id=2,
206 |                         match_table_type=MatchTableType.Regex(
207 |                             process_type=ProcessType.MatchNone,
208 |                             regex_match_type=RegexMatchType.MatchRegex,
209 |                         ),
210 |                         word_list=["hello"],
211 |                         exemption_process_type=ProcessType.MatchNone,
212 |                         exemption_word_list=["worldwide"],
213 |                     ),
214 |                 ]
215 |             }
216 |         ).encode()
217 |     )
218 |     assert matcher.is_match("helloworld")
219 |     assert not matcher.is_match("helloworldwide")
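220 | 
221 | 
222 | # Hypothetical extra check, not part of the original test suite: combined
223 | # process types compose the primitive transformations, so MatchDeleteNormalize
224 | # should match a word through deleted whitespace plus normalizable variants
225 | # (cf. test_delete and test_normalize in test_simple_matcher.py).
226 | def test_delete_normalize_sketch():
227 |     matcher = Matcher(
228 |         json.dumps(
229 |             {
230 |                 1: [
231 |                     MatchTable(
232 |                         table_id=1,
233 |                         match_table_type=MatchTableType.Simple(
234 |                             process_type=ProcessType.MatchDeleteNormalize
235 |                         ),
236 |                         word_list=["he11o"],
237 |                         exemption_process_type=ProcessType.MatchNone,
238 |                         exemption_word_list=[],
239 |                     )
240 |                 ]
241 |             }
242 |         ).encode()
243 |     )
244 |     assert matcher.is_match("ℋ Ѐ ⒈ ㈠ Õ")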
SimpleMatcher(b"") 19 | SimpleMatcher(b"123") 20 | SimpleMatcher(b"invalid") 21 | SimpleMatcher(b"[]") 22 | SimpleMatcher(b"{}") 23 | 24 | 25 | def test_init_with_empty_map(): 26 | SimpleMatcher(json.dumps({}).encode()) 27 | SimpleMatcher(json.dumps({1: {}}).encode()) 28 | 29 | 30 | def test_init_with_invalid_map(): 31 | with pytest.raises(ValueError): 32 | SimpleMatcher(json.dumps({"a": 1}).encode()) 33 | SimpleMatcher(json.dumps({"a": {"b": 1}}).encode()) 34 | SimpleMatcher(json.dumps({1: []}).encode()) 35 | 36 | 37 | def test_backslashes(): 38 | simple_matcher = SimpleMatcher( 39 | json.dumps({ProcessType.MatchNone: {1: r"It's /\/\y duty"}}).encode() 40 | ) 41 | assert simple_matcher.is_match(r"It's /\/\y duty") 42 | assert simple_matcher.process(r"It's /\/\y duty")[0]["word"] == r"It's /\/\y duty" 43 | 44 | 45 | def test_fanjian(): 46 | simple_matcher = SimpleMatcher( 47 | json.dumps({ProcessType.MatchFanjian: {1: "你好"}}).encode() 48 | ) 49 | assert simple_matcher.is_match("妳好") 50 | assert simple_matcher.process("你好")[0]["word_id"] == 1 51 | assert simple_matcher.process("你好")[0]["word"] == "你好" 52 | 53 | simple_matcher = SimpleMatcher( 54 | json.dumps({ProcessType.MatchFanjian: {1: "妳好"}}).encode() 55 | ) 56 | assert simple_matcher.is_match("你好") 57 | assert simple_matcher.process("你好")[0]["word_id"] == 1 58 | assert simple_matcher.process("你好")[0]["word"] == "妳好" 59 | 60 | 61 | def test_delete(): 62 | simple_matcher = SimpleMatcher( 63 | json.dumps({ProcessType.MatchDelete: {1: "你好"}}).encode() 64 | ) 65 | assert simple_matcher.is_match("你!好") 66 | assert len(simple_matcher.process("你!好")) == 1 67 | 68 | 69 | def test_normalize(): 70 | simple_matcher = SimpleMatcher( 71 | json.dumps( 72 | { 73 | ProcessType.MatchNormalize: { 74 | 1: "he11o", 75 | } 76 | } 77 | ).encode() 78 | ) 79 | assert simple_matcher.is_match("ℋЀ⒈㈠Õ") 80 | assert simple_matcher.process("ℋЀ⒈㈠Õ")[0]["word_id"] == 1 81 | assert simple_matcher.process("ℋЀ⒈㈠Õ")[0]["word"] == "he11o" 82 | 83 | 84 | def test_pinyin(): 85 | simple_matcher = SimpleMatcher( 86 | json.dumps( 87 | { 88 | ProcessType.MatchPinYin: { 89 | 1: "西安", 90 | } 91 | } 92 | ).encode() 93 | ) 94 | assert simple_matcher.is_match("洗按") 95 | assert not simple_matcher.is_match("现") 96 | 97 | 98 | def test_pinyinchar(): 99 | simple_matcher = SimpleMatcher( 100 | json.dumps( 101 | { 102 | ProcessType.MatchPinYinChar: { 103 | 1: "西安", 104 | } 105 | } 106 | ).encode() 107 | ) 108 | assert simple_matcher.is_match("洗按") 109 | assert simple_matcher.is_match("现") 110 | assert simple_matcher.is_match("xian") 111 | -------------------------------------------------------------------------------- /matcher_py/uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | revision = 1 3 | requires-python = ">=3.8" 4 | 5 | [[package]] 6 | name = "colorama" 7 | version = "0.4.6" 8 | source = { registry = "https://pypi.org/simple" } 9 | sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } 10 | wheels = [ 11 | { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, 12 | ] 13 | 14 | [[package]] 15 | name = "exceptiongroup" 16 | version = "1.2.2" 17 
| source = { registry = "https://pypi.org/simple" } 18 | sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } 19 | wheels = [ 20 | { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, 21 | ] 22 | 23 | [[package]] 24 | name = "iniconfig" 25 | version = "2.1.0" 26 | source = { registry = "https://pypi.org/simple" } 27 | sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 } 28 | wheels = [ 29 | { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 }, 30 | ] 31 | 32 | [[package]] 33 | name = "matcher-py" 34 | version = "0.5.7" 35 | source = { editable = "." } 36 | 37 | [package.dev-dependencies] 38 | dev = [ 39 | { name = "pip" }, 40 | { name = "pytest" }, 41 | ] 42 | 43 | [package.metadata] 44 | 45 | [package.metadata.requires-dev] 46 | dev = [ 47 | { name = "pip" }, 48 | { name = "pytest" }, 49 | ] 50 | 51 | [[package]] 52 | name = "packaging" 53 | version = "24.2" 54 | source = { registry = "https://pypi.org/simple" } 55 | sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } 56 | wheels = [ 57 | { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, 58 | ] 59 | 60 | [[package]] 61 | name = "pip" 62 | version = "25.0.1" 63 | source = { registry = "https://pypi.org/simple" } 64 | sdist = { url = "https://files.pythonhosted.org/packages/70/53/b309b4a497b09655cb7e07088966881a57d082f48ac3cb54ea729fd2c6cf/pip-25.0.1.tar.gz", hash = "sha256:88f96547ea48b940a3a385494e181e29fb8637898f88d88737c5049780f196ea", size = 1950850 } 65 | wheels = [ 66 | { url = "https://files.pythonhosted.org/packages/c9/bc/b7db44f5f39f9d0494071bddae6880eb645970366d0a200022a1a93d57f5/pip-25.0.1-py3-none-any.whl", hash = "sha256:c46efd13b6aa8279f33f2864459c8ce587ea6a1a59ee20de055868d8f7688f7f", size = 1841526 }, 67 | ] 68 | 69 | [[package]] 70 | name = "pluggy" 71 | version = "1.5.0" 72 | source = { registry = "https://pypi.org/simple" } 73 | sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } 74 | wheels = [ 75 | { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, 76 | ] 77 | 78 | [[package]] 79 | name 
= "pytest" 80 | version = "8.3.5" 81 | source = { registry = "https://pypi.org/simple" } 82 | dependencies = [ 83 | { name = "colorama", marker = "sys_platform == 'win32'" }, 84 | { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, 85 | { name = "iniconfig" }, 86 | { name = "packaging" }, 87 | { name = "pluggy" }, 88 | { name = "tomli", marker = "python_full_version < '3.11'" }, 89 | ] 90 | sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 } 91 | wheels = [ 92 | { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 }, 93 | ] 94 | 95 | [[package]] 96 | name = "tomli" 97 | version = "2.2.1" 98 | source = { registry = "https://pypi.org/simple" } 99 | sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } 100 | wheels = [ 101 | { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 }, 102 | { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 }, 103 | { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 }, 104 | { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 }, 105 | { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 }, 106 | { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 }, 107 | { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 }, 108 | { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 }, 109 | { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 }, 110 | { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 }, 111 | { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 }, 112 | { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 }, 113 | { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 }, 114 | { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 }, 115 | { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 }, 116 | { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 }, 117 | { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 }, 118 | { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 }, 119 | { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 }, 120 | { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 }, 121 | { url = 
"https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708 }, 122 | { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582 }, 123 | { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543 }, 124 | { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691 }, 125 | { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170 }, 126 | { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530 }, 127 | { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666 }, 128 | { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954 }, 129 | { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724 }, 130 | { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383 }, 131 | { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, 132 | ] 133 | -------------------------------------------------------------------------------- /matcher_rs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "matcher_rs" 3 | authors.workspace = true 4 | categories.workspace = true 5 | description.workspace = true 6 | edition.workspace = true 7 | homepage.workspace = true 8 | keywords.workspace = true 9 | license.workspace = true 10 | repository.workspace = true 11 | version.workspace = true 12 | readme = 
"README.md" 13 | documentation = "https://docs.rs/crate/matcher_rs/latest" 14 | build = "build.rs" 15 | 16 | [package.metadata.docs.rs] 17 | rustc-args = ["-C", "target-feature=native"] 18 | rustdoc-args = ["-C", "target-feature=native", "--document-private-items"] 19 | 20 | [lib] 21 | name = "matcher_rs" 22 | crate-type = ["rlib"] 23 | 24 | [dependencies] 25 | # aho-corasick = "1.1.3" 26 | aho-corasick-unsafe = { version = "0.0.4", git = "https://github.com/Lips7/aho-corasick" } 27 | bitflags = "2.9.1" 28 | daachorse = "1.0.0" 29 | fancy-regex = "0.14.0" 30 | id-set = "0.2.2" 31 | lazy_static = "1.5.0" 32 | micromap = "0.1.0" 33 | nohash-hasher = "0.2.0" 34 | parking_lot = { version = "0.12.4", features = ["hardware-lock-elision"]} 35 | rapidfuzz = "0.5.0" 36 | regex = { version = "1.11.1", features = ["perf-dfa-full"] } 37 | rustc-hash = "2.1.1" 38 | serde = { version = "1.0.219", features = ["derive"] } 39 | tinyvec = { version = "1.9.0", features = ["serde"] } 40 | 41 | [target.'cfg(all(target_os = "linux", target_arch = "aarch64"))'.dependencies] 42 | tikv-jemallocator = "0.6.0" 43 | 44 | [target.'cfg(not(all(target_os = "linux", target_arch = "aarch64")))'.dependencies] 45 | mimalloc = "0.1.46" 46 | 47 | [build-dependencies] 48 | daachorse = "1.0.0" 49 | 50 | [dev-dependencies] 51 | divan = "0.1.21" 52 | fastrand = "2.3.0" 53 | 54 | [features] 55 | default = ["dfa"] 56 | # By enable runtime_build feature, we could build process matcher at runtime, but with build time increasing. 57 | runtime_build = [] 58 | # By enable serde feature, we could serialize and deserialize matcher and simple_matcher. 59 | # With serde feature, AhoCorasick's prefilter is disabled, because I don't know how to serialize it correctly, 60 | # which will lead to performance regression when the patterns size is small (say, less than 100). 61 | serde = ["aho-corasick-unsafe/serde"] 62 | # By enable dfa feature, we could use dfa to perform simple matching, but with significantly increasing memory consumption. 63 | dfa = [] 64 | 65 | [[bench]] 66 | name = "bench" 67 | harness = false 68 | -------------------------------------------------------------------------------- /matcher_rs/build.rs: -------------------------------------------------------------------------------- 1 | use std::io::Result; 2 | 3 | /// The `main` function serves as the build script for a Rust project, responsible for 4 | /// generating binary data files used in text conversion and matching tasks. 5 | /// Depending on the features enabled, it reads specific conversion mappings from 6 | /// text files, processes them, and writes them to binary files. 7 | /// 8 | /// It comprises several key steps: 9 | /// 10 | /// 1. Print instructions to re-run build script if specific files change. 11 | /// 2. Conditionally process text conversion data only if 'runtime_build' feature is not enabled. 12 | /// 3. Load text content from files in the 'process_map' directory into constants like FANJIAN, NUM_NORM, NORM, and PINYIN. 13 | /// 4. For each mapping type ('fanjian', 'normalize', 'pinyin'): 14 | /// - Aggregate conversion mappings from loaded constants into a HashMap. 15 | /// - Clean the HashMap by removing identity mappings. 16 | /// - Create binary files containing the list of strings to match and the list of corresponding replacements. 17 | /// - For 'pinyin': 18 | /// - Also create a binary file with trimmed replacements. 
19 | /// - For specified mappings ('fanjian', 'pinyin'): 20 | /// - Use the `daachorse` crate to build and serialize a CharwiseDoubleArrayAhoCorasick matcher, and write it to a binary file. 21 | /// - For 'normalize', when DFA feature is not enabled: 22 | /// - Similarly, build a matcher with a different match kind and serialize it. 23 | /// 5. Additionally, if 'dfa' feature is not enabled: 24 | /// - Load delete and whitespace character patterns from TEXT_DELETE constant and WHITE_SPACE array respectively. 25 | /// - Aggregate these patterns into a HashSet to remove duplicates. 26 | /// - Write these patterns to a binary file. 27 | /// - Build a matcher for these patterns, serialize it, and write it to a binary file. 28 | /// 29 | /// The function completes by returning `Ok(())` to indicate successful completion of the build script. 30 | fn main() -> Result<()> { 31 | println!("cargo:rerun-if-changed=build.rs"); 32 | println!("cargo:rerun-if-changed=process_map"); 33 | 34 | #[cfg(not(feature = "runtime_build"))] 35 | { 36 | use std::collections::HashMap; 37 | use std::env; 38 | use std::fs::File; 39 | use std::io::Write; 40 | 41 | use daachorse::{ 42 | CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder, 43 | MatchKind as DoubleArrayAhoCorasickMatchKind, 44 | }; 45 | 46 | /// These constants include the contents of their respective text files 47 | /// from the `process_map` directory. Each constant refers to a specific 48 | /// text conversion mapping used within the project. The text files 49 | /// contain tab-separated values, where each line represents a pair of 50 | /// strings that define a specific conversion. 51 | /// 52 | /// - `FANJIAN` includes simplified and traditional Chinese character mappings. 53 | /// - `NUM_NORM` includes mappings for normalizing numbers. 54 | /// - `NORM` includes mappings for various normalization forms. 55 | /// - `PINYIN` includes mappings for converting characters to Pinyin. 
56 |         const FANJIAN: &str = include_str!("./process_map/FANJIAN.txt");
57 |         const NUM_NORM: &str = include_str!("./process_map/NUM-NORM.txt");
58 |         const NORM: &str = include_str!("./process_map/NORM.txt");
59 |         const PINYIN: &str = include_str!("./process_map/PINYIN.txt");
60 | 
61 |         let out_dir = env::var("OUT_DIR").unwrap();
62 |         let process_str_map = HashMap::from([
63 |             ("fanjian", vec![FANJIAN]),
64 |             ("normalize", vec![NORM, NUM_NORM]),
65 |             ("pinyin", vec![PINYIN]),
66 |         ]);
67 | 
68 |         for process_type_bit_str in ["fanjian", "normalize", "pinyin"] {
69 |             let mut process_dict = HashMap::new();
70 | 
71 |             for process_map in process_str_map.get(process_type_bit_str).unwrap() {
72 |                 process_dict.extend(process_map.trim().lines().map(|pair_str| {
73 |                     let mut pair_str_split = pair_str.split('\t');
74 |                     (
75 |                         pair_str_split.next().unwrap(),
76 |                         pair_str_split.next().unwrap(),
77 |                     )
78 |                 }))
79 |             }
80 | 
81 |             process_dict.retain(|&key, &mut value| key != value);
82 |             let process_list = process_dict
83 |                 .iter()
84 |                 .map(|(&key, _)| key)
85 |                 .collect::<Vec<_>>();
86 | 
87 |             let mut process_list_bin =
88 |                 File::create(format!("{out_dir}/{process_type_bit_str}_process_list.bin"))?;
89 |             process_list_bin.write_all(process_list.join("\n").as_bytes())?;
90 | 
91 |             let process_replace_list = process_dict
92 |                 .iter()
93 |                 .map(|(_, &val)| val)
94 |                 .collect::<Vec<_>>();
95 |             let mut process_replace_list_bin = File::create(format!(
96 |                 "{out_dir}/{process_type_bit_str}_process_replace_list.bin"
97 |             ))?;
98 |             process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?;
99 | 
100 |             if process_type_bit_str == "pinyin" {
101 |                 let process_replace_list = process_dict
102 |                     .iter()
103 |                     .map(|(_, &val)| val.trim_matches(' '))
104 |                     .collect::<Vec<_>>();
105 |                 let mut process_replace_list_bin =
106 |                     File::create(format!("{out_dir}/pinyinchar_process_replace_list.bin"))?;
107 |                 process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?;
108 |             }
109 | 
110 |             if ["fanjian", "pinyin"].contains(&process_type_bit_str) {
111 |                 let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
112 |                     CharwiseDoubleArrayAhoCorasickBuilder::new()
113 |                         .match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
114 |                         .build(&process_list)
115 |                         .unwrap();
116 |                 let matcher_bytes = matcher.serialize();
117 |                 let mut matcher_bin = File::create(format!(
118 |                     "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin"
119 |                 ))?;
120 |                 matcher_bin.write_all(&matcher_bytes)?;
121 |             }
122 | 
123 |             #[cfg(not(feature = "dfa"))]
124 |             if process_type_bit_str == "normalize" {
125 |                 let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
126 |                     CharwiseDoubleArrayAhoCorasickBuilder::new()
127 |                         .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
128 |                         .build(&process_list)
129 |                         .unwrap();
130 |                 let matcher_bytes = matcher.serialize();
131 |                 let mut matcher_bin = File::create(format!(
132 |                     "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin"
133 |                 ))?;
134 |                 matcher_bin.write_all(&matcher_bytes)?;
135 |             }
136 |         }
137 | 
138 |         #[cfg(not(feature = "dfa"))]
139 |         {
140 |             use std::collections::HashSet;
141 | 
142 |             /// These constants define deletion and whitespace character mappings
143 |             /// that are used within the project. The `TEXT_DELETE` constant
144 |             /// includes contents from the `TEXT-DELETE.txt` file in the `process_map`
145 |             /// directory, which contains textual patterns to be deleted.
146 |             /// The `WHITE_SPACE` constant includes various Unicode whitespace
147 |             /// characters that are treated as whitespace in the project's text
148 |             /// processing logic.
149 |             ///
150 |             /// - `TEXT_DELETE` includes patterns of text identified for deletion.
151 |             /// - `WHITE_SPACE` includes numerous Unicode representations of whitespace.
152 |             const TEXT_DELETE: &str = include_str!("./process_map/TEXT-DELETE.txt");
153 |             const WHITE_SPACE: &[&str] = &[
154 |                 "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}",
155 |                 "\u{00A0}", "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}",
156 |                 "\u{2005}", "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}",
157 |                 "\u{200F}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}", "\u{3000}",
158 |             ];
159 | 
160 |             let mut process_set = HashSet::new();
161 | 
162 |             process_set.extend(TEXT_DELETE.trim().lines());
163 |             process_set.extend(WHITE_SPACE);
164 | 
165 |             let process_list = process_set.iter().copied().collect::<Vec<_>>();
166 | 
167 |             let mut process_list_bin = File::create(format!("{out_dir}/delete_process_list.bin"))?;
168 |             process_list_bin.write_all(process_list.join("\n").as_bytes())?;
169 | 
170 |             let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
171 |                 CharwiseDoubleArrayAhoCorasickBuilder::new()
172 |                     .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
173 |                     .build(&process_list)
174 |                     .unwrap();
175 |             let matcher_bytes = matcher.serialize();
176 |             let mut matcher_bin = File::create(format!(
177 |                 "{out_dir}/delete_daachorse_charwise_u32_matcher.bin"
178 |             ))?;
179 |             matcher_bin.write_all(&matcher_bytes)?;
180 |         }
181 |     }
182 | 
183 |     Ok(())
184 | }
185 | 
-------------------------------------------------------------------------------- /matcher_rs/src/lib.rs: --------------------------------------------------------------------------------
1 | #[cfg(all(target_os = "linux", target_arch = "aarch64"))]
2 | #[global_allocator]
3 | static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
4 | 
5 | #[cfg(not(all(target_os = "linux", target_arch = "aarch64")))]
6 | #[global_allocator]
7 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
8 | 
9 | mod util;
10 | pub use util::word::SimpleWord;
11 | 
12 | mod process;
13 | pub use process::process_matcher::{
14 |     build_process_type_tree, get_process_matcher, reduce_text_process, reduce_text_process_emit,
15 |     reduce_text_process_with_set, reduce_text_process_with_tree, text_process, ProcessType,
16 | };
17 | 
18 | mod simple_matcher;
19 | pub use simple_matcher::{SimpleMatcher, SimpleResult, SimpleTable, SimpleTableSerde};
20 | 
21 | mod regex_matcher;
22 | pub use regex_matcher::{RegexMatchType, RegexMatcher, RegexResult, RegexTable};
23 | 
24 | mod sim_matcher;
25 | pub use sim_matcher::{SimMatchType, SimMatcher, SimResult, SimTable};
26 | 
27 | mod matcher;
28 | pub use matcher::{
29 |     MatchResult, MatchResultTrait, MatchTable, MatchTableMap, MatchTableMapSerde, MatchTableType,
30 |     Matcher, TextMatcherTrait,
31 | };
-------------------------------------------------------------------------------- /matcher_rs/src/process/constants.rs: --------------------------------------------------------------------------------
1 | //! This module defines several constants for processing and normalization of text data,
2 | //! including definitions for whitespace characters, conditional includes for files,
3 | //! and configurations for runtime build and DFA (Deterministic Finite Automaton) features.
4 | 
5 | /// These constants are conditionally included when the `runtime_build` feature is enabled.
6 | /// They embed the contents of various text processing maps used for normalization and replacement.
7 | ///
8 | /// - `FANJIAN`: Maps traditional Chinese characters to simplified Chinese characters.
9 | /// - `TEXT_DELETE`: Defines text segments that should be removed during preprocessing.
10 | /// - `NUM_NORM`: Specifies numeric normalization rules.
11 | /// - `NORM`: Contains general normalization rules.
12 | /// - `PINYIN`: Provides mappings for converting Chinese characters to Pinyin.
13 | #[cfg(feature = "runtime_build")]
14 | pub const FANJIAN: &str = include_str!("../../process_map/FANJIAN.txt");
15 | #[cfg(feature = "runtime_build")]
16 | pub const TEXT_DELETE: &str = include_str!("../../process_map/TEXT-DELETE.txt");
17 | #[cfg(feature = "runtime_build")]
18 | pub const NUM_NORM: &str = include_str!("../../process_map/NUM-NORM.txt");
19 | #[cfg(feature = "runtime_build")]
20 | pub const NORM: &str = include_str!("../../process_map/NORM.txt");
21 | #[cfg(feature = "runtime_build")]
22 | pub const PINYIN: &str = include_str!("../../process_map/PINYIN.txt");
23 | 
24 | /// These constants are for normalization processing and are included based on different
25 | /// feature flags.
26 | ///
27 | /// When the `runtime_build` feature is not enabled and the `dfa` feature is enabled,
28 | /// `NORMALIZE_PROCESS_LIST_STR` is included. This constant embeds the
29 | /// normalization process list, which is generated at build time.
30 | ///
31 | /// When `runtime_build` is not enabled and the `dfa` feature is not enabled,
32 | /// `NORMALIZE_PROCESS_MATCHER_BYTES` is included. This constant embeds the serialized
33 | /// normalization matcher, which is also generated during the build process.
34 | ///
35 | /// Additionally, `NORMALIZE_PROCESS_REPLACE_LIST_STR` is included when `runtime_build`
36 | /// is not enabled. This constant embeds the normalization replace list,
37 | /// used for text replacement operations during normalization.
38 | #[cfg(all(not(feature = "runtime_build"), feature = "dfa"))]
39 | pub const NORMALIZE_PROCESS_LIST_STR: &str =
40 |     include_str!(concat!(env!("OUT_DIR"), "/normalize_process_list.bin"));
41 | #[cfg(all(not(feature = "runtime_build"), not(feature = "dfa")))]
42 | pub const NORMALIZE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!(
43 |     env!("OUT_DIR"),
44 |     "/normalize_daachorse_charwise_u32_matcher.bin"
45 | ));
46 | #[cfg(not(feature = "runtime_build"))]
47 | pub const NORMALIZE_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!(
48 |     env!("OUT_DIR"),
49 |     "/normalize_process_replace_list.bin"
50 | ));
51 | 
52 | /// These constants are related to Fanjian (simplified vs traditional Chinese conversion)
53 | /// processing and are included based on feature flags.
54 | ///
55 | /// - When the `runtime_build` feature is not enabled, `FANJIAN_PROCESS_REPLACE_LIST_STR`
56 | ///   is included. This constant embeds the Fanjian process replace list,
57 | ///   which is used for converting traditional Chinese characters to simplified Chinese
58 | ///   characters during normalization.
59 | ///
60 | /// - Additionally, when the `runtime_build` feature is not enabled, `FANJIAN_PROCESS_MATCHER_BYTES`
61 | ///   is included. This constant embeds the serialized Fanjian matcher, which is
62 | ///   used for matching Fanjian text patterns during the normalization process.
63 | #[cfg(not(feature = "runtime_build"))] 64 | pub const FANJIAN_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( 65 | env!("OUT_DIR"), 66 | "/fanjian_process_replace_list.bin" 67 | )); 68 | #[cfg(not(feature = "runtime_build"))] 69 | pub const FANJIAN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( 70 | env!("OUT_DIR"), 71 | "/fanjian_daachorse_charwise_u32_matcher.bin" 72 | )); 73 | 74 | /// These constants are related to Pinyin processing and are included based on feature flags. 75 | /// 76 | /// - When the `runtime_build` feature is not enabled, `PINYIN_PROCESS_REPLACE_LIST_STR` 77 | /// is included. This constant embeds the Pinyin process replace list, 78 | /// which is used for converting Chinese characters to Pinyin during normalization. 79 | /// 80 | /// - Similarly, when the `runtime_build` feature is not enabled, `PINYINCHAR_PROCESS_REPLACE_LIST_STR` 81 | /// is included. This constant embeds the Pinyin character process replace list, 82 | /// which is also used for text replacement operations. 83 | /// 84 | /// - Additionally, when the `runtime_build` feature is not enabled, `PINYIN_PROCESS_MATCHER_BYTES` 85 | /// is included. This constant embeds the serialized Pinyin matcher bytes, which are 86 | /// used for matching Pinyin text patterns during the normalization process. 87 | #[cfg(not(feature = "runtime_build"))] 88 | pub const PINYIN_PROCESS_REPLACE_LIST_STR: &str = 89 | include_str!(concat!(env!("OUT_DIR"), "/pinyin_process_replace_list.bin")); 90 | #[cfg(not(feature = "runtime_build"))] 91 | pub const PINYINCHAR_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( 92 | env!("OUT_DIR"), 93 | "/pinyinchar_process_replace_list.bin" 94 | )); 95 | #[cfg(not(feature = "runtime_build"))] 96 | pub const PINYIN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( 97 | env!("OUT_DIR"), 98 | "/pinyin_daachorse_charwise_u32_matcher.bin" 99 | )); 100 | 101 | /// List of Unicode code points treated as whitespace characters. 102 | #[cfg(any(feature = "runtime_build", feature = "dfa"))] 103 | pub const WHITE_SPACE: &[&str] = &[ 104 | "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", "\u{00A0}", 105 | "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", "\u{2005}", "\u{2006}", 106 | "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}", "\u{200F}", "\u{2028}", "\u{2029}", 107 | "\u{202F}", "\u{205F}", "\u{3000}", 108 | ]; 109 | 110 | /// These constants are related to the text deletion processing and are included based on feature flags. 111 | /// 112 | /// - When the `runtime_build` feature is not enabled and the `dfa` feature is enabled, 113 | /// `TEXT_DELETE` is included. This constant embeds the text deletion map, 114 | /// used for identifying text segments to be deleted during normalization. 115 | /// 116 | /// - When the `runtime_build` feature is not enabled and the `dfa` feature is not enabled, 117 | /// `TEXT_DELETE_PROCESS_MATCHER_BYTES` is included. This constant embeds the serialized 118 | /// text deletion matcher bytes, which are generated during the build process and 119 | /// used for matching text patterns to be deleted during normalization.
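///
/// In configurations where the deletion patterns are assembled at run time,
/// `TEXT_DELETE` and `WHITE_SPACE` can be combined into a single automaton,
/// mirroring what the build script does at compile time. A minimal sketch under
/// that assumption (not this crate's actual loader):
///
/// ```ignore
/// use std::collections::HashSet;
///
/// // Dedup guards against a pattern appearing in both sources, which the
/// // automaton builder would reject.
/// let mut pattern_set: HashSet<&str> = TEXT_DELETE.trim().lines().collect();
/// pattern_set.extend(WHITE_SPACE);
/// let pattern_list: Vec<&str> = pattern_set.into_iter().collect();
///
/// let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
///     CharwiseDoubleArrayAhoCorasickBuilder::new()
///         .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
///         .build(&pattern_list)
///         .unwrap();
/// ```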
120 | #[cfg(all(not(feature = "runtime_build"), feature = "dfa"))] 121 | pub const TEXT_DELETE: &str = include_str!("../../process_map/TEXT-DELETE.txt"); 122 | #[cfg(all(not(feature = "runtime_build"), not(feature = "dfa")))] 123 | pub const TEXT_DELETE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( 124 | env!("OUT_DIR"), 125 | "/delete_daachorse_charwise_u32_matcher.bin" 126 | )); 127 | -------------------------------------------------------------------------------- /matcher_rs/src/process/mod.rs: -------------------------------------------------------------------------------- 1 | mod constants; 2 | pub mod process_matcher; 3 | -------------------------------------------------------------------------------- /matcher_rs/src/sim_matcher.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | 3 | use id_set::IdSet; 4 | use rapidfuzz::distance; 5 | use serde::{Deserialize, Serialize}; 6 | 7 | use crate::{ 8 | matcher::{MatchResultTrait, TextMatcherTrait}, 9 | process::process_matcher::{ 10 | build_process_type_tree, reduce_text_process_with_tree, ProcessType, ProcessTypeBitNode, 11 | }, 12 | }; 13 | 14 | /// Enumeration representing the types of similarity matching algorithms available. 15 | /// 16 | /// Currently, this enum only supports the Levenshtein distance algorithm. 17 | /// 18 | /// # Variants 19 | /// 20 | /// * [SimMatchType::Levenshtein] - Represents the Levenshtein distance algorithm, a string metric for measuring the difference between two sequences. 21 | /// 22 | /// The enum variants are serialized and deserialized using the `snake_case` naming convention. 23 | #[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)] 24 | #[serde(rename_all = "snake_case")] 25 | pub enum SimMatchType { 26 | Levenshtein, 27 | } 28 | 29 | /// Represents a table structure to be used in the similarity matching process. 30 | /// 31 | /// This structure holds various properties required for similarity matching using different algorithms. 32 | /// 33 | /// # Fields 34 | /// 35 | /// * `table_id` - A unique identifier for the table. 36 | /// * `match_id` - A unique identifier for the matching process. 37 | /// * `process_type` - The type of processing to be applied, represented by the [ProcessType] enum. 38 | /// * `sim_match_type` - The type of similarity matching algorithm to be used, represented by the [SimMatchType] enum. 39 | /// * `word_list` - A list of words to be used in the matching process. 40 | /// * `threshold` - A float value representing the similarity threshold for matching. 41 | #[derive(Debug, Clone)] 42 | pub struct SimTable<'a> { 43 | pub table_id: u32, 44 | pub match_id: u32, 45 | pub process_type: ProcessType, 46 | pub sim_match_type: SimMatchType, 47 | pub word_list: Vec<&'a str>, 48 | pub threshold: f64, 49 | } 50 | 51 | /// Represents a processed table used in the similarity matching process. 52 | /// 53 | /// This struct is a concrete version of the [SimTable] struct, with ownership over 54 | /// the word list. 55 | /// 56 | /// # Fields 57 | /// 58 | /// * `table_id` - A unique identifier for the table. 59 | /// * `match_id` - A unique identifier for the matching process. 60 | /// * `process_type` - The type of processing to be applied, represented by the [ProcessType] enum. 61 | /// * `sim_match_type` - The type of similarity matching algorithm to be used, represented by the [SimMatchType] enum. 62 | /// * `word_list` - A list of words over which the matching operation is performed. 
This is an owned vector of strings. 63 | /// * `threshold` - A float value representing the similarity threshold for a match. 64 | #[derive(Debug, Clone)] 65 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 66 | struct SimProcessedTable { 67 | table_id: u32, 68 | match_id: u32, 69 | process_type: ProcessType, 70 | sim_match_type: SimMatchType, 71 | word_list: Vec<String>, 72 | threshold: f64, 73 | } 74 | 75 | /// Represents the result of a similarity matching operation. 76 | /// 77 | /// This struct holds information about the match including identifiers for the match and table, 78 | /// the word that was matched, and the similarity score of the match. The word is represented as a 79 | /// [Cow] (Clone on Write) for efficient handling of borrowed or owned strings. This allows 80 | /// flexibility in returning either a borrowed string or an owned string. 81 | /// 82 | /// # Fields 83 | /// 84 | /// * `match_id` - A unique identifier for the matching process. 85 | /// * `table_id` - A unique identifier for the table. 86 | /// * `word_id` - A unique identifier for the word within the table. 87 | /// * `word` - The word that was matched, represented as a [Cow] to allow for both borrowed and owned strings. 88 | /// * `similarity` - A float value representing the similarity score of the match. 89 | #[derive(Debug, Clone)] 90 | pub struct SimResult<'a> { 91 | pub match_id: u32, 92 | pub table_id: u32, 93 | pub word_id: u32, 94 | pub word: Cow<'a, str>, 95 | pub similarity: f64, 96 | } 97 | 98 | impl MatchResultTrait<'_> for SimResult<'_> { 99 | fn match_id(&self) -> u32 { 100 | self.match_id 101 | } 102 | fn table_id(&self) -> u32 { 103 | self.table_id 104 | } 105 | fn word_id(&self) -> u32 { 106 | self.word_id 107 | } 108 | fn word(&self) -> &str { 109 | &self.word 110 | } 111 | fn similarity(&self) -> f64 { 112 | self.similarity 113 | } 114 | } 115 | 116 | /// The [SimMatcher] struct is responsible for performing similarity matching operations 117 | /// based on different processing types and similarity algorithms. 118 | /// 119 | /// This struct maintains a process type tree and a list of pre-processed tables that contain 120 | /// the necessary information for performing similarity matching on texts. 121 | /// 122 | /// # Fields 123 | /// 124 | /// * `process_type_tree` - A vector of `ProcessTypeBitNode`, representing the tree structure used for 125 | /// text processing based on defined process types. 126 | /// * `sim_processed_table_list` - A vector of `SimProcessedTable`, holding the tables with processed information 127 | /// for performing similarity matching. 128 | /// 129 | /// # Example 130 | /// 131 | /// ``` 132 | /// use matcher_rs::{SimMatcher, SimTable, SimMatchType, ProcessType}; 133 | /// 134 | /// // Create a list of `SimTable` with the required properties 135 | /// let sim_table_list = vec![SimTable { 136 | /// table_id: 1, 137 | /// match_id: 1, 138 | /// process_type: ProcessType::None, 139 | /// sim_match_type: SimMatchType::Levenshtein, 140 | /// word_list: vec!["example", "test"], 141 | /// threshold: 0.8, 142 | /// }]; 143 | /// 144 | /// // Instantiate a `SimMatcher` with the list of `SimTable` 145 | /// let matcher = SimMatcher::new(&sim_table_list); 146 | /// 147 | /// // Use `matcher` methods for performing similarity matching operations 148 | /// ``` 149 | /// 150 | /// The [SimMatcher] struct provides methods for checking if a text matches any of the processed tables 151 | /// and for processing texts to obtain a list of similarity results.
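///
/// As a hedged illustration of the behavior one can expect from the instance
/// above (Levenshtein at a 0.8 threshold; the probe strings are made up for
/// this example):
///
/// ```
/// # use matcher_rs::{SimMatcher, SimTable, SimMatchType, ProcessType, TextMatcherTrait};
/// # let sim_table_list = vec![SimTable {
/// #     table_id: 1,
/// #     match_id: 1,
/// #     process_type: ProcessType::None,
/// #     sim_match_type: SimMatchType::Levenshtein,
/// #     word_list: vec!["example", "test"],
/// #     threshold: 0.8,
/// # }];
/// # let matcher = SimMatcher::new(&sim_table_list);
/// assert!(matcher.is_match("examp1e")); // 1 edit over 7 chars ≈ 0.86 >= 0.8
/// assert!(!matcher.is_match("sample")); // 2 edits over 7 chars ≈ 0.71 < 0.8
/// ```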
152 | #[derive(Debug, Clone)] 153 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 154 | pub struct SimMatcher { 155 | process_type_tree: Vec<ProcessTypeBitNode>, 156 | sim_processed_table_list: Vec<SimProcessedTable>, 157 | } 158 | 159 | impl SimMatcher { 160 | /// Creates a new instance of [SimMatcher] from a list of [SimTable]. 161 | /// 162 | /// This function initializes a [SimMatcher] by processing each [SimTable] in the input list. 163 | /// It extracts the process types and constructs a tree structure used for processing texts. 164 | /// Additionally, it converts the word lists in each [SimTable] from borrowed strings to owned strings. 165 | /// 166 | /// # Parameters 167 | /// 168 | /// * `sim_table_list` - A slice of [SimTable] references to be processed and included in the new [SimMatcher] instance. 169 | /// 170 | /// # Returns 171 | /// 172 | /// Returns a new instance of [SimMatcher] containing: 173 | /// * `process_type_tree` - A vector of `ProcessTypeBitNode`, representing the tree structure used for text processing based on the process types extracted from the input [SimTable] list. 174 | /// * `sim_processed_table_list` - A vector of `SimProcessedTable`, each containing an owned vector of words and other properties derived from the input [SimTable] list. 175 | pub fn new(sim_table_list: &[SimTable]) -> SimMatcher { 176 | let mut process_type_set = IdSet::with_capacity(sim_table_list.len()); 177 | let mut sim_processed_table_list = Vec::with_capacity(sim_table_list.len()); 178 | 179 | for sim_table in sim_table_list { 180 | process_type_set.insert(sim_table.process_type.bits() as usize); 181 | sim_processed_table_list.push(SimProcessedTable { 182 | table_id: sim_table.table_id, 183 | match_id: sim_table.match_id, 184 | process_type: sim_table.process_type, 185 | sim_match_type: sim_table.sim_match_type, 186 | word_list: sim_table 187 | .word_list 188 | .iter() 189 | .map(|&word| word.to_owned()) 190 | .collect::<Vec<String>>(), 191 | threshold: sim_table.threshold, 192 | }) 193 | } 194 | 195 | let process_type_tree = build_process_type_tree(&process_type_set); 196 | 197 | SimMatcher { 198 | process_type_tree, 199 | sim_processed_table_list, 200 | } 201 | } 202 | } 203 | 204 | impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher { 205 | /// Checks if the provided text matches any entry in the processed tables. 206 | /// 207 | /// This function processes the input text to generate a set of processed text variants 208 | /// based on the defined process types. It then delegates the actual matching logic to a 209 | /// helper function that checks if any of these processed text variants match the entries 210 | /// in the `sim_processed_table_list`. 211 | /// 212 | /// # Parameters 213 | /// 214 | /// * `text` - A string slice representing the input text to be checked for similarity matches. 215 | /// 216 | /// # Returns 217 | /// 218 | /// Returns `true` if the processed text matches any entry in the processed tables; otherwise returns `false`. 219 | fn is_match(&'a self, text: &'a str) -> bool { 220 | if text.is_empty() { 221 | return false; 222 | } 223 | 224 | let processed_text_process_type_set = 225 | reduce_text_process_with_tree(&self.process_type_tree, text); 226 | 227 | self._is_match_with_processed_text_process_type_set(&processed_text_process_type_set) 228 | } 229 | 230 | /// Checks if any processed text variant matches an entry in the similarity tables. 231 | /// 232 | /// This helper function iterates through the processed text variants and their corresponding 233 | /// process type sets.
For each variant, it checks against all entries in the similarity tables 234 | /// to see if there is a match based on the defined similarity match type (e.g., Levenshtein). 235 | /// 236 | /// # Parameters 237 | /// 238 | /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of: 239 | ///   - A processed text variant represented as a [`Cow`]. 240 | ///   - An [IdSet] containing the process type identifiers associated with the processed text. 241 | /// 242 | /// # Returns 243 | /// 244 | /// Returns `true` if any of the processed text variants match an entry in the similarity tables 245 | /// according to the specified match type and similarity threshold; otherwise, returns `false`. 246 | fn _is_match_with_processed_text_process_type_set( 247 | &'a self, 248 | processed_text_process_type_set: &[(Cow<'a, str>, id_set::IdSet)], 249 | ) -> bool { 250 | for (processed_text, process_type_set) in processed_text_process_type_set { 251 | for sim_processed_table in &self.sim_processed_table_list { 252 | if !process_type_set.contains(sim_processed_table.process_type.bits() as usize) { 253 | continue; 254 | } 255 | let is_match = match sim_processed_table.sim_match_type { 256 | SimMatchType::Levenshtein => sim_processed_table.word_list.iter().any(|text| { 257 | distance::levenshtein::normalized_similarity_with_args( 258 | text.chars(), 259 | processed_text.chars(), 260 | &distance::levenshtein::Args::default() 261 | .score_cutoff(sim_processed_table.threshold), 262 | ) 263 | .is_some() 264 | }), 265 | }; 266 | 267 | if is_match { 268 | return true; 269 | } 270 | } 271 | } 272 | 273 | false 274 | } 275 | 276 | /// Processes the provided text and returns a list of similarity results. 277 | /// 278 | /// This function takes the input text and generates a set of processed text variants based 279 | /// on the defined process types, as described in the `process_type_tree`. It then uses these 280 | /// variants to find matches in the similarity tables, accumulating results where a similarity 281 | /// match is found. 282 | /// 283 | /// # Parameters 284 | /// 285 | /// * `text` - A string slice representing the input text to be processed and checked for similarity matches. 286 | /// 287 | /// # Returns 288 | /// 289 | /// Returns a vector of [SimResult] instances, each containing information about a matched entry 290 | /// in the similarity tables, including the `match_id`, `table_id`, `word_id`, `word`, and the 291 | /// similarity score. 292 | fn process(&'a self, text: &'a str) -> Vec<SimResult<'a>> { 293 | if text.is_empty() { 294 | return Vec::new(); 295 | } 296 | 297 | let processed_text_process_type_set = 298 | reduce_text_process_with_tree(&self.process_type_tree, text); 299 | 300 | self._process_with_processed_text_process_type_set(&processed_text_process_type_set) 301 | } 302 | 303 | /// Processes the provided set of processed text variants and their corresponding process type sets, 304 | /// returning a list of similarity results. 305 | /// 306 | /// This function iterates through each processed text variant and its associated process type set, 307 | /// comparing them against entries in the similarity tables to identify matches based on the defined 308 | /// similarity match type (e.g., Levenshtein). For each match found, the function accumulates the result 309 | /// with relevant information such as `match_id`, `table_id`, `word_id`, `word`, and the similarity score.
310 | /// 311 | /// # Parameters 312 | /// 313 | /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of: 314 | ///   - A processed text variant represented as a [`Cow`]. 315 | ///   - An [IdSet] containing the process type identifiers associated with the processed text. 316 | /// 317 | /// # Returns 318 | /// 319 | /// Returns a vector of [SimResult] instances, each containing information about a matched entry 320 | /// in the similarity tables, including: 321 | /// - `match_id`: The identifier for the match. 322 | /// - `table_id`: The identifier of the similarity table where the match was found. 323 | /// - `word_id`: The index of the word in the similarity table's word list. 324 | /// - `word`: The word from the similarity table's word list that matched the processed text. 325 | /// - `similarity`: The similarity score of the match. 326 | /// 327 | /// The function ensures that only unique matches are included in the result list by maintaining 328 | /// an [IdSet] to track already processed table ID and word index combinations. 329 | fn _process_with_processed_text_process_type_set( 330 | &'a self, 331 | processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], 332 | ) -> Vec<SimResult<'a>> { 333 | let mut result_list = Vec::new(); 334 | let mut table_id_index_set = IdSet::new(); 335 | 336 | for (processed_text, process_type_set) in processed_text_process_type_set { 337 | for sim_processed_table in &self.sim_processed_table_list { 338 | if !process_type_set.contains(sim_processed_table.process_type.bits() as usize) { 339 | continue; 340 | } 341 | match sim_processed_table.sim_match_type { 342 | SimMatchType::Levenshtein => { 343 | for (index, text) in sim_processed_table.word_list.iter().enumerate() { 344 | let table_id_index = 345 | ((sim_processed_table.table_id as usize) << 32) | index; 346 | 347 | if table_id_index_set.insert(table_id_index) { 348 | if let Some(similarity) = 349 | distance::levenshtein::normalized_similarity_with_args( 350 | text.chars(), 351 | processed_text.chars(), 352 | &distance::levenshtein::Args::default() 353 | .score_cutoff(sim_processed_table.threshold), 354 | ) 355 | { 356 | result_list.push(SimResult { 357 | match_id: sim_processed_table.match_id, 358 | table_id: sim_processed_table.table_id, 359 | word_id: index as u32, 360 | word: Cow::Borrowed(text), 361 | similarity, 362 | }); 363 | } 364 | } 365 | } 366 | } 367 | } 368 | } 369 | } 370 | 371 | result_list 372 | } 373 | } 374 | -------------------------------------------------------------------------------- /matcher_rs/src/util/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod serde; 2 | pub mod word; 3 | -------------------------------------------------------------------------------- /matcher_rs/src/util/serde.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "serde")] 2 | use std::borrow::Cow; 3 | 4 | #[cfg(feature = "serde")] 5 | use fancy_regex::Regex; 6 | #[cfg(feature = "serde")] 7 | use regex::RegexSet; 8 | #[cfg(feature = "serde")] 9 | use serde::{de::Error, Deserialize, Deserializer, Serialize, Serializer}; 10 | 11 | #[cfg(feature = "serde")] 12 | pub mod serde_regex { 13 | use super::*; 14 | 15 | /// Deserialize and serialize functions for `Regex` type. 16 | /// 17 | /// This module provides custom serialization and deserialization 18 | /// for the `Regex` type from the `fancy_regex` crate using Serde.
19 | /// The regex is serialized as a string and deserialized back into a `Regex` object. 20 | /// 21 | /// To use the custom serialization and deserialization, the field in the struct must 22 | /// be annotated with `#[serde(with = "serde_regex")]`. 23 | /// 24 | /// The provided methods ensure that regex patterns are correctly handled during 25 | /// serialization and deserialization processes without losing the actual regex functionalities. 26 | pub fn deserialize<'de, D>(d: D) -> Result<Regex, D::Error> 27 | where 28 | D: Deserializer<'de>, 29 | { 30 | let s = <Cow<str>>::deserialize(d)?; 31 | 32 | match Regex::new(s.as_ref()) { 33 | Ok(regex) => Ok(regex), 34 | Err(err) => Err(D::Error::custom(err)), 35 | } 36 | } 37 | 38 | pub fn serialize<S>(regex: &Regex, serializer: S) -> Result<S::Ok, S::Error> 39 | where 40 | S: Serializer, 41 | { 42 | regex.as_str().serialize(serializer) 43 | } 44 | } 45 | 46 | #[cfg(feature = "serde")] 47 | pub mod serde_regex_list { 48 | use serde::ser::SerializeSeq; 49 | 50 | use super::*; 51 | 52 | /// Deserialize and serialize functions for a list of `Regex` types. 53 | /// 54 | /// This module provides custom serialization and deserialization 55 | /// for lists of the `Regex` type from the `fancy_regex` crate using Serde. 56 | /// Each regex in the list is serialized as a string and deserialized back into a `Regex` object. 57 | /// 58 | /// To use the custom serialization and deserialization, the field in the struct must 59 | /// be annotated with `#[serde(with = "serde_regex_list")]`. 60 | /// 61 | /// These methods ensure that lists of regex patterns are correctly handled during 62 | /// serialization and deserialization processes without losing the actual regex functionalities. 63 | pub fn deserialize<'de, D>(d: D) -> Result<Vec<Regex>, D::Error> 64 | where 65 | D: Deserializer<'de>, 66 | { 67 | let s = <Vec<Cow<str>>>::deserialize(d)?; 68 | let mut regex_list = Vec::with_capacity(s.len()); 69 | for e in s.into_iter() { 70 | let regex = Regex::new(e.as_ref()).map_err(D::Error::custom)?; 71 | regex_list.push(regex); 72 | } 73 | 74 | Ok(regex_list) 75 | } 76 | 77 | pub fn serialize<S>(regex_list: &Vec<Regex>, serializer: S) -> Result<S::Ok, S::Error> 78 | where 79 | S: Serializer, 80 | { 81 | let mut seq = serializer.serialize_seq(Some(regex_list.len()))?; 82 | for e in regex_list { 83 | seq.serialize_element(e.as_str())?; 84 | } 85 | seq.end() 86 | } 87 | } 88 | 89 | #[cfg(feature = "serde")] 90 | pub mod serde_regex_set { 91 | use serde::ser::SerializeSeq; 92 | 93 | use super::*; 94 | 95 | /// Deserialize and serialize functions for `RegexSet` type. 96 | /// 97 | /// This module provides custom serialization and deserialization 98 | /// for the `RegexSet` type from the `regex` crate using Serde. 99 | /// The regex set is serialized as a list of strings and deserialized back into a `RegexSet` object. 100 | /// 101 | /// To use the custom serialization and deserialization, the field in the struct must 102 | /// be annotated with `#[serde(with = "serde_regex_set")]`. 103 | /// 104 | /// These methods ensure that regex set patterns are correctly handled during 105 | /// serialization and deserialization processes without losing the actual regex functionalities.
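///
/// A minimal opt-in sketch (the `Config` struct is hypothetical, purely for
/// illustration):
///
/// ```ignore
/// use regex::RegexSet;
/// use serde::{Deserialize, Serialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct Config {
///     #[serde(with = "serde_regex_set")]
///     blocked_patterns: RegexSet,
/// }
/// ```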
106 | pub fn deserialize<'de, D>(d: D) -> Result<RegexSet, D::Error> 107 | where 108 | D: Deserializer<'de>, 109 | { 110 | let s = <Vec<Cow<str>>>::deserialize(d)?; 111 | let regex_set = RegexSet::new(s).map_err(D::Error::custom)?; 112 | 113 | Ok(regex_set) 114 | } 115 | 116 | pub fn serialize<S>(regex_set: &RegexSet, serializer: S) -> Result<S::Ok, S::Error> 117 | where 118 | S: Serializer, 119 | { 120 | let mut seq = serializer.serialize_seq(Some(regex_set.len()))?; 121 | for e in regex_set.patterns() { 122 | seq.serialize_element(e.as_str())?; 123 | } 124 | seq.end() 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /matcher_rs/src/util/word.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::fmt::Display; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | 6 | /// A struct representing a simple word. 7 | /// 8 | /// This struct holds a single `String` and provides various methods for 9 | /// manipulating and querying the contents of the string. It supports the 10 | /// `Debug`, `Default`, `Clone`, `PartialEq`, `Eq`, `Serialize`, and 11 | /// `Deserialize` traits, making it versatile for different use cases such 12 | /// as debugging, serialization, and comparison. 13 | #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] 14 | pub struct SimpleWord(String); 15 | 16 | impl SimpleWord { 17 | /// Creates a new `SimpleWord` instance from any type that can be referenced as a string. 18 | /// 19 | /// # Arguments 20 | /// 21 | /// * `word` - An input that implements the `AsRef<str>` trait. This allows for a wide range 22 | ///   of input types, including `String`, `&str`, and `Cow<str>`. 23 | /// 24 | /// # Returns 25 | /// 26 | /// A `SimpleWord` instance containing the provided word. 27 | /// 28 | /// # Examples 29 | /// 30 | /// ``` 31 | /// use matcher_rs::SimpleWord; 32 | /// 33 | /// let word = SimpleWord::new("hello"); 34 | /// assert_eq!(word.as_str(), "hello"); 35 | /// ``` 36 | pub fn new<I>(word: I) -> Self 37 | where 38 | I: AsRef<str>, 39 | { 40 | SimpleWord(word.as_ref().to_owned()) 41 | } 42 | 43 | /// Returns the length of the string contained within the `SimpleWord`. 44 | /// 45 | /// This method returns the number of bytes in the underlying string (not the number of characters). 46 | /// 47 | /// # Returns 48 | /// 49 | /// The length of the string in bytes as a `usize`. 50 | /// 51 | /// # Examples 52 | /// 53 | /// ``` 54 | /// use matcher_rs::SimpleWord; 55 | /// 56 | /// let word = SimpleWord::new("hello"); 57 | /// assert_eq!(word.len(), 5); 58 | /// ``` 59 | pub fn len(&self) -> usize { 60 | self.0.len() 61 | } 62 | 63 | /// Checks if the string contained within the `SimpleWord` is empty. 64 | /// 65 | /// This method returns true if the underlying string has a length of zero. 66 | /// 67 | /// # Returns 68 | /// 69 | /// `true` if the string is empty, `false` otherwise. 70 | /// 71 | /// # Examples 72 | /// 73 | /// ``` 74 | /// use matcher_rs::SimpleWord; 75 | /// 76 | /// let empty_word = SimpleWord::new(""); 77 | /// assert!(empty_word.is_empty()); 78 | /// 79 | /// let non_empty_word = SimpleWord::new("hello"); 80 | /// assert!(!non_empty_word.is_empty()); 81 | /// ``` 82 | pub fn is_empty(&self) -> bool { 83 | self.0.is_empty() 84 | } 85 | 86 | /// Appends a given word to the current `SimpleWord` with an `&`. 87 | /// 88 | /// This method takes an input that implements the `AsRef<str>` trait and appends 89 | /// it to the current `SimpleWord` instance, preceded by the `&` character.
90 | /// 91 | /// # Arguments 92 | /// 93 | /// * `word` - An input that implements the `AsRef<str>` trait. This could be a 94 | ///   `String`, `&str`, or `Cow<str>`. 95 | /// 96 | /// # Returns 97 | /// 98 | /// A new `SimpleWord` instance with the appended word. 99 | /// 100 | /// # Examples 101 | /// 102 | /// ``` 103 | /// use matcher_rs::SimpleWord; 104 | /// 105 | /// let word1 = SimpleWord::new("hello"); 106 | /// let word2 = word1.and("world"); 107 | /// assert_eq!(word2.as_str(), "hello&world"); 108 | /// ``` 109 | pub fn and<I>(mut self, word: I) -> Self 110 | where 111 | I: AsRef<str>, 112 | { 113 | self.0.push('&'); 114 | self.0.push_str(word.as_ref()); 115 | self 116 | } 117 | 118 | /// Appends a given word to the current `SimpleWord` with a `~`. 119 | /// 120 | /// This method takes an input that implements the `AsRef<str>` trait and appends 121 | /// it to the current `SimpleWord` instance, preceded by the `~` character. 122 | /// 123 | /// # Arguments 124 | /// 125 | /// * `word` - An input that implements the `AsRef<str>` trait. This could be a 126 | ///   `String`, `&str`, or `Cow<str>`. 127 | /// 128 | /// # Returns 129 | /// 130 | /// A new `SimpleWord` instance with the appended word. 131 | /// 132 | /// # Examples 133 | /// 134 | /// ``` 135 | /// use matcher_rs::SimpleWord; 136 | /// 137 | /// let word1 = SimpleWord::new("world"); 138 | /// let word2 = word1.not("hello"); 139 | /// assert_eq!(word2.as_str(), "world~hello"); 140 | /// ``` 141 | pub fn not<I>(mut self, word: I) -> Self 142 | where 143 | I: AsRef<str>, 144 | { 145 | self.0.push('~'); 146 | self.0.push_str(word.as_ref()); 147 | self 148 | } 149 | 150 | /// Returns a string slice of the contents of the `SimpleWord`. 151 | /// 152 | /// This method allows for borrowing the underlying string without taking ownership. 153 | /// 154 | /// # Returns 155 | /// 156 | /// A string slice (`&str`) of the contents.
157 | /// 158 | /// # Examples 159 | /// 160 | /// ``` 161 | /// use matcher_rs::SimpleWord; 162 | /// 163 | /// let word = SimpleWord::new("hello"); 164 | /// assert_eq!(word.as_str(), "hello"); 165 | /// ``` 166 | pub fn as_str(&self) -> &str { 167 | &self.0 168 | } 169 | } 170 | 171 | impl Display for SimpleWord { 172 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 173 | f.write_str(&self.0) 174 | } 175 | } 176 | 177 | impl From<String> for SimpleWord { 178 | fn from(value: String) -> Self { 179 | SimpleWord(value) 180 | } 181 | } 182 | 183 | impl From<&str> for SimpleWord { 184 | fn from(value: &str) -> Self { 185 | SimpleWord(value.to_owned()) 186 | } 187 | } 188 | 189 | impl<'a> From<Cow<'a, str>> for SimpleWord { 190 | fn from(value: Cow<'a, str>) -> Self { 191 | SimpleWord(value.into_owned()) 192 | } 193 | } 194 | 195 | impl From<SimpleWord> for String { 196 | fn from(value: SimpleWord) -> Self { 197 | value.0 198 | } 199 | } 200 | 201 | impl AsRef<str> for SimpleWord { 202 | fn as_ref(&self) -> &str { 203 | &self.0 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /matcher_rs/tests/test.rs: -------------------------------------------------------------------------------- 1 | mod test_simple { 2 | use std::collections::HashMap; 3 | 4 | use matcher_rs::{ProcessType, SimpleMatcher, SimpleWord, TextMatcherTrait}; 5 | 6 | #[test] 7 | fn simple_match_init() { 8 | let _ = SimpleMatcher::new(&HashMap::from([( 9 | ProcessType::None, 10 | HashMap::from([(1, "")]), 11 | )])); 12 | let _ = SimpleMatcher::new(&HashMap::from([( 13 | ProcessType::None, 14 | HashMap::from([(1, "hello"), (2, "world")]), 15 | )])); 16 | } 17 | 18 | #[test] 19 | fn simple_match_fanjian() { 20 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 21 | ProcessType::Fanjian, 22 | HashMap::from([(1, "你好")]), 23 | )])); 24 | assert!(simple_matcher.is_match("妳好")); 25 | 26 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 27 | ProcessType::Fanjian, 28 | HashMap::from([(1, "妳好")]), 29 | )])); 30 | assert!(simple_matcher.is_match("你好")); 31 | } 32 | 33 | #[test] 34 | fn simple_match_delete() { 35 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 36 | ProcessType::Delete, 37 | HashMap::from([(1, "你好")]), 38 | )])); 39 | assert!(simple_matcher.is_match("你!好")); 40 | } 41 | 42 | #[test] 43 | fn simple_match_normalize() { 44 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 45 | ProcessType::Normalize, 46 | HashMap::from([(1, "he11o")]), 47 | )])); 48 | assert!(simple_matcher.is_match("ℋЀ⒈㈠Õ")); 49 | } 50 | 51 | #[test] 52 | fn simple_match_pinyin() { 53 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 54 | ProcessType::PinYin, 55 | HashMap::from([(1, "西安")]), 56 | )])); 57 | assert!(simple_matcher.is_match("洗按")); 58 | assert!(!simple_matcher.is_match("现")); 59 | } 60 | 61 | #[test] 62 | fn simple_match_pinyinchar() { 63 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 64 | ProcessType::PinYinChar, 65 | HashMap::from([(1, "西安")]), 66 | )])); 67 | assert!(simple_matcher.is_match("洗按")); 68 | assert!(simple_matcher.is_match("现")); 69 | assert!(simple_matcher.is_match("xian")); 70 | } 71 | 72 | #[test] 73 | fn simple_match_combination() { 74 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 75 | ProcessType::None, 76 | HashMap::from([ 77 | (1, SimpleWord::from("hello").and("world")), 78 | (2, SimpleWord::from("hello").and("world").and("hello")), 79 | (3, SimpleWord::from("hello").not("world")), 80 | (4,
SimpleWord::from("hello").not("world").not("world")), 81 | (5, SimpleWord::from("hello").and("world").not("word")), 82 | ( 83 | 6, 84 | SimpleWord::from("hello") 85 | .and("world") 86 | .not("word") 87 | .not("word"), 88 | ), 89 | ]), 90 | )])); 91 | assert!(simple_matcher.is_match("hello world")); 92 | assert!(simple_matcher.is_match("hello hello world")); 93 | assert!(simple_matcher.is_match("hello word")); 94 | } 95 | } 96 | 97 | mod test_regex { 98 | use matcher_rs::{ProcessType, RegexMatchType, RegexMatcher, RegexTable, TextMatcherTrait}; 99 | 100 | #[test] 101 | fn regex_match_regex() { 102 | let regex_matcher = RegexMatcher::new(&[RegexTable { 103 | table_id: 1, 104 | match_id: 1, 105 | process_type: ProcessType::None, 106 | regex_match_type: RegexMatchType::Regex, 107 | word_list: vec!["h[aeiou]llo", "w[aeiou]rd"], 108 | }]); 109 | 110 | assert!(regex_matcher.is_match("hallo")); 111 | assert!(regex_matcher.is_match("ward")); 112 | } 113 | 114 | #[test] 115 | fn regex_match_acrostic() { 116 | let regex_matcher = RegexMatcher::new(&[RegexTable { 117 | table_id: 1, 118 | match_id: 1, 119 | process_type: ProcessType::None, 120 | regex_match_type: RegexMatchType::Acrostic, 121 | word_list: vec!["h,e,l,l,o", "你,好"], 122 | }]); 123 | 124 | assert!(regex_matcher.is_match("hope, endures, love, lasts, onward.")); 125 | assert!(regex_matcher.is_match("Happy moments shared, Every smile and laugh, Love in every word, Lighting up our paths, Open hearts we show.")); 126 | assert!(regex_matcher.is_match("你的笑容温暖, 好心情常伴。")); 127 | } 128 | 129 | #[test] 130 | fn regex_match_similar_char() { 131 | let regex_matcher = RegexMatcher::new(&[RegexTable { 132 | table_id: 1, 133 | match_id: 1, 134 | process_type: ProcessType::None, 135 | regex_match_type: RegexMatchType::SimilarChar, 136 | word_list: vec!["hello,hi,H,你好", "world,word,🌍,世界"], 137 | }]); 138 | 139 | assert!(regex_matcher.is_match("helloworld")); 140 | assert!(regex_matcher.is_match("hi世界")); 141 | } 142 | } 143 | 144 | mod test_sim { 145 | use matcher_rs::{ProcessType, SimMatchType, SimMatcher, SimTable, TextMatcherTrait}; 146 | 147 | #[test] 148 | fn sim_match() { 149 | let sim_matcher = SimMatcher::new(&[SimTable { 150 | table_id: 1, 151 | match_id: 1, 152 | process_type: ProcessType::None, 153 | sim_match_type: SimMatchType::Levenshtein, 154 | word_list: vec!["helloworld"], 155 | threshold: 0.8, 156 | }]); 157 | 158 | assert!(sim_matcher.is_match("helloworl")); 159 | assert!(sim_matcher.is_match("halloworld")); 160 | assert!(sim_matcher.is_match("ha1loworld")); 161 | assert!(!sim_matcher.is_match("ha1loworld1")); 162 | } 163 | } 164 | 165 | mod test_matcher { 166 | use std::collections::HashMap; 167 | 168 | use matcher_rs::{MatchTable, MatchTableType, Matcher, ProcessType, TextMatcherTrait}; 169 | 170 | #[test] 171 | fn matcher_init() { 172 | let _ = Matcher::new(&HashMap::from([( 173 | 1, 174 | vec![MatchTable { 175 | table_id: 1, 176 | match_table_type: MatchTableType::Simple { 177 | process_type: ProcessType::None, 178 | }, 179 | word_list: vec![], 180 | exemption_process_type: ProcessType::None, 181 | exemption_word_list: vec![], 182 | }], 183 | )])); 184 | } 185 | 186 | #[test] 187 | fn matcher_exemption() { 188 | let matcher = Matcher::new(&HashMap::from([( 189 | 1, 190 | vec![MatchTable { 191 | table_id: 1, 192 | match_table_type: MatchTableType::Simple { 193 | process_type: ProcessType::None, 194 | }, 195 | word_list: vec!["hello"], 196 | exemption_process_type: ProcessType::None, 197 | exemption_word_list: vec!["world"], 198 | }], 
199 | )])); 200 | assert!(matcher.is_match("hello")); 201 | assert!(!matcher.is_match("hello,world")) 202 | } 203 | } 204 | 205 | mod test_process { 206 | use id_set::IdSet; 207 | use matcher_rs::{ 208 | build_process_type_tree, reduce_text_process, reduce_text_process_emit, 209 | reduce_text_process_with_set, reduce_text_process_with_tree, text_process, ProcessType, 210 | }; 211 | 212 | #[test] 213 | fn test_text_process() { 214 | let text = text_process(ProcessType::Fanjian, "~ᗩ~躶~𝚩~軆~Ⲉ~"); 215 | println!("{:?}", text); 216 | } 217 | 218 | #[test] 219 | fn test_reduce_text_process() { 220 | let text = reduce_text_process(ProcessType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); 221 | println!("{:?}", text); 222 | } 223 | 224 | #[test] 225 | fn test_reduce_text_process_emit() { 226 | let text = reduce_text_process_emit(ProcessType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); 227 | println!("{:?}", text); 228 | } 229 | 230 | #[test] 231 | fn test_build_process_type_tree() { 232 | let process_type_set = IdSet::from_iter([ 233 | ProcessType::Fanjian.bits() as usize, 234 | ProcessType::DeleteNormalize.bits() as usize, 235 | ProcessType::FanjianDeleteNormalize.bits() as usize, 236 | ProcessType::Delete.bits() as usize, 237 | ProcessType::Normalize.bits() as usize, 238 | ]); 239 | let process_type_tree = build_process_type_tree(&process_type_set); 240 | println!("{:?}", process_type_tree); 241 | } 242 | 243 | #[test] 244 | fn test_reduce_text_process_with_tree() { 245 | let process_type_set = IdSet::from_iter([ 246 | ProcessType::Fanjian.bits() as usize, 247 | ProcessType::DeleteNormalize.bits() as usize, 248 | ProcessType::FanjianDeleteNormalize.bits() as usize, 249 | ProcessType::Delete.bits() as usize, 250 | ProcessType::Normalize.bits() as usize, 251 | ]); 252 | let process_type_tree = build_process_type_tree(&process_type_set); 253 | let text = "test爽-︻"; 254 | 255 | let processed_text_process_type_set = 256 | reduce_text_process_with_tree(&process_type_tree, text); 257 | println!("{processed_text_process_type_set:?}"); 258 | } 259 | 260 | #[test] 261 | fn test_reduce_text_process_with_set() { 262 | let process_type_set = IdSet::from_iter([ 263 | ProcessType::Fanjian.bits() as usize, 264 | ProcessType::DeleteNormalize.bits() as usize, 265 | ProcessType::FanjianDeleteNormalize.bits() as usize, 266 | ProcessType::Delete.bits() as usize, 267 | ProcessType::Normalize.bits() as usize, 268 | ]); 269 | let text = "test爽-︻"; 270 | 271 | let processed_text_process_type_set = reduce_text_process_with_set(&process_type_set, text); 272 | println!("{processed_text_process_type_set:?}"); 273 | } 274 | } 275 | --------------------------------------------------------------------------------