├── .cargo └── config.toml ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── python.yml │ ├── release.yml │ ├── rust.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── DESIGN.md ├── LICENSE-APACHE ├── LICENSE-MIT ├── Makefile ├── README.md ├── README_CN.md ├── data ├── str_conv │ ├── DerivedGeneralCategory.txt │ ├── DerivedNumericValues.txt │ ├── EquivalentUnifiedIdeograph.txt │ ├── NormalizationTest.txt │ ├── PropList.txt │ ├── Unihan_Readings.txt │ └── Unihan_Variants.txt ├── text │ ├── cn │ │ ├── 三体.txt │ │ └── 西游记.txt │ └── en │ │ ├── bible_kjv.txt │ │ └── sherlock.txt └── word_list │ ├── cn │ ├── cn_words_100.txt │ ├── cn_words_100000.txt │ ├── cn_words_15000.txt │ ├── cn_words_30000.txt │ ├── cn_words_5000.txt │ └── cn_words_50000.txt │ └── en │ ├── en_words_100.txt │ ├── en_words_100000.txt │ ├── en_words_15000.txt │ ├── en_words_30000.txt │ ├── en_words_5000.txt │ └── en_words_50000.txt ├── matcher_c ├── Cargo.toml ├── README.md ├── extension_types.py ├── matcher_c.h └── src │ └── lib.rs ├── matcher_java ├── README.md ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── matcher_java │ │ ├── MatcherJava.java │ │ └── extension_types │ │ ├── MatchResult.java │ │ ├── MatchTable.java │ │ ├── MatchTableType.java │ │ ├── ProcessType.java │ │ ├── ProcessTypeSerializer.java │ │ ├── Regex.java │ │ ├── RegexMatchType.java │ │ ├── SimMatchType.java │ │ ├── Similar.java │ │ ├── Simple.java │ │ └── SimpleResult.java │ └── test │ └── java │ └── com │ └── matcher_java │ └── MatcherJavaExample.java ├── matcher_py ├── Cargo.lock ├── Cargo.toml ├── README.md ├── build.rs ├── pyproject.toml ├── python │ └── matcher_py │ │ ├── __init__.py │ │ ├── extension_types.py │ │ ├── matcher_py.pyi │ │ └── py.typed ├── src │ └── lib.rs ├── test │ ├── __init__.py │ ├── test_matcher.py │ └── test_simple_matcher.py └── uv.lock └── matcher_rs ├── Cargo.toml ├── README.md ├── benches └── bench.rs ├── build.rs ├── process_map ├── FANJIAN.txt ├── NORM.txt ├── NUM-NORM.txt ├── PINYIN.txt └── TEXT-DELETE.txt ├── src ├── lib.rs ├── matcher.rs ├── process │ ├── constants.rs │ ├── mod.rs │ └── process_matcher.rs ├── regex_matcher.rs ├── sim_matcher.rs ├── simple_matcher.rs └── util │ ├── mod.rs │ ├── serde.rs │ └── word.rs └── tests └── test.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | rustdocflags = ["-C", "target-cpu=native", "--document-private-items"] 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Additional context** 27 | Add any other context about the problem here. 
28 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: python 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | linux: 13 | runs-on: ${{ matrix.platform.runner }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | platform: 18 | - runner: ubuntu-latest 19 | target: x86_64 20 | - runner: ubuntu-latest 21 | target: aarch64 22 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Install Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: "${{ matrix.python }}" 29 | - name: Build wheels 30 | uses: PyO3/maturin-action@v1 31 | env: 32 | RUSTFLAGS: "-Z threads=2 -D warnings" 33 | with: 34 | target: ${{ matrix.platform.target }} 35 | args: --release --out dist -i python${{ matrix.python }} 36 | sccache: 'true' 37 | manylinux: auto 38 | rust-toolchain: nightly 39 | working-directory: matcher_py 40 | - name: Upload wheels 41 | uses: actions/upload-artifact@v4 42 | with: 43 | name: wheels-linux-${{ matrix.platform.target }}-${{ matrix.python }} 44 | path: matcher_py/dist 45 | overwrite: true 46 | 47 | musllinux: 48 | runs-on: ${{ matrix.platform.runner }} 49 | strategy: 50 | fail-fast: false 51 | matrix: 52 | platform: 53 | - runner: ubuntu-latest 54 | target: x86_64 55 | - runner: ubuntu-latest 56 | target: aarch64 57 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 58 | steps: 59 | - uses: actions/checkout@v4 60 | - name: Install Python 61 | uses: actions/setup-python@v5 62 | with: 63 | python-version: "${{ matrix.python }}" 64 | - name: Build wheels 65 | uses: PyO3/maturin-action@v1 66 | env: 67 | RUSTFLAGS: "-Z threads=2 -D warnings" 68 | with: 69 | target: ${{ matrix.platform.target }} 70 | args: --release --out dist -i python${{ matrix.python }} 71 | sccache: 'true' 72 | manylinux: musllinux_1_2 73 | rust-toolchain: nightly 74 | working-directory: matcher_py 75 | - name: Upload wheels 76 | uses: actions/upload-artifact@v4 77 | with: 78 | name: wheels-musllinux-${{ matrix.platform.target }}-${{ matrix.python }} 79 | path: matcher_py/dist 80 | overwrite: true 81 | 82 | windows: 83 | runs-on: ${{ matrix.platform.runner }} 84 | strategy: 85 | fail-fast: false 86 | matrix: 87 | platform: 88 | - runner: windows-latest 89 | target: x64 90 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 91 | steps: 92 | - uses: actions/checkout@v4 93 | - name: Install Python 94 | uses: actions/setup-python@v5 95 | with: 96 | 
python-version: "${{ matrix.python }}" 97 | - name: Build wheels 98 | uses: PyO3/maturin-action@v1 99 | env: 100 | RUSTFLAGS: "-Z threads=2 -D warnings" 101 | with: 102 | target: ${{ matrix.platform.target }} 103 | args: --release --out dist -i python${{ matrix.python }} 104 | sccache: 'true' 105 | rust-toolchain: nightly 106 | working-directory: matcher_py 107 | - name: Upload wheels 108 | uses: actions/upload-artifact@v4 109 | with: 110 | name: wheels-windows-${{ matrix.platform.target }}-${{ matrix.python }} 111 | path: matcher_py/dist 112 | overwrite: true 113 | 114 | macos: 115 | runs-on: ${{ matrix.platform.runner }} 116 | strategy: 117 | fail-fast: false 118 | matrix: 119 | platform: 120 | - runner: macos-15 121 | target: aarch64 122 | python: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 123 | steps: 124 | - uses: actions/checkout@v4 125 | - name: Install Python 126 | uses: actions/setup-python@v5 127 | with: 128 | python-version: "${{ matrix.python }}" 129 | - name: Build wheels 130 | uses: PyO3/maturin-action@v1 131 | env: 132 | RUSTFLAGS: "-Z threads=2 -D warnings" 133 | with: 134 | target: ${{ matrix.platform.target }} 135 | args: --release --out dist -i python${{ matrix.python }} 136 | sccache: 'true' 137 | rust-toolchain: nightly 138 | working-directory: matcher_py 139 | - name: Upload wheels 140 | uses: actions/upload-artifact@v4 141 | with: 142 | name: wheels-macos-${{ matrix.platform.target }}-${{ matrix.python }} 143 | path: matcher_py/dist 144 | overwrite: true 145 | 146 | sdist: 147 | runs-on: ubuntu-latest 148 | steps: 149 | - uses: actions/checkout@v4 150 | - name: Install Python 151 | uses: actions/setup-python@v5 152 | with: 153 | python-version: 3.13 154 | - name: Build sdist 155 | uses: PyO3/maturin-action@v1 156 | with: 157 | command: sdist 158 | args: --out dist 159 | rust-toolchain: nightly 160 | working-directory: matcher_py 161 | - name: Upload sdist 162 | uses: actions/upload-artifact@v4 163 | with: 164 | name: wheels-sdist 165 | path: matcher_py/dist 166 | overwrite: true 167 | 168 | release: 169 | name: Release 170 | runs-on: ubuntu-latest 171 | needs: [linux, musllinux, windows, macos, sdist] 172 | steps: 173 | - name: Download artifact 174 | uses: actions/download-artifact@v4 175 | with: 176 | pattern: wheels-* 177 | merge-multiple: true 178 | path: dist 179 | - uses: actions/setup-python@v5 180 | with: 181 | python-version: 3.13 182 | - run: pip install --upgrade pip twine 183 | - name: Publish to pypi 184 | env: 185 | TWINE_USERNAME: __token__ 186 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 187 | run: twine upload --skip-existing dist/* 188 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.platform.runner }} 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | platform: 18 | - runner: ubuntu-latest 19 | target: x86_64-unknown-linux-gnu 20 | suffix: so 21 | # - runner: ubuntu-latest 22 | # target: aarch64-unknown-linux-gnu 23 | # suffix: so 24 | - runner: macos-15 25 | target: aarch64-apple-darwin 26 | suffix: dylib 27 | - runner: windows-latest 28 | target: x86_64-pc-windows-gnu 29 | suffix: dll 30 | - runner: windows-latest 31 | target: x86_64-pc-windows-msvc 32 | suffix: dll 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Install Rust 36 
| uses: dtolnay/rust-toolchain@master 37 | with: 38 | toolchain: nightly 39 | targets: ${{ matrix.platform.target }} 40 | - name: Install dependencies 41 | if: matrix.platform.runner == 'ubuntu-latest' && matrix.platform.target == 'aarch64-unknown-linux-gnu' 42 | run: sudo apt-get install gcc-aarch64-linux-gnu 43 | - name: Build 44 | run: cargo build --release --target ${{ matrix.platform.target }} 45 | - name: List files 46 | run: ls ./target/${{ matrix.platform.target }}/release/ 47 | - name: Rename 48 | shell: bash 49 | run: | 50 | mkdir libmatcher 51 | mv ./target/${{ matrix.platform.target }}/release/*matcher_c.${{ matrix.platform.suffix }} libmatcher/${{ matrix.platform.target }}-libmatcher_c.${{ matrix.platform.suffix }} 52 | mv ./target/${{ matrix.platform.target }}/release/*matcher_py.${{ matrix.platform.suffix }} libmatcher/${{ matrix.platform.target }}-libmatcher_py.${{ matrix.platform.suffix }} 53 | - name: Upload release 54 | uses: actions/upload-artifact@v4 55 | with: 56 | name: libmatcher-${{ matrix.platform.target }} 57 | path: libmatcher 58 | 59 | release: 60 | name: Release 61 | runs-on: ubuntu-latest 62 | needs: [build] 63 | steps: 64 | - name: Download artifact 65 | uses: actions/download-artifact@v4 66 | with: 67 | pattern: libmatcher-* 68 | merge-multiple: true 69 | path: artifact 70 | - name: Make release 71 | uses: softprops/action-gh-release@v2 72 | with: 73 | draft: true 74 | prerelease: false 75 | generate_release_notes: true 76 | files: artifact/* 77 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: rust 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | 11 | env: 12 | CARGO_TERM_COLOR: always 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Install rust 20 | uses: dtolnay/rust-toolchain@master 21 | with: 22 | toolchain: nightly 23 | - name: Build 24 | run: cargo build --release --verbose 25 | - name: Test 26 | run: cargo test -p matcher_rs --verbose --no-default-features 27 | - name: Test dfa 28 | run: cargo test -p matcher_rs --verbose --no-default-features --features "dfa" 29 | - name: Test runtime_build and dfa 30 | run: cargo test -p matcher_rs --verbose --no-default-features --features "runtime_build,dfa" 31 | - name: Test serde and dfa 32 | run: cargo test -p matcher_rs --verbose --no-default-features --features "serde,dfa" 33 | - name: Run doc 34 | run: cargo doc 35 | - name: Release 36 | env: 37 | CARGO_REGISTRY_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} 38 | run: | 39 | cargo publish -p matcher_rs 40 | cargo publish -p matcher_py 41 | cargo publish -p matcher_c 42 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'matcher_py/src/**' 7 | - 'matcher_rs/src/**' 8 | - 'matcher_c/src/**' 9 | - '.github/workflows/test.yml' 10 | - '.cargo/config.toml' 11 | pull_request: 12 | paths: 13 | - 'matcher_py/src/**' 14 | - 'matcher_rs/src/**' 15 | - 'matcher_c/src/**' 16 | - '.github/workflows/test.yml' 17 | - '.cargo/config.toml' 18 | 19 | permissions: 20 | contents: read 21 | 22 | jobs: 23 | build: 24 | runs-on: ${{ matrix.platform.runner }} 25 | strategy: 26 | fail-fast: false 27 | matrix: 28 | platform: 29 | - runner: 
ubuntu-latest 30 | target: x86_64-unknown-linux-gnu 31 | suffix: so 32 | # - runner: ubuntu-latest 33 | # target: aarch64-unknown-linux-gnu 34 | - runner: macos-15 35 | target: aarch64-apple-darwin 36 | suffix: dylib 37 | - runner: windows-latest 38 | target: x86_64-pc-windows-gnu 39 | suffix: dll 40 | - runner: windows-latest 41 | target: x86_64-pc-windows-msvc 42 | suffix: dll 43 | steps: 44 | - uses: actions/checkout@v4 45 | - name: Install dependencies 46 | if: matrix.platform.runner == 'ubuntu-latest' && matrix.platform.target == 'aarch64-unknown-linux-gnu' 47 | run: sudo apt-get install gcc-aarch64-linux-gnu 48 | - name: Install Rust 49 | uses: dtolnay/rust-toolchain@master 50 | with: 51 | toolchain: nightly 52 | target: ${{ matrix.platform.target }} 53 | - name: Build 54 | run: cargo build --release --target ${{ matrix.platform.target }} 55 | - name: Test 56 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features 57 | - name: Test dfa 58 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features --features "dfa" 59 | - name: Test runtime_build and dfa 60 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features --features "runtime_build,dfa" 61 | - name: Test serde and dfa 62 | run: cargo test -p matcher_rs --target ${{ matrix.platform.target }} --verbose --no-default-features --features "serde,dfa" 63 | - name: Run doc 64 | run: cargo doc 65 | - name: Rename & move 66 | shell: bash 67 | run: | 68 | cp ./target/${{ matrix.platform.target }}/release/*matcher_c.${{ matrix.platform.suffix }} matcher_c/matcher_c.so 69 | cp ./target/${{ matrix.platform.target }}/release/*matcher_py.${{ matrix.platform.suffix }} matcher_py/python/matcher_py/matcher_py.so 70 | - name: Install Python 71 | uses: actions/setup-python@v5 72 | with: 73 | python-version: 3.13 74 | - name: Build wheels 75 | uses: PyO3/maturin-action@v1 76 | env: 77 | RUSTFLAGS: "-Z threads=2 -D warnings" 78 | with: 79 | target: ${{ matrix.platform.target }} 80 | args: --release -i python3.13 81 | sccache: 'true' 82 | rust-toolchain: nightly 83 | working-directory: matcher_py 84 | - name: Python Test 85 | shell: bash 86 | if: matrix.platform.runner == 'ubuntu-latest' 87 | run: | 88 | pip install -U pytest typing_extensions 89 | pip install ./target/wheels/*.whl 90 | pytest matcher_py/test 91 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | # custom files 142 | **/.DS_Store 143 | .idea 144 | .metals 145 | .vscode 146 | *.zip 147 | 148 | # Added by cargo 149 | /target 150 | 151 | .ruff_cache 152 | test.ipynb 153 | profile.json -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.5.7 - 2025-03-17 4 | 5 | ### Flexibility 6 | - Update dependencies. 7 | 8 | ## 0.5.6 - 2024-11-18 9 | 10 | ### Performance 11 | - Fix `build_process_type_tree` function, use set instead of list. 12 | - Update several dependencies. 13 | 14 | ## 0.5.5 - 2024-10-14 15 | 16 | ### Bug fixes 17 | - Change `XXX(Enum)` to `XXX(str, Enum)` in extension_types.py to fix json dumps issue. 18 | 19 | ### Flexibility 20 | - Add Python 3.13 support. 21 | - Remove msgspec, only use json in README.md. 22 | 23 | ## 0.5.4 - 2024-08-23 24 | 25 | ### Readability 26 | - Fix typo and cargo clippy warnings. 27 | - Add single line benchmark. 28 | 29 | ## 0.5.3 - 2024-07-26 30 | 31 | ### Bug fixes 32 | - Fix simple matcher is_match function. 33 | 34 | ## 0.5.2 - 2024-07-22 35 | 36 | ### Flexibility 37 | - Remove msgpack, now non-rust users should use json to serialize input of Matcher and SimpleMatcher. 38 | - Refactor Java code. 
39 | 40 | ## 0.5.1 - 2024-07-19 41 | 42 | ### Performance 43 | - Use FxHash to speed up the simple matcher process. 44 | 45 | ### Flexibility 46 | - Remove unnecessary dependencies. 47 | 48 | ## 0.5.0 - 2024-07-18 49 | 50 | ### Changed 51 | - Numerous changes across the codebase; see the commit history for details. 52 | 53 | ## 0.4.6 - 2024-07-15 54 | 55 | ### Performance 56 | - Optimize performance. 57 | 58 | ## 0.4.5 - 2024-07-12 59 | 60 | ### Changed 61 | - Optimize the Simple Matcher `process` function when multiple `simple_match_type`s are used. 62 | - Add `dfa` feature to matcher_rs. 63 | - Shrink `FANJIAN` conversion map. 64 | 65 | ## 0.4.4 - 2024-07-09 66 | 67 | ### Changed 68 | - Merge PINYIN and PINYINCHAR process matcher build. 69 | - Add `process` function to matcher_py/c/java. 70 | - Fix simple matcher process function issue. 71 | - Refactor matcher_py file structure and use `rye` to manage matcher_py. 72 | - Delete `println!` in matcher_c. 73 | 74 | ## 0.4.3 - 2024-07-08 75 | 76 | ### Changed 77 | - Fix exemption word list wrongly rejecting the entire match instead of a single table. 78 | - Add match_id to MatchResult. 79 | - Reverse DFA structure to AhoCorasick structure. 80 | - matcher_c now uses `from_utf8_unchecked` instead of `from_utf8`. 81 | - Build multiple wheels for different Python versions. 82 | - Update FANJIAN.txt and NORM.txt. 83 | - Fix issues with `runtime_build` feature. 84 | 85 | ## 0.4.2 - 2024-07-07 86 | 87 | ### Changed 88 | - Optimize performance. 89 | 90 | ## 0.4.1 - 2024-07-06 91 | 92 | ### Changed 93 | - Rebuild transformation rules based on the Unicode Standard. 94 | 95 | ## 0.4.0 - 2024-07-03 96 | 97 | ### Changed 98 | - Implement word-wise NOT logic inside SimpleMatcher; you can now use the `&` (and) and `~` (not) separators to configure simple words, e.g. `hello&world~helo`. 99 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | resolver = "2" 3 | members = ["matcher_rs", "matcher_py", "matcher_c"] 4 | 5 | [workspace.package] 6 | authors = ["Foster Guo "] 7 | categories = ["text-processing"] 8 | description = "A high-performance matcher designed to solve LOGICAL and TEXT VARIATIONS problems in word matching, implemented in Rust." 9 | edition = "2021" 10 | homepage = "https://github.com/Lips7/Matcher" 11 | readme = "README.md" 12 | keywords = ["text", "string", "search", "pattern", "multi"] 13 | license = "Apache-2.0 OR MIT" 14 | repository = "https://github.com/Lips7/Matcher" 15 | version = "0.5.7" 16 | 17 | [profile.release] 18 | strip = true 19 | opt-level = 3 20 | lto = true 21 | codegen-units = 1 22 | incremental = false 23 | debug = false 24 | debug-assertions = false 25 | overflow-checks = false 26 | 27 | [profile.bench] 28 | strip = "none" 29 | opt-level = 3 30 | lto = true 31 | codegen-units = 1 32 | incremental = false 33 | debug = true 34 | debug-assertions = false 35 | overflow-checks = false 36 | -------------------------------------------------------------------------------- /DESIGN.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | ## Transformation 4 | 5 | * `FANJIAN`: built from [Unihan_Variants.txt](./data/str_conv/Unihan_Variants.txt) and [EquivalentUnifiedIdeograph.txt](./data/str_conv/EquivalentUnifiedIdeograph.txt). 6 | * `NUM-NORM`: built from [DerivedNumericValues.txt](./data/str_conv/DerivedNumericValues.txt).
7 | * `TEXT-DELETE` and `SYMBOL-NORM`: built from [DerivedGeneralCategory.txt](./data/str_conv/DerivedGeneralCategory.txt). 8 | * `WHITE-SPACE`: built from [PropList.txt](./data/str_conv/PropList.txt). 9 | * `PINYIN` and `PINYIN-CHAR`: built from [Unihan_Readings.txt](./data/str_conv/Unihan_Readings.txt). 10 | * `NORM`: built from [NormalizationTest.txt](./data/str_conv/NormalizationTest.txt). 11 | 12 | ## Matcher 13 | 14 | ### Overview 15 | 16 | The `Matcher` is a powerful and complex system designed to identify sentence matches using multiple methods. Despite its complexity, it offers significant flexibility and power when used correctly. The main components of the `Matcher` are `MatchID` and `TableID`. 17 | 18 | ### Key Concepts 19 | 20 | 1. **MatchID**: Represents a unique identifier for a match. 21 | 2. **TableID**: Represents a unique identifier for a table within a match. 22 | 23 | ### Structure 24 | 25 | The `Matcher` utilizes a JSON structure to define matches and tables. Below is an example of its configuration: 26 | 27 | ```json 28 | { 29 | "777": [ 30 | { 31 | "table_id": 45, 32 | "match_table_type": {"process_type": "MatchNone"}, 33 | "word_list": ["hello", "world"], 34 | "exemption_process_type": "MatchNone", 35 | "exemption_word_list": [] 36 | } 37 | // other tables 38 | ] 39 | // other matches 40 | } 41 | ``` 42 | 43 | - `777`: This is the `MatchID`. 44 | - `45`: This is the `TableID`. 45 | 46 | #### Table 47 | 48 | Each `Table` represents a collection of words related to a specific topic (e.g., politics, music, math). The table also includes a list of exemption words to exclude certain sentences. The logical operations within a table are as follows: 49 | 50 | - **OR Logic (within `word_list`)**: The table matches if any word in the `word_list` is matched. 51 | - **NOT Logic (between `word_list` and `exemption_word_list`)**: If any word in the `exemption_word_list` is matched, the table will not be considered matched. 52 | 53 | #### Match 54 | 55 | A `Match` consists of multiple tables. Each match can specify a list of tables to perform the matching. This allows users to experiment with different combinations of tables to find the best configuration for their use case. The logical operation between matches is: 56 | 57 | - **OR Logic (between matches)**: Every match that contains at least one matched table is reported. 58 | 59 | ### Usage Cases 60 | 61 | #### Table1 AND Table2 match 62 | ```json 63 | Input: 64 | { 65 | "1": [ 66 | { 67 | "table_id": 1, 68 | "match_table_type": {"process_type": "MatchNone"}, 69 | "word_list": ["hello", "world"], 70 | "exemption_process_type": "MatchNone", 71 | "exemption_word_list": [] 72 | } 73 | ], 74 | "2": [ 75 | { 76 | "table_id": 2, 77 | "match_table_type": {"process_type": "MatchNone"}, 78 | "word_list": ["你", "好"], 79 | "exemption_process_type": "MatchNone", 80 | "exemption_word_list": [] 81 | } 82 | ] 83 | } 84 | 85 | Output: Check if `match_id` 1 and 2 are both matched. 86 | ```
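Since only OR logic between matches is built in, the AND check above is performed by the caller. Here is a minimal sketch in Python, assuming the `matcher_py` binding exposes a `Matcher` constructed from JSON bytes with a `word_match` method that mirrors the C bindings in [matcher_c/README.md](./matcher_c/README.md); the exact API is documented in [matcher_py/README.md](./matcher_py/README.md).

```python
import json

from matcher_py import Matcher  # assumed Python binding; see matcher_py/README.md

# The same configuration as the Input above: two MatchIDs, one table each.
matcher = Matcher(
    json.dumps({
        "1": [{
            "table_id": 1,
            "match_table_type": {"process_type": "MatchNone"},
            "word_list": ["hello", "world"],
            "exemption_process_type": "MatchNone",
            "exemption_word_list": [],
        }],
        "2": [{
            "table_id": 2,
            "match_table_type": {"process_type": "MatchNone"},
            "word_list": ["你", "好"],
            "exemption_process_type": "MatchNone",
            "exemption_word_list": [],
        }],
    }).encode()
)

# `word_match` reports every matched MatchID (the built-in OR logic);
# AND across MatchIDs is then checked by the caller. MatchID keys are
# assumed to come back as strings, as in the JSON output of the C bindings.
result = matcher.word_match("hello你好")
both_matched = all(match_id in result for match_id in ("1", "2"))
```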
87 | 88 | #### Table1 OR Table2 match 89 | ```json 90 | Input: 91 | { 92 | "1": [ 93 | { 94 | "table_id": 1, 95 | "match_table_type": {"process_type": "MatchNone"}, 96 | "word_list": ["hello", "world"], 97 | "exemption_process_type": "MatchNone", 98 | "exemption_word_list": [] 99 | }, 100 | { 101 | "table_id": 2, 102 | "match_table_type": {"process_type": "MatchNone"}, 103 | "word_list": ["你", "好"], 104 | "exemption_process_type": "MatchNone", 105 | "exemption_word_list": [] 106 | } 107 | ] 108 | } 109 | 110 | Output: Check if `match_id` 1 or 2 is matched. 111 | ``` 112 | 113 | #### Table1 NOT Table2 match 114 | ```json 115 | Input: 116 | { 117 | "1": [ 118 | { 119 | "table_id": 1, 120 | "match_table_type": {"process_type": "MatchNone"}, 121 | "word_list": ["hello", "world"], 122 | "exemption_process_type": "MatchNone", 123 | "exemption_word_list": [] 124 | } 125 | ], 126 | "2": [ 127 | { 128 | "table_id": 2, 129 | "match_table_type": {"process_type": "MatchNone"}, 130 | "word_list": ["你", "好"], 131 | "exemption_process_type": "MatchNone", 132 | "exemption_word_list": [] 133 | } 134 | ] 135 | } 136 | 137 | Output: Check if `match_id` 1 is matched and 2 is not matched. 138 | ``` 139 | 140 | ## SimpleMatcher 141 | 142 | ### Overview 143 | 144 | The `SimpleMatcher` is the core component, designed to be fast, efficient, and easy to use. It handles large amounts of data and identifies words based on predefined types. 145 | 146 | ### Key Concepts 147 | 148 | 1. **WordID**: Represents a unique identifier for a word in the `SimpleMatcher`. 149 | 150 | ### Structure 151 | 152 | The `SimpleMatcher` uses a mapping structure to define words and their IDs based on different match types. Below is an example configuration: 153 | 154 | ```json 155 | { 156 | "ProcessType.None": { 157 | "1": "hello&world", 158 | "2": "你好" 159 | // other words 160 | } 161 | // other simple match type word maps 162 | } 163 | ``` 164 | 165 | - `1` and `2`: These are `WordID`s used to identify words in the `SimpleMatcher`. 166 | 167 | ### Real-world Application 168 | 169 | In real-world scenarios, `word_id` is used to uniquely identify a word in the database, allowing for easy updates to the word and its variants. 170 | 171 | ### Logical Operations 172 | 173 | - **OR Logic (between different `process_type`s and between words in the same `process_type`)**: The `simple_matcher` is considered matched if any word in the map is matched. 174 | - **AND Logic (between words separated by `&` within a `WordID`)**: All words separated by `&` must be matched for the entry to be considered matched. 175 | - **NOT Logic (between words separated by `~` within a `WordID`)**: The first segment must be matched and none of the words following a `~` may appear; e.g. `word1~word2` matches only when `word1` is present and `word2` is absent. 176 | 177 | ### Usage Cases 178 | 179 | #### Word1 AND Word2 match 180 | ```json 181 | Input: 182 | { 183 | "ProcessType.None": { 184 | "1": "word1&word2" 185 | } 186 | } 187 | 188 | Output: Check if `word_id` 1 is matched. 189 | ``` 190 | 191 | #### Word1 OR Word2 match 192 | ```json 193 | Input: 194 | { 195 | "ProcessType.None": { 196 | "1": "word1", 197 | "2": "word2" 198 | } 199 | } 200 | 201 | Output: Check if `word_id` 1 or 2 is matched. 202 | ``` 203 | 204 | #### Word1 NOT Word2 match 205 | ```json 206 | Input: 207 | { 208 | "ProcessType.None": { 209 | "1": "word1~word2" 210 | } 211 | } 212 | 213 | Output: Check if `word_id` 1 is matched. 214 | ```
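The word-level logic above can be exercised directly through the bindings. Below is a minimal sketch, assuming `matcher_py` exposes a `SimpleMatcher` built from JSON bytes whose top-level keys are the numeric `ProcessType` flag values (as in the C example in [matcher_c/README.md](./matcher_c/README.md)); `ProcessType.MatchNone` has the flag value 1.

```python
import json

from matcher_py import SimpleMatcher  # assumed Python binding; see matcher_py/README.md

# WordID 1 requires word1 AND word2 to appear, and word3 to be absent.
# The outer key 1 is ProcessType.MatchNone (0b00000001).
simple_matcher = SimpleMatcher(json.dumps({1: {1: "word1&word2~word3"}}).encode())

print(simple_matcher.is_match("word2 ... word1"))    # True: both & segments appear
print(simple_matcher.is_match("word1 word2 word3"))  # False: the ~word3 segment appears
```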
215 | 216 | ## Summary 217 | 218 | The `Matcher` and `SimpleMatcher` systems are designed to provide a robust and flexible solution for word matching tasks. By understanding the logical operations and structures of `MatchID`, `TableID`, and `WordID`, users can effectively leverage these tools for complex matching requirements. -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2024] [Foster Guo] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any 2 | person obtaining a copy of this software and associated 3 | documentation files (the "Software"), to deal in the 4 | Software without restriction, including without 5 | limitation the rights to use, copy, modify, merge, 6 | publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software 8 | is furnished to do so, subject to the following 9 | conditions: 10 | 11 | The above copyright notice and this permission notice 12 | shall be included in all copies or substantial portions 13 | of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 16 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 17 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 18 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 19 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 22 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | cargo update 3 | cargo build --release 4 | cp ./target/release/libmatcher_c.dylib ./matcher_c/matcher_c.so 5 | cp ./target/release/libmatcher_c.dylib ./matcher_java/src/main/resources/matcher_c.so 6 | 7 | test: 8 | cargo fmt 9 | cargo clippy --all-targets -- -D warnings 10 | cargo doc 11 | 12 | cd matcher_rs && \ 13 | cargo test --no-default-features && \ 14 | cargo test --no-default-features --features "dfa" && \ 15 | cargo test --no-default-features --features "runtime_build" && \ 16 | cargo test --no-default-features --features "runtime_build,dfa" && \ 17 | cargo test --no-default-features --features "dfa,serde" && \ 18 | cd .. 19 | 20 | cd matcher_py && \ 21 | ruff format . && \ 22 | uv sync && \ 23 | pytest && \ 24 | cd ..
25 | 26 | update: 27 | cargo update --verbose --recursive --breaking -Z unstable-options 28 | cargo upgrade --verbose --recursive -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Matcher 2 | 3 | ![Rust](https://img.shields.io/badge/rust-%23000000.svg?style=for-the-badge&logo=rust&logoColor=white)![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)![Java](https://img.shields.io/badge/java-%23ED8B00.svg?style=for-the-badge&logo=openjdk&logoColor=white)![C](https://img.shields.io/badge/c-%2300599C.svg?style=for-the-badge&logo=c&logoColor=white) 4 | 5 | ![PyPI - License](https://img.shields.io/pypi/l/matcher_py) 6 | 7 | ![Crates.io Version](https://img.shields.io/crates/v/matcher_rs)![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/lips7/Matcher/test.yml)![docs.rs](https://img.shields.io/docsrs/matcher_rs)![Crates.io Total Downloads](https://img.shields.io/crates/d/matcher_rs) 8 | 9 | ![PyPI - Version](https://img.shields.io/pypi/v/matcher_py)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/matcher_py)![PyPI - Downloads](https://img.shields.io/pypi/dm/matcher_py) 10 | 11 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 12 | 13 | It's helpful for 14 | - **Precision and Recall**: Word matching is a retrieval process; LOGICAL matching improves precision, while TEXT VARIATIONS matching improves recall. 15 | - **Content Filtering**: Detecting and filtering out offensive or sensitive words. 16 | - **Search Engines**: Improving search results by identifying relevant keywords. 17 | - **Text Analysis**: Extracting specific information from large volumes of text. 18 | - **Spam Detection**: Identifying spam content in emails or messages. 19 | - ··· 20 | 21 | ## Features 22 | 23 | For detailed implementation, see the [Design Document](./DESIGN.md). 24 | 25 | - **Multiple Matching Methods**: 26 | - Simple Word Matching 27 | - Regex-Based Matching 28 | - Similarity-Based Matching 29 | - **Text Transformation**: 30 | - **Fanjian**: Convert traditional Chinese characters to simplified ones. 31 | Example: `蟲艸` -> `虫草` 32 | - **Delete**: Remove specific characters. 33 | Example: `*Fu&*iii&^%%*&kkkk` -> `Fuiiikkkk` 34 | - **Normalize**: Normalize special characters to identifiable characters. 35 | Example: `𝜢𝕰𝕃𝙻𝝧 𝙒ⓞᵣℒ𝒟!` -> `hello world!` 36 | - **PinYin**: Convert Chinese characters to Pinyin for fuzzy matching. 37 | Example: `西安` -> ` xi an `, matches `洗按` -> ` xi an `, but not `先` -> ` xian ` 38 | - **PinYinChar**: Convert Chinese characters to Pinyin. 39 | Example: `西安` -> `xian`, matches `洗按` and `先` -> `xian` 40 | - **AND OR NOT Word Matching**: 41 | - Takes into account the number of repetitions of words. 42 | - Example: `hello&world` matches `hello world` and `world,hello` 43 | - Example: `无&法&无&天` matches `无无法天` (because `无` is repeated twice), but not `无法天` 44 | - Example: `hello~helloo~hhello` matches `hello` but not `helloo` and `hhello` 45 | - **Customizable Exemption Lists**: Exclude specific words from matching. 46 | - **Efficient Handling of Large Word Lists**: Optimized for performance. 47 | 48 | ### Rust Users 49 | 50 | See the [Rust README](./matcher_rs/README.md). 51 | 52 | ### Python Users 53 | 54 | See the [Python README](./matcher_py/README.md).
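For a quick taste of the features above, here is a minimal, illustrative snippet. It assumes the `matcher_py` wheel is installed and that `SimpleMatcher` accepts the JSON configuration shown in the C and Java READMEs; the exact API is documented in the [Python README](./matcher_py/README.md).

```python
import json

from matcher_py import SimpleMatcher  # assumed top-level export of the Python binding
from matcher_py.extension_types import ProcessType

# Combine transformations (fanjian + delete + normalize, plus pinyin-char)
# so that "妳好&世界" also matches latinized text such as "nihaoshijie".
simple_matcher = SimpleMatcher(
    json.dumps({
        ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar: {
            1: "妳好&世界",
            2: "hello",
        }
    }).encode()
)

print(simple_matcher.is_match("你好世界"))             # True
print(simple_matcher.process("nihaoshijie!hello!"))  # hits for word_id 1 and 2
```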
55 | 56 | ### C, Java and Other Users 57 | 58 | We provide a dynamic library to link against. See the [C README](./matcher_c/README.md) and [Java README](./matcher_java/README.md). 59 | 60 | #### Build from source 61 | 62 | ```shell 63 | git clone https://github.com/Lips7/Matcher.git 64 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 65 | cargo build --release 66 | ``` 67 | 68 | Then you should find the `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll` in the `target/release` directory. 69 | 70 | #### Pre-built binary 71 | 72 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 73 | 74 | ## Benchmarks 75 | 76 | Please refer to [benchmarks](./matcher_rs/README.md#benchmarks) for details. 77 | 78 | ## Roadmap 79 | 80 | ### Performance 81 | - [x] ~~Cache intermediate results across different ProcessType `reduce_process_text` calls. (failed, too slow)~~ 82 | - [x] Try more aho-corasick libraries to improve performance and reduce memory usage. 83 | - [x] ~~https://github.com/daac-tools/crawdad (produces a char-wise index, not a byte-wise index; not acceptable)~~ 84 | - [x] https://github.com/daac-tools/daachorse (use it when Fanjian, PinYin or PinYinChar transformation is performed) 85 | - [x] ~~Test char-wise HashMap transformation for Chinese Characters. (Too slow)~~ 86 | - [x] Make aho-corasick unsafe. 87 | - [x] See https://github.com/Lips7/aho-corasick. 88 | - [ ] Optimize NOT logic word-wise. 89 | - [x] Optimize `RegexMatcher` using `RegexSet`. 90 | - [x] Optimize `SimpleMatcher` when multiple `ProcessType`s are used. 91 | 1. Consider the case where there are multiple `ProcessType`s: 92 | * None 93 | * Fanjian 94 | * FanjianDelete 95 | * FanjianDeleteNormalize 96 | * FanjianNormalize 97 | 2. We can construct a chain of transformations: 98 | * None -> Fanjian -> Delete -> Normalize 99 | *                  \ -> Normalize. 100 | 3. Calculate all possible transformations and cache the results, so that instead of calculating 8 times (Fanjian, Fanjian + Delete, Fanjian + Delete + Normalize, Fanjian + Normalize), we only need to calculate 4 times (Fanjian, Delete, Normalize, Normalize). 101 | - [x] ~~Optimize process matcher when performing reduce text processing.~~ 102 | 1. Suppose we have to perform FanjianDeleteNormalize: we need to perform Fanjian first, then Delete, then Normalize, so 3 kinds of Process Matcher are needed to perform replacement or deletion, and the text has to be scanned 3 times. 103 | 2. What if we construct only 1 Process Matcher whose patterns contain all 3 kinds of patterns (Fanjian, Delete and Normalize)? We could scan the text only once to get all the positions where replacement or deletion should be performed. 104 | 3. Byte indices change after replacement or deletion, so we need to take the offset changes into account. 105 | - [x] Merge multiple aho-corasick matchers into one when multiple `ProcessType`s are used. 106 | - [x] When the `dfa` feature is disabled, use daachorse to perform text processing. 107 | - [x] Do not use it for the simple process function; it is too slow to build. 108 | - [ ] Use more regex sets to optimize the regex matcher. 109 | 110 | ### Flexibility 111 | - [x] Cache `get_process_matcher` results globally, instead of caching results inside SimpleMatcher. 112 | - [x] Expose `reduce_process_text` to Python. 113 | - [x] Add a new function that can handle a single simple match type. 114 | - [x] `text_process` is now available.
115 | - [x] Add fuzzy matcher, https://github.com/lotabout/fuzzy-matcher. 116 | - [x] Use `rapidfuzz` instead. 117 | - [x] Make `SimpleMatcher` and `Matcher` serializable. 118 | - [x] Make aho-corasick serializable. 119 | - [x] See https://github.com/Lips7/aho-corasick. 120 | - [x] Implement NOT logic word-wise. 121 | - [x] Support stable Rust. 122 | - [ ] Support iterator. 123 | - [ ] A real Java package. 124 | - [x] Multiple Python version wheel build. 125 | - [ ] Customize str conversion map. 126 | - [x] Add Matcher process function to py, c and java. 127 | - [x] ~~For simple matcher, is it possible to use regex-automata to replace aho-corasick? and support regex. (Keep it simple and efficient)~~ 128 | - [x] Add simple match type to `RegexMatcher` and `SimMatcher` to pre-process a text. 129 | - [x] Try to replace msgpack. 130 | 131 | ### Readability 132 | - [x] More precise and convenient MatchTable. 133 | - [x] More detailed and rigorous benchmarks. 134 | - [x] More detailed and rigorous tests. 135 | - [x] More detailed simple match type explanation. 136 | - [ ] More detailed [DESIGN](./DESIGN.md). 137 | - [x] Write a Chinese README. -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | # Matcher 2 | 3 | ![Rust](https://img.shields.io/badge/rust-%23000000.svg?style=for-the-badge&logo=rust&logoColor=white)![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)![Java](https://img.shields.io/badge/java-%23ED8B00.svg?style=for-the-badge&logo=openjdk&logoColor=white)![C](https://img.shields.io/badge/c-%2300599C.svg?style=for-the-badge&logo=c&logoColor=white) 4 | 5 | ![PyPI - License](https://img.shields.io/pypi/l/matcher_py) 6 | 7 | ![Crates.io Version](https://img.shields.io/crates/v/matcher_rs)![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/lips7/Matcher/test.yml)![docs.rs](https://img.shields.io/docsrs/matcher_rs)![Crates.io Total Downloads](https://img.shields.io/crates/d/matcher_rs) 8 | 9 | ![PyPI - Version](https://img.shields.io/pypi/v/matcher_py)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/matcher_py)![PyPI - Downloads](https://img.shields.io/pypi/dm/matcher_py) 10 | 11 | 一个高性能文本匹配器,旨在解决**逻辑**和**文本变体**的词匹配问题,以Rust实现。 12 | 13 | 它对以下方面非常有帮助: 14 | - **精确率与召回率**:文本匹配是一个召回过程,逻辑匹配提高精确率,文本变体匹配提高召回率。 15 | - **内容过滤**:检测和过滤攻击性或敏感词语。 16 | - **搜索引擎**:通过识别相关关键词来改进搜索结果。 17 | - **文本分析**:从大量文本中提取特定信息。 18 | - **垃圾邮件检测**:识别电子邮件或消息中的垃圾内容。 19 | - ··· 20 | 21 | ## 特性 22 | 23 | 有关详细的实现,请参见[Design Document](./DESIGN.md)。 24 | 25 | - **多种匹配方法**: 26 | - 简单词匹配 27 | - 基于正则表达式的匹配 28 | - 基于相似度的匹配 29 | - **文本转换**: 30 | - **繁简转换**:将繁体字转换为简体字。例如:`蟲艸` -> `虫草` 31 | - **删除特定字符**:移除特定字符。例如:`*Fu&*iii&^%%*&kkkk` -> `Fuiiikkkk` 32 | - **规范化**:将特殊字符规范化为可识别字符。例如:`𝜢𝕰𝕃𝙻𝝧 𝙒ⓞᵣℒ𝒟!` -> `hello world!` 33 | - **拼音转换**:将汉字转换为拼音以进行模糊匹配。例如:`西安` -> ` xi an `, 匹配 `洗按` -> ` xi an `, 但不匹配 `先` -> ` xian ` 34 | - **拼音字符转换**:将汉字转换为拼音。例如:`西安` -> `xian`, 匹配 `洗按` 和 `先` -> `xian` 35 | - **与或非词匹配**: 36 | - 考虑单词的重复次数。 37 | - 例如:`hello&world` 匹配 `hello world` 和 `world,hello` 38 | - 例如:`无&法&无&天` 匹配 `无无法天`(因为 `无` 重复两次),但不匹配 `无法天` 39 | - 例如:`hello~helloo~hhello` 匹配 `hello` 但不匹配 `helloo` 和 `hhello` 40 | - **可定制的豁免列表**:排除特定单词的匹配。 41 | - **高效处理大型词列表**:针对性能进行了优化。 42 | 43 | ### Rust 用户 44 | 45 | 请参阅 [Rust README](./matcher_rs/README.md)。 46 | 47 | ### Python 用户 48 | 49 | 请参阅 [Python README](./matcher_py/README.md)。 50 | 51 | ### 
C, Java 和其他用户 52 | 53 | 我们提供动态链接库,请参阅 [C README](./matcher_c/README.md) 和 [Java README](./matcher_java/README.md)。 54 | 55 | #### 或从源构建 56 | 57 | ```shell 58 | git clone https://github.com/Lips7/Matcher.git 59 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 60 | cargo build --release 61 | ``` 62 | 63 | 在 `target/release` 文件夹底下找到 `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll`。 64 | 65 | #### 预构建的包 66 | 67 | 访问 [release page](https://github.com/Lips7/Matcher/releases) 来下载预构建的动态链接库. 68 | 69 | ## 性能测试 70 | 71 | 请参阅 [benchmarks](./matcher_rs/README.md#benchmarks) 查看更多细节。 -------------------------------------------------------------------------------- /data/word_list/cn/cn_words_100.txt: -------------------------------------------------------------------------------- 1 | 的 2 | 了 3 | 在 4 | 和 5 | 是 6 | 有 7 | 个 8 | 上 9 | 中 10 | 为 11 | 年 12 | 这 13 | 他 14 | 日 15 | 对 16 | 也 17 | 要 18 | 我 19 | 地 20 | 到 21 | 说 22 | 我们 23 | 就 24 | 人 25 | 不 26 | 等 27 | 工作 28 | 月 29 | 将 30 | 与 31 | 着 32 | 他们 33 | 以 34 | 人民 35 | 都 36 | 发展 37 | 大 38 | 把 39 | 后 40 | 从 41 | 来 42 | 还 43 | 两 44 | 元 45 | 而 46 | 进行 47 | 时 48 | 生产 49 | 新 50 | 中国 51 | 下 52 | 并 53 | 又 54 | 国家 55 | 问题 56 | 会 57 | 已 58 | 建设 59 | 好 60 | 向 61 | 被 62 | 企业 63 | 经济 64 | 但 65 | 出 66 | 自己 67 | 群众 68 | 使 69 | 市 70 | 没有 71 | 革命 72 | 里 73 | 做 74 | 用 75 | 领导 76 | 政府 77 | 名 78 | 她 79 | 这个 80 | 给 81 | 由 82 | 省 83 | 得 84 | 所 85 | 各 86 | 美国 87 | 前 88 | 次 89 | 该 90 | 于 91 | 今年 92 | 去 93 | 本 94 | 党 95 | 之 96 | 组织 97 | 据 98 | 提高 99 | 家 100 | 干部 101 | -------------------------------------------------------------------------------- /data/word_list/en/en_words_100.txt: -------------------------------------------------------------------------------- 1 | stampeding 2 | commendable 3 | adrenaline 4 | exobiology 5 | indifference 6 | avuncular 7 | prevailed 8 | foreparts 9 | legalistically 10 | intermarries 11 | desideratum 12 | evaluating 13 | lavishing 14 | attractable 15 | philippics 16 | antiabortionist 17 | lascivious 18 | breathable 19 | histogram 20 | rattlings 21 | interdict 22 | summarized 23 | relieving 24 | congresspeople 25 | fitfulness 26 | percolation 27 | upperclasswoman 28 | epistemic 29 | Chantilly 30 | stonemasons 31 | nonferrous 32 | emulsions 33 | charitably 34 | barracudas 35 | integrity 36 | knockdowns 37 | roadworks 38 | abortionists 39 | Salvadoran 40 | chanceries 41 | misinform 42 | caretaker 43 | extricated 44 | mandolins 45 | steeliest 46 | transpiration 47 | weirdness 48 | audiologists 49 | baronetcies 50 | performing 51 | publishing 52 | suspending 53 | dermatological 54 | contemplate 55 | spiritless 56 | nightwatchman 57 | paradisaical 58 | implicating 59 | timpanists 60 | Leavenworth 61 | amorality 62 | strangulated 63 | cellophane 64 | waterboard 65 | astrophysicists 66 | aerospace 67 | passphrase 68 | engendered 69 | spotlighting 70 | misapplication 71 | barterers 72 | poetesses 73 | dollhouse 74 | laparoscopic 75 | Dubrovnik 76 | rerecords 77 | shielding 78 | orthographically 79 | thicknesses 80 | Bendictus 81 | congealed 82 | cooperative 83 | encompass 84 | grouching 85 | shipowners 86 | jealously 87 | generational 88 | antecedents 89 | persecutes 90 | exemplified 91 | admirable 92 | squeakiest 93 | absconding 94 | extirpated 95 | exoskeletons 96 | earthworms 97 | chaotically 98 | shipbuilder 99 | equidistantly 100 | overprint -------------------------------------------------------------------------------- /matcher_c/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "matcher_c" 3 | authors.workspace = true 4 | categories.workspace = true 5 | description.workspace = true 6 | edition.workspace = true 7 | homepage.workspace = true 8 | keywords.workspace = true 9 | license.workspace = true 10 | repository.workspace = true 11 | version.workspace = true 12 | readme = "README.md" 13 | documentation = "https://docs.rs/crate/matcher_c/latest" 14 | 15 | [lib] 16 | name = "matcher_c" 17 | crate-type = ["cdylib", "rlib"] 18 | 19 | [dependencies] 20 | matcher_rs = { path = "../matcher_rs", version = "0.5.7" } 21 | sonic-rs = "0.5.1" 22 | -------------------------------------------------------------------------------- /matcher_c/README.md: -------------------------------------------------------------------------------- 1 | # Matcher Rust Implementation with C FFI Bindings 2 | 3 | ## Overview 4 | 5 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 6 | 7 | ## Installation 8 | 9 | ### Build from source 10 | 11 | ```shell 12 | git clone https://github.com/Lips7/Matcher.git 13 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 14 | cargo build --release 15 | ``` 16 | 17 | Then you should find `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll` in the `target/release` directory. 18 | 19 | ### Install pre-built binary 20 | 21 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 22 | 23 | ## Python usage example 24 | 25 | ```Python 26 | import json 27 | 28 | from cffi import FFI 29 | 30 | from extension_types import MatchTableType, ProcessType, MatchTable 31 | 32 | # define ffi 33 | ffi = FFI() 34 | ffi.cdef(open("./matcher_c.h", "r", encoding="utf-8").read()) 35 | lib = ffi.dlopen("./matcher_c.so") 36 | 37 | # init matcher 38 | matcher = lib.init_matcher( 39 | json.dumps({ 40 | 1: [ 41 | MatchTable( 42 | table_id=1, 43 | match_table_type=MatchTableType.Simple( 44 | process_type=ProcessType.MatchNone 45 | ), 46 | word_list=["hello,world", "hello", "world"], 47 | exemption_process_type=ProcessType.MatchNone, 48 | exemption_word_list=[], 49 | ) 50 | ] 51 | }).encode() 52 | ) 53 | 54 | # check for a match 55 | lib.matcher_is_match(matcher, "hello".encode("utf-8")) # True 56 | 57 | # match as list 58 | res = lib.matcher_process_as_string(matcher, "hello,world".encode("utf-8")) 59 | print(ffi.string(res).decode("utf-8")) 60 | # [{"match_id":1,"table_id":1,"word_id":0,"word":"hello,world","similarity":1.0},{"match_id":1,"table_id":1,"word_id":1,"word":"hello","similarity":1.0},{"match_id":1,"table_id":1,"word_id":2,"word":"world","similarity":1.0}] 61 | lib.drop_string(res) 62 | 63 | # match as dict 64 | res = lib.matcher_word_match_as_string(matcher, "hello,world".encode("utf-8")) 65 | print(ffi.string(res).decode("utf-8")) 66 | # {"1":[{"match_id":1,"table_id":1,"word_id":0,"word":"hello,world","similarity":1.0},{"match_id":1,"table_id":1,"word_id":1,"word":"hello","similarity":1.0},{"match_id":1,"table_id":1,"word_id":2,"word":"world","similarity":1.0}]} 67 | lib.drop_string(res) 68 | 69 | # drop matcher 70 | lib.drop_matcher(matcher) 71 | 72 | # init simple matcher 73 | simple_matcher = lib.init_simple_matcher( 74 | json.dumps({ 75 | ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar: { 76 | 1: "妳好&世界", 77 | 2: "hello", 78 | } 79 | }).encode() 80 | ) 81 | 82 | # check for a 
match 83 | lib.simple_matcher_is_match(simple_matcher, "你好世界".encode("utf-8")) # True 84 | 85 | # match as list 86 | res = lib.simple_matcher_process_as_string( 87 | simple_matcher, "nihaoshijie!hello!world!".encode("utf-8") 88 | ) 89 | print(ffi.string(res).decode("utf-8")) 90 | # [{"word_id":1,"word":"妳好&世界"},{"word_id":2,"word":"hello"}] 91 | lib.drop_string(res) 92 | 93 | # drop simple matcher 94 | lib.drop_simple_matcher(simple_matcher) 95 | ``` 96 | 97 | ## Important Notes 98 | 99 | 1. [extension_types.py](./extension_types.py) is not required; you can use the dynamic library directly. 100 | 2. Always call `drop_matcher`, `drop_simple_matcher`, and `drop_string` after initializing and processing to avoid memory leaks. -------------------------------------------------------------------------------- /matcher_c/extension_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, IntFlag 2 | from typing import Dict, List, TypedDict, Union 3 | 4 | 5 | class ProcessType(IntFlag): 6 | """ 7 | An enumeration representing various types of text processing operations. 8 | 9 | Attributes: 10 | MatchNone (IntFlag): No text transformation (binary 00000001). 11 | MatchFanjian (IntFlag): Convert traditional Chinese characters to simplified ones (binary 00000010). 12 | MatchDelete (IntFlag): Delete punctuation, special characters and white spaces (binary 00000100). 13 | MatchNormalize (IntFlag): Normalize English character and number variations to basic characters (binary 00001000). 14 | MatchDeleteNormalize (IntFlag): A combined operation that deletes and normalizes characters (binary 00001100). 15 | MatchFanjianDeleteNormalize (IntFlag): A combined operation that converts traditional Chinese characters to simplified ones, 16 | then deletes and normalizes (binary 00001110). 17 | MatchPinYin (IntFlag): Convert Chinese characters to Pinyin with word boundaries (binary 00010000). 18 | MatchPinYinChar (IntFlag): Convert Chinese characters to Pinyin without word boundaries (binary 00100000). 19 | """ 20 | 21 | MatchNone = 0b00000001 22 | MatchFanjian = 0b00000010 23 | MatchDelete = 0b00000100 24 | MatchNormalize = 0b00001000 25 | MatchDeleteNormalize = 0b00001100 26 | MatchFanjianDeleteNormalize = 0b00001110 27 | MatchPinYin = 0b00010000 28 | MatchPinYinChar = 0b00100000 29 | 30 | 31 | class RegexMatchType(str, Enum): 32 | """ 33 | An enumeration representing various types of regex matching operations. 34 | 35 | Attributes: 36 | MatchSimilarChar (str): An operation that matches characters that are similar in some way. 37 | MatchAcrostic (str): An operation that matches acrostic patterns. 38 | MatchRegex (str): An operation that matches using standard regular expressions. 39 | """ 40 | 41 | MatchSimilarChar = "similar_char" 42 | MatchAcrostic = "acrostic" 43 | MatchRegex = "regex" 44 | 45 | 46 | class SimMatchType(str, Enum): 47 | """ 48 | An enumeration representing various types of similarity matching operations. 49 | 50 | Attributes: 51 | MatchLevenshtein (str): An operation that matches using the Levenshtein distance metric. 52 | """ 53 | 54 | MatchLevenshtein = "levenshtein" 55 | 56 | 57 | class Simple(TypedDict): 58 | """ 59 | A TypedDict representing a simple text processing operation. 60 | 61 | Attributes: 62 | process_type (ProcessType): The type of processing operation to be performed. 
63 | """ 64 | 65 | process_type: ProcessType 66 | 67 | 68 | class Regex(TypedDict): 69 | """ 70 | A TypedDict representing a regex-based text processing operation. 71 | 72 | Attributes: 73 | process_type (ProcessType): The type of processing operation to be performed. 74 | regex_match_type (RegexMatchType): The type of regex matching operation to be used. 75 | """ 76 | 77 | process_type: ProcessType 78 | regex_match_type: RegexMatchType 79 | 80 | 81 | class Similar(TypedDict): 82 | """ 83 | A TypedDict representing a similarity-based text processing operation. 84 | 85 | Attributes: 86 | process_type (ProcessType): The type of processing operation to be performed. 87 | sim_match_type (SimMatchType): The type of similarity matching operation to be used. 88 | threshold (float): The threshold value for the similarity matching operation. 89 | """ 90 | 91 | process_type: ProcessType 92 | sim_match_type: SimMatchType 93 | threshold: float 94 | 95 | 96 | class MatchTableType: 97 | def Simple(process_type: ProcessType) -> Dict[str, Simple]: 98 | """ 99 | Create a dictionary representing a simple text processing operation. 100 | 101 | Args: 102 | process_type (ProcessType): The type of processing operation to be performed. 103 | 104 | Returns: 105 | Dict[str, Simple]: A dictionary with one key "simple" mapping to a Simple TypedDict 106 | containing the provided process_type. 107 | """ 108 | return {"simple": Simple(process_type=process_type)} 109 | 110 | def Regex( 111 | process_type: ProcessType, regex_match_type: RegexMatchType 112 | ) -> Dict[str, Regex]: 113 | """ 114 | Create a dictionary representing a regex-based text processing operation. 115 | 116 | Args: 117 | process_type (ProcessType): The type of processing operation to be performed. 118 | regex_match_type (RegexMatchType): The type of regex matching operation to be used. 119 | 120 | Returns: 121 | Dict[str, Regex]: A dictionary with one key "regex" mapping to a Regex TypedDict 122 | containing the provided process_type and regex_match_type. 123 | """ 124 | return { 125 | "regex": Regex(process_type=process_type, regex_match_type=regex_match_type) 126 | } 127 | 128 | def Similar( 129 | process_type: ProcessType, sim_match_type: SimMatchType, threshold: float 130 | ) -> Dict[str, Similar]: 131 | """ 132 | Create a dictionary representing a similarity-based text processing operation. 133 | Args: 134 | process_type (ProcessType): The type of processing operation to be performed. 135 | sim_match_type (SimMatchType): The type of similarity matching operation to be used. 136 | threshold (float): The threshold value for the similarity matching operation. 137 | 138 | Returns: 139 | Dict[str, Similar]: A dictionary with one key "similar" mapping to a Similar TypedDict 140 | containing the provided process_type, sim_match_type, and threshold. 141 | """ 142 | return { 143 | "similar": Similar( 144 | process_type=process_type, 145 | sim_match_type=sim_match_type, 146 | threshold=threshold, 147 | ) 148 | } 149 | 150 | 151 | class MatchTable(TypedDict): 152 | """ 153 | A TypedDict representing a table for matching operations. 154 | 155 | Attributes: 156 | table_id (int): A unique identifier for the match table. 157 | match_table_type (Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]]): 158 | A dictionary that specifies the type of match operation to be performed. The key is a string indicating 159 | the match type ('simple', 'regex', 'similar'), and the value is a corresponding TypedDict describing 160 | the operation. 
161 | word_list (List[str]): A list of words that are subject to the matching operations. 162 | exemption_process_type (ProcessType): The type of process for which certain words are exempt from matching. 163 | exemption_word_list (List[str]): A list of words that are exempt from the matching process. 164 | """ 165 | 166 | table_id: int 167 | match_table_type: Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]] 168 | word_list: List[str] 169 | exemption_process_type: ProcessType 170 | exemption_word_list: List[str] 171 | 172 | 173 | MatchTableMap = Dict[int, List[MatchTable]] 174 | """ 175 | A type alias for mapping table identifiers to lists of MatchTable objects. 176 | 177 | Type: 178 | Dict[int, List[MatchTable]] 179 | 180 | This dictionary maps an integer table ID to a list of MatchTable objects that correspond to the ID. It is used to 181 | organize and retrieve match tables based on their unique identifiers. 182 | """ 183 | 184 | 185 | class MatchResult(TypedDict): 186 | """ 187 | A TypedDict representing the result of a matching operation. 188 | 189 | Attributes: 190 | match_id (int): A unique identifier for the match result. 191 | table_id (int): The identifier of the match table where the matching operation was performed. 192 | word_id (int): The identifier of the matched word within the word list. 193 | word (str): The matched word. 194 | similarity (float): The similarity score of the match operation. 195 | """ 196 | 197 | match_id: int 198 | table_id: int 199 | word_id: int 200 | word: str 201 | similarity: float 202 | 203 | 204 | SimpleTable = Dict[ProcessType, Dict[int, str]] 205 | """ 206 | A type alias for representing a simple table structure for text processing. 207 | 208 | This dictionary maps a `ProcessType` to another dictionary that maps an integer ID to a string. 209 | The outer dictionary's keys represent different types of processing operations, while the inner 210 | dictionary's keys represent unique identifiers corresponding to specific strings related to the 211 | operations. 212 | 213 | Type: 214 | Dict[ProcessType, Dict[int, str]] 215 | """ 216 | 217 | 218 | class SimpleResult(TypedDict): 219 | """ 220 | A TypedDict representing a simplified result of a text processing operation. 221 | 222 | Attributes: 223 | word_id (int): The identifier of the word within the word list. 224 | word (str): The word corresponding to the word_id. 
225 | """ 226 | 227 | word_id: int 228 | word: str 229 | -------------------------------------------------------------------------------- /matcher_c/matcher_c.h: -------------------------------------------------------------------------------- 1 | void* init_matcher(char* match_table_map_bytes); 2 | bool matcher_is_match(void* matcher, char* text); 3 | char* matcher_process_as_string(void* matcher, char* text); 4 | char* matcher_word_match_as_string(void* matcher, char* text); 5 | void drop_matcher(void* matcher); 6 | 7 | void* init_simple_matcher(char* simple_table_bytes); 8 | bool simple_matcher_is_match(void* simple_matcher, char* text); 9 | char* simple_matcher_process_as_string(void* simple_matcher, char* text); 10 | void drop_simple_matcher(void* simple_matcher); 11 | 12 | void drop_string(char* ptr); -------------------------------------------------------------------------------- /matcher_c/src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | ffi::{c_char, CStr, CString}, 3 | str, 4 | }; 5 | 6 | use matcher_rs::{ 7 | MatchTableMapSerde as MatchTableMap, Matcher, SimpleMatcher, SimpleTableSerde as SimpleTable, 8 | TextMatcherTrait, 9 | }; 10 | 11 | /// Initializes a `Matcher` from a serialized `MatchTableMap` in MessagePack format. 12 | /// 13 | /// # Safety 14 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 15 | /// that `match_table_map_bytes` points to a valid null-terminated C string containing a 16 | /// serialized `MatchTableMap`, and that the string remains valid for the duration of the call. 17 | /// 18 | /// # Parameters 19 | /// - `match_table_map_bytes`: A pointer to a C string containing the serialized `MatchTableMap`. 20 | /// 21 | /// # Returns 22 | /// A raw pointer to the newly created `Matcher`. The caller is responsible for managing the 23 | /// lifetime of this pointer and must eventually call `drop` on it to free the memory. 24 | /// 25 | /// # Panics 26 | /// This function will panic if the input data cannot be deserialized into a `MatchTableMap`. 27 | #[no_mangle] 28 | pub unsafe extern "C" fn init_matcher(match_table_map_bytes: *const c_char) -> *mut Matcher { 29 | unsafe { 30 | let match_table_map: MatchTableMap = match sonic_rs::from_slice( 31 | CStr::from_ptr(match_table_map_bytes).to_bytes(), 32 | ) { 33 | Ok(match_table_map) => match_table_map, 34 | Err(e) => { 35 | panic!("Deserialize match_table_map_bytes failed, Please check the input data.\nErr: {}", e) 36 | } 37 | }; 38 | 39 | Box::into_raw(Box::new(Matcher::new(&match_table_map))) 40 | } 41 | } 42 | 43 | /// Checks if the given text matches any pattern in the Matcher. 44 | /// 45 | /// # Safety 46 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 47 | /// that `matcher` points to a valid `Matcher` instance and that `text` points to a valid 48 | /// null-terminated C string. Both the `matcher` and the `text` must remain valid for the 49 | /// duration of the call. 50 | /// 51 | /// # Parameters 52 | /// - `matcher`: A pointer to the `Matcher` instance. 53 | /// - `text`: A pointer to a C string containing the text to be checked for matches. 54 | /// 55 | /// # Returns 56 | /// - `true` if the text matches any pattern in the `Matcher`. 57 | /// - `false` otherwise. 58 | /// 59 | /// # Panics 60 | /// This function will panic if the input `text` is not a valid UTF-8 string. 
61 | #[no_mangle] 62 | pub unsafe extern "C" fn matcher_is_match(matcher: *mut Matcher, text: *const c_char) -> bool { 63 | unsafe { 64 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 65 | match text { 66 | Ok(text) => matcher.as_ref().unwrap().is_match(text), 67 | Err(_) => { 68 | panic!("Input is not a valid utf-8 string"); 69 | } 70 | } 71 | } 72 | } 73 | 74 | /// Processes the input text through the Matcher and returns the result as a C string. 75 | /// 76 | /// # Safety 77 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 78 | /// that `matcher` points to a valid `Matcher` instance and that `text` points to a valid 79 | /// null-terminated C string. Both the `matcher` and the `text` must remain valid for the 80 | /// duration of the call. 81 | /// 82 | /// # Parameters 83 | /// - `matcher`: A pointer to the `Matcher` instance. 84 | /// - `text`: A pointer to a C string containing the text to be processed. 85 | /// 86 | /// # Returns 87 | /// A pointer to a newly allocated C string containing the processing result. The caller is 88 | /// responsible for managing the lifetime of this pointer and must eventually call `drop_string` 89 | /// on it to free the memory. 90 | /// 91 | /// # Panics 92 | /// This function will panic if the input `text` is not a valid UTF-8 string or if the 93 | /// serialization of the result fails. 94 | #[no_mangle] 95 | pub unsafe extern "C" fn matcher_process_as_string( 96 | matcher: *mut Matcher, 97 | text: *const c_char, 98 | ) -> *mut c_char { 99 | unsafe { 100 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 101 | let res = match text { 102 | Ok(text) => matcher.as_ref().unwrap().process(text), 103 | Err(_) => { 104 | panic!("Input is not a valid utf-8 string"); 105 | } 106 | }; 107 | let res_cstring = CString::new(sonic_rs::to_vec(&res).unwrap_unchecked()).unwrap(); 108 | res_cstring.into_raw() 109 | } 110 | } 111 | 112 | /// Processes the input text through the `Matcher` and returns the word match result as a C string. 113 | /// 114 | /// # Safety 115 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 116 | /// that `matcher` points to a valid `Matcher` instance and that `text` points to a valid 117 | /// null-terminated C string. Both the `matcher` and the `text` must remain valid for the 118 | /// duration of the call. 119 | /// 120 | /// # Parameters 121 | /// - `matcher`: A pointer to the `Matcher` instance. 122 | /// - `text`: A pointer to a C string containing the text to be processed. 123 | /// 124 | /// # Returns 125 | /// A pointer to a newly allocated C string containing the word match processing result. 126 | /// The caller is responsible for managing the lifetime of this pointer and must eventually 127 | /// call `drop_string` on it to free the memory. 128 | /// 129 | /// # Panics 130 | /// This function will panic if the input `text` is not a valid UTF-8 string. 
131 | #[no_mangle] 132 | pub unsafe extern "C" fn matcher_word_match_as_string( 133 | matcher: *mut Matcher, 134 | text: *const c_char, 135 | ) -> *mut c_char { 136 | unsafe { 137 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 138 | let res = match text { 139 | Ok(text) => { 140 | sonic_rs::to_string(&matcher.as_ref().unwrap().word_match(text)).unwrap_unchecked() 141 | } 142 | Err(_) => { 143 | panic!("Input is not a valid utf-8 string"); 144 | } 145 | }; 146 | let res_cstring = CString::new(res).unwrap(); 147 | res_cstring.into_raw() 148 | } 149 | } 150 | 151 | /// Frees the memory allocated for the `Matcher` instance. 152 | /// 153 | /// # Safety 154 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 155 | /// that `matcher` points to a valid `Matcher` instance. This function transfers ownership 156 | /// of the raw pointer and deallocates the memory, so the caller must not use the `matcher` 157 | /// pointer after calling this function. 158 | /// 159 | /// # Parameters 160 | /// - `matcher`: A pointer to the `Matcher` instance to be deallocated. 161 | #[no_mangle] 162 | pub unsafe extern "C" fn drop_matcher(matcher: *mut Matcher) { 163 | unsafe { drop(Box::from_raw(matcher)) } 164 | } 165 | 166 | /// Initializes a `SimpleMatcher` instance from serialized table bytes. 167 | /// 168 | /// # Safety 169 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 170 | /// that `simple_table_bytes` points to a valid null-terminated C string. The returned 171 | /// `SimpleMatcher` pointer must be properly managed and eventually deallocated by calling 172 | /// `drop_simple_matcher`. 173 | /// 174 | /// # Parameters 175 | /// - `simple_table_bytes`: A pointer to a C string containing the serialized table bytes. 176 | /// 177 | /// # Returns 178 | /// A pointer to a newly allocated `SimpleMatcher` instance. The caller is responsible for managing 179 | /// the lifetime of this pointer and must eventually call `drop_simple_matcher` to free the memory. 180 | /// 181 | /// # Panics 182 | /// This function will panic if the deserialization of `simple_table_bytes` fails. 183 | #[no_mangle] 184 | pub unsafe extern "C" fn init_simple_matcher( 185 | simple_table_bytes: *const c_char, 186 | ) -> *mut SimpleMatcher { 187 | unsafe { 188 | let simple_table: SimpleTable = 189 | match sonic_rs::from_slice(CStr::from_ptr(simple_table_bytes).to_bytes()) { 190 | Ok(simple_table) => simple_table, 191 | Err(e) => { 192 | panic!( 193 | "Deserialize simple_table_bytes failed, Please check the input data.\nErr: {}", 194 | e, 195 | ) 196 | } 197 | }; 198 | 199 | Box::into_raw(Box::new(SimpleMatcher::new(&simple_table))) 200 | } 201 | } 202 | 203 | /// Determines if the input text matches using the `SimpleMatcher`. 204 | /// 205 | /// # Safety 206 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 207 | /// that `simple_matcher` points to a valid `SimpleMatcher` instance and that `text` points to a 208 | /// valid null-terminated C string. Both the `simple_matcher` and the `text` must remain valid for 209 | /// the duration of the call. 210 | /// 211 | /// # Parameters 212 | /// - `simple_matcher`: A pointer to the `SimpleMatcher` instance. 213 | /// - `text`: A pointer to a C string containing the text to be processed. 214 | /// 215 | /// # Returns 216 | /// A boolean indicating whether the text matches based on the `SimpleMatcher`. 
217 | /// 218 | /// # Panics 219 | /// This function will panic if the input `text` is not a valid UTF-8 string. 220 | #[no_mangle] 221 | pub unsafe extern "C" fn simple_matcher_is_match( 222 | simple_matcher: *mut SimpleMatcher, 223 | text: *const c_char, 224 | ) -> bool { 225 | unsafe { 226 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 227 | match text { 228 | Ok(text) => simple_matcher.as_ref().unwrap().is_match(text), 229 | Err(_) => { 230 | panic!("Input is not a valid utf-8 string"); 231 | } 232 | } 233 | } 234 | } 235 | 236 | /// Processes the input text using the `SimpleMatcher` and returns the result as a C string. 237 | /// 238 | /// # Safety 239 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 240 | /// that `simple_matcher` points to a valid `SimpleMatcher` instance and that `text` points to a 241 | /// valid null-terminated C string. Both `simple_matcher` and `text` must remain valid for the 242 | /// duration of the call. 243 | /// 244 | /// # Parameters 245 | /// - `simple_matcher`: A pointer to the `SimpleMatcher` instance. 246 | /// - `text`: A pointer to a C string containing the text to be processed. 247 | /// 248 | /// # Returns 249 | /// A pointer to a newly allocated C string containing the processing result. The caller is 250 | /// responsible for managing the lifetime of this pointer and must eventually call 251 | /// `drop_string` on it to free the memory. 252 | /// 253 | /// # Panics 254 | /// This function will panic if the input `text` is not a valid UTF-8 string. 255 | #[no_mangle] 256 | pub unsafe extern "C" fn simple_matcher_process_as_string( 257 | simple_matcher: *mut SimpleMatcher, 258 | text: *const c_char, 259 | ) -> *mut c_char { 260 | unsafe { 261 | let text = str::from_utf8(CStr::from_ptr(text).to_bytes()); 262 | let res = match text { 263 | Ok(text) => simple_matcher.as_ref().unwrap().process(text), 264 | Err(_) => { 265 | panic!("Input is not a valid utf-8 string"); 266 | } 267 | }; 268 | let res_cstring = CString::new(sonic_rs::to_vec(&res).unwrap_unchecked()).unwrap(); 269 | res_cstring.into_raw() 270 | } 271 | } 272 | 273 | /// Deallocates a `SimpleMatcher` instance. 274 | /// 275 | /// # Safety 276 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 277 | /// that `simple_matcher` points to a valid `SimpleMatcher` instance that was previously allocated 278 | /// by `init_simple_matcher`. After calling this function, the `simple_matcher` pointer must not be 279 | /// used again as it points to deallocated memory. 280 | /// 281 | /// # Parameters 282 | /// - `simple_matcher`: A pointer to the `SimpleMatcher` instance to be deallocated. 283 | #[no_mangle] 284 | pub unsafe extern "C" fn drop_simple_matcher(simple_matcher: *mut SimpleMatcher) { 285 | unsafe { drop(Box::from_raw(simple_matcher)) } 286 | } 287 | 288 | /// Deallocates a C string that was previously allocated by the Rust code and passed to C. 289 | /// 290 | /// # Safety 291 | /// This function is unsafe because it relies on raw pointers and FFI. The caller must ensure 292 | /// that `ptr` points to a valid C string that was previously allocated by Rust code using 293 | /// `CString::into_raw` or a similar method. After calling this function, the `ptr` pointer must 294 | /// not be used again as it points to deallocated memory. 295 | /// 296 | /// # Parameters 297 | /// - `ptr`: A pointer to the C string to be deallocated. 
298 | #[no_mangle] 299 | pub unsafe extern "C" fn drop_string(ptr: *mut c_char) { 300 | unsafe { drop(CString::from_raw(ptr)) } 301 | } 302 | -------------------------------------------------------------------------------- /matcher_java/README.md: -------------------------------------------------------------------------------- 1 | # Matcher Rust Implementation with Java FFI Bindings 2 | 3 | ## Overview 4 | 5 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 6 | 7 | ## Installation 8 | 9 | ### Build from source 10 | 11 | ```shell 12 | git clone https://github.com/Lips7/Matcher.git 13 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain nightly -y 14 | cargo build --release 15 | ``` 16 | 17 | Then you should find `libmatcher_c.so`/`libmatcher_c.dylib`/`matcher_c.dll` in the `target/release` directory. 18 | 19 | ### Install pre-built binary 20 | 21 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 22 | 23 | ## Java usage example 24 | 25 | Put the `matcher_c` dynamic library under the `src/main/resources` directory. 26 | 27 | Copy the code below or refer to [MatcherJavaExample.java](./src/test/java/com/matcher_java/MatcherJavaExample.java). 28 | 29 | ```java 30 | package com.matcher_java; 31 | 32 | import com.alibaba.fastjson.JSON; 33 | import com.alibaba.fastjson.serializer.SerializeConfig; 34 | import com.matcher_java.extension_types.MatchTable; 35 | import com.matcher_java.extension_types.MatchTableType; 36 | import com.matcher_java.extension_types.ProcessType; 37 | import com.matcher_java.extension_types.ProcessTypeSerializer; 38 | import com.sun.jna.Pointer; 39 | 40 | import java.io.IOException; 41 | import java.util.ArrayList; 42 | import java.util.HashMap; 43 | import java.util.List; 44 | import java.util.Map; 45 | 46 | public class MatcherJavaExample { 47 | public static void main(String[] args) throws IOException { 48 | System.out.println("Simple Matcher Test"); 49 | simple_matcher_process_demo(); 50 | 51 | System.out.println("\n"); 52 | 53 | System.out.println("Matcher Test"); 54 | matcher_process_demo(); 55 | } 56 | 57 | public static void simple_matcher_process_demo() throws IOException { 58 | SerializeConfig serializeConfig = new SerializeConfig(); 59 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 60 | 61 | Map<ProcessType, Map<String, String>> simpleTable = new HashMap<>(); 62 | Map<String, String> wordMap = new HashMap<>(); 63 | wordMap.put("1", "hello&world"); 64 | simpleTable.put(ProcessType.MatchNone, wordMap); 65 | 66 | String simpleTableStr = JSON.toJSONString(simpleTable, serializeConfig); 67 | System.out.printf("simple_table: %s\n", simpleTableStr); 68 | 69 | byte[] simpleTableBytes = JSON.toJSONBytes(simpleTable, serializeConfig); 70 | 71 | MatcherJava instance = MatcherJava.INSTANCE; 72 | 73 | Pointer simpleMatcher = instance.init_simple_matcher(simpleTableBytes); 74 | 75 | byte[] strBytes = "hello,world".getBytes("utf-8"); 76 | byte[] cStrBytes = new byte[strBytes.length + 1]; 77 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 78 | 79 | boolean isMatch = instance.simple_matcher_is_match(simpleMatcher, cStrBytes); 80 | System.out.printf("isMatch: %s\n", isMatch); 81 | 82 | Pointer matchResPtr = instance.simple_matcher_process_as_string(simpleMatcher, cStrBytes); 83 | String matchRes = matchResPtr.getString(0, "utf-8"); 84 | System.out.printf("matchRes: %s\n", matchRes); 85 | instance.drop_string(matchResPtr); 86 | 
87 | instance.drop_simple_matcher(simpleMatcher); 88 | } 89 | 90 | public static void matcher_process_demo() throws IOException { 91 | SerializeConfig serializeConfig = new SerializeConfig(); 92 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 93 | 94 | Map<String, List<MatchTable>> matchTableMap = new HashMap<>(); 95 | List<MatchTable> matchTableList = new ArrayList<>(); 96 | MatchTable matchTable = new MatchTable(1, MatchTableType.Simple(ProcessType.MatchNone), List.of("hello&world"), ProcessType.MatchNone, List.of()); 97 | matchTableList.add(matchTable); 98 | matchTableMap.put("1", matchTableList); 99 | 100 | String matchTableMapStr = JSON.toJSONString(matchTableMap, serializeConfig); 101 | System.out.printf("match_table_map: %s\n", matchTableMapStr); 102 | 103 | byte[] matchTableMapBytes = JSON.toJSONBytes(matchTableMap, serializeConfig); 104 | 105 | MatcherJava instance = MatcherJava.INSTANCE; 106 | 107 | Pointer matcher = instance.init_matcher(matchTableMapBytes); 108 | 109 | byte[] strBytes = "hello,world".getBytes("utf-8"); 110 | byte[] cStrBytes = new byte[strBytes.length + 1]; 111 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 112 | 113 | boolean isMatch = instance.matcher_is_match(matcher, cStrBytes); 114 | System.out.printf("isMatch: %s\n", isMatch); 115 | 116 | Pointer matchResPtr1 = instance.matcher_process_as_string(matcher, cStrBytes); 117 | String matchRes1 = matchResPtr1.getString(0, "utf-8"); 118 | System.out.printf("matchRes: %s\n", matchRes1); 119 | instance.drop_string(matchResPtr1); 120 | 121 | Pointer matchResPtr2 = instance.matcher_word_match_as_string(matcher, cStrBytes); 122 | String matchRes2 = matchResPtr2.getString(0, "utf-8"); 123 | System.out.printf("matchRes: %s\n", matchRes2); 124 | instance.drop_string(matchResPtr2); 125 | 126 | instance.drop_matcher(matcher); 127 | } 128 | } 129 | ``` 130 | 131 | ## Important Notes 132 | 133 | Always call `drop_matcher`, `drop_simple_matcher`, and `drop_string` after initializing and processing to avoid memory leaks.
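A defensive pattern for this is to pair each `init_*` / `*_as_string` call with its matching `drop_*` call in a `try`/`finally` block, so the native memory is released even when an exception is thrown in between. A minimal sketch, reusing the `MatcherJava` interface and the `matchTableMapBytes`/`cStrBytes` variables from the example above:

```java
MatcherJava lib = MatcherJava.INSTANCE;
Pointer matcher = lib.init_matcher(matchTableMapBytes);
try {
    Pointer resPtr = lib.matcher_process_as_string(matcher, cStrBytes);
    try {
        System.out.println(resPtr.getString(0, "utf-8"));
    } finally {
        lib.drop_string(resPtr); // always release the Rust-allocated string
    }
} finally {
    lib.drop_matcher(matcher); // always release the matcher itself
}
```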
-------------------------------------------------------------------------------- /matcher_java/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>com.matcher_java</groupId> 6 | <artifactId>matcher_java</artifactId> 7 | <version>0.5.7</version> 8 | 9 | <name>matcher_java</name> 10 | <url>https://github.com/Lips7/Matcher</url> 11 | 12 | <properties> 13 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 14 | <maven.compiler.source>23</maven.compiler.source> 15 | <maven.compiler.target>23</maven.compiler.target> 16 | </properties> 17 | 18 | <dependencies> 19 | <dependency> 20 | <groupId>junit</groupId> 21 | <artifactId>junit</artifactId> 22 | <version>4.11</version> 23 | <scope>test</scope> 24 | </dependency> 25 | <dependency> 26 | <groupId>com.alibaba</groupId> 27 | <artifactId>fastjson</artifactId> 28 | <version>2.0.28</version> 29 | </dependency> 30 | <dependency> 31 | <groupId>net.java.dev.jna</groupId> 32 | <artifactId>jna</artifactId> 33 | <version>5.14.0</version> 34 | </dependency> 35 | </dependencies> 36 | 37 | <build> 38 | <pluginManagement> 39 | <plugins> 40 | <plugin><artifactId>maven-clean-plugin</artifactId><version>3.1.0</version></plugin> 41 | <plugin><artifactId>maven-resources-plugin</artifactId><version>3.0.2</version></plugin> 42 | <plugin><artifactId>maven-compiler-plugin</artifactId><version>3.8.0</version></plugin> 43 | <plugin><artifactId>maven-surefire-plugin</artifactId><version>2.22.1</version></plugin> 44 | <plugin><artifactId>maven-jar-plugin</artifactId><version>3.0.2</version></plugin> 45 | <plugin><artifactId>maven-install-plugin</artifactId><version>2.5.2</version></plugin> 46 | <plugin><artifactId>maven-deploy-plugin</artifactId><version>2.8.2</version></plugin> 47 | <plugin><artifactId>maven-site-plugin</artifactId><version>3.7.1</version></plugin> 48 | <plugin><artifactId>maven-project-info-reports-plugin</artifactId><version>3.0.0</version></plugin> 49 | </plugins> 50 | </pluginManagement> 51 | </build> 52 | </project> -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/MatcherJava.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java; 2 | 3 | import com.sun.jna.Library; 4 | import com.sun.jna.Native; 5 | import com.sun.jna.Pointer; 6 | 7 | interface MatcherJava extends Library { 8 | MatcherJava INSTANCE = (MatcherJava) Native.load( 9 | MatcherJava.class.getResource("/matcher_c.so").getPath(), 10 | MatcherJava.class); 11 | 12 | Pointer init_matcher(byte[] match_table_map_bytes); 13 | 14 | boolean matcher_is_match(Pointer matcher, byte[] text_bytes); 15 | 16 | Pointer matcher_process_as_string(Pointer matcher, byte[] text_bytes); 17 | 18 | Pointer matcher_word_match_as_string(Pointer matcher, byte[] text_bytes); 19 | 20 | void drop_matcher(Pointer matcher); 21 | 22 | Pointer init_simple_matcher(byte[] simple_table_bytes); 23 | 24 | boolean simple_matcher_is_match(Pointer simple_matcher, byte[] text_bytes); 25 | 26 | Pointer simple_matcher_process_as_string(Pointer simple_matcher, byte[] text_bytes); 27 | 28 | void drop_simple_matcher(Pointer simple_matcher); 29 | 30 | void drop_string(Pointer ptr); 31 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/MatchResult.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | public class MatchResult { 4 | private int match_id; 5 | private int table_id; 6 | private int word_id; 7 | private String word; 8 | private float similarity; 9 | 10 | public MatchResult(int match_id, int table_id, int word_id, String word, float similarity) { 11 | this.match_id = match_id; 12 | this.table_id = table_id; 13 | this.word_id = word_id; 14 | this.word = word; 15 | this.similarity = similarity; 16 | } 17 | 18 | public int getMatchId() { 19 | return match_id; 20 | } 21 | 22 | public void setMatchId(int match_id) { 23 | this.match_id = match_id; 24 | } 25 | 26 | public int getTableId() { 27 | return table_id; 28 | } 29 | 30 | public void setTableId(int table_id) { 31 | this.table_id = table_id; 32 | } 33 | 34 | public int getWordId() { 35 | return word_id; 36 | } 37 | 38 | public void setWordId(int word_id) { 39 | this.word_id = word_id; 40 | } 41 | 42 | public String getWord() { 43 | return word; 44 | } 45 | 46 | public void setWord(String word) { 
this.word = word; 48 | } 49 | 50 | public float getSimilarity() { 51 | return similarity; 52 | } 53 | 54 | public void setSimilarity(float similarity) { 55 | this.similarity = similarity; 56 | } 57 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/MatchTable.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import com.alibaba.fastjson.PropertyNamingStrategy; 7 | import com.alibaba.fastjson.annotation.JSONType; 8 | 9 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 10 | public class MatchTable { 11 | private int table_id; 12 | private Map<String, Object> match_table_type; 13 | private List<String> word_List; 14 | private ProcessType exemption_process_type; 15 | private List<String> exemption_word_list; 16 | 17 | public MatchTable(int table_id, Map<String, Object> match_table_type, List<String> word_List, 18 | ProcessType exemption_process_type, List<String> exemption_word_list) { 19 | this.table_id = table_id; 20 | this.match_table_type = match_table_type; 21 | this.word_List = word_List; 22 | this.exemption_process_type = exemption_process_type; 23 | this.exemption_word_list = exemption_word_list; 24 | } 25 | 26 | public int getTableId() { 27 | return table_id; 28 | } 29 | 30 | public void setTableId(int table_id) { 31 | this.table_id = table_id; 32 | } 33 | 34 | public Map<String, Object> getMatchTableType() { 35 | return match_table_type; 36 | } 37 | 38 | public void setMatchTableType(Map<String, Object> match_table_type) { 39 | this.match_table_type = match_table_type; 40 | } 41 | 42 | public List<String> getWordList() { 43 | return word_List; 44 | } 45 | 46 | public void setWordList(List<String> word_List) { 47 | this.word_List = word_List; 48 | } 49 | 50 | public ProcessType getExemptionProcessType() { 51 | return exemption_process_type; 52 | } 53 | 54 | public void setExemptionProcessType(ProcessType exemption_process_type) { 55 | this.exemption_process_type = exemption_process_type; 56 | } 57 | 58 | public List<String> getExemptionWordList() { 59 | return exemption_word_list; 60 | } 61 | 62 | public void setExemptionWordList(List<String> exemption_word_list) { 63 | this.exemption_word_list = exemption_word_list; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/MatchTableType.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | 6 | public class MatchTableType { 7 | public static Map<String, Object> Simple(ProcessType processType) { 8 | Map<String, Object> map = new HashMap<>(); 9 | map.put("simple", new Simple(processType)); 10 | return map; 11 | } 12 | 13 | public static Map<String, Object> Regex(ProcessType processType, RegexMatchType regexMatchType) { 14 | Map<String, Object> map = new HashMap<>(); 15 | map.put("regex", new Regex(processType, regexMatchType)); 16 | return map; 17 | } 18 | 19 | public static Map<String, Object> Similar(ProcessType processType, SimMatchType simMatchType, 20 | float threshold) { 21 | Map<String, Object> map = new HashMap<>(); 22 | map.put("similar", new Similar(processType, simMatchType, threshold)); 23 | return map; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/ProcessType.java: -------------------------------------------------------------------------------- 1 
| package com.matcher_java.extension_types; 2 | 3 | public enum ProcessType { 4 | MatchNone(0b00000001), 5 | MatchFanjian(0b00000010), 6 | MatchDelete(0b00000100), 7 | MatchNormalize(0b00001000), 8 | MatchDeleteNormalize(0b00001100), 9 | MatchFanjianDeleteNormalize(0b00001110), 10 | MatchPinYin(0b00010000), 11 | MatchPinYinChar(0b00100000); 12 | 13 | private final int value; 14 | 15 | ProcessType(int value) { 16 | this.value = value; 17 | } 18 | 19 | public int getValue() { 20 | return value; 21 | } 22 | 23 | public String toString() { 24 | return String.valueOf(value); 25 | } 26 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/ProcessTypeSerializer.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.Type; 5 | 6 | import com.alibaba.fastjson.serializer.JSONSerializer; 7 | import com.alibaba.fastjson.serializer.ObjectSerializer; 8 | 9 | public class ProcessTypeSerializer implements ObjectSerializer { 10 | @Override 11 | public void write(JSONSerializer serializer, Object object, Object fieldName, Type fieldType, int features) 12 | throws IOException { 13 | ProcessType processType = (ProcessType) object; 14 | if (fieldName != null) { 15 | serializer.write(processType.getValue()); 16 | } else { 17 | serializer.write(processType.toString()); 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/Regex.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.PropertyNamingStrategy; 4 | import com.alibaba.fastjson.annotation.JSONType; 5 | 6 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 7 | public class Regex { 8 | private ProcessType process_type; 9 | private RegexMatchType regex_match_type; 10 | 11 | public Regex(ProcessType process_type, RegexMatchType regexMatchType) { 12 | this.process_type = process_type; 13 | this.regex_match_type = regexMatchType; 14 | } 15 | 16 | public ProcessType getProcessType() { 17 | return process_type; 18 | } 19 | 20 | public RegexMatchType getRegexMatchType() { 21 | return regex_match_type; 22 | } 23 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/RegexMatchType.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.annotation.JSONField; 4 | 5 | public enum RegexMatchType { 6 | MatchSimilarChar("similar_char"), 7 | MatchAcrostic("acrostic"), 8 | MatchRegex("regex"); 9 | 10 | private final String value; 11 | 12 | RegexMatchType(String value) { 13 | this.value = value; 14 | } 15 | 16 | @JSONField 17 | public String getValue() { 18 | return value; 19 | } 20 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/SimMatchType.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.annotation.JSONField; 4 | 5 | public enum SimMatchType { 6 | MatchLevenshtein("levenshtein"); 7 | 8 | private final 
String value; 9 | 10 | SimMatchType(String value) { 11 | this.value = value; 12 | } 13 | 14 | @JSONField 15 | public String getValue() { 16 | return value; 17 | } 18 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/Similar.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.PropertyNamingStrategy; 4 | import com.alibaba.fastjson.annotation.JSONType; 5 | 6 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 7 | public class Similar { 8 | private ProcessType process_type; 9 | private SimMatchType sim_match_type; 10 | private float threshold; 11 | 12 | public Similar(ProcessType process_type, SimMatchType sim_match_type, float threshold) { 13 | this.process_type = process_type; 14 | this.sim_match_type = sim_match_type; 15 | this.threshold = threshold; 16 | } 17 | 18 | public ProcessType getProcessType() { 19 | return process_type; 20 | } 21 | 22 | public SimMatchType getSimMatchType() { 23 | return sim_match_type; 24 | } 25 | 26 | public float getThreshold() { 27 | return threshold; 28 | } 29 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/Simple.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | import com.alibaba.fastjson.PropertyNamingStrategy; 4 | import com.alibaba.fastjson.annotation.JSONType; 5 | 6 | @JSONType(naming = PropertyNamingStrategy.SnakeCase) 7 | public class Simple { 8 | private ProcessType process_type; 9 | 10 | public Simple(ProcessType process_type) { 11 | this.process_type = process_type; 12 | } 13 | 14 | public ProcessType getProcessType() { 15 | return process_type; 16 | } 17 | } -------------------------------------------------------------------------------- /matcher_java/src/main/java/com/matcher_java/extension_types/SimpleResult.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java.extension_types; 2 | 3 | public class SimpleResult { 4 | private int word_id; 5 | private String word; 6 | 7 | public SimpleResult(int word_id, String word) { 8 | this.word_id = word_id; 9 | this.word = word; 10 | } 11 | 12 | public int getWordId() { 13 | return word_id; 14 | } 15 | 16 | public void setWordId(int word_id) { 17 | this.word_id = word_id; 18 | } 19 | 20 | public String getWord() { 21 | return word; 22 | } 23 | 24 | public void setWord(String word) { 25 | this.word = word; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /matcher_java/src/test/java/com/matcher_java/MatcherJavaExample.java: -------------------------------------------------------------------------------- 1 | package com.matcher_java; 2 | 3 | import com.alibaba.fastjson.JSON; 4 | import com.alibaba.fastjson.serializer.SerializeConfig; 5 | import com.matcher_java.extension_types.MatchTable; 6 | import com.matcher_java.extension_types.MatchTableType; 7 | import com.matcher_java.extension_types.ProcessType; 8 | import com.matcher_java.extension_types.ProcessTypeSerializer; 9 | import com.sun.jna.Pointer; 10 | 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.List; 15 | import java.util.Map; 16 | 17 | public class MatcherJavaExample { 18 | 
public static void main(String[] args) throws IOException { 19 | System.out.println("Simple Matcher Test"); 20 | simple_matcher_process_demo(); 21 | 22 | System.out.println("\n"); 23 | 24 | System.out.println("Matcher Test"); 25 | matcher_process_demo(); 26 | } 27 | 28 | public static void simple_matcher_process_demo() throws IOException { 29 | SerializeConfig serializeConfig = new SerializeConfig(); 30 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 31 | 32 | Map<ProcessType, Map<String, String>> simpleTable = new HashMap<>(); 33 | Map<String, String> wordMap = new HashMap<>(); 34 | wordMap.put("1", "hello&world"); 35 | simpleTable.put(ProcessType.MatchNone, wordMap); 36 | 37 | String simpleTableStr = JSON.toJSONString(simpleTable, serializeConfig); 38 | System.out.printf("simple_table: %s\n", simpleTableStr); 39 | 40 | byte[] simpleTableBytes = JSON.toJSONBytes(simpleTable, serializeConfig); 41 | 42 | MatcherJava instance = MatcherJava.INSTANCE; 43 | 44 | Pointer simpleMatcher = instance.init_simple_matcher(simpleTableBytes); 45 | 46 | byte[] strBytes = "hello,world".getBytes("utf-8"); 47 | byte[] cStrBytes = new byte[strBytes.length + 1]; 48 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 49 | 50 | boolean isMatch = instance.simple_matcher_is_match(simpleMatcher, cStrBytes); 51 | System.out.printf("isMatch: %s\n", isMatch); 52 | 53 | Pointer matchResPtr = instance.simple_matcher_process_as_string(simpleMatcher, cStrBytes); 54 | String matchRes = matchResPtr.getString(0, "utf-8"); 55 | System.out.printf("matchRes: %s\n", matchRes); 56 | instance.drop_string(matchResPtr); 57 | 58 | instance.drop_simple_matcher(simpleMatcher); 59 | } 60 | 61 | public static void matcher_process_demo() throws IOException { 62 | SerializeConfig serializeConfig = new SerializeConfig(); 63 | serializeConfig.put(ProcessType.class, new ProcessTypeSerializer()); 64 | 65 | Map<String, List<MatchTable>> matchTableMap = new HashMap<>(); 66 | List<MatchTable> matchTableList = new ArrayList<>(); 67 | MatchTable matchTable = new MatchTable(1, MatchTableType.Simple(ProcessType.MatchNone), List.of("hello&world"), ProcessType.MatchNone, List.of()); 68 | matchTableList.add(matchTable); 69 | matchTableMap.put("1", matchTableList); 70 | 71 | String matchTableMapStr = JSON.toJSONString(matchTableMap, serializeConfig); 72 | System.out.printf("match_table_map: %s\n", matchTableMapStr); 73 | 74 | byte[] matchTableMapBytes = JSON.toJSONBytes(matchTableMap, serializeConfig); 75 | 76 | MatcherJava instance = MatcherJava.INSTANCE; 77 | 78 | Pointer matcher = instance.init_matcher(matchTableMapBytes); 79 | 80 | byte[] strBytes = "hello,world".getBytes("utf-8"); 81 | byte[] cStrBytes = new byte[strBytes.length + 1]; 82 | System.arraycopy(strBytes, 0, cStrBytes, 0, strBytes.length); 83 | 84 | boolean isMatch = instance.matcher_is_match(matcher, cStrBytes); 85 | System.out.printf("isMatch: %s\n", isMatch); 86 | 87 | Pointer matchResPtr1 = instance.matcher_process_as_string(matcher, cStrBytes); 88 | String matchRes1 = matchResPtr1.getString(0, "utf-8"); 89 | System.out.printf("matchRes: %s\n", matchRes1); 90 | instance.drop_string(matchResPtr1); 91 | 92 | Pointer matchResPtr2 = instance.matcher_word_match_as_string(matcher, cStrBytes); 93 | String matchRes2 = matchResPtr2.getString(0, "utf-8"); 94 | System.out.printf("matchRes: %s\n", matchRes2); 95 | instance.drop_string(matchResPtr2); 96 | 97 | instance.drop_matcher(matcher); 98 | } 99 | } -------------------------------------------------------------------------------- /matcher_py/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "matcher_py" 3 | authors.workspace = true 4 | categories.workspace = true 5 | description.workspace = true 6 | edition.workspace = true 7 | homepage.workspace = true 8 | keywords.workspace = true 9 | license.workspace = true 10 | repository.workspace = true 11 | version.workspace = true 12 | readme = "README.md" 13 | documentation = "https://docs.rs/crate/matcher_py/latest" 14 | build = "build.rs" 15 | 16 | [lib] 17 | name = "matcher_py" 18 | crate-type = ["cdylib"] 19 | 20 | [dependencies] 21 | matcher_rs = { path = "../matcher_rs", version = "0.5.7" } 22 | pyo3 = { version = "0.25.0", features = ["extension-module"] } 23 | sonic-rs = "0.5.1" 24 | 25 | [build-dependencies] 26 | pyo3-build-config = "0.25.0" 27 | -------------------------------------------------------------------------------- /matcher_py/README.md: -------------------------------------------------------------------------------- 1 | # Matcher Rust Implementation with PyO3 Binding 2 | 3 | A high-performance matcher designed to solve **LOGICAL** and **TEXT VARIATIONS** problems in word matching, implemented in Rust. 4 | 5 | For detailed implementation, see the [Design Document](../DESIGN.md). 6 | 7 | ## Features 8 | 9 | - **Multiple Matching Methods**: 10 | - Simple Word Matching 11 | - Regex-Based Matching 12 | - Similarity-Based Matching 13 | - **Text Normalization**: 14 | - **Fanjian**: Simplify traditional Chinese characters to simplified ones. 15 | Example: `蟲艸` -> `虫艹` 16 | - **Delete**: Remove specific characters. 17 | Example: `*Fu&*iii&^%%*&kkkk` -> `Fuiiikkkk` 18 | - **Normalize**: Normalize special characters to identifiable characters. 19 | Example: `𝜢𝕰𝕃𝙻𝝧 𝙒ⓞᵣℒ𝒟!` -> `hello world!` 20 | - **PinYin**: Convert Chinese characters to Pinyin for fuzzy matching. 21 | Example: `西安` -> ` xi an `, matches `洗按` -> ` xi an `, but not `先` -> ` xian ` 22 | - **PinYinChar**: Convert Chinese characters to Pinyin. 23 | Example: `西安` -> `xian`, matches `洗按` and `先` -> `xian` 24 | - **AND OR NOT Word Matching**: 25 | - Takes into account the number of repetitions of words. 26 | - Example: `hello&world` matches `hello world` and `world,hello` 27 | - Example: `无&法&无&天` matches `无无法天` (because `无` is repeated twice), but not `无法天` 28 | - Example: `hello~helloo~hhello` matches `hello` but not `helloo` and `hhello` 29 | - **Customizable Exemption Lists**: Exclude specific words from matching. 30 | - **Efficient Handling of Large Word Lists**: Optimized for performance. 31 | 32 | ## Installation 33 | 34 | ### Use pip 35 | 36 | ```shell 37 | pip install matcher_py 38 | ``` 39 | 40 | ### Install pre-built binary 41 | 42 | Visit the [release page](https://github.com/Lips7/Matcher/releases) to download the pre-built binary. 43 | 44 | ## Usage 45 | 46 | All relevant types are defined in [extension_types.py](./python/matcher_py/extension_types.py). 47 | 48 | ### Explanation of the configuration 49 | 50 | * `Matcher`'s configuration is defined by the `MatchTableMap = Dict[int, List[MatchTable]]` type, the key of `MatchTableMap` is called `match_id`, **for each `match_id`, the `table_id` inside is required to be unique**. 51 | * `SimpleMatcher`'s configuration is defined by the `SimpleTable = Dict[ProcessType, Dict[int, str]]` type, the value `Dict[int, str]`'s key is called `word_id`, **`word_id` is required to be globally unique**. 52 | 53 | #### MatchTable 54 | 55 | * `table_id`: The unique ID of the match table. 
56 | * `match_table_type`: The type of the match table. 57 | * `word_list`: The word list of the match table. 58 | * `exemption_process_type`: The process type used for exemption word matching. 59 | * `exemption_word_list`: The exemption word list of the match table. 60 | 61 | For each match table, word matching is performed over the `word_list`, and exemption word matching is performed over the `exemption_word_list`. If the exemption word matching result is True, the word matching result will be False. 62 | 63 | #### MatchTableType 64 | 65 | * `Simple`: Supports simple multi-pattern matching with text normalization defined by `process_type`. 66 | * It can handle combination patterns and repetition-sensitive matching, delimited by `&` and `~`; for example, `hello&world&hello` will match `hellohelloworld` and `worldhellohello`, but not `helloworld`, because `hello` must appear twice. 67 | * `Regex`: Supports regex-based pattern matching. 68 | * `SimilarChar`: Supports similar character matching using regex. 69 | * `["hello,hallo,hollo,hi", "word,world,wrd,🌍", "!,?,~"]` will match `helloworld!`, `hollowrd?`, `hi🌍~` ··· any combinations of the words split by `,` in the list. 70 | * `Acrostic`: Supports acrostic matching using regex **(currently only supports Chinese and simple English sentences)**. 71 | * `["h,e,l,l,o", "你,好"]` will match `hope, endures, love, lasts, onward.` and `你的笑容温暖, 好心情常伴。`. 72 | * `Regex`: Supports regex matching. 73 | * `["h[aeiou]llo", "w[aeiou]rd"]` will match `hello`, `world`, `hillo`, `wurld` ··· any text that matches the regex in the list. 74 | * `Similar`: Supports similar text matching based on distance and threshold. 75 | * `Levenshtein`: Supports similar text matching based on Levenshtein distance. 76 | 77 | #### ProcessType 78 | 79 | * `None`: No transformation. 80 | * `Fanjian`: Traditional Chinese to simplified Chinese transformation. Based on [FANJIAN](../matcher_rs/process_map/FANJIAN.txt). 81 | * `妳好` -> `你好` 82 | * `現⾝` -> `现身` 83 | * `Delete`: Delete all punctuation, special characters and white spaces. Based on [TEXT_DELETE](../matcher_rs/process_map/TEXT-DELETE.txt) and `WHITE_SPACE`. 84 | * `hello, world!` -> `helloworld` 85 | * `《你∷好》` -> `你好` 86 | * `Normalize`: Normalize all English character variations and number variations to basic characters. Based on [NORM](../matcher_rs/process_map/NORM.txt) and [NUM_NORM](../matcher_rs/process_map/NUM-NORM.txt). 87 | * `ℋЀ⒈㈠Õ` -> `he11o` 88 | * `⒈Ƨ㊂` -> `123` 89 | * `PinYin`: Convert all unicode Chinese characters to pinyin with boundaries. Based on [PINYIN](../matcher_rs/process_map/PINYIN.txt). 90 | * `你好` -> ` ni hao ` 91 | * `西安` -> ` xi an ` 92 | * `PinYinChar`: Convert all unicode Chinese characters to pinyin without boundaries. Based on [PINYIN](../matcher_rs/process_map/PINYIN.txt). 93 | * `你好` -> `nihao` 94 | * `西安` -> `xian` 95 | 96 | You can combine these transformations as needed, as sketched below. Pre-defined combinations like `DeleteNormalize` and `FanjianDeleteNormalize` are provided for convenience. 97 | 98 | Avoid combining `PinYin` and `PinYinChar`, because `PinYin` is a more limited version of `PinYinChar`: in some cases, such as `xian`, the text can be treated either as the two words `xi` and `an`, or as the single word `xian`.
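For instance, flags can be combined with the bitwise OR operator. A minimal sketch (the same combined flag appears in the C binding example in this repository; `SimpleMatcher` and its JSON-bytes constructor are shown later in this README):

```python
import json

from matcher_py import SimpleMatcher
from matcher_py.extension_types import ProcessType

# Combine transformations with bitwise OR; MatchFanjianDeleteNormalize is itself
# a pre-defined combination of MatchFanjian, MatchDelete and MatchNormalize.
process_type = ProcessType.MatchFanjianDeleteNormalize | ProcessType.MatchPinYinChar

simple_matcher = SimpleMatcher(
    json.dumps({process_type: {1: "妳好&世界"}}).encode()
)

# `妳` is simplified to `你`, punctuation is deleted, and both sides are reduced
# to pinyin characters, so the variant text below still matches.
assert simple_matcher.is_match("你好,世界!")
```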
99 | 100 | ### Text Process Usage 101 | 102 | Here’s an example of how to use the `reduce_text_process` and `text_process` functions: 103 | 104 | ```python 105 | from matcher_py import reduce_text_process, text_process 106 | from matcher_py.extension_types import ProcessType 107 | 108 | print(reduce_text_process(ProcessType.MatchDeleteNormalize, "hello, world!")) 109 | print(text_process(ProcessType.MatchDelete, "hello, world!")) 110 | ``` 111 | 112 | ### Matcher Basic Usage 113 | 114 | Here’s an example of how to use the `Matcher`: 115 | 116 | ```python 117 | import json 118 | 119 | from matcher_py import Matcher 120 | from matcher_py.extension_types import MatchTable, MatchTableType, ProcessType, RegexMatchType, SimMatchType 121 | 122 | matcher = Matcher( 123 | json.dumps({ 124 | 1: [ 125 | MatchTable( 126 | table_id=1, 127 | match_table_type=MatchTableType.Simple(process_type = ProcessType.MatchFanjianDeleteNormalize), 128 | word_list=["hello", "world"], 129 | exemption_process_type=ProcessType.MatchNone, 130 | exemption_word_list=["word"], 131 | ), 132 | MatchTable( 133 | table_id=2, 134 | match_table_type=MatchTableType.Regex( 135 | process_type = ProcessType.MatchFanjianDeleteNormalize, 136 | regex_match_type=RegexMatchType.Regex 137 | ), 138 | word_list=["h[aeiou]llo"], 139 | exemption_process_type=ProcessType.MatchNone, 140 | exemption_word_list=[], 141 | ) 142 | ], 143 | 2: [ 144 | MatchTable( 145 | table_id=3, 146 | match_table_type=MatchTableType.Similar( 147 | process_type = ProcessType.MatchFanjianDeleteNormalize, 148 | sim_match_type=SimMatchType.MatchLevenshtein, 149 | threshold=0.5 150 | ), 151 | word_list=["halxo"], 152 | exemption_process_type=ProcessType.MatchNone, 153 | exemption_word_list=[], 154 | ) 155 | ] 156 | }).encode() 157 | ) 158 | # Check if a text matches 159 | assert matcher.is_match("hello") 160 | assert not matcher.is_match("word") 161 | # Perform process as a list 162 | result = matcher.process("hello") 163 | assert result == [{'match_id': 1, 164 | 'table_id': 2, 165 | 'word_id': 0, 166 | 'word': 'h[aeiou]llo', 167 | 'similarity': 1.0}, 168 | {'match_id': 1, 169 | 'table_id': 1, 170 | 'word_id': 0, 171 | 'word': 'hello', 172 | 'similarity': 1.0}, 173 | {'match_id': 2, 174 | 'table_id': 3, 175 | 'word_id': 0, 176 | 'word': 'halxo', 177 | 'similarity': 0.6}] 178 | # Perform word matching as a dict 179 | assert matcher.word_match(r"hello, world")[1] == [{'match_id': 1, 180 | 'table_id': 2, 181 | 'word_id': 0, 182 | 'word': 'h[aeiou]llo', 183 | 'similarity': 1.0}, 184 | {'match_id': 1, 185 | 'table_id': 1, 186 | 'word_id': 0, 187 | 'word': 'hello', 188 | 'similarity': 1.0}, 189 | {'match_id': 1, 190 | 'table_id': 1, 191 | 'word_id': 1, 192 | 'word': 'world', 193 | 'similarity': 1.0}] 194 | # Perform word matching as a string 195 | result = matcher.word_match_as_string("hello") 196 | assert result == """{"2":[{"match_id":2,"table_id":3,"word_id":0,"word":"halxo","similarity":0.6}],"1":[{"match_id":1,"table_id":2,"word_id":0,"word":"h[aeiou]llo","similarity":1.0},{"match_id":1,"table_id":1,"word_id":0,"word":"hello","similarity":1.0}]}""" 197 | ``` 198 | 199 | ### Simple Matcher Basic Usage 200 | 201 | Here’s an example of how to use the `SimpleMatcher`: 202 | 203 | ```python 204 | import json 205 | 206 | from matcher_py import SimpleMatcher 207 | from matcher_py.extension_types import ProcessType 208 | 209 | simple_matcher = SimpleMatcher( 210 | json.dumps( 211 | { 212 | ProcessType.MatchNone: { 213 | 1: "hello&world", 214 | 2: "word&word~hello" 215 | }, 216 | 
203 | ### Simple Matcher Basic Usage
204 | 
205 | Here’s an example of how to use the `SimpleMatcher`:
206 | 
207 | ```python
208 | import json
209 | 
210 | from matcher_py import SimpleMatcher
211 | from matcher_py.extension_types import ProcessType
212 | 
213 | simple_matcher = SimpleMatcher(
214 |     json.dumps(
215 |         {
216 |             ProcessType.MatchNone: {
217 |                 1: "hello&world",
218 |                 2: "word&word~hello"
219 |             },
220 |             ProcessType.MatchDelete: {
221 |                 3: "hallo"
222 |             }
223 |         }
224 |     ).encode()
225 | )
226 | # Check if a text matches
227 | assert simple_matcher.is_match("hello^&!#*#&!^#*()world")
228 | # Perform simple processing
229 | result = simple_matcher.process("hello,world,word,word,hallo")
230 | assert result == [{'word_id': 1, 'word': 'hello&world'}, {'word_id': 3, 'word': 'hallo'}]
231 | ```
232 | 
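233 | ### Pickling
234 | 
235 | Both `Matcher` and `SimpleMatcher` expose `__getstate__`, `__setstate__` and `__getnewargs__` (see `matcher_py.pyi`), so instances should round-trip through the standard `pickle` module. A minimal sketch, reusing the `simple_matcher` built above:
236 | 
237 | ```python
238 | import pickle
239 | 
240 | simple_matcher_copy = pickle.loads(pickle.dumps(simple_matcher))
241 | assert simple_matcher_copy.is_match("hello^&!#*#&!^#*()world")
242 | ```
243 | 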
| "SimMatchType", 13 | ] 14 | -------------------------------------------------------------------------------- /matcher_py/python/matcher_py/extension_types.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, IntFlag 2 | from typing import Dict, List, TypedDict, Union 3 | 4 | 5 | class ProcessType(IntFlag): 6 | """ 7 | An enumeration representing various types of text processing operations. 8 | 9 | Attributes: 10 | MatchNone (IntFlag): An operation that performs no matching (binary 00000001). 11 | MatchFanjian (IntFlag): An operation that matches traditional and simplified Chinese characters (binary 00000010). 12 | MatchDelete (IntFlag): An operation that matches deleted characters (binary 00000100). 13 | MatchNormalize (IntFlag): An operation that normalizes characters (binary 00001000). 14 | MatchDeleteNormalize (IntFlag): A combined operation that deletes and normalizes characters (binary 00001100). 15 | MatchFanjianDeleteNormalize (IntFlag): A combined operation that matches traditional and simplified Chinese characters, 16 | deletes, and normalizes (binary 00001110). 17 | MatchPinYin (IntFlag): An operation that matches Pinyin representations of Chinese characters (binary 00010000). 18 | MatchPinYinChar (IntFlag): An operation that matches individual characters in the Pinyin representation (binary 00100000). 19 | """ 20 | 21 | MatchNone = 0b00000001 22 | MatchFanjian = 0b00000010 23 | MatchDelete = 0b00000100 24 | MatchNormalize = 0b00001000 25 | MatchDeleteNormalize = 0b00001100 26 | MatchFanjianDeleteNormalize = 0b00001110 27 | MatchPinYin = 0b00010000 28 | MatchPinYinChar = 0b00100000 29 | 30 | 31 | class RegexMatchType(str, Enum): 32 | """ 33 | An enumeration representing various types of regex matching operations. 34 | 35 | Attributes: 36 | MatchSimilarChar (str): An operation that matches characters that are similar in some way. 37 | MatchAcrostic (str): An operation that matches acrostic patterns. 38 | MatchRegex (str): An operation that matches using standard regular expressions. 39 | """ 40 | 41 | MatchSimilarChar = "similar_char" 42 | MatchAcrostic = "acrostic" 43 | MatchRegex = "regex" 44 | 45 | 46 | class SimMatchType(str, Enum): 47 | """ 48 | An enumeration representing various types of similarity matching operations. 49 | 50 | Attributes: 51 | MatchLevenshtein (str): An operation that matches using the Levenshtein distance metric. 52 | """ 53 | 54 | MatchLevenshtein = "levenshtein" 55 | 56 | 57 | class Simple(TypedDict): 58 | """ 59 | A TypedDict representing a simple text processing operation. 60 | 61 | Attributes: 62 | process_type (ProcessType): The type of processing operation to be performed. 63 | """ 64 | 65 | process_type: ProcessType 66 | 67 | 68 | class Regex(TypedDict): 69 | """ 70 | A TypedDict representing a regex-based text processing operation. 71 | 72 | Attributes: 73 | process_type (ProcessType): The type of processing operation to be performed. 74 | regex_match_type (RegexMatchType): The type of regex matching operation to be used. 75 | """ 76 | 77 | process_type: ProcessType 78 | regex_match_type: RegexMatchType 79 | 80 | 81 | class Similar(TypedDict): 82 | """ 83 | A TypedDict representing a similarity-based text processing operation. 84 | 85 | Attributes: 86 | process_type (ProcessType): The type of processing operation to be performed. 87 | sim_match_type (SimMatchType): The type of similarity matching operation to be used. 
102 | class MatchTableType:
103 |     @staticmethod
104 |     def Simple(process_type: ProcessType) -> Dict[str, Simple]:
105 |         """
106 |         Create a dictionary representing a simple text processing operation.
107 | 
108 |         Args:
109 |             process_type (ProcessType): The type of processing operation to be performed.
110 | 
111 |         Returns:
112 |             Dict[str, Simple]: A dictionary with one key "simple" mapping to a Simple TypedDict
113 |                 containing the provided process_type.
114 |         """
115 |         return {"simple": Simple(process_type=process_type)}
116 | 
117 |     @staticmethod
118 |     def Regex(
119 |         process_type: ProcessType, regex_match_type: RegexMatchType
120 |     ) -> Dict[str, Regex]:
121 |         """
122 |         Create a dictionary representing a regex-based text processing operation.
123 | 
124 |         Args:
125 |             process_type (ProcessType): The type of processing operation to be performed.
126 |             regex_match_type (RegexMatchType): The type of regex matching operation to be used.
127 | 
128 |         Returns:
129 |             Dict[str, Regex]: A dictionary with one key "regex" mapping to a Regex TypedDict
130 |                 containing the provided process_type and regex_match_type.
131 |         """
132 |         return {
133 |             "regex": Regex(process_type=process_type, regex_match_type=regex_match_type)
134 |         }
135 | 
136 |     @staticmethod
137 |     def Similar(
138 |         process_type: ProcessType, sim_match_type: SimMatchType, threshold: float
139 |     ) -> Dict[str, Similar]:
140 |         """
141 |         Create a dictionary representing a similarity-based text processing operation.
142 | 
143 |         Args:
144 |             process_type (ProcessType): The type of processing operation to be performed.
145 |             sim_match_type (SimMatchType): The type of similarity matching operation to be used.
146 |             threshold (float): The threshold value for the similarity matching operation.
147 | 
148 |         Returns:
149 |             Dict[str, Similar]: A dictionary with one key "similar" mapping to a Similar TypedDict
150 |                 containing the provided process_type, sim_match_type, and threshold.
151 |         """
152 |         return {
153 |             "similar": Similar(
154 |                 process_type=process_type,
155 |                 sim_match_type=sim_match_type,
156 |                 threshold=threshold,
157 |             )
158 |         }
159 | 
160 | 
161 | class MatchTable(TypedDict):
162 |     """
163 |     A TypedDict representing a table for matching operations.
164 | 
165 |     Attributes:
166 |         table_id (int): A unique identifier for the match table.
167 |         match_table_type (Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]]):
168 |             A dictionary that specifies the type of match operation to be performed. The key is a string indicating
169 |             the match type ('simple', 'regex', 'similar'), and the value is a corresponding TypedDict describing
170 |             the operation.
171 |         word_list (List[str]): A list of words that are subject to the matching operations.
172 |         exemption_process_type (ProcessType): The type of process for which certain words are exempt from matching.
173 |         exemption_word_list (List[str]): A list of words that are exempt from the matching process.
174 |     """
175 | 
176 |     table_id: int
177 |     match_table_type: Union[Dict[str, Simple], Dict[str, Regex], Dict[str, Similar]]
178 |     word_list: List[str]
179 |     exemption_process_type: ProcessType
180 |     exemption_word_list: List[str]
181 | 
182 | 
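183 | # Note (illustrative, not part of the original source): the integer keys of a
184 | # MatchTableMap are the match_id values reported back in each MatchResult, as in
185 | # the README example where tables registered under keys 1 and 2 produce results
186 | # with match_id 1 and 2 respectively.
187 | 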
188 | MatchTableMap = Dict[int, List[MatchTable]]
189 | """
190 | A type alias for mapping match identifiers to lists of MatchTable objects.
191 | 
192 | Type:
193 |     Dict[int, List[MatchTable]]
194 | 
195 | This dictionary maps an integer match ID to the list of MatchTable objects grouped under it. It is used to
196 | organize and retrieve match tables based on the match identifier reported in each MatchResult.
197 | """
198 | 
199 | 
200 | class MatchResult(TypedDict):
201 |     """
202 |     A TypedDict representing the result of a matching operation.
203 | 
204 |     Attributes:
205 |         match_id (int): The identifier of the match group (the MatchTableMap key) that produced this result.
206 |         table_id (int): The identifier of the match table where the matching operation was performed.
207 |         word_id (int): The identifier of the matched word within the word list.
208 |         word (str): The matched word.
209 |         similarity (float): The similarity score of the match operation.
210 |     """
211 | 
212 |     match_id: int
213 |     table_id: int
214 |     word_id: int
215 |     word: str
216 |     similarity: float
217 | 
218 | 
219 | SimpleTable = Dict[ProcessType, Dict[int, str]]
220 | """
221 | A type alias for representing a simple table structure for text processing.
222 | 
223 | This dictionary maps a `ProcessType` to another dictionary that maps an integer ID to a string.
224 | The outer dictionary's keys represent different types of processing operations, while the inner
225 | dictionary's keys represent unique identifiers corresponding to specific strings related to the
226 | operations.
227 | 
228 | Type:
229 |     Dict[ProcessType, Dict[int, str]]
230 | """
231 | 
232 | 
233 | class SimpleResult(TypedDict):
234 |     """
235 |     A TypedDict representing a simplified result of a text processing operation.
236 | 
237 |     Attributes:
238 |         word_id (int): The identifier of the word within the word list.
239 |         word (str): The word corresponding to the word_id.
240 |     """
241 | 
242 |     word_id: int
243 |     word: str
244 | 
-------------------------------------------------------------------------------- /matcher_py/python/matcher_py/matcher_py.pyi: --------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple
2 | from .extension_types import SimpleResult, MatchResult
3 | 
4 | def text_process(process_type: int, text: str) -> str:
5 |     """
6 |     Processes the given text based on the specified process type.
7 | 
8 |     Parameters:
9 |     - process_type (int): An integer indicating the type of process to be applied to the text.
10 |     - text (str): The text string that is to be processed.
11 | 
12 |     Returns:
13 |     - str: The text string after processing.
14 |     """
15 |     ...
16 | 
17 | def reduce_text_process(process_type: int, text: str) -> List[str]:
18 |     """
19 |     Reduces the given text based on the specified process type and returns a list of strings.
20 | 
21 |     Parameters:
22 |     - process_type (int): An integer indicating the type of process to be applied to the text.
23 |     - text (str): The text string that is to be reduced.
24 | 
25 |     Returns:
26 |     - List[str]: A list of strings after the reduction process.
27 |     """
28 |     ...
29 | 
30 | class Matcher:
31 |     """
32 |     A class used to perform various matching operations using a given set of match table map bytes.
33 | 
34 |     Methods:
35 |     - __init__(self, match_table_map_bytes: bytes) -> None:
36 |         Initializes the Matcher with the provided match table map bytes.
37 |     - __getnewargs__(self) -> Tuple[bytes]:
38 |         Returns the constructor arguments, as a one-element tuple, needed to create a new instance of the Matcher.
39 |     - __getstate__(self) -> bytes:
40 |         Returns the state of the Matcher, typically used for pickling.
41 |     - __setstate__(self, match_table_map_bytes: bytes):
42 |         Sets the state of the Matcher from the provided match table map bytes, typically used for unpickling.
43 |     - is_match(self, text: str) -> bool:
44 |         Checks whether the given text matches any patterns in the match table map.
45 |     - process(self, text: str) -> List[MatchResult]:
46 |         Processes the given text and returns a list of MatchResult objects corresponding to the matches found.
47 |     - word_match(self, text: str) -> Dict[int, List[MatchResult]]:
48 |         Performs a word-level match on the given text and returns a dictionary where the keys are match IDs and the values are lists of MatchResult objects.
49 |     - word_match_as_string(self, text: str) -> str:
50 |         Performs a word-level match on the given text and returns a string representation of the matches found.
51 |     """
52 |     def __init__(self, match_table_map_bytes: bytes) -> None: ...
53 |     def __getnewargs__(self) -> Tuple[bytes]: ...
54 |     def __getstate__(self) -> bytes: ...
55 |     def __setstate__(self, match_table_map_bytes: bytes): ...
56 |     def is_match(self, text: str) -> bool: ...
57 |     def process(self, text: str) -> List[MatchResult]: ...
58 |     def word_match(self, text: str) -> Dict[int, List[MatchResult]]: ...
59 |     def word_match_as_string(self, text: str) -> str: ...
60 | 
61 | class SimpleMatcher:
62 |     """
63 |     A class used to perform simplified matching operations using a provided set of simple table bytes.
64 | 
65 |     Methods:
66 |     - __init__(self, simple_table_bytes: bytes) -> None:
67 |         Initializes the SimpleMatcher with the provided simple table bytes.
68 |     - __getnewargs__(self) -> Tuple[bytes]:
69 |         Returns the constructor arguments, as a one-element tuple, needed to create a new instance of the SimpleMatcher.
70 |     - __getstate__(self) -> bytes:
71 |         Returns the state of the SimpleMatcher, typically used for pickling.
72 |     - __setstate__(self, simple_table_bytes: bytes):
73 |         Sets the state of the SimpleMatcher from the provided simple table bytes, typically used for unpickling.
74 |     - is_match(self, text: str) -> bool:
75 |         Checks whether the given text matches any patterns in the simple table.
76 |     - process(self, text: str) -> List[SimpleResult]:
77 |         Processes the given text and returns a list of SimpleResult objects corresponding to the matches found.
78 |     """
79 |     def __init__(self, simple_table_bytes: bytes) -> None: ...
80 |     def __getnewargs__(self) -> Tuple[bytes]: ...
81 |     def __getstate__(self) -> bytes: ...
82 |     def __setstate__(self, simple_table_bytes: bytes): ...
83 |     def is_match(self, text: str) -> bool: ...
84 |     def process(self, text: str) -> List[SimpleResult]: ...
85 | 
-------------------------------------------------------------------------------- /matcher_py/python/matcher_py/py.typed: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lips7/Matcher/1371557dcf89de31003afedf2d85de5db87faa8d/matcher_py/python/matcher_py/py.typed
-------------------------------------------------------------------------------- /matcher_py/test/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lips7/Matcher/1371557dcf89de31003afedf2d85de5db87faa8d/matcher_py/test/__init__.py
-------------------------------------------------------------------------------- /matcher_py/test/test_matcher.py: --------------------------------------------------------------------------------
1 | import json
2 | import pytest
3 | 
4 | from matcher_py.matcher_py import Matcher
5 | from matcher_py.extension_types import (
6 |     ProcessType,
7 |     MatchTable,
8 |     MatchTableType,
9 |     RegexMatchType,
10 |     SimMatchType,
11 | )
12 | 
13 | 
14 | def test_init_with_non_bytes():
15 |     # Each payload must raise on its own; a single `with` block would only
16 |     # exercise the first call.
17 |     for non_bytes in (1, "", [], {}):
18 |         with pytest.raises(TypeError):
19 |             Matcher(non_bytes)
20 | 
21 | 
22 | def test_init_with_invalid_bytes():
23 |     # Every payload below should raise ValueError on its own. b"{}" is
24 |     # deliberately absent: an empty top-level map is valid input, as
25 |     # test_init_with_empty_map below verifies.
26 |     for invalid_bytes in (b"", b"123", b"invalid", b"[]"):
27 |         with pytest.raises(ValueError):
28 |             Matcher(invalid_bytes)
29 | 
30 | 
31 | def test_init_with_empty_map():
32 |     Matcher(json.dumps({}).encode())
33 |     Matcher(json.dumps({1: []}).encode())
34 |     Matcher(
35 |         json.dumps(
36 |             {
37 |                 1: [
38 |                     MatchTable(
39 |                         table_id=1,
40 |                         match_table_type=MatchTableType.Simple(
41 |                             process_type=ProcessType.MatchNone
42 |                         ),
43 |                         word_list=[],
44 |                         exemption_process_type=ProcessType.MatchNone,
45 |                         exemption_word_list=[],
46 |                     )
47 |                 ]
48 |             }
49 |         ).encode()
50 |     )
51 | 
52 | 
53 | def test_init_with_invalid_map():
54 |     # Non-integer keys cannot be deserialized into a MatchTableMap.
55 |     for invalid_map in ({"a": 1}, {"a": {"b": 1}}, {"c": {}}):
56 |         with pytest.raises(ValueError):
57 |             Matcher(json.dumps(invalid_map).encode())
58 | 
59 | 
60 | def test_regex():
61 |     matcher = Matcher(
62 |         json.dumps(
63 |             {
64 |                 1: [
65 |                     MatchTable(
66 |                         table_id=1,
67 |                         match_table_type=MatchTableType.Regex(
68 |                             process_type=ProcessType.MatchNone,
69 |                             regex_match_type=RegexMatchType.MatchRegex,
70 |                         ),
71 |                         word_list=["h[aeiou]llo", "w[aeiou]rd"],
72 |                         exemption_process_type=ProcessType.MatchNone,
73 |                         exemption_word_list=[],
74 |                     )
75 |                 ]
76 |             }
77 |         ).encode()
78 |     )
79 |     assert matcher.is_match("hallo")
80 |     assert matcher.is_match("ward")
81 |     assert matcher.word_match("hallo")[1][0]["table_id"] == 1
82 |     assert matcher.word_match("hallo")[1][0]["word"] == "h[aeiou]llo"
83 | 
84 | 
85 | def test_similar_char():
86 |     matcher = Matcher(
87 |         json.dumps(
88 |             {
89 |                 1: [
90 |                     MatchTable(
91 |                         table_id=1,
92 |                         match_table_type=MatchTableType.Regex(
93 |                             process_type=ProcessType.MatchNone,
94 |                             regex_match_type=RegexMatchType.MatchSimilarChar,
95 |                         ),
96 |                         word_list=["hello,hi,H,你好", "world,word,🌍,世界"],
97 |                         exemption_process_type=ProcessType.MatchNone,
98 |                         exemption_word_list=[],
99 |                     )
100 |                 ]
101 |             }
102 |         ).encode()
103 |     )
104 |     assert matcher.is_match("helloworld")
105 |     assert matcher.is_match("hi世界")
106 |     assert matcher.word_match("helloworld")[1][0]["table_id"] == 1
107 |     assert matcher.word_match("helloworld")[1][0]["word"] == "helloworld"
108 | 
109 | 
110 | def test_similar_text_levenshtein():
111 |     matcher = Matcher(
112 |         json.dumps(
113 |             {
114 |                 1: [
115 |                     MatchTable(
116 |                         table_id=1,
117 |                         match_table_type=MatchTableType.Similar(
118 |                             process_type=ProcessType.MatchNone,
119 |                             sim_match_type=SimMatchType.MatchLevenshtein,
120 |                             threshold=0.8,
121 |                         ),
122 |                         word_list=["helloworld"],
123 |                         exemption_process_type=ProcessType.MatchNone,
124 |                         exemption_word_list=[],
125 |                     )
126 |                 ]
127 |             }
128 |         ).encode()
129 |     )
130 |     assert matcher.is_match("helloworl")
131 |     assert matcher.is_match("halloworld")
132 |     assert matcher.is_match("ha1loworld")
133 |     assert not matcher.is_match("ha1loworld1")
134 |     assert matcher.word_match("helloworl")[1][0]["table_id"] == 1
135 |     assert matcher.word_match("helloworl")[1][0]["word"] == "helloworld"
136 | 
137 | 
138 | def test_acrostic():
139 |     matcher = Matcher(
140 |         json.dumps(
141 |             {
142 |                 1: [
143 |                     MatchTable(
144 |                         table_id=1,
145 |                         match_table_type=MatchTableType.Regex(
146 |                             process_type=ProcessType.MatchNone,
147 |                             regex_match_type=RegexMatchType.MatchAcrostic,
148 |                         ),
149 |                         word_list=["h,e,l,l,o", "你,好"],
150 |                         exemption_process_type=ProcessType.MatchNone,
151 |                         exemption_word_list=[],
152 |                     )
153 |                 ]
154 |             }
155 |         ).encode()
156 |     )
157 |     assert matcher.is_match("hope, endures, love, lasts, onward.")
158 |     assert matcher.is_match(
159 |         "Happy moments shared, Every smile and laugh, Love in every word, Lighting up our paths, Open hearts we show."
160 |     )
161 |     assert matcher.is_match("你的笑容温暖, 好心情常伴。")
162 |     assert not matcher.is_match("你好")
163 |     assert (
164 |         matcher.word_match("hope, endures, love, lasts, onward.")[1][0]["word"]
165 |         == "h,e,l,l,o"
166 |     )
167 |     assert matcher.word_match("你的笑容温暖, 好心情常伴。")[1][0]["word"] == "你,好"
168 | 
169 | 
170 | def test_exemption():
171 |     matcher = Matcher(
172 |         json.dumps(
173 |             {
174 |                 1: [
175 |                     MatchTable(
176 |                         table_id=1,
177 |                         match_table_type=MatchTableType.Simple(
178 |                             process_type=ProcessType.MatchNone
179 |                         ),
180 |                         word_list=["helloworld"],
181 |                         exemption_process_type=ProcessType.MatchNone,
182 |                         exemption_word_list=["worldwide"],
183 |                     )
184 |                 ]
185 |             }
186 |         ).encode()
187 |     )
188 |     assert matcher.is_match("helloworld")
189 |     assert not matcher.is_match("helloworldwide")
190 | 
191 |     matcher = Matcher(
192 |         json.dumps(
193 |             {
194 |                 1: [
195 |                     MatchTable(
196 |                         table_id=1,
197 |                         match_table_type=MatchTableType.Simple(
198 |                             process_type=ProcessType.MatchNone
199 |                         ),
200 |                         word_list=["helloworld"],
201 |                         exemption_process_type=ProcessType.MatchNone,
202 |                         exemption_word_list=["worldwide"],
203 |                     ),
204 |                     MatchTable(
205 |                         table_id=2,
206 |                         match_table_type=MatchTableType.Regex(
207 |                             process_type=ProcessType.MatchNone,
208 |                             regex_match_type=RegexMatchType.MatchRegex,
209 |                         ),
210 |                         word_list=["hello"],
211 |                         exemption_process_type=ProcessType.MatchNone,
212 |                         exemption_word_list=["worldwide"],
213 |                     ),
214 |                 ]
215 |             }
216 |         ).encode()
217 |     )
218 |     assert matcher.is_match("helloworld")
219 |     assert not matcher.is_match("helloworldwide")
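220 | 
221 | 
222 | # Hypothetical extra check, not part of the original test suite: combined
223 | # process types compose the primitive transformations, so MatchDeleteNormalize
224 | # should match a word through deleted whitespace plus normalizable variants
225 | # (cf. test_delete and test_normalize in test_simple_matcher.py).
226 | def test_delete_normalize_sketch():
227 |     matcher = Matcher(
228 |         json.dumps(
229 |             {
230 |                 1: [
231 |                     MatchTable(
232 |                         table_id=1,
233 |                         match_table_type=MatchTableType.Simple(
234 |                             process_type=ProcessType.MatchDeleteNormalize
235 |                         ),
236 |                         word_list=["he11o"],
237 |                         exemption_process_type=ProcessType.MatchNone,
238 |                         exemption_word_list=[],
239 |                     )
240 |                 ]
241 |             }
242 |         ).encode()
243 |     )
244 |     assert matcher.is_match("ℋ Ѐ ⒈ ㈠ Õ")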
SimpleMatcher(b"") 19 | SimpleMatcher(b"123") 20 | SimpleMatcher(b"invalid") 21 | SimpleMatcher(b"[]") 22 | SimpleMatcher(b"{}") 23 | 24 | 25 | def test_init_with_empty_map(): 26 | SimpleMatcher(json.dumps({}).encode()) 27 | SimpleMatcher(json.dumps({1: {}}).encode()) 28 | 29 | 30 | def test_init_with_invalid_map(): 31 | with pytest.raises(ValueError): 32 | SimpleMatcher(json.dumps({"a": 1}).encode()) 33 | SimpleMatcher(json.dumps({"a": {"b": 1}}).encode()) 34 | SimpleMatcher(json.dumps({1: []}).encode()) 35 | 36 | 37 | def test_backslashes(): 38 | simple_matcher = SimpleMatcher( 39 | json.dumps({ProcessType.MatchNone: {1: r"It's /\/\y duty"}}).encode() 40 | ) 41 | assert simple_matcher.is_match(r"It's /\/\y duty") 42 | assert simple_matcher.process(r"It's /\/\y duty")[0]["word"] == r"It's /\/\y duty" 43 | 44 | 45 | def test_fanjian(): 46 | simple_matcher = SimpleMatcher( 47 | json.dumps({ProcessType.MatchFanjian: {1: "你好"}}).encode() 48 | ) 49 | assert simple_matcher.is_match("妳好") 50 | assert simple_matcher.process("你好")[0]["word_id"] == 1 51 | assert simple_matcher.process("你好")[0]["word"] == "你好" 52 | 53 | simple_matcher = SimpleMatcher( 54 | json.dumps({ProcessType.MatchFanjian: {1: "妳好"}}).encode() 55 | ) 56 | assert simple_matcher.is_match("你好") 57 | assert simple_matcher.process("你好")[0]["word_id"] == 1 58 | assert simple_matcher.process("你好")[0]["word"] == "妳好" 59 | 60 | 61 | def test_delete(): 62 | simple_matcher = SimpleMatcher( 63 | json.dumps({ProcessType.MatchDelete: {1: "你好"}}).encode() 64 | ) 65 | assert simple_matcher.is_match("你!好") 66 | assert len(simple_matcher.process("你!好")) == 1 67 | 68 | 69 | def test_normalize(): 70 | simple_matcher = SimpleMatcher( 71 | json.dumps( 72 | { 73 | ProcessType.MatchNormalize: { 74 | 1: "he11o", 75 | } 76 | } 77 | ).encode() 78 | ) 79 | assert simple_matcher.is_match("ℋЀ⒈㈠Õ") 80 | assert simple_matcher.process("ℋЀ⒈㈠Õ")[0]["word_id"] == 1 81 | assert simple_matcher.process("ℋЀ⒈㈠Õ")[0]["word"] == "he11o" 82 | 83 | 84 | def test_pinyin(): 85 | simple_matcher = SimpleMatcher( 86 | json.dumps( 87 | { 88 | ProcessType.MatchPinYin: { 89 | 1: "西安", 90 | } 91 | } 92 | ).encode() 93 | ) 94 | assert simple_matcher.is_match("洗按") 95 | assert not simple_matcher.is_match("现") 96 | 97 | 98 | def test_pinyinchar(): 99 | simple_matcher = SimpleMatcher( 100 | json.dumps( 101 | { 102 | ProcessType.MatchPinYinChar: { 103 | 1: "西安", 104 | } 105 | } 106 | ).encode() 107 | ) 108 | assert simple_matcher.is_match("洗按") 109 | assert simple_matcher.is_match("现") 110 | assert simple_matcher.is_match("xian") 111 | -------------------------------------------------------------------------------- /matcher_py/uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | revision = 1 3 | requires-python = ">=3.8" 4 | 5 | [[package]] 6 | name = "colorama" 7 | version = "0.4.6" 8 | source = { registry = "https://pypi.org/simple" } 9 | sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } 10 | wheels = [ 11 | { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, 12 | ] 13 | 14 | [[package]] 15 | name = "exceptiongroup" 16 | version = "1.2.2" 17 
| source = { registry = "https://pypi.org/simple" } 18 | sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 } 19 | wheels = [ 20 | { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 }, 21 | ] 22 | 23 | [[package]] 24 | name = "iniconfig" 25 | version = "2.1.0" 26 | source = { registry = "https://pypi.org/simple" } 27 | sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 } 28 | wheels = [ 29 | { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 }, 30 | ] 31 | 32 | [[package]] 33 | name = "matcher-py" 34 | version = "0.5.7" 35 | source = { editable = "." } 36 | 37 | [package.dev-dependencies] 38 | dev = [ 39 | { name = "pip" }, 40 | { name = "pytest" }, 41 | ] 42 | 43 | [package.metadata] 44 | 45 | [package.metadata.requires-dev] 46 | dev = [ 47 | { name = "pip" }, 48 | { name = "pytest" }, 49 | ] 50 | 51 | [[package]] 52 | name = "packaging" 53 | version = "24.2" 54 | source = { registry = "https://pypi.org/simple" } 55 | sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } 56 | wheels = [ 57 | { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, 58 | ] 59 | 60 | [[package]] 61 | name = "pip" 62 | version = "25.0.1" 63 | source = { registry = "https://pypi.org/simple" } 64 | sdist = { url = "https://files.pythonhosted.org/packages/70/53/b309b4a497b09655cb7e07088966881a57d082f48ac3cb54ea729fd2c6cf/pip-25.0.1.tar.gz", hash = "sha256:88f96547ea48b940a3a385494e181e29fb8637898f88d88737c5049780f196ea", size = 1950850 } 65 | wheels = [ 66 | { url = "https://files.pythonhosted.org/packages/c9/bc/b7db44f5f39f9d0494071bddae6880eb645970366d0a200022a1a93d57f5/pip-25.0.1-py3-none-any.whl", hash = "sha256:c46efd13b6aa8279f33f2864459c8ce587ea6a1a59ee20de055868d8f7688f7f", size = 1841526 }, 67 | ] 68 | 69 | [[package]] 70 | name = "pluggy" 71 | version = "1.5.0" 72 | source = { registry = "https://pypi.org/simple" } 73 | sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } 74 | wheels = [ 75 | { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, 76 | ] 77 | 78 | [[package]] 79 | name 
= "pytest" 80 | version = "8.3.5" 81 | source = { registry = "https://pypi.org/simple" } 82 | dependencies = [ 83 | { name = "colorama", marker = "sys_platform == 'win32'" }, 84 | { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, 85 | { name = "iniconfig" }, 86 | { name = "packaging" }, 87 | { name = "pluggy" }, 88 | { name = "tomli", marker = "python_full_version < '3.11'" }, 89 | ] 90 | sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 } 91 | wheels = [ 92 | { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 }, 93 | ] 94 | 95 | [[package]] 96 | name = "tomli" 97 | version = "2.2.1" 98 | source = { registry = "https://pypi.org/simple" } 99 | sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 } 100 | wheels = [ 101 | { url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 }, 102 | { url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 }, 103 | { url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 }, 104 | { url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 }, 105 | { url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 }, 106 | { url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 }, 107 | { url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 }, 108 | { url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 }, 109 | { url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 }, 110 | { url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 }, 111 | { url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 }, 112 | { url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 }, 113 | { url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 }, 114 | { url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 }, 115 | { url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 }, 116 | { url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 }, 117 | { url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 }, 118 | { url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 }, 119 | { url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 }, 120 | { url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 }, 121 | { url = 
"https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708 }, 122 | { url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582 }, 123 | { url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543 }, 124 | { url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691 }, 125 | { url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170 }, 126 | { url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530 }, 127 | { url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666 }, 128 | { url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954 }, 129 | { url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724 }, 130 | { url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383 }, 131 | { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 }, 132 | ] 133 | -------------------------------------------------------------------------------- /matcher_rs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "matcher_rs" 3 | authors.workspace = true 4 | categories.workspace = true 5 | description.workspace = true 6 | edition.workspace = true 7 | homepage.workspace = true 8 | keywords.workspace = true 9 | license.workspace = true 10 | repository.workspace = true 11 | version.workspace = true 12 | readme = 
"README.md" 13 | documentation = "https://docs.rs/crate/matcher_rs/latest" 14 | build = "build.rs" 15 | 16 | [package.metadata.docs.rs] 17 | rustc-args = ["-C", "target-feature=native"] 18 | rustdoc-args = ["-C", "target-feature=native", "--document-private-items"] 19 | 20 | [lib] 21 | name = "matcher_rs" 22 | crate-type = ["rlib"] 23 | 24 | [dependencies] 25 | # aho-corasick = "1.1.3" 26 | aho-corasick-unsafe = { version = "0.0.4", git = "https://github.com/Lips7/aho-corasick" } 27 | bitflags = "2.9.1" 28 | daachorse = "1.0.0" 29 | fancy-regex = "0.14.0" 30 | id-set = "0.2.2" 31 | lazy_static = "1.5.0" 32 | micromap = "0.1.0" 33 | nohash-hasher = "0.2.0" 34 | parking_lot = { version = "0.12.4", features = ["hardware-lock-elision"]} 35 | rapidfuzz = "0.5.0" 36 | regex = { version = "1.11.1", features = ["perf-dfa-full"] } 37 | rustc-hash = "2.1.1" 38 | serde = { version = "1.0.219", features = ["derive"] } 39 | tinyvec = { version = "1.9.0", features = ["serde"] } 40 | 41 | [target.'cfg(all(target_os = "linux", target_arch = "aarch64"))'.dependencies] 42 | tikv-jemallocator = "0.6.0" 43 | 44 | [target.'cfg(not(all(target_os = "linux", target_arch = "aarch64")))'.dependencies] 45 | mimalloc = "0.1.46" 46 | 47 | [build-dependencies] 48 | daachorse = "1.0.0" 49 | 50 | [dev-dependencies] 51 | divan = "0.1.21" 52 | fastrand = "2.3.0" 53 | 54 | [features] 55 | default = ["dfa"] 56 | # By enable runtime_build feature, we could build process matcher at runtime, but with build time increasing. 57 | runtime_build = [] 58 | # By enable serde feature, we could serialize and deserialize matcher and simple_matcher. 59 | # With serde feature, AhoCorasick's prefilter is disabled, because I don't know how to serialize it correctly, 60 | # which will lead to performance regression when the patterns size is small (say, less than 100). 61 | serde = ["aho-corasick-unsafe/serde"] 62 | # By enable dfa feature, we could use dfa to perform simple matching, but with significantly increasing memory consumption. 63 | dfa = [] 64 | 65 | [[bench]] 66 | name = "bench" 67 | harness = false 68 | -------------------------------------------------------------------------------- /matcher_rs/build.rs: -------------------------------------------------------------------------------- 1 | use std::io::Result; 2 | 3 | /// The `main` function serves as the build script for a Rust project, responsible for 4 | /// generating binary data files used in text conversion and matching tasks. 5 | /// Depending on the features enabled, it reads specific conversion mappings from 6 | /// text files, processes them, and writes them to binary files. 7 | /// 8 | /// It comprises several key steps: 9 | /// 10 | /// 1. Print instructions to re-run build script if specific files change. 11 | /// 2. Conditionally process text conversion data only if 'runtime_build' feature is not enabled. 12 | /// 3. Load text content from files in the 'process_map' directory into constants like FANJIAN, NUM_NORM, NORM, and PINYIN. 13 | /// 4. For each mapping type ('fanjian', 'normalize', 'pinyin'): 14 | /// - Aggregate conversion mappings from loaded constants into a HashMap. 15 | /// - Clean the HashMap by removing identity mappings. 16 | /// - Create binary files containing the list of strings to match and the list of corresponding replacements. 17 | /// - For 'pinyin': 18 | /// - Also create a binary file with trimmed replacements. 
19 | /// - For specified mappings ('fanjian', 'pinyin'): 20 | /// - Use the `daachorse` crate to build and serialize a CharwiseDoubleArrayAhoCorasick matcher, and write it to a binary file. 21 | /// - For 'normalize', when DFA feature is not enabled: 22 | /// - Similarly, build a matcher with a different match kind and serialize it. 23 | /// 5. Additionally, if 'dfa' feature is not enabled: 24 | /// - Load delete and whitespace character patterns from TEXT_DELETE constant and WHITE_SPACE array respectively. 25 | /// - Aggregate these patterns into a HashSet to remove duplicates. 26 | /// - Write these patterns to a binary file. 27 | /// - Build a matcher for these patterns, serialize it, and write it to a binary file. 28 | /// 29 | /// The function completes by returning `Ok(())` to indicate successful completion of the build script. 30 | fn main() -> Result<()> { 31 | println!("cargo:rerun-if-changed=build.rs"); 32 | println!("cargo:rerun-if-changed=process_map"); 33 | 34 | #[cfg(not(feature = "runtime_build"))] 35 | { 36 | use std::collections::HashMap; 37 | use std::env; 38 | use std::fs::File; 39 | use std::io::Write; 40 | 41 | use daachorse::{ 42 | CharwiseDoubleArrayAhoCorasick, CharwiseDoubleArrayAhoCorasickBuilder, 43 | MatchKind as DoubleArrayAhoCorasickMatchKind, 44 | }; 45 | 46 | /// These constants include the contents of their respective text files 47 | /// from the `process_map` directory. Each constant refers to a specific 48 | /// text conversion mapping used within the project. The text files 49 | /// contain tab-separated values, where each line represents a pair of 50 | /// strings that define a specific conversion. 51 | /// 52 | /// - `FANJIAN` includes simplified and traditional Chinese character mappings. 53 | /// - `NUM_NORM` includes mappings for normalizing numbers. 54 | /// - `NORM` includes mappings for various normalization forms. 55 | /// - `PINYIN` includes mappings for converting characters to Pinyin. 
56 |         const FANJIAN: &str = include_str!("./process_map/FANJIAN.txt");
57 |         const NUM_NORM: &str = include_str!("./process_map/NUM-NORM.txt");
58 |         const NORM: &str = include_str!("./process_map/NORM.txt");
59 |         const PINYIN: &str = include_str!("./process_map/PINYIN.txt");
60 | 
61 |         let out_dir = env::var("OUT_DIR").unwrap();
62 |         let process_str_map = HashMap::from([
63 |             ("fanjian", vec![FANJIAN]),
64 |             ("normalize", vec![NORM, NUM_NORM]),
65 |             ("pinyin", vec![PINYIN]),
66 |         ]);
67 | 
68 |         for process_type_bit_str in ["fanjian", "normalize", "pinyin"] {
69 |             let mut process_dict = HashMap::new();
70 | 
71 |             for process_map in process_str_map.get(process_type_bit_str).unwrap() {
72 |                 process_dict.extend(process_map.trim().lines().map(|pair_str| {
73 |                     let mut pair_str_split = pair_str.split('\t');
74 |                     (
75 |                         pair_str_split.next().unwrap(),
76 |                         pair_str_split.next().unwrap(),
77 |                     )
78 |                 }))
79 |             }
80 | 
81 |             process_dict.retain(|&key, &mut value| key != value);
82 |             let process_list = process_dict
83 |                 .iter()
84 |                 .map(|(&key, _)| key)
85 |                 .collect::<Vec<_>>();
86 | 
87 |             let mut process_list_bin =
88 |                 File::create(format!("{out_dir}/{process_type_bit_str}_process_list.bin"))?;
89 |             process_list_bin.write_all(process_list.join("\n").as_bytes())?;
90 | 
91 |             let process_replace_list = process_dict
92 |                 .iter()
93 |                 .map(|(_, &val)| val)
94 |                 .collect::<Vec<_>>();
95 |             let mut process_replace_list_bin = File::create(format!(
96 |                 "{out_dir}/{process_type_bit_str}_process_replace_list.bin"
97 |             ))?;
98 |             process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?;
99 | 
100 |             if process_type_bit_str == "pinyin" {
101 |                 let process_replace_list = process_dict
102 |                     .iter()
103 |                     .map(|(_, &val)| val.trim_matches(' '))
104 |                     .collect::<Vec<_>>();
105 |                 let mut process_replace_list_bin =
106 |                     File::create(format!("{out_dir}/pinyinchar_process_replace_list.bin"))?;
107 |                 process_replace_list_bin.write_all(process_replace_list.join("\n").as_bytes())?;
108 |             }
109 | 
110 |             if ["fanjian", "pinyin"].contains(&process_type_bit_str) {
111 |                 let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
112 |                     CharwiseDoubleArrayAhoCorasickBuilder::new()
113 |                         .match_kind(DoubleArrayAhoCorasickMatchKind::Standard)
114 |                         .build(&process_list)
115 |                         .unwrap();
116 |                 let matcher_bytes = matcher.serialize();
117 |                 let mut matcher_bin = File::create(format!(
118 |                     "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin"
119 |                 ))?;
120 |                 matcher_bin.write_all(&matcher_bytes)?;
121 |             }
122 | 
123 |             #[cfg(not(feature = "dfa"))]
124 |             if process_type_bit_str == "normalize" {
125 |                 let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
126 |                     CharwiseDoubleArrayAhoCorasickBuilder::new()
127 |                         .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
128 |                         .build(&process_list)
129 |                         .unwrap();
130 |                 let matcher_bytes = matcher.serialize();
131 |                 let mut matcher_bin = File::create(format!(
132 |                     "{out_dir}/{process_type_bit_str}_daachorse_charwise_u32_matcher.bin"
133 |                 ))?;
134 |                 matcher_bin.write_all(&matcher_bytes)?;
135 |             }
136 |         }
137 | 
138 |         #[cfg(not(feature = "dfa"))]
139 |         {
140 |             use std::collections::HashSet;
141 | 
142 |             /// These constants define deletion and whitespace character mappings
143 |             /// that are used within the project. The `TEXT_DELETE` constant
144 |             /// includes contents from the `TEXT-DELETE.txt` file in the `process_map`
145 |             /// directory, which contains textual patterns to be deleted.
146 |             /// The `WHITE_SPACE` constant includes various Unicode whitespace
147 |             /// characters that are treated as whitespace in the project's text
148 |             /// processing logic.
149 |             ///
150 |             /// - `TEXT_DELETE` includes patterns of text identified for deletion.
151 |             /// - `WHITE_SPACE` includes numerous Unicode representations of whitespace.
152 |             const TEXT_DELETE: &str = include_str!("./process_map/TEXT-DELETE.txt");
153 |             const WHITE_SPACE: &[&str] = &[
154 |                 "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}",
155 |                 "\u{00A0}", "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}",
156 |                 "\u{2005}", "\u{2006}", "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}",
157 |                 "\u{200F}", "\u{2028}", "\u{2029}", "\u{202F}", "\u{205F}", "\u{3000}",
158 |             ];
159 | 
160 |             let mut process_set = HashSet::new();
161 | 
162 |             process_set.extend(TEXT_DELETE.trim().lines());
163 |             process_set.extend(WHITE_SPACE);
164 | 
165 |             let process_list = process_set.iter().copied().collect::<Vec<_>>();
166 | 
167 |             let mut process_list_bin = File::create(format!("{out_dir}/delete_process_list.bin"))?;
168 |             process_list_bin.write_all(process_list.join("\n").as_bytes())?;
169 | 
170 |             let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
171 |                 CharwiseDoubleArrayAhoCorasickBuilder::new()
172 |                     .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
173 |                     .build(&process_list)
174 |                     .unwrap();
175 |             let matcher_bytes = matcher.serialize();
176 |             let mut matcher_bin = File::create(format!(
177 |                 "{out_dir}/delete_daachorse_charwise_u32_matcher.bin"
178 |             ))?;
179 |             matcher_bin.write_all(&matcher_bytes)?;
180 |         }
181 |     }
182 | 
183 |     Ok(())
184 | }
185 | 
-------------------------------------------------------------------------------- /matcher_rs/src/lib.rs: --------------------------------------------------------------------------------
1 | #[cfg(all(target_os = "linux", target_arch = "aarch64"))]
2 | #[global_allocator]
3 | static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
4 | 
5 | #[cfg(not(all(target_os = "linux", target_arch = "aarch64")))]
6 | #[global_allocator]
7 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
8 | 
9 | mod util;
10 | pub use util::word::SimpleWord;
11 | 
12 | mod process;
13 | pub use process::process_matcher::{
14 |     build_process_type_tree, get_process_matcher, reduce_text_process, reduce_text_process_emit,
15 |     reduce_text_process_with_set, reduce_text_process_with_tree, text_process, ProcessType,
16 | };
17 | 
18 | mod simple_matcher;
19 | pub use simple_matcher::{SimpleMatcher, SimpleResult, SimpleTable, SimpleTableSerde};
20 | 
21 | mod regex_matcher;
22 | pub use regex_matcher::{RegexMatchType, RegexMatcher, RegexResult, RegexTable};
23 | 
24 | mod sim_matcher;
25 | pub use sim_matcher::{SimMatchType, SimMatcher, SimResult, SimTable};
26 | 
27 | mod matcher;
28 | pub use matcher::{
29 |     MatchResult, MatchResultTrait, MatchTable, MatchTableMap, MatchTableMapSerde, MatchTableType,
30 |     Matcher, TextMatcherTrait,
31 | };
-------------------------------------------------------------------------------- /matcher_rs/src/process/constants.rs: --------------------------------------------------------------------------------
1 | //! This module defines several constants for processing and normalization of text data,
2 | //! including definitions for whitespace characters, conditional includes for files,
3 | //! and configurations for runtime build and DFA (Deterministic Finite Automaton) features.
4 | 
5 | /// These constants are conditionally included when the `runtime_build` feature is enabled.
6 | /// They embed the contents of various text processing maps used for normalization and replacement.
7 | ///
8 | /// - `FANJIAN`: Maps traditional Chinese characters to simplified Chinese characters.
9 | /// - `TEXT_DELETE`: Defines text segments that should be removed during preprocessing.
10 | /// - `NUM_NORM`: Specifies numeric normalization rules.
11 | /// - `NORM`: Contains general normalization rules.
12 | /// - `PINYIN`: Provides mappings for converting Chinese characters to Pinyin.
13 | #[cfg(feature = "runtime_build")]
14 | pub const FANJIAN: &str = include_str!("../../process_map/FANJIAN.txt");
15 | #[cfg(feature = "runtime_build")]
16 | pub const TEXT_DELETE: &str = include_str!("../../process_map/TEXT-DELETE.txt");
17 | #[cfg(feature = "runtime_build")]
18 | pub const NUM_NORM: &str = include_str!("../../process_map/NUM-NORM.txt");
19 | #[cfg(feature = "runtime_build")]
20 | pub const NORM: &str = include_str!("../../process_map/NORM.txt");
21 | #[cfg(feature = "runtime_build")]
22 | pub const PINYIN: &str = include_str!("../../process_map/PINYIN.txt");
23 | 
24 | /// These constants are for normalization processing and are included based on different
25 | /// feature flags.
26 | ///
27 | /// When the `runtime_build` feature is not enabled and the `dfa` feature is enabled,
28 | /// `NORMALIZE_PROCESS_LIST_STR` is included. This constant embeds the
29 | /// normalization process list, which is generated at build time.
30 | ///
31 | /// When `runtime_build` is not enabled and the `dfa` feature is not enabled,
32 | /// `NORMALIZE_PROCESS_MATCHER_BYTES` is included. This constant embeds the serialized
33 | /// normalization matcher, which is also generated during the build process.
34 | ///
35 | /// Additionally, `NORMALIZE_PROCESS_REPLACE_LIST_STR` is included when `runtime_build`
36 | /// is not enabled. This constant embeds the normalization replace list,
37 | /// used for text replacement operations during normalization.
38 | #[cfg(all(not(feature = "runtime_build"), feature = "dfa"))]
39 | pub const NORMALIZE_PROCESS_LIST_STR: &str =
40 |     include_str!(concat!(env!("OUT_DIR"), "/normalize_process_list.bin"));
41 | #[cfg(all(not(feature = "runtime_build"), not(feature = "dfa")))]
42 | pub const NORMALIZE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!(
43 |     env!("OUT_DIR"),
44 |     "/normalize_daachorse_charwise_u32_matcher.bin"
45 | ));
46 | #[cfg(not(feature = "runtime_build"))]
47 | pub const NORMALIZE_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!(
48 |     env!("OUT_DIR"),
49 |     "/normalize_process_replace_list.bin"
50 | ));
51 | 
52 | /// These constants are related to Fanjian (simplified vs traditional Chinese conversion)
53 | /// processing and are included based on feature flags.
54 | ///
55 | /// - When the `runtime_build` feature is not enabled, `FANJIAN_PROCESS_REPLACE_LIST_STR`
56 | ///   is included. This constant embeds the Fanjian process replace list,
57 | ///   which is used for converting traditional Chinese characters to simplified Chinese
58 | ///   characters during normalization.
59 | ///
60 | /// - Additionally, when the `runtime_build` feature is not enabled, `FANJIAN_PROCESS_MATCHER_BYTES`
61 | ///   is included. This constant embeds the serialized Fanjian matcher, which is
62 | ///   used for matching Fanjian text patterns during the normalization process.
63 | #[cfg(not(feature = "runtime_build"))] 64 | pub const FANJIAN_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( 65 | env!("OUT_DIR"), 66 | "/fanjian_process_replace_list.bin" 67 | )); 68 | #[cfg(not(feature = "runtime_build"))] 69 | pub const FANJIAN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( 70 | env!("OUT_DIR"), 71 | "/fanjian_daachorse_charwise_u32_matcher.bin" 72 | )); 73 | 74 | /// These constants are related to Pinyin processing and are included based on feature flags. 75 | /// 76 | /// - When the `runtime_build` feature is not enabled, `PINYIN_PROCESS_REPLACE_LIST_STR` 77 | /// is included. This constant embeds the Pinyin process replace list, 78 | /// which is used for converting Chinese characters to Pinyin during normalization. 79 | /// 80 | /// - Similarly, when the `runtime_build` feature is not enabled, `PINYINCHAR_PROCESS_REPLACE_LIST_STR` 81 | /// is included. This constant embeds the Pinyin character process replace list, 82 | /// which is also used for text replacement operations. 83 | /// 84 | /// - Additionally, when the `runtime_build` feature is not enabled, `PINYIN_PROCESS_MATCHER_BYTES` 85 | /// is included. This constant embeds the serialized Pinyin matcher bytes, which are 86 | /// used for matching Pinyin text patterns during the normalization process. 87 | #[cfg(not(feature = "runtime_build"))] 88 | pub const PINYIN_PROCESS_REPLACE_LIST_STR: &str = 89 | include_str!(concat!(env!("OUT_DIR"), "/pinyin_process_replace_list.bin")); 90 | #[cfg(not(feature = "runtime_build"))] 91 | pub const PINYINCHAR_PROCESS_REPLACE_LIST_STR: &str = include_str!(concat!( 92 | env!("OUT_DIR"), 93 | "/pinyinchar_process_replace_list.bin" 94 | )); 95 | #[cfg(not(feature = "runtime_build"))] 96 | pub const PINYIN_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( 97 | env!("OUT_DIR"), 98 | "/pinyin_daachorse_charwise_u32_matcher.bin" 99 | )); 100 | 101 | /// List of Unicode code points treated as whitespace characters. 102 | #[cfg(any(feature = "runtime_build", feature = "dfa"))] 103 | pub const WHITE_SPACE: &[&str] = &[ 104 | "\u{0009}", "\u{000A}", "\u{000B}", "\u{000C}", "\u{000D}", "\u{0020}", "\u{0085}", "\u{00A0}", 105 | "\u{1680}", "\u{2000}", "\u{2001}", "\u{2002}", "\u{2003}", "\u{2004}", "\u{2005}", "\u{2006}", 106 | "\u{2007}", "\u{2008}", "\u{2009}", "\u{200A}", "\u{200D}", "\u{200F}", "\u{2028}", "\u{2029}", 107 | "\u{202F}", "\u{205F}", "\u{3000}", 108 | ]; 109 | 110 | /// These constants are related to the text deletion processing and are included based on feature flags. 111 | /// 112 | /// - When the `runtime_build` feature is not enabled and the `dfa` feature is enabled, 113 | /// `TEXT_DELETE` is included. This constant embeds the text deletion map, 114 | /// used for identifying text segments to be deleted during normalization. 115 | /// 116 | /// - When the `runtime_build` feature is not enabled and the `dfa` feature is not enabled, 117 | /// `TEXT_DELETE_PROCESS_MATCHER_BYTES` is included. This constant embeds the serialized 118 | /// text deletion matcher bytes, which are generated during the build process and 119 | /// used for matching text patterns to be deleted during normalization.
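///
/// In configurations where the deletion patterns are assembled at run time,
/// `TEXT_DELETE` and `WHITE_SPACE` can be combined into a single automaton,
/// mirroring what the build script does at compile time. A minimal sketch under
/// that assumption (not this crate's actual loader):
///
/// ```ignore
/// use std::collections::HashSet;
///
/// // Dedup guards against a pattern appearing in both sources, which the
/// // automaton builder would reject.
/// let mut pattern_set: HashSet<&str> = TEXT_DELETE.trim().lines().collect();
/// pattern_set.extend(WHITE_SPACE);
/// let pattern_list: Vec<&str> = pattern_set.into_iter().collect();
///
/// let matcher: CharwiseDoubleArrayAhoCorasick<u32> =
///     CharwiseDoubleArrayAhoCorasickBuilder::new()
///         .match_kind(DoubleArrayAhoCorasickMatchKind::LeftmostLongest)
///         .build(&pattern_list)
///         .unwrap();
/// ```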
120 | #[cfg(all(not(feature = "runtime_build"), feature = "dfa"))] 121 | pub const TEXT_DELETE: &str = include_str!("../../process_map/TEXT-DELETE.txt"); 122 | #[cfg(all(not(feature = "runtime_build"), not(feature = "dfa")))] 123 | pub const TEXT_DELETE_PROCESS_MATCHER_BYTES: &[u8] = include_bytes!(concat!( 124 | env!("OUT_DIR"), 125 | "/delete_daachorse_charwise_u32_matcher.bin" 126 | )); 127 | -------------------------------------------------------------------------------- /matcher_rs/src/process/mod.rs: -------------------------------------------------------------------------------- 1 | mod constants; 2 | pub mod process_matcher; 3 | -------------------------------------------------------------------------------- /matcher_rs/src/sim_matcher.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | 3 | use id_set::IdSet; 4 | use rapidfuzz::distance; 5 | use serde::{Deserialize, Serialize}; 6 | 7 | use crate::{ 8 | matcher::{MatchResultTrait, TextMatcherTrait}, 9 | process::process_matcher::{ 10 | build_process_type_tree, reduce_text_process_with_tree, ProcessType, ProcessTypeBitNode, 11 | }, 12 | }; 13 | 14 | /// Enumeration representing the types of similarity matching algorithms available. 15 | /// 16 | /// Currently, this enum only supports the Levenshtein distance algorithm. 17 | /// 18 | /// # Variants 19 | /// 20 | /// * [SimMatchType::Levenshtein] - Represents the Levenshtein distance algorithm, a string metric for measuring the difference between two sequences. 21 | /// 22 | /// The enum variants are serialized and deserialized using the `snake_case` naming convention. 23 | #[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)] 24 | #[serde(rename_all = "snake_case")] 25 | pub enum SimMatchType { 26 | Levenshtein, 27 | } 28 | 29 | /// Represents a table structure to be used in the similarity matching process. 30 | /// 31 | /// This structure holds various properties required for similarity matching using different algorithms. 32 | /// 33 | /// # Fields 34 | /// 35 | /// * `table_id` - A unique identifier for the table. 36 | /// * `match_id` - A unique identifier for the matching process. 37 | /// * `process_type` - The type of processing to be applied, represented by the [ProcessType] enum. 38 | /// * `sim_match_type` - The type of similarity matching algorithm to be used, represented by the [SimMatchType] enum. 39 | /// * `word_list` - A list of words to be used in the matching process. 40 | /// * `threshold` - A float value representing the similarity threshold for matching. 41 | #[derive(Debug, Clone)] 42 | pub struct SimTable<'a> { 43 | pub table_id: u32, 44 | pub match_id: u32, 45 | pub process_type: ProcessType, 46 | pub sim_match_type: SimMatchType, 47 | pub word_list: Vec<&'a str>, 48 | pub threshold: f64, 49 | } 50 | 51 | /// Represents a processed table used in the similarity matching process. 52 | /// 53 | /// This struct is a concrete version of the [SimTable] struct, with ownership over 54 | /// the word list. 55 | /// 56 | /// # Fields 57 | /// 58 | /// * `table_id` - A unique identifier for the table. 59 | /// * `match_id` - A unique identifier for the matching process. 60 | /// * `process_type` - The type of processing to be applied, represented by the [ProcessType] enum. 61 | /// * `sim_match_type` - The type of similarity matching algorithm to be used, represented by the [SimMatchType] enum. 62 | /// * `word_list` - A list of words over which the matching operation is performed. 
This is an owned vector of strings. 63 | /// * `threshold` - A float value representing the similarity threshold for a match. 64 | #[derive(Debug, Clone)] 65 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 66 | struct SimProcessedTable { 67 | table_id: u32, 68 | match_id: u32, 69 | process_type: ProcessType, 70 | sim_match_type: SimMatchType, 71 | word_list: Vec<String>, 72 | threshold: f64, 73 | } 74 | 75 | /// Represents the result of a similarity matching operation. 76 | /// 77 | /// This struct holds information about the match including identifiers for the match and table, 78 | /// the word that was matched, and the similarity score of the match. The word is represented as a 79 | /// [Cow] (Clone on Write) for efficient handling of borrowed or owned strings. This allows 80 | /// flexibility in returning either a borrowed string or an owned string. 81 | /// 82 | /// # Fields 83 | /// 84 | /// * `match_id` - A unique identifier for the matching process. 85 | /// * `table_id` - A unique identifier for the table. 86 | /// * `word_id` - A unique identifier for the word within the table. 87 | /// * `word` - The word that was matched, represented as a [Cow] to allow for both borrowed and owned strings. 88 | /// * `similarity` - A float value representing the similarity score of the match. 89 | #[derive(Debug, Clone)] 90 | pub struct SimResult<'a> { 91 | pub match_id: u32, 92 | pub table_id: u32, 93 | pub word_id: u32, 94 | pub word: Cow<'a, str>, 95 | pub similarity: f64, 96 | } 97 | 98 | impl MatchResultTrait<'_> for SimResult<'_> { 99 | fn match_id(&self) -> u32 { 100 | self.match_id 101 | } 102 | fn table_id(&self) -> u32 { 103 | self.table_id 104 | } 105 | fn word_id(&self) -> u32 { 106 | self.word_id 107 | } 108 | fn word(&self) -> &str { 109 | &self.word 110 | } 111 | fn similarity(&self) -> f64 { 112 | self.similarity 113 | } 114 | } 115 | 116 | /// The [SimMatcher] struct is responsible for performing similarity matching operations 117 | /// based on different processing types and similarity algorithms. 118 | /// 119 | /// This struct maintains a process type tree and a list of pre-processed tables that contain 120 | /// the necessary information for performing similarity matching on texts. 121 | /// 122 | /// # Fields 123 | /// 124 | /// * `process_type_tree` - A vector of `ProcessTypeBitNode`, representing the tree structure used for 125 | /// text processing based on defined process types. 126 | /// * `sim_processed_table_list` - A vector of `SimProcessedTable`, holding the tables with processed information 127 | /// for performing similarity matching. 128 | /// 129 | /// # Example 130 | /// 131 | /// ``` 132 | /// use matcher_rs::{SimMatcher, SimTable, SimMatchType, ProcessType}; 133 | /// 134 | /// // Create a list of `SimTable` with the required properties 135 | /// let sim_table_list = vec![SimTable { 136 | /// table_id: 1, 137 | /// match_id: 1, 138 | /// process_type: ProcessType::None, 139 | /// sim_match_type: SimMatchType::Levenshtein, 140 | /// word_list: vec!["example", "test"], 141 | /// threshold: 0.8, 142 | /// }]; 143 | /// 144 | /// // Instantiate a `SimMatcher` with the list of `SimTable` 145 | /// let matcher = SimMatcher::new(&sim_table_list); 146 | /// 147 | /// // Use `matcher` methods for performing similarity matching operations 148 | /// ``` 149 | /// 150 | /// The [SimMatcher] struct provides methods for checking if a text matches any of the processed tables 151 | /// and for processing texts to obtain a list of similarity results.
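///
/// As a hedged illustration of the behavior one can expect from the instance
/// above (Levenshtein at a 0.8 threshold; the probe strings are made up for
/// this example):
///
/// ```
/// # use matcher_rs::{SimMatcher, SimTable, SimMatchType, ProcessType, TextMatcherTrait};
/// # let sim_table_list = vec![SimTable {
/// #     table_id: 1,
/// #     match_id: 1,
/// #     process_type: ProcessType::None,
/// #     sim_match_type: SimMatchType::Levenshtein,
/// #     word_list: vec!["example", "test"],
/// #     threshold: 0.8,
/// # }];
/// # let matcher = SimMatcher::new(&sim_table_list);
/// assert!(matcher.is_match("examp1e")); // 1 edit over 7 chars ≈ 0.86 >= 0.8
/// assert!(!matcher.is_match("sample")); // 2 edits over 7 chars ≈ 0.71 < 0.8
/// ```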
152 | #[derive(Debug, Clone)] 153 | #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] 154 | pub struct SimMatcher { 155 | process_type_tree: Vec<ProcessTypeBitNode>, 156 | sim_processed_table_list: Vec<SimProcessedTable>, 157 | } 158 | 159 | impl SimMatcher { 160 | /// Creates a new instance of [SimMatcher] from a list of [SimTable]. 161 | /// 162 | /// This function initializes a [SimMatcher] by processing each [SimTable] in the input list. 163 | /// It extracts the process types and constructs a tree structure used for processing texts. 164 | /// Additionally, it converts the word lists in each [SimTable] from borrowed strings to owned strings. 165 | /// 166 | /// # Parameters 167 | /// 168 | /// * `sim_table_list` - A slice of [SimTable] references to be processed and included in the new [SimMatcher] instance. 169 | /// 170 | /// # Returns 171 | /// 172 | /// Returns a new instance of [SimMatcher] containing: 173 | /// * `process_type_tree` - A vector of `ProcessTypeBitNode`, representing the tree structure used for text processing based on the process types extracted from the input [SimTable] list. 174 | /// * `sim_processed_table_list` - A vector of `SimProcessedTable`, each containing an owned vector of words and other properties derived from the input [SimTable] list. 175 | pub fn new(sim_table_list: &[SimTable]) -> SimMatcher { 176 | let mut process_type_set = IdSet::with_capacity(sim_table_list.len()); 177 | let mut sim_processed_table_list = Vec::with_capacity(sim_table_list.len()); 178 | 179 | for sim_table in sim_table_list { 180 | process_type_set.insert(sim_table.process_type.bits() as usize); 181 | sim_processed_table_list.push(SimProcessedTable { 182 | table_id: sim_table.table_id, 183 | match_id: sim_table.match_id, 184 | process_type: sim_table.process_type, 185 | sim_match_type: sim_table.sim_match_type, 186 | word_list: sim_table 187 | .word_list 188 | .iter() 189 | .map(|&word| word.to_owned()) 190 | .collect::<Vec<String>>(), 191 | threshold: sim_table.threshold, 192 | }) 193 | } 194 | 195 | let process_type_tree = build_process_type_tree(&process_type_set); 196 | 197 | SimMatcher { 198 | process_type_tree, 199 | sim_processed_table_list, 200 | } 201 | } 202 | } 203 | 204 | impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher { 205 | /// Checks if the provided text matches any entry in the processed tables. 206 | /// 207 | /// This function processes the input text to generate a set of processed text variants 208 | /// based on the defined process types. It then delegates the actual matching logic to a 209 | /// helper function that checks if any of these processed text variants match the entries 210 | /// in the `sim_processed_table_list`. 211 | /// 212 | /// # Parameters 213 | /// 214 | /// * `text` - A string slice representing the input text to be checked for similarity matches. 215 | /// 216 | /// # Returns 217 | /// 218 | /// Returns `true` if the processed text matches any entry in the processed tables; otherwise returns `false`. 219 | fn is_match(&'a self, text: &'a str) -> bool { 220 | if text.is_empty() { 221 | return false; 222 | } 223 | 224 | let processed_text_process_type_set = 225 | reduce_text_process_with_tree(&self.process_type_tree, text); 226 | 227 | self._is_match_with_processed_text_process_type_set(&processed_text_process_type_set) 228 | } 229 | 230 | /// Checks if any processed text variant matches an entry in the similarity tables. 231 | /// 232 | /// This helper function iterates through the processed text variants and their corresponding 233 | /// process type sets.
For each variant, it checks against all entries in the similarity tables 234 | /// to see if there is a match based on the defined similarity match type (e.g., Levenshtein). 235 | /// 236 | /// # Parameters 237 | /// 238 | /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of: 239 | ///   - A processed text variant represented as a [`Cow`]. 240 | ///   - An [IdSet] containing the process type identifiers associated with the processed text. 241 | /// 242 | /// # Returns 243 | /// 244 | /// Returns `true` if any of the processed text variants match an entry in the similarity tables 245 | /// according to the specified match type and similarity threshold; otherwise, returns `false`. 246 | fn _is_match_with_processed_text_process_type_set( 247 | &'a self, 248 | processed_text_process_type_set: &[(Cow<'a, str>, id_set::IdSet)], 249 | ) -> bool { 250 | for (processed_text, process_type_set) in processed_text_process_type_set { 251 | for sim_processed_table in &self.sim_processed_table_list { 252 | if !process_type_set.contains(sim_processed_table.process_type.bits() as usize) { 253 | continue; 254 | } 255 | let is_match = match sim_processed_table.sim_match_type { 256 | SimMatchType::Levenshtein => sim_processed_table.word_list.iter().any(|text| { 257 | distance::levenshtein::normalized_similarity_with_args( 258 | text.chars(), 259 | processed_text.chars(), 260 | &distance::levenshtein::Args::default() 261 | .score_cutoff(sim_processed_table.threshold), 262 | ) 263 | .is_some() 264 | }), 265 | }; 266 | 267 | if is_match { 268 | return true; 269 | } 270 | } 271 | } 272 | 273 | false 274 | } 275 | 276 | /// Processes the provided text and returns a list of similarity results. 277 | /// 278 | /// This function takes the input text and generates a set of processed text variants based 279 | /// on the defined process types, as described in the `process_type_tree`. It then uses these 280 | /// variants to find matches in the similarity tables, accumulating results where a similarity 281 | /// match is found. 282 | /// 283 | /// # Parameters 284 | /// 285 | /// * `text` - A string slice representing the input text to be processed and checked for similarity matches. 286 | /// 287 | /// # Returns 288 | /// 289 | /// Returns a vector of [SimResult] instances, each containing information about a matched entry 290 | /// in the similarity tables, including the `match_id`, `table_id`, `word_id`, `word`, and the 291 | /// similarity score. 292 | fn process(&'a self, text: &'a str) -> Vec<SimResult<'a>> { 293 | if text.is_empty() { 294 | return Vec::new(); 295 | } 296 | 297 | let processed_text_process_type_set = 298 | reduce_text_process_with_tree(&self.process_type_tree, text); 299 | 300 | self._process_with_processed_text_process_type_set(&processed_text_process_type_set) 301 | } 302 | 303 | /// Processes the provided set of processed text variants and their corresponding process type sets, 304 | /// returning a list of similarity results. 305 | /// 306 | /// This function iterates through each processed text variant and its associated process type set, 307 | /// comparing them against entries in the similarity tables to identify matches based on the defined 308 | /// similarity match type (e.g., Levenshtein). For each match found, the function accumulates the result 309 | /// with relevant information such as `match_id`, `table_id`, `word_id`, `word`, and the similarity score.
310 | /// 311 | /// # Parameters 312 | /// 313 | /// * `processed_text_process_type_set` - A reference to a list of tuples where each tuple consists of: 314 | ///   - A processed text variant represented as a [`Cow`]. 315 | ///   - An [IdSet] containing the process type identifiers associated with the processed text. 316 | /// 317 | /// # Returns 318 | /// 319 | /// Returns a vector of [SimResult] instances, each containing information about a matched entry 320 | /// in the similarity tables, including: 321 | /// - `match_id`: The identifier for the match. 322 | /// - `table_id`: The identifier of the similarity table where the match was found. 323 | /// - `word_id`: The index of the word in the similarity table's word list. 324 | /// - `word`: The word from the similarity table's word list that matched the processed text. 325 | /// - `similarity`: The similarity score of the match. 326 | /// 327 | /// The function ensures that only unique matches are included in the result list by maintaining 328 | /// an [IdSet] to track already processed table ID and word index combinations. 329 | fn _process_with_processed_text_process_type_set( 330 | &'a self, 331 | processed_text_process_type_set: &[(Cow<'a, str>, IdSet)], 332 | ) -> Vec<SimResult<'a>> { 333 | let mut result_list = Vec::new(); 334 | let mut table_id_index_set = IdSet::new(); 335 | 336 | for (processed_text, process_type_set) in processed_text_process_type_set { 337 | for sim_processed_table in &self.sim_processed_table_list { 338 | if !process_type_set.contains(sim_processed_table.process_type.bits() as usize) { 339 | continue; 340 | } 341 | match sim_processed_table.sim_match_type { 342 | SimMatchType::Levenshtein => { 343 | for (index, text) in sim_processed_table.word_list.iter().enumerate() { 344 | let table_id_index = 345 | ((sim_processed_table.table_id as usize) << 32) | index; 346 | 347 | if table_id_index_set.insert(table_id_index) { 348 | if let Some(similarity) = 349 | distance::levenshtein::normalized_similarity_with_args( 350 | text.chars(), 351 | processed_text.chars(), 352 | &distance::levenshtein::Args::default() 353 | .score_cutoff(sim_processed_table.threshold), 354 | ) 355 | { 356 | result_list.push(SimResult { 357 | match_id: sim_processed_table.match_id, 358 | table_id: sim_processed_table.table_id, 359 | word_id: index as u32, 360 | word: Cow::Borrowed(text), 361 | similarity, 362 | }); 363 | } 364 | } 365 | } 366 | } 367 | } 368 | } 369 | } 370 | 371 | result_list 372 | } 373 | } 374 | -------------------------------------------------------------------------------- /matcher_rs/src/util/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod serde; 2 | pub mod word; 3 | -------------------------------------------------------------------------------- /matcher_rs/src/util/serde.rs: -------------------------------------------------------------------------------- 1 | #[cfg(feature = "serde")] 2 | use std::borrow::Cow; 3 | 4 | #[cfg(feature = "serde")] 5 | use fancy_regex::Regex; 6 | #[cfg(feature = "serde")] 7 | use regex::RegexSet; 8 | #[cfg(feature = "serde")] 9 | use serde::{de::Error, Deserialize, Deserializer, Serialize, Serializer}; 10 | 11 | #[cfg(feature = "serde")] 12 | pub mod serde_regex { 13 | use super::*; 14 | 15 | /// Deserialize and serialize functions for `Regex` type. 16 | /// 17 | /// This module provides custom serialization and deserialization 18 | /// for the `Regex` type from the `fancy_regex` crate using Serde.
19 | /// The regex is serialized as a string and deserialized back into a `Regex` object. 20 | /// 21 | /// To use the custom serialization and deserialization, the field in the struct must 22 | /// be annotated with `#[serde(with = "serde_regex")]`. 23 | /// 24 | /// The provided methods ensure that regex patterns are correctly handled during 25 | /// serialization and deserialization processes without losing the actual regex functionalities. 26 | pub fn deserialize<'de, D>(d: D) -> Result<Regex, D::Error> 27 | where 28 | D: Deserializer<'de>, 29 | { 30 | let s = <Cow<str>>::deserialize(d)?; 31 | 32 | match Regex::new(s.as_ref()) { 33 | Ok(regex) => Ok(regex), 34 | Err(err) => Err(D::Error::custom(err)), 35 | } 36 | } 37 | 38 | pub fn serialize<S>(regex: &Regex, serializer: S) -> Result<S::Ok, S::Error> 39 | where 40 | S: Serializer, 41 | { 42 | regex.as_str().serialize(serializer) 43 | } 44 | } 45 | 46 | #[cfg(feature = "serde")] 47 | pub mod serde_regex_list { 48 | use serde::ser::SerializeSeq; 49 | 50 | use super::*; 51 | 52 | /// Deserialize and serialize functions for a list of `Regex` types. 53 | /// 54 | /// This module provides custom serialization and deserialization 55 | /// for lists of the `Regex` type from the `fancy_regex` crate using Serde. 56 | /// Each regex in the list is serialized as a string and deserialized back into a `Regex` object. 57 | /// 58 | /// To use the custom serialization and deserialization, the field in the struct must 59 | /// be annotated with `#[serde(with = "serde_regex_list")]`. 60 | /// 61 | /// These methods ensure that lists of regex patterns are correctly handled during 62 | /// serialization and deserialization processes without losing the actual regex functionalities. 63 | pub fn deserialize<'de, D>(d: D) -> Result<Vec<Regex>, D::Error> 64 | where 65 | D: Deserializer<'de>, 66 | { 67 | let s = <Vec<Cow<str>>>::deserialize(d)?; 68 | let mut regex_list = Vec::with_capacity(s.len()); 69 | for e in s.into_iter() { 70 | let regex = Regex::new(e.as_ref()).map_err(D::Error::custom)?; 71 | regex_list.push(regex); 72 | } 73 | 74 | Ok(regex_list) 75 | } 76 | 77 | pub fn serialize<S>(regex_list: &Vec<Regex>, serializer: S) -> Result<S::Ok, S::Error> 78 | where 79 | S: Serializer, 80 | { 81 | let mut seq = serializer.serialize_seq(Some(regex_list.len()))?; 82 | for e in regex_list { 83 | seq.serialize_element(e.as_str())?; 84 | } 85 | seq.end() 86 | } 87 | } 88 | 89 | #[cfg(feature = "serde")] 90 | pub mod serde_regex_set { 91 | use serde::ser::SerializeSeq; 92 | 93 | use super::*; 94 | 95 | /// Deserialize and serialize functions for `RegexSet` type. 96 | /// 97 | /// This module provides custom serialization and deserialization 98 | /// for the `RegexSet` type from the `regex` crate using Serde. 99 | /// The regex set is serialized as a list of strings and deserialized back into a `RegexSet` object. 100 | /// 101 | /// To use the custom serialization and deserialization, the field in the struct must 102 | /// be annotated with `#[serde(with = "serde_regex_set")]`. 103 | /// 104 | /// These methods ensure that regex set patterns are correctly handled during 105 | /// serialization and deserialization processes without losing the actual regex functionalities.
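///
/// A minimal opt-in sketch (the `Config` struct is hypothetical, purely for
/// illustration):
///
/// ```ignore
/// use regex::RegexSet;
/// use serde::{Deserialize, Serialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct Config {
///     #[serde(with = "serde_regex_set")]
///     blocked_patterns: RegexSet,
/// }
/// ```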
106 | pub fn deserialize<'de, D>(d: D) -> Result<RegexSet, D::Error> 107 | where 108 | D: Deserializer<'de>, 109 | { 110 | let s = <Vec<Cow<str>>>::deserialize(d)?; 111 | let regex_set = RegexSet::new(s).map_err(D::Error::custom)?; 112 | 113 | Ok(regex_set) 114 | } 115 | 116 | pub fn serialize<S>(regex_set: &RegexSet, serializer: S) -> Result<S::Ok, S::Error> 117 | where 118 | S: Serializer, 119 | { 120 | let mut seq = serializer.serialize_seq(Some(regex_set.len()))?; 121 | for e in regex_set.patterns() { 122 | seq.serialize_element(e.as_str())?; 123 | } 124 | seq.end() 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /matcher_rs/src/util/word.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::fmt::Display; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | 6 | /// A struct representing a simple word. 7 | /// 8 | /// This struct holds a single `String` and provides various methods for 9 | /// manipulating and querying the contents of the string. It supports the 10 | /// `Debug`, `Default`, `Clone`, `PartialEq`, `Eq`, `Serialize`, and 11 | /// `Deserialize` traits, making it versatile for different use cases such 12 | /// as debugging, serialization, and comparison. 13 | #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize)] 14 | pub struct SimpleWord(String); 15 | 16 | impl SimpleWord { 17 | /// Creates a new `SimpleWord` instance from any type that can be referenced as a string. 18 | /// 19 | /// # Arguments 20 | /// 21 | /// * `word` - An input that implements the `AsRef<str>` trait. This allows for a wide range 22 | ///   of input types, including `String`, `&str`, and `Cow<str>`. 23 | /// 24 | /// # Returns 25 | /// 26 | /// A `SimpleWord` instance containing the provided word. 27 | /// 28 | /// # Examples 29 | /// 30 | /// ``` 31 | /// use matcher_rs::SimpleWord; 32 | /// 33 | /// let word = SimpleWord::new("hello"); 34 | /// assert_eq!(word.as_str(), "hello"); 35 | /// ``` 36 | pub fn new<I>(word: I) -> Self 37 | where 38 | I: AsRef<str>, 39 | { 40 | SimpleWord(word.as_ref().to_owned()) 41 | } 42 | 43 | /// Returns the length of the string contained within the `SimpleWord`. 44 | /// 45 | /// This method returns the number of bytes in the underlying string (not the number of characters). 46 | /// 47 | /// # Returns 48 | /// 49 | /// The length of the string in bytes as a `usize`. 50 | /// 51 | /// # Examples 52 | /// 53 | /// ``` 54 | /// use matcher_rs::SimpleWord; 55 | /// 56 | /// let word = SimpleWord::new("hello"); 57 | /// assert_eq!(word.len(), 5); 58 | /// ``` 59 | pub fn len(&self) -> usize { 60 | self.0.len() 61 | } 62 | 63 | /// Checks if the string contained within the `SimpleWord` is empty. 64 | /// 65 | /// This method returns true if the underlying string has a length of zero. 66 | /// 67 | /// # Returns 68 | /// 69 | /// `true` if the string is empty, `false` otherwise. 70 | /// 71 | /// # Examples 72 | /// 73 | /// ``` 74 | /// use matcher_rs::SimpleWord; 75 | /// 76 | /// let empty_word = SimpleWord::new(""); 77 | /// assert!(empty_word.is_empty()); 78 | /// 79 | /// let non_empty_word = SimpleWord::new("hello"); 80 | /// assert!(!non_empty_word.is_empty()); 81 | /// ``` 82 | pub fn is_empty(&self) -> bool { 83 | self.0.is_empty() 84 | } 85 | 86 | /// Appends a given word to the current `SimpleWord` with an `&`. 87 | /// 88 | /// This method takes an input that implements the `AsRef<str>` trait and appends 89 | /// it to the current `SimpleWord` instance, preceded by the `&` character.
90 | /// 91 | /// # Arguments 92 | /// 93 | /// * `word` - An input that implements the `AsRef<str>` trait. This could be a 94 | ///   `String`, `&str`, or `Cow<str>`. 95 | /// 96 | /// # Returns 97 | /// 98 | /// A new `SimpleWord` instance with the appended word. 99 | /// 100 | /// # Examples 101 | /// 102 | /// ``` 103 | /// use matcher_rs::SimpleWord; 104 | /// 105 | /// let word1 = SimpleWord::new("hello"); 106 | /// let word2 = word1.and("world"); 107 | /// assert_eq!(word2.as_str(), "hello&world"); 108 | /// ``` 109 | pub fn and<I>(mut self, word: I) -> Self 110 | where 111 | I: AsRef<str>, 112 | { 113 | self.0.push('&'); 114 | self.0.push_str(word.as_ref()); 115 | self 116 | } 117 | 118 | /// Appends a given word to the current `SimpleWord` with a `~`. 119 | /// 120 | /// This method takes an input that implements the `AsRef<str>` trait and appends 121 | /// it to the current `SimpleWord` instance, preceded by the `~` character. 122 | /// 123 | /// # Arguments 124 | /// 125 | /// * `word` - An input that implements the `AsRef<str>` trait. This could be a 126 | ///   `String`, `&str`, or `Cow<str>`. 127 | /// 128 | /// # Returns 129 | /// 130 | /// A new `SimpleWord` instance with the appended word. 131 | /// 132 | /// # Examples 133 | /// 134 | /// ``` 135 | /// use matcher_rs::SimpleWord; 136 | /// 137 | /// let word1 = SimpleWord::new("world"); 138 | /// let word2 = word1.not("hello"); 139 | /// assert_eq!(word2.as_str(), "world~hello"); 140 | /// ``` 141 | pub fn not<I>(mut self, word: I) -> Self 142 | where 143 | I: AsRef<str>, 144 | { 145 | self.0.push('~'); 146 | self.0.push_str(word.as_ref()); 147 | self 148 | } 149 | 150 | /// Returns a string slice of the contents of the `SimpleWord`. 151 | /// 152 | /// This method allows for borrowing the underlying string without taking ownership. 153 | /// 154 | /// # Returns 155 | /// 156 | /// A string slice (`&str`) of the contents.
157 | /// 158 | /// # Examples 159 | /// 160 | /// ``` 161 | /// use matcher_rs::SimpleWord; 162 | /// 163 | /// let word = SimpleWord::new("hello"); 164 | /// assert_eq!(word.as_str(), "hello"); 165 | /// ``` 166 | pub fn as_str(&self) -> &str { 167 | &self.0 168 | } 169 | } 170 | 171 | impl Display for SimpleWord { 172 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 173 | f.write_str(&self.0) 174 | } 175 | } 176 | 177 | impl From<String> for SimpleWord { 178 | fn from(value: String) -> Self { 179 | SimpleWord(value) 180 | } 181 | } 182 | 183 | impl From<&str> for SimpleWord { 184 | fn from(value: &str) -> Self { 185 | SimpleWord(value.to_owned()) 186 | } 187 | } 188 | 189 | impl<'a> From<Cow<'a, str>> for SimpleWord { 190 | fn from(value: Cow<'a, str>) -> Self { 191 | SimpleWord(value.into_owned()) 192 | } 193 | } 194 | 195 | impl From<SimpleWord> for String { 196 | fn from(value: SimpleWord) -> Self { 197 | value.0 198 | } 199 | } 200 | 201 | impl AsRef<str> for SimpleWord { 202 | fn as_ref(&self) -> &str { 203 | &self.0 204 | } 205 | } 206 | -------------------------------------------------------------------------------- /matcher_rs/tests/test.rs: -------------------------------------------------------------------------------- 1 | mod test_simple { 2 | use std::collections::HashMap; 3 | 4 | use matcher_rs::{ProcessType, SimpleMatcher, SimpleWord, TextMatcherTrait}; 5 | 6 | #[test] 7 | fn simple_match_init() { 8 | let _ = SimpleMatcher::new(&HashMap::from([( 9 | ProcessType::None, 10 | HashMap::from([(1, "")]), 11 | )])); 12 | let _ = SimpleMatcher::new(&HashMap::from([( 13 | ProcessType::None, 14 | HashMap::from([(1, "hello"), (2, "world")]), 15 | )])); 16 | } 17 | 18 | #[test] 19 | fn simple_match_fanjian() { 20 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 21 | ProcessType::Fanjian, 22 | HashMap::from([(1, "你好")]), 23 | )])); 24 | assert!(simple_matcher.is_match("妳好")); 25 | 26 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 27 | ProcessType::Fanjian, 28 | HashMap::from([(1, "妳好")]), 29 | )])); 30 | assert!(simple_matcher.is_match("你好")); 31 | } 32 | 33 | #[test] 34 | fn simple_match_delete() { 35 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 36 | ProcessType::Delete, 37 | HashMap::from([(1, "你好")]), 38 | )])); 39 | assert!(simple_matcher.is_match("你!好")); 40 | } 41 | 42 | #[test] 43 | fn simple_match_normalize() { 44 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 45 | ProcessType::Normalize, 46 | HashMap::from([(1, "he11o")]), 47 | )])); 48 | assert!(simple_matcher.is_match("ℋЀ⒈㈠Õ")); 49 | } 50 | 51 | #[test] 52 | fn simple_match_pinyin() { 53 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 54 | ProcessType::PinYin, 55 | HashMap::from([(1, "西安")]), 56 | )])); 57 | assert!(simple_matcher.is_match("洗按")); 58 | assert!(!simple_matcher.is_match("现")); 59 | } 60 | 61 | #[test] 62 | fn simple_match_pinyinchar() { 63 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 64 | ProcessType::PinYinChar, 65 | HashMap::from([(1, "西安")]), 66 | )])); 67 | assert!(simple_matcher.is_match("洗按")); 68 | assert!(simple_matcher.is_match("现")); 69 | assert!(simple_matcher.is_match("xian")); 70 | } 71 | 72 | #[test] 73 | fn simple_match_combination() { 74 | let simple_matcher = SimpleMatcher::new(&HashMap::from([( 75 | ProcessType::None, 76 | HashMap::from([ 77 | (1, SimpleWord::from("hello").and("world")), 78 | (2, SimpleWord::from("hello").and("world").and("hello")), 79 | (3, SimpleWord::from("hello").not("world")), 80 | (4,
SimpleWord::from("hello").not("world").not("world")), 81 | (5, SimpleWord::from("hello").and("world").not("word")), 82 | ( 83 | 6, 84 | SimpleWord::from("hello") 85 | .and("world") 86 | .not("word") 87 | .not("word"), 88 | ), 89 | ]), 90 | )])); 91 | assert!(simple_matcher.is_match("hello world")); 92 | assert!(simple_matcher.is_match("hello hello world")); 93 | assert!(simple_matcher.is_match("hello word")); 94 | } 95 | } 96 | 97 | mod test_regex { 98 | use matcher_rs::{ProcessType, RegexMatchType, RegexMatcher, RegexTable, TextMatcherTrait}; 99 | 100 | #[test] 101 | fn regex_match_regex() { 102 | let regex_matcher = RegexMatcher::new(&[RegexTable { 103 | table_id: 1, 104 | match_id: 1, 105 | process_type: ProcessType::None, 106 | regex_match_type: RegexMatchType::Regex, 107 | word_list: vec!["h[aeiou]llo", "w[aeiou]rd"], 108 | }]); 109 | 110 | assert!(regex_matcher.is_match("hallo")); 111 | assert!(regex_matcher.is_match("ward")); 112 | } 113 | 114 | #[test] 115 | fn regex_match_acrostic() { 116 | let regex_matcher = RegexMatcher::new(&[RegexTable { 117 | table_id: 1, 118 | match_id: 1, 119 | process_type: ProcessType::None, 120 | regex_match_type: RegexMatchType::Acrostic, 121 | word_list: vec!["h,e,l,l,o", "你,好"], 122 | }]); 123 | 124 | assert!(regex_matcher.is_match("hope, endures, love, lasts, onward.")); 125 | assert!(regex_matcher.is_match("Happy moments shared, Every smile and laugh, Love in every word, Lighting up our paths, Open hearts we show.")); 126 | assert!(regex_matcher.is_match("你的笑容温暖, 好心情常伴。")); 127 | } 128 | 129 | #[test] 130 | fn regex_match_similar_char() { 131 | let regex_matcher = RegexMatcher::new(&[RegexTable { 132 | table_id: 1, 133 | match_id: 1, 134 | process_type: ProcessType::None, 135 | regex_match_type: RegexMatchType::SimilarChar, 136 | word_list: vec!["hello,hi,H,你好", "world,word,🌍,世界"], 137 | }]); 138 | 139 | assert!(regex_matcher.is_match("helloworld")); 140 | assert!(regex_matcher.is_match("hi世界")); 141 | } 142 | } 143 | 144 | mod test_sim { 145 | use matcher_rs::{ProcessType, SimMatchType, SimMatcher, SimTable, TextMatcherTrait}; 146 | 147 | #[test] 148 | fn sim_match() { 149 | let sim_matcher = SimMatcher::new(&[SimTable { 150 | table_id: 1, 151 | match_id: 1, 152 | process_type: ProcessType::None, 153 | sim_match_type: SimMatchType::Levenshtein, 154 | word_list: vec!["helloworld"], 155 | threshold: 0.8, 156 | }]); 157 | 158 | assert!(sim_matcher.is_match("helloworl")); 159 | assert!(sim_matcher.is_match("halloworld")); 160 | assert!(sim_matcher.is_match("ha1loworld")); 161 | assert!(!sim_matcher.is_match("ha1loworld1")); 162 | } 163 | } 164 | 165 | mod test_matcher { 166 | use std::collections::HashMap; 167 | 168 | use matcher_rs::{MatchTable, MatchTableType, Matcher, ProcessType, TextMatcherTrait}; 169 | 170 | #[test] 171 | fn matcher_init() { 172 | let _ = Matcher::new(&HashMap::from([( 173 | 1, 174 | vec![MatchTable { 175 | table_id: 1, 176 | match_table_type: MatchTableType::Simple { 177 | process_type: ProcessType::None, 178 | }, 179 | word_list: vec![], 180 | exemption_process_type: ProcessType::None, 181 | exemption_word_list: vec![], 182 | }], 183 | )])); 184 | } 185 | 186 | #[test] 187 | fn matcher_exemption() { 188 | let matcher = Matcher::new(&HashMap::from([( 189 | 1, 190 | vec![MatchTable { 191 | table_id: 1, 192 | match_table_type: MatchTableType::Simple { 193 | process_type: ProcessType::None, 194 | }, 195 | word_list: vec!["hello"], 196 | exemption_process_type: ProcessType::None, 197 | exemption_word_list: vec!["world"], 198 | }], 
199 | )])); 200 | assert!(matcher.is_match("hello")); 201 | assert!(!matcher.is_match("hello,world")) 202 | } 203 | } 204 | 205 | mod test_process { 206 | use id_set::IdSet; 207 | use matcher_rs::{ 208 | build_process_type_tree, reduce_text_process, reduce_text_process_emit, 209 | reduce_text_process_with_set, reduce_text_process_with_tree, text_process, ProcessType, 210 | }; 211 | 212 | #[test] 213 | fn test_text_process() { 214 | let text = text_process(ProcessType::Fanjian, "~ᗩ~躶~𝚩~軆~Ⲉ~"); 215 | println!("{:?}", text); 216 | } 217 | 218 | #[test] 219 | fn test_reduce_text_process() { 220 | let text = reduce_text_process(ProcessType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); 221 | println!("{:?}", text); 222 | } 223 | 224 | #[test] 225 | fn test_reduce_text_process_emit() { 226 | let text = reduce_text_process_emit(ProcessType::FanjianDeleteNormalize, "~ᗩ~躶~𝚩~軆~Ⲉ~"); 227 | println!("{:?}", text); 228 | } 229 | 230 | #[test] 231 | fn test_build_process_type_tree() { 232 | let process_type_set = IdSet::from_iter([ 233 | ProcessType::Fanjian.bits() as usize, 234 | ProcessType::DeleteNormalize.bits() as usize, 235 | ProcessType::FanjianDeleteNormalize.bits() as usize, 236 | ProcessType::Delete.bits() as usize, 237 | ProcessType::Normalize.bits() as usize, 238 | ]); 239 | let process_type_tree = build_process_type_tree(&process_type_set); 240 | println!("{:?}", process_type_tree); 241 | } 242 | 243 | #[test] 244 | fn test_reduce_text_process_with_tree() { 245 | let process_type_set = IdSet::from_iter([ 246 | ProcessType::Fanjian.bits() as usize, 247 | ProcessType::DeleteNormalize.bits() as usize, 248 | ProcessType::FanjianDeleteNormalize.bits() as usize, 249 | ProcessType::Delete.bits() as usize, 250 | ProcessType::Normalize.bits() as usize, 251 | ]); 252 | let process_type_tree = build_process_type_tree(&process_type_set); 253 | let text = "test爽-︻"; 254 | 255 | let processed_text_process_type_set = 256 | reduce_text_process_with_tree(&process_type_tree, text); 257 | println!("{processed_text_process_type_set:?}"); 258 | } 259 | 260 | #[test] 261 | fn test_reduce_text_process_with_set() { 262 | let process_type_set = IdSet::from_iter([ 263 | ProcessType::Fanjian.bits() as usize, 264 | ProcessType::DeleteNormalize.bits() as usize, 265 | ProcessType::FanjianDeleteNormalize.bits() as usize, 266 | ProcessType::Delete.bits() as usize, 267 | ProcessType::Normalize.bits() as usize, 268 | ]); 269 | let text = "test爽-︻"; 270 | 271 | let processed_text_process_type_set = reduce_text_process_with_set(&process_type_set, text); 272 | println!("{processed_text_process_type_set:?}"); 273 | } 274 | } 275 | --------------------------------------------------------------------------------