├── .github ├── dependabot.yml └── workflows │ └── CI.yml ├── .gitignore ├── .readthedocs.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── requirements.txt ├── runtime.txt └── rust-toolchain ├── netlify.toml ├── pyproject.toml ├── rjieba └── __init__.py ├── src └── lib.rs └── tests └── test_rjieba.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "cargo" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | # This file is autogenerated by maturin v1.8.1 2 | # To update, run 3 | # 4 | # maturin generate-ci github 5 | # 6 | name: CI 7 | 8 | on: 9 | push: 10 | branches: 11 | - main 12 | - master 13 | tags: 14 | - "*" 15 | pull_request: 16 | workflow_dispatch: 17 | 18 | permissions: 19 | contents: read 20 | 21 | jobs: 22 | linux: 23 | runs-on: ${{ matrix.platform.runner }} 24 | strategy: 25 | matrix: 26 | platform: 27 | - runner: ubuntu-22.04 28 | target: x86_64 29 | - runner: ubuntu-22.04 30 | target: x86 31 | - runner: ubuntu-22.04 32 | target: aarch64 33 | - runner: ubuntu-22.04 34 | target: armv7 35 | - runner: ubuntu-22.04 36 | target: s390x 37 | - runner: ubuntu-22.04 38 | target: ppc64le 39 | steps: 40 | - uses: actions/checkout@v4 41 | - uses: actions/setup-python@v5 42 | 
with: 43 | python-version: 3.x 44 | - name: Build wheels 45 | uses: PyO3/maturin-action@v1 46 | with: 47 | target: ${{ matrix.platform.target }} 48 | args: --release --out dist 49 | sccache: "true" 50 | manylinux: auto 51 | - name: Build free-threaded wheels 52 | uses: PyO3/maturin-action@v1 53 | with: 54 | target: ${{ matrix.platform.target }} 55 | args: --release --out dist -i python3.13t 56 | sccache: "true" 57 | manylinux: auto 58 | - name: Upload wheels 59 | uses: actions/upload-artifact@v4 60 | with: 61 | name: wheels-linux-${{ matrix.platform.target }} 62 | path: dist 63 | 64 | musllinux: 65 | runs-on: ${{ matrix.platform.runner }} 66 | strategy: 67 | matrix: 68 | platform: 69 | - runner: ubuntu-22.04 70 | target: x86_64 71 | - runner: ubuntu-22.04 72 | target: x86 73 | - runner: ubuntu-22.04 74 | target: aarch64 75 | - runner: ubuntu-22.04 76 | target: armv7 77 | steps: 78 | - uses: actions/checkout@v4 79 | - uses: actions/setup-python@v5 80 | with: 81 | python-version: 3.x 82 | - name: Build wheels 83 | uses: PyO3/maturin-action@v1 84 | with: 85 | target: ${{ matrix.platform.target }} 86 | args: --release --out dist 87 | sccache: "true" 88 | manylinux: musllinux_1_2 89 | - name: Build free-threaded wheels 90 | uses: PyO3/maturin-action@v1 91 | with: 92 | target: ${{ matrix.platform.target }} 93 | args: --release --out dist -i python3.13t 94 | sccache: "true" 95 | manylinux: musllinux_1_2 96 | - name: Upload wheels 97 | uses: actions/upload-artifact@v4 98 | with: 99 | name: wheels-musllinux-${{ matrix.platform.target }} 100 | path: dist 101 | 102 | windows: 103 | runs-on: ${{ matrix.platform.runner }} 104 | strategy: 105 | matrix: 106 | platform: 107 | - runner: windows-latest 108 | target: x64 109 | - runner: windows-latest 110 | target: x86 111 | steps: 112 | - uses: actions/checkout@v4 113 | - uses: actions/setup-python@v5 114 | with: 115 | python-version: 3.x 116 | architecture: ${{ matrix.platform.target }} 117 | - name: Build wheels 118 | uses: 
PyO3/maturin-action@v1 119 | with: 120 | target: ${{ matrix.platform.target }} 121 | args: --release --out dist 122 | sccache: "true" 123 | - name: Build free-threaded wheels 124 | uses: PyO3/maturin-action@v1 125 | with: 126 | target: ${{ matrix.platform.target }} 127 | args: --release --out dist -i python3.13t 128 | sccache: "true" 129 | - name: Upload wheels 130 | uses: actions/upload-artifact@v4 131 | with: 132 | name: wheels-windows-${{ matrix.platform.target }} 133 | path: dist 134 | 135 | macos: 136 | runs-on: ${{ matrix.platform.runner }} 137 | strategy: 138 | matrix: 139 | platform: 140 | - runner: macos-13 141 | target: x86_64 142 | - runner: macos-14 143 | target: aarch64 144 | steps: 145 | - uses: actions/checkout@v4 146 | - uses: actions/setup-python@v5 147 | with: 148 | python-version: 3.x 149 | - name: Build wheels 150 | uses: PyO3/maturin-action@v1 151 | with: 152 | target: ${{ matrix.platform.target }} 153 | args: --release --out dist 154 | sccache: "true" 155 | - name: Build free-threaded wheels 156 | uses: PyO3/maturin-action@v1 157 | with: 158 | target: ${{ matrix.platform.target }} 159 | args: --release --out dist -i python3.13t 160 | sccache: "true" 161 | - name: Upload wheels 162 | uses: actions/upload-artifact@v4 163 | with: 164 | name: wheels-macos-${{ matrix.platform.target }} 165 | path: dist 166 | 167 | sdist: 168 | runs-on: ubuntu-latest 169 | steps: 170 | - uses: actions/checkout@v4 171 | - name: Build sdist 172 | uses: PyO3/maturin-action@v1 173 | with: 174 | command: sdist 175 | args: --out dist 176 | - name: Upload sdist 177 | uses: actions/upload-artifact@v4 178 | with: 179 | name: wheels-sdist 180 | path: dist 181 | 182 | release: 183 | name: Release 184 | runs-on: ubuntu-latest 185 | if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }} 186 | needs: [linux, musllinux, windows, macos, sdist] 187 | permissions: 188 | # Use to sign the release artifacts 189 | id-token: write 190 | # Used to 
upload release artifacts 191 | contents: write 192 | # Used to generate artifact attestation 193 | attestations: write 194 | steps: 195 | - uses: actions/download-artifact@v4 196 | - name: Generate artifact attestation 197 | uses: actions/attest-build-provenance@v1 198 | with: 199 | subject-path: "wheels-*/*" 200 | - name: Publish to PyPI 201 | if: ${{ startsWith(github.ref, 'refs/tags/') }} 202 | uses: PyO3/maturin-action@v1 203 | env: 204 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_PASSWORD }} 205 | with: 206 | command: upload 207 | args: --non-interactive --skip-existing wheels-*/* 208 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | .pytest_cache/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | .venv/ 14 | env/ 15 | bin/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | include/ 26 | man/ 27 | venv/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | pip-selfcheck.json 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | 45 | # Translations 46 | *.mo 47 | 48 | # Mr Developer 49 | .mr.developer.cfg 50 | .project 51 | .pydevproject 52 | 53 | # Rope 54 | .ropeproject 55 | 56 | # Django stuff: 57 | *.log 58 | *.pot 59 | 60 | .DS_Store 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyCharm 66 | .idea/ 67 | 68 | # VSCode 69 | .vscode/ 70 | 71 | # Pyenv 72 | .python-version 73 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # 
https://docs.readthedocs.io/en/stable/config-file/v2.html#supported-settings 2 | 3 | version: 2 4 | 5 | sphinx: 6 | builder: html 7 | 8 | build: 9 | # readthedocs master now includes a Rust toolchain 10 | os: "ubuntu-20.04" 11 | tools: 12 | python: "3.9" 13 | rust: "1.55" 14 | 15 | python: 16 | install: 17 | - method: pip 18 | path: . 19 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "adler32" 7 | version = "1.2.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" 10 | 11 | [[package]] 12 | name = "ahash" 13 | version = "0.8.11" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" 16 | dependencies = [ 17 | "cfg-if", 18 | "once_cell", 19 | "version_check", 20 | "zerocopy", 21 | ] 22 | 23 | [[package]] 24 | name = "aho-corasick" 25 | version = "1.1.3" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 28 | dependencies = [ 29 | "memchr", 30 | ] 31 | 32 | [[package]] 33 | name = "allocator-api2" 34 | version = "0.2.21" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" 37 | 38 | [[package]] 39 | name = "autocfg" 40 | version = "1.4.0" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 43 | 44 | [[package]] 45 | name = "byteorder" 46 | version = "1.5.0" 47 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 49 | 50 | [[package]] 51 | name = "cc" 52 | version = "1.2.9" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" 55 | dependencies = [ 56 | "shlex", 57 | ] 58 | 59 | [[package]] 60 | name = "cedarwood" 61 | version = "0.4.6" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "6d910bedd62c24733263d0bed247460853c9d22e8956bd4cd964302095e04e90" 64 | dependencies = [ 65 | "smallvec", 66 | ] 67 | 68 | [[package]] 69 | name = "cfg-if" 70 | version = "1.0.0" 71 | source = "registry+https://github.com/rust-lang/crates.io-index" 72 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 73 | 74 | [[package]] 75 | name = "core2" 76 | version = "0.4.0" 77 | source = "registry+https://github.com/rust-lang/crates.io-index" 78 | checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" 79 | dependencies = [ 80 | "memchr", 81 | ] 82 | 83 | [[package]] 84 | name = "crc32fast" 85 | version = "1.4.2" 86 | source = "registry+https://github.com/rust-lang/crates.io-index" 87 | checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" 88 | dependencies = [ 89 | "cfg-if", 90 | ] 91 | 92 | [[package]] 93 | name = "dary_heap" 94 | version = "0.3.7" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" 97 | 98 | [[package]] 99 | name = "fxhash" 100 | version = "0.2.1" 101 | source = "registry+https://github.com/rust-lang/crates.io-index" 102 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 103 | dependencies = [ 104 | "byteorder", 105 | ] 106 | 107 | [[package]] 108 | name = "hashbrown" 109 | version = "0.14.5" 110 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" 112 | dependencies = [ 113 | "ahash", 114 | "allocator-api2", 115 | ] 116 | 117 | [[package]] 118 | name = "heck" 119 | version = "0.5.0" 120 | source = "registry+https://github.com/rust-lang/crates.io-index" 121 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 122 | 123 | [[package]] 124 | name = "include-flate" 125 | version = "0.3.0" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "df49c16750695486c1f34de05da5b7438096156466e7f76c38fcdf285cf0113e" 128 | dependencies = [ 129 | "include-flate-codegen", 130 | "lazy_static", 131 | "libflate", 132 | ] 133 | 134 | [[package]] 135 | name = "include-flate-codegen" 136 | version = "0.2.0" 137 | source = "registry+https://github.com/rust-lang/crates.io-index" 138 | checksum = "8c5b246c6261be723b85c61ecf87804e8ea4a35cb68be0ff282ed84b95ffe7d7" 139 | dependencies = [ 140 | "libflate", 141 | "proc-macro2", 142 | "quote", 143 | "syn", 144 | ] 145 | 146 | [[package]] 147 | name = "indoc" 148 | version = "2.0.5" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" 151 | 152 | [[package]] 153 | name = "jieba-macros" 154 | version = "0.7.1" 155 | source = "registry+https://github.com/rust-lang/crates.io-index" 156 | checksum = "7c676b32a471d3cfae8dac2ad2f8334cd52e53377733cca8c1fb0a5062fec192" 157 | dependencies = [ 158 | "phf_codegen", 159 | ] 160 | 161 | [[package]] 162 | name = "jieba-rs" 163 | version = "0.7.3" 164 | source = "registry+https://github.com/rust-lang/crates.io-index" 165 | checksum = "b06096b4b61fb4bfdbf16c6a968ea2d6be1ac9617cf3db741c3b641e6c290a35" 166 | dependencies = [ 167 | "cedarwood", 168 | "fxhash", 169 | "include-flate", 170 | "jieba-macros", 171 | "lazy_static", 172 | "phf", 173 | "regex", 174 | ] 
175 | 176 | [[package]] 177 | name = "lazy_static" 178 | version = "1.5.0" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 181 | 182 | [[package]] 183 | name = "libc" 184 | version = "0.2.169" 185 | source = "registry+https://github.com/rust-lang/crates.io-index" 186 | checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" 187 | 188 | [[package]] 189 | name = "libflate" 190 | version = "2.1.0" 191 | source = "registry+https://github.com/rust-lang/crates.io-index" 192 | checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" 193 | dependencies = [ 194 | "adler32", 195 | "core2", 196 | "crc32fast", 197 | "dary_heap", 198 | "libflate_lz77", 199 | ] 200 | 201 | [[package]] 202 | name = "libflate_lz77" 203 | version = "2.1.0" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" 206 | dependencies = [ 207 | "core2", 208 | "hashbrown", 209 | "rle-decode-fast", 210 | ] 211 | 212 | [[package]] 213 | name = "memchr" 214 | version = "2.7.4" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 217 | 218 | [[package]] 219 | name = "memoffset" 220 | version = "0.9.1" 221 | source = "registry+https://github.com/rust-lang/crates.io-index" 222 | checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" 223 | dependencies = [ 224 | "autocfg", 225 | ] 226 | 227 | [[package]] 228 | name = "once_cell" 229 | version = "1.20.2" 230 | source = "registry+https://github.com/rust-lang/crates.io-index" 231 | checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" 232 | 233 | [[package]] 234 | name = "phf" 235 | version = "0.11.3" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 
237 | checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" 238 | dependencies = [ 239 | "phf_shared", 240 | ] 241 | 242 | [[package]] 243 | name = "phf_codegen" 244 | version = "0.11.3" 245 | source = "registry+https://github.com/rust-lang/crates.io-index" 246 | checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" 247 | dependencies = [ 248 | "phf_generator", 249 | "phf_shared", 250 | ] 251 | 252 | [[package]] 253 | name = "phf_generator" 254 | version = "0.11.3" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 257 | dependencies = [ 258 | "phf_shared", 259 | "rand", 260 | ] 261 | 262 | [[package]] 263 | name = "phf_shared" 264 | version = "0.11.3" 265 | source = "registry+https://github.com/rust-lang/crates.io-index" 266 | checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" 267 | dependencies = [ 268 | "siphasher", 269 | ] 270 | 271 | [[package]] 272 | name = "portable-atomic" 273 | version = "1.10.0" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6" 276 | 277 | [[package]] 278 | name = "proc-macro2" 279 | version = "1.0.93" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" 282 | dependencies = [ 283 | "unicode-ident", 284 | ] 285 | 286 | [[package]] 287 | name = "pyo3" 288 | version = "0.25.0" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "f239d656363bcee73afef85277f1b281e8ac6212a1d42aa90e55b90ed43c47a4" 291 | dependencies = [ 292 | "indoc", 293 | "libc", 294 | "memoffset", 295 | "once_cell", 296 | "portable-atomic", 297 | "pyo3-build-config", 298 | "pyo3-ffi", 299 | "pyo3-macros", 300 | "unindent", 301 | ] 302 | 303 | [[package]] 304 | 
name = "pyo3-build-config" 305 | version = "0.25.0" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "755ea671a1c34044fa165247aaf6f419ca39caa6003aee791a0df2713d8f1b6d" 308 | dependencies = [ 309 | "once_cell", 310 | "python3-dll-a", 311 | "target-lexicon", 312 | ] 313 | 314 | [[package]] 315 | name = "pyo3-ffi" 316 | version = "0.25.0" 317 | source = "registry+https://github.com/rust-lang/crates.io-index" 318 | checksum = "fc95a2e67091e44791d4ea300ff744be5293f394f1bafd9f78c080814d35956e" 319 | dependencies = [ 320 | "libc", 321 | "pyo3-build-config", 322 | ] 323 | 324 | [[package]] 325 | name = "pyo3-macros" 326 | version = "0.25.0" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = "a179641d1b93920829a62f15e87c0ed791b6c8db2271ba0fd7c2686090510214" 329 | dependencies = [ 330 | "proc-macro2", 331 | "pyo3-macros-backend", 332 | "quote", 333 | "syn", 334 | ] 335 | 336 | [[package]] 337 | name = "pyo3-macros-backend" 338 | version = "0.25.0" 339 | source = "registry+https://github.com/rust-lang/crates.io-index" 340 | checksum = "9dff85ebcaab8c441b0e3f7ae40a6963ecea8a9f5e74f647e33fcf5ec9a1e89e" 341 | dependencies = [ 342 | "heck", 343 | "proc-macro2", 344 | "pyo3-build-config", 345 | "quote", 346 | "syn", 347 | ] 348 | 349 | [[package]] 350 | name = "python3-dll-a" 351 | version = "0.2.12" 352 | source = "registry+https://github.com/rust-lang/crates.io-index" 353 | checksum = "9b66f9171950e674e64bad3456e11bb3cca108e5c34844383cfe277f45c8a7a8" 354 | dependencies = [ 355 | "cc", 356 | ] 357 | 358 | [[package]] 359 | name = "quote" 360 | version = "1.0.38" 361 | source = "registry+https://github.com/rust-lang/crates.io-index" 362 | checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" 363 | dependencies = [ 364 | "proc-macro2", 365 | ] 366 | 367 | [[package]] 368 | name = "rand" 369 | version = "0.8.5" 370 | source = "registry+https://github.com/rust-lang/crates.io-index" 371 
| checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 372 | dependencies = [ 373 | "rand_core", 374 | ] 375 | 376 | [[package]] 377 | name = "rand_core" 378 | version = "0.6.4" 379 | source = "registry+https://github.com/rust-lang/crates.io-index" 380 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 381 | 382 | [[package]] 383 | name = "regex" 384 | version = "1.11.1" 385 | source = "registry+https://github.com/rust-lang/crates.io-index" 386 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 387 | dependencies = [ 388 | "aho-corasick", 389 | "memchr", 390 | "regex-automata", 391 | "regex-syntax", 392 | ] 393 | 394 | [[package]] 395 | name = "regex-automata" 396 | version = "0.4.9" 397 | source = "registry+https://github.com/rust-lang/crates.io-index" 398 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 399 | dependencies = [ 400 | "aho-corasick", 401 | "memchr", 402 | "regex-syntax", 403 | ] 404 | 405 | [[package]] 406 | name = "regex-syntax" 407 | version = "0.8.5" 408 | source = "registry+https://github.com/rust-lang/crates.io-index" 409 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 410 | 411 | [[package]] 412 | name = "rjieba" 413 | version = "0.1.13" 414 | dependencies = [ 415 | "jieba-rs", 416 | "pyo3", 417 | ] 418 | 419 | [[package]] 420 | name = "rle-decode-fast" 421 | version = "1.0.3" 422 | source = "registry+https://github.com/rust-lang/crates.io-index" 423 | checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" 424 | 425 | [[package]] 426 | name = "shlex" 427 | version = "1.3.0" 428 | source = "registry+https://github.com/rust-lang/crates.io-index" 429 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 430 | 431 | [[package]] 432 | name = "siphasher" 433 | version = "1.0.1" 434 | source = "registry+https://github.com/rust-lang/crates.io-index" 435 | 
checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" 436 | 437 | [[package]] 438 | name = "smallvec" 439 | version = "1.13.2" 440 | source = "registry+https://github.com/rust-lang/crates.io-index" 441 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 442 | 443 | [[package]] 444 | name = "syn" 445 | version = "2.0.96" 446 | source = "registry+https://github.com/rust-lang/crates.io-index" 447 | checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" 448 | dependencies = [ 449 | "proc-macro2", 450 | "quote", 451 | "unicode-ident", 452 | ] 453 | 454 | [[package]] 455 | name = "target-lexicon" 456 | version = "0.13.2" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | checksum = "e502f78cdbb8ba4718f566c418c52bc729126ffd16baee5baa718cf25dd5a69a" 459 | 460 | [[package]] 461 | name = "unicode-ident" 462 | version = "1.0.14" 463 | source = "registry+https://github.com/rust-lang/crates.io-index" 464 | checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" 465 | 466 | [[package]] 467 | name = "unindent" 468 | version = "0.2.3" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" 471 | 472 | [[package]] 473 | name = "version_check" 474 | version = "0.9.5" 475 | source = "registry+https://github.com/rust-lang/crates.io-index" 476 | checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 477 | 478 | [[package]] 479 | name = "zerocopy" 480 | version = "0.7.35" 481 | source = "registry+https://github.com/rust-lang/crates.io-index" 482 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 483 | dependencies = [ 484 | "zerocopy-derive", 485 | ] 486 | 487 | [[package]] 488 | name = "zerocopy-derive" 489 | version = "0.7.35" 490 | source = "registry+https://github.com/rust-lang/crates.io-index" 491 | checksum = 
"fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 492 | dependencies = [ 493 | "proc-macro2", 494 | "quote", 495 | "syn", 496 | ] 497 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rjieba" 3 | version = "0.1.13" 4 | authors = ["messense "] 5 | edition = "2018" 6 | description = "jieba-rs Python binding" 7 | license = "MIT" 8 | repository = "https://github.com/messense/rjieba-py" 9 | readme = "README.md" 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [lib] 14 | name = "rjieba" 15 | crate-type = ["cdylib"] 16 | 17 | [dependencies] 18 | jieba-rs = "0.7.3" 19 | pyo3 = { version = "0.25.0", features = ["abi3-py38", "generate-import-lib"] } 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Messense Lv 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rjieba-py 2 | 3 | ![CI](https://github.com/messense/rjieba-py/workflows/CI/badge.svg) 4 | [![PyPI](https://img.shields.io/pypi/v/rjieba.svg)](https://pypi.org/project/rjieba) 5 | 6 | [jieba-rs](https://github.com/messense/jieba-rs) Python binding. 7 | 8 | ## Installation 9 | 10 | ```bash 11 | pip install rjieba 12 | ``` 13 | 14 | ## Usage 15 | 16 | ```python 17 | import rjieba 18 | 19 | 20 | print(rjieba.cut('我们中出了一个叛徒')) 21 | print(rjieba.tag('我们中出了一个叛徒')) 22 | ``` 23 | 24 | ## Performance 25 | 26 | Running on MacBook Pro (15-inch, 2018) 2.2 GHz 6-Core Intel Core i7 27 | 28 | ```python 29 | In [1]: import jieba 30 | 31 | In [2]: import cjieba 32 | 33 | In [3]: import rjieba 34 | 35 | In [4]: jieba.initialize() 36 | Building prefix dict from the default dictionary ... 37 | Loading model from cache /var/folders/8d/h3lyjgz14296j_lw7chgf5hc0000gp/T/jieba.cache 38 | Loading model cost 0.695 seconds. 39 | Prefix dict has been built successfully. 40 | 41 | In [5]: cjieba.initialize() 42 | 43 | In [6]: with open('../jieba-rs/examples/weicheng/src/weicheng.txt') as f: 44 | ...: txt = f.read() 45 | ...: 46 | 47 | In [7]: %timeit list(jieba.cut(txt)) 48 | 1.1 s ± 10.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) 49 | 50 | In [8]: %timeit cjieba.cut(txt) 51 | 225 ms ± 3.95 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) 52 | 53 | In [9]: %timeit rjieba.cut(txt) 54 | 106 ms ± 2.01 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each) 55 | ``` 56 | 57 | ## License 58 | 59 | This work is released under the MIT license. A copy of the license is provided in the [LICENSE](./LICENSE) file. 60 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | 16 | # sys.path.insert(0, os.path.abspath('..')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'rjieba' 22 | copyright = '2021, messense' 23 | author = 'messense' 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc', 33 | 'sphinx.ext.coverage', 34 | ] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # List of patterns, relative to source directory, that match files and 40 | # directories to ignore when looking for source files. 41 | # This pattern also affects html_static_path and html_extra_path. 42 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 43 | 44 | 45 | # -- Options for HTML output ------------------------------------------------- 46 | 47 | # The theme to use for HTML and HTML Help pages. See the documentation for 48 | # a list of builtin themes. 49 | # 50 | html_theme = 'alabaster' 51 | 52 | # Add any paths that contain custom static files (such as style sheets) here, 53 | # relative to this directory. They are copied after the builtin static files, 54 | # so a file named "default.css" will overwrite the builtin "default.css". 55 | html_static_path = ['_static'] 56 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. rjieba documentation master file, created by 2 | sphinx-quickstart on Mon Nov 8 17:38:34 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to rjieba's documentation! 
7 | ================================== 8 | 9 | .. automodule:: rjieba 10 | :members: 11 | 12 | .. toctree:: 13 | :maxdepth: 2 14 | :caption: Contents: 15 | 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster==0.7.12 2 | Babel==2.9.1 3 | certifi==2021.10.8 4 | charset-normalizer==2.0.7 5 | docutils==0.17.1 6 | idna==3.3 7 | imagesize==1.2.0 8 | Jinja2==3.0.2 9 | MarkupSafe==2.0.1 10 | packaging==21.2 11 | Pygments==2.10.0 12 | pyparsing==2.4.7 13 | pytz==2021.3 14 | requests==2.26.0 15 | snowballstemmer==2.1.0 16 | Sphinx==4.2.0 17 | sphinxcontrib-applehelp==1.0.2 18 | sphinxcontrib-devhelp==1.0.2 19 | sphinxcontrib-htmlhelp==2.0.0 20 | sphinxcontrib-jsmath==1.0.1 21 | sphinxcontrib-qthelp==1.0.3 22 | sphinxcontrib-serializinghtml==1.1.5 23 | urllib3==1.26.7 24 | maturin==0.11.5 25 | -------------------------------------------------------------------------------- /docs/runtime.txt: -------------------------------------------------------------------------------- 1 | 3.8 2 | -------------------------------------------------------------------------------- /docs/rust-toolchain: -------------------------------------------------------------------------------- 1 | stable 2 | -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | base = "docs" 3 | publish = "_build/html" 4 | command = "maturin build -m ../Cargo.toml && python3 -m pip install --force-reinstall ../target/wheels/rjieba*.whl && make html" 5 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2"] 3 | build-backend = "maturin" 4 | 5 | [tool.maturin] 6 | features = ["pyo3/extension-module"] 7 | -------------------------------------------------------------------------------- /rjieba/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .rjieba import Jieba 3 | 4 | 5 | __all__ = [ 6 | "Jieba", 7 | "cut", 8 | "cut_all", 9 | "cut_for_search", 10 | "tag", 11 | "tokenize", 12 | ] 13 | 14 | 15 | _DEFAULT_JIEBA = Jieba() 16 | 17 | cut = _DEFAULT_JIEBA.cut 18 | cut_all = _DEFAULT_JIEBA.cut_all 19 | cut_for_search = _DEFAULT_JIEBA.cut_for_search 20 | tag = _DEFAULT_JIEBA.tag 21 | tokenize = _DEFAULT_JIEBA.tokenize 22 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | 3 | #[pyclass(subclass)] 4 | struct Jieba { 5 | jieba: jieba_rs::Jieba, 6 | } 7 | 8 | #[pymethods] 9 | impl Jieba { 10 | #[new] 11 | fn new() -> Self { 12 | Self { 13 | jieba: jieba_rs::Jieba::new(), 14 | } 15 | } 16 | 17 | /// Cut the input text 18 | #[pyo3(signature = (text, hmm = true))] 19 | fn cut<'a>(&self, py: Python, text: &'a str, hmm: bool) -> Vec<&'a str> { 20 | py.allow_threads(move || self.jieba.cut(text, hmm)) 21 | } 22 | 23 | /// Cut the input text, return all possible words 24 | #[pyo3(signature = (text,))] 25 | fn cut_all<'a>(&self, py: Python, text: &'a str) -> Vec<&'a str> { 26 | py.allow_threads(move || self.jieba.cut_all(text)) 27 | } 28 | 29 | /// Cut the input text in search mode 30 | #[pyo3(signature = (text, hmm = true))] 31 | fn cut_for_search<'a>(&self, py: Python, text: &'a str, hmm: bool) -> Vec<&'a str> { 32 | py.allow_threads(move || self.jieba.cut_for_search(text, hmm)) 33 | } 34 | 35 | /// Tag the input text 36 | 
#[pyo3(signature = (text, hmm = true))] 37 | fn tag<'a>(&'a self, py: Python, text: &'a str, hmm: bool) -> Vec<(&'a str, &'a str)> { 38 | py.allow_threads(move || { 39 | self.jieba 40 | .tag(text, hmm) 41 | .into_iter() 42 | .map(|t| (t.word, t.tag)) 43 | .collect() 44 | }) 45 | } 46 | 47 | /// Tokenize 48 | #[pyo3(signature = (text, mode = "default", hmm = true))] 49 | fn tokenize<'a>( 50 | &self, 51 | py: Python, 52 | text: &'a str, 53 | mode: &str, 54 | hmm: bool, 55 | ) -> Vec<(&'a str, usize, usize)> { 56 | let tokenize_mode = if mode.to_lowercase() == "search" { 57 | jieba_rs::TokenizeMode::Search 58 | } else { 59 | jieba_rs::TokenizeMode::Default 60 | }; 61 | py.allow_threads(move || { 62 | self.jieba 63 | .tokenize(text, tokenize_mode, hmm) 64 | .into_iter() 65 | .map(|t| (t.word, t.start, t.end)) 66 | .collect() 67 | }) 68 | } 69 | } 70 | 71 | #[pymodule] 72 | fn rjieba(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 73 | m.add_class::<Jieba>()?; 74 | Ok(()) 75 | } 76 | -------------------------------------------------------------------------------- /tests/test_rjieba.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import rjieba as jieba 3 | 4 | 5 | def test_cut(): 6 | ret = jieba.cut('') 7 | assert ret == [] 8 | 9 | ret = jieba.cut('南京市长江大桥') 10 | assert ret == ['南京市', '长江大桥'] 11 | 12 | 13 | def test_cut_all(): 14 | ret = jieba.cut_all('') 15 | assert ret == [] 16 | 17 | ret = jieba.cut_all('南京市长江大桥') 18 | assert ret == ['南', '南京', '南京市', '京', '京市', '市', '市长', '长', '长江', '长江大桥', '江', '大', '大桥', '桥'] 19 | 20 | 21 | def test_cut_for_search(): 22 | ret = jieba.cut_for_search('') 23 | assert ret == [] 24 | 25 | ret = jieba.cut_for_search('南京市长江大桥') 26 | assert ret == ['南京', '京市', '南京市', '长江', '大桥', '长江大桥'] 27 | 28 | 29 | def test_tag(): 30 | ret = jieba.tag('') 31 | assert ret == [] 32 | 33 | ret = jieba.tag('南京市长江大桥') 34 | assert len(ret) == 2 35 | assert ret[0] == ('南京市', 'ns') 36 | assert ret[1] == 
('长江大桥', 'ns') 37 | 38 | 39 | def test_tag_with_slash(): 40 | ret = jieba.tag('/ .') 41 | assert len(ret) == 3 42 | assert ret[0] == ('/', 'x') 43 | assert ret[1] == (' ', 'x') 44 | assert ret[2] == ('.', 'x') 45 | 46 | 47 | def test_tokenize(): 48 | ret = jieba.tokenize('') 49 | assert ret == [] 50 | 51 | ret = jieba.tokenize('南京市长江大桥') 52 | assert len(ret) == 2 53 | assert ret[0] == ('南京市', 0, 3) 54 | assert ret[1] == ('长江大桥', 3, 7) 55 | 56 | ret = jieba.tokenize('南京南京') 57 | assert len(ret) == 2 58 | assert ret[0] == ('南京', 0, 2) 59 | assert ret[1] == ('南京', 2, 4) 60 | 61 | ret = jieba.tokenize('南京市长江大桥', mode='search') 62 | assert len(ret) == 6 63 | assert ret[0] == ('南京', 0, 2) 64 | assert ret[1] == ('京市', 1, 3) 65 | assert ret[2] == ('南京市', 0, 3) 66 | assert ret[3] == ('长江', 3, 5) 67 | assert ret[4] == ('大桥', 5, 7) 68 | assert ret[5] == ('长江大桥', 3, 7) 69 | --------------------------------------------------------------------------------