├── .github
    ├── FUNDING.yml
    └── workflows
    │   ├── CI.yml
    │   └── bench.yml
├── .gitignore
├── Cargo.toml
├── LICENSE
├── README.md
├── benches
    └── jieba_benchmark.rs
├── capi
    ├── Cargo.toml
    └── src
    │   └── lib.rs
├── examples
    └── weicheng
    │   ├── Cargo.toml
    │   └── src
    │       ├── main.rs
    │       └── weicheng.txt
├── jieba-macros
    ├── Cargo.toml
    └── src
    │   ├── hmm.model
    │   └── lib.rs
├── rustfmt.toml
├── src
    ├── data
    │   ├── dict.txt
    │   └── idf.txt
    ├── errors.rs
    ├── hmm.rs
    ├── keywords
    │   ├── mod.rs
    │   ├── textrank.rs
    │   └── tfidf.rs
    ├── lib.rs
    └── sparse_dag.rs
└── tests
    └── test_wasm.rs


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: messense
2 | 


--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |   pull_request:
 6 | 
 7 | name: CI
 8 | 
 9 | concurrency:
10 |   group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }}
11 |   cancel-in-progress: true
12 | 
13 | jobs:
14 |   check:
15 |     name: Check
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - uses: actions/checkout@v4
19 |       - uses: dtolnay/rust-toolchain@stable
20 |       - run: cargo check --all-features
21 | 
22 |   test:
23 |     name: Test Suite
24 |     runs-on: ${{ matrix.os }}
25 |     strategy:
26 |       matrix:
27 |         os: [ubuntu-latest, macos-latest, windows-latest]
28 |     steps:
29 |       - uses: actions/checkout@v4
30 |       - uses: dtolnay/rust-toolchain@stable
31 |       - name: Cache cargo build
32 |         uses: Swatinem/rust-cache@v2
33 |       - name: Check build with --no-default-features
34 |         run: cargo build --no-default-features
35 |       - name: Check build with default features
36 |         run: cargo build
37 |       - name: Check build with tfidf feature
38 |         run: cargo build --features tfidf
39 |       - name: Check build with textrank feature
40 |         run: cargo build --features textrank
41 |       - name: Test
42 |         run: cargo test --all-features --all --benches
43 | 
44 |   codecov:
45 |     name: Code Coverage
46 |     runs-on: ubuntu-latest
47 |     steps:
48 |       - uses: actions/checkout@v4
49 |         # Nighty needed for --doctests support. See https://github.com/taiki-e/cargo-llvm-cov/issues/2
50 |       - uses: dtolnay/rust-toolchain@nightly
51 |       - name: Cache cargo build
52 |         uses: Swatinem/rust-cache@v2
53 |       - name: Install cargo-llvm-cov
54 |         uses: taiki-e/install-action@cargo-llvm-cov
55 |       - name: Generate code coverage
56 |         run: cargo llvm-cov --all-features --workspace --lcov --doctests --output-path lcov.info
57 |       - name: Upload coverage to Codecov
58 |         uses: codecov/codecov-action@v4
59 |         with:
60 |           files: lcov.info
61 |           fail_ci_if_error: true
62 |         env:
63 |           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
64 | 
65 |   fmt:
66 |     name: Rustfmt
67 |     runs-on: ubuntu-latest
68 |     steps:
69 |       - uses: actions/checkout@v4
70 |       - uses: dtolnay/rust-toolchain@stable
71 |         with:
72 |           components: rustfmt
73 |       - run: cargo fmt --all -- --check
74 | 


--------------------------------------------------------------------------------
/.github/workflows/bench.yml:
--------------------------------------------------------------------------------
 1 | name: benches
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - "main"
 7 |   pull_request:
 8 |   # `workflow_dispatch` allows CodSpeed to trigger backtest
 9 |   # performance analysis in order to generate initial data.
10 |   workflow_dispatch:
11 | 
12 | concurrency:
13 |   group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }}-benches
14 |   cancel-in-progress: true
15 | 
16 | jobs:
17 |   benchmarks:
18 |     runs-on: ubuntu-latest
19 |     steps:
20 |       - uses: actions/checkout@v4
21 |       - uses: dtolnay/rust-toolchain@stable
22 |         with:
23 |           components: rust-src
24 | 
25 |       - uses: Swatinem/rust-cache@v2
26 |         continue-on-error: true
27 | 
28 |       - name: Install cargo-codspeed
29 |         run: cargo install cargo-codspeed
30 | 
31 |       - name: Build the benchmark target(s)
32 |         run: cargo codspeed build --features tfidf,textrank
33 | 
34 |       - name: Run the benchmarks
35 |         uses: CodSpeedHQ/action@v3
36 |         with:
37 |           run: cargo codspeed run
38 |           token: ${{ secrets.CODSPEED_TOKEN }}
39 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | target
3 | Cargo.lock
4 | **/*.rs.bk
5 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "jieba-rs"
 3 | version = "0.7.3"
 4 | authors = ["messense <messense@icloud.com>", "Paul Meng <me@paulme.ng>"]
 5 | categories = ["text-processing"]
 6 | description = "The Jieba Chinese Word Segmentation Implemented in Rust"
 7 | keywords = ["nlp", "chinese", "segmenation"]
 8 | license = "MIT"
 9 | readme = "README.md"
10 | repository = "https://github.com/messense/jieba-rs"
11 | edition = '2021'
12 | 
13 | [package.metadata.docs.rs]
14 | all-features = true
15 | 
16 | [dev-dependencies]
17 | codspeed-criterion-compat = { workspace = true }
18 | rand = { workspace = true }
19 | wasm-bindgen-test = { workspace = true }
20 | rayon = { workspace = true }
21 | 
22 | [target.'cfg(unix)'.dev-dependencies]
23 | jemallocator = "0.5.0"
24 | 
25 | [[bench]]
26 | name = "jieba_benchmark"
27 | harness = false
28 | required-features = ["tfidf", "textrank"]
29 | 
30 | [dependencies]
31 | jieba-macros = { workspace = true }
32 | cedarwood = { workspace = true }
33 | derive_builder = { workspace = true, optional = true }
34 | fxhash = { workspace = true }
35 | include-flate = { workspace = true }
36 | lazy_static = { workspace = true }
37 | ordered-float = { workspace = true, optional = true }
38 | phf = { workspace = true }
39 | regex = { workspace = true }
40 | 
41 | [features]
42 | default = ["default-dict"]
43 | default-dict = []
44 | tfidf = ["dep:ordered-float", "dep:derive_builder"]
45 | textrank = ["dep:ordered-float", "dep:derive_builder"]
46 | 
47 | [workspace]
48 | members = [".", "capi", "jieba-macros", "examples/weicheng"]
49 | 
50 | [workspace.dependencies]
51 | c_fixed_string = "0.2.0"
52 | cedarwood = "0.4"
53 | codspeed-criterion-compat = "2.4.1"
54 | derive_builder = "0.20.0"
55 | fxhash = "0.2.1"
56 | include-flate = "0.3.0"
57 | jieba-macros = { version = "0.7.1", path = "jieba-macros" }
58 | lazy_static = "1.0"
59 | ordered-float = "4.0"
60 | phf = "0.11"
61 | phf_codegen = "0.11"
62 | rand = "0.8"
63 | rayon = "1.10"
64 | regex = "1.0"
65 | wasm-bindgen-test = "0.3.0"
66 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 - 2019 messense
 4 | Copyright (c) 2019 Paul Meng
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # jieba-rs
 2 | 
 3 | [![GitHub Actions](https://github.com/messense/jieba-rs/workflows/CI/badge.svg)](https://github.com/messense/jieba-rs/actions?query=workflow%3ACI)
 4 | [![codecov](https://codecov.io/gh/messense/jieba-rs/branch/master/graph/badge.svg)](https://codecov.io/gh/messense/jieba-rs)
 5 | [![Crates.io](https://img.shields.io/crates/v/jieba-rs.svg)](https://crates.io/crates/jieba-rs)
 6 | [![docs.rs](https://docs.rs/jieba-rs/badge.svg)](https://docs.rs/jieba-rs/)
 7 | 
 8 | > 🚀 Help me to become a full-time open-source developer by [sponsoring me on GitHub](https://github.com/sponsors/messense)
 9 | 
10 | The Jieba Chinese Word Segmentation Implemented in Rust
11 | 
12 | ## Installation
13 | 
14 | Add it to your ``Cargo.toml``:
15 | 
16 | ```toml
17 | [dependencies]
18 | jieba-rs = "0.7"
19 | ```
20 | 
21 | then you are good to go. If you are using Rust 2015 you have to ``extern crate jieba_rs`` to your crate root as well. 
22 | 
23 | ## Example
24 | 
25 | ```rust
26 | use jieba_rs::Jieba;
27 | 
28 | fn main() {
29 |     let jieba = Jieba::new();
30 |     let words = jieba.cut("我们中出了一个叛徒", false);
31 |     assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
32 | }
33 | ```
34 | 
35 | ## Enabling Additional Features
36 | 
37 | * `default-dict` feature enables embedded dictionary, this features is enabled by default
38 | * `tfidf` feature enables TF-IDF keywords extractor
39 | * `textrank` feature enables TextRank keywords extractor
40 | 
41 | ```toml
42 | [dependencies]
43 | jieba-rs = { version = "0.7", features = ["tfidf", "textrank"] }
44 | ```
45 | 
46 | ## Run benchmark
47 | 
48 | ```bash
49 | cargo bench --all-features
50 | ```
51 | 
52 | ## Benchmark: Compare with cppjieba 
53 | 
54 | * [Optimizing jieba-rs to be 33% faster than cppjieba](https://blog.paulme.ng/posts/2019-06-30-optimizing-jieba-rs-to-be-33percents-faster-than-cppjieba.html)
55 | * [优化 jieba-rs 中文分词性能评测](https://blog.paulme.ng/posts/2019-07-01-%E4%BC%98%E5%8C%96-jieba-rs-%E4%B8%AD%E6%96%87%E5%88%86%E8%AF%8D-%E6%80%A7%E8%83%BD%E8%AF%84%E6%B5%8B%EF%BC%88%E5%BF%AB%E4%BA%8E-cppjieba-33percent%29.html)
56 | * [最佳化 jieba-rs 中文斷詞性能測試](https://blog.paulme.ng/posts/2019-07-01-%E6%9C%80%E4%BD%B3%E5%8C%96jieba-rs%E4%B8%AD%E6%96%87%E6%96%B7%E8%A9%9E%E6%80%A7%E8%83%BD%E6%B8%AC%E8%A9%A6%28%E5%BF%AB%E4%BA%8Ecppjieba-33%25%29.html)
57 | 
58 | ## `jieba-rs` bindings
59 | 
60 | * [`@node-rs/jieba` NodeJS binding](https://github.com/napi-rs/node-rs/tree/main/packages/jieba)
61 | * [`jieba-php` PHP binding](https://github.com/binaryoung/jieba-php)
62 | * [`rjieba-py` Python binding](https://github.com/messense/rjieba-py)
63 | * [`cang-jie` Chinese tokenizer for tantivy](https://github.com/DCjanus/cang-jie)
64 | * [`tantivy-jieba` An adapter that bridges between tantivy and jieba-rs](https://github.com/jiegec/tantivy-jieba)
65 | * [`jieba-wasm` the WebAssembly binding](https://github.com/fengkx/jieba-wasm)
66 | 
67 | ## License
68 | 
69 | This work is released under the MIT license. A copy of the license is provided in the [LICENSE](./LICENSE) file.
70 | 


--------------------------------------------------------------------------------
/benches/jieba_benchmark.rs:
--------------------------------------------------------------------------------
 1 | use codspeed_criterion_compat::{black_box, criterion_group, criterion_main, Criterion, Throughput};
 2 | use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf, TokenizeMode};
 3 | use lazy_static::lazy_static;
 4 | use rayon::iter::{IntoParallelIterator, ParallelIterator};
 5 | 
 6 | #[cfg(unix)]
 7 | #[global_allocator]
 8 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 9 | 
10 | lazy_static! {
11 |     static ref JIEBA: Jieba = Jieba::new();
12 |     static ref TFIDF_EXTRACTOR: TfIdf = TfIdf::default();
13 |     static ref TEXTRANK_EXTRACTOR: TextRank = TextRank::default();
14 | }
15 | static SENTENCE: &str = "我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。";
16 | 
17 | fn criterion_benchmark(c: &mut Criterion) {
18 |     let mut group = c.benchmark_group("jieba");
19 |     let dict_len = include_bytes!("../src/data/dict.txt").len() as u64;
20 |     group.throughput(Throughput::Bytes(dict_len));
21 |     group.bench_function("new", |b| {
22 |         b.iter(|| {
23 |             black_box(Jieba::new());
24 |         })
25 |     });
26 |     group.finish();
27 | 
28 |     let mut group = c.benchmark_group("cut");
29 |     group.throughput(Throughput::Bytes(SENTENCE.len() as u64));
30 |     group.bench_function("no_hmm", |b| b.iter(|| JIEBA.cut(black_box(SENTENCE), false)));
31 |     group.bench_function("with_hmm", |b| b.iter(|| JIEBA.cut(black_box(SENTENCE), true)));
32 |     group.bench_function("cut_all", |b| b.iter(|| JIEBA.cut_all(black_box(SENTENCE))));
33 |     group.bench_function("cut_for_search", |b| {
34 |         b.iter(|| JIEBA.cut_for_search(black_box(SENTENCE), true))
35 |     });
36 |     group.finish();
37 | 
38 |     let mut group = c.benchmark_group("tokenize");
39 |     group.throughput(Throughput::Bytes(SENTENCE.len() as u64));
40 |     group.bench_function("default_mode", |b| {
41 |         b.iter(|| JIEBA.tokenize(black_box(SENTENCE), TokenizeMode::Default, true))
42 |     });
43 |     group.bench_function("search_mode", |b| {
44 |         b.iter(|| JIEBA.tokenize(black_box(SENTENCE), TokenizeMode::Search, true))
45 |     });
46 |     group.finish();
47 | 
48 |     let mut group = c.benchmark_group("jieba");
49 |     group.throughput(Throughput::Bytes(SENTENCE.len() as u64));
50 |     group.bench_function("tag", |b| b.iter(|| JIEBA.tag(black_box(SENTENCE), true)));
51 |     group.finish();
52 | 
53 |     let mut group = c.benchmark_group("keywords");
54 |     group.throughput(Throughput::Bytes(SENTENCE.len() as u64));
55 |     group.bench_function("tfidf", |b| {
56 |         b.iter(|| TFIDF_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new()))
57 |     });
58 |     group.bench_function("textrank", |b| {
59 |         b.iter(|| TEXTRANK_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new()))
60 |     });
61 |     group.finish();
62 | 
63 |     let mut group = c.benchmark_group("multithreaded");
64 |     let repeat = 1000usize;
65 |     group.throughput(Throughput::Bytes(SENTENCE.len() as u64 * repeat as u64));
66 |     group.bench_function("single_thread", |b| {
67 |         b.iter(|| {
68 |             for _ in 0..repeat {
69 |                 let _words = JIEBA.cut(black_box(&SENTENCE), true);
70 |             }
71 |         })
72 |     });
73 |     group.bench_function("multi_thread", |b| {
74 |         b.iter(|| {
75 |             (0..repeat).into_par_iter().for_each(|_| {
76 |                 let _words = JIEBA.cut(black_box(&SENTENCE), true);
77 |             });
78 |         })
79 |     });
80 |     group.finish();
81 | }
82 | 
83 | criterion_group!(benches, criterion_benchmark);
84 | criterion_main!(benches);
85 | 


--------------------------------------------------------------------------------
/capi/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "jieba-capi"
 3 | version = "0.1.0"
 4 | authors = ["messense <messense@icloud.com>"]
 5 | edition = "2021"
 6 | 
 7 | [dependencies]
 8 | jieba-rs = { version = "0.7.0", path = "../", features = ["textrank", "tfidf"] }
 9 | c_fixed_string = { workspace = true }
10 | 
11 | [lib]
12 | crate-type = ["cdylib"]
13 | 


--------------------------------------------------------------------------------
/capi/src/lib.rs:
--------------------------------------------------------------------------------
  1 | use c_fixed_string::CFixedStr;
  2 | use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf};
  3 | use std::boxed::Box;
  4 | use std::os::raw::c_char;
  5 | use std::{mem, ptr};
  6 | 
  7 | #[repr(C)]
  8 | pub struct CJieba {
  9 |     jieba: Jieba,
 10 |     _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>,
 11 | }
 12 | 
 13 | #[repr(C)]
 14 | pub struct CJiebaTFIDF {
 15 |     cjieba: *mut CJieba,
 16 |     tfidf: TfIdf,
 17 |     _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>,
 18 | }
 19 | 
 20 | #[repr(C)]
 21 | pub struct CJiebaWords {
 22 |     pub words: *mut FfiStr,
 23 |     pub len: usize,
 24 | }
 25 | 
 26 | #[repr(C)]
 27 | pub struct CJiebaToken {
 28 |     pub word: FfiStr,
 29 |     pub start: usize,
 30 |     pub end: usize,
 31 | }
 32 | 
 33 | #[repr(C)]
 34 | pub struct CJiebaTokens {
 35 |     pub tokens: *mut CJiebaToken,
 36 |     pub len: usize,
 37 | }
 38 | 
 39 | /// Tokenize mode
 40 | #[repr(C)]
 41 | pub enum TokenizeMode {
 42 |     /// Default mode
 43 |     Default = 0,
 44 |     /// Search mode
 45 |     Search = 1,
 46 | }
 47 | 
 48 | impl From<TokenizeMode> for jieba_rs::TokenizeMode {
 49 |     fn from(mode: TokenizeMode) -> Self {
 50 |         match mode {
 51 |             TokenizeMode::Default => jieba_rs::TokenizeMode::Default,
 52 |             TokenizeMode::Search => jieba_rs::TokenizeMode::Search,
 53 |         }
 54 |     }
 55 | }
 56 | 
 57 | #[repr(C)]
 58 | pub struct CJiebaTag {
 59 |     pub word: FfiStr,
 60 |     pub tag: FfiStr,
 61 | }
 62 | 
 63 | #[repr(C)]
 64 | pub struct CJiebaTags {
 65 |     pub tags: *mut CJiebaTag,
 66 |     pub len: usize,
 67 | }
 68 | 
 69 | /// Represents a string.
 70 | #[repr(C)]
 71 | pub struct FfiStr {
 72 |     pub data: *mut c_char,
 73 |     pub len: usize,
 74 |     pub owned: bool,
 75 | }
 76 | 
 77 | impl Default for FfiStr {
 78 |     fn default() -> Self {
 79 |         Self {
 80 |             data: ptr::null_mut(),
 81 |             len: 0,
 82 |             owned: false,
 83 |         }
 84 |     }
 85 | }
 86 | 
 87 | impl FfiStr {
 88 |     pub fn from_string(mut s: String) -> Self {
 89 |         s.shrink_to_fit();
 90 |         let rv = Self {
 91 |             data: s.as_ptr() as *mut c_char,
 92 |             len: s.len(),
 93 |             owned: true,
 94 |         };
 95 |         mem::forget(s);
 96 |         rv
 97 |     }
 98 | 
 99 |     /// # Safety
100 |     /// Frees the underlying data. After this call, the internal pointer is invalid.
101 |     pub unsafe fn free(&mut self) {
102 |         if self.owned && !self.data.is_null() {
103 |             String::from_raw_parts(self.data as *mut _, self.len, self.len);
104 |             self.data = ptr::null_mut();
105 |             self.len = 0;
106 |             self.owned = false;
107 |         }
108 |     }
109 | }
110 | 
111 | impl Drop for FfiStr {
112 |     fn drop(&mut self) {
113 |         unsafe {
114 |             self.free();
115 |         }
116 |     }
117 | }
118 | 
119 | /// Frees a ffi str.
120 | ///
121 | /// If the string is marked as not owned then this function does not
122 | /// do anything.
123 | ///
124 | /// # Safety
125 | /// Used to release strings returned as results of function calls.
126 | #[no_mangle]
127 | pub unsafe extern "C" fn jieba_str_free(s: *mut FfiStr) {
128 |     if !s.is_null() {
129 |         (*s).free()
130 |     }
131 | }
132 | 
133 | unsafe fn params_unwrap(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&Jieba, &CFixedStr) {
134 |     let jieba = &(*(*cjieba_ref)).jieba;
135 |     let c_str = CFixedStr::from_ptr(s, len);
136 |     (jieba, c_str)
137 | }
138 | 
139 | unsafe fn params_unwrap_mut(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&mut Jieba, &CFixedStr) {
140 |     let jieba = &mut (*(*cjieba_ref)).jieba;
141 |     let c_str = CFixedStr::from_ptr(s, len);
142 |     (jieba, c_str)
143 | }
144 | 
145 | /// # Safety
146 | /// Returned value must be freed by `jieba_free()`.
147 | #[no_mangle]
148 | pub extern "C" fn jieba_new() -> *mut CJieba {
149 |     let cjieba = CJieba {
150 |         jieba: Jieba::new(),
151 |         _marker: Default::default(),
152 |     };
153 |     Box::into_raw(Box::new(cjieba))
154 | }
155 | 
156 | /// Returns a Jieba instance with an empty dictionary.
157 | ///
158 | /// # Safety
159 | /// Returned value must be freed by `jieba_free()`.
160 | #[no_mangle]
161 | pub extern "C" fn jieba_empty() -> *mut CJieba {
162 |     let cjieba = CJieba {
163 |         jieba: Jieba::empty(),
164 |         _marker: Default::default(),
165 |     };
166 |     Box::into_raw(Box::new(cjieba))
167 | }
168 | 
169 | /// # Safety
170 | /// cjieba is result from `jieba_new()` call.
171 | #[no_mangle]
172 | pub unsafe extern "C" fn jieba_free(cjieba: *mut CJieba) {
173 |     if !cjieba.is_null() {
174 |         drop(Box::from_raw(cjieba));
175 |     }
176 | }
177 | 
178 | /// # Safety
179 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger.
180 | #[no_mangle]
181 | pub unsafe extern "C" fn jieba_cut(
182 |     cjieba: *mut CJieba,
183 |     sentence: *const c_char,
184 |     len: usize,
185 |     hmm: bool,
186 | ) -> *mut CJiebaWords {
187 |     let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
188 |     // FIXME: remove allocation
189 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
190 |     let words = jieba.cut(&s, hmm);
191 |     let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect();
192 |     let words_len = c_words.len();
193 |     let ptr = c_words.as_mut_ptr();
194 |     mem::forget(c_words);
195 |     Box::into_raw(Box::new(CJiebaWords {
196 |         words: ptr,
197 |         len: words_len,
198 |     }))
199 | }
200 | 
201 | /// # Safety
202 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger.
203 | #[no_mangle]
204 | pub unsafe extern "C" fn jieba_cut_all(cjieba: *mut CJieba, sentence: *const c_char, len: usize) -> *mut CJiebaWords {
205 |     let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
206 |     // FIXME: remove allocation
207 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
208 |     let words = (*jieba).cut_all(&s);
209 |     let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect();
210 |     let words_len = c_words.len();
211 |     let ptr = c_words.as_mut_ptr();
212 |     mem::forget(c_words);
213 |     Box::into_raw(Box::new(CJiebaWords {
214 |         words: ptr,
215 |         len: words_len,
216 |     }))
217 | }
218 | 
219 | /// # Safety
220 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger.
221 | #[no_mangle]
222 | pub unsafe extern "C" fn jieba_cut_for_search(
223 |     cjieba: *mut CJieba,
224 |     sentence: *const c_char,
225 |     len: usize,
226 |     hmm: bool,
227 | ) -> *mut CJiebaWords {
228 |     let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
229 |     // FIXME: remove allocation
230 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
231 |     let words = (*jieba).cut_for_search(&s, hmm);
232 |     let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect();
233 |     let words_len = c_words.len();
234 |     let ptr = c_words.as_mut_ptr();
235 |     mem::forget(c_words);
236 |     Box::into_raw(Box::new(CJiebaWords {
237 |         words: ptr,
238 |         len: words_len,
239 |     }))
240 | }
241 | 
242 | /// # Safety
243 | /// cjieba must be valid object from `jieba_new()` and must outlive the returned CJiebaTFIDF instance.
244 | ///
245 | /// Returned value must be freed by `jieba_tfidf_free()`.
246 | #[no_mangle]
247 | pub extern "C" fn jieba_tfidf_new(cjieba: *mut CJieba) -> *mut CJiebaTFIDF {
248 |     let cjieba_tfidf = CJiebaTFIDF {
249 |         cjieba,
250 |         tfidf: Default::default(),
251 |         _marker: Default::default(),
252 |     };
253 |     Box::into_raw(Box::new(cjieba_tfidf))
254 | }
255 | 
256 | /// # Safety
257 | /// cjieba_tfidf is result from `jieba_tfidf_new()` call.
258 | #[no_mangle]
259 | pub unsafe extern "C" fn jieba_tfidf_free(cjieba_tfidf: *mut CJiebaTFIDF) {
260 |     if !cjieba_tfidf.is_null() {
261 |         drop(Box::from_raw(cjieba_tfidf));
262 |     }
263 | }
264 | 
265 | /// # Safety
266 | /// cjieba_tfidf must be valid object from `jieba_tfidf_new()`. `sentence` must be `len` or larger.
267 | ///
268 | /// Returned value must be freed by `jieba_words_free()`.
269 | #[no_mangle]
270 | pub unsafe extern "C" fn jieba_tfidf_extract(
271 |     cjieba_tfidf: *mut CJiebaTFIDF,
272 |     sentence: *const c_char,
273 |     len: usize,
274 |     top_k: usize,
275 |     allowed_pos: *const *mut c_char,
276 |     allowed_pos_len: usize,
277 | ) -> *mut CJiebaWords {
278 |     let cjieba_tfidf_ref = &(*cjieba_tfidf);
279 |     let tfidf = &cjieba_tfidf_ref.tfidf;
280 |     let (jieba, c_str) = params_unwrap(&cjieba_tfidf_ref.cjieba, sentence, len);
281 |     // FIXME: remove allocation
282 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
283 | 
284 |     let allowed_pos: Vec<String> = if allowed_pos_len == 0 || allowed_pos.is_null() {
285 |         Vec::new()
286 |     } else {
287 |         let mut v = Vec::with_capacity(allowed_pos_len);
288 | 
289 |         let slice: &[*mut c_char] = std::slice::from_raw_parts(allowed_pos, allowed_pos_len);
290 |         for ptr in slice.iter() {
291 |             let cstring_allowed_pos = std::ffi::CString::from_raw(*ptr);
292 |             let string_allowed_pos = cstring_allowed_pos.into_string().expect("into_string().err() failed");
293 |             v.push(string_allowed_pos);
294 |         }
295 | 
296 |         v
297 |     };
298 | 
299 |     let words = tfidf.extract_keywords(jieba, &s, top_k, allowed_pos);
300 |     let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.keyword)).collect();
301 |     let words_len = c_words.len();
302 |     let ptr = c_words.as_mut_ptr();
303 |     mem::forget(c_words);
304 |     Box::into_raw(Box::new(CJiebaWords {
305 |         words: ptr,
306 |         len: words_len,
307 |     }))
308 | }
309 | 
310 | /// # Safety
311 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger.
312 | ///
313 | /// Returned value must be freed by `jieba_words_free()`.
314 | #[no_mangle]
315 | pub unsafe extern "C" fn jieba_textrank_extract(
316 |     cjieba: *mut CJieba,
317 |     sentence: *const c_char,
318 |     len: usize,
319 |     top_k: usize,
320 |     allowed_pos: *const *mut c_char,
321 |     allowed_pos_len: usize,
322 | ) -> *mut CJiebaWords {
323 |     let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
324 |     // FIXME: remove allocation
325 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
326 | 
327 |     let allowed_pos: Vec<String> = if allowed_pos_len == 0 || allowed_pos.is_null() {
328 |         Vec::new()
329 |     } else {
330 |         let mut v = Vec::with_capacity(allowed_pos_len);
331 | 
332 |         let slice: &[*mut c_char] = std::slice::from_raw_parts(allowed_pos, allowed_pos_len);
333 |         for ptr in slice.iter() {
334 |             let cstring_allowed_pos = std::ffi::CString::from_raw(*ptr);
335 |             let string_allowed_pos = cstring_allowed_pos.into_string().expect("into_string().err() failed");
336 |             v.push(string_allowed_pos);
337 |         }
338 | 
339 |         v
340 |     };
341 | 
342 |     let textrank = TextRank::default();
343 |     let words = textrank.extract_keywords(jieba, &s, top_k, allowed_pos);
344 |     let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.keyword)).collect();
345 |     let words_len = c_words.len();
346 |     let ptr = c_words.as_mut_ptr();
347 |     mem::forget(c_words);
348 |     Box::into_raw(Box::new(CJiebaWords {
349 |         words: ptr,
350 |         len: words_len,
351 |     }))
352 | }
353 | 
354 | /// # Safety
355 | /// c_tags is result from `jieba_textrank_extract()` or `jieba_tfidf_extract()` call.
356 | #[no_mangle]
357 | pub unsafe extern "C" fn jieba_words_free(c_words: *mut CJiebaWords) {
358 |     if !c_words.is_null() {
359 |         Vec::from_raw_parts((*c_words).words, (*c_words).len, (*c_words).len);
360 |         drop(Box::from_raw(c_words));
361 |     }
362 | }
363 | 
364 | /// # Safety
365 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger.
366 | ///
367 | /// Returned value must be freed by `jieba_tokens_free()`.
368 | #[no_mangle]
369 | pub unsafe extern "C" fn jieba_tokenize(
370 |     cjieba: *mut CJieba,
371 |     sentence: *const c_char,
372 |     len: usize,
373 |     mode: TokenizeMode,
374 |     hmm: bool,
375 | ) -> *mut CJiebaTokens {
376 |     let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
377 |     // FIXME: remove allocation
378 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
379 |     let tokens = (*jieba).tokenize(&s, mode.into(), hmm);
380 |     let mut c_tokens: Vec<CJiebaToken> = tokens
381 |         .into_iter()
382 |         .map(|x| CJiebaToken {
383 |             word: FfiStr::from_string(x.word.to_string()),
384 |             start: x.start,
385 |             end: x.end,
386 |         })
387 |         .collect();
388 |     let tokens_len = c_tokens.len();
389 |     let ptr = c_tokens.as_mut_ptr();
390 |     mem::forget(c_tokens);
391 |     Box::into_raw(Box::new(CJiebaTokens {
392 |         tokens: ptr,
393 |         len: tokens_len,
394 |     }))
395 | }
396 | 
397 | /// # Safety
398 | /// c_tokens is result from `jieba_tokenize()` call.
399 | #[no_mangle]
400 | pub unsafe extern "C" fn jieba_tokens_free(c_tokens: *mut CJiebaTokens) {
401 |     if !c_tokens.is_null() {
402 |         Vec::from_raw_parts((*c_tokens).tokens, (*c_tokens).len, (*c_tokens).len);
403 |         drop(Box::from_raw(c_tokens));
404 |     }
405 | }
406 | 
407 | /// # Safety
408 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger.
409 | ///
410 | /// Returned value must be freed by `jieba_tags_free()`.
411 | #[no_mangle]
412 | pub unsafe extern "C" fn jieba_tag(
413 |     cjieba: *mut CJieba,
414 |     sentence: *const c_char,
415 |     len: usize,
416 |     hmm: bool,
417 | ) -> *mut CJiebaTags {
418 |     let (jieba, c_str) = params_unwrap(&cjieba, sentence, len);
419 |     // FIXME: remove allocation
420 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
421 |     let tags = (*jieba).tag(&s, hmm);
422 |     let mut c_tags: Vec<CJiebaTag> = tags
423 |         .into_iter()
424 |         .map(|x| CJiebaTag {
425 |             word: FfiStr::from_string(x.word.to_string()),
426 |             tag: FfiStr::from_string(x.tag.to_string()),
427 |         })
428 |         .collect();
429 |     let tags_len = c_tags.len();
430 |     let ptr = c_tags.as_mut_ptr();
431 |     mem::forget(c_tags);
432 |     Box::into_raw(Box::new(CJiebaTags {
433 |         tags: ptr,
434 |         len: tags_len,
435 |     }))
436 | }
437 | 
438 | /// # Safety
439 | /// c_tags is result from `jieba_tag()` call.
440 | #[no_mangle]
441 | pub unsafe extern "C" fn jieba_tags_free(c_tags: *mut CJiebaTags) {
442 |     if !c_tags.is_null() {
443 |         Vec::from_raw_parts((*c_tags).tags, (*c_tags).len, (*c_tags).len);
444 |         drop(Box::from_raw(c_tags));
445 |     }
446 | }
447 | 
448 | /// # Safety
449 | /// cjieba must be valid object from `jieba_new()`. `word` must be `len` or larger.
450 | #[no_mangle]
451 | pub unsafe extern "C" fn jieba_add_word(cjieba: *mut CJieba, word: *const c_char, len: usize) -> usize {
452 |     let (jieba, c_str) = params_unwrap_mut(&cjieba, word, len);
453 |     // FIXME: remove allocation
454 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
455 |     jieba.add_word(&s, None, None)
456 | }
457 | 
458 | /// # Safety
459 | /// cjieba must be valid object from `jieba_new()`. `segment` must be `len` or larger.
460 | #[no_mangle]
461 | pub unsafe extern "C" fn jieba_suggest_freq(cjieba: *mut CJieba, segment: *const c_char, len: usize) -> usize {
462 |     let (jieba, c_str) = params_unwrap(&cjieba, segment, len);
463 |     // FIXME: remove allocation
464 |     let s = String::from_utf8_lossy(c_str.as_bytes_full());
465 | 
466 |     (*jieba).suggest_freq(&s)
467 | }
468 | 
469 | #[cfg(test)]
470 | mod test {
471 |     use super::*;
472 |     use std::ffi::CString;
473 | 
474 |     #[test]
475 |     fn test_jieba_new_and_free() {
476 |         let jieba = jieba_new();
477 |         unsafe { jieba_free(jieba) };
478 |     }
479 | 
480 |     #[test]
481 |     fn test_jieba_empty_and_free() {
482 |         let jieba = jieba_empty();
483 |         unsafe { jieba_free(jieba) };
484 |     }
485 | 
486 |     #[test]
487 |     fn test_jieba_add_word() {
488 |         let jieba = jieba_empty();
489 |         let word = "今天";
490 |         let c_word = CString::new(word).unwrap();
491 |         unsafe {
492 |             jieba_add_word(jieba, c_word.as_ptr(), word.len());
493 |             jieba_free(jieba)
494 |         };
495 |     }
496 | }
497 | 


--------------------------------------------------------------------------------
/examples/weicheng/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "weicheng"
 3 | version = "0.1.0"
 4 | authors = ["messense <messense@icloud.com>"]
 5 | edition = "2021"
 6 | 
 7 | [dependencies]
 8 | jieba-rs = { path = "../.." }
 9 | 
10 | [target.'cfg(unix)'.dependencies]
11 | jemallocator = "0.5.0"
12 | 


--------------------------------------------------------------------------------
/examples/weicheng/src/main.rs:
--------------------------------------------------------------------------------
 1 | use jieba_rs::Jieba;
 2 | use std::time;
 3 | 
 4 | #[cfg(unix)]
 5 | #[global_allocator]
 6 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 7 | 
 8 | static WEICHENG_TXT: &str = include_str!("weicheng.txt");
 9 | 
10 | fn main() {
11 |     let jieba = Jieba::new();
12 |     let lines: Vec<&str> = WEICHENG_TXT.split('\n').collect();
13 |     let now = time::Instant::now();
14 |     for _ in 0..50 {
15 |         for line in &lines {
16 |             let _ = jieba.cut(line, true);
17 |         }
18 |     }
19 |     println!("{}ms", now.elapsed().as_millis());
20 | }
21 | 


--------------------------------------------------------------------------------
/jieba-macros/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "jieba-macros"
 3 | version = "0.7.1"
 4 | categories = ["text-processing"]
 5 | description = "jieba-rs proc-macro"
 6 | keywords = ["nlp", "chinese", "segmenation"]
 7 | license = "MIT"
 8 | readme = "../README.md"
 9 | repository = "https://github.com/messense/jieba-rs"
10 | edition = "2021"
11 | 
12 | [lib]
13 | proc-macro = true
14 | 
15 | [dependencies]
16 | phf_codegen = { workspace = true }
17 | 


--------------------------------------------------------------------------------
/jieba-macros/src/lib.rs:
--------------------------------------------------------------------------------
 1 | use proc_macro::TokenStream;
 2 | 
 3 | #[proc_macro]
 4 | pub fn generate_hmm_data(_input: TokenStream) -> TokenStream {
 5 |     let hmm_data = include_str!("hmm.model");
 6 |     let mut output = String::new();
 7 |     let mut lines = hmm_data.lines().skip_while(|x| x.starts_with('#'));
 8 | 
 9 |     // Initial probabilities
10 |     let init_probs = lines
11 |         .next()
12 |         .expect("Failed to read initial probabilities from hmm.model");
13 | 
14 |     output.push_str("#[allow(clippy::style)]\n");
15 |     output.push_str("pub static INITIAL_PROBS: [f64; 4] = [");
16 |     output.push_str(&init_probs.replace(' ', ", "));
17 |     output.push_str("];\n\n");
18 | 
19 |     // Transition probabilities
20 |     output.push_str("#[allow(clippy::style)]\n");
21 |     output.push_str("pub static TRANS_PROBS: [[f64; 4]; 4] = [");
22 |     for line in lines
23 |         .by_ref()
24 |         .skip_while(|x| x.starts_with('#'))
25 |         .take_while(|x| !x.starts_with('#'))
26 |     {
27 |         output.push('[');
28 |         output.push_str(&line.replace(' ', ", "));
29 |         output.push_str("],\n");
30 |     }
31 |     output.push_str("];\n\n");
32 | 
33 |     // Emission probabilities
34 |     for (i, line) in lines.filter(|x| !x.starts_with('#')).enumerate() {
35 |         output.push_str("#[allow(clippy::style)]\n");
36 |         output.push_str(&format!("pub static EMIT_PROB_{}: phf::Map<&'static str, f64> = ", i));
37 | 
38 |         let mut map = phf_codegen::Map::new();
39 |         for word_prob in line.split(',') {
40 |             let mut parts = word_prob.split(':');
41 |             let word = parts.next().unwrap();
42 |             let prob = parts.next().unwrap();
43 |             map.entry(word, prob);
44 |         }
45 |         output.push_str(&map.build().to_string());
46 |         output.push_str(";\n\n");
47 |     }
48 | 
49 |     output.push_str("#[allow(clippy::style)]\n");
50 |     output.push_str("pub static EMIT_PROBS: [&'static phf::Map<&'static str, f64>; 4] = [&EMIT_PROB_0, &EMIT_PROB_1, &EMIT_PROB_2, &EMIT_PROB_3];\n\n");
51 | 
52 |     output.parse().unwrap()
53 | }
54 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 120
2 | 


--------------------------------------------------------------------------------
/src/errors.rs:
--------------------------------------------------------------------------------
 1 | use std::{error, fmt, io};
 2 | 
 3 | /// The Error type
 4 | #[derive(Debug)]
 5 | pub enum Error {
 6 |     /// I/O errors
 7 |     Io(io::Error),
 8 |     /// Invalid entry in dictionary
 9 |     InvalidDictEntry(String),
10 | }
11 | 
12 | impl From<io::Error> for Error {
13 |     fn from(err: io::Error) -> Self {
14 |         Self::Io(err)
15 |     }
16 | }
17 | 
18 | impl fmt::Display for Error {
19 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
20 |         match *self {
21 |             Error::Io(ref err) => err.fmt(f),
22 |             Error::InvalidDictEntry(ref err) => write!(f, "invalid dictionary entry: {}", err),
23 |         }
24 |     }
25 | }
26 | 
27 | impl error::Error for Error {
28 |     fn source(&self) -> Option<&(dyn error::Error + 'static)> {
29 |         match *self {
30 |             Error::Io(ref err) => Some(err),
31 |             Error::InvalidDictEntry(_) => None,
32 |         }
33 |     }
34 | }
35 | 


--------------------------------------------------------------------------------
/src/hmm.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | 
  3 | use regex::Regex;
  4 | 
  5 | use crate::SplitMatches;
  6 | use jieba_macros::generate_hmm_data;
  7 | 
  8 | thread_local! {
  9 |     static RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap();
 10 |     static RE_SKIP: Regex = Regex::new(r"([a-zA-Z0-9]+(?:.\d+)?%?)").unwrap();
 11 | }
 12 | 
 13 | pub const NUM_STATES: usize = 4;
 14 | 
 15 | /// Result of hmm is a labeling of each Unicode Scalar Value in the input
 16 | /// string with Begin, Middle, End, or Single. These denote the proposed
 17 | /// segments. A segment is one of the following two patterns.
 18 | ///
 19 | ///   Begin, [Middle...], End
 20 | ///   Single
 21 | ///
 22 | /// Each state in the enum is also assigned an index value from 0-3 that
 23 | /// can be used as an index into an array representing data pertaining
 24 | /// to that state.
 25 | ///
 26 | /// WARNING: The data file format for hmm.model comments imply one can
 27 | /// reassign the index values of each state at the top but `jieba-macros`
 28 | /// currently ignores the mapping. Do not reassign these indices without
 29 | /// verifying how it interacts with `jieba-macros`.  These indices must also
 30 | /// match the order if ALLOWED_PREV_STATUS.
 31 | #[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Copy)]
 32 | pub enum State {
 33 |     Begin = 0,
 34 |     End = 1,
 35 |     Middle = 2,
 36 |     Single = 3,
 37 | }
 38 | 
 39 | // Mapping representing the allow transitiongs into the given state.
 40 | //
 41 | // WARNING: Ordering must match the indicies in State.
 42 | static ALLOWED_PREV_STATUS: [[State; 2]; NUM_STATES] = [
 43 |     // Can preceed State::Begin
 44 |     [State::End, State::Single],
 45 |     // Can preceed State::End
 46 |     [State::Begin, State::Middle],
 47 |     // Can preceed State::Middle
 48 |     [State::Middle, State::Begin],
 49 |     // Can preceed State::Single
 50 |     [State::Single, State::End],
 51 | ];
 52 | 
 53 | generate_hmm_data!();
 54 | 
 55 | const MIN_FLOAT: f64 = -3.14e100;
 56 | 
 57 | #[derive(Default)]
 58 | pub(crate) struct HmmContext {
 59 |     v: Vec<f64>,
 60 |     prev: Vec<Option<State>>,
 61 |     best_path: Vec<State>,
 62 | }
 63 | 
 64 | #[allow(non_snake_case)]
 65 | fn viterbi(sentence: &str, hmm_context: &mut HmmContext) {
 66 |     let str_len = sentence.len();
 67 |     let states = [State::Begin, State::Middle, State::End, State::Single];
 68 |     #[allow(non_snake_case)]
 69 |     let R = states.len();
 70 |     let C = sentence.chars().count();
 71 |     assert!(C > 1);
 72 | 
 73 |     // TODO: Can code just do fill() with the default instead of clear() and resize?
 74 |     if hmm_context.prev.len() < R * C {
 75 |         hmm_context.prev.resize(R * C, None);
 76 |     }
 77 | 
 78 |     if hmm_context.v.len() < R * C {
 79 |         hmm_context.v.resize(R * C, 0.0);
 80 |     }
 81 | 
 82 |     if hmm_context.best_path.len() < C {
 83 |         hmm_context.best_path.resize(C, State::Begin);
 84 |     }
 85 | 
 86 |     let mut curr = sentence.char_indices().map(|x| x.0).peekable();
 87 |     let x1 = curr.next().unwrap();
 88 |     let x2 = *curr.peek().unwrap();
 89 |     for y in &states {
 90 |         let first_word = &sentence[x1..x2];
 91 |         let prob = INITIAL_PROBS[*y as usize] + EMIT_PROBS[*y as usize].get(first_word).cloned().unwrap_or(MIN_FLOAT);
 92 |         hmm_context.v[*y as usize] = prob;
 93 |     }
 94 | 
 95 |     let mut t = 1;
 96 |     while let Some(byte_start) = curr.next() {
 97 |         for y in &states {
 98 |             let byte_end = *curr.peek().unwrap_or(&str_len);
 99 |             let word = &sentence[byte_start..byte_end];
100 |             let em_prob = EMIT_PROBS[*y as usize].get(word).cloned().unwrap_or(MIN_FLOAT);
101 |             let (prob, state) = ALLOWED_PREV_STATUS[*y as usize]
102 |                 .iter()
103 |                 .map(|y0| {
104 |                     (
105 |                         hmm_context.v[(t - 1) * R + (*y0 as usize)]
106 |                             + TRANS_PROBS[*y0 as usize].get(*y as usize).cloned().unwrap_or(MIN_FLOAT)
107 |                             + em_prob,
108 |                         *y0,
109 |                     )
110 |                 })
111 |                 .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal))
112 |                 .unwrap();
113 |             let idx = (t * R) + (*y as usize);
114 |             hmm_context.v[idx] = prob;
115 |             hmm_context.prev[idx] = Some(state);
116 |         }
117 | 
118 |         t += 1;
119 |     }
120 | 
121 |     let (_prob, state) = [State::End, State::Single]
122 |         .iter()
123 |         .map(|y| (hmm_context.v[(C - 1) * R + (*y as usize)], y))
124 |         .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal))
125 |         .unwrap();
126 | 
127 |     let mut t = C - 1;
128 |     let mut curr = *state;
129 | 
130 |     hmm_context.best_path[t] = *state;
131 |     while let Some(p) = hmm_context.prev[t * R + (curr as usize)] {
132 |         assert!(t > 0);
133 |         hmm_context.best_path[t - 1] = p;
134 |         curr = p;
135 |         t -= 1;
136 |     }
137 | 
138 |     hmm_context.prev.clear();
139 |     hmm_context.v.clear();
140 | }
141 | 
142 | #[allow(non_snake_case)]
143 | pub(crate) fn cut_internal<'a>(sentence: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut HmmContext) {
144 |     let str_len = sentence.len();
145 |     viterbi(sentence, hmm_context);
146 |     let mut begin = 0;
147 |     let mut next_byte_offset = 0;
148 |     let mut i = 0;
149 | 
150 |     let mut curr = sentence.char_indices().map(|x| x.0).peekable();
151 |     while let Some(curr_byte_offset) = curr.next() {
152 |         let state = hmm_context.best_path[i];
153 |         match state {
154 |             State::Begin => begin = curr_byte_offset,
155 |             State::End => {
156 |                 let byte_start = begin;
157 |                 let byte_end = *curr.peek().unwrap_or(&str_len);
158 |                 words.push(&sentence[byte_start..byte_end]);
159 |                 next_byte_offset = byte_end;
160 |             }
161 |             State::Single => {
162 |                 let byte_start = curr_byte_offset;
163 |                 let byte_end = *curr.peek().unwrap_or(&str_len);
164 |                 words.push(&sentence[byte_start..byte_end]);
165 |                 next_byte_offset = byte_end;
166 |             }
167 |             State::Middle => { /* do nothing */ }
168 |         }
169 | 
170 |         i += 1;
171 |     }
172 | 
173 |     if next_byte_offset < str_len {
174 |         let byte_start = next_byte_offset;
175 |         words.push(&sentence[byte_start..]);
176 |     }
177 | 
178 |     hmm_context.best_path.clear();
179 | }
180 | 
181 | #[allow(non_snake_case)]
182 | pub(crate) fn cut_with_allocated_memory<'a>(sentence: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut HmmContext) {
183 |     RE_HAN.with(|re_han| {
184 |         RE_SKIP.with(|re_skip| {
185 |             let splitter = SplitMatches::new(re_han, sentence);
186 |             for state in splitter {
187 |                 let block = state.into_str();
188 |                 if block.is_empty() {
189 |                     continue;
190 |                 }
191 |                 if re_han.is_match(block) {
192 |                     if block.chars().count() > 1 {
193 |                         cut_internal(block, words, hmm_context);
194 |                     } else {
195 |                         words.push(block);
196 |                     }
197 |                 } else {
198 |                     let skip_splitter = SplitMatches::new(re_skip, block);
199 |                     for skip_state in skip_splitter {
200 |                         let x = skip_state.into_str();
201 |                         if x.is_empty() {
202 |                             continue;
203 |                         }
204 |                         words.push(x);
205 |                     }
206 |                 }
207 |             }
208 |         })
209 |     })
210 | }
211 | 
212 | #[allow(non_snake_case)]
213 | pub fn cut<'a>(sentence: &'a str, words: &mut Vec<&'a str>) {
214 |     let mut hmm_context = HmmContext::default();
215 | 
216 |     cut_with_allocated_memory(sentence, words, &mut hmm_context)
217 | }
218 | 
219 | #[cfg(test)]
220 | mod tests {
221 |     use super::{cut, viterbi, HmmContext};
222 | 
223 |     #[test]
224 |     #[allow(non_snake_case)]
225 |     fn test_viterbi() {
226 |         use super::State::*;
227 | 
228 |         let sentence = "小明硕士毕业于中国科学院计算所";
229 | 
230 |         let mut hmm_context = HmmContext::default();
231 |         viterbi(sentence, &mut hmm_context);
232 |         assert_eq!(
233 |             hmm_context.best_path,
234 |             vec![Begin, End, Begin, End, Begin, Middle, End, Begin, End, Begin, Middle, End, Begin, End, Single]
235 |         );
236 |     }
237 | 
238 |     #[test]
239 |     fn test_hmm_cut() {
240 |         let sentence = "小明硕士毕业于中国科学院计算所";
241 |         let mut words = Vec::with_capacity(sentence.chars().count() / 2);
242 |         cut(sentence, &mut words);
243 |         assert_eq!(words, vec!["小明", "硕士", "毕业于", "中国", "科学院", "计算", "所"]);
244 |     }
245 | }
246 | 


--------------------------------------------------------------------------------
/src/keywords/mod.rs:
--------------------------------------------------------------------------------
  1 | use derive_builder::Builder;
  2 | use lazy_static::lazy_static;
  3 | use std::collections::BTreeSet;
  4 | 
  5 | use crate::Jieba;
  6 | 
  7 | #[cfg(feature = "textrank")]
  8 | pub mod textrank;
  9 | #[cfg(feature = "tfidf")]
 10 | pub mod tfidf;
 11 | 
 12 | lazy_static! {
 13 |     pub static ref DEFAULT_STOP_WORDS: BTreeSet<String> = {
 14 |         BTreeSet::from_iter(
 15 |             [
 16 |                 "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", "by", "be", "as", "on", "with",
 17 |                 "can", "if", "from", "which", "you", "it", "this", "then", "at", "have", "all", "not", "one", "has",
 18 |                 "or", "that",
 19 |             ]
 20 |             .into_iter()
 21 |             .map(|s| s.to_string()),
 22 |         )
 23 |     };
 24 | }
 25 | 
 26 | /// Keyword with weight
 27 | #[derive(Debug, Clone, PartialEq)]
 28 | pub struct Keyword {
 29 |     pub keyword: String,
 30 |     pub weight: f64,
 31 | }
 32 | 
 33 | /// Creates a KeywordExtractConfig state that contains filter criteria as
 34 | /// well as segmentation configuration for use by keyword extraction
 35 | /// implementations.
 36 | ///
 37 | /// Use KeywordExtractConfigBuilder to change the defaults.
 38 | ///
 39 | /// # Examples
 40 | /// ```
 41 | ///    use jieba_rs::KeywordExtractConfig;
 42 | ///
 43 | ///    let mut config = KeywordExtractConfig::default();
 44 | ///    assert!(config.stop_words().contains("the"));
 45 | ///    assert!(!config.stop_words().contains("FakeWord"));
 46 | ///    assert!(!config.use_hmm());
 47 | ///    assert_eq!(2, config.min_keyword_length());
 48 | ///
 49 | ///    let built_default = KeywordExtractConfig::builder().build().unwrap();
 50 | ///    assert_eq!(config, built_default);
 51 | ///
 52 | ///    let changed = KeywordExtractConfig::builder()
 53 | ///        .add_stop_word("FakeWord".to_string())
 54 | ///        .remove_stop_word("the")
 55 | ///        .use_hmm(true)
 56 | ///        .min_keyword_length(10)
 57 | ///        .build().unwrap();
 58 | ///
 59 | ///    assert!(!changed.stop_words().contains("the"));
 60 | ///    assert!(changed.stop_words().contains("FakeWord"));
 61 | ///    assert!(changed.use_hmm());
 62 | ///    assert_eq!(10, changed.min_keyword_length());
 63 | /// ```
 64 | #[derive(Builder, Debug, Clone, PartialEq)]
 65 | pub struct KeywordExtractConfig {
 66 |     #[builder(default = "self.default_stop_words()?", setter(custom))]
 67 |     stop_words: BTreeSet<String>,
 68 | 
 69 |     #[builder(default = "2")]
 70 |     #[doc = r"Any segments less than this length will not be considered a Keyword"]
 71 |     min_keyword_length: usize,
 72 | 
 73 |     #[builder(default = "false")]
 74 |     #[doc = r"If true, fall back to hmm model if segment cannot be found in the dictionary"]
 75 |     use_hmm: bool,
 76 | }
 77 | 
 78 | impl KeywordExtractConfig {
 79 |     pub fn builder() -> KeywordExtractConfigBuilder {
 80 |         KeywordExtractConfigBuilder::default()
 81 |     }
 82 | 
 83 |     /// Get current set of stop words.
 84 |     pub fn stop_words(&self) -> &BTreeSet<String> {
 85 |         &self.stop_words
 86 |     }
 87 | 
 88 |     /// True if hmm is used during segmentation in `extract_tags`.
 89 |     pub fn use_hmm(&self) -> bool {
 90 |         self.use_hmm
 91 |     }
 92 | 
 93 |     /// Gets the minimum number of Unicode Scalar Values required per keyword.
 94 |     pub fn min_keyword_length(&self) -> usize {
 95 |         self.min_keyword_length
 96 |     }
 97 | 
 98 |     #[inline]
 99 |     pub(crate) fn filter(&self, s: &str) -> bool {
100 |         s.chars().count() >= self.min_keyword_length() && !self.stop_words.contains(&s.to_lowercase())
101 |     }
102 | }
103 | 
104 | impl KeywordExtractConfigBuilder {
105 |     fn default_stop_words(&self) -> Result<BTreeSet<String>, KeywordExtractConfigBuilderError> {
106 |         Ok(DEFAULT_STOP_WORDS.clone())
107 |     }
108 | 
109 |     /// Add a new stop word.
110 |     ///
111 |     /// # Examples
112 |     /// ```
113 |     ///    use jieba_rs::KeywordExtractConfig;
114 |     ///    use std::collections::BTreeSet;
115 |     ///
116 |     ///    let populates_default = KeywordExtractConfig::builder()
117 |     ///        .add_stop_word("FakeWord".to_string())
118 |     ///        .build().unwrap();
119 |     ///
120 |     ///    assert!(populates_default.stop_words().contains("the"));
121 |     ///    assert!(populates_default.stop_words().contains("FakeWord"));
122 |     ///
123 |     ///    let multiple_adds_stack = KeywordExtractConfig::builder()
124 |     ///        .add_stop_word("FakeWord".to_string())
125 |     ///        .add_stop_word("MoarFakeWord".to_string())
126 |     ///        .build().unwrap();
127 |     ///
128 |     ///    assert!(multiple_adds_stack.stop_words().contains("the"));
129 |     ///    assert!(multiple_adds_stack.stop_words().contains("FakeWord"));
130 |     ///    assert!(multiple_adds_stack.stop_words().contains("MoarFakeWord"));
131 |     ///
132 |     ///    let no_default_if_set = KeywordExtractConfig::builder()
133 |     ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
134 |     ///        .add_stop_word("FakeWord".to_string())
135 |     ///        .build().unwrap();
136 |     ///
137 |     ///    assert!(!no_default_if_set.stop_words().contains("the"));
138 |     ///    assert!(no_default_if_set.stop_words().contains("boo"));
139 |     ///    assert!(no_default_if_set.stop_words().contains("FakeWord"));
140 |     /// ```
141 |     pub fn add_stop_word(&mut self, word: String) -> &mut Self {
142 |         if self.stop_words.is_none() {
143 |             self.stop_words = Some(self.default_stop_words().unwrap());
144 |         }
145 |         self.stop_words.as_mut().unwrap().insert(word);
146 |         self
147 |     }
148 | 
149 |     /// Remove an existing stop word.
150 |     ///
151 |     /// # Examples
152 |     /// ```
153 |     ///    use jieba_rs::KeywordExtractConfig;
154 |     ///    use std::collections::BTreeSet;
155 |     ///
156 |     ///    let populates_default = KeywordExtractConfig::builder()
157 |     ///        .remove_stop_word("the")
158 |     ///        .build().unwrap();
159 |     ///
160 |     ///    assert!(!populates_default.stop_words().contains("the"));
161 |     ///    assert!(populates_default.stop_words().contains("of"));
162 |     ///
163 |     ///    let no_default_if_set = KeywordExtractConfig::builder()
164 |     ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
165 |     ///         // Removing non-existant word is okay.
166 |     ///        .remove_stop_word("the".to_string())
167 |     ///        .build().unwrap();
168 |     ///
169 |     ///    assert!(!no_default_if_set.stop_words().contains("the"));
170 |     ///    assert!(!no_default_if_set.stop_words().contains("of"));
171 |     ///    assert!(no_default_if_set.stop_words().contains("boo"));
172 |     /// ```
173 |     pub fn remove_stop_word(&mut self, word: impl AsRef<str>) -> &mut Self {
174 |         if self.stop_words.is_none() {
175 |             self.stop_words = Some(self.default_stop_words().unwrap());
176 |         }
177 |         self.stop_words.as_mut().unwrap().remove(word.as_ref());
178 |         self
179 |     }
180 | 
181 |     /// Replace all stop words with new stop words set.
182 |     ///
183 |     /// # Examples
184 |     /// ```
185 |     ///    use jieba_rs::KeywordExtractConfig;
186 |     ///    use std::collections::BTreeSet;
187 |     ///
188 |     ///    let no_default_if_set = KeywordExtractConfig::builder()
189 |     ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
190 |     ///        .build().unwrap();
191 |     ///
192 |     ///    assert!(!no_default_if_set.stop_words().contains("the"));
193 |     ///    assert!(no_default_if_set.stop_words().contains("boo"));
194 |     ///
195 |     ///    let overwrites = KeywordExtractConfig::builder()
196 |     ///        .add_stop_word("FakeWord".to_string())
197 |     ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
198 |     ///        .build().unwrap();
199 |     ///
200 |     ///    assert!(!no_default_if_set.stop_words().contains("FakeWord"));
201 |     ///    assert!(no_default_if_set.stop_words().contains("boo"));
202 |     /// ```
203 |     pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) -> &mut Self {
204 |         self.stop_words = Some(stop_words);
205 |         self
206 |     }
207 | }
208 | 
209 | impl Default for KeywordExtractConfig {
210 |     fn default() -> KeywordExtractConfig {
211 |         KeywordExtractConfigBuilder::default().build().unwrap()
212 |     }
213 | }
214 | 
215 | /// Extracts keywords from a given sentence with the Jieba instance.
216 | pub trait KeywordExtract {
217 |     fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword>;
218 | }
219 | 


--------------------------------------------------------------------------------
/src/keywords/textrank.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::collections::{BTreeSet, BinaryHeap};
  3 | 
  4 | use ordered_float::OrderedFloat;
  5 | 
  6 | use super::{Keyword, KeywordExtract, KeywordExtractConfig, KeywordExtractConfigBuilder};
  7 | use crate::FxHashMap as HashMap;
  8 | use crate::Jieba;
  9 | 
 10 | type Weight = f64;
 11 | 
 12 | #[derive(Clone)]
 13 | struct Edge {
 14 |     dst: usize,
 15 |     weight: Weight,
 16 | }
 17 | 
 18 | impl Edge {
 19 |     fn new(dst: usize, weight: Weight) -> Edge {
 20 |         Edge { dst, weight }
 21 |     }
 22 | }
 23 | 
 24 | type Edges = Vec<Edge>;
 25 | type Graph = Vec<Edges>;
 26 | 
 27 | struct StateDiagram {
 28 |     damping_factor: Weight,
 29 |     g: Graph,
 30 | }
 31 | 
 32 | impl StateDiagram {
 33 |     fn new(size: usize) -> Self {
 34 |         StateDiagram {
 35 |             damping_factor: 0.85,
 36 |             g: vec![Vec::new(); size],
 37 |         }
 38 |     }
 39 | 
 40 |     fn add_undirected_edge(&mut self, src: usize, dst: usize, weight: Weight) {
 41 |         self.g[src].push(Edge::new(dst, weight));
 42 |         self.g[dst].push(Edge::new(src, weight));
 43 |     }
 44 | 
 45 |     fn rank(&mut self) -> Vec<Weight> {
 46 |         let n = self.g.len();
 47 |         let default_weight = 1.0 / (n as f64);
 48 | 
 49 |         let mut ranking_vector = vec![default_weight; n];
 50 | 
 51 |         let mut outflow_weights = vec![0.0; n];
 52 |         for (i, v) in self.g.iter().enumerate() {
 53 |             outflow_weights[i] = v.iter().map(|e| e.weight).sum();
 54 |         }
 55 | 
 56 |         for _ in 0..20 {
 57 |             for (i, v) in self.g.iter().enumerate() {
 58 |                 let s: f64 = v
 59 |                     .iter()
 60 |                     .map(|e| e.weight / outflow_weights[e.dst] * ranking_vector[e.dst])
 61 |                     .sum();
 62 | 
 63 |                 ranking_vector[i] = (1.0 - self.damping_factor) + self.damping_factor * s;
 64 |             }
 65 |         }
 66 | 
 67 |         ranking_vector
 68 |     }
 69 | }
 70 | 
 71 | /// Text rank keywords extraction.
 72 | ///
 73 | /// Requires `textrank` feature to be enabled.
 74 | #[derive(Debug)]
 75 | pub struct TextRank {
 76 |     span: usize,
 77 |     config: KeywordExtractConfig,
 78 | }
 79 | 
 80 | impl TextRank {
 81 |     /// Creates an TextRank.
 82 |     ///
 83 |     /// # Examples
 84 |     ///
 85 |     /// New instance with custom stop words. Also uses hmm for unknown words
 86 |     /// during segmentation.
 87 |     /// ```
 88 |     ///    use std::collections::BTreeSet;
 89 |     ///    use jieba_rs::{TextRank, KeywordExtractConfig};
 90 |     ///
 91 |     ///    let stop_words : BTreeSet<String> =
 92 |     ///        BTreeSet::from(["a", "the", "of"].map(|s| s.to_string()));
 93 |     ///    TextRank::new(
 94 |     ///        5,
 95 |     ///        KeywordExtractConfig::default());
 96 |     /// ```
 97 |     pub fn new(span: usize, config: KeywordExtractConfig) -> Self {
 98 |         TextRank { span, config }
 99 |     }
100 | }
101 | 
102 | impl Default for TextRank {
103 |     /// Creates TextRank with 5 Unicode Scalar Value spans
104 |     fn default() -> Self {
105 |         TextRank::new(5, KeywordExtractConfigBuilder::default().build().unwrap())
106 |     }
107 | }
108 | 
109 | impl KeywordExtract for TextRank {
110 |     /// Uses TextRank algorithm to extract the `top_k` keywords from `sentence`.
111 |     ///
112 |     /// If `allowed_pos` is not empty, then only terms matching those parts if
113 |     /// speech are considered.
114 |     ///
115 |     /// # Examples
116 |     ///
117 |     /// ```
118 |     ///    use jieba_rs::{Jieba, KeywordExtract, TextRank};
119 |     ///
120 |     ///    let jieba = Jieba::new();
121 |     ///    let keyword_extractor = TextRank::default();
122 |     ///    let mut top_k = keyword_extractor.extract_keywords(
123 |     ///        &jieba,
124 |     ///        "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。",
125 |     ///        6,
126 |     ///        vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
127 |     ///    );
128 |     ///    assert_eq!(
129 |     ///        top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
130 |     ///        vec!["吉林", "欧亚", "置业", "实现", "收入", "子公司"]
131 |     ///    );
132 |     ///
133 |     ///    top_k = keyword_extractor.extract_keywords(
134 |     ///        &jieba,
135 |     ///        "It is nice weather in New York City. and今天纽约的天气真好啊，and京华大酒店的张尧经理吃了一只北京烤鸭。and后天纽约的天气不好，and昨天纽约的天气也不好，and北京烤鸭真好吃",
136 |     ///        3,
137 |     ///        vec![],
138 |     ///    );
139 |     ///    assert_eq!(
140 |     ///        top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
141 |     ///        vec!["纽约", "天气", "不好"]
142 |     ///    );
143 |     /// ```
144 |     fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword> {
145 |         let tags = jieba.tag(sentence, self.config.use_hmm());
146 |         let mut allowed_pos_set = BTreeSet::new();
147 | 
148 |         for s in allowed_pos {
149 |             allowed_pos_set.insert(s);
150 |         }
151 | 
152 |         let mut word2id: HashMap<String, usize> = HashMap::default();
153 |         let mut unique_words = Vec::new();
154 |         for t in &tags {
155 |             if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(t.tag) {
156 |                 continue;
157 |             }
158 | 
159 |             if !word2id.contains_key(t.word) {
160 |                 unique_words.push(String::from(t.word));
161 |                 word2id.insert(String::from(t.word), unique_words.len() - 1);
162 |             }
163 |         }
164 | 
165 |         let mut cooccurence: HashMap<(usize, usize), usize> = HashMap::default();
166 |         for (i, t) in tags.iter().enumerate() {
167 |             if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(t.tag) {
168 |                 continue;
169 |             }
170 | 
171 |             if !self.config.filter(t.word) {
172 |                 continue;
173 |             }
174 | 
175 |             for j in (i + 1)..(i + self.span) {
176 |                 if j >= tags.len() {
177 |                     break;
178 |                 }
179 | 
180 |                 if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(tags[j].tag) {
181 |                     continue;
182 |                 }
183 | 
184 |                 if !self.config.filter(tags[j].word) {
185 |                     continue;
186 |                 }
187 | 
188 |                 let u = word2id.get(t.word).unwrap().to_owned();
189 |                 let v = word2id.get(tags[j].word).unwrap().to_owned();
190 |                 let entry = cooccurence.entry((u, v)).or_insert(0);
191 |                 *entry += 1;
192 |             }
193 |         }
194 | 
195 |         let mut diagram = StateDiagram::new(unique_words.len());
196 |         for (k, &v) in cooccurence.iter() {
197 |             diagram.add_undirected_edge(k.0, k.1, v as f64);
198 |         }
199 | 
200 |         let ranking_vector = diagram.rank();
201 | 
202 |         let mut heap = BinaryHeap::new();
203 |         for (k, v) in ranking_vector.iter().enumerate() {
204 |             heap.push(HeapNode {
205 |                 rank: OrderedFloat(v * 1e10),
206 |                 word_id: k,
207 |             });
208 | 
209 |             if k >= top_k {
210 |                 heap.pop();
211 |             }
212 |         }
213 | 
214 |         let mut res = Vec::new();
215 |         for _ in 0..top_k {
216 |             if let Some(w) = heap.pop() {
217 |                 res.push(Keyword {
218 |                     keyword: unique_words[w.word_id].clone(),
219 |                     weight: w.rank.into_inner(),
220 |                 });
221 |             }
222 |         }
223 | 
224 |         res.reverse();
225 |         res
226 |     }
227 | }
228 | 
229 | #[derive(Debug, Clone, Eq, PartialEq)]
230 | struct HeapNode {
231 |     rank: OrderedFloat<f64>,
232 |     word_id: usize,
233 | }
234 | 
235 | impl Ord for HeapNode {
236 |     fn cmp(&self, other: &HeapNode) -> Ordering {
237 |         other
238 |             .rank
239 |             .cmp(&self.rank)
240 |             .then_with(|| self.word_id.cmp(&other.word_id))
241 |     }
242 | }
243 | 
244 | impl PartialOrd for HeapNode {
245 |     fn partial_cmp(&self, other: &HeapNode) -> Option<Ordering> {
246 |         Some(self.cmp(other))
247 |     }
248 | }
249 | 
250 | #[cfg(test)]
251 | mod tests {
252 |     use super::*;
253 |     #[test]
254 |     fn test_init_state_diagram() {
255 |         let diagram = StateDiagram::new(10);
256 |         assert_eq!(diagram.g.len(), 10);
257 |     }
258 | }
259 | 


--------------------------------------------------------------------------------
/src/keywords/tfidf.rs:
--------------------------------------------------------------------------------
  1 | use std::cmp::Ordering;
  2 | use std::collections::{BTreeSet, BinaryHeap};
  3 | use std::io::{self, BufRead, BufReader};
  4 | 
  5 | use include_flate::flate;
  6 | use ordered_float::OrderedFloat;
  7 | 
  8 | use super::{Keyword, KeywordExtract, KeywordExtractConfig, KeywordExtractConfigBuilder};
  9 | use crate::FxHashMap as HashMap;
 10 | use crate::Jieba;
 11 | 
 12 | flate!(static DEFAULT_IDF: str from "src/data/idf.txt");
 13 | 
 14 | #[derive(Debug, Clone, Eq, PartialEq)]
 15 | struct HeapNode<'a> {
 16 |     tfidf: OrderedFloat<f64>,
 17 |     word: &'a str,
 18 | }
 19 | 
 20 | impl Ord for HeapNode<'_> {
 21 |     fn cmp(&self, other: &HeapNode) -> Ordering {
 22 |         other.tfidf.cmp(&self.tfidf).then_with(|| self.word.cmp(other.word))
 23 |     }
 24 | }
 25 | 
 26 | impl PartialOrd for HeapNode<'_> {
 27 |     fn partial_cmp(&self, other: &HeapNode) -> Option<Ordering> {
 28 |         Some(self.cmp(other))
 29 |     }
 30 | }
 31 | 
 32 | /// TF-IDF keywords extraction
 33 | ///
 34 | /// Require `tfidf` feature to be enabled
 35 | #[derive(Debug)]
 36 | pub struct TfIdf {
 37 |     idf_dict: HashMap<String, f64>,
 38 |     median_idf: f64,
 39 |     config: KeywordExtractConfig,
 40 | }
 41 | 
 42 | /// Implementation of JiebaKeywordExtract using a TF-IDF dictionary.
 43 | ///
 44 | /// This takes the segments produced by Jieba and attempts to extract keywords.
 45 | /// Segments are filtered for stopwords and short terms. They are then matched
 46 | /// against a loaded dictionary to calculate TF-IDF scores.
 47 | impl TfIdf {
 48 |     /// Creates an TfIdf.
 49 |     ///
 50 |     /// # Examples
 51 |     ///
 52 |     /// New instance with custom idf dictionary.
 53 |     /// ```
 54 |     ///    use jieba_rs::{TfIdf, KeywordExtractConfig};
 55 |     ///
 56 |     ///    let mut sample_idf = "劳动防护 13.900677652\n\
 57 |     ///        生化学 13.900677652\n";
 58 |     ///    TfIdf::new(
 59 |     ///        Some(&mut sample_idf.as_bytes()),
 60 |     ///        KeywordExtractConfig::default());
 61 |     /// ```
 62 |     ///
 63 |     /// New instance with module default stop words and no initial IDF
 64 |     /// dictionary. Dictionary should be loaded later with `load_dict()` calls.
 65 |     /// ```
 66 |     ///    use jieba_rs::{TfIdf, KeywordExtractConfig};
 67 |     ///
 68 |     ///    TfIdf::new(
 69 |     ///        None::<&mut std::io::Empty>,
 70 |     ///        KeywordExtractConfig::default());
 71 |     /// ```
 72 |     pub fn new(opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig) -> Self {
 73 |         let mut instance = TfIdf {
 74 |             idf_dict: HashMap::default(),
 75 |             median_idf: 0.0,
 76 |             config,
 77 |         };
 78 |         if let Some(dict) = opt_dict {
 79 |             instance.load_dict(dict).unwrap();
 80 |         }
 81 |         instance
 82 |     }
 83 | 
 84 |     /// Merges entires from `dict` into the `idf_dict`.
 85 |     ///
 86 |     /// ```
 87 |     ///    use jieba_rs::{Jieba, KeywordExtract, Keyword, KeywordExtractConfig,
 88 |     ///        TfIdf};
 89 |     ///
 90 |     ///    let jieba = Jieba::default();
 91 |     ///    let mut init_idf = "生化学 13.900677652\n";
 92 |     ///
 93 |     ///    let mut tfidf = TfIdf::new(
 94 |     ///        Some(&mut init_idf.as_bytes()),
 95 |     ///        KeywordExtractConfig::default());
 96 |     ///    let top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]);
 97 |     ///    assert_eq!(
 98 |     ///        top_k,
 99 |     ///        vec![
100 |     ///            Keyword { keyword: "不是".to_string(), weight: 4.6335592173333335 },
101 |     ///            Keyword { keyword: "光化学".to_string(), weight: 4.6335592173333335 },
102 |     ///            Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 }
103 |     ///        ]
104 |     ///    );
105 |     ///
106 |     ///    let mut init_idf = "光化学 99.123456789\n";
107 |     ///    tfidf.load_dict(&mut init_idf.as_bytes());
108 |     ///    let new_top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]);
109 |     ///    assert_eq!(
110 |     ///        new_top_k,
111 |     ///        vec![
112 |     ///            Keyword { keyword: "不是".to_string(), weight: 33.041152263 },
113 |     ///            Keyword { keyword: "光化学".to_string(), weight: 33.041152263 },
114 |     ///            Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 }
115 |     ///        ]
116 |     ///    );
117 |     /// ```
118 |     pub fn load_dict(&mut self, dict: &mut impl BufRead) -> io::Result<()> {
119 |         let mut buf = String::new();
120 |         let mut idf_heap = BinaryHeap::new();
121 |         while dict.read_line(&mut buf)? > 0 {
122 |             let parts: Vec<&str> = buf.split_whitespace().collect();
123 |             if parts.is_empty() {
124 |                 continue;
125 |             }
126 | 
127 |             let word = parts[0];
128 |             if let Some(idf) = parts.get(1).and_then(|x| x.parse::<f64>().ok()) {
129 |                 self.idf_dict.insert(word.to_string(), idf);
130 |                 idf_heap.push(OrderedFloat(idf));
131 |             }
132 | 
133 |             buf.clear();
134 |         }
135 | 
136 |         let m = idf_heap.len() / 2;
137 |         for _ in 0..m {
138 |             idf_heap.pop();
139 |         }
140 | 
141 |         self.median_idf = idf_heap.pop().unwrap().into_inner();
142 | 
143 |         Ok(())
144 |     }
145 | 
146 |     pub fn config(&self) -> &KeywordExtractConfig {
147 |         &self.config
148 |     }
149 | 
150 |     pub fn config_mut(&mut self) -> &mut KeywordExtractConfig {
151 |         &mut self.config
152 |     }
153 | }
154 | 
155 | /// TF-IDF keywords extraction.
156 | ///
157 | /// Require `tfidf` feature to be enabled.
158 | impl Default for TfIdf {
159 |     /// Creates TfIdf with DEFAULT_STOP_WORDS, the default TfIdf dictionary,
160 |     /// 2 Unicode Scalar Value minimum for keywords, and no hmm in segmentation.
161 |     fn default() -> Self {
162 |         let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes());
163 |         TfIdf::new(
164 |             Some(&mut default_dict),
165 |             KeywordExtractConfigBuilder::default().build().unwrap(),
166 |         )
167 |     }
168 | }
169 | 
170 | impl KeywordExtract for TfIdf {
171 |     /// Uses TF-IDF algorithm to extract the `top_k` keywords from `sentence`.
172 |     ///
173 |     /// If `allowed_pos` is not empty, then only terms matching those parts if
174 |     /// speech are considered.
175 |     ///
176 |     /// # Examples
177 |     /// ```
178 |     ///    use jieba_rs::{Jieba, KeywordExtract, TfIdf};
179 |     ///
180 |     ///    let jieba = Jieba::new();
181 |     ///    let keyword_extractor = TfIdf::default();
182 |     ///    let mut top_k = keyword_extractor.extract_keywords(
183 |     ///        &jieba,
184 |     ///        "今天纽约的天气真好啊，京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好，昨天纽约的天气也不好，北京烤鸭真好吃",
185 |     ///        3,
186 |     ///        vec![],
187 |     ///    );
188 |     ///    assert_eq!(
189 |     ///        top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
190 |     ///        vec!["北京烤鸭", "纽约", "天气"]
191 |     ///    );
192 |     ///
193 |     ///    top_k = keyword_extractor.extract_keywords(
194 |     ///        &jieba,
195 |     ///        "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。",
196 |     ///        5,
197 |     ///        vec![],
198 |     ///    );
199 |     ///    assert_eq!(
200 |     ///        top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
201 |     ///        vec!["欧亚", "吉林", "置业", "万元", "增资"]
202 |     ///    );
203 |     ///
204 |     ///    top_k = keyword_extractor.extract_keywords(
205 |     ///        &jieba,
206 |     ///        "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。",
207 |     ///        5,
208 |     ///        vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
209 |     ///    );
210 |     ///    assert_eq!(
211 |     ///        top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(),
212 |     ///        vec!["欧亚", "吉林", "置业", "增资", "实现"]
213 |     ///    );
214 |     /// ```
215 |     fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword> {
216 |         let tags = jieba.tag(sentence, self.config.use_hmm());
217 |         let mut allowed_pos_set = BTreeSet::new();
218 | 
219 |         for s in allowed_pos {
220 |             allowed_pos_set.insert(s);
221 |         }
222 | 
223 |         let mut term_freq: HashMap<String, u64> = HashMap::default();
224 |         for t in &tags {
225 |             if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(t.tag) {
226 |                 continue;
227 |             }
228 | 
229 |             if !self.config.filter(t.word) {
230 |                 continue;
231 |             }
232 | 
233 |             let entry = term_freq.entry(String::from(t.word)).or_insert(0);
234 |             *entry += 1;
235 |         }
236 | 
237 |         let total: u64 = term_freq.values().sum();
238 |         let mut heap = BinaryHeap::new();
239 |         for (cnt, (k, tf)) in term_freq.iter().enumerate() {
240 |             let idf = self.idf_dict.get(k).unwrap_or(&self.median_idf);
241 |             let node = HeapNode {
242 |                 tfidf: OrderedFloat(*tf as f64 * idf / total as f64),
243 |                 word: k,
244 |             };
245 |             heap.push(node);
246 |             if cnt >= top_k {
247 |                 heap.pop();
248 |             }
249 |         }
250 | 
251 |         let mut res = Vec::new();
252 |         for _ in 0..top_k {
253 |             if let Some(w) = heap.pop() {
254 |                 res.push(Keyword {
255 |                     keyword: String::from(w.word),
256 |                     weight: w.tfidf.into_inner(),
257 |                 });
258 |             }
259 |         }
260 | 
261 |         res.reverse();
262 |         res
263 |     }
264 | }
265 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
   1 | //! The Jieba Chinese Word Segmentation Implemented in Rust
   2 | //!
   3 | //! ## Installation
   4 | //!
   5 | //! Add it to your `Cargo.toml`:
   6 | //!
   7 | //! ```toml
   8 | //! [dependencies]
   9 | //! jieba-rs = "0.7"
  10 | //! ```
  11 | //!
  12 | //! then you are good to go. If you are using Rust 2015 you have to ``extern crate jieba_rs`` to your crate root as well.
  13 | //!
  14 | //! ## Example
  15 | //!
  16 | //! ```rust
  17 | //! use jieba_rs::Jieba;
  18 | //!
  19 | //! let jieba = Jieba::new();
  20 | //! let words = jieba.cut("我们中出了一个叛徒", false);
  21 | //! assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
  22 | //! ```
  23 | //!
  24 | //! ```rust
  25 | //! # #[cfg(feature = "tfidf")] {
  26 | //! use jieba_rs::Jieba;
  27 | //! use jieba_rs::{TfIdf, KeywordExtract};
  28 | //!
  29 | //! fn main() {
  30 | //!     let jieba = Jieba::new();
  31 | //!     let keyword_extractor = TfIdf::default();
  32 | //!     let top_k = keyword_extractor.extract_keywords(
  33 | //!         &jieba,
  34 | //!         "今天纽约的天气真好啊，京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好，昨天纽约的天气也不好，北京烤鸭真好吃",
  35 | //!         3,
  36 | //!         vec![],
  37 | //!     );
  38 | //!     println!("{:?}", top_k);
  39 | //! }
  40 | //! # }
  41 | //! ```
  42 | //!
  43 | //! ```rust
  44 | //! # #[cfg(feature = "textrank")] {
  45 | //! use jieba_rs::Jieba;
  46 | //! use jieba_rs::{TextRank, KeywordExtract};
  47 | //!
  48 | //! fn main() {
  49 | //!     let jieba = Jieba::new();
  50 | //!     let keyword_extractor = TextRank::default();
  51 | //!     let top_k = keyword_extractor.extract_keywords(
  52 | //!         &jieba,
  53 | //!         "此外，公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元，增资后，吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年，实现营业收入0万元，实现净利润-139.13万元。",
  54 | //!         6,
  55 | //!         vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
  56 | //!     );
  57 | //!     println!("{:?}", top_k);
  58 | //! }
  59 | //! # }
  60 | //! ```
  61 | //!
  62 | //! ## Enabling Additional Features
  63 | //!
  64 | //! * `default-dict` feature enables embedded dictionary, this features is enabled by default
  65 | //! * `tfidf` feature enables TF-IDF keywords extractor
  66 | //! * `textrank` feature enables TextRank keywords extractor
  67 | //!
  68 | //! ```toml
  69 | //! [dependencies]
  70 | //! jieba-rs = { version = "0.7", features = ["tfidf", "textrank"] }
  71 | //! ```
  72 | //!
  73 | 
  74 | use include_flate::flate;
  75 | 
  76 | use std::cmp::Ordering;
  77 | use std::collections::HashMap;
  78 | use std::io::BufRead;
  79 | 
  80 | use cedarwood::Cedar;
  81 | use regex::{Match, Matches, Regex};
  82 | 
  83 | pub(crate) type FxHashMap<K, V> = HashMap<K, V, fxhash::FxBuildHasher>;
  84 | 
  85 | pub use crate::errors::Error;
  86 | #[cfg(feature = "textrank")]
  87 | pub use crate::keywords::textrank::TextRank;
  88 | #[cfg(feature = "tfidf")]
  89 | pub use crate::keywords::tfidf::TfIdf;
  90 | #[cfg(any(feature = "tfidf", feature = "textrank"))]
  91 | pub use crate::keywords::{Keyword, KeywordExtract, KeywordExtractConfig, DEFAULT_STOP_WORDS};
  92 | 
  93 | mod errors;
  94 | mod hmm;
  95 | #[cfg(any(feature = "tfidf", feature = "textrank"))]
  96 | mod keywords;
  97 | mod sparse_dag;
  98 | 
  99 | #[cfg(feature = "default-dict")]
 100 | flate!(static DEFAULT_DICT: str from "src/data/dict.txt");
 101 | 
 102 | use sparse_dag::StaticSparseDAG;
 103 | 
 104 | thread_local! {
 105 |     static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
 106 |     static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
 107 |     static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
 108 |     static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
 109 | }
 110 | 
 111 | struct SplitMatches<'r, 't> {
 112 |     finder: Matches<'r, 't>,
 113 |     text: &'t str,
 114 |     last: usize,
 115 |     matched: Option<Match<'t>>,
 116 | }
 117 | 
 118 | impl<'r, 't> SplitMatches<'r, 't> {
 119 |     #[inline]
 120 |     fn new(re: &'r Regex, text: &'t str) -> SplitMatches<'r, 't> {
 121 |         SplitMatches {
 122 |             finder: re.find_iter(text),
 123 |             text,
 124 |             last: 0,
 125 |             matched: None,
 126 |         }
 127 |     }
 128 | }
 129 | 
 130 | #[derive(Debug)]
 131 | pub(crate) enum SplitState<'t> {
 132 |     Unmatched(&'t str),
 133 |     Matched(Match<'t>),
 134 | }
 135 | 
 136 | impl<'t> SplitState<'t> {
 137 |     #[inline]
 138 |     fn into_str(self) -> &'t str {
 139 |         match self {
 140 |             SplitState::Unmatched(t) => t,
 141 |             SplitState::Matched(matched) => matched.as_str(),
 142 |         }
 143 |     }
 144 | }
 145 | 
 146 | impl<'t> Iterator for SplitMatches<'_, 't> {
 147 |     type Item = SplitState<'t>;
 148 | 
 149 |     fn next(&mut self) -> Option<SplitState<'t>> {
 150 |         if let Some(matched) = self.matched.take() {
 151 |             return Some(SplitState::Matched(matched));
 152 |         }
 153 |         match self.finder.next() {
 154 |             None => {
 155 |                 if self.last >= self.text.len() {
 156 |                     None
 157 |                 } else {
 158 |                     let s = &self.text[self.last..];
 159 |                     self.last = self.text.len();
 160 |                     Some(SplitState::Unmatched(s))
 161 |                 }
 162 |             }
 163 |             Some(m) => {
 164 |                 if self.last == m.start() {
 165 |                     self.last = m.end();
 166 |                     Some(SplitState::Matched(m))
 167 |                 } else {
 168 |                     let unmatched = &self.text[self.last..m.start()];
 169 |                     self.last = m.end();
 170 |                     self.matched = Some(m);
 171 |                     Some(SplitState::Unmatched(unmatched))
 172 |                 }
 173 |             }
 174 |         }
 175 |     }
 176 | }
 177 | 
 178 | #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 179 | pub enum TokenizeMode {
 180 |     /// Default mode
 181 |     Default,
 182 |     /// Search mode
 183 |     Search,
 184 | }
 185 | 
 186 | /// A Token
 187 | #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 188 | pub struct Token<'a> {
 189 |     /// Word of the token
 190 |     pub word: &'a str,
 191 |     /// Unicode start position of the token
 192 |     pub start: usize,
 193 |     /// Unicode end position of the token
 194 |     pub end: usize,
 195 | }
 196 | 
 197 | /// A tagged word
 198 | #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 199 | pub struct Tag<'a> {
 200 |     /// Word
 201 |     pub word: &'a str,
 202 |     /// Word tag
 203 |     pub tag: &'a str,
 204 | }
 205 | 
 206 | #[derive(Debug, Clone)]
 207 | struct Record {
 208 |     freq: usize,
 209 |     tag: String,
 210 | }
 211 | 
 212 | impl Record {
 213 |     #[inline(always)]
 214 |     fn new(freq: usize, tag: String) -> Self {
 215 |         Self { freq, tag }
 216 |     }
 217 | }
 218 | 
 219 | /// Jieba segmentation
 220 | #[derive(Debug, Clone)]
 221 | pub struct Jieba {
 222 |     records: Vec<Record>,
 223 |     cedar: Cedar,
 224 |     total: usize,
 225 | }
 226 | 
 227 | #[cfg(feature = "default-dict")]
 228 | impl Default for Jieba {
 229 |     fn default() -> Self {
 230 |         Jieba::new()
 231 |     }
 232 | }
 233 | 
 234 | impl Jieba {
 235 |     /// Create a new instance with empty dict
 236 |     pub fn empty() -> Self {
 237 |         Jieba {
 238 |             records: Vec::new(),
 239 |             cedar: Cedar::new(),
 240 |             total: 0,
 241 |         }
 242 |     }
 243 | 
 244 |     /// Create a new instance with embed dict
 245 |     ///
 246 |     /// Requires `default-dict` feature to be enabled.
 247 |     #[cfg(feature = "default-dict")]
 248 |     pub fn new() -> Self {
 249 |         let mut instance = Self::empty();
 250 |         instance.load_default_dict();
 251 |         instance
 252 |     }
 253 | 
 254 |     /// Create a new instance with dict
 255 |     pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
 256 |         let mut instance = Self::empty();
 257 |         instance.load_dict(dict)?;
 258 |         Ok(instance)
 259 |     }
 260 | 
 261 |     /// Loads the default dictionary into the instance.
 262 |     ///
 263 |     /// This method reads the default dictionary from a predefined byte slice (`DEFAULT_DICT`)
 264 |     /// and loads it into the current instance using the `load_dict` method.
 265 |     ///
 266 |     /// # Arguments
 267 |     ///
 268 |     /// * `&mut self` - Mutable reference to the current instance.
 269 |     ///
 270 |     /// Requires `default-dict` feature to be enabled.
 271 |     ///
 272 |     /// # Examples
 273 |     ///
 274 |     /// ```
 275 |     /// use jieba_rs::Jieba;
 276 |     ///
 277 |     /// let mut instance = Jieba::empty();
 278 |     /// instance.load_default_dict(); // Loads the default dictionary into the instance
 279 |     /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary");
 280 |     /// ```
 281 |     #[cfg(feature = "default-dict")]
 282 |     pub fn load_default_dict(&mut self) {
 283 |         use std::io::BufReader;
 284 | 
 285 |         let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
 286 |         self.load_dict(&mut default_dict).unwrap();
 287 |     }
 288 | 
 289 |     /// Clears all data
 290 |     ///
 291 |     /// This method performs the following actions:
 292 |     /// 1. Clears the `records` list, removing all entries.
 293 |     /// 2. Resets `cedar` to a new instance of `Cedar`.
 294 |     /// 3. Sets `total` to 0, resetting the count.
 295 |     ///
 296 |     /// # Arguments
 297 |     ///
 298 |     /// * `&mut self` - Mutable reference to the current instance.
 299 |     ///
 300 |     /// # Examples
 301 |     ///
 302 |     /// ```
 303 |     /// use jieba_rs::Jieba;
 304 |     ///
 305 |     /// let mut instance = Jieba::new();
 306 |     /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary");
 307 |     /// instance.clear(); // clear all dict data
 308 |     /// assert!(!instance.has_word("我们"), "The word '我们' should not be in the dictionary after clearing the dictionary");
 309 |     /// ```
 310 |     pub fn clear(&mut self) {
 311 |         self.records.clear();
 312 |         self.cedar = Cedar::new();
 313 |         self.total = 0;
 314 |     }
 315 | 
 316 |     /// Add word to dict, return `freq`
 317 |     ///
 318 |     /// `freq`: if `None`, will be given by [suggest_freq](#method.suggest_freq)
 319 |     ///
 320 |     /// `tag`: if `None`, will be given `""`
 321 |     pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
 322 |         if word.is_empty() {
 323 |             return 0;
 324 |         }
 325 |         let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
 326 |         let tag = tag.unwrap_or("");
 327 | 
 328 |         match self.cedar.exact_match_search(word) {
 329 |             Some((word_id, _, _)) => {
 330 |                 let old_freq = self.records[word_id as usize].freq;
 331 |                 self.records[word_id as usize].freq = freq;
 332 | 
 333 |                 self.total += freq;
 334 |                 self.total -= old_freq;
 335 |             }
 336 |             None => {
 337 |                 let word_id = self.records.len() as i32;
 338 |                 self.records.push(Record::new(freq, String::from(tag)));
 339 | 
 340 |                 self.cedar.update(word, word_id);
 341 |                 self.total += freq;
 342 |             }
 343 |         };
 344 | 
 345 |         freq
 346 |     }
 347 | 
 348 |     /// Checks if a word exists in the dictionary.
 349 |     ///
 350 |     /// # Arguments
 351 |     ///
 352 |     /// * `word` - The word to check.
 353 |     ///
 354 |     /// # Returns
 355 |     ///
 356 |     /// * `bool` - Whether the word exists in the dictionary.
 357 |     pub fn has_word(&self, word: &str) -> bool {
 358 |         self.cedar.exact_match_search(word).is_some()
 359 |     }
 360 | 
 361 |     /// Loads a dictionary by adding entries to the existing dictionary rather than resetting it.
 362 |     ///
 363 |     /// This function reads from a `BufRead` source, parsing each line as a dictionary entry. Each entry
 364 |     /// is expected to contain a word, its frequency, and optionally a tag.
 365 |     ///
 366 |     /// # Type Parameters
 367 |     ///
 368 |     /// * `R`: A type that implements the `BufRead` trait, used for reading lines from the dictionary.
 369 |     ///
 370 |     /// # Arguments
 371 |     ///
 372 |     /// * `dict` - A mutable reference to a `BufRead` source containing the dictionary entries.
 373 |     ///
 374 |     /// # Returns
 375 |     ///
 376 |     /// * `Result<(), Error>` - Returns `Ok(())` if the dictionary is successfully loaded; otherwise,
 377 |     ///   returns an error describing what went wrong.
 378 |     ///
 379 |     /// # Errors
 380 |     ///
 381 |     /// This function will return an error if:
 382 |     /// * There is an issue reading from the provided `BufRead` source.
 383 |     /// * A line in the dictionary file contains invalid frequency data (not a valid integer).
 384 |     pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
 385 |         let mut buf = String::new();
 386 |         self.total = 0;
 387 | 
 388 |         let mut line_no = 0;
 389 |         while dict.read_line(&mut buf)? > 0 {
 390 |             {
 391 |                 line_no += 1;
 392 |                 let mut iter = buf.split_whitespace();
 393 |                 if let Some(word) = iter.next() {
 394 |                     let freq = iter
 395 |                         .next()
 396 |                         .map(|x| {
 397 |                             x.parse::<usize>().map_err(|e| {
 398 |                                 Error::InvalidDictEntry(format!(
 399 |                                     "line {} `{}` frequency {} is not a valid integer: {}",
 400 |                                     line_no, buf, x, e
 401 |                                 ))
 402 |                             })
 403 |                         })
 404 |                         .unwrap_or(Ok(0))?;
 405 |                     let tag = iter.next().unwrap_or("");
 406 | 
 407 |                     match self.cedar.exact_match_search(word) {
 408 |                         Some((word_id, _, _)) => {
 409 |                             self.records[word_id as usize].freq = freq;
 410 |                         }
 411 |                         None => {
 412 |                             let word_id = self.records.len() as i32;
 413 |                             self.records.push(Record::new(freq, String::from(tag)));
 414 |                             self.cedar.update(word, word_id);
 415 |                         }
 416 |                     };
 417 |                 }
 418 |             }
 419 |             buf.clear();
 420 |         }
 421 |         self.total = self.records.iter().map(|n| n.freq).sum();
 422 | 
 423 |         Ok(())
 424 |     }
 425 | 
 426 |     fn get_word_freq(&self, word: &str, default: usize) -> usize {
 427 |         match self.cedar.exact_match_search(word) {
 428 |             Some((word_id, _, _)) => self.records[word_id as usize].freq,
 429 |             _ => default,
 430 |         }
 431 |     }
 432 | 
 433 |     /// Suggest word frequency to force the characters in a word to be joined or split.
 434 |     pub fn suggest_freq(&self, segment: &str) -> usize {
 435 |         let logtotal = (self.total as f64).ln();
 436 |         let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, word| {
 437 |             freq + (self.get_word_freq(word, 1) as f64).ln() - logtotal
 438 |         });
 439 |         std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
 440 |     }
 441 | 
 442 |     #[allow(clippy::ptr_arg)]
 443 |     fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
 444 |         let str_len = sentence.len();
 445 | 
 446 |         if str_len + 1 > route.len() {
 447 |             route.resize(str_len + 1, (0.0, 0));
 448 |         }
 449 | 
 450 |         let logtotal = (self.total as f64).ln();
 451 |         let mut prev_byte_start = str_len;
 452 |         let curr = sentence.char_indices().map(|x| x.0).rev();
 453 |         for byte_start in curr {
 454 |             let pair = dag
 455 |                 .iter_edges(byte_start)
 456 |                 .map(|byte_end| {
 457 |                     let wfrag = if byte_end == str_len {
 458 |                         &sentence[byte_start..]
 459 |                     } else {
 460 |                         &sentence[byte_start..byte_end]
 461 |                     };
 462 | 
 463 |                     let freq = if let Some((word_id, _, _)) = self.cedar.exact_match_search(wfrag) {
 464 |                         self.records[word_id as usize].freq
 465 |                     } else {
 466 |                         1
 467 |                     };
 468 | 
 469 |                     ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end)
 470 |                 })
 471 |                 .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));
 472 | 
 473 |             if let Some(p) = pair {
 474 |                 route[byte_start] = p;
 475 |             } else {
 476 |                 let byte_end = prev_byte_start;
 477 |                 let freq = 1;
 478 |                 route[byte_start] = ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end);
 479 |             }
 480 | 
 481 |             prev_byte_start = byte_start;
 482 |         }
 483 |     }
 484 | 
 485 |     fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
 486 |         for (byte_start, _) in sentence.char_indices().peekable() {
 487 |             dag.start(byte_start);
 488 |             let haystack = &sentence[byte_start..];
 489 | 
 490 |             for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
 491 |                 dag.insert(end_index + byte_start + 1);
 492 |             }
 493 | 
 494 |             dag.commit();
 495 |         }
 496 |     }
 497 | 
 498 |     fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
 499 |         let str_len = sentence.len();
 500 |         let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
 501 |         self.dag(sentence, &mut dag);
 502 | 
 503 |         let curr = sentence.char_indices().map(|x| x.0);
 504 |         for byte_start in curr {
 505 |             for byte_end in dag.iter_edges(byte_start) {
 506 |                 let word = if byte_end == str_len {
 507 |                     &sentence[byte_start..]
 508 |                 } else {
 509 |                     &sentence[byte_start..byte_end]
 510 |                 };
 511 | 
 512 |                 words.push(word)
 513 |             }
 514 |         }
 515 |     }
 516 | 
 517 |     fn cut_dag_no_hmm<'a>(
 518 |         &self,
 519 |         sentence: &'a str,
 520 |         words: &mut Vec<&'a str>,
 521 |         route: &mut Vec<(f64, usize)>,
 522 |         dag: &mut StaticSparseDAG,
 523 |     ) {
 524 |         self.dag(sentence, dag);
 525 |         self.calc(sentence, dag, route);
 526 |         let mut x = 0;
 527 |         let mut left: Option<usize> = None;
 528 | 
 529 |         while x < sentence.len() {
 530 |             let y = route[x].1;
 531 |             let l_str = if y < sentence.len() {
 532 |                 &sentence[x..y]
 533 |             } else {
 534 |                 &sentence[x..]
 535 |             };
 536 | 
 537 |             if l_str.chars().count() == 1 && l_str.chars().all(|ch| ch.is_ascii_alphanumeric()) {
 538 |                 if left.is_none() {
 539 |                     left = Some(x);
 540 |                 }
 541 |             } else {
 542 |                 if let Some(byte_start) = left {
 543 |                     let word = &sentence[byte_start..x];
 544 |                     words.push(word);
 545 |                     left = None;
 546 |                 }
 547 | 
 548 |                 let word = if y < sentence.len() {
 549 |                     &sentence[x..y]
 550 |                 } else {
 551 |                     &sentence[x..]
 552 |                 };
 553 | 
 554 |                 words.push(word);
 555 |             }
 556 |             x = y;
 557 |         }
 558 | 
 559 |         if let Some(byte_start) = left {
 560 |             let word = &sentence[byte_start..];
 561 |             words.push(word);
 562 |         }
 563 | 
 564 |         dag.clear();
 565 |         route.clear();
 566 |     }
 567 | 
 568 |     #[allow(non_snake_case, clippy::too_many_arguments)]
 569 |     fn cut_dag_hmm<'a>(
 570 |         &self,
 571 |         sentence: &'a str,
 572 |         words: &mut Vec<&'a str>,
 573 |         route: &mut Vec<(f64, usize)>,
 574 |         dag: &mut StaticSparseDAG,
 575 |         hmm_context: &mut hmm::HmmContext,
 576 |     ) {
 577 |         self.dag(sentence, dag);
 578 |         self.calc(sentence, dag, route);
 579 |         let mut x = 0;
 580 |         let mut left: Option<usize> = None;
 581 | 
 582 |         while x < sentence.len() {
 583 |             let y = route[x].1;
 584 | 
 585 |             if sentence[x..y].chars().count() == 1 {
 586 |                 if left.is_none() {
 587 |                     left = Some(x);
 588 |                 }
 589 |             } else {
 590 |                 if let Some(byte_start) = left {
 591 |                     let byte_end = x;
 592 |                     let word = if byte_end < sentence.len() {
 593 |                         &sentence[byte_start..byte_end]
 594 |                     } else {
 595 |                         &sentence[byte_start..]
 596 |                     };
 597 | 
 598 |                     if word.chars().count() == 1 {
 599 |                         words.push(word);
 600 |                     } else if self.cedar.exact_match_search(word).is_none() {
 601 |                         hmm::cut_with_allocated_memory(word, words, hmm_context);
 602 |                     } else {
 603 |                         let mut word_indices = word.char_indices().map(|x| x.0).peekable();
 604 |                         while let Some(byte_start) = word_indices.next() {
 605 |                             if let Some(byte_end) = word_indices.peek() {
 606 |                                 words.push(&word[byte_start..*byte_end]);
 607 |                             } else {
 608 |                                 words.push(&word[byte_start..]);
 609 |                             }
 610 |                         }
 611 |                     }
 612 |                     left = None;
 613 |                 }
 614 |                 let word = if y < sentence.len() {
 615 |                     &sentence[x..y]
 616 |                 } else {
 617 |                     &sentence[x..]
 618 |                 };
 619 |                 words.push(word);
 620 |             }
 621 |             x = y;
 622 |         }
 623 | 
 624 |         if let Some(byte_start) = left {
 625 |             let word = &sentence[byte_start..];
 626 | 
 627 |             if word.chars().count() == 1 {
 628 |                 words.push(word);
 629 |             } else if self.cedar.exact_match_search(word).is_none() {
 630 |                 hmm::cut(word, words);
 631 |             } else {
 632 |                 let mut word_indices = word.char_indices().map(|x| x.0).peekable();
 633 |                 while let Some(byte_start) = word_indices.next() {
 634 |                     if let Some(byte_end) = word_indices.peek() {
 635 |                         words.push(&word[byte_start..*byte_end]);
 636 |                     } else {
 637 |                         words.push(&word[byte_start..]);
 638 |                     }
 639 |                 }
 640 |             }
 641 |         }
 642 | 
 643 |         dag.clear();
 644 |         route.clear();
 645 |     }
 646 | 
 647 |     #[allow(non_snake_case)]
 648 |     fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
 649 |         let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
 650 |         let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };
 651 | 
 652 |         re_han.with(|re_han| {
 653 |             re_skip.with(|re_skip| {
 654 |                 let heuristic_capacity = sentence.len() / 2;
 655 |                 let mut words = Vec::with_capacity(heuristic_capacity);
 656 | 
 657 |                 let splitter = SplitMatches::new(re_han, sentence);
 658 |                 let mut route = Vec::with_capacity(heuristic_capacity);
 659 |                 let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);
 660 | 
 661 |                 let mut hmm_context = hmm::HmmContext::default();
 662 | 
 663 |                 for state in splitter {
 664 |                     match state {
 665 |                         SplitState::Matched(_) => {
 666 |                             let block = state.into_str();
 667 |                             assert!(!block.is_empty());
 668 | 
 669 |                             if cut_all {
 670 |                                 self.cut_all_internal(block, &mut words);
 671 |                             } else if hmm {
 672 |                                 self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
 673 |                             } else {
 674 |                                 self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
 675 |                             }
 676 |                         }
 677 |                         SplitState::Unmatched(_) => {
 678 |                             let block = state.into_str();
 679 |                             assert!(!block.is_empty());
 680 | 
 681 |                             let skip_splitter = SplitMatches::new(re_skip, block);
 682 |                             for skip_state in skip_splitter {
 683 |                                 let word = skip_state.into_str();
 684 |                                 if word.is_empty() {
 685 |                                     continue;
 686 |                                 }
 687 |                                 if cut_all || re_skip.is_match(word) {
 688 |                                     words.push(word);
 689 |                                 } else {
 690 |                                     let mut word_indices = word.char_indices().map(|x| x.0).peekable();
 691 |                                     while let Some(byte_start) = word_indices.next() {
 692 |                                         if let Some(byte_end) = word_indices.peek() {
 693 |                                             words.push(&word[byte_start..*byte_end]);
 694 |                                         } else {
 695 |                                             words.push(&word[byte_start..]);
 696 |                                         }
 697 |                                     }
 698 |                                 }
 699 |                             }
 700 |                         }
 701 |                     }
 702 |                 }
 703 |                 words
 704 |             })
 705 |         })
 706 |     }
 707 | 
 708 |     /// Cut the input text
 709 |     ///
 710 |     /// ## Params
 711 |     ///
 712 |     /// `sentence`: input text
 713 |     ///
 714 |     /// `hmm`: enable HMM or not
 715 |     pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
 716 |         self.cut_internal(sentence, false, hmm)
 717 |     }
 718 | 
 719 |     /// Cut the input text, return all possible words
 720 |     ///
 721 |     /// ## Params
 722 |     ///
 723 |     /// `sentence`: input text
 724 |     pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
 725 |         self.cut_internal(sentence, true, false)
 726 |     }
 727 | 
 728 |     /// Cut the input text in search mode
 729 |     ///
 730 |     /// ## Params
 731 |     ///
 732 |     /// `sentence`: input text
 733 |     ///
 734 |     /// `hmm`: enable HMM or not
 735 |     pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
 736 |         let words = self.cut(sentence, hmm);
 737 |         let mut new_words = Vec::with_capacity(words.len());
 738 |         for word in words {
 739 |             let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
 740 |             let char_count = char_indices.len();
 741 |             if char_count > 2 {
 742 |                 for i in 0..char_count - 1 {
 743 |                     let byte_start = char_indices[i];
 744 |                     let gram2 = if i + 2 < char_count {
 745 |                         &word[byte_start..char_indices[i + 2]]
 746 |                     } else {
 747 |                         &word[byte_start..]
 748 |                     };
 749 |                     if self.cedar.exact_match_search(gram2).is_some() {
 750 |                         new_words.push(gram2);
 751 |                     }
 752 |                 }
 753 |             }
 754 |             if char_count > 3 {
 755 |                 for i in 0..char_count - 2 {
 756 |                     let byte_start = char_indices[i];
 757 |                     let gram3 = if i + 3 < char_count {
 758 |                         &word[byte_start..char_indices[i + 3]]
 759 |                     } else {
 760 |                         &word[byte_start..]
 761 |                     };
 762 |                     if self.cedar.exact_match_search(gram3).is_some() {
 763 |                         new_words.push(gram3);
 764 |                     }
 765 |                 }
 766 |             }
 767 |             new_words.push(word);
 768 |         }
 769 |         new_words
 770 |     }
 771 | 
 772 |     /// Tokenize
 773 |     ///
 774 |     /// ## Params
 775 |     ///
 776 |     /// `sentence`: input text
 777 |     ///
 778 |     /// `mode`: tokenize mode
 779 |     ///
 780 |     /// `hmm`: enable HMM or not
 781 |     pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
 782 |         let words = self.cut(sentence, hmm);
 783 |         let mut tokens = Vec::with_capacity(words.len());
 784 |         let mut start = 0;
 785 |         match mode {
 786 |             TokenizeMode::Default => {
 787 |                 for word in words {
 788 |                     let width = word.chars().count();
 789 |                     tokens.push(Token {
 790 |                         word,
 791 |                         start,
 792 |                         end: start + width,
 793 |                     });
 794 |                     start += width;
 795 |                 }
 796 |             }
 797 |             TokenizeMode::Search => {
 798 |                 for word in words {
 799 |                     let width = word.chars().count();
 800 |                     if width > 2 {
 801 |                         let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
 802 |                         for i in 0..width - 1 {
 803 |                             let byte_start = char_indices[i];
 804 |                             let gram2 = if i + 2 < width {
 805 |                                 &word[byte_start..char_indices[i + 2]]
 806 |                             } else {
 807 |                                 &word[byte_start..]
 808 |                             };
 809 |                             if self.cedar.exact_match_search(gram2).is_some() {
 810 |                                 tokens.push(Token {
 811 |                                     word: gram2,
 812 |                                     start: start + i,
 813 |                                     end: start + i + 2,
 814 |                                 });
 815 |                             }
 816 |                         }
 817 |                         if width > 3 {
 818 |                             for i in 0..width - 2 {
 819 |                                 let byte_start = char_indices[i];
 820 |                                 let gram3 = if i + 3 < width {
 821 |                                     &word[byte_start..char_indices[i + 3]]
 822 |                                 } else {
 823 |                                     &word[byte_start..]
 824 |                                 };
 825 |                                 if self.cedar.exact_match_search(gram3).is_some() {
 826 |                                     tokens.push(Token {
 827 |                                         word: gram3,
 828 |                                         start: start + i,
 829 |                                         end: start + i + 3,
 830 |                                     });
 831 |                                 }
 832 |                             }
 833 |                         }
 834 |                     }
 835 |                     tokens.push(Token {
 836 |                         word,
 837 |                         start,
 838 |                         end: start + width,
 839 |                     });
 840 |                     start += width;
 841 |                 }
 842 |             }
 843 |         }
 844 |         tokens
 845 |     }
 846 | 
 847 |     /// Tag the input text
 848 |     ///
 849 |     /// ## Params
 850 |     ///
 851 |     /// `sentence`: input text
 852 |     ///
 853 |     /// `hmm`: enable HMM or not
 854 |     pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
 855 |         let words = self.cut(sentence, hmm);
 856 |         words
 857 |             .into_iter()
 858 |             .map(|word| {
 859 |                 if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
 860 |                     let t = &self.records[word_id as usize].tag;
 861 |                     return Tag { word, tag: t };
 862 |                 }
 863 |                 let mut eng = 0;
 864 |                 let mut m = 0;
 865 |                 for chr in word.chars() {
 866 |                     if chr.is_ascii_alphanumeric() {
 867 |                         eng += 1;
 868 |                         if chr.is_ascii_digit() {
 869 |                             m += 1;
 870 |                         }
 871 |                     }
 872 |                 }
 873 |                 let tag = if eng == 0 {
 874 |                     "x"
 875 |                 } else if eng == m {
 876 |                     "m"
 877 |                 } else {
 878 |                     "eng"
 879 |                 };
 880 |                 Tag { word, tag }
 881 |             })
 882 |             .collect()
 883 |     }
 884 | }
 885 | 
 886 | #[cfg(test)]
 887 | mod tests {
 888 |     use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, RE_HAN_DEFAULT};
 889 |     use std::io::BufReader;
 890 | 
 891 |     #[test]
 892 |     fn test_init_with_default_dict() {
 893 |         let _ = Jieba::new();
 894 |     }
 895 | 
 896 |     #[test]
 897 |     fn test_has_word() {
 898 |         let jieba = Jieba::new();
 899 |         assert!(jieba.has_word("中国"));
 900 |         assert!(jieba.has_word("开源"));
 901 |         assert!(!jieba.has_word("不存在的词"));
 902 |     }
 903 | 
 904 |     #[test]
 905 |     fn test_split_matches() {
 906 |         RE_HAN_DEFAULT.with(|re_han| {
 907 |             let splitter = SplitMatches::new(
 908 |                 re_han,
 909 |                 "👪 PS: 我觉得开源有一个好处，就是能够敦促自己不断改进 👪，避免敞帚自珍",
 910 |             );
 911 |             for state in splitter {
 912 |                 match state {
 913 |                     SplitState::Matched(_) => {
 914 |                         let block = state.into_str();
 915 |                         assert!(!block.is_empty());
 916 |                     }
 917 |                     SplitState::Unmatched(_) => {
 918 |                         let block = state.into_str();
 919 |                         assert!(!block.is_empty());
 920 |                     }
 921 |                 }
 922 |             }
 923 |         });
 924 |     }
 925 | 
 926 |     #[test]
 927 |     fn test_split_matches_against_unicode_sip() {
 928 |         RE_HAN_DEFAULT.with(|re_han| {
 929 |             let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");
 930 | 
 931 |             let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
 932 |             assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
 933 |         });
 934 |     }
 935 | 
 936 |     #[test]
 937 |     fn test_cut_all() {
 938 |         let jieba = Jieba::new();
 939 |         let words = jieba.cut_all("abc网球拍卖会def");
 940 |         assert_eq!(
 941 |             words,
 942 |             vec![
 943 |                 "abc",
 944 |                 "网",
 945 |                 "网球",
 946 |                 "网球拍",
 947 |                 "球",
 948 |                 "球拍",
 949 |                 "拍",
 950 |                 "拍卖",
 951 |                 "拍卖会",
 952 |                 "卖",
 953 |                 "会",
 954 |                 "def"
 955 |             ]
 956 |         );
 957 | 
 958 |         // The cut_all from the python de-facto implementation is loosely defined,
 959 |         // And the answer "我, 来到, 北京, 清华, 清华大学, 华大, 大学" from the python implementation looks weird since it drops the single character word even though it is part of the DAG candidates.
 960 |         // For example, it includes "华大" but it doesn't include "清" and "学"
 961 |         let words = jieba.cut_all("我来到北京清华大学");
 962 |         assert_eq!(
 963 |             words,
 964 |             vec![
 965 |                 "我",
 966 |                 "来",
 967 |                 "来到",
 968 |                 "到",
 969 |                 "北",
 970 |                 "北京",
 971 |                 "京",
 972 |                 "清",
 973 |                 "清华",
 974 |                 "清华大学",
 975 |                 "华",
 976 |                 "华大",
 977 |                 "大",
 978 |                 "大学",
 979 |                 "学"
 980 |             ]
 981 |         );
 982 |     }
 983 | 
 984 |     #[test]
 985 |     fn test_cut_no_hmm() {
 986 |         let jieba = Jieba::new();
 987 |         let words = jieba.cut("abc网球拍卖会def", false);
 988 |         assert_eq!(words, vec!["abc", "网球", "拍卖会", "def"]);
 989 |     }
 990 | 
 991 |     #[test]
 992 |     fn test_cut_with_hmm() {
 993 |         let jieba = Jieba::new();
 994 |         let words = jieba.cut("我们中出了一个叛徒", false);
 995 |         assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
 996 |         let words = jieba.cut("我们中出了一个叛徒", true);
 997 |         assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]);
 998 |         let words = jieba.cut("我们中出了一个叛徒👪", true);
 999 |         assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒", "👪"]);
1000 | 
1001 |         let words = jieba.cut("我来到北京清华大学", true);
1002 |         assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]);
1003 | 
1004 |         let words = jieba.cut("他来到了网易杭研大厦", true);
1005 |         assert_eq!(words, vec!["他", "来到", "了", "网易", "杭研", "大厦"]);
1006 |     }
1007 | 
1008 |     #[test]
1009 |     fn test_cut_weicheng() {
1010 |         static WEICHENG_TXT: &str = include_str!("../examples/weicheng/src/weicheng.txt");
1011 |         let jieba = Jieba::new();
1012 |         for line in WEICHENG_TXT.split('\n') {
1013 |             let _ = jieba.cut(line, true);
1014 |         }
1015 |     }
1016 | 
1017 |     #[test]
1018 |     fn test_cut_for_search() {
1019 |         let jieba = Jieba::new();
1020 |         let words = jieba.cut_for_search("南京市长江大桥", true);
1021 |         assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);
1022 | 
1023 |         let words = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造", true);
1024 | 
1025 |         // The python implementation silently filtered "，". but we includes it here in the output
1026 |         // to let the library user to decide their own filtering strategy
1027 |         assert_eq!(
1028 |             words,
1029 |             vec![
1030 |                 "小明",
1031 |                 "硕士",
1032 |                 "毕业",
1033 |                 "于",
1034 |                 "中国",
1035 |                 "科学",
1036 |                 "学院",
1037 |                 "科学院",
1038 |                 "中国科学院",
1039 |                 "计算",
1040 |                 "计算所",
1041 |                 "，",
1042 |                 "后",
1043 |                 "在",
1044 |                 "日本",
1045 |                 "京都",
1046 |                 "大学",
1047 |                 "日本京都大学",
1048 |                 "深造"
1049 |             ]
1050 |         );
1051 |     }
1052 | 
1053 |     #[test]
1054 |     fn test_tag() {
1055 |         let jieba = Jieba::new();
1056 |         let tags = jieba.tag(
1057 |             "我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。",
1058 |             true,
1059 |         );
1060 |         assert_eq!(
1061 |             tags,
1062 |             vec![
1063 |                 Tag { word: "我", tag: "r" },
1064 |                 Tag { word: "是", tag: "v" },
1065 |                 Tag {
1066 |                     word: "拖拉机",
1067 |                     tag: "n"
1068 |                 },
1069 |                 Tag {
1070 |                     word: "学院", tag: "n"
1071 |                 },
1072 |                 Tag {
1073 |                     word: "手扶拖拉机",
1074 |                     tag: "n"
1075 |                 },
1076 |                 Tag {
1077 |                     word: "专业", tag: "n"
1078 |                 },
1079 |                 Tag { word: "的", tag: "uj" },
1080 |                 Tag { word: "。", tag: "x" },
1081 |                 Tag {
1082 |                     word: "不用", tag: "v"
1083 |                 },
1084 |                 Tag {
1085 |                     word: "多久", tag: "m"
1086 |                 },
1087 |                 Tag { word: "，", tag: "x" },
1088 |                 Tag { word: "我", tag: "r" },
1089 |                 Tag { word: "就", tag: "d" },
1090 |                 Tag { word: "会", tag: "v" },
1091 |                 Tag {
1092 |                     word: "升职", tag: "v"
1093 |                 },
1094 |                 Tag {
1095 |                     word: "加薪",
1096 |                     tag: "nr"
1097 |                 },
1098 |                 Tag { word: "，", tag: "x" },
1099 |                 Tag {
1100 |                     word: "当上", tag: "t"
1101 |                 },
1102 |                 Tag {
1103 |                     word: "CEO",
1104 |                     tag: "eng"
1105 |                 },
1106 |                 Tag { word: "，", tag: "x" },
1107 |                 Tag {
1108 |                     word: "走上", tag: "v"
1109 |                 },
1110 |                 Tag {
1111 |                     word: "人生", tag: "n"
1112 |                 },
1113 |                 Tag {
1114 |                     word: "巅峰", tag: "n"
1115 |                 },
1116 |                 Tag { word: "。", tag: "x" }
1117 |             ]
1118 |         );
1119 | 
1120 |         let tags = jieba.tag("今天纽约的天气真好啊，京华大酒店的张尧经理吃了一只北京烤鸭。", true);
1121 |         assert_eq!(
1122 |             tags,
1123 |             vec![
1124 |                 Tag {
1125 |                     word: "今天", tag: "t"
1126 |                 },
1127 |                 Tag {
1128 |                     word: "纽约",
1129 |                     tag: "ns"
1130 |                 },
1131 |                 Tag { word: "的", tag: "uj" },
1132 |                 Tag {
1133 |                     word: "天气", tag: "n"
1134 |                 },
1135 |                 Tag {
1136 |                     word: "真好", tag: "d"
1137 |                 },
1138 |                 Tag { word: "啊", tag: "zg" },
1139 |                 Tag { word: "，", tag: "x" },
1140 |                 Tag {
1141 |                     word: "京华",
1142 |                     tag: "nz"
1143 |                 },
1144 |                 Tag {
1145 |                     word: "大酒店",
1146 |                     tag: "n"
1147 |                 },
1148 |                 Tag { word: "的", tag: "uj" },
1149 |                 Tag {
1150 |                     word: "张尧", tag: "x"
1151 |                 }, // XXX: missing in dict
1152 |                 Tag {
1153 |                     word: "经理", tag: "n"
1154 |                 },
1155 |                 Tag { word: "吃", tag: "v" },
1156 |                 Tag { word: "了", tag: "ul" },
1157 |                 Tag {
1158 |                     word: "一只", tag: "m"
1159 |                 },
1160 |                 Tag {
1161 |                     word: "北京烤鸭",
1162 |                     tag: "n"
1163 |                 },
1164 |                 Tag { word: "。", tag: "x" }
1165 |             ]
1166 |         );
1167 |     }
1168 | 
1169 |     #[test]
1170 |     fn test_tokenize() {
1171 |         let jieba = Jieba::new();
1172 |         let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
1173 |         assert_eq!(
1174 |             tokens,
1175 |             vec![
1176 |                 Token {
1177 |                     word: "南京市",
1178 |                     start: 0,
1179 |                     end: 3
1180 |                 },
1181 |                 Token {
1182 |                     word: "长江大桥",
1183 |                     start: 3,
1184 |                     end: 7
1185 |                 }
1186 |             ]
1187 |         );
1188 | 
1189 |         let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
1190 |         assert_eq!(
1191 |             tokens,
1192 |             vec![
1193 |                 Token {
1194 |                     word: "南京",
1195 |                     start: 0,
1196 |                     end: 2
1197 |                 },
1198 |                 Token {
1199 |                     word: "京市",
1200 |                     start: 1,
1201 |                     end: 3
1202 |                 },
1203 |                 Token {
1204 |                     word: "南京市",
1205 |                     start: 0,
1206 |                     end: 3
1207 |                 },
1208 |                 Token {
1209 |                     word: "长江",
1210 |                     start: 3,
1211 |                     end: 5
1212 |                 },
1213 |                 Token {
1214 |                     word: "大桥",
1215 |                     start: 5,
1216 |                     end: 7
1217 |                 },
1218 |                 Token {
1219 |                     word: "长江大桥",
1220 |                     start: 3,
1221 |                     end: 7
1222 |                 }
1223 |             ]
1224 |         );
1225 | 
1226 |         let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1227 |         assert_eq!(
1228 |             tokens,
1229 |             vec![
1230 |                 Token {
1231 |                     word: "我们",
1232 |                     start: 0,
1233 |                     end: 2
1234 |                 },
1235 |                 Token {
1236 |                     word: "中",
1237 |                     start: 2,
1238 |                     end: 3
1239 |                 },
1240 |                 Token {
1241 |                     word: "出",
1242 |                     start: 3,
1243 |                     end: 4
1244 |                 },
1245 |                 Token {
1246 |                     word: "了",
1247 |                     start: 4,
1248 |                     end: 5
1249 |                 },
1250 |                 Token {
1251 |                     word: "一个",
1252 |                     start: 5,
1253 |                     end: 7
1254 |                 },
1255 |                 Token {
1256 |                     word: "叛徒",
1257 |                     start: 7,
1258 |                     end: 9
1259 |                 }
1260 |             ]
1261 |         );
1262 |         let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1263 |         assert_eq!(
1264 |             tokens,
1265 |             vec![
1266 |                 Token {
1267 |                     word: "我们",
1268 |                     start: 0,
1269 |                     end: 2
1270 |                 },
1271 |                 Token {
1272 |                     word: "中出",
1273 |                     start: 2,
1274 |                     end: 4
1275 |                 },
1276 |                 Token {
1277 |                     word: "了",
1278 |                     start: 4,
1279 |                     end: 5
1280 |                 },
1281 |                 Token {
1282 |                     word: "一个",
1283 |                     start: 5,
1284 |                     end: 7
1285 |                 },
1286 |                 Token {
1287 |                     word: "叛徒",
1288 |                     start: 7,
1289 |                     end: 9
1290 |                 }
1291 |             ]
1292 |         );
1293 | 
1294 |         let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
1295 |         assert_eq!(
1296 |             tokens,
1297 |             vec![
1298 |                 Token {
1299 |                     word: "永和",
1300 |                     start: 0,
1301 |                     end: 2
1302 |                 },
1303 |                 Token {
1304 |                     word: "服装",
1305 |                     start: 2,
1306 |                     end: 4
1307 |                 },
1308 |                 Token {
1309 |                     word: "饰品",
1310 |                     start: 4,
1311 |                     end: 6
1312 |                 },
1313 |                 Token {
1314 |                     word: "有限公司",
1315 |                     start: 6,
1316 |                     end: 10
1317 |                 }
1318 |             ]
1319 |         );
1320 |     }
1321 | 
1322 |     #[test]
1323 |     fn test_userdict() {
1324 |         let mut jieba = Jieba::new();
1325 |         let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1326 |         assert_eq!(
1327 |             tokens,
1328 |             vec![
1329 |                 Token {
1330 |                     word: "我们",
1331 |                     start: 0,
1332 |                     end: 2
1333 |                 },
1334 |                 Token {
1335 |                     word: "中",
1336 |                     start: 2,
1337 |                     end: 3
1338 |                 },
1339 |                 Token {
1340 |                     word: "出",
1341 |                     start: 3,
1342 |                     end: 4
1343 |                 },
1344 |                 Token {
1345 |                     word: "了",
1346 |                     start: 4,
1347 |                     end: 5
1348 |                 },
1349 |                 Token {
1350 |                     word: "一个",
1351 |                     start: 5,
1352 |                     end: 7
1353 |                 },
1354 |                 Token {
1355 |                     word: "叛徒",
1356 |                     start: 7,
1357 |                     end: 9
1358 |                 }
1359 |             ]
1360 |         );
1361 |         let userdict = "中出 10000";
1362 |         jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1363 |         let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
1364 |         assert_eq!(
1365 |             tokens,
1366 |             vec![
1367 |                 Token {
1368 |                     word: "我们",
1369 |                     start: 0,
1370 |                     end: 2
1371 |                 },
1372 |                 Token {
1373 |                     word: "中出",
1374 |                     start: 2,
1375 |                     end: 4
1376 |                 },
1377 |                 Token {
1378 |                     word: "了",
1379 |                     start: 4,
1380 |                     end: 5
1381 |                 },
1382 |                 Token {
1383 |                     word: "一个",
1384 |                     start: 5,
1385 |                     end: 7
1386 |                 },
1387 |                 Token {
1388 |                     word: "叛徒",
1389 |                     start: 7,
1390 |                     end: 9
1391 |                 }
1392 |             ]
1393 |         );
1394 |     }
1395 | 
1396 |     #[test]
1397 |     fn test_userdict_hmm() {
1398 |         let mut jieba = Jieba::new();
1399 |         let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1400 |         assert_eq!(
1401 |             tokens,
1402 |             vec![
1403 |                 Token {
1404 |                     word: "我们",
1405 |                     start: 0,
1406 |                     end: 2
1407 |                 },
1408 |                 Token {
1409 |                     word: "中出",
1410 |                     start: 2,
1411 |                     end: 4
1412 |                 },
1413 |                 Token {
1414 |                     word: "了",
1415 |                     start: 4,
1416 |                     end: 5
1417 |                 },
1418 |                 Token {
1419 |                     word: "一个",
1420 |                     start: 5,
1421 |                     end: 7
1422 |                 },
1423 |                 Token {
1424 |                     word: "叛徒",
1425 |                     start: 7,
1426 |                     end: 9
1427 |                 }
1428 |             ]
1429 |         );
1430 |         let userdict = "出了 10000";
1431 |         jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1432 |         let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
1433 |         assert_eq!(
1434 |             tokens,
1435 |             vec![
1436 |                 Token {
1437 |                     word: "我们",
1438 |                     start: 0,
1439 |                     end: 2
1440 |                 },
1441 |                 Token {
1442 |                     word: "中",
1443 |                     start: 2,
1444 |                     end: 3
1445 |                 },
1446 |                 Token {
1447 |                     word: "出了",
1448 |                     start: 3,
1449 |                     end: 5
1450 |                 },
1451 |                 Token {
1452 |                     word: "一个",
1453 |                     start: 5,
1454 |                     end: 7
1455 |                 },
1456 |                 Token {
1457 |                     word: "叛徒",
1458 |                     start: 7,
1459 |                     end: 9
1460 |                 }
1461 |             ]
1462 |         );
1463 |     }
1464 | 
1465 |     #[test]
1466 |     fn test_userdict_error() {
1467 |         let mut jieba = Jieba::empty();
1468 |         let userdict = "出了 not_a_int";
1469 |         let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
1470 |         assert!(ret.is_err());
1471 |     }
1472 | 
1473 |     #[test]
1474 |     fn test_suggest_freq() {
1475 |         // NOTE: Following behaviors are aligned with original Jieba
1476 | 
1477 |         let mut jieba = Jieba::new();
1478 |         // These values were calculated by original Jieba
1479 |         assert_eq!(jieba.suggest_freq("中出"), 348);
1480 |         assert_eq!(jieba.suggest_freq("出了"), 1263);
1481 | 
1482 |         // Freq in dict.txt was 3, which became 300 after loading user dict
1483 |         let userdict = "中出 300";
1484 |         jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1485 |         // But it's less than calculated freq 348
1486 |         assert_eq!(jieba.suggest_freq("中出"), 348);
1487 | 
1488 |         let userdict = "中出 500";
1489 |         jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
1490 |         // Now it's significant enough
1491 |         assert_eq!(jieba.suggest_freq("中出"), 500)
1492 |     }
1493 | 
1494 |     #[test]
1495 |     fn test_custom_lower_freq() {
1496 |         let mut jieba = Jieba::new();
1497 | 
1498 |         jieba.add_word("测试", Some(2445), None);
1499 |         jieba.add_word("测试", Some(10), None);
1500 |         let words = jieba.cut("测试", false);
1501 |         assert_eq!(words, vec!["测试"]);
1502 |     }
1503 | 
1504 |     #[test]
1505 |     fn test_cut_dag_no_hmm_against_string_with_sip() {
1506 |         let mut jieba = Jieba::empty();
1507 | 
1508 |         //add fake word into dictionary
1509 |         jieba.add_word("䶴䶵𦡦", Some(1000), None);
1510 |         jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);
1511 | 
1512 |         let words = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
1513 |         assert_eq!(words, vec!["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]);
1514 |     }
1515 | 
1516 |     #[test]
1517 |     fn test_add_custom_word_with_underscrore() {
1518 |         let mut jieba = Jieba::empty();
1519 |         jieba.add_word("田-女士", Some(42), Some("n"));
1520 |         let words = jieba.cut("市民田-女士急匆匆", false);
1521 |         assert_eq!(words, vec!["市", "民", "田-女士", "急", "匆", "匆"]);
1522 |     }
1523 | }
1524 | 


--------------------------------------------------------------------------------
/src/sparse_dag.rs:
--------------------------------------------------------------------------------
  1 | use crate::FxHashMap as HashMap;
  2 | 
  3 | pub(crate) struct StaticSparseDAG {
  4 |     array: Vec<usize>,
  5 |     start_pos: HashMap<usize, usize>,
  6 |     size_hint_for_iterator: usize,
  7 |     curr_insertion_len: usize,
  8 | }
  9 | 
 10 | pub struct EdgeIter<'a> {
 11 |     dag: &'a StaticSparseDAG,
 12 |     cursor: usize,
 13 | }
 14 | 
 15 | impl Iterator for EdgeIter<'_> {
 16 |     type Item = usize;
 17 | 
 18 |     fn size_hint(&self) -> (usize, Option<usize>) {
 19 |         (0, Some(self.dag.size_hint_for_iterator))
 20 |     }
 21 | 
 22 |     fn next(&mut self) -> Option<Self::Item> {
 23 |         if self.dag.array[self.cursor] == 0 {
 24 |             self.cursor += 1;
 25 |             None
 26 |         } else {
 27 |             let v = self.dag.array[self.cursor] - 1;
 28 |             self.cursor += 1;
 29 |             Some(v)
 30 |         }
 31 |     }
 32 | }
 33 | 
 34 | impl StaticSparseDAG {
 35 |     pub(crate) fn with_size_hint(hint: usize) -> Self {
 36 |         StaticSparseDAG {
 37 |             array: Vec::with_capacity(hint * 5),
 38 |             start_pos: HashMap::default(),
 39 |             size_hint_for_iterator: 0,
 40 |             curr_insertion_len: 0,
 41 |         }
 42 |     }
 43 | 
 44 |     #[inline]
 45 |     pub(crate) fn start(&mut self, from: usize) {
 46 |         let idx = self.array.len();
 47 |         self.curr_insertion_len = 0;
 48 |         self.start_pos.insert(from, idx);
 49 |     }
 50 | 
 51 |     #[inline]
 52 |     pub(crate) fn insert(&mut self, to: usize) {
 53 |         self.curr_insertion_len += 1;
 54 |         self.array.push(to + 1);
 55 |     }
 56 | 
 57 |     #[inline]
 58 |     pub(crate) fn commit(&mut self) {
 59 |         self.size_hint_for_iterator = std::cmp::max(self.curr_insertion_len, self.size_hint_for_iterator);
 60 |         self.array.push(0);
 61 |     }
 62 | 
 63 |     #[inline]
 64 |     pub(crate) fn iter_edges(&self, from: usize) -> EdgeIter {
 65 |         let cursor = self.start_pos.get(&from).unwrap().to_owned();
 66 | 
 67 |         EdgeIter { dag: self, cursor }
 68 |     }
 69 | 
 70 |     pub(crate) fn clear(&mut self) {
 71 |         self.array.clear();
 72 |         self.start_pos.clear();
 73 |     }
 74 | }
 75 | 
 76 | #[cfg(test)]
 77 | mod tests {
 78 |     use super::*;
 79 | 
 80 |     #[test]
 81 |     fn test_static_sparse_dag() {
 82 |         let mut dag = StaticSparseDAG::with_size_hint(5);
 83 |         let mut ans: Vec<Vec<usize>> = vec![Vec::new(); 5];
 84 |         for i in 0..=3 {
 85 |             dag.start(i);
 86 |             for j in (i + 1)..=4 {
 87 |                 ans[i].push(j);
 88 |                 dag.insert(j);
 89 |             }
 90 | 
 91 |             dag.commit()
 92 |         }
 93 | 
 94 |         assert_eq!(dag.size_hint_for_iterator, 4);
 95 | 
 96 |         for i in 0..=3 {
 97 |             let edges: Vec<usize> = dag.iter_edges(i).collect();
 98 |             assert_eq!(ans[i], edges);
 99 |         }
100 |     }
101 | }
102 | 


--------------------------------------------------------------------------------
/tests/test_wasm.rs:
--------------------------------------------------------------------------------
 1 | #![cfg(target_arch = "wasm32")]
 2 | 
 3 | use jieba_rs::Jieba;
 4 | use wasm_bindgen_test::*;
 5 | 
 6 | #[wasm_bindgen_test]
 7 | fn test_jieba_cut() {
 8 |     let jieba = Jieba::new();
 9 |     let words = jieba.cut("我们中出了一个叛徒", false);
10 |     assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
11 | }
12 | 


--------------------------------------------------------------------------------