├── .github ├── FUNDING.yml └── workflows │ ├── CI.yml │ └── bench.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── jieba_benchmark.rs ├── capi ├── Cargo.toml └── src │ └── lib.rs ├── examples └── weicheng │ ├── Cargo.toml │ └── src │ ├── main.rs │ └── weicheng.txt ├── jieba-macros ├── Cargo.toml └── src │ ├── hmm.model │ └── lib.rs ├── rustfmt.toml ├── src ├── data │ ├── dict.txt │ └── idf.txt ├── errors.rs ├── hmm.rs ├── keywords │ ├── mod.rs │ ├── textrank.rs │ └── tfidf.rs ├── lib.rs └── sparse_dag.rs └── tests └── test_wasm.rs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: messense 2 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | 7 | name: CI 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | check: 15 | name: Check 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: dtolnay/rust-toolchain@stable 20 | - run: cargo check --all-features 21 | 22 | test: 23 | name: Test Suite 24 | runs-on: ${{ matrix.os }} 25 | strategy: 26 | matrix: 27 | os: [ubuntu-latest, macos-latest, windows-latest] 28 | steps: 29 | - uses: actions/checkout@v4 30 | - uses: dtolnay/rust-toolchain@stable 31 | - name: Cache cargo build 32 | uses: Swatinem/rust-cache@v2 33 | - name: Check build with --no-default-features 34 | run: cargo build --no-default-features 35 | - name: Check build with default features 36 | run: cargo build 37 | - name: Check build with tfidf feature 38 | run: cargo build --features tfidf 39 | - name: Check build with textrank feature 40 | run: cargo build --features textrank 41 | - name: Test 42 | run: cargo test --all-features --all --benches 43 | 44 | codecov: 45 | name: Code Coverage 46 | runs-on: ubuntu-latest 47 | steps: 48 | - uses: actions/checkout@v4 49 | # Nighty needed for --doctests support. See https://github.com/taiki-e/cargo-llvm-cov/issues/2 50 | - uses: dtolnay/rust-toolchain@nightly 51 | - name: Cache cargo build 52 | uses: Swatinem/rust-cache@v2 53 | - name: Install cargo-llvm-cov 54 | uses: taiki-e/install-action@cargo-llvm-cov 55 | - name: Generate code coverage 56 | run: cargo llvm-cov --all-features --workspace --lcov --doctests --output-path lcov.info 57 | - name: Upload coverage to Codecov 58 | uses: codecov/codecov-action@v4 59 | with: 60 | files: lcov.info 61 | fail_ci_if_error: true 62 | env: 63 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 64 | 65 | fmt: 66 | name: Rustfmt 67 | runs-on: ubuntu-latest 68 | steps: 69 | - uses: actions/checkout@v4 70 | - uses: dtolnay/rust-toolchain@stable 71 | with: 72 | components: rustfmt 73 | - run: cargo fmt --all -- --check 74 | -------------------------------------------------------------------------------- /.github/workflows/bench.yml: -------------------------------------------------------------------------------- 1 | name: benches 2 | 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | pull_request: 8 | # `workflow_dispatch` allows CodSpeed to trigger backtest 9 | # performance analysis in order to generate initial data. 
10 | workflow_dispatch: 11 | 12 | concurrency: 13 | group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.event.pull_request.number || github.sha }}-benches 14 | cancel-in-progress: true 15 | 16 | jobs: 17 | benchmarks: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v4 21 | - uses: dtolnay/rust-toolchain@stable 22 | with: 23 | components: rust-src 24 | 25 | - uses: Swatinem/rust-cache@v2 26 | continue-on-error: true 27 | 28 | - name: Install cargo-codspeed 29 | run: cargo install cargo-codspeed 30 | 31 | - name: Build the benchmark target(s) 32 | run: cargo codspeed build --features tfidf,textrank 33 | 34 | - name: Run the benchmarks 35 | uses: CodSpeedHQ/action@v3 36 | with: 37 | run: cargo codspeed run 38 | token: ${{ secrets.CODSPEED_TOKEN }} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target 3 | Cargo.lock 4 | **/*.rs.bk 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jieba-rs" 3 | version = "0.7.3" 4 | authors = ["messense ", "Paul Meng "] 5 | categories = ["text-processing"] 6 | description = "The Jieba Chinese Word Segmentation Implemented in Rust" 7 | keywords = ["nlp", "chinese", "segmenation"] 8 | license = "MIT" 9 | readme = "README.md" 10 | repository = "https://github.com/messense/jieba-rs" 11 | edition = '2021' 12 | 13 | [package.metadata.docs.rs] 14 | all-features = true 15 | 16 | [dev-dependencies] 17 | codspeed-criterion-compat = { workspace = true } 18 | rand = { workspace = true } 19 | wasm-bindgen-test = { workspace = true } 20 | rayon = { workspace = true } 21 | 22 | [target.'cfg(unix)'.dev-dependencies] 23 | jemallocator = "0.5.0" 24 | 25 | [[bench]] 26 | name = "jieba_benchmark" 27 | harness = false 28 | required-features = ["tfidf", "textrank"] 29 | 30 | [dependencies] 31 | jieba-macros = { workspace = true } 32 | cedarwood = { workspace = true } 33 | derive_builder = { workspace = true, optional = true } 34 | fxhash = { workspace = true } 35 | include-flate = { workspace = true } 36 | lazy_static = { workspace = true } 37 | ordered-float = { workspace = true, optional = true } 38 | phf = { workspace = true } 39 | regex = { workspace = true } 40 | 41 | [features] 42 | default = ["default-dict"] 43 | default-dict = [] 44 | tfidf = ["dep:ordered-float", "dep:derive_builder"] 45 | textrank = ["dep:ordered-float", "dep:derive_builder"] 46 | 47 | [workspace] 48 | members = [".", "capi", "jieba-macros", "examples/weicheng"] 49 | 50 | [workspace.dependencies] 51 | c_fixed_string = "0.2.0" 52 | cedarwood = "0.4" 53 | codspeed-criterion-compat = "2.4.1" 54 | derive_builder = "0.20.0" 55 | fxhash = "0.2.1" 56 | include-flate = "0.3.0" 57 | jieba-macros = { version = "0.7.1", path = "jieba-macros" } 58 | lazy_static = "1.0" 59 | ordered-float = "4.0" 60 | phf = "0.11" 61 | phf_codegen = "0.11" 62 | rand = "0.8" 63 | rayon = "1.10" 64 | regex = "1.0" 65 | wasm-bindgen-test = "0.3.0" 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 - 2019 messense 4 | Copyright (c) 2019 Paul Meng 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software 
and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jieba-rs 2 | 3 | [![GitHub Actions](https://github.com/messense/jieba-rs/workflows/CI/badge.svg)](https://github.com/messense/jieba-rs/actions?query=workflow%3ACI) 4 | [![codecov](https://codecov.io/gh/messense/jieba-rs/branch/master/graph/badge.svg)](https://codecov.io/gh/messense/jieba-rs) 5 | [![Crates.io](https://img.shields.io/crates/v/jieba-rs.svg)](https://crates.io/crates/jieba-rs) 6 | [![docs.rs](https://docs.rs/jieba-rs/badge.svg)](https://docs.rs/jieba-rs/) 7 | 8 | > 🚀 Help me to become a full-time open-source developer by [sponsoring me on GitHub](https://github.com/sponsors/messense) 9 | 10 | The Jieba Chinese Word Segmentation Implemented in Rust 11 | 12 | ## Installation 13 | 14 | Add it to your ``Cargo.toml``: 15 | 16 | ```toml 17 | [dependencies] 18 | jieba-rs = "0.7" 19 | ``` 20 | 21 | then you are good to go. If you are using Rust 2015 you have to ``extern crate jieba_rs`` to your crate root as well. 
22 | 23 | ## Example 24 | 25 | ```rust 26 | use jieba_rs::Jieba; 27 | 28 | fn main() { 29 | let jieba = Jieba::new(); 30 | let words = jieba.cut("我们中出了一个叛徒", false); 31 | assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]); 32 | } 33 | ``` 34 | 35 | ## Enabling Additional Features 36 | 37 | * `default-dict` feature enables the embedded dictionary; this feature is enabled by default 38 | * `tfidf` feature enables the TF-IDF keyword extractor 39 | * `textrank` feature enables the TextRank keyword extractor 40 | 41 | ```toml 42 | [dependencies] 43 | jieba-rs = { version = "0.7", features = ["tfidf", "textrank"] } 44 | ``` 45 | 46 | ## Run benchmark 47 | 48 | ```bash 49 | cargo bench --all-features 50 | ``` 51 | 52 | ## Benchmark: Compare with cppjieba 53 | 54 | * [Optimizing jieba-rs to be 33% faster than cppjieba](https://blog.paulme.ng/posts/2019-06-30-optimizing-jieba-rs-to-be-33percents-faster-than-cppjieba.html) 55 | * [优化 jieba-rs 中文分词性能评测](https://blog.paulme.ng/posts/2019-07-01-%E4%BC%98%E5%8C%96-jieba-rs-%E4%B8%AD%E6%96%87%E5%88%86%E8%AF%8D-%E6%80%A7%E8%83%BD%E8%AF%84%E6%B5%8B%EF%BC%88%E5%BF%AB%E4%BA%8E-cppjieba-33percent%29.html) 56 | * [最佳化 jieba-rs 中文斷詞性能測試](https://blog.paulme.ng/posts/2019-07-01-%E6%9C%80%E4%BD%B3%E5%8C%96jieba-rs%E4%B8%AD%E6%96%87%E6%96%B7%E8%A9%9E%E6%80%A7%E8%83%BD%E6%B8%AC%E8%A9%A6%28%E5%BF%AB%E4%BA%8Ecppjieba-33%25%29.html) 57 | 58 | ## `jieba-rs` bindings 59 | 60 | * [`@node-rs/jieba` NodeJS binding](https://github.com/napi-rs/node-rs/tree/main/packages/jieba) 61 | * [`jieba-php` PHP binding](https://github.com/binaryoung/jieba-php) 62 | * [`rjieba-py` Python binding](https://github.com/messense/rjieba-py) 63 | * [`cang-jie` Chinese tokenizer for tantivy](https://github.com/DCjanus/cang-jie) 64 | * [`tantivy-jieba` An adapter that bridges between tantivy and jieba-rs](https://github.com/jiegec/tantivy-jieba) 65 | * [`jieba-wasm` the WebAssembly binding](https://github.com/fengkx/jieba-wasm) 66 | 67 | ## License 68 | 69 | This work is released under the MIT license. A copy of the license is provided in the [LICENSE](./LICENSE) file. 70 | -------------------------------------------------------------------------------- /benches/jieba_benchmark.rs: -------------------------------------------------------------------------------- 1 | use codspeed_criterion_compat::{black_box, criterion_group, criterion_main, Criterion, Throughput}; 2 | use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf, TokenizeMode}; 3 | use lazy_static::lazy_static; 4 | use rayon::iter::{IntoParallelIterator, ParallelIterator}; 5 | 6 | #[cfg(unix)] 7 | #[global_allocator] 8 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 9 | 10 | lazy_static!
{ 11 | static ref JIEBA: Jieba = Jieba::new(); 12 | static ref TFIDF_EXTRACTOR: TfIdf = TfIdf::default(); 13 | static ref TEXTRANK_EXTRACTOR: TextRank = TextRank::default(); 14 | } 15 | static SENTENCE: &str = "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。"; 16 | 17 | fn criterion_benchmark(c: &mut Criterion) { 18 | let mut group = c.benchmark_group("jieba"); 19 | let dict_len = include_bytes!("../src/data/dict.txt").len() as u64; 20 | group.throughput(Throughput::Bytes(dict_len)); 21 | group.bench_function("new", |b| { 22 | b.iter(|| { 23 | black_box(Jieba::new()); 24 | }) 25 | }); 26 | group.finish(); 27 | 28 | let mut group = c.benchmark_group("cut"); 29 | group.throughput(Throughput::Bytes(SENTENCE.len() as u64)); 30 | group.bench_function("no_hmm", |b| b.iter(|| JIEBA.cut(black_box(SENTENCE), false))); 31 | group.bench_function("with_hmm", |b| b.iter(|| JIEBA.cut(black_box(SENTENCE), true))); 32 | group.bench_function("cut_all", |b| b.iter(|| JIEBA.cut_all(black_box(SENTENCE)))); 33 | group.bench_function("cut_for_search", |b| { 34 | b.iter(|| JIEBA.cut_for_search(black_box(SENTENCE), true)) 35 | }); 36 | group.finish(); 37 | 38 | let mut group = c.benchmark_group("tokenize"); 39 | group.throughput(Throughput::Bytes(SENTENCE.len() as u64)); 40 | group.bench_function("default_mode", |b| { 41 | b.iter(|| JIEBA.tokenize(black_box(SENTENCE), TokenizeMode::Default, true)) 42 | }); 43 | group.bench_function("search_mode", |b| { 44 | b.iter(|| JIEBA.tokenize(black_box(SENTENCE), TokenizeMode::Search, true)) 45 | }); 46 | group.finish(); 47 | 48 | let mut group = c.benchmark_group("jieba"); 49 | group.throughput(Throughput::Bytes(SENTENCE.len() as u64)); 50 | group.bench_function("tag", |b| b.iter(|| JIEBA.tag(black_box(SENTENCE), true))); 51 | group.finish(); 52 | 53 | let mut group = c.benchmark_group("keywords"); 54 | group.throughput(Throughput::Bytes(SENTENCE.len() as u64)); 55 | group.bench_function("tfidf", |b| { 56 | b.iter(|| TFIDF_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new())) 57 | }); 58 | group.bench_function("textrank", |b| { 59 | b.iter(|| TEXTRANK_EXTRACTOR.extract_keywords(&JIEBA, black_box(SENTENCE), 3, Vec::new())) 60 | }); 61 | group.finish(); 62 | 63 | let mut group = c.benchmark_group("multithreaded"); 64 | let repeat = 1000usize; 65 | group.throughput(Throughput::Bytes(SENTENCE.len() as u64 * repeat as u64)); 66 | group.bench_function("single_thread", |b| { 67 | b.iter(|| { 68 | for _ in 0..repeat { 69 | let _words = JIEBA.cut(black_box(&SENTENCE), true); 70 | } 71 | }) 72 | }); 73 | group.bench_function("multi_thread", |b| { 74 | b.iter(|| { 75 | (0..repeat).into_par_iter().for_each(|_| { 76 | let _words = JIEBA.cut(black_box(&SENTENCE), true); 77 | }); 78 | }) 79 | }); 80 | group.finish(); 81 | } 82 | 83 | criterion_group!(benches, criterion_benchmark); 84 | criterion_main!(benches); 85 | -------------------------------------------------------------------------------- /capi/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jieba-capi" 3 | version = "0.1.0" 4 | authors = ["messense "] 5 | edition = "2021" 6 | 7 | [dependencies] 8 | jieba-rs = { version = "0.7.0", path = "../", features = ["textrank", "tfidf"] } 9 | c_fixed_string = { workspace = true } 10 | 11 | [lib] 12 | crate-type = ["cdylib"] 13 | -------------------------------------------------------------------------------- /capi/src/lib.rs: -------------------------------------------------------------------------------- 1 
| use c_fixed_string::CFixedStr; 2 | use jieba_rs::{Jieba, KeywordExtract, TextRank, TfIdf}; 3 | use std::boxed::Box; 4 | use std::os::raw::c_char; 5 | use std::{mem, ptr}; 6 | 7 | #[repr(C)] 8 | pub struct CJieba { 9 | jieba: Jieba, 10 | _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>, 11 | } 12 | 13 | #[repr(C)] 14 | pub struct CJiebaTFIDF { 15 | cjieba: *mut CJieba, 16 | tfidf: TfIdf, 17 | _marker: core::marker::PhantomData<(*mut u8, core::marker::PhantomPinned)>, 18 | } 19 | 20 | #[repr(C)] 21 | pub struct CJiebaWords { 22 | pub words: *mut FfiStr, 23 | pub len: usize, 24 | } 25 | 26 | #[repr(C)] 27 | pub struct CJiebaToken { 28 | pub word: FfiStr, 29 | pub start: usize, 30 | pub end: usize, 31 | } 32 | 33 | #[repr(C)] 34 | pub struct CJiebaTokens { 35 | pub tokens: *mut CJiebaToken, 36 | pub len: usize, 37 | } 38 | 39 | /// Tokenize mode 40 | #[repr(C)] 41 | pub enum TokenizeMode { 42 | /// Default mode 43 | Default = 0, 44 | /// Search mode 45 | Search = 1, 46 | } 47 | 48 | impl From for jieba_rs::TokenizeMode { 49 | fn from(mode: TokenizeMode) -> Self { 50 | match mode { 51 | TokenizeMode::Default => jieba_rs::TokenizeMode::Default, 52 | TokenizeMode::Search => jieba_rs::TokenizeMode::Search, 53 | } 54 | } 55 | } 56 | 57 | #[repr(C)] 58 | pub struct CJiebaTag { 59 | pub word: FfiStr, 60 | pub tag: FfiStr, 61 | } 62 | 63 | #[repr(C)] 64 | pub struct CJiebaTags { 65 | pub tags: *mut CJiebaTag, 66 | pub len: usize, 67 | } 68 | 69 | /// Represents a string. 70 | #[repr(C)] 71 | pub struct FfiStr { 72 | pub data: *mut c_char, 73 | pub len: usize, 74 | pub owned: bool, 75 | } 76 | 77 | impl Default for FfiStr { 78 | fn default() -> Self { 79 | Self { 80 | data: ptr::null_mut(), 81 | len: 0, 82 | owned: false, 83 | } 84 | } 85 | } 86 | 87 | impl FfiStr { 88 | pub fn from_string(mut s: String) -> Self { 89 | s.shrink_to_fit(); 90 | let rv = Self { 91 | data: s.as_ptr() as *mut c_char, 92 | len: s.len(), 93 | owned: true, 94 | }; 95 | mem::forget(s); 96 | rv 97 | } 98 | 99 | /// # Safety 100 | /// Frees the underlying data. After this call, the internal pointer is invalid. 101 | pub unsafe fn free(&mut self) { 102 | if self.owned && !self.data.is_null() { 103 | String::from_raw_parts(self.data as *mut _, self.len, self.len); 104 | self.data = ptr::null_mut(); 105 | self.len = 0; 106 | self.owned = false; 107 | } 108 | } 109 | } 110 | 111 | impl Drop for FfiStr { 112 | fn drop(&mut self) { 113 | unsafe { 114 | self.free(); 115 | } 116 | } 117 | } 118 | 119 | /// Frees a ffi str. 120 | /// 121 | /// If the string is marked as not owned then this function does not 122 | /// do anything. 123 | /// 124 | /// # Safety 125 | /// Used to release strings returned as results of function calls. 126 | #[no_mangle] 127 | pub unsafe extern "C" fn jieba_str_free(s: *mut FfiStr) { 128 | if !s.is_null() { 129 | (*s).free() 130 | } 131 | } 132 | 133 | unsafe fn params_unwrap(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&Jieba, &CFixedStr) { 134 | let jieba = &(*(*cjieba_ref)).jieba; 135 | let c_str = CFixedStr::from_ptr(s, len); 136 | (jieba, c_str) 137 | } 138 | 139 | unsafe fn params_unwrap_mut(cjieba_ref: &*mut CJieba, s: *const c_char, len: usize) -> (&mut Jieba, &CFixedStr) { 140 | let jieba = &mut (*(*cjieba_ref)).jieba; 141 | let c_str = CFixedStr::from_ptr(s, len); 142 | (jieba, c_str) 143 | } 144 | 145 | /// # Safety 146 | /// Returned value must be freed by `jieba_free()`. 
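///
/// A typical lifecycle, sketched from the Rust side the way the unit tests at the bottom of this file drive the API (illustrative only; the sentence and variable names are made up):
///
/// ```text
/// let jieba = jieba_new();
/// let s = std::ffi::CString::new("我们中出了一个叛徒").unwrap();
/// let words = unsafe { jieba_cut(jieba, s.as_ptr(), s.as_bytes().len(), true) };
/// // ... read (*words).words[0..(*words).len] ...
/// unsafe { jieba_words_free(words) };
/// unsafe { jieba_free(jieba) };
/// ```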
147 | #[no_mangle] 148 | pub extern "C" fn jieba_new() -> *mut CJieba { 149 | let cjieba = CJieba { 150 | jieba: Jieba::new(), 151 | _marker: Default::default(), 152 | }; 153 | Box::into_raw(Box::new(cjieba)) 154 | } 155 | 156 | /// Returns a Jieba instance with an empty dictionary. 157 | /// 158 | /// # Safety 159 | /// Returned value must be freed by `jieba_free()`. 160 | #[no_mangle] 161 | pub extern "C" fn jieba_empty() -> *mut CJieba { 162 | let cjieba = CJieba { 163 | jieba: Jieba::empty(), 164 | _marker: Default::default(), 165 | }; 166 | Box::into_raw(Box::new(cjieba)) 167 | } 168 | 169 | /// # Safety 170 | /// cjieba is result from `jieba_new()` call. 171 | #[no_mangle] 172 | pub unsafe extern "C" fn jieba_free(cjieba: *mut CJieba) { 173 | if !cjieba.is_null() { 174 | drop(Box::from_raw(cjieba)); 175 | } 176 | } 177 | 178 | /// # Safety 179 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 180 | #[no_mangle] 181 | pub unsafe extern "C" fn jieba_cut( 182 | cjieba: *mut CJieba, 183 | sentence: *const c_char, 184 | len: usize, 185 | hmm: bool, 186 | ) -> *mut CJiebaWords { 187 | let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); 188 | // FIXME: remove allocation 189 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 190 | let words = jieba.cut(&s, hmm); 191 | let mut c_words: Vec = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect(); 192 | let words_len = c_words.len(); 193 | let ptr = c_words.as_mut_ptr(); 194 | mem::forget(c_words); 195 | Box::into_raw(Box::new(CJiebaWords { 196 | words: ptr, 197 | len: words_len, 198 | })) 199 | } 200 | 201 | /// # Safety 202 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 203 | #[no_mangle] 204 | pub unsafe extern "C" fn jieba_cut_all(cjieba: *mut CJieba, sentence: *const c_char, len: usize) -> *mut CJiebaWords { 205 | let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); 206 | // FIXME: remove allocation 207 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 208 | let words = (*jieba).cut_all(&s); 209 | let mut c_words: Vec = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect(); 210 | let words_len = c_words.len(); 211 | let ptr = c_words.as_mut_ptr(); 212 | mem::forget(c_words); 213 | Box::into_raw(Box::new(CJiebaWords { 214 | words: ptr, 215 | len: words_len, 216 | })) 217 | } 218 | 219 | /// # Safety 220 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 221 | #[no_mangle] 222 | pub unsafe extern "C" fn jieba_cut_for_search( 223 | cjieba: *mut CJieba, 224 | sentence: *const c_char, 225 | len: usize, 226 | hmm: bool, 227 | ) -> *mut CJiebaWords { 228 | let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); 229 | // FIXME: remove allocation 230 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 231 | let words = (*jieba).cut_for_search(&s, hmm); 232 | let mut c_words: Vec = words.into_iter().map(|x| FfiStr::from_string(x.to_string())).collect(); 233 | let words_len = c_words.len(); 234 | let ptr = c_words.as_mut_ptr(); 235 | mem::forget(c_words); 236 | Box::into_raw(Box::new(CJiebaWords { 237 | words: ptr, 238 | len: words_len, 239 | })) 240 | } 241 | 242 | /// # Safety 243 | /// cjieba must be valid object from `jieba_new()` and must outlive the returned CJiebaTFIDF instance. 244 | /// 245 | /// Returned value must be freed by `jieba_tfidf_free()`. 
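///
/// The extractor borrows the `CJieba`, so tear things down in reverse order of creation. A sketch (illustrative only; variable names are made up):
///
/// ```text
/// let jieba = jieba_new();
/// let tfidf = jieba_tfidf_new(jieba);
/// // ... call jieba_tfidf_extract(tfidf, ...) as needed, freeing each result with jieba_words_free() ...
/// unsafe { jieba_tfidf_free(tfidf) };
/// unsafe { jieba_free(jieba) };
/// ```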
246 | #[no_mangle] 247 | pub extern "C" fn jieba_tfidf_new(cjieba: *mut CJieba) -> *mut CJiebaTFIDF { 248 | let cjieba_tfidf = CJiebaTFIDF { 249 | cjieba, 250 | tfidf: Default::default(), 251 | _marker: Default::default(), 252 | }; 253 | Box::into_raw(Box::new(cjieba_tfidf)) 254 | } 255 | 256 | /// # Safety 257 | /// cjieba_tfidf is result from `jieba_tfidf_new()` call. 258 | #[no_mangle] 259 | pub unsafe extern "C" fn jieba_tfidf_free(cjieba_tfidf: *mut CJiebaTFIDF) { 260 | if !cjieba_tfidf.is_null() { 261 | drop(Box::from_raw(cjieba_tfidf)); 262 | } 263 | } 264 | 265 | /// # Safety 266 | /// cjieba_tfidf must be valid object from `jieba_tfidf_new()`. `sentence` must be `len` or larger. 267 | /// 268 | /// Returned value must be freed by `jieba_words_free()`. 269 | #[no_mangle] 270 | pub unsafe extern "C" fn jieba_tfidf_extract( 271 | cjieba_tfidf: *mut CJiebaTFIDF, 272 | sentence: *const c_char, 273 | len: usize, 274 | top_k: usize, 275 | allowed_pos: *const *mut c_char, 276 | allowed_pos_len: usize, 277 | ) -> *mut CJiebaWords { 278 | let cjieba_tfidf_ref = &(*cjieba_tfidf); 279 | let tfidf = &cjieba_tfidf_ref.tfidf; 280 | let (jieba, c_str) = params_unwrap(&cjieba_tfidf_ref.cjieba, sentence, len); 281 | // FIXME: remove allocation 282 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 283 | 284 | let allowed_pos: Vec<String> = if allowed_pos_len == 0 || allowed_pos.is_null() { 285 | Vec::new() 286 | } else { 287 | let mut v = Vec::with_capacity(allowed_pos_len); 288 | 289 | let slice: &[*mut c_char] = std::slice::from_raw_parts(allowed_pos, allowed_pos_len); 290 | for ptr in slice.iter() { 291 | let cstring_allowed_pos = std::ffi::CString::from_raw(*ptr); 292 | let string_allowed_pos = cstring_allowed_pos.into_string().expect("into_string().err() failed"); 293 | v.push(string_allowed_pos); 294 | } 295 | 296 | v 297 | }; 298 | 299 | let words = tfidf.extract_keywords(jieba, &s, top_k, allowed_pos); 300 | let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.keyword)).collect(); 301 | let words_len = c_words.len(); 302 | let ptr = c_words.as_mut_ptr(); 303 | mem::forget(c_words); 304 | Box::into_raw(Box::new(CJiebaWords { 305 | words: ptr, 306 | len: words_len, 307 | })) 308 | } 309 | 310 | /// # Safety 311 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 312 | /// 313 | /// Returned value must be freed by `jieba_words_free()`.
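/// Note: as implemented, each entry of `allowed_pos` is reclaimed with `std::ffi::CString::from_raw` (here and in `jieba_tfidf_extract` above), so ownership of those strings moves to the Rust side and they are freed before the call returns; callers should not free or reuse them afterwards.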
314 | #[no_mangle] 315 | pub unsafe extern "C" fn jieba_textrank_extract( 316 | cjieba: *mut CJieba, 317 | sentence: *const c_char, 318 | len: usize, 319 | top_k: usize, 320 | allowed_pos: *const *mut c_char, 321 | allowed_pos_len: usize, 322 | ) -> *mut CJiebaWords { 323 | let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); 324 | // FIXME: remove allocation 325 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 326 | 327 | let allowed_pos: Vec<String> = if allowed_pos_len == 0 || allowed_pos.is_null() { 328 | Vec::new() 329 | } else { 330 | let mut v = Vec::with_capacity(allowed_pos_len); 331 | 332 | let slice: &[*mut c_char] = std::slice::from_raw_parts(allowed_pos, allowed_pos_len); 333 | for ptr in slice.iter() { 334 | let cstring_allowed_pos = std::ffi::CString::from_raw(*ptr); 335 | let string_allowed_pos = cstring_allowed_pos.into_string().expect("into_string().err() failed"); 336 | v.push(string_allowed_pos); 337 | } 338 | 339 | v 340 | }; 341 | 342 | let textrank = TextRank::default(); 343 | let words = textrank.extract_keywords(jieba, &s, top_k, allowed_pos); 344 | let mut c_words: Vec<FfiStr> = words.into_iter().map(|x| FfiStr::from_string(x.keyword)).collect(); 345 | let words_len = c_words.len(); 346 | let ptr = c_words.as_mut_ptr(); 347 | mem::forget(c_words); 348 | Box::into_raw(Box::new(CJiebaWords { 349 | words: ptr, 350 | len: words_len, 351 | })) 352 | } 353 | 354 | /// # Safety 355 | /// c_words is result from `jieba_textrank_extract()` or `jieba_tfidf_extract()` call. 356 | #[no_mangle] 357 | pub unsafe extern "C" fn jieba_words_free(c_words: *mut CJiebaWords) { 358 | if !c_words.is_null() { 359 | Vec::from_raw_parts((*c_words).words, (*c_words).len, (*c_words).len); 360 | drop(Box::from_raw(c_words)); 361 | } 362 | } 363 | 364 | /// # Safety 365 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 366 | /// 367 | /// Returned value must be freed by `jieba_tokens_free()`. 368 | #[no_mangle] 369 | pub unsafe extern "C" fn jieba_tokenize( 370 | cjieba: *mut CJieba, 371 | sentence: *const c_char, 372 | len: usize, 373 | mode: TokenizeMode, 374 | hmm: bool, 375 | ) -> *mut CJiebaTokens { 376 | let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); 377 | // FIXME: remove allocation 378 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 379 | let tokens = (*jieba).tokenize(&s, mode.into(), hmm); 380 | let mut c_tokens: Vec<CJiebaToken> = tokens 381 | .into_iter() 382 | .map(|x| CJiebaToken { 383 | word: FfiStr::from_string(x.word.to_string()), 384 | start: x.start, 385 | end: x.end, 386 | }) 387 | .collect(); 388 | let tokens_len = c_tokens.len(); 389 | let ptr = c_tokens.as_mut_ptr(); 390 | mem::forget(c_tokens); 391 | Box::into_raw(Box::new(CJiebaTokens { 392 | tokens: ptr, 393 | len: tokens_len, 394 | })) 395 | } 396 | 397 | /// # Safety 398 | /// c_tokens is result from `jieba_tokenize()` call. 399 | #[no_mangle] 400 | pub unsafe extern "C" fn jieba_tokens_free(c_tokens: *mut CJiebaTokens) { 401 | if !c_tokens.is_null() { 402 | Vec::from_raw_parts((*c_tokens).tokens, (*c_tokens).len, (*c_tokens).len); 403 | drop(Box::from_raw(c_tokens)); 404 | } 405 | } 406 | 407 | /// # Safety 408 | /// cjieba must be valid object from `jieba_new()`. `sentence` must be `len` or larger. 409 | /// 410 | /// Returned value must be freed by `jieba_tags_free()`.
411 | #[no_mangle] 412 | pub unsafe extern "C" fn jieba_tag( 413 | cjieba: *mut CJieba, 414 | sentence: *const c_char, 415 | len: usize, 416 | hmm: bool, 417 | ) -> *mut CJiebaTags { 418 | let (jieba, c_str) = params_unwrap(&cjieba, sentence, len); 419 | // FIXME: remove allocation 420 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 421 | let tags = (*jieba).tag(&s, hmm); 422 | let mut c_tags: Vec = tags 423 | .into_iter() 424 | .map(|x| CJiebaTag { 425 | word: FfiStr::from_string(x.word.to_string()), 426 | tag: FfiStr::from_string(x.tag.to_string()), 427 | }) 428 | .collect(); 429 | let tags_len = c_tags.len(); 430 | let ptr = c_tags.as_mut_ptr(); 431 | mem::forget(c_tags); 432 | Box::into_raw(Box::new(CJiebaTags { 433 | tags: ptr, 434 | len: tags_len, 435 | })) 436 | } 437 | 438 | /// # Safety 439 | /// c_tags is result from `jieba_tag()` call. 440 | #[no_mangle] 441 | pub unsafe extern "C" fn jieba_tags_free(c_tags: *mut CJiebaTags) { 442 | if !c_tags.is_null() { 443 | Vec::from_raw_parts((*c_tags).tags, (*c_tags).len, (*c_tags).len); 444 | drop(Box::from_raw(c_tags)); 445 | } 446 | } 447 | 448 | /// # Safety 449 | /// cjieba must be valid object from `jieba_new()`. `word` must be `len` or larger. 450 | #[no_mangle] 451 | pub unsafe extern "C" fn jieba_add_word(cjieba: *mut CJieba, word: *const c_char, len: usize) -> usize { 452 | let (jieba, c_str) = params_unwrap_mut(&cjieba, word, len); 453 | // FIXME: remove allocation 454 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 455 | jieba.add_word(&s, None, None) 456 | } 457 | 458 | /// # Safety 459 | /// cjieba must be valid object from `jieba_new()`. `segment` must be `len` or larger. 460 | #[no_mangle] 461 | pub unsafe extern "C" fn jieba_suggest_freq(cjieba: *mut CJieba, segment: *const c_char, len: usize) -> usize { 462 | let (jieba, c_str) = params_unwrap(&cjieba, segment, len); 463 | // FIXME: remove allocation 464 | let s = String::from_utf8_lossy(c_str.as_bytes_full()); 465 | 466 | (*jieba).suggest_freq(&s) 467 | } 468 | 469 | #[cfg(test)] 470 | mod test { 471 | use super::*; 472 | use std::ffi::CString; 473 | 474 | #[test] 475 | fn test_jieba_new_and_free() { 476 | let jieba = jieba_new(); 477 | unsafe { jieba_free(jieba) }; 478 | } 479 | 480 | #[test] 481 | fn test_jieba_empty_and_free() { 482 | let jieba = jieba_empty(); 483 | unsafe { jieba_free(jieba) }; 484 | } 485 | 486 | #[test] 487 | fn test_jieba_add_word() { 488 | let jieba = jieba_empty(); 489 | let word = "今天"; 490 | let c_word = CString::new(word).unwrap(); 491 | unsafe { 492 | jieba_add_word(jieba, c_word.as_ptr(), word.len()); 493 | jieba_free(jieba) 494 | }; 495 | } 496 | } 497 | -------------------------------------------------------------------------------- /examples/weicheng/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "weicheng" 3 | version = "0.1.0" 4 | authors = ["messense "] 5 | edition = "2021" 6 | 7 | [dependencies] 8 | jieba-rs = { path = "../.." 
} 9 | 10 | [target.'cfg(unix)'.dependencies] 11 | jemallocator = "0.5.0" 12 | -------------------------------------------------------------------------------- /examples/weicheng/src/main.rs: -------------------------------------------------------------------------------- 1 | use jieba_rs::Jieba; 2 | use std::time; 3 | 4 | #[cfg(unix)] 5 | #[global_allocator] 6 | static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; 7 | 8 | static WEICHENG_TXT: &str = include_str!("weicheng.txt"); 9 | 10 | fn main() { 11 | let jieba = Jieba::new(); 12 | let lines: Vec<&str> = WEICHENG_TXT.split('\n').collect(); 13 | let now = time::Instant::now(); 14 | for _ in 0..50 { 15 | for line in &lines { 16 | let _ = jieba.cut(line, true); 17 | } 18 | } 19 | println!("{}ms", now.elapsed().as_millis()); 20 | } 21 | -------------------------------------------------------------------------------- /jieba-macros/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jieba-macros" 3 | version = "0.7.1" 4 | categories = ["text-processing"] 5 | description = "jieba-rs proc-macro" 6 | keywords = ["nlp", "chinese", "segmenation"] 7 | license = "MIT" 8 | readme = "../README.md" 9 | repository = "https://github.com/messense/jieba-rs" 10 | edition = "2021" 11 | 12 | [lib] 13 | proc-macro = true 14 | 15 | [dependencies] 16 | phf_codegen = { workspace = true } 17 | -------------------------------------------------------------------------------- /jieba-macros/src/lib.rs: -------------------------------------------------------------------------------- 1 | use proc_macro::TokenStream; 2 | 3 | #[proc_macro] 4 | pub fn generate_hmm_data(_input: TokenStream) -> TokenStream { 5 | let hmm_data = include_str!("hmm.model"); 6 | let mut output = String::new(); 7 | let mut lines = hmm_data.lines().skip_while(|x| x.starts_with('#')); 8 | 9 | // Initial probabilities 10 | let init_probs = lines 11 | .next() 12 | .expect("Failed to read initial probabilities from hmm.model"); 13 | 14 | output.push_str("#[allow(clippy::style)]\n"); 15 | output.push_str("pub static INITIAL_PROBS: [f64; 4] = ["); 16 | output.push_str(&init_probs.replace(' ', ", ")); 17 | output.push_str("];\n\n"); 18 | 19 | // Transition probabilities 20 | output.push_str("#[allow(clippy::style)]\n"); 21 | output.push_str("pub static TRANS_PROBS: [[f64; 4]; 4] = ["); 22 | for line in lines 23 | .by_ref() 24 | .skip_while(|x| x.starts_with('#')) 25 | .take_while(|x| !x.starts_with('#')) 26 | { 27 | output.push('['); 28 | output.push_str(&line.replace(' ', ", ")); 29 | output.push_str("],\n"); 30 | } 31 | output.push_str("];\n\n"); 32 | 33 | // Emission probabilities 34 | for (i, line) in lines.filter(|x| !x.starts_with('#')).enumerate() { 35 | output.push_str("#[allow(clippy::style)]\n"); 36 | output.push_str(&format!("pub static EMIT_PROB_{}: phf::Map<&'static str, f64> = ", i)); 37 | 38 | let mut map = phf_codegen::Map::new(); 39 | for word_prob in line.split(',') { 40 | let mut parts = word_prob.split(':'); 41 | let word = parts.next().unwrap(); 42 | let prob = parts.next().unwrap(); 43 | map.entry(word, prob); 44 | } 45 | output.push_str(&map.build().to_string()); 46 | output.push_str(";\n\n"); 47 | } 48 | 49 | output.push_str("#[allow(clippy::style)]\n"); 50 | output.push_str("pub static EMIT_PROBS: [&'static phf::Map<&'static str, f64>; 4] = [&EMIT_PROB_0, &EMIT_PROB_1, &EMIT_PROB_2, &EMIT_PROB_3];\n\n"); 51 | 52 | output.parse().unwrap() 53 | } 54 | 
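For orientation, the macro above expands to items of roughly the following shape (a sketch: the probability values below are placeholders, and the real numbers are parsed out of `jieba-macros/src/hmm.model` at build time; the emission tables are `phf` maps generated with `phf_codegen`):

```rust
#[allow(clippy::style)]
pub static INITIAL_PROBS: [f64; 4] = [-0.26, -3.14e100, -3.14e100, -1.46];

#[allow(clippy::style)]
pub static TRANS_PROBS: [[f64; 4]; 4] = [
    [-3.14e100, -0.51, -0.92, -3.14e100],
    [-0.59, -3.14e100, -3.14e100, -0.81],
    [-3.14e100, -0.33, -1.26, -3.14e100],
    [-0.72, -3.14e100, -3.14e100, -0.67],
];

// ...plus one `phf::Map<&'static str, f64>` per state (EMIT_PROB_0 .. EMIT_PROB_3) and
// an index array over them, exactly as pushed by the macro:
// pub static EMIT_PROBS: [&'static phf::Map<&'static str, f64>; 4] =
//     [&EMIT_PROB_0, &EMIT_PROB_1, &EMIT_PROB_2, &EMIT_PROB_3];
```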
-------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 2 | -------------------------------------------------------------------------------- /src/errors.rs: -------------------------------------------------------------------------------- 1 | use std::{error, fmt, io}; 2 | 3 | /// The Error type 4 | #[derive(Debug)] 5 | pub enum Error { 6 | /// I/O errors 7 | Io(io::Error), 8 | /// Invalid entry in dictionary 9 | InvalidDictEntry(String), 10 | } 11 | 12 | impl From<io::Error> for Error { 13 | fn from(err: io::Error) -> Self { 14 | Self::Io(err) 15 | } 16 | } 17 | 18 | impl fmt::Display for Error { 19 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 20 | match *self { 21 | Error::Io(ref err) => err.fmt(f), 22 | Error::InvalidDictEntry(ref err) => write!(f, "invalid dictionary entry: {}", err), 23 | } 24 | } 25 | } 26 | 27 | impl error::Error for Error { 28 | fn source(&self) -> Option<&(dyn error::Error + 'static)> { 29 | match *self { 30 | Error::Io(ref err) => Some(err), 31 | Error::InvalidDictEntry(_) => None, 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/hmm.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | 3 | use regex::Regex; 4 | 5 | use crate::SplitMatches; 6 | use jieba_macros::generate_hmm_data; 7 | 8 | thread_local! { 9 | static RE_HAN: Regex = Regex::new(r"([\u{4E00}-\u{9FD5}]+)").unwrap(); 10 | static RE_SKIP: Regex = Regex::new(r"([a-zA-Z0-9]+(?:.\d+)?%?)").unwrap(); 11 | } 12 | 13 | pub const NUM_STATES: usize = 4; 14 | 15 | /// The result of hmm is a labeling of each Unicode Scalar Value in the input 16 | /// string with Begin, Middle, End, or Single. These denote the proposed 17 | /// segments. A segment is one of the following two patterns. 18 | /// 19 | /// Begin, [Middle...], End 20 | /// Single 21 | /// 22 | /// Each state in the enum is also assigned an index value from 0-3 that 23 | /// can be used as an index into an array representing data pertaining 24 | /// to that state. 25 | /// 26 | /// WARNING: The comments in the hmm.model data file imply one can 27 | /// reassign the index values of each state at the top, but `jieba-macros` 28 | /// currently ignores the mapping. Do not reassign these indices without 29 | /// verifying how it interacts with `jieba-macros`. These indices must also 30 | /// match the order of ALLOWED_PREV_STATUS. 31 | #[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Clone, Copy)] 32 | pub enum State { 33 | Begin = 0, 34 | End = 1, 35 | Middle = 2, 36 | Single = 3, 37 | } 38 | 39 | // Mapping representing the allowed transitions into the given state. 40 | // 41 | // WARNING: Ordering must match the indices in State.
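// For example, the first row below (index 0 = State::Begin) is [End, Single]: a new
// multi-character word can only start right after the previous word ended, either with
// an End label or as a Single-character word.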
42 | static ALLOWED_PREV_STATUS: [[State; 2]; NUM_STATES] = [ 43 | // Can precede State::Begin 44 | [State::End, State::Single], 45 | // Can precede State::End 46 | [State::Begin, State::Middle], 47 | // Can precede State::Middle 48 | [State::Middle, State::Begin], 49 | // Can precede State::Single 50 | [State::Single, State::End], 51 | ]; 52 | 53 | generate_hmm_data!(); 54 | 55 | const MIN_FLOAT: f64 = -3.14e100; 56 | 57 | #[derive(Default)] 58 | pub(crate) struct HmmContext { 59 | v: Vec<f64>, 60 | prev: Vec<Option<State>>, 61 | best_path: Vec<State>, 62 | } 63 | 64 | #[allow(non_snake_case)] 65 | fn viterbi(sentence: &str, hmm_context: &mut HmmContext) { 66 | let str_len = sentence.len(); 67 | let states = [State::Begin, State::Middle, State::End, State::Single]; 68 | #[allow(non_snake_case)] 69 | let R = states.len(); 70 | let C = sentence.chars().count(); 71 | assert!(C > 1); 72 | 73 | // TODO: Can code just do fill() with the default instead of clear() and resize? 74 | if hmm_context.prev.len() < R * C { 75 | hmm_context.prev.resize(R * C, None); 76 | } 77 | 78 | if hmm_context.v.len() < R * C { 79 | hmm_context.v.resize(R * C, 0.0); 80 | } 81 | 82 | if hmm_context.best_path.len() < C { 83 | hmm_context.best_path.resize(C, State::Begin); 84 | } 85 | 86 | let mut curr = sentence.char_indices().map(|x| x.0).peekable(); 87 | let x1 = curr.next().unwrap(); 88 | let x2 = *curr.peek().unwrap(); 89 | for y in &states { 90 | let first_word = &sentence[x1..x2]; 91 | let prob = INITIAL_PROBS[*y as usize] + EMIT_PROBS[*y as usize].get(first_word).cloned().unwrap_or(MIN_FLOAT); 92 | hmm_context.v[*y as usize] = prob; 93 | } 94 | 95 | let mut t = 1; 96 | while let Some(byte_start) = curr.next() { 97 | for y in &states { 98 | let byte_end = *curr.peek().unwrap_or(&str_len); 99 | let word = &sentence[byte_start..byte_end]; 100 | let em_prob = EMIT_PROBS[*y as usize].get(word).cloned().unwrap_or(MIN_FLOAT); 101 | let (prob, state) = ALLOWED_PREV_STATUS[*y as usize] 102 | .iter() 103 | .map(|y0| { 104 | ( 105 | hmm_context.v[(t - 1) * R + (*y0 as usize)] 106 | + TRANS_PROBS[*y0 as usize].get(*y as usize).cloned().unwrap_or(MIN_FLOAT) 107 | + em_prob, 108 | *y0, 109 | ) 110 | }) 111 | .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal)) 112 | .unwrap(); 113 | let idx = (t * R) + (*y as usize); 114 | hmm_context.v[idx] = prob; 115 | hmm_context.prev[idx] = Some(state); 116 | } 117 | 118 | t += 1; 119 | } 120 | 121 | let (_prob, state) = [State::End, State::Single] 122 | .iter() 123 | .map(|y| (hmm_context.v[(C - 1) * R + (*y as usize)], y)) 124 | .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal)) 125 | .unwrap(); 126 | 127 | let mut t = C - 1; 128 | let mut curr = *state; 129 | 130 | hmm_context.best_path[t] = *state; 131 | while let Some(p) = hmm_context.prev[t * R + (curr as usize)] { 132 | assert!(t > 0); 133 | hmm_context.best_path[t - 1] = p; 134 | curr = p; 135 | t -= 1; 136 | } 137 | 138 | hmm_context.prev.clear(); 139 | hmm_context.v.clear(); 140 | } 141 | 142 | #[allow(non_snake_case)] 143 | pub(crate) fn cut_internal<'a>(sentence: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut HmmContext) { 144 | let str_len = sentence.len(); 145 | viterbi(sentence, hmm_context); 146 | let mut begin = 0; 147 | let mut next_byte_offset = 0; 148 | let mut i = 0; 149 | 150 | let mut curr = sentence.char_indices().map(|x| x.0).peekable(); 151 | while let Some(curr_byte_offset) = curr.next() { 152 | let state = hmm_context.best_path[i]; 153 | match state { 154 | State::Begin => begin = curr_byte_offset, 155 |
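// End and Single both close a segment here: End emits the slice from the byte offset
// recorded at the matching Begin through the end of the current char, Single emits just
// the current char, and both advance next_byte_offset past what was pushed.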
State::End => { 156 | let byte_start = begin; 157 | let byte_end = *curr.peek().unwrap_or(&str_len); 158 | words.push(&sentence[byte_start..byte_end]); 159 | next_byte_offset = byte_end; 160 | } 161 | State::Single => { 162 | let byte_start = curr_byte_offset; 163 | let byte_end = *curr.peek().unwrap_or(&str_len); 164 | words.push(&sentence[byte_start..byte_end]); 165 | next_byte_offset = byte_end; 166 | } 167 | State::Middle => { /* do nothing */ } 168 | } 169 | 170 | i += 1; 171 | } 172 | 173 | if next_byte_offset < str_len { 174 | let byte_start = next_byte_offset; 175 | words.push(&sentence[byte_start..]); 176 | } 177 | 178 | hmm_context.best_path.clear(); 179 | } 180 | 181 | #[allow(non_snake_case)] 182 | pub(crate) fn cut_with_allocated_memory<'a>(sentence: &'a str, words: &mut Vec<&'a str>, hmm_context: &mut HmmContext) { 183 | RE_HAN.with(|re_han| { 184 | RE_SKIP.with(|re_skip| { 185 | let splitter = SplitMatches::new(re_han, sentence); 186 | for state in splitter { 187 | let block = state.into_str(); 188 | if block.is_empty() { 189 | continue; 190 | } 191 | if re_han.is_match(block) { 192 | if block.chars().count() > 1 { 193 | cut_internal(block, words, hmm_context); 194 | } else { 195 | words.push(block); 196 | } 197 | } else { 198 | let skip_splitter = SplitMatches::new(re_skip, block); 199 | for skip_state in skip_splitter { 200 | let x = skip_state.into_str(); 201 | if x.is_empty() { 202 | continue; 203 | } 204 | words.push(x); 205 | } 206 | } 207 | } 208 | }) 209 | }) 210 | } 211 | 212 | #[allow(non_snake_case)] 213 | pub fn cut<'a>(sentence: &'a str, words: &mut Vec<&'a str>) { 214 | let mut hmm_context = HmmContext::default(); 215 | 216 | cut_with_allocated_memory(sentence, words, &mut hmm_context) 217 | } 218 | 219 | #[cfg(test)] 220 | mod tests { 221 | use super::{cut, viterbi, HmmContext}; 222 | 223 | #[test] 224 | #[allow(non_snake_case)] 225 | fn test_viterbi() { 226 | use super::State::*; 227 | 228 | let sentence = "小明硕士毕业于中国科学院计算所"; 229 | 230 | let mut hmm_context = HmmContext::default(); 231 | viterbi(sentence, &mut hmm_context); 232 | assert_eq!( 233 | hmm_context.best_path, 234 | vec![Begin, End, Begin, End, Begin, Middle, End, Begin, End, Begin, Middle, End, Begin, End, Single] 235 | ); 236 | } 237 | 238 | #[test] 239 | fn test_hmm_cut() { 240 | let sentence = "小明硕士毕业于中国科学院计算所"; 241 | let mut words = Vec::with_capacity(sentence.chars().count() / 2); 242 | cut(sentence, &mut words); 243 | assert_eq!(words, vec!["小明", "硕士", "毕业于", "中国", "科学院", "计算", "所"]); 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /src/keywords/mod.rs: -------------------------------------------------------------------------------- 1 | use derive_builder::Builder; 2 | use lazy_static::lazy_static; 3 | use std::collections::BTreeSet; 4 | 5 | use crate::Jieba; 6 | 7 | #[cfg(feature = "textrank")] 8 | pub mod textrank; 9 | #[cfg(feature = "tfidf")] 10 | pub mod tfidf; 11 | 12 | lazy_static! 
{ 13 | pub static ref DEFAULT_STOP_WORDS: BTreeSet<String> = { 14 | BTreeSet::from_iter( 15 | [ 16 | "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", "by", "be", "as", "on", "with", 17 | "can", "if", "from", "which", "you", "it", "this", "then", "at", "have", "all", "not", "one", "has", 18 | "or", "that", 19 | ] 20 | .into_iter() 21 | .map(|s| s.to_string()), 22 | ) 23 | }; 24 | } 25 | 26 | /// Keyword with weight 27 | #[derive(Debug, Clone, PartialEq)] 28 | pub struct Keyword { 29 | pub keyword: String, 30 | pub weight: f64, 31 | } 32 | 33 | /// Creates a KeywordExtractConfig state that contains filter criteria as 34 | /// well as segmentation configuration for use by keyword extraction 35 | /// implementations. 36 | /// 37 | /// Use KeywordExtractConfigBuilder to change the defaults. 38 | /// 39 | /// # Examples 40 | /// ``` 41 | /// use jieba_rs::KeywordExtractConfig; 42 | /// 43 | /// let mut config = KeywordExtractConfig::default(); 44 | /// assert!(config.stop_words().contains("the")); 45 | /// assert!(!config.stop_words().contains("FakeWord")); 46 | /// assert!(!config.use_hmm()); 47 | /// assert_eq!(2, config.min_keyword_length()); 48 | /// 49 | /// let built_default = KeywordExtractConfig::builder().build().unwrap(); 50 | /// assert_eq!(config, built_default); 51 | /// 52 | /// let changed = KeywordExtractConfig::builder() 53 | ///     .add_stop_word("FakeWord".to_string()) 54 | ///     .remove_stop_word("the") 55 | ///     .use_hmm(true) 56 | ///     .min_keyword_length(10) 57 | ///     .build().unwrap(); 58 | /// 59 | /// assert!(!changed.stop_words().contains("the")); 60 | /// assert!(changed.stop_words().contains("FakeWord")); 61 | /// assert!(changed.use_hmm()); 62 | /// assert_eq!(10, changed.min_keyword_length()); 63 | /// ``` 64 | #[derive(Builder, Debug, Clone, PartialEq)] 65 | pub struct KeywordExtractConfig { 66 | #[builder(default = "self.default_stop_words()?", setter(custom))] 67 | stop_words: BTreeSet<String>, 68 | 69 | #[builder(default = "2")] 70 | #[doc = r"Any segments less than this length will not be considered a Keyword"] 71 | min_keyword_length: usize, 72 | 73 | #[builder(default = "false")] 74 | #[doc = r"If true, fall back to hmm model if segment cannot be found in the dictionary"] 75 | use_hmm: bool, 76 | } 77 | 78 | impl KeywordExtractConfig { 79 | pub fn builder() -> KeywordExtractConfigBuilder { 80 | KeywordExtractConfigBuilder::default() 81 | } 82 | 83 | /// Get current set of stop words. 84 | pub fn stop_words(&self) -> &BTreeSet<String> { 85 | &self.stop_words 86 | } 87 | 88 | /// True if hmm is used during segmentation in `extract_tags`. 89 | pub fn use_hmm(&self) -> bool { 90 | self.use_hmm 91 | } 92 | 93 | /// Gets the minimum number of Unicode Scalar Values required per keyword. 94 | pub fn min_keyword_length(&self) -> usize { 95 | self.min_keyword_length 96 | } 97 | 98 | #[inline] 99 | pub(crate) fn filter(&self, s: &str) -> bool { 100 | s.chars().count() >= self.min_keyword_length() && !self.stop_words.contains(&s.to_lowercase()) 101 | } 102 | } 103 | 104 | impl KeywordExtractConfigBuilder { 105 | fn default_stop_words(&self) -> Result<BTreeSet<String>, KeywordExtractConfigBuilderError> { 106 | Ok(DEFAULT_STOP_WORDS.clone()) 107 | } 108 | 109 | /// Add a new stop word.
110 | /// 111 | /// # Examples 112 | /// ``` 113 | /// use jieba_rs::KeywordExtractConfig; 114 | /// use std::collections::BTreeSet; 115 | /// 116 | /// let populates_default = KeywordExtractConfig::builder() 117 | ///     .add_stop_word("FakeWord".to_string()) 118 | ///     .build().unwrap(); 119 | /// 120 | /// assert!(populates_default.stop_words().contains("the")); 121 | /// assert!(populates_default.stop_words().contains("FakeWord")); 122 | /// 123 | /// let multiple_adds_stack = KeywordExtractConfig::builder() 124 | ///     .add_stop_word("FakeWord".to_string()) 125 | ///     .add_stop_word("MoarFakeWord".to_string()) 126 | ///     .build().unwrap(); 127 | /// 128 | /// assert!(multiple_adds_stack.stop_words().contains("the")); 129 | /// assert!(multiple_adds_stack.stop_words().contains("FakeWord")); 130 | /// assert!(multiple_adds_stack.stop_words().contains("MoarFakeWord")); 131 | /// 132 | /// let no_default_if_set = KeywordExtractConfig::builder() 133 | ///     .set_stop_words(BTreeSet::from(["boo".to_string()])) 134 | ///     .add_stop_word("FakeWord".to_string()) 135 | ///     .build().unwrap(); 136 | /// 137 | /// assert!(!no_default_if_set.stop_words().contains("the")); 138 | /// assert!(no_default_if_set.stop_words().contains("boo")); 139 | /// assert!(no_default_if_set.stop_words().contains("FakeWord")); 140 | /// ``` 141 | pub fn add_stop_word(&mut self, word: String) -> &mut Self { 142 | if self.stop_words.is_none() { 143 | self.stop_words = Some(self.default_stop_words().unwrap()); 144 | } 145 | self.stop_words.as_mut().unwrap().insert(word); 146 | self 147 | } 148 | 149 | /// Remove an existing stop word. 150 | /// 151 | /// # Examples 152 | /// ``` 153 | /// use jieba_rs::KeywordExtractConfig; 154 | /// use std::collections::BTreeSet; 155 | /// 156 | /// let populates_default = KeywordExtractConfig::builder() 157 | ///     .remove_stop_word("the") 158 | ///     .build().unwrap(); 159 | /// 160 | /// assert!(!populates_default.stop_words().contains("the")); 161 | /// assert!(populates_default.stop_words().contains("of")); 162 | /// 163 | /// let no_default_if_set = KeywordExtractConfig::builder() 164 | ///     .set_stop_words(BTreeSet::from(["boo".to_string()])) 165 | ///     // Removing a non-existent word is okay. 166 | ///     .remove_stop_word("the".to_string()) 167 | ///     .build().unwrap(); 168 | /// 169 | /// assert!(!no_default_if_set.stop_words().contains("the")); 170 | /// assert!(!no_default_if_set.stop_words().contains("of")); 171 | /// assert!(no_default_if_set.stop_words().contains("boo")); 172 | /// ``` 173 | pub fn remove_stop_word(&mut self, word: impl AsRef<str>) -> &mut Self { 174 | if self.stop_words.is_none() { 175 | self.stop_words = Some(self.default_stop_words().unwrap()); 176 | } 177 | self.stop_words.as_mut().unwrap().remove(word.as_ref()); 178 | self 179 | } 180 | 181 | /// Replace all stop words with new stop words set.
182 | /// 183 | /// # Examples 184 | /// ``` 185 | /// use jieba_rs::KeywordExtractConfig; 186 | /// use std::collections::BTreeSet; 187 | /// 188 | /// let no_default_if_set = KeywordExtractConfig::builder() 189 | ///     .set_stop_words(BTreeSet::from(["boo".to_string()])) 190 | ///     .build().unwrap(); 191 | /// 192 | /// assert!(!no_default_if_set.stop_words().contains("the")); 193 | /// assert!(no_default_if_set.stop_words().contains("boo")); 194 | /// 195 | /// let overwrites = KeywordExtractConfig::builder() 196 | ///     .add_stop_word("FakeWord".to_string()) 197 | ///     .set_stop_words(BTreeSet::from(["boo".to_string()])) 198 | ///     .build().unwrap(); 199 | /// 200 | /// assert!(!overwrites.stop_words().contains("FakeWord")); 201 | /// assert!(overwrites.stop_words().contains("boo")); 202 | /// ``` 203 | pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) -> &mut Self { 204 | self.stop_words = Some(stop_words); 205 | self 206 | } 207 | } 208 | 209 | impl Default for KeywordExtractConfig { 210 | fn default() -> KeywordExtractConfig { 211 | KeywordExtractConfigBuilder::default().build().unwrap() 212 | } 213 | } 214 | 215 | /// Extracts keywords from a given sentence with the Jieba instance. 216 | pub trait KeywordExtract { 217 | fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword>; 218 | } 219 | -------------------------------------------------------------------------------- /src/keywords/textrank.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::{BTreeSet, BinaryHeap}; 3 | 4 | use ordered_float::OrderedFloat; 5 | 6 | use super::{Keyword, KeywordExtract, KeywordExtractConfig, KeywordExtractConfigBuilder}; 7 | use crate::FxHashMap as HashMap; 8 | use crate::Jieba; 9 | 10 | type Weight = f64; 11 | 12 | #[derive(Clone)] 13 | struct Edge { 14 | dst: usize, 15 | weight: Weight, 16 | } 17 | 18 | impl Edge { 19 | fn new(dst: usize, weight: Weight) -> Edge { 20 | Edge { dst, weight } 21 | } 22 | } 23 | 24 | type Edges = Vec<Edge>; 25 | type Graph = Vec<Edges>; 26 | 27 | struct StateDiagram { 28 | damping_factor: Weight, 29 | g: Graph, 30 | } 31 | 32 | impl StateDiagram { 33 | fn new(size: usize) -> Self { 34 | StateDiagram { 35 | damping_factor: 0.85, 36 | g: vec![Vec::new(); size], 37 | } 38 | } 39 | 40 | fn add_undirected_edge(&mut self, src: usize, dst: usize, weight: Weight) { 41 | self.g[src].push(Edge::new(dst, weight)); 42 | self.g[dst].push(Edge::new(src, weight)); 43 | } 44 | 45 | fn rank(&mut self) -> Vec<Weight> { 46 | let n = self.g.len(); 47 | let default_weight = 1.0 / (n as f64); 48 | 49 | let mut ranking_vector = vec![default_weight; n]; 50 | 51 | let mut outflow_weights = vec![0.0; n]; 52 | for (i, v) in self.g.iter().enumerate() { 53 | outflow_weights[i] = v.iter().map(|e| e.weight).sum(); 54 | } 55 | 56 | for _ in 0..20 { 57 | for (i, v) in self.g.iter().enumerate() { 58 | let s: f64 = v 59 | .iter() 60 | .map(|e| e.weight / outflow_weights[e.dst] * ranking_vector[e.dst]) 61 | .sum(); 62 | 63 | ranking_vector[i] = (1.0 - self.damping_factor) + self.damping_factor * s; 64 | } 65 | } 66 | 67 | ranking_vector 68 | } 69 | } 70 | 71 | /// Text rank keywords extraction. 72 | /// 73 | /// Requires `textrank` feature to be enabled. 74 | #[derive(Debug)] 75 | pub struct TextRank { 76 | span: usize, 77 | config: KeywordExtractConfig, 78 | } 79 | 80 | impl TextRank { 81 | /// Creates a TextRank.
82 | /// 83 | /// # Examples 84 | /// 85 | /// New instance with custom stop words. Also uses hmm for unknown words 86 | /// during segmentation. 87 | /// ``` 88 | /// use std::collections::BTreeSet; 89 | /// use jieba_rs::{TextRank, KeywordExtractConfig}; 90 | /// 91 | /// let stop_words : BTreeSet<String> = 92 | ///     BTreeSet::from(["a", "the", "of"].map(|s| s.to_string())); 93 | /// TextRank::new( 94 | ///     5, 95 | ///     KeywordExtractConfig::default()); 96 | /// ``` 97 | pub fn new(span: usize, config: KeywordExtractConfig) -> Self { 98 | TextRank { span, config } 99 | } 100 | } 101 | 102 | impl Default for TextRank { 103 | /// Creates TextRank with 5 Unicode Scalar Value spans 104 | fn default() -> Self { 105 | TextRank::new(5, KeywordExtractConfigBuilder::default().build().unwrap()) 106 | } 107 | } 108 | 109 | impl KeywordExtract for TextRank { 110 | /// Uses TextRank algorithm to extract the `top_k` keywords from `sentence`. 111 | /// 112 | /// If `allowed_pos` is not empty, then only terms matching those parts of 113 | /// speech are considered. 114 | /// 115 | /// # Examples 116 | /// 117 | /// ``` 118 | /// use jieba_rs::{Jieba, KeywordExtract, TextRank}; 119 | /// 120 | /// let jieba = Jieba::new(); 121 | /// let keyword_extractor = TextRank::default(); 122 | /// let mut top_k = keyword_extractor.extract_keywords( 123 | ///     &jieba, 124 | ///     "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", 125 | ///     6, 126 | ///     vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], 127 | /// ); 128 | /// assert_eq!( 129 | ///     top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(), 130 | ///     vec!["吉林", "欧亚", "置业", "实现", "收入", "子公司"] 131 | /// ); 132 | /// 133 | /// top_k = keyword_extractor.extract_keywords( 134 | ///     &jieba, 135 | ///     "It is nice weather in New York City. 
and今天纽约的天气真好啊,and京华大酒店的张尧经理吃了一只北京烤鸭。and后天纽约的天气不好,and昨天纽约的天气也不好,and北京烤鸭真好吃", 136 |     3, 137 |     vec![], 138 | /// ); 139 | /// assert_eq!( 140 | ///     top_k.iter().map(|x| &x.keyword).collect::<Vec<&String>>(), 141 | ///     vec!["纽约", "天气", "不好"] 142 | /// ); 143 | /// ``` 144 | fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword> { 145 | let tags = jieba.tag(sentence, self.config.use_hmm()); 146 | let mut allowed_pos_set = BTreeSet::new(); 147 | 148 | for s in allowed_pos { 149 | allowed_pos_set.insert(s); 150 | } 151 | 152 | let mut word2id: HashMap<String, usize> = HashMap::default(); 153 | let mut unique_words = Vec::new(); 154 | for t in &tags { 155 | if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(t.tag) { 156 | continue; 157 | } 158 | 159 | if !word2id.contains_key(t.word) { 160 | unique_words.push(String::from(t.word)); 161 | word2id.insert(String::from(t.word), unique_words.len() - 1); 162 | } 163 | } 164 | 165 | let mut cooccurence: HashMap<(usize, usize), usize> = HashMap::default(); 166 | for (i, t) in tags.iter().enumerate() { 167 | if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(t.tag) { 168 | continue; 169 | } 170 | 171 | if !self.config.filter(t.word) { 172 | continue; 173 | } 174 | 175 | for j in (i + 1)..(i + self.span) { 176 | if j >= tags.len() { 177 | break; 178 | } 179 | 180 | if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(tags[j].tag) { 181 | continue; 182 | } 183 | 184 | if !self.config.filter(tags[j].word) { 185 | continue; 186 | } 187 | 188 | let u = word2id.get(t.word).unwrap().to_owned(); 189 | let v = word2id.get(tags[j].word).unwrap().to_owned(); 190 | let entry = cooccurence.entry((u, v)).or_insert(0); 191 | *entry += 1; 192 | } 193 | } 194 | 195 | let mut diagram = StateDiagram::new(unique_words.len()); 196 | for (k, &v) in cooccurence.iter() { 197 | diagram.add_undirected_edge(k.0, k.1, v as f64); 198 | } 199 | 200 | let ranking_vector = diagram.rank(); 201 | 202 | let mut heap = BinaryHeap::new(); 203 | for (k, v) in ranking_vector.iter().enumerate() { 204 | heap.push(HeapNode { 205 | rank: OrderedFloat(v * 1e10), 206 | word_id: k, 207 | }); 208 | 209 | if k >= top_k { 210 | heap.pop(); 211 | } 212 | } 213 | 214 | let mut res = Vec::new(); 215 | for _ in 0..top_k { 216 | if let Some(w) = heap.pop() { 217 | res.push(Keyword { 218 | keyword: unique_words[w.word_id].clone(), 219 | weight: w.rank.into_inner(), 220 | }); 221 | } 222 | } 223 | 224 | res.reverse(); 225 | res 226 | } 227 | } 228 | 229 | #[derive(Debug, Clone, Eq, PartialEq)] 230 | struct HeapNode { 231 | rank: OrderedFloat<f64>, 232 | word_id: usize, 233 | } 234 | 235 | impl Ord for HeapNode { 236 | fn cmp(&self, other: &HeapNode) -> Ordering { 237 | other 238 | .rank 239 | .cmp(&self.rank) 240 | .then_with(|| self.word_id.cmp(&other.word_id)) 241 | } 242 | } 243 | 244 | impl PartialOrd for HeapNode { 245 | fn partial_cmp(&self, other: &HeapNode) -> Option<Ordering> { 246 | Some(self.cmp(other)) 247 | } 248 | } 249 | 250 | #[cfg(test)] 251 | mod tests { 252 | use super::*; 253 | #[test] 254 | fn test_init_state_diagram() { 255 | let diagram = StateDiagram::new(10); 256 | assert_eq!(diagram.g.len(), 10); 257 | } 258 | } 259 | -------------------------------------------------------------------------------- /src/keywords/tfidf.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::{BTreeSet, BinaryHeap}; 3 | use std::io::{self, BufRead, BufReader}; 4 | 5 | use include_flate::flate;
6 | use ordered_float::OrderedFloat; 7 | 8 | use super::{Keyword, KeywordExtract, KeywordExtractConfig, KeywordExtractConfigBuilder}; 9 | use crate::FxHashMap as HashMap; 10 | use crate::Jieba; 11 | 12 | flate!(static DEFAULT_IDF: str from "src/data/idf.txt"); 13 | 14 | #[derive(Debug, Clone, Eq, PartialEq)] 15 | struct HeapNode<'a> { 16 | tfidf: OrderedFloat, 17 | word: &'a str, 18 | } 19 | 20 | impl Ord for HeapNode<'_> { 21 | fn cmp(&self, other: &HeapNode) -> Ordering { 22 | other.tfidf.cmp(&self.tfidf).then_with(|| self.word.cmp(other.word)) 23 | } 24 | } 25 | 26 | impl PartialOrd for HeapNode<'_> { 27 | fn partial_cmp(&self, other: &HeapNode) -> Option { 28 | Some(self.cmp(other)) 29 | } 30 | } 31 | 32 | /// TF-IDF keywords extraction 33 | /// 34 | /// Require `tfidf` feature to be enabled 35 | #[derive(Debug)] 36 | pub struct TfIdf { 37 | idf_dict: HashMap, 38 | median_idf: f64, 39 | config: KeywordExtractConfig, 40 | } 41 | 42 | /// Implementation of JiebaKeywordExtract using a TF-IDF dictionary. 43 | /// 44 | /// This takes the segments produced by Jieba and attempts to extract keywords. 45 | /// Segments are filtered for stopwords and short terms. They are then matched 46 | /// against a loaded dictionary to calculate TF-IDF scores. 47 | impl TfIdf { 48 | /// Creates an TfIdf. 49 | /// 50 | /// # Examples 51 | /// 52 | /// New instance with custom idf dictionary. 53 | /// ``` 54 | /// use jieba_rs::{TfIdf, KeywordExtractConfig}; 55 | /// 56 | /// let mut sample_idf = "劳动防护 13.900677652\n\ 57 | /// 生化学 13.900677652\n"; 58 | /// TfIdf::new( 59 | /// Some(&mut sample_idf.as_bytes()), 60 | /// KeywordExtractConfig::default()); 61 | /// ``` 62 | /// 63 | /// New instance with module default stop words and no initial IDF 64 | /// dictionary. Dictionary should be loaded later with `load_dict()` calls. 65 | /// ``` 66 | /// use jieba_rs::{TfIdf, KeywordExtractConfig}; 67 | /// 68 | /// TfIdf::new( 69 | /// None::<&mut std::io::Empty>, 70 | /// KeywordExtractConfig::default()); 71 | /// ``` 72 | pub fn new(opt_dict: Option<&mut impl BufRead>, config: KeywordExtractConfig) -> Self { 73 | let mut instance = TfIdf { 74 | idf_dict: HashMap::default(), 75 | median_idf: 0.0, 76 | config, 77 | }; 78 | if let Some(dict) = opt_dict { 79 | instance.load_dict(dict).unwrap(); 80 | } 81 | instance 82 | } 83 | 84 | /// Merges entires from `dict` into the `idf_dict`. 
85 | /// 86 | /// ``` 87 | /// use jieba_rs::{Jieba, KeywordExtract, Keyword, KeywordExtractConfig, 88 | /// TfIdf}; 89 | /// 90 | /// let jieba = Jieba::default(); 91 | /// let mut init_idf = "生化学 13.900677652\n"; 92 | /// 93 | /// let mut tfidf = TfIdf::new( 94 | /// Some(&mut init_idf.as_bytes()), 95 | /// KeywordExtractConfig::default()); 96 | /// let top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]); 97 | /// assert_eq!( 98 | /// top_k, 99 | /// vec![ 100 | /// Keyword { keyword: "不是".to_string(), weight: 4.6335592173333335 }, 101 | /// Keyword { keyword: "光化学".to_string(), weight: 4.6335592173333335 }, 102 | /// Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 } 103 | /// ] 104 | /// ); 105 | /// 106 | /// let mut init_idf = "光化学 99.123456789\n"; 107 | /// tfidf.load_dict(&mut init_idf.as_bytes()); 108 | /// let new_top_k = tfidf.extract_keywords(&jieba, "生化学不是光化学的,", 3, vec![]); 109 | /// assert_eq!( 110 | /// new_top_k, 111 | /// vec![ 112 | /// Keyword { keyword: "不是".to_string(), weight: 33.041152263 }, 113 | /// Keyword { keyword: "光化学".to_string(), weight: 33.041152263 }, 114 | /// Keyword { keyword: "生化学".to_string(), weight: 4.6335592173333335 } 115 | /// ] 116 | /// ); 117 | /// ``` 118 | pub fn load_dict(&mut self, dict: &mut impl BufRead) -> io::Result<()> { 119 | let mut buf = String::new(); 120 | let mut idf_heap = BinaryHeap::new(); 121 | while dict.read_line(&mut buf)? > 0 { 122 | let parts: Vec<&str> = buf.split_whitespace().collect(); 123 | if parts.is_empty() { 124 | continue; 125 | } 126 | 127 | let word = parts[0]; 128 | if let Some(idf) = parts.get(1).and_then(|x| x.parse::().ok()) { 129 | self.idf_dict.insert(word.to_string(), idf); 130 | idf_heap.push(OrderedFloat(idf)); 131 | } 132 | 133 | buf.clear(); 134 | } 135 | 136 | let m = idf_heap.len() / 2; 137 | for _ in 0..m { 138 | idf_heap.pop(); 139 | } 140 | 141 | self.median_idf = idf_heap.pop().unwrap().into_inner(); 142 | 143 | Ok(()) 144 | } 145 | 146 | pub fn config(&self) -> &KeywordExtractConfig { 147 | &self.config 148 | } 149 | 150 | pub fn config_mut(&mut self) -> &mut KeywordExtractConfig { 151 | &mut self.config 152 | } 153 | } 154 | 155 | /// TF-IDF keywords extraction. 156 | /// 157 | /// Require `tfidf` feature to be enabled. 158 | impl Default for TfIdf { 159 | /// Creates TfIdf with DEFAULT_STOP_WORDS, the default TfIdf dictionary, 160 | /// 2 Unicode Scalar Value minimum for keywords, and no hmm in segmentation. 161 | fn default() -> Self { 162 | let mut default_dict = BufReader::new(DEFAULT_IDF.as_bytes()); 163 | TfIdf::new( 164 | Some(&mut default_dict), 165 | KeywordExtractConfigBuilder::default().build().unwrap(), 166 | ) 167 | } 168 | } 169 | 170 | impl KeywordExtract for TfIdf { 171 | /// Uses TF-IDF algorithm to extract the `top_k` keywords from `sentence`. 172 | /// 173 | /// If `allowed_pos` is not empty, then only terms matching those parts if 174 | /// speech are considered. 
175 | /// 176 | /// # Examples 177 | /// ``` 178 | /// use jieba_rs::{Jieba, KeywordExtract, TfIdf}; 179 | /// 180 | /// let jieba = Jieba::new(); 181 | /// let keyword_extractor = TfIdf::default(); 182 | /// let mut top_k = keyword_extractor.extract_keywords( 183 | /// &jieba, 184 | /// "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃", 185 | /// 3, 186 | /// vec![], 187 | /// ); 188 | /// assert_eq!( 189 | /// top_k.iter().map(|x| &x.keyword).collect::>(), 190 | /// vec!["北京烤鸭", "纽约", "天气"] 191 | /// ); 192 | /// 193 | /// top_k = keyword_extractor.extract_keywords( 194 | /// &jieba, 195 | /// "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", 196 | /// 5, 197 | /// vec![], 198 | /// ); 199 | /// assert_eq!( 200 | /// top_k.iter().map(|x| &x.keyword).collect::>(), 201 | /// vec!["欧亚", "吉林", "置业", "万元", "增资"] 202 | /// ); 203 | /// 204 | /// top_k = keyword_extractor.extract_keywords( 205 | /// &jieba, 206 | /// "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", 207 | /// 5, 208 | /// vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], 209 | /// ); 210 | /// assert_eq!( 211 | /// top_k.iter().map(|x| &x.keyword).collect::>(), 212 | /// vec!["欧亚", "吉林", "置业", "增资", "实现"] 213 | /// ); 214 | /// ``` 215 | fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec) -> Vec { 216 | let tags = jieba.tag(sentence, self.config.use_hmm()); 217 | let mut allowed_pos_set = BTreeSet::new(); 218 | 219 | for s in allowed_pos { 220 | allowed_pos_set.insert(s); 221 | } 222 | 223 | let mut term_freq: HashMap = HashMap::default(); 224 | for t in &tags { 225 | if !allowed_pos_set.is_empty() && !allowed_pos_set.contains(t.tag) { 226 | continue; 227 | } 228 | 229 | if !self.config.filter(t.word) { 230 | continue; 231 | } 232 | 233 | let entry = term_freq.entry(String::from(t.word)).or_insert(0); 234 | *entry += 1; 235 | } 236 | 237 | let total: u64 = term_freq.values().sum(); 238 | let mut heap = BinaryHeap::new(); 239 | for (cnt, (k, tf)) in term_freq.iter().enumerate() { 240 | let idf = self.idf_dict.get(k).unwrap_or(&self.median_idf); 241 | let node = HeapNode { 242 | tfidf: OrderedFloat(*tf as f64 * idf / total as f64), 243 | word: k, 244 | }; 245 | heap.push(node); 246 | if cnt >= top_k { 247 | heap.pop(); 248 | } 249 | } 250 | 251 | let mut res = Vec::new(); 252 | for _ in 0..top_k { 253 | if let Some(w) = heap.pop() { 254 | res.push(Keyword { 255 | keyword: String::from(w.word), 256 | weight: w.tfidf.into_inner(), 257 | }); 258 | } 259 | } 260 | 261 | res.reverse(); 262 | res 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! The Jieba Chinese Word Segmentation Implemented in Rust 2 | //! 3 | //! ## Installation 4 | //! 5 | //! Add it to your `Cargo.toml`: 6 | //! 7 | //! ```toml 8 | //! [dependencies] 9 | //! jieba-rs = "0.7" 10 | //! ``` 11 | //! 12 | //! then you are good to go. If you are using Rust 2015 you have to ``extern crate jieba_rs`` to your crate root as well. 13 | //! 14 | //! ## Example 15 | //! 16 | //! ```rust 17 | //! use jieba_rs::Jieba; 18 | //! 19 | //! let jieba = Jieba::new(); 20 | //! let words = jieba.cut("我们中出了一个叛徒", false); 21 | //! 
assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]); 22 | //! ``` 23 | //! 24 | //! ```rust 25 | //! # #[cfg(feature = "tfidf")] { 26 | //! use jieba_rs::Jieba; 27 | //! use jieba_rs::{TfIdf, KeywordExtract}; 28 | //! 29 | //! fn main() { 30 | //! let jieba = Jieba::new(); 31 | //! let keyword_extractor = TfIdf::default(); 32 | //! let top_k = keyword_extractor.extract_keywords( 33 | //! &jieba, 34 | //! "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃", 35 | //! 3, 36 | //! vec![], 37 | //! ); 38 | //! println!("{:?}", top_k); 39 | //! } 40 | //! # } 41 | //! ``` 42 | //! 43 | //! ```rust 44 | //! # #[cfg(feature = "textrank")] { 45 | //! use jieba_rs::Jieba; 46 | //! use jieba_rs::{TextRank, KeywordExtract}; 47 | //! 48 | //! fn main() { 49 | //! let jieba = Jieba::new(); 50 | //! let keyword_extractor = TextRank::default(); 51 | //! let top_k = keyword_extractor.extract_keywords( 52 | //! &jieba, 53 | //! "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。", 54 | //! 6, 55 | //! vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")], 56 | //! ); 57 | //! println!("{:?}", top_k); 58 | //! } 59 | //! # } 60 | //! ``` 61 | //! 62 | //! ## Enabling Additional Features 63 | //! 64 | //! * `default-dict` feature enables embedded dictionary, this features is enabled by default 65 | //! * `tfidf` feature enables TF-IDF keywords extractor 66 | //! * `textrank` feature enables TextRank keywords extractor 67 | //! 68 | //! ```toml 69 | //! [dependencies] 70 | //! jieba-rs = { version = "0.7", features = ["tfidf", "textrank"] } 71 | //! ``` 72 | //! 73 | 74 | use include_flate::flate; 75 | 76 | use std::cmp::Ordering; 77 | use std::collections::HashMap; 78 | use std::io::BufRead; 79 | 80 | use cedarwood::Cedar; 81 | use regex::{Match, Matches, Regex}; 82 | 83 | pub(crate) type FxHashMap = HashMap; 84 | 85 | pub use crate::errors::Error; 86 | #[cfg(feature = "textrank")] 87 | pub use crate::keywords::textrank::TextRank; 88 | #[cfg(feature = "tfidf")] 89 | pub use crate::keywords::tfidf::TfIdf; 90 | #[cfg(any(feature = "tfidf", feature = "textrank"))] 91 | pub use crate::keywords::{Keyword, KeywordExtract, KeywordExtractConfig, DEFAULT_STOP_WORDS}; 92 | 93 | mod errors; 94 | mod hmm; 95 | #[cfg(any(feature = "tfidf", feature = "textrank"))] 96 | mod keywords; 97 | mod sparse_dag; 98 | 99 | #[cfg(feature = "default-dict")] 100 | flate!(static DEFAULT_DICT: str from "src/data/dict.txt"); 101 | 102 | use sparse_dag::StaticSparseDAG; 103 | 104 | thread_local! 
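// Descriptive note on the thread-local regexes defined below: RE_HAN_DEFAULT matches runs of
// CJK ideographs (including the extension blocks) mixed with ASCII letters, digits and the
// joining characters +#&._%-; RE_SKIP_DEFAULT matches a CRLF pair or a single whitespace
// character; RE_HAN_CUT_ALL matches ideograph-only runs; RE_SKIP_CUT_ALL matches any single
// character that is not an ASCII letter, digit, '+', '#' or newline. `cut_internal` uses the
// "han" regex to find blocks it segments with the DAG/HMM machinery, and the "skip" regex to
// decide whether the remaining text is emitted whole or character by character.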
{ 105 | static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap(); 106 | static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap(); 107 | static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap(); 108 | static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap(); 109 | } 110 | 111 | struct SplitMatches<'r, 't> { 112 | finder: Matches<'r, 't>, 113 | text: &'t str, 114 | last: usize, 115 | matched: Option>, 116 | } 117 | 118 | impl<'r, 't> SplitMatches<'r, 't> { 119 | #[inline] 120 | fn new(re: &'r Regex, text: &'t str) -> SplitMatches<'r, 't> { 121 | SplitMatches { 122 | finder: re.find_iter(text), 123 | text, 124 | last: 0, 125 | matched: None, 126 | } 127 | } 128 | } 129 | 130 | #[derive(Debug)] 131 | pub(crate) enum SplitState<'t> { 132 | Unmatched(&'t str), 133 | Matched(Match<'t>), 134 | } 135 | 136 | impl<'t> SplitState<'t> { 137 | #[inline] 138 | fn into_str(self) -> &'t str { 139 | match self { 140 | SplitState::Unmatched(t) => t, 141 | SplitState::Matched(matched) => matched.as_str(), 142 | } 143 | } 144 | } 145 | 146 | impl<'t> Iterator for SplitMatches<'_, 't> { 147 | type Item = SplitState<'t>; 148 | 149 | fn next(&mut self) -> Option> { 150 | if let Some(matched) = self.matched.take() { 151 | return Some(SplitState::Matched(matched)); 152 | } 153 | match self.finder.next() { 154 | None => { 155 | if self.last >= self.text.len() { 156 | None 157 | } else { 158 | let s = &self.text[self.last..]; 159 | self.last = self.text.len(); 160 | Some(SplitState::Unmatched(s)) 161 | } 162 | } 163 | Some(m) => { 164 | if self.last == m.start() { 165 | self.last = m.end(); 166 | Some(SplitState::Matched(m)) 167 | } else { 168 | let unmatched = &self.text[self.last..m.start()]; 169 | self.last = m.end(); 170 | self.matched = Some(m); 171 | Some(SplitState::Unmatched(unmatched)) 172 | } 173 | } 174 | } 175 | } 176 | } 177 | 178 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 179 | pub enum TokenizeMode { 180 | /// Default mode 181 | Default, 182 | /// Search mode 183 | Search, 184 | } 185 | 186 | /// A Token 187 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 188 | pub struct Token<'a> { 189 | /// Word of the token 190 | pub word: &'a str, 191 | /// Unicode start position of the token 192 | pub start: usize, 193 | /// Unicode end position of the token 194 | pub end: usize, 195 | } 196 | 197 | /// A tagged word 198 | #[derive(Debug, Clone, PartialEq, Eq, Hash)] 199 | pub struct Tag<'a> { 200 | /// Word 201 | pub word: &'a str, 202 | /// Word tag 203 | pub tag: &'a str, 204 | } 205 | 206 | #[derive(Debug, Clone)] 207 | struct Record { 208 | freq: usize, 209 | tag: String, 210 | } 211 | 212 | impl Record { 213 | #[inline(always)] 214 | fn new(freq: usize, tag: String) -> Self { 215 | Self { freq, tag } 216 | } 217 | } 218 | 219 | /// Jieba segmentation 220 | #[derive(Debug, Clone)] 221 | pub struct Jieba { 222 | records: Vec, 223 | cedar: Cedar, 224 | total: usize, 225 | } 226 | 227 | #[cfg(feature = "default-dict")] 228 | impl Default for Jieba { 229 | fn default() -> Self { 230 | Jieba::new() 231 | } 232 | } 233 | 234 | impl Jieba { 235 | /// Create a new instance with empty dict 236 | pub fn empty() -> Self { 237 | Jieba { 238 | 
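// `records` stores one Record (frequency + POS tag) per known word; `cedar` is the
// double-array trie that maps each word to its index in `records`; `total` is the sum of
// all frequencies and serves as the log-probability normalizer during segmentation.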
records: Vec::new(), 239 | cedar: Cedar::new(), 240 | total: 0, 241 | } 242 | } 243 | 244 | /// Create a new instance with embedded dict 245 | /// 246 | /// Requires `default-dict` feature to be enabled. 247 | #[cfg(feature = "default-dict")] 248 | pub fn new() -> Self { 249 | let mut instance = Self::empty(); 250 | instance.load_default_dict(); 251 | instance 252 | } 253 | 254 | /// Create a new instance with dict 255 | pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> { 256 | let mut instance = Self::empty(); 257 | instance.load_dict(dict)?; 258 | Ok(instance) 259 | } 260 | 261 | /// Loads the default dictionary into the instance. 262 | /// 263 | /// This method reads the default dictionary from a predefined byte slice (`DEFAULT_DICT`) 264 | /// and loads it into the current instance using the `load_dict` method. 265 | /// 266 | /// # Arguments 267 | /// 268 | /// * `&mut self` - Mutable reference to the current instance. 269 | /// 270 | /// Requires `default-dict` feature to be enabled. 271 | /// 272 | /// # Examples 273 | /// 274 | /// ``` 275 | /// use jieba_rs::Jieba; 276 | /// 277 | /// let mut instance = Jieba::empty(); 278 | /// instance.load_default_dict(); // Loads the default dictionary into the instance 279 | /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary"); 280 | /// ``` 281 | #[cfg(feature = "default-dict")] 282 | pub fn load_default_dict(&mut self) { 283 | use std::io::BufReader; 284 | 285 | let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes()); 286 | self.load_dict(&mut default_dict).unwrap(); 287 | } 288 | 289 | /// Clears all data 290 | /// 291 | /// This method performs the following actions: 292 | /// 1. Clears the `records` list, removing all entries. 293 | /// 2. Resets `cedar` to a new instance of `Cedar`. 294 | /// 3. Sets `total` to 0, resetting the count. 295 | /// 296 | /// # Arguments 297 | /// 298 | /// * `&mut self` - Mutable reference to the current instance.
299 | /// 300 | /// # Examples 301 | /// 302 | /// ``` 303 | /// use jieba_rs::Jieba; 304 | /// 305 | /// let mut instance = Jieba::new(); 306 | /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary"); 307 | /// instance.clear(); // clear all dict data 308 | /// assert!(!instance.has_word("我们"), "The word '我们' should not be in the dictionary after clearing the dictionary"); 309 | /// ``` 310 | pub fn clear(&mut self) { 311 | self.records.clear(); 312 | self.cedar = Cedar::new(); 313 | self.total = 0; 314 | } 315 | 316 | /// Add word to dict, return `freq` 317 | /// 318 | /// `freq`: if `None`, will be given by [suggest_freq](#method.suggest_freq) 319 | /// 320 | /// `tag`: if `None`, will be given `""` 321 | pub fn add_word(&mut self, word: &str, freq: Option, tag: Option<&str>) -> usize { 322 | if word.is_empty() { 323 | return 0; 324 | } 325 | let freq = freq.unwrap_or_else(|| self.suggest_freq(word)); 326 | let tag = tag.unwrap_or(""); 327 | 328 | match self.cedar.exact_match_search(word) { 329 | Some((word_id, _, _)) => { 330 | let old_freq = self.records[word_id as usize].freq; 331 | self.records[word_id as usize].freq = freq; 332 | 333 | self.total += freq; 334 | self.total -= old_freq; 335 | } 336 | None => { 337 | let word_id = self.records.len() as i32; 338 | self.records.push(Record::new(freq, String::from(tag))); 339 | 340 | self.cedar.update(word, word_id); 341 | self.total += freq; 342 | } 343 | }; 344 | 345 | freq 346 | } 347 | 348 | /// Checks if a word exists in the dictionary. 349 | /// 350 | /// # Arguments 351 | /// 352 | /// * `word` - The word to check. 353 | /// 354 | /// # Returns 355 | /// 356 | /// * `bool` - Whether the word exists in the dictionary. 357 | pub fn has_word(&self, word: &str) -> bool { 358 | self.cedar.exact_match_search(word).is_some() 359 | } 360 | 361 | /// Loads a dictionary by adding entries to the existing dictionary rather than resetting it. 362 | /// 363 | /// This function reads from a `BufRead` source, parsing each line as a dictionary entry. Each entry 364 | /// is expected to contain a word, its frequency, and optionally a tag. 365 | /// 366 | /// # Type Parameters 367 | /// 368 | /// * `R`: A type that implements the `BufRead` trait, used for reading lines from the dictionary. 369 | /// 370 | /// # Arguments 371 | /// 372 | /// * `dict` - A mutable reference to a `BufRead` source containing the dictionary entries. 373 | /// 374 | /// # Returns 375 | /// 376 | /// * `Result<(), Error>` - Returns `Ok(())` if the dictionary is successfully loaded; otherwise, 377 | /// returns an error describing what went wrong. 378 | /// 379 | /// # Errors 380 | /// 381 | /// This function will return an error if: 382 | /// * There is an issue reading from the provided `BufRead` source. 383 | /// * A line in the dictionary file contains invalid frequency data (not a valid integer). 384 | pub fn load_dict(&mut self, dict: &mut R) -> Result<(), Error> { 385 | let mut buf = String::new(); 386 | self.total = 0; 387 | 388 | let mut line_no = 0; 389 | while dict.read_line(&mut buf)? 
> 0 { 390 | { 391 | line_no += 1; 392 | let mut iter = buf.split_whitespace(); 393 | if let Some(word) = iter.next() { 394 | let freq = iter 395 | .next() 396 | .map(|x| { 397 | x.parse::().map_err(|e| { 398 | Error::InvalidDictEntry(format!( 399 | "line {} `{}` frequency {} is not a valid integer: {}", 400 | line_no, buf, x, e 401 | )) 402 | }) 403 | }) 404 | .unwrap_or(Ok(0))?; 405 | let tag = iter.next().unwrap_or(""); 406 | 407 | match self.cedar.exact_match_search(word) { 408 | Some((word_id, _, _)) => { 409 | self.records[word_id as usize].freq = freq; 410 | } 411 | None => { 412 | let word_id = self.records.len() as i32; 413 | self.records.push(Record::new(freq, String::from(tag))); 414 | self.cedar.update(word, word_id); 415 | } 416 | }; 417 | } 418 | } 419 | buf.clear(); 420 | } 421 | self.total = self.records.iter().map(|n| n.freq).sum(); 422 | 423 | Ok(()) 424 | } 425 | 426 | fn get_word_freq(&self, word: &str, default: usize) -> usize { 427 | match self.cedar.exact_match_search(word) { 428 | Some((word_id, _, _)) => self.records[word_id as usize].freq, 429 | _ => default, 430 | } 431 | } 432 | 433 | /// Suggest word frequency to force the characters in a word to be joined or split. 434 | pub fn suggest_freq(&self, segment: &str) -> usize { 435 | let logtotal = (self.total as f64).ln(); 436 | let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, word| { 437 | freq + (self.get_word_freq(word, 1) as f64).ln() - logtotal 438 | }); 439 | std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1)) 440 | } 441 | 442 | #[allow(clippy::ptr_arg)] 443 | fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) { 444 | let str_len = sentence.len(); 445 | 446 | if str_len + 1 > route.len() { 447 | route.resize(str_len + 1, (0.0, 0)); 448 | } 449 | 450 | let logtotal = (self.total as f64).ln(); 451 | let mut prev_byte_start = str_len; 452 | let curr = sentence.char_indices().map(|x| x.0).rev(); 453 | for byte_start in curr { 454 | let pair = dag 455 | .iter_edges(byte_start) 456 | .map(|byte_end| { 457 | let wfrag = if byte_end == str_len { 458 | &sentence[byte_start..] 
459 | } else { 460 | &sentence[byte_start..byte_end] 461 | }; 462 | 463 | let freq = if let Some((word_id, _, _)) = self.cedar.exact_match_search(wfrag) { 464 | self.records[word_id as usize].freq 465 | } else { 466 | 1 467 | }; 468 | 469 | ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end) 470 | }) 471 | .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal)); 472 | 473 | if let Some(p) = pair { 474 | route[byte_start] = p; 475 | } else { 476 | let byte_end = prev_byte_start; 477 | let freq = 1; 478 | route[byte_start] = ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end); 479 | } 480 | 481 | prev_byte_start = byte_start; 482 | } 483 | } 484 | 485 | fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) { 486 | for (byte_start, _) in sentence.char_indices().peekable() { 487 | dag.start(byte_start); 488 | let haystack = &sentence[byte_start..]; 489 | 490 | for (_, end_index) in self.cedar.common_prefix_iter(haystack) { 491 | dag.insert(end_index + byte_start + 1); 492 | } 493 | 494 | dag.commit(); 495 | } 496 | } 497 | 498 | fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) { 499 | let str_len = sentence.len(); 500 | let mut dag = StaticSparseDAG::with_size_hint(sentence.len()); 501 | self.dag(sentence, &mut dag); 502 | 503 | let curr = sentence.char_indices().map(|x| x.0); 504 | for byte_start in curr { 505 | for byte_end in dag.iter_edges(byte_start) { 506 | let word = if byte_end == str_len { 507 | &sentence[byte_start..] 508 | } else { 509 | &sentence[byte_start..byte_end] 510 | }; 511 | 512 | words.push(word) 513 | } 514 | } 515 | } 516 | 517 | fn cut_dag_no_hmm<'a>( 518 | &self, 519 | sentence: &'a str, 520 | words: &mut Vec<&'a str>, 521 | route: &mut Vec<(f64, usize)>, 522 | dag: &mut StaticSparseDAG, 523 | ) { 524 | self.dag(sentence, dag); 525 | self.calc(sentence, dag, route); 526 | let mut x = 0; 527 | let mut left: Option = None; 528 | 529 | while x < sentence.len() { 530 | let y = route[x].1; 531 | let l_str = if y < sentence.len() { 532 | &sentence[x..y] 533 | } else { 534 | &sentence[x..] 535 | }; 536 | 537 | if l_str.chars().count() == 1 && l_str.chars().all(|ch| ch.is_ascii_alphanumeric()) { 538 | if left.is_none() { 539 | left = Some(x); 540 | } 541 | } else { 542 | if let Some(byte_start) = left { 543 | let word = &sentence[byte_start..x]; 544 | words.push(word); 545 | left = None; 546 | } 547 | 548 | let word = if y < sentence.len() { 549 | &sentence[x..y] 550 | } else { 551 | &sentence[x..] 552 | }; 553 | 554 | words.push(word); 555 | } 556 | x = y; 557 | } 558 | 559 | if let Some(byte_start) = left { 560 | let word = &sentence[byte_start..]; 561 | words.push(word); 562 | } 563 | 564 | dag.clear(); 565 | route.clear(); 566 | } 567 | 568 | #[allow(non_snake_case, clippy::too_many_arguments)] 569 | fn cut_dag_hmm<'a>( 570 | &self, 571 | sentence: &'a str, 572 | words: &mut Vec<&'a str>, 573 | route: &mut Vec<(f64, usize)>, 574 | dag: &mut StaticSparseDAG, 575 | hmm_context: &mut hmm::HmmContext, 576 | ) { 577 | self.dag(sentence, dag); 578 | self.calc(sentence, dag, route); 579 | let mut x = 0; 580 | let mut left: Option = None; 581 | 582 | while x < sentence.len() { 583 | let y = route[x].1; 584 | 585 | if sentence[x..y].chars().count() == 1 { 586 | if left.is_none() { 587 | left = Some(x); 588 | } 589 | } else { 590 | if let Some(byte_start) = left { 591 | let byte_end = x; 592 | let word = if byte_end < sentence.len() { 593 | &sentence[byte_start..byte_end] 594 | } else { 595 | &sentence[byte_start..] 
596 | }; 597 | 598 | if word.chars().count() == 1 { 599 | words.push(word); 600 | } else if self.cedar.exact_match_search(word).is_none() { 601 | hmm::cut_with_allocated_memory(word, words, hmm_context); 602 | } else { 603 | let mut word_indices = word.char_indices().map(|x| x.0).peekable(); 604 | while let Some(byte_start) = word_indices.next() { 605 | if let Some(byte_end) = word_indices.peek() { 606 | words.push(&word[byte_start..*byte_end]); 607 | } else { 608 | words.push(&word[byte_start..]); 609 | } 610 | } 611 | } 612 | left = None; 613 | } 614 | let word = if y < sentence.len() { 615 | &sentence[x..y] 616 | } else { 617 | &sentence[x..] 618 | }; 619 | words.push(word); 620 | } 621 | x = y; 622 | } 623 | 624 | if let Some(byte_start) = left { 625 | let word = &sentence[byte_start..]; 626 | 627 | if word.chars().count() == 1 { 628 | words.push(word); 629 | } else if self.cedar.exact_match_search(word).is_none() { 630 | hmm::cut(word, words); 631 | } else { 632 | let mut word_indices = word.char_indices().map(|x| x.0).peekable(); 633 | while let Some(byte_start) = word_indices.next() { 634 | if let Some(byte_end) = word_indices.peek() { 635 | words.push(&word[byte_start..*byte_end]); 636 | } else { 637 | words.push(&word[byte_start..]); 638 | } 639 | } 640 | } 641 | } 642 | 643 | dag.clear(); 644 | route.clear(); 645 | } 646 | 647 | #[allow(non_snake_case)] 648 | fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> { 649 | let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT }; 650 | let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT }; 651 | 652 | re_han.with(|re_han| { 653 | re_skip.with(|re_skip| { 654 | let heuristic_capacity = sentence.len() / 2; 655 | let mut words = Vec::with_capacity(heuristic_capacity); 656 | 657 | let splitter = SplitMatches::new(re_han, sentence); 658 | let mut route = Vec::with_capacity(heuristic_capacity); 659 | let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity); 660 | 661 | let mut hmm_context = hmm::HmmContext::default(); 662 | 663 | for state in splitter { 664 | match state { 665 | SplitState::Matched(_) => { 666 | let block = state.into_str(); 667 | assert!(!block.is_empty()); 668 | 669 | if cut_all { 670 | self.cut_all_internal(block, &mut words); 671 | } else if hmm { 672 | self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context); 673 | } else { 674 | self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag); 675 | } 676 | } 677 | SplitState::Unmatched(_) => { 678 | let block = state.into_str(); 679 | assert!(!block.is_empty()); 680 | 681 | let skip_splitter = SplitMatches::new(re_skip, block); 682 | for skip_state in skip_splitter { 683 | let word = skip_state.into_str(); 684 | if word.is_empty() { 685 | continue; 686 | } 687 | if cut_all || re_skip.is_match(word) { 688 | words.push(word); 689 | } else { 690 | let mut word_indices = word.char_indices().map(|x| x.0).peekable(); 691 | while let Some(byte_start) = word_indices.next() { 692 | if let Some(byte_end) = word_indices.peek() { 693 | words.push(&word[byte_start..*byte_end]); 694 | } else { 695 | words.push(&word[byte_start..]); 696 | } 697 | } 698 | } 699 | } 700 | } 701 | } 702 | } 703 | words 704 | }) 705 | }) 706 | } 707 | 708 | /// Cut the input text 709 | /// 710 | /// ## Params 711 | /// 712 | /// `sentence`: input text 713 | /// 714 | /// `hmm`: enable HMM or not 715 | pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> { 716 | self.cut_internal(sentence, false, 
hmm) 717 | } 718 | 719 | /// Cut the input text, return all possible words 720 | /// 721 | /// ## Params 722 | /// 723 | /// `sentence`: input text 724 | pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<&'a str> { 725 | self.cut_internal(sentence, true, false) 726 | } 727 | 728 | /// Cut the input text in search mode 729 | /// 730 | /// ## Params 731 | /// 732 | /// `sentence`: input text 733 | /// 734 | /// `hmm`: enable HMM or not 735 | pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> { 736 | let words = self.cut(sentence, hmm); 737 | let mut new_words = Vec::with_capacity(words.len()); 738 | for word in words { 739 | let char_indices: Vec = word.char_indices().map(|x| x.0).collect(); 740 | let char_count = char_indices.len(); 741 | if char_count > 2 { 742 | for i in 0..char_count - 1 { 743 | let byte_start = char_indices[i]; 744 | let gram2 = if i + 2 < char_count { 745 | &word[byte_start..char_indices[i + 2]] 746 | } else { 747 | &word[byte_start..] 748 | }; 749 | if self.cedar.exact_match_search(gram2).is_some() { 750 | new_words.push(gram2); 751 | } 752 | } 753 | } 754 | if char_count > 3 { 755 | for i in 0..char_count - 2 { 756 | let byte_start = char_indices[i]; 757 | let gram3 = if i + 3 < char_count { 758 | &word[byte_start..char_indices[i + 3]] 759 | } else { 760 | &word[byte_start..] 761 | }; 762 | if self.cedar.exact_match_search(gram3).is_some() { 763 | new_words.push(gram3); 764 | } 765 | } 766 | } 767 | new_words.push(word); 768 | } 769 | new_words 770 | } 771 | 772 | /// Tokenize 773 | /// 774 | /// ## Params 775 | /// 776 | /// `sentence`: input text 777 | /// 778 | /// `mode`: tokenize mode 779 | /// 780 | /// `hmm`: enable HMM or not 781 | pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec> { 782 | let words = self.cut(sentence, hmm); 783 | let mut tokens = Vec::with_capacity(words.len()); 784 | let mut start = 0; 785 | match mode { 786 | TokenizeMode::Default => { 787 | for word in words { 788 | let width = word.chars().count(); 789 | tokens.push(Token { 790 | word, 791 | start, 792 | end: start + width, 793 | }); 794 | start += width; 795 | } 796 | } 797 | TokenizeMode::Search => { 798 | for word in words { 799 | let width = word.chars().count(); 800 | if width > 2 { 801 | let char_indices: Vec = word.char_indices().map(|x| x.0).collect(); 802 | for i in 0..width - 1 { 803 | let byte_start = char_indices[i]; 804 | let gram2 = if i + 2 < width { 805 | &word[byte_start..char_indices[i + 2]] 806 | } else { 807 | &word[byte_start..] 808 | }; 809 | if self.cedar.exact_match_search(gram2).is_some() { 810 | tokens.push(Token { 811 | word: gram2, 812 | start: start + i, 813 | end: start + i + 2, 814 | }); 815 | } 816 | } 817 | if width > 3 { 818 | for i in 0..width - 2 { 819 | let byte_start = char_indices[i]; 820 | let gram3 = if i + 3 < width { 821 | &word[byte_start..char_indices[i + 3]] 822 | } else { 823 | &word[byte_start..] 
824 | }; 825 | if self.cedar.exact_match_search(gram3).is_some() { 826 | tokens.push(Token { 827 | word: gram3, 828 | start: start + i, 829 | end: start + i + 3, 830 | }); 831 | } 832 | } 833 | } 834 | } 835 | tokens.push(Token { 836 | word, 837 | start, 838 | end: start + width, 839 | }); 840 | start += width; 841 | } 842 | } 843 | } 844 | tokens 845 | } 846 | 847 | /// Tag the input text 848 | /// 849 | /// ## Params 850 | /// 851 | /// `sentence`: input text 852 | /// 853 | /// `hmm`: enable HMM or not 854 | pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec> { 855 | let words = self.cut(sentence, hmm); 856 | words 857 | .into_iter() 858 | .map(|word| { 859 | if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) { 860 | let t = &self.records[word_id as usize].tag; 861 | return Tag { word, tag: t }; 862 | } 863 | let mut eng = 0; 864 | let mut m = 0; 865 | for chr in word.chars() { 866 | if chr.is_ascii_alphanumeric() { 867 | eng += 1; 868 | if chr.is_ascii_digit() { 869 | m += 1; 870 | } 871 | } 872 | } 873 | let tag = if eng == 0 { 874 | "x" 875 | } else if eng == m { 876 | "m" 877 | } else { 878 | "eng" 879 | }; 880 | Tag { word, tag } 881 | }) 882 | .collect() 883 | } 884 | } 885 | 886 | #[cfg(test)] 887 | mod tests { 888 | use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, RE_HAN_DEFAULT}; 889 | use std::io::BufReader; 890 | 891 | #[test] 892 | fn test_init_with_default_dict() { 893 | let _ = Jieba::new(); 894 | } 895 | 896 | #[test] 897 | fn test_has_word() { 898 | let jieba = Jieba::new(); 899 | assert!(jieba.has_word("中国")); 900 | assert!(jieba.has_word("开源")); 901 | assert!(!jieba.has_word("不存在的词")); 902 | } 903 | 904 | #[test] 905 | fn test_split_matches() { 906 | RE_HAN_DEFAULT.with(|re_han| { 907 | let splitter = SplitMatches::new( 908 | re_han, 909 | "👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍", 910 | ); 911 | for state in splitter { 912 | match state { 913 | SplitState::Matched(_) => { 914 | let block = state.into_str(); 915 | assert!(!block.is_empty()); 916 | } 917 | SplitState::Unmatched(_) => { 918 | let block = state.into_str(); 919 | assert!(!block.is_empty()); 920 | } 921 | } 922 | } 923 | }); 924 | } 925 | 926 | #[test] 927 | fn test_split_matches_against_unicode_sip() { 928 | RE_HAN_DEFAULT.with(|re_han| { 929 | let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦"); 930 | 931 | let result: Vec<&str> = splitter.map(|x| x.into_str()).collect(); 932 | assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]); 933 | }); 934 | } 935 | 936 | #[test] 937 | fn test_cut_all() { 938 | let jieba = Jieba::new(); 939 | let words = jieba.cut_all("abc网球拍卖会def"); 940 | assert_eq!( 941 | words, 942 | vec![ 943 | "abc", 944 | "网", 945 | "网球", 946 | "网球拍", 947 | "球", 948 | "球拍", 949 | "拍", 950 | "拍卖", 951 | "拍卖会", 952 | "卖", 953 | "会", 954 | "def" 955 | ] 956 | ); 957 | 958 | // The cut_all from the python de-facto implementation is loosely defined, 959 | // And the answer "我, 来到, 北京, 清华, 清华大学, 华大, 大学" from the python implementation looks weird since it drops the single character word even though it is part of the DAG candidates. 
960 | // For example, it includes "华大" but it doesn't include "清" and "学" 961 | let words = jieba.cut_all("我来到北京清华大学"); 962 | assert_eq!( 963 | words, 964 | vec![ 965 | "我", 966 | "来", 967 | "来到", 968 | "到", 969 | "北", 970 | "北京", 971 | "京", 972 | "清", 973 | "清华", 974 | "清华大学", 975 | "华", 976 | "华大", 977 | "大", 978 | "大学", 979 | "学" 980 | ] 981 | ); 982 | } 983 | 984 | #[test] 985 | fn test_cut_no_hmm() { 986 | let jieba = Jieba::new(); 987 | let words = jieba.cut("abc网球拍卖会def", false); 988 | assert_eq!(words, vec!["abc", "网球", "拍卖会", "def"]); 989 | } 990 | 991 | #[test] 992 | fn test_cut_with_hmm() { 993 | let jieba = Jieba::new(); 994 | let words = jieba.cut("我们中出了一个叛徒", false); 995 | assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]); 996 | let words = jieba.cut("我们中出了一个叛徒", true); 997 | assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]); 998 | let words = jieba.cut("我们中出了一个叛徒👪", true); 999 | assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒", "👪"]); 1000 | 1001 | let words = jieba.cut("我来到北京清华大学", true); 1002 | assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]); 1003 | 1004 | let words = jieba.cut("他来到了网易杭研大厦", true); 1005 | assert_eq!(words, vec!["他", "来到", "了", "网易", "杭研", "大厦"]); 1006 | } 1007 | 1008 | #[test] 1009 | fn test_cut_weicheng() { 1010 | static WEICHENG_TXT: &str = include_str!("../examples/weicheng/src/weicheng.txt"); 1011 | let jieba = Jieba::new(); 1012 | for line in WEICHENG_TXT.split('\n') { 1013 | let _ = jieba.cut(line, true); 1014 | } 1015 | } 1016 | 1017 | #[test] 1018 | fn test_cut_for_search() { 1019 | let jieba = Jieba::new(); 1020 | let words = jieba.cut_for_search("南京市长江大桥", true); 1021 | assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]); 1022 | 1023 | let words = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true); 1024 | 1025 | // The python implementation silently filtered ",". 
but we includes it here in the output 1026 | // to let the library user to decide their own filtering strategy 1027 | assert_eq!( 1028 | words, 1029 | vec![ 1030 | "小明", 1031 | "硕士", 1032 | "毕业", 1033 | "于", 1034 | "中国", 1035 | "科学", 1036 | "学院", 1037 | "科学院", 1038 | "中国科学院", 1039 | "计算", 1040 | "计算所", 1041 | ",", 1042 | "后", 1043 | "在", 1044 | "日本", 1045 | "京都", 1046 | "大学", 1047 | "日本京都大学", 1048 | "深造" 1049 | ] 1050 | ); 1051 | } 1052 | 1053 | #[test] 1054 | fn test_tag() { 1055 | let jieba = Jieba::new(); 1056 | let tags = jieba.tag( 1057 | "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。", 1058 | true, 1059 | ); 1060 | assert_eq!( 1061 | tags, 1062 | vec![ 1063 | Tag { word: "我", tag: "r" }, 1064 | Tag { word: "是", tag: "v" }, 1065 | Tag { 1066 | word: "拖拉机", 1067 | tag: "n" 1068 | }, 1069 | Tag { 1070 | word: "学院", tag: "n" 1071 | }, 1072 | Tag { 1073 | word: "手扶拖拉机", 1074 | tag: "n" 1075 | }, 1076 | Tag { 1077 | word: "专业", tag: "n" 1078 | }, 1079 | Tag { word: "的", tag: "uj" }, 1080 | Tag { word: "。", tag: "x" }, 1081 | Tag { 1082 | word: "不用", tag: "v" 1083 | }, 1084 | Tag { 1085 | word: "多久", tag: "m" 1086 | }, 1087 | Tag { word: ",", tag: "x" }, 1088 | Tag { word: "我", tag: "r" }, 1089 | Tag { word: "就", tag: "d" }, 1090 | Tag { word: "会", tag: "v" }, 1091 | Tag { 1092 | word: "升职", tag: "v" 1093 | }, 1094 | Tag { 1095 | word: "加薪", 1096 | tag: "nr" 1097 | }, 1098 | Tag { word: ",", tag: "x" }, 1099 | Tag { 1100 | word: "当上", tag: "t" 1101 | }, 1102 | Tag { 1103 | word: "CEO", 1104 | tag: "eng" 1105 | }, 1106 | Tag { word: ",", tag: "x" }, 1107 | Tag { 1108 | word: "走上", tag: "v" 1109 | }, 1110 | Tag { 1111 | word: "人生", tag: "n" 1112 | }, 1113 | Tag { 1114 | word: "巅峰", tag: "n" 1115 | }, 1116 | Tag { word: "。", tag: "x" } 1117 | ] 1118 | ); 1119 | 1120 | let tags = jieba.tag("今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。", true); 1121 | assert_eq!( 1122 | tags, 1123 | vec![ 1124 | Tag { 1125 | word: "今天", tag: "t" 1126 | }, 1127 | Tag { 1128 | word: "纽约", 1129 | tag: "ns" 1130 | }, 1131 | Tag { word: "的", tag: "uj" }, 1132 | Tag { 1133 | word: "天气", tag: "n" 1134 | }, 1135 | Tag { 1136 | word: "真好", tag: "d" 1137 | }, 1138 | Tag { word: "啊", tag: "zg" }, 1139 | Tag { word: ",", tag: "x" }, 1140 | Tag { 1141 | word: "京华", 1142 | tag: "nz" 1143 | }, 1144 | Tag { 1145 | word: "大酒店", 1146 | tag: "n" 1147 | }, 1148 | Tag { word: "的", tag: "uj" }, 1149 | Tag { 1150 | word: "张尧", tag: "x" 1151 | }, // XXX: missing in dict 1152 | Tag { 1153 | word: "经理", tag: "n" 1154 | }, 1155 | Tag { word: "吃", tag: "v" }, 1156 | Tag { word: "了", tag: "ul" }, 1157 | Tag { 1158 | word: "一只", tag: "m" 1159 | }, 1160 | Tag { 1161 | word: "北京烤鸭", 1162 | tag: "n" 1163 | }, 1164 | Tag { word: "。", tag: "x" } 1165 | ] 1166 | ); 1167 | } 1168 | 1169 | #[test] 1170 | fn test_tokenize() { 1171 | let jieba = Jieba::new(); 1172 | let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false); 1173 | assert_eq!( 1174 | tokens, 1175 | vec![ 1176 | Token { 1177 | word: "南京市", 1178 | start: 0, 1179 | end: 3 1180 | }, 1181 | Token { 1182 | word: "长江大桥", 1183 | start: 3, 1184 | end: 7 1185 | } 1186 | ] 1187 | ); 1188 | 1189 | let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false); 1190 | assert_eq!( 1191 | tokens, 1192 | vec![ 1193 | Token { 1194 | word: "南京", 1195 | start: 0, 1196 | end: 2 1197 | }, 1198 | Token { 1199 | word: "京市", 1200 | start: 1, 1201 | end: 3 1202 | }, 1203 | Token { 1204 | word: "南京市", 1205 | start: 0, 1206 | end: 3 1207 | }, 1208 | Token { 1209 | word: "长江", 1210 | start: 3, 1211 | end: 5 1212 | 
}, 1213 | Token { 1214 | word: "大桥", 1215 | start: 5, 1216 | end: 7 1217 | }, 1218 | Token { 1219 | word: "长江大桥", 1220 | start: 3, 1221 | end: 7 1222 | } 1223 | ] 1224 | ); 1225 | 1226 | let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false); 1227 | assert_eq!( 1228 | tokens, 1229 | vec![ 1230 | Token { 1231 | word: "我们", 1232 | start: 0, 1233 | end: 2 1234 | }, 1235 | Token { 1236 | word: "中", 1237 | start: 2, 1238 | end: 3 1239 | }, 1240 | Token { 1241 | word: "出", 1242 | start: 3, 1243 | end: 4 1244 | }, 1245 | Token { 1246 | word: "了", 1247 | start: 4, 1248 | end: 5 1249 | }, 1250 | Token { 1251 | word: "一个", 1252 | start: 5, 1253 | end: 7 1254 | }, 1255 | Token { 1256 | word: "叛徒", 1257 | start: 7, 1258 | end: 9 1259 | } 1260 | ] 1261 | ); 1262 | let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true); 1263 | assert_eq!( 1264 | tokens, 1265 | vec![ 1266 | Token { 1267 | word: "我们", 1268 | start: 0, 1269 | end: 2 1270 | }, 1271 | Token { 1272 | word: "中出", 1273 | start: 2, 1274 | end: 4 1275 | }, 1276 | Token { 1277 | word: "了", 1278 | start: 4, 1279 | end: 5 1280 | }, 1281 | Token { 1282 | word: "一个", 1283 | start: 5, 1284 | end: 7 1285 | }, 1286 | Token { 1287 | word: "叛徒", 1288 | start: 7, 1289 | end: 9 1290 | } 1291 | ] 1292 | ); 1293 | 1294 | let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true); 1295 | assert_eq!( 1296 | tokens, 1297 | vec![ 1298 | Token { 1299 | word: "永和", 1300 | start: 0, 1301 | end: 2 1302 | }, 1303 | Token { 1304 | word: "服装", 1305 | start: 2, 1306 | end: 4 1307 | }, 1308 | Token { 1309 | word: "饰品", 1310 | start: 4, 1311 | end: 6 1312 | }, 1313 | Token { 1314 | word: "有限公司", 1315 | start: 6, 1316 | end: 10 1317 | } 1318 | ] 1319 | ); 1320 | } 1321 | 1322 | #[test] 1323 | fn test_userdict() { 1324 | let mut jieba = Jieba::new(); 1325 | let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false); 1326 | assert_eq!( 1327 | tokens, 1328 | vec![ 1329 | Token { 1330 | word: "我们", 1331 | start: 0, 1332 | end: 2 1333 | }, 1334 | Token { 1335 | word: "中", 1336 | start: 2, 1337 | end: 3 1338 | }, 1339 | Token { 1340 | word: "出", 1341 | start: 3, 1342 | end: 4 1343 | }, 1344 | Token { 1345 | word: "了", 1346 | start: 4, 1347 | end: 5 1348 | }, 1349 | Token { 1350 | word: "一个", 1351 | start: 5, 1352 | end: 7 1353 | }, 1354 | Token { 1355 | word: "叛徒", 1356 | start: 7, 1357 | end: 9 1358 | } 1359 | ] 1360 | ); 1361 | let userdict = "中出 10000"; 1362 | jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap(); 1363 | let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false); 1364 | assert_eq!( 1365 | tokens, 1366 | vec![ 1367 | Token { 1368 | word: "我们", 1369 | start: 0, 1370 | end: 2 1371 | }, 1372 | Token { 1373 | word: "中出", 1374 | start: 2, 1375 | end: 4 1376 | }, 1377 | Token { 1378 | word: "了", 1379 | start: 4, 1380 | end: 5 1381 | }, 1382 | Token { 1383 | word: "一个", 1384 | start: 5, 1385 | end: 7 1386 | }, 1387 | Token { 1388 | word: "叛徒", 1389 | start: 7, 1390 | end: 9 1391 | } 1392 | ] 1393 | ); 1394 | } 1395 | 1396 | #[test] 1397 | fn test_userdict_hmm() { 1398 | let mut jieba = Jieba::new(); 1399 | let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true); 1400 | assert_eq!( 1401 | tokens, 1402 | vec![ 1403 | Token { 1404 | word: "我们", 1405 | start: 0, 1406 | end: 2 1407 | }, 1408 | Token { 1409 | word: "中出", 1410 | start: 2, 1411 | end: 4 1412 | }, 1413 | Token { 1414 | word: "了", 1415 | start: 4, 1416 | end: 5 1417 | }, 1418 | Token { 1419 | word: "一个", 1420 | start: 
5, 1421 | end: 7 1422 | }, 1423 | Token { 1424 | word: "叛徒", 1425 | start: 7, 1426 | end: 9 1427 | } 1428 | ] 1429 | ); 1430 | let userdict = "出了 10000"; 1431 | jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap(); 1432 | let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true); 1433 | assert_eq!( 1434 | tokens, 1435 | vec![ 1436 | Token { 1437 | word: "我们", 1438 | start: 0, 1439 | end: 2 1440 | }, 1441 | Token { 1442 | word: "中", 1443 | start: 2, 1444 | end: 3 1445 | }, 1446 | Token { 1447 | word: "出了", 1448 | start: 3, 1449 | end: 5 1450 | }, 1451 | Token { 1452 | word: "一个", 1453 | start: 5, 1454 | end: 7 1455 | }, 1456 | Token { 1457 | word: "叛徒", 1458 | start: 7, 1459 | end: 9 1460 | } 1461 | ] 1462 | ); 1463 | } 1464 | 1465 | #[test] 1466 | fn test_userdict_error() { 1467 | let mut jieba = Jieba::empty(); 1468 | let userdict = "出了 not_a_int"; 1469 | let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes())); 1470 | assert!(ret.is_err()); 1471 | } 1472 | 1473 | #[test] 1474 | fn test_suggest_freq() { 1475 | // NOTE: Following behaviors are aligned with original Jieba 1476 | 1477 | let mut jieba = Jieba::new(); 1478 | // These values were calculated by original Jieba 1479 | assert_eq!(jieba.suggest_freq("中出"), 348); 1480 | assert_eq!(jieba.suggest_freq("出了"), 1263); 1481 | 1482 | // Freq in dict.txt was 3, which became 300 after loading user dict 1483 | let userdict = "中出 300"; 1484 | jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap(); 1485 | // But it's less than calculated freq 348 1486 | assert_eq!(jieba.suggest_freq("中出"), 348); 1487 | 1488 | let userdict = "中出 500"; 1489 | jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap(); 1490 | // Now it's significant enough 1491 | assert_eq!(jieba.suggest_freq("中出"), 500) 1492 | } 1493 | 1494 | #[test] 1495 | fn test_custom_lower_freq() { 1496 | let mut jieba = Jieba::new(); 1497 | 1498 | jieba.add_word("测试", Some(2445), None); 1499 | jieba.add_word("测试", Some(10), None); 1500 | let words = jieba.cut("测试", false); 1501 | assert_eq!(words, vec!["测试"]); 1502 | } 1503 | 1504 | #[test] 1505 | fn test_cut_dag_no_hmm_against_string_with_sip() { 1506 | let mut jieba = Jieba::empty(); 1507 | 1508 | //add fake word into dictionary 1509 | jieba.add_word("䶴䶵𦡦", Some(1000), None); 1510 | jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None); 1511 | 1512 | let words = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false); 1513 | assert_eq!(words, vec!["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]); 1514 | } 1515 | 1516 | #[test] 1517 | fn test_add_custom_word_with_underscrore() { 1518 | let mut jieba = Jieba::empty(); 1519 | jieba.add_word("田-女士", Some(42), Some("n")); 1520 | let words = jieba.cut("市民田-女士急匆匆", false); 1521 | assert_eq!(words, vec!["市", "民", "田-女士", "急", "匆", "匆"]); 1522 | } 1523 | } 1524 | -------------------------------------------------------------------------------- /src/sparse_dag.rs: -------------------------------------------------------------------------------- 1 | use crate::FxHashMap as HashMap; 2 | 3 | pub(crate) struct StaticSparseDAG { 4 | array: Vec, 5 | start_pos: HashMap, 6 | size_hint_for_iterator: usize, 7 | curr_insertion_len: usize, 8 | } 9 | 10 | pub struct EdgeIter<'a> { 11 | dag: &'a StaticSparseDAG, 12 | cursor: usize, 13 | } 14 | 15 | impl Iterator for EdgeIter<'_> { 16 | type Item = usize; 17 | 18 | fn size_hint(&self) -> (usize, Option) { 19 | (0, Some(self.dag.size_hint_for_iterator)) 20 | } 21 | 22 | fn next(&mut self) -> Option { 23 | if self.dag.array[self.cursor] == 0 { 24 | self.cursor += 1; 25 | None 26 
| } else { 27 | let v = self.dag.array[self.cursor] - 1; 28 | self.cursor += 1; 29 | Some(v) 30 | } 31 | } 32 | } 33 | 34 | impl StaticSparseDAG { 35 | pub(crate) fn with_size_hint(hint: usize) -> Self { 36 | StaticSparseDAG { 37 | array: Vec::with_capacity(hint * 5), 38 | start_pos: HashMap::default(), 39 | size_hint_for_iterator: 0, 40 | curr_insertion_len: 0, 41 | } 42 | } 43 | 44 | #[inline] 45 | pub(crate) fn start(&mut self, from: usize) { 46 | let idx = self.array.len(); 47 | self.curr_insertion_len = 0; 48 | self.start_pos.insert(from, idx); 49 | } 50 | 51 | #[inline] 52 | pub(crate) fn insert(&mut self, to: usize) { 53 | self.curr_insertion_len += 1; 54 | self.array.push(to + 1); 55 | } 56 | 57 | #[inline] 58 | pub(crate) fn commit(&mut self) { 59 | self.size_hint_for_iterator = std::cmp::max(self.curr_insertion_len, self.size_hint_for_iterator); 60 | self.array.push(0); 61 | } 62 | 63 | #[inline] 64 | pub(crate) fn iter_edges(&self, from: usize) -> EdgeIter { 65 | let cursor = self.start_pos.get(&from).unwrap().to_owned(); 66 | 67 | EdgeIter { dag: self, cursor } 68 | } 69 | 70 | pub(crate) fn clear(&mut self) { 71 | self.array.clear(); 72 | self.start_pos.clear(); 73 | } 74 | } 75 | 76 | #[cfg(test)] 77 | mod tests { 78 | use super::*; 79 | 80 | #[test] 81 | fn test_static_sparse_dag() { 82 | let mut dag = StaticSparseDAG::with_size_hint(5); 83 | let mut ans: Vec> = vec![Vec::new(); 5]; 84 | for i in 0..=3 { 85 | dag.start(i); 86 | for j in (i + 1)..=4 { 87 | ans[i].push(j); 88 | dag.insert(j); 89 | } 90 | 91 | dag.commit() 92 | } 93 | 94 | assert_eq!(dag.size_hint_for_iterator, 4); 95 | 96 | for i in 0..=3 { 97 | let edges: Vec = dag.iter_edges(i).collect(); 98 | assert_eq!(ans[i], edges); 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /tests/test_wasm.rs: -------------------------------------------------------------------------------- 1 | #![cfg(target_arch = "wasm32")] 2 | 3 | use jieba_rs::Jieba; 4 | use wasm_bindgen_test::*; 5 | 6 | #[wasm_bindgen_test] 7 | fn test_jieba_cut() { 8 | let jieba = Jieba::new(); 9 | let words = jieba.cut("我们中出了一个叛徒", false); 10 | assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]); 11 | } 12 | --------------------------------------------------------------------------------
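A closing note on `src/sparse_dag.rs` above: `StaticSparseDAG` stores every edge list in one flat array. `start(from)` remembers the current end of `array` in `start_pos`, `insert(to)` pushes `to + 1` (so `0` can never be a real edge), and `commit()` appends a `0` sentinel that terminates the run; `iter_edges(from)` simply walks from the remembered offset until it reads the sentinel. A minimal hand-built sketch of that layout (standalone illustration, not crate API):

fn main() {
    // Edges 0 -> 1, 0 -> 3 and 1 -> 3 encoded the way StaticSparseDAG lays them out:
    // each edge is stored as `to + 1`, and a 0 terminates the edge list of one start offset.
    let array = vec![2usize, 4, 0, 4, 0];
    let start_pos = [(0usize, 0usize), (1, 3)]; // start offset -> cursor into `array`
    for (from, mut cursor) in start_pos {
        while array[cursor] != 0 {
            println!("edge {} -> {}", from, array[cursor] - 1); // prints 0->1, 0->3, 1->3
            cursor += 1;
        }
    }
}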