├── .github
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── periodic.yml
│       ├── regression.yml
│       └── release.yml
├── .gitignore
├── AUTHORS
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── benches
│   └── bench.rs
├── examples
│   ├── cc-cedict.rs
│   ├── ipadic.rs
│   ├── ko-dic.rs
│   ├── lindera.yml
│   ├── tokenize_with_config.rs
│   └── unidic.rs
└── src
    ├── lib.rs
    ├── stream.rs
    └── tokenizer.rs

--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: mosuka
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates

version: 2
updates:
  - package-ecosystem: "cargo" # See documentation for possible values
    directory: "/" # Location of package manifests
    schedule:
      interval: "daily"

--------------------------------------------------------------------------------
/.github/workflows/periodic.yml:
--------------------------------------------------------------------------------
name: Periodic

on:
  schedule:
    - cron: 0 0 * * SUN

jobs:
  test:
    name: Test
    strategy:
      matrix:
        os: [ubuntu-latest, macOS-latest, windows-latest]
        toolchain: [stable, beta, nightly]
        features: ["ipadic", "ko-dic", "cc-cedict"]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.toolchain }}
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: test
          args: --features "${{ matrix.features }}"
--------------------------------------------------------------------------------
/.github/workflows/regression.yml:
--------------------------------------------------------------------------------
name: Regression

on:
  push:
    branches:
      - master
  pull_request:
    types: [opened, synchronize]

jobs:
  check:
    name: Check
    strategy:
      matrix:
        os: [ubuntu-latest]
        toolchain: [stable]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.toolchain }}
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: check

  test:
    name: Test
    strategy:
      matrix:
        os: [ubuntu-latest, macOS-latest, windows-latest]
        toolchain: [stable]
        features: ["ipadic", "ko-dic", "cc-cedict"]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.toolchain }}
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: test
          args: --features "${{ matrix.features }}"

  fmt:
    name: Format
    strategy:
      matrix:
        os: [ubuntu-latest]
        toolchain: [stable]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.toolchain }}
          override: true
      - run: rustup component add rustfmt
      - uses: actions-rs/cargo@v1
        with:
          command: fmt
          args: --all -- --check

--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
name: Release

on:
  workflow_dispatch:
  push:
    tags:
      - "v*.*.*"

jobs:
  create-release:
    name: Upload artifact
    runs-on: ubuntu-latest
    steps:
      - id: create-release
        uses: softprops/action-gh-release@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          name: Release ${{ github.ref_name }}
          tag_name: ${{ github.ref }}
          draft: false
          prerelease: false
          generate_release_notes: true

  publish-crates:
    name: Publish crate
    strategy:
      matrix:
        os: [ubuntu-latest]
        toolchain: [stable]
    needs: [create-release]
    runs-on: ${{ matrix.os }}
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.toolchain }}
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: publish
          args: --token ${{ secrets.CRATES_TOKEN }}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.vscode
.idea
.DS_Store

target
perf.data*

Cargo.lock

--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
# This is the list of authors of lindera-tantivy for copyright purposes.
Minoru Osuka
@ken0x0a
Jun Ohtani
Koichi Akabe
François Massot

--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
[package]
name = "lindera-tantivy"
version = "0.43.1"
edition = "2021"
description = "Lindera Tokenizer for Tantivy."
documentation = "https://docs.rs/lindera-tantivy"
homepage = "https://github.com/lindera/lindera-tantivy"
repository = "https://github.com/lindera/lindera-tantivy"
readme = "README.md"
keywords = ["tokenizer", "tantivy", "lindera"]
categories = ["text-processing"]
license = "MIT"

[features]
default = [] # No directories included
ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese)
ipadic-neologd = [
    "lindera/ipadic-neologd",
] # Include IPADIC NEologd dictionary (Japanese)
unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese)
ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean)
cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese)
compress = ["lindera/compress"] # Compress dictionaries

[dependencies]
tantivy-tokenizer-api = "0.5.0"
tantivy = "0.24.1"

lindera = "0.43.1"

[dev-dependencies]
criterion = { version = "0.6.0", features = ["html_reports"] }
tantivy = "0.24.0"

[[bench]]
name = "bench"
harness = false

[profile.release]
lto = true

# Make sure that the build scripts and proc-macros are compiled with
# all the optimizations. It speeds up the flate2 crate that we use in our build scripts.
[profile.dev.build-override]
opt-level = 3
[profile.release.build-override]
opt-level = 3
[profile.bench.build-override]
opt-level = 3
[profile.test.build-override]
opt-level = 3

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 by the project authors, as listed in the AUTHORS file.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
LINDERA_TANTIVY_VERSION ?= $(shell cargo metadata --no-deps --format-version=1 | jq -r '.packages[] | select(.name=="lindera-tantivy") | .version')

.DEFAULT_GOAL := help

clean: ## Clean the project
	cargo clean

format: ## Format the code
	cargo fmt

test: ## Run tests
	cargo test

tag: ## Make a new tag for the current version
	git tag v$(LINDERA_TANTIVY_VERSION)
	git push origin v$(LINDERA_TANTIVY_VERSION)

publish: ## Publish the crate to crates.io
ifeq ($(shell curl -s -XGET https://crates.io/api/v1/crates/lindera-tantivy | jq -r '.versions[].num' | grep $(LINDERA_TANTIVY_VERSION)),)
	cargo package && cargo publish
endif

help: ## Show help
	@echo "Available targets:"
	@grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lindera tokenizer for Tantivy

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

[Lindera](https://github.com/lindera/lindera) Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy).

## Usage

Make sure you have enabled the dictionary features you need for Lindera in your Cargo.toml.
The following example enables IPADIC.

```toml
[dependencies]
lindera = "0.43"
lindera-tantivy = { version = "0.43", features = ["ipadic"] }
```
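
The feature names mirror the dictionaries listed in `Cargo.toml` (`ipadic`, `ipadic-neologd`, `unidic`, `ko-dic`, `cc-cedict`), so the same pattern works for the other languages. For example, a hypothetical project indexing Korean text would enable `ko-dic` instead:

```toml
[dependencies]
lindera = "0.43"
lindera-tantivy = { version = "0.43", features = ["ko-dic"] }
```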
### Basic example

```rust
fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera::dictionary::DictionaryKind;
    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Tokenizer with IPADIC
    let mode = Mode::Normal;
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
    let user_dictionary = None;
    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

    // register Lindera tokenizer
    index.tokenizers().register("lang_ja", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "成田国際空港",
        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "東京国際空港",
        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "関西国際空港",
        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "東京";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}
```

### Config by YAML
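
`LinderaTokenizer::from_file` builds the whole pipeline (segmenter, character filters, and token filters) from a YAML file. The example below loads `examples/lindera.yml` from this repository; an excerpt of that file shows the shape of the config:

```yaml
segmenter:
  mode: "normal"
  dictionary:
    kind: "ipadic"

character_filters:
  - kind: "unicode_normalize"
    args:
      kind: "nfkc"

token_filters:
  - kind: "lowercase"
    args: {}
```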
```rust
use std::path::PathBuf;

fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Build tokenizer with config file
    let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("./examples")
        .join("lindera.yml");
    let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;

    // register Lindera tokenizer
    index.tokenizers().register("lang_ja", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "成田国際空港",
        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "東京国際空港",
        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "関西国際空港",
        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "TOKYO";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    println!("Parsed Query: {:?}", query);
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}
```
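
Since `lindera.yml` selects the IPADIC dictionary, this config example only works when the `ipadic` feature is compiled in. From a checkout of this repository, a standard Cargo invocation would be something like:

```sh
cargo run --example tokenize_with_config --features ipadic
```

Note that the query string here is `TOKYO`: the `lowercase` token filter in the config normalizes both indexed and query tokens, which is how an upper-case query can match the "Tokyo International Airport" text in the stored documents.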
## API reference

The API reference is available. Please see the following URL:

- [lindera-tantivy](https://docs.rs/lindera-tantivy)

--------------------------------------------------------------------------------
/benches/bench.rs:
--------------------------------------------------------------------------------
use criterion::Criterion;
use criterion::{criterion_group, criterion_main};

#[cfg(feature = "ipadic")]
fn bench_indexing(c: &mut Criterion) {
    use lindera::dictionary::load_dictionary_from_kind;
    use lindera::segmenter::Segmenter;
    use tantivy::doc;
    use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
    use tantivy::Index;

    use lindera::dictionary::DictionaryKind;
    use lindera::mode::Mode;
    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add text field
    let text = schema_builder.add_text_field(
        "text",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Test document set.
    let mut docs = Vec::new();
    for i in 0..1000 {
        let doc = doc!(
            id => format!("doc-{}", i),
            text => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である[1]。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
        );
        docs.push(doc);
    }

    let mode = Mode::Normal;
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
    let user_dictionary = None;
    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

    // register Lindera tokenizer
    index.tokenizers().register("lang_ja", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000).unwrap();

    // Use a benchmark group so the sample size can be changed
    let mut group = c.benchmark_group("indexing");
    group.sample_size(100);
    group.bench_function("bench-indexing", |b| {
        b.iter(|| {
            for doc in docs.iter() {
                index_writer.add_document(doc.clone()).unwrap();
            }
        });

        // commit
        index_writer.commit().unwrap();
    });
    group.finish();
}

#[cfg(not(feature = "ipadic"))]
fn bench_indexing(_c: &mut Criterion) {}

criterion_group!(benches, bench_indexing,);
criterion_main!(benches);

--------------------------------------------------------------------------------
/examples/cc-cedict.rs:
--------------------------------------------------------------------------------
#[cfg(feature = "cc-cedict")]
fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera::dictionary::DictionaryKind;
    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_zh")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_zh")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Tokenizer with CC-CEDICT
    let mode = Mode::Normal;
    let dictionary = load_dictionary_from_kind(DictionaryKind::CcCedict).unwrap();
    let user_dictionary = None;
    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

    // register Lindera tokenizer
    index.tokenizers().register("lang_zh", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "成田国际机场",
        body => "成田國際機場(日语:成田国際空港/なりたこくさいくうこう Narita Kokusai Kūkō */?;IATA代码:NRT;ICAO代码:RJAA),通稱成田機場(成田空港),原名新東京國際機場(新東京国際空港/しんとうきょうこくさいくうこう Shin-Tōkyō Kokusai Kūkō),是位於日本千葉縣成田市的國際機場,與羽田機場並列為東京兩大聯外機場。占地1,111公頃,擁有3座客運航廈,客運流量居日本第二位,貨運吞吐量則居日本第一、全球第九。根據日本機場分類法,其劃分為據點機場。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "東京國際機場",
        body => "東京國際機場(日语:東京国際空港/とうきょうこくさいくうこう Tōkyō Kokusai Kūkō */?;IATA代码:HND;ICAO代码:RJTT)是位於日本東京都大田區的機場,因座落於羽田地區而通稱為羽田機場(羽田空港/はねだくうこう Haneda Kūkō),啟用於1931年8月25日,與成田國際機場並列為東京兩大聯外機場。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "关西国际机场",
        body => "關西國際機場(日语:関西国際空港/かんさいこくさいくうこう Kansai kokusai kūkō */?,英語:Kansai International Airport,IATA代码:KIX;ICAO代码:RJBB),常通稱為關西機場、大阪關西機場或關空[註 1],是位於日本大阪府的機場,坐落於大阪湾东南部的泉州近海離岸5公里的人工島上,面積約1,067.7公頃[2],行政區劃橫跨大阪府的泉佐野市(北)、田尻町(中)以及泉南市(南)。"
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "東京";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}

#[cfg(not(feature = "cc-cedict"))]
fn main() -> tantivy::Result<()> {
    Ok(())
}

--------------------------------------------------------------------------------
/examples/ipadic.rs:
--------------------------------------------------------------------------------
#[cfg(feature = "ipadic")]
fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera::dictionary::DictionaryKind;
    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Tokenizer with IPADIC
    let mode = Mode::Normal;
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
    let user_dictionary = None;
    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

    // register Lindera tokenizer
    index.tokenizers().register("lang_ja", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "成田国際空港",
        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "東京国際空港",
        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "関西国際空港",
        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "東京";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}

#[cfg(not(feature = "ipadic"))]
fn main() -> tantivy::Result<()> {
    Ok(())
}

--------------------------------------------------------------------------------
/examples/ko-dic.rs:
--------------------------------------------------------------------------------
#[cfg(feature = "ko-dic")]
fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera::dictionary::DictionaryKind;
    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ko")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ko")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Tokenizer with ko-dic
    let mode = Mode::Normal;
    let dictionary = load_dictionary_from_kind(DictionaryKind::KoDic).unwrap();
    let user_dictionary = None;
    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

    // register Lindera tokenizer
    index.tokenizers().register("lang_ko", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "나리타 국제공항",
        body => "나리타 국제공항(일본어: 成田国際空港, 영어: Narita International Airport, IATA: NRT, ICAO: RJAA)은 일본 지바현 나리타시에 위치한 국제공항으로, 도쿄도 도심에서 동북쪽으로 약 62km 떨어져 있다."
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "도쿄 국제공항",
        body => "도쿄국제공항(일본어: 東京国際空港、とうきょうこくさいくうこう, 영어: Tokyo International Airport)은 일본 도쿄도 오타구에 있는 공항이다. 보통 이 일대의 옛 지명을 본뜬 하네다 공항(일본어: 羽田空港, 영어: Haneda Airport)이라고 불린다."
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "간사이 국제공항",
        body => "간사이 국제공항(일본어: 関西国際空港, IATA: KIX, ICAO: RJBB)은 일본 오사카부 오사카 만에 조성된 인공섬에 위치한 일본의 공항으로, 대한민국의 인천국제공항보다 6년 반 앞선 1994년 9월 4일에 개항했다."
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "도쿄";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}

#[cfg(not(feature = "ko-dic"))]
fn main() -> tantivy::Result<()> {
    Ok(())
}

--------------------------------------------------------------------------------
/examples/lindera.yml:
--------------------------------------------------------------------------------
segmenter:
  mode: "normal"
  dictionary:
    kind: "ipadic"
  # user_dictionary:
  #   path: "./resources/ipadic_simple.csv"
  #   kind: "ipadic"

character_filters:
  - kind: "unicode_normalize"
    args:
      kind: "nfkc"
  - kind: "japanese_iteration_mark"
    args:
      normalize_kanji: true
      normalize_kana: true
  - kind: mapping
    args:
      mapping:
        リンデラ: Lindera

token_filters:
  - kind: "japanese_compound_word"
    args:
      kind: "ipadic"
      tags:
        - "名詞,数"
        - "名詞,接尾,助数詞"
      new_tag: "名詞,数"
  - kind: "japanese_number"
    args:
      tags:
        - "名詞,数"
  - kind: "japanese_stop_tags"
    args:
      tags:
        - "接続詞"
        - "助詞"
        - "助詞,格助詞"
        - "助詞,格助詞,一般"
        - "助詞,格助詞,引用"
        - "助詞,格助詞,連語"
        - "助詞,係助詞"
        - "助詞,副助詞"
        - "助詞,間投助詞"
        - "助詞,並立助詞"
        - "助詞,終助詞"
        - "助詞,副助詞/並立助詞/終助詞"
        - "助詞,連体化"
        - "助詞,副詞化"
        - "助詞,特殊"
        - "助動詞"
        - "記号"
        - "記号,一般"
        - "記号,読点"
        - "記号,句点"
        - "記号,空白"
        - "記号,括弧閉"
        - "その他,間投"
        - "フィラー"
        - "非言語音"
  - kind: "japanese_katakana_stem"
    args:
      min: 3
  - kind: "remove_diacritical_mark"
    args:
      japanese: false
  - kind: "lowercase"
    args: {}

--------------------------------------------------------------------------------
/examples/tokenize_with_config.rs:
--------------------------------------------------------------------------------
use std::path::PathBuf;

fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Build tokenizer with config file
    let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("./examples")
        .join("lindera.yml");
    let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;

    // register Lindera tokenizer
    index.tokenizers().register("lang_ja", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "成田国際空港",
        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "東京国際空港",
        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "関西国際空港",
        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "TOKYO";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    println!("Parsed Query: {:?}", query);
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}

--------------------------------------------------------------------------------
/examples/unidic.rs:
--------------------------------------------------------------------------------
#[cfg(feature = "unidic")]
fn main() -> tantivy::Result<()> {
    use tantivy::{
        collector::TopDocs,
        doc,
        query::QueryParser,
        schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
        Document, Index, TantivyDocument,
    };

    use lindera::dictionary::DictionaryKind;
    use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
    use lindera_tantivy::tokenizer::LinderaTokenizer;

    // create schema builder
    let mut schema_builder = Schema::builder();

    // add id field
    let id = schema_builder.add_text_field(
        "id",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("raw")
                    .set_index_option(IndexRecordOption::Basic),
            )
            .set_stored(),
    );

    // add title field
    let title = schema_builder.add_text_field(
        "title",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // add body field
    let body = schema_builder.add_text_field(
        "body",
        TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("lang_ja")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored(),
    );

    // build schema
    let schema = schema_builder.build();

    // create index in memory
    let index = Index::create_in_ram(schema.clone());

    // Tokenizer with UniDic
    let mode = Mode::Normal;
    let dictionary = load_dictionary_from_kind(DictionaryKind::UniDic).unwrap();
    let user_dictionary = None;
    let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
    let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

    // register Lindera tokenizer
    index.tokenizers().register("lang_ja", tokenizer);

    // create index writer
    let mut index_writer = index.writer(50_000_000)?;

    // add document
    index_writer.add_document(doc!(
        id => "1",
        title => "成田国際空港",
        body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "2",
        title => "東京国際空港",
        body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
    )).unwrap();

    // add document
    index_writer.add_document(doc!(
        id => "3",
        title => "関西国際空港",
        body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
    )).unwrap();

    // commit
    index_writer.commit()?;

    // create reader
    let reader = index.reader()?;

    // create searcher
    let searcher = reader.searcher();

    // create query parser
    let query_parser = QueryParser::for_index(&index, vec![title, body]);

    // parse query
    let query_str = "東京";
    let query = query_parser.parse_query(query_str)?;
    println!("Query String: {}", query_str);

    // search
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("Search Result:");
    for (_, doc_address) in top_docs {
        let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
        println!("{}", retrieved_doc.to_json(&schema));
    }

    Ok(())
}

#[cfg(not(feature = "unidic"))]
fn main() -> tantivy::Result<()> {
    Ok(())
}

--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
pub mod stream;
pub mod tokenizer;
--------------------------------------------------------------------------------
/src/stream.rs:
--------------------------------------------------------------------------------
use tantivy_tokenizer_api::{Token, TokenStream};

use lindera::token::Token as LToken;

pub struct LinderaTokenStream<'a> {
    pub tokens: Vec<LToken<'a>>,
    pub token: &'a mut Token,
}

impl<'a> TokenStream for LinderaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        if self.tokens.is_empty() {
            return false;
        }
        let token = self.tokens.remove(0);
        self.token.text = token.text.to_string();
        self.token.offset_from = token.byte_start;
        self.token.offset_to = token.byte_end;
        self.token.position = token.position;
        self.token.position_length = token.position_length;

        true
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

--------------------------------------------------------------------------------
/src/tokenizer.rs:
--------------------------------------------------------------------------------
use std::path::Path;

use lindera::character_filter::BoxCharacterFilter;
use lindera::token_filter::BoxTokenFilter;
use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder};
use tantivy::Result;
use tantivy::TantivyError;
use tantivy_tokenizer_api::{Token, Tokenizer};

use crate::stream::LinderaTokenStream;

#[derive(Clone)]
pub struct LinderaTokenizer {
    tokenizer: LTokenizer,
    token: Token,
}

impl LinderaTokenizer {
    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
    pub fn new() -> Result<LinderaTokenizer> {
        let builder = TokenizerBuilder::new()
            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
        let tokenizer = builder
            .build()
            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
        Ok(LinderaTokenizer {
            tokenizer,
            token: Default::default(),
        })
    }

    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with settings from the YAML file.
    pub fn from_file(file_path: &Path) -> Result<LinderaTokenizer> {
        let builder = TokenizerBuilder::from_file(file_path)
            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
        let tokenizer = builder
            .build()
            .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
        Ok(LinderaTokenizer {
            tokenizer,
            token: Default::default(),
        })
    }

    /// Create a new `LinderaTokenizer`.
    /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
    pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
        LinderaTokenizer {
            tokenizer: LTokenizer::new(segmenter),
            token: Default::default(),
        }
    }

    /// Append a character filter to the tokenizer.
    pub fn append_character_filter(&mut self, character_filter: BoxCharacterFilter) -> &mut Self {
        self.tokenizer.append_character_filter(character_filter);

        self
    }

    /// Append a token filter to the tokenizer.
    pub fn append_token_filter(&mut self, token_filter: BoxTokenFilter) -> &mut Self {
        self.tokenizer.token_filters.push(token_filter);

        self
    }
}

impl Tokenizer for LinderaTokenizer {
    type TokenStream<'a> = LinderaTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
        self.token.reset();
        LinderaTokenStream {
            tokens: self.tokenizer.tokenize(text).unwrap(),
            token: &mut self.token,
        }
    }
}

#[cfg(test)]
#[cfg(any(
    feature = "ipadic",
    feature = "unidic",
    feature = "ko-dic",
    feature = "cc-cedict"
))]
mod tests {
    use lindera::segmenter::Segmenter;
    use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};

    use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
    use lindera::mode::Mode;

    use super::LinderaTokenizer;

    fn token_stream_helper(text: &str, dictionary_kind: DictionaryKind) -> Vec<Token> {
        let mode = Mode::Normal;
        let dictionary = load_dictionary_from_kind(dictionary_kind).unwrap();
        let user_dictionary = None;
        let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
        let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);

        let mut token_stream = tokenizer.token_stream(text);
        let mut tokens: Vec<Token> = vec![];
        let mut add_token = |token: &Token| {
            tokens.push(token.clone());
        };
        token_stream.process(&mut add_token);

        tokens
    }

    #[cfg(feature = "ipadic")]
    fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
        token_stream_helper(text, DictionaryKind::IPADIC)
    }

    #[cfg(feature = "unidic")]
    fn token_stream_helper_unidic(text: &str) -> Vec<Token> {
        token_stream_helper(text, DictionaryKind::UniDic)
    }

    #[cfg(feature = "ko-dic")]
    fn token_stream_helper_kodic(text: &str) -> Vec<Token> {
        token_stream_helper(text, DictionaryKind::KoDic)
    }

    #[cfg(feature = "cc-cedict")]
    fn token_stream_helper_cccedict(text: &str) -> Vec<Token> {
        token_stream_helper(text, DictionaryKind::CcCedict)
    }

    /// This is a function that can be used in tests and doc tests
    /// to assert a token's correctness.
    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(
            token.position, position,
            "expected position {position} but {token:?}"
        );
        assert_eq!(token.text, text, "expected text {text} but {token:?}");
        assert_eq!(
            token.offset_from, from,
            "expected offset_from {from} but {token:?}"
        );
        assert_eq!(token.offset_to, to, "expected offset_to {to} but {token:?}");
    }

    #[test]
    #[cfg(feature = "ipadic")]
    fn test_tokenize_ipadic() {
        let tokens = token_stream_helper_ipadic("羽田空港限定トートバッグ");
        assert_eq!(tokens.len(), 3);
        assert_token(&tokens[0], 0, "羽田空港", 0, 12);
        assert_token(&tokens[1], 1, "限定", 12, 18);
        assert_token(&tokens[2], 2, "トートバッグ", 18, 36);
    }

    #[test]
    #[cfg(feature = "unidic")]
    fn test_tokenize_unidic() {
        let tokens = token_stream_helper_unidic("羽田空港限定トートバッグ");
        assert_eq!(tokens.len(), 5);
        assert_token(&tokens[0], 0, "羽田", 0, 6);
        assert_token(&tokens[1], 1, "空港", 6, 12);
        assert_token(&tokens[2], 2, "限定", 12, 18);
        assert_token(&tokens[3], 3, "トート", 18, 27);
        assert_token(&tokens[4], 4, "バッグ", 27, 36);
    }

    #[test]
    #[cfg(feature = "ko-dic")]
    fn test_tokenize_kodic() {
        let tokens = token_stream_helper_kodic("하네다공항한정토트백");
        assert_eq!(tokens.len(), 4);
        assert_token(&tokens[0], 0, "하네다", 0, 9);
        assert_token(&tokens[1], 1, "공항", 9, 15);
        assert_token(&tokens[2], 2, "한정", 15, 21);
        assert_token(&tokens[3], 3, "토트백", 21, 30);
    }

    #[test]
    #[cfg(feature = "cc-cedict")]
    fn test_tokenize_cccedict() {
        let tokens = token_stream_helper_cccedict("羽田机场限量版手提包");
        assert_eq!(tokens.len(), 6);
        assert_token(&tokens[0], 0, "羽田", 0, 6);
        assert_token(&tokens[1], 1, "机场", 6, 12);
        assert_token(&tokens[2], 2, "限", 12, 15);
        assert_token(&tokens[3], 3, "量", 15, 18);
        assert_token(&tokens[4], 4, "版", 18, 21);
        assert_token(&tokens[5], 5, "手提包", 21, 30);
    }
}

--------------------------------------------------------------------------------