├── .github
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── periodic.yml
│       ├── regression.yml
│       └── release.yml
├── .gitignore
├── AUTHORS
├── Cargo.toml
├── LICENSE
├── Makefile
├── README.md
├── benches
│   └── bench.rs
├── examples
│   ├── cc-cedict.rs
│   ├── ipadic.rs
│   ├── ko-dic.rs
│   ├── lindera.yml
│   ├── tokenize_with_config.rs
│   └── unidic.rs
└── src
    ├── lib.rs
    ├── stream.rs
    └── tokenizer.rs
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: mosuka
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "cargo" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "daily"
12 |
--------------------------------------------------------------------------------
/.github/workflows/periodic.yml:
--------------------------------------------------------------------------------
1 | name: Periodic
2 |
3 | on:
4 | schedule:
5 | - cron: 0 0 * * SUN
6 |
7 | jobs:
8 | test:
9 | name: Test
10 | strategy:
11 | matrix:
12 | os: [ubuntu-latest, macOS-latest, windows-latest]
13 | toolchain: [stable, beta, nightly]
14 | features: ["ipadic", "ko-dic", "cc-cedict"]
15 | runs-on: ${{ matrix.os }}
16 | steps:
17 | - uses: actions/checkout@v1
18 | - uses: actions-rs/toolchain@v1
19 | with:
20 | profile: minimal
21 | toolchain: ${{ matrix.toolchain }}
22 | override: true
23 | - uses: actions-rs/cargo@v1
24 | with:
25 | command: test
26 | args: --features "${{ matrix.features }}"
27 |
--------------------------------------------------------------------------------
/.github/workflows/regression.yml:
--------------------------------------------------------------------------------
1 | name: Regression
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | types: [opened, synchronize]
9 |
10 | jobs:
11 | check:
12 | name: Check
13 | strategy:
14 | matrix:
15 | os: [ubuntu-latest]
16 | toolchain: [stable]
17 | runs-on: ${{ matrix.os }}
18 | steps:
19 | - uses: actions/checkout@v1
20 | - uses: actions-rs/toolchain@v1
21 | with:
22 | profile: minimal
23 | toolchain: ${{ matrix.toolchain }}
24 | override: true
25 | - uses: actions-rs/cargo@v1
26 | with:
27 | command: check
28 |
29 | test:
30 | name: Test
31 | strategy:
32 | matrix:
33 | os: [ubuntu-latest, macOS-latest, windows-latest]
34 | toolchain: [stable]
35 | features: ["ipadic", "ko-dic", "cc-cedict"]
36 | runs-on: ${{ matrix.os }}
37 | steps:
38 | - uses: actions/checkout@v1
39 | - uses: actions-rs/toolchain@v1
40 | with:
41 | profile: minimal
42 | toolchain: ${{ matrix.toolchain }}
43 | override: true
44 | - uses: actions-rs/cargo@v1
45 | with:
46 | command: test
47 | args: --features "${{ matrix.features }}"
48 |
49 | fmt:
50 | name: Format
51 | strategy:
52 | matrix:
53 | os: [ubuntu-latest]
54 | toolchain: [stable]
55 | runs-on: ${{ matrix.os }}
56 | steps:
57 | - uses: actions/checkout@v1
58 | - uses: actions-rs/toolchain@v1
59 | with:
60 | profile: minimal
61 | toolchain: ${{ matrix.toolchain }}
62 | override: true
63 | - run: rustup component add rustfmt
64 | - uses: actions-rs/cargo@v1
65 | with:
66 | command: fmt
67 | args: --all -- --check
68 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | workflow_dispatch:
5 | push:
6 | tags:
7 | - "v*.*.*"
8 |
9 | jobs:
10 | create-release:
11 | name: Upload artifact
12 | runs-on: ubuntu-latest
13 | steps:
14 | - id: create-release
15 | uses: softprops/action-gh-release@v2
16 | env:
17 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
18 | with:
19 | name: Release ${{ github.ref_name }}
20 | tag_name: ${{ github.ref }}
21 | draft: false
22 | prerelease: false
23 | generate_release_notes: true
24 |
25 | publish-crates:
26 | name: Publish crate
27 | strategy:
28 | matrix:
29 | os: [ubuntu-latest]
30 | toolchain: [stable]
31 | needs: [create-release]
32 | runs-on: ${{ matrix.os }}
33 | steps:
34 | - uses: actions/checkout@v1
35 | - uses: actions-rs/toolchain@v1
36 | with:
37 | profile: minimal
38 | toolchain: ${{ matrix.toolchain }}
39 | override: true
40 | - uses: actions-rs/cargo@v1
41 | with:
42 | command: publish
43 | args: --token ${{ secrets.CRATES_TOKEN }}
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .idea
3 | .DS_Store
4 |
5 | target
6 | perf.data*
7 |
8 | Cargo.lock
9 |
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | # This is the list of authors of lindera-tantivy for copyright purposes.
2 | Minoru Osuka
3 | @ken0x0a
4 | Jun Ohtani
5 | Koichi Akabe
6 | François Massot
7 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "lindera-tantivy"
3 | version = "0.43.1"
4 | edition = "2021"
5 | description = "Lindera Tokenizer for Tantivy."
6 | documentation = "https://docs.rs/lindera-tantivy"
7 | homepage = "https://github.com/lindera/lindera-tantivy"
8 | repository = "https://github.com/lindera/lindera-tantivy"
9 | readme = "README.md"
10 | keywords = ["tokenizer", "tantivy", "lindera"]
11 | categories = ["text-processing"]
12 | license = "MIT"
13 |
14 | [features]
15 | default = [] # No dictionaries included
16 | ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese)
17 | ipadic-neologd = [
18 | "lindera/ipadic-neologd",
19 | ] # Include IPADIC NEologd dictionary (Japanese)
20 | unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese)
21 | ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean)
22 | cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese)
23 | compress = ["lindera/compress"] # Compress dictionaries
24 |
25 | [dependencies]
26 | tantivy-tokenizer-api = "0.5.0"
27 | tantivy = "0.24.1"
28 |
29 | lindera = "0.43.1"
30 |
31 | [dev-dependencies]
32 | criterion = { version = "0.6.0", features = ["html_reports"] }
33 | tantivy = "0.24.0"
34 |
35 | [[bench]]
36 | name = "bench"
37 | harness = false
38 |
39 | [profile.release]
40 | lto = true
41 |
42 | # Make sure that the build scripts and proc-macros are compiled with
43 | # all the optimizations. It speeds up the flate2 crate that we use in our build scripts.
44 | [profile.dev.build-override]
45 | opt-level = 3
46 | [profile.release.build-override]
47 | opt-level = 3
48 | [profile.bench.build-override]
49 | opt-level = 3
50 | [profile.test.build-override]
51 | opt-level = 3
52 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 by the project authors, as listed in the AUTHORS file.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | LINDERA_TANTIVY_VERSION ?= $(shell cargo metadata --no-deps --format-version=1 | jq -r '.packages[] | select(.name=="lindera-tantivy") | .version')
2 |
3 | .DEFAULT_GOAL := help
4 |
5 | clean: ## Clean the project
6 | cargo clean
7 |
8 | format: ## Format the code
9 | cargo fmt
10 |
11 | test: ## Run tests
12 | cargo test
13 |
14 | tag: ## Make a new tag for the current version
15 | git tag v$(LINDERA_TANTIVY_VERSION)
16 | git push origin v$(LINDERA_TANTIVY_VERSION)
17 |
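# The guard below queries crates.io for already-published versions and runs
# `cargo publish` only if the current version is not among them.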
18 | publish: ## Publish the crate to crates.io
19 | ifeq ($(shell curl -s -XGET https://crates.io/api/v1/crates/lindera-tantivy | jq -r '.versions[].num' | grep $(LINDERA_TANTIVY_VERSION)),)
20 | cargo package && cargo publish
21 | endif
22 |
23 | help: ## Show help
24 | @echo "Available targets:"
25 | @grep -E '^[a-zA-Z0-9_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " %-15s %s\n", $$1, $$2}'
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lindera tokenizer for Tantivy
2 |
3 | [](https://opensource.org/licenses/MIT)
4 |
5 | [Lindera](https://github.com/lindera/lindera) Tokenizer for [Tantivy](https://github.com/tantivy-search/tantivy).
6 |
7 | ## Usage
8 |
9 | Make sure you have activated the required dictionary features for Lindera in your Cargo.toml.
10 | The following example enables IPADIC.
11 |
12 | ```toml
13 | [dependencies]
14 | lindera = "0.43"
15 | lindera-tantivy = { version = "0.43.1", features = ["ipadic"] }
16 | ```
17 |
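Each example under `examples/` is gated behind the matching dictionary feature, so you can also run them directly from a checkout of this repository, e.g.:

```shell
# Run examples/ipadic.rs with the IPADIC dictionary feature enabled.
cargo run --example ipadic --features=ipadic
```
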
18 | ### Basic example
19 |
20 | ```rust
21 | fn main() -> tantivy::Result<()> {
22 | use tantivy::{
23 | collector::TopDocs,
24 | doc,
25 | query::QueryParser,
26 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
27 | Document, Index, TantivyDocument,
28 | };
29 |
30 | use lindera::dictionary::DictionaryKind;
31 | use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
32 | use lindera_tantivy::tokenizer::LinderaTokenizer;
33 |
34 | // create schema builder
35 | let mut schema_builder = Schema::builder();
36 |
37 | // add id field
38 | let id = schema_builder.add_text_field(
39 | "id",
40 | TextOptions::default()
41 | .set_indexing_options(
42 | TextFieldIndexing::default()
43 | .set_tokenizer("raw")
44 | .set_index_option(IndexRecordOption::Basic),
45 | )
46 | .set_stored(),
47 | );
48 |
49 | // add title field
50 | let title = schema_builder.add_text_field(
51 | "title",
52 | TextOptions::default()
53 | .set_indexing_options(
54 | TextFieldIndexing::default()
55 | .set_tokenizer("lang_ja")
56 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
57 | )
58 | .set_stored(),
59 | );
60 |
61 | // add body field
62 | let body = schema_builder.add_text_field(
63 | "body",
64 | TextOptions::default()
65 | .set_indexing_options(
66 | TextFieldIndexing::default()
67 | .set_tokenizer("lang_ja")
68 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
69 | )
70 | .set_stored(),
71 | );
72 |
73 | // build schema
74 | let schema = schema_builder.build();
75 |
76 |     // create an index in memory
77 | let index = Index::create_in_ram(schema.clone());
78 |
79 | // Tokenizer with IPADIC
80 | let mode = Mode::Normal;
81 | let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
82 | let user_dictionary = None;
83 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
84 | let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
85 |
86 | // register Lindera tokenizer
87 | index.tokenizers().register("lang_ja", tokenizer);
88 |
89 | // create index writer
90 | let mut index_writer = index.writer(50_000_000)?;
91 |
92 | // add document
93 | index_writer.add_document(doc!(
94 | id => "1",
95 | title => "成田国際空港",
96 | body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
97 | )).unwrap();
98 |
99 | // add document
100 | index_writer.add_document(doc!(
101 | id => "2",
102 | title => "東京国際空港",
103 | body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
104 | )).unwrap();
105 |
106 | // add document
107 | index_writer.add_document(doc!(
108 | id => "3",
109 | title => "関西国際空港",
110 | body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
111 | )).unwrap();
112 |
113 | // commit
114 | index_writer.commit()?;
115 |
116 | // create reader
117 | let reader = index.reader()?;
118 |
119 | // create searcher
120 | let searcher = reader.searcher();
121 |
122 |     // create query parser
123 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
124 |
125 | // parse query
126 | let query_str = "東京";
127 | let query = query_parser.parse_query(query_str)?;
128 | println!("Query String: {}", query_str);
129 |
130 | // search
131 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
132 | println!("Search Result:");
133 | for (_, doc_address) in top_docs {
134 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
135 | println!("{}", retrieved_doc.to_json(&schema));
136 | }
137 |
138 | Ok(())
139 | }
140 | ```
141 |
142 | ### Config by YAML
143 |
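The tokenizer in this example is configured from `examples/lindera.yml`; an excerpt is shown below. Its `lowercase` token filter is why the uppercase query string `TOKYO` used later in this example can still match "Tokyo" in the indexed text.

```yaml
segmenter:
  mode: "normal"
  dictionary:
    kind: "ipadic"

character_filters:
  - kind: "unicode_normalize"
    args:
      kind: "nfkc"

token_filters:
  - kind: "lowercase"
    args: {}
```
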
144 | ```rust
145 | use std::path::PathBuf;
146 |
147 | fn main() -> tantivy::Result<()> {
148 | use tantivy::{
149 | collector::TopDocs,
150 | doc,
151 | query::QueryParser,
152 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
153 | Document, Index, TantivyDocument,
154 | };
155 |
156 | use lindera_tantivy::tokenizer::LinderaTokenizer;
157 |
158 | // create schema builder
159 | let mut schema_builder = Schema::builder();
160 |
161 | // add id field
162 | let id = schema_builder.add_text_field(
163 | "id",
164 | TextOptions::default()
165 | .set_indexing_options(
166 | TextFieldIndexing::default()
167 | .set_tokenizer("raw")
168 | .set_index_option(IndexRecordOption::Basic),
169 | )
170 | .set_stored(),
171 | );
172 |
173 | // add title field
174 | let title = schema_builder.add_text_field(
175 | "title",
176 | TextOptions::default()
177 | .set_indexing_options(
178 | TextFieldIndexing::default()
179 | .set_tokenizer("lang_ja")
180 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
181 | )
182 | .set_stored(),
183 | );
184 |
185 | // add body field
186 | let body = schema_builder.add_text_field(
187 | "body",
188 | TextOptions::default()
189 | .set_indexing_options(
190 | TextFieldIndexing::default()
191 | .set_tokenizer("lang_ja")
192 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
193 | )
194 | .set_stored(),
195 | );
196 |
197 | // build schema
198 | let schema = schema_builder.build();
199 |
200 |     // create an index in memory
201 | let index = Index::create_in_ram(schema.clone());
202 |
203 | // Build tokenizer with config file
204 | let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
205 | .join("./examples")
206 | .join("lindera.yml");
207 | let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;
208 |
209 | // register Lindera tokenizer
210 | index.tokenizers().register("lang_ja", tokenizer);
211 |
212 | // create index writer
213 | let mut index_writer = index.writer(50_000_000)?;
214 |
215 | // add document
216 | index_writer.add_document(doc!(
217 | id => "1",
218 | title => "成田国際空港",
219 | body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
220 | )).unwrap();
221 |
222 | // add document
223 | index_writer.add_document(doc!(
224 | id => "2",
225 | title => "東京国際空港",
226 | body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
227 | )).unwrap();
228 |
229 | // add document
230 | index_writer.add_document(doc!(
231 | id => "3",
232 | title => "関西国際空港",
233 | body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
234 | )).unwrap();
235 |
236 | // commit
237 | index_writer.commit()?;
238 |
239 | // create reader
240 | let reader = index.reader()?;
241 |
242 | // create searcher
243 | let searcher = reader.searcher();
244 |
245 |     // create query parser
246 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
247 |
248 | // parse query
249 | let query_str = "TOKYO";
250 | let query = query_parser.parse_query(query_str)?;
251 | println!("Query String: {}", query_str);
252 |
253 | // search
254 | println!("Parsed Query: {:?}", query);
255 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
256 | println!("Search Result:");
257 | for (_, doc_address) in top_docs {
258 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
259 | println!("{}", retrieved_doc.to_json(&schema));
260 | }
261 |
262 | Ok(())
263 | }
264 | ```
265 |
266 | ## API reference
267 |
268 | The API reference is available. Please see the following URL:
269 |
270 | - [lindera-tantivy](https://docs.rs/lindera-tantivy)
271 |
--------------------------------------------------------------------------------
/benches/bench.rs:
--------------------------------------------------------------------------------
1 | use criterion::Criterion;
2 | use criterion::{criterion_group, criterion_main};
3 |
4 | #[cfg(feature = "ipadic")]
5 | fn bench_indexing(c: &mut Criterion) {
6 | use lindera::dictionary::load_dictionary_from_kind;
7 | use lindera::segmenter::Segmenter;
8 | use tantivy::doc;
9 | use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
10 | use tantivy::Index;
11 |
12 | use lindera::dictionary::DictionaryKind;
13 | use lindera::mode::Mode;
14 | use lindera_tantivy::tokenizer::LinderaTokenizer;
15 |
16 | // create schema builder
17 | let mut schema_builder = Schema::builder();
18 |
19 | // add id field
20 | let id = schema_builder.add_text_field(
21 | "id",
22 | TextOptions::default()
23 | .set_indexing_options(
24 | TextFieldIndexing::default()
25 | .set_tokenizer("raw")
26 | .set_index_option(IndexRecordOption::Basic),
27 | )
28 | .set_stored(),
29 | );
30 |
31 | // add text field
32 | let text = schema_builder.add_text_field(
33 | "text",
34 | TextOptions::default()
35 | .set_indexing_options(
36 | TextFieldIndexing::default()
37 | .set_tokenizer("lang_ja")
38 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
39 | )
40 | .set_stored(),
41 | );
42 |
43 | // build schema
44 | let schema = schema_builder.build();
45 |
46 |     // create an index in memory
47 | let index = Index::create_in_ram(schema.clone());
48 |
49 | // Test document set.
50 | let mut docs = Vec::new();
51 | for i in 0..1000 {
52 | let doc = doc!(
53 | id => format!("doc-{}", i),
54 | text => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である[1]。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
55 | );
56 | docs.push(doc);
57 | }
58 |
59 | let mode = Mode::Normal;
60 | let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
61 | let user_dictionary = None;
62 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
63 | let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
64 |
65 | // register Lindera tokenizer
66 | index.tokenizers().register("lang_ja", tokenizer);
67 |
68 | // create index writer
69 | let mut index_writer = index.writer(50_000_000).unwrap();
70 |
71 | // Using benchmark_group for changing sample_size
72 | let mut group = c.benchmark_group("indexing");
73 | group.sample_size(100);
74 | group.bench_function("bench-indexing", |b| {
75 | b.iter(|| {
76 | for doc in docs.iter() {
77 | index_writer.add_document(doc.clone()).unwrap();
78 | }
79 | });
80 |
81 | // commit
82 | index_writer.commit().unwrap();
83 | });
84 | group.finish();
85 | }
86 |
87 | #[cfg(not(feature = "ipadic"))]
88 | fn bench_indexing(_c: &mut Criterion) {}
89 |
90 | criterion_group!(benches, bench_indexing,);
91 | criterion_main!(benches);
92 |
--------------------------------------------------------------------------------
/examples/cc-cedict.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "cc-cedict")]
2 | fn main() -> tantivy::Result<()> {
3 | use tantivy::{
4 | collector::TopDocs,
5 | doc,
6 | query::QueryParser,
7 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
8 | Document, Index, TantivyDocument,
9 | };
10 |
11 | use lindera::dictionary::DictionaryKind;
12 | use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
13 | use lindera_tantivy::tokenizer::LinderaTokenizer;
14 |
15 | // create schema builder
16 | let mut schema_builder = Schema::builder();
17 |
18 | // add id field
19 | let id = schema_builder.add_text_field(
20 | "id",
21 | TextOptions::default()
22 | .set_indexing_options(
23 | TextFieldIndexing::default()
24 | .set_tokenizer("raw")
25 | .set_index_option(IndexRecordOption::Basic),
26 | )
27 | .set_stored(),
28 | );
29 |
30 | // add title field
31 | let title = schema_builder.add_text_field(
32 | "title",
33 | TextOptions::default()
34 | .set_indexing_options(
35 | TextFieldIndexing::default()
36 | .set_tokenizer("lang_zh")
37 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
38 | )
39 | .set_stored(),
40 | );
41 |
42 | // add body field
43 | let body = schema_builder.add_text_field(
44 | "body",
45 | TextOptions::default()
46 | .set_indexing_options(
47 | TextFieldIndexing::default()
48 | .set_tokenizer("lang_zh")
49 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
50 | )
51 | .set_stored(),
52 | );
53 |
54 | // build schema
55 | let schema = schema_builder.build();
56 |
57 |     // create an index in memory
58 | let index = Index::create_in_ram(schema.clone());
59 |
60 | // Tokenizer with CC-CEDICT
61 | let mode = Mode::Normal;
62 | let dictionary = load_dictionary_from_kind(DictionaryKind::CcCedict).unwrap();
63 | let user_dictionary = None;
64 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
65 | let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
66 |
67 | // register Lindera tokenizer
68 | index.tokenizers().register("lang_zh", tokenizer);
69 |
70 | // create index writer
71 | let mut index_writer = index.writer(50_000_000)?;
72 |
73 | // add document
74 | index_writer.add_document(doc!(
75 | id => "1",
76 | title => "成田国际机场",
77 | body => "成田國際機場(日语:成田国際空港/なりたこくさいくうこう Narita Kokusai Kūkō */?;IATA代码:NRT;ICAO代码:RJAA),通稱成田機場(成田空港),原名新東京國際機場(新東京国際空港/しんとうきょうこくさいくうこう Shin-Tōkyō Kokusai Kūkō),是位於日本千葉縣成田市的國際機場,與羽田機場並列為東京兩大聯外機場。占地1,111公頃,擁有3座客運航廈,客運流量居日本第二位,貨運吞吐量則居日本第一、全球第九。根據日本機場分類法,其劃分為據點機場。"
78 | )).unwrap();
79 |
80 | // add document
81 | index_writer.add_document(doc!(
82 | id => "2",
83 | title => "東京國際機場",
84 | body => "東京國際機場(日语:東京国際空港/とうきょうこくさいくうこう Tōkyō Kokusai Kūkō */?;IATA代码:HND;ICAO代码:RJTT)是位於日本東京都大田區的機場,因座落於羽田地區而通稱為羽田機場(羽田空港/はねだくうこう Haneda Kūkō),啟用於1931年8月25日,與成田國際機場並列為東京兩大聯外機場。"
85 | )).unwrap();
86 |
87 | // add document
88 | index_writer.add_document(doc!(
89 | id => "3",
90 | title => "关西国际机场",
91 | body => "關西國際機場(日语:関西国際空港/かんさいこくさいくうこう Kansai kokusai kūkō */?,英語:Kansai International Airport,IATA代码:KIX;ICAO代码:RJBB),常通稱為關西機場、大阪關西機場或關空[註 1],是位於日本大阪府的機場,坐落於大阪湾东南部的泉州近海離岸5公里的人工島上,面積約1,067.7公頃[2],行政區劃橫跨大阪府的泉佐野市(北)、田尻町(中)以及泉南市(南)。"
92 | )).unwrap();
93 |
94 | // commit
95 | index_writer.commit()?;
96 |
97 | // create reader
98 | let reader = index.reader()?;
99 |
100 | // create searcher
101 | let searcher = reader.searcher();
102 |
103 |     // create query parser
104 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
105 |
106 | // parse query
107 | let query_str = "東京";
108 | let query = query_parser.parse_query(query_str)?;
109 | println!("Query String: {}", query_str);
110 |
111 | // search
112 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
113 | println!("Search Result:");
114 | for (_, doc_address) in top_docs {
115 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
116 | println!("{}", retrieved_doc.to_json(&schema));
117 | }
118 |
119 | Ok(())
120 | }
121 |
122 | #[cfg(not(feature = "cc-cedict"))]
123 | fn main() -> tantivy::Result<()> {
124 | Ok(())
125 | }
126 |
--------------------------------------------------------------------------------
/examples/ipadic.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "ipadic")]
2 | fn main() -> tantivy::Result<()> {
3 | use tantivy::{
4 | collector::TopDocs,
5 | doc,
6 | query::QueryParser,
7 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
8 | Document, Index, TantivyDocument,
9 | };
10 |
11 | use lindera::dictionary::DictionaryKind;
12 | use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
13 | use lindera_tantivy::tokenizer::LinderaTokenizer;
14 |
15 | // create schema builder
16 | let mut schema_builder = Schema::builder();
17 |
18 | // add id field
19 | let id = schema_builder.add_text_field(
20 | "id",
21 | TextOptions::default()
22 | .set_indexing_options(
23 | TextFieldIndexing::default()
24 | .set_tokenizer("raw")
25 | .set_index_option(IndexRecordOption::Basic),
26 | )
27 | .set_stored(),
28 | );
29 |
30 | // add title field
31 | let title = schema_builder.add_text_field(
32 | "title",
33 | TextOptions::default()
34 | .set_indexing_options(
35 | TextFieldIndexing::default()
36 | .set_tokenizer("lang_ja")
37 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
38 | )
39 | .set_stored(),
40 | );
41 |
42 | // add body field
43 | let body = schema_builder.add_text_field(
44 | "body",
45 | TextOptions::default()
46 | .set_indexing_options(
47 | TextFieldIndexing::default()
48 | .set_tokenizer("lang_ja")
49 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
50 | )
51 | .set_stored(),
52 | );
53 |
54 | // build schema
55 | let schema = schema_builder.build();
56 |
57 |     // create an index in memory
58 | let index = Index::create_in_ram(schema.clone());
59 |
60 | // Tokenizer with IPADIC
61 | let mode = Mode::Normal;
62 | let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
63 | let user_dictionary = None;
64 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
65 | let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
66 |
67 | // register Lindera tokenizer
68 | index.tokenizers().register("lang_ja", tokenizer);
69 |
70 | // create index writer
71 | let mut index_writer = index.writer(50_000_000)?;
72 |
73 | // add document
74 | index_writer.add_document(doc!(
75 | id => "1",
76 | title => "成田国際空港",
77 | body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
78 | )).unwrap();
79 |
80 | // add document
81 | index_writer.add_document(doc!(
82 | id => "2",
83 | title => "東京国際空港",
84 | body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
85 | )).unwrap();
86 |
87 | // add document
88 | index_writer.add_document(doc!(
89 | id => "3",
90 | title => "関西国際空港",
91 | body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
92 | )).unwrap();
93 |
94 | // commit
95 | index_writer.commit()?;
96 |
97 | // create reader
98 | let reader = index.reader()?;
99 |
100 | // create searcher
101 | let searcher = reader.searcher();
102 |
103 |     // create query parser
104 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
105 |
106 | // parse query
107 | let query_str = "東京";
108 | let query = query_parser.parse_query(query_str)?;
109 | println!("Query String: {}", query_str);
110 |
111 | // search
112 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
113 | println!("Search Result:");
114 | for (_, doc_address) in top_docs {
115 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
116 | println!("{}", retrieved_doc.to_json(&schema));
117 | }
118 |
119 | Ok(())
120 | }
121 |
122 | #[cfg(not(feature = "ipadic"))]
123 | fn main() -> tantivy::Result<()> {
124 | Ok(())
125 | }
126 |
--------------------------------------------------------------------------------
/examples/ko-dic.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "ko-dic")]
2 | fn main() -> tantivy::Result<()> {
3 | use tantivy::{
4 | collector::TopDocs,
5 | doc,
6 | query::QueryParser,
7 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
8 | Document, Index, TantivyDocument,
9 | };
10 |
11 | use lindera::dictionary::DictionaryKind;
12 | use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
13 | use lindera_tantivy::tokenizer::LinderaTokenizer;
14 |
15 | // create schema builder
16 | let mut schema_builder = Schema::builder();
17 |
18 | // add id field
19 | let id = schema_builder.add_text_field(
20 | "id",
21 | TextOptions::default()
22 | .set_indexing_options(
23 | TextFieldIndexing::default()
24 | .set_tokenizer("raw")
25 | .set_index_option(IndexRecordOption::Basic),
26 | )
27 | .set_stored(),
28 | );
29 |
30 | // add title field
31 | let title = schema_builder.add_text_field(
32 | "title",
33 | TextOptions::default()
34 | .set_indexing_options(
35 | TextFieldIndexing::default()
36 | .set_tokenizer("lang_ko")
37 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
38 | )
39 | .set_stored(),
40 | );
41 |
42 | // add body field
43 | let body = schema_builder.add_text_field(
44 | "body",
45 | TextOptions::default()
46 | .set_indexing_options(
47 | TextFieldIndexing::default()
48 | .set_tokenizer("lang_ko")
49 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
50 | )
51 | .set_stored(),
52 | );
53 |
54 | // build schema
55 | let schema = schema_builder.build();
56 |
57 |     // create an index in memory
58 | let index = Index::create_in_ram(schema.clone());
59 |
60 | // Tokenizer with ko-dic
61 | let mode = Mode::Normal;
62 | let dictionary = load_dictionary_from_kind(DictionaryKind::KoDic).unwrap();
63 | let user_dictionary = None;
64 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
65 | let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
66 |
67 | // register Lindera tokenizer
68 | index.tokenizers().register("lang_ko", tokenizer);
69 |
70 | // create index writer
71 | let mut index_writer = index.writer(50_000_000)?;
72 |
73 | // add document
74 | index_writer.add_document(doc!(
75 | id => "1",
76 | title => "나리타 국제공항",
77 | body => "나리타 국제공항(일본어: 成田国際空港, 영어: Narita International Airport, IATA: NRT, ICAO: RJAA)은 일본 지바현 나리타시에 위치한 국제공항으로, 도쿄도 도심에서 동북쪽으로 약 62km 떨어져 있다."
78 | )).unwrap();
79 |
80 | // add document
81 | index_writer.add_document(doc!(
82 | id => "2",
83 | title => "도쿄 국제공항",
84 | body => "도쿄국제공항(일본어: 東京国際空港、とうきょうこくさいくうこう, 영어: Tokyo International Airport)은 일본 도쿄도 오타구에 있는 공항이다. 보통 이 일대의 옛 지명을 본뜬 하네다 공항(일본어: 羽田空港, 영어: Haneda Airport)이라고 불린다."
85 | )).unwrap();
86 |
87 | // add document
88 | index_writer.add_document(doc!(
89 | id => "3",
90 | title => "간사이 국제공항",
91 | body => "간사이 국제공항(일본어: 関西国際空港, IATA: KIX, ICAO: RJBB)은 일본 오사카부 오사카 만에 조성된 인공섬에 위치한 일본의 공항으로, 대한민국의 인천국제공항보다 6년 반 앞선 1994년 9월 4일에 개항했다."
92 | )).unwrap();
93 |
94 | // commit
95 | index_writer.commit()?;
96 |
97 | // create reader
98 | let reader = index.reader()?;
99 |
100 | // create searcher
101 | let searcher = reader.searcher();
102 |
103 |     // create query parser
104 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
105 |
106 | // parse query
107 | let query_str = "도쿄";
108 | let query = query_parser.parse_query(query_str)?;
109 | println!("Query String: {}", query_str);
110 |
111 | // search
112 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
113 | println!("Search Result:");
114 | for (_, doc_address) in top_docs {
115 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
116 | println!("{}", retrieved_doc.to_json(&schema));
117 | }
118 |
119 | Ok(())
120 | }
121 |
122 | #[cfg(not(feature = "ko-dic"))]
123 | fn main() -> tantivy::Result<()> {
124 | Ok(())
125 | }
126 |
--------------------------------------------------------------------------------
/examples/lindera.yml:
--------------------------------------------------------------------------------
1 | segmenter:
2 | mode: "normal"
3 | dictionary:
4 | kind: "ipadic"
5 | # user_dictionary:
6 | # path: "./resources/ipadic_simple.csv"
7 | # kind: "ipadic"
8 |
9 | character_filters:
10 | - kind: "unicode_normalize"
11 | args:
12 | kind: "nfkc"
13 | - kind: "japanese_iteration_mark"
14 | args:
15 | normalize_kanji: true
16 | normalize_kana: true
17 | - kind: mapping
18 | args:
19 | mapping:
20 | リンデラ: Lindera
21 |
22 | token_filters:
23 | - kind: "japanese_compound_word"
24 | args:
25 | kind: "ipadic"
26 | tags:
27 | - "名詞,数"
28 | - "名詞,接尾,助数詞"
29 | new_tag: "名詞,数"
30 | - kind: "japanese_number"
31 | args:
32 | tags:
33 | - "名詞,数"
34 | - kind: "japanese_stop_tags"
35 | args:
36 | tags:
37 | - "接続詞"
38 | - "助詞"
39 | - "助詞,格助詞"
40 | - "助詞,格助詞,一般"
41 | - "助詞,格助詞,引用"
42 | - "助詞,格助詞,連語"
43 | - "助詞,係助詞"
44 | - "助詞,副助詞"
45 | - "助詞,間投助詞"
46 | - "助詞,並立助詞"
47 | - "助詞,終助詞"
48 | - "助詞,副助詞/並立助詞/終助詞"
49 | - "助詞,連体化"
50 | - "助詞,副詞化"
51 | - "助詞,特殊"
52 | - "助動詞"
53 | - "記号"
54 | - "記号,一般"
55 | - "記号,読点"
56 | - "記号,句点"
57 | - "記号,空白"
58 | - "記号,括弧閉"
59 | - "その他,間投"
60 | - "フィラー"
61 | - "非言語音"
62 | - kind: "japanese_katakana_stem"
63 | args:
64 | min: 3
65 | - kind: "remove_diacritical_mark"
66 | args:
67 | japanese: false
68 | - kind: "lowercase"
69 | args: {}
70 |
--------------------------------------------------------------------------------
/examples/tokenize_with_config.rs:
--------------------------------------------------------------------------------
1 | use std::path::PathBuf;
2 |
3 | fn main() -> tantivy::Result<()> {
4 | use tantivy::{
5 | collector::TopDocs,
6 | doc,
7 | query::QueryParser,
8 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
9 | Document, Index, TantivyDocument,
10 | };
11 |
12 | use lindera_tantivy::tokenizer::LinderaTokenizer;
13 |
14 | // create schema builder
15 | let mut schema_builder = Schema::builder();
16 |
17 | // add id field
18 | let id = schema_builder.add_text_field(
19 | "id",
20 | TextOptions::default()
21 | .set_indexing_options(
22 | TextFieldIndexing::default()
23 | .set_tokenizer("raw")
24 | .set_index_option(IndexRecordOption::Basic),
25 | )
26 | .set_stored(),
27 | );
28 |
29 | // add title field
30 | let title = schema_builder.add_text_field(
31 | "title",
32 | TextOptions::default()
33 | .set_indexing_options(
34 | TextFieldIndexing::default()
35 | .set_tokenizer("lang_ja")
36 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
37 | )
38 | .set_stored(),
39 | );
40 |
41 | // add body field
42 | let body = schema_builder.add_text_field(
43 | "body",
44 | TextOptions::default()
45 | .set_indexing_options(
46 | TextFieldIndexing::default()
47 | .set_tokenizer("lang_ja")
48 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
49 | )
50 | .set_stored(),
51 | );
52 |
53 | // build schema
54 | let schema = schema_builder.build();
55 |
56 |     // create an index in memory
57 | let index = Index::create_in_ram(schema.clone());
58 |
59 | // Build tokenizer with config file
60 | let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
61 | .join("./examples")
62 | .join("lindera.yml");
63 | let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;
64 |
65 | // register Lindera tokenizer
66 | index.tokenizers().register("lang_ja", tokenizer);
67 |
68 | // create index writer
69 | let mut index_writer = index.writer(50_000_000)?;
70 |
71 | // add document
72 | index_writer.add_document(doc!(
73 | id => "1",
74 | title => "成田国際空港",
75 | body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
76 | )).unwrap();
77 |
78 | // add document
79 | index_writer.add_document(doc!(
80 | id => "2",
81 | title => "東京国際空港",
82 | body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
83 | )).unwrap();
84 |
85 | // add document
86 | index_writer.add_document(doc!(
87 | id => "3",
88 | title => "関西国際空港",
89 | body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
90 | )).unwrap();
91 |
92 | // commit
93 | index_writer.commit()?;
94 |
95 | // create reader
96 | let reader = index.reader()?;
97 |
98 | // create searcher
99 | let searcher = reader.searcher();
100 |
101 |     // create query parser
102 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
103 |
104 | // parse query
105 | let query_str = "TOKYO";
106 | let query = query_parser.parse_query(query_str)?;
107 | println!("Query String: {}", query_str);
108 |
109 | // search
110 | println!("Parsed Query: {:?}", query);
111 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
112 | println!("Search Result:");
113 | for (_, doc_address) in top_docs {
114 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
115 | println!("{}", retrieved_doc.to_json(&schema));
116 | }
117 |
118 | Ok(())
119 | }
120 |
--------------------------------------------------------------------------------
/examples/unidic.rs:
--------------------------------------------------------------------------------
1 | #[cfg(feature = "unidic")]
2 | fn main() -> tantivy::Result<()> {
3 | use tantivy::{
4 | collector::TopDocs,
5 | doc,
6 | query::QueryParser,
7 | schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
8 | Document, Index, TantivyDocument,
9 | };
10 |
11 | use lindera::dictionary::DictionaryKind;
12 | use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
13 | use lindera_tantivy::tokenizer::LinderaTokenizer;
14 |
15 | // create schema builder
16 | let mut schema_builder = Schema::builder();
17 |
18 | // add id field
19 | let id = schema_builder.add_text_field(
20 | "id",
21 | TextOptions::default()
22 | .set_indexing_options(
23 | TextFieldIndexing::default()
24 | .set_tokenizer("raw")
25 | .set_index_option(IndexRecordOption::Basic),
26 | )
27 | .set_stored(),
28 | );
29 |
30 | // add title field
31 | let title = schema_builder.add_text_field(
32 | "title",
33 | TextOptions::default()
34 | .set_indexing_options(
35 | TextFieldIndexing::default()
36 | .set_tokenizer("lang_ja")
37 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
38 | )
39 | .set_stored(),
40 | );
41 |
42 | // add body field
43 | let body = schema_builder.add_text_field(
44 | "body",
45 | TextOptions::default()
46 | .set_indexing_options(
47 | TextFieldIndexing::default()
48 | .set_tokenizer("lang_ja")
49 | .set_index_option(IndexRecordOption::WithFreqsAndPositions),
50 | )
51 | .set_stored(),
52 | );
53 |
54 | // build schema
55 | let schema = schema_builder.build();
56 |
57 |     // create an index in memory
58 | let index = Index::create_in_ram(schema.clone());
59 |
60 | // Tokenizer with UniDic
61 | let mode = Mode::Normal;
62 | let dictionary = load_dictionary_from_kind(DictionaryKind::UniDic).unwrap();
63 | let user_dictionary = None;
64 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
65 | let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
66 |
67 | // register Lindera tokenizer
68 | index.tokenizers().register("lang_ja", tokenizer);
69 |
70 | // create index writer
71 | let mut index_writer = index.writer(50_000_000)?;
72 |
73 | // add document
74 | index_writer.add_document(doc!(
75 | id => "1",
76 | title => "成田国際空港",
77 | body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
78 | )).unwrap();
79 |
80 | // add document
81 | index_writer.add_document(doc!(
82 | id => "2",
83 | title => "東京国際空港",
84 | body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
85 | )).unwrap();
86 |
87 | // add document
88 | index_writer.add_document(doc!(
89 | id => "3",
90 | title => "関西国際空港",
91 | body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
92 | )).unwrap();
93 |
94 | // commit
95 | index_writer.commit()?;
96 |
97 | // create reader
98 | let reader = index.reader()?;
99 |
100 | // create searcher
101 | let searcher = reader.searcher();
102 |
103 |     // create query parser
104 | let query_parser = QueryParser::for_index(&index, vec![title, body]);
105 |
106 | // parse query
107 | let query_str = "東京";
108 | let query = query_parser.parse_query(query_str)?;
109 | println!("Query String: {}", query_str);
110 |
111 | // search
112 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
113 | println!("Search Result:");
114 | for (_, doc_address) in top_docs {
115 | let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
116 | println!("{}", retrieved_doc.to_json(&schema));
117 | }
118 |
119 | Ok(())
120 | }
121 |
122 | #[cfg(not(feature = "unidic"))]
123 | fn main() -> tantivy::Result<()> {
124 | Ok(())
125 | }
126 |
--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod stream;
2 | pub mod tokenizer;
3 |
--------------------------------------------------------------------------------
/src/stream.rs:
--------------------------------------------------------------------------------
1 | use tantivy_tokenizer_api::{Token, TokenStream};
2 |
3 | use lindera::token::Token as LToken;
4 |
5 | pub struct LinderaTokenStream<'a> {
6 |     pub tokens: Vec<LToken<'a>>,
7 | pub token: &'a mut Token,
8 | }
9 |
10 | impl<'a> TokenStream for LinderaTokenStream<'a> {
11 | fn advance(&mut self) -> bool {
12 | if self.tokens.is_empty() {
13 | return false;
14 | }
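        // Pop the next Lindera token from the front of the buffer and copy its
        // fields into the single reusable tantivy `Token`.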
15 | let token = self.tokens.remove(0);
16 | self.token.text = token.text.to_string();
17 | self.token.offset_from = token.byte_start;
18 | self.token.offset_to = token.byte_end;
19 | self.token.position = token.position;
20 | self.token.position_length = token.position_length;
21 |
22 | true
23 | }
24 |
25 | fn token(&self) -> &Token {
26 | self.token
27 | }
28 |
29 | fn token_mut(&mut self) -> &mut Token {
30 | self.token
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/tokenizer.rs:
--------------------------------------------------------------------------------
1 | use std::path::Path;
2 |
3 | use lindera::character_filter::BoxCharacterFilter;
4 | use lindera::token_filter::BoxTokenFilter;
5 | use lindera::tokenizer::{Tokenizer as LTokenizer, TokenizerBuilder};
6 | use tantivy::Result;
7 | use tantivy::TantivyError;
8 | use tantivy_tokenizer_api::{Token, Tokenizer};
9 |
10 | use crate::stream::LinderaTokenStream;
11 |
12 | #[derive(Clone)]
13 | pub struct LinderaTokenizer {
14 | tokenizer: LTokenizer,
15 | token: Token,
16 | }
17 |
18 | impl LinderaTokenizer {
19 | /// Create a new `LinderaTokenizer`.
20 | /// This function will create a new `LinderaTokenizer` with settings from the YAML file specified in the `LINDERA_CONFIG_PATH` environment variable.
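    ///
    /// A minimal usage sketch (assumes `LINDERA_CONFIG_PATH` points to a valid
    /// Lindera YAML config, e.g. `examples/lindera.yml`):
    ///
    /// ```no_run
    /// std::env::set_var("LINDERA_CONFIG_PATH", "./examples/lindera.yml");
    /// let tokenizer = lindera_tantivy::tokenizer::LinderaTokenizer::new().unwrap();
    /// ```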
21 |     pub fn new() -> Result<Self> {
22 | let builder = TokenizerBuilder::new()
23 | .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
24 | let tokenizer = builder
25 | .build()
26 | .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
27 | Ok(LinderaTokenizer {
28 | tokenizer,
29 | token: Default::default(),
30 | })
31 | }
32 |
33 | /// Create a new `LinderaTokenizer`.
34 | /// This function will create a new `LinderaTokenizer` with settings from the YAML file.
35 |     pub fn from_file(file_path: &Path) -> Result<Self> {
36 | let builder = TokenizerBuilder::from_file(file_path)
37 | .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
38 | let tokenizer = builder
39 | .build()
40 | .map_err(|e| TantivyError::InvalidArgument(format!("{:?}", e)))?;
41 | Ok(LinderaTokenizer {
42 | tokenizer,
43 | token: Default::default(),
44 | })
45 | }
46 |
47 | /// Create a new `LinderaTokenizer`.
48 | /// This function will create a new `LinderaTokenizer` with the specified `lindera::segmenter::Segmenter`.
49 | pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> LinderaTokenizer {
50 | LinderaTokenizer {
51 | tokenizer: LTokenizer::new(segmenter),
52 | token: Default::default(),
53 | }
54 | }
55 |
56 | /// Append a character filter to the tokenizer.
57 | pub fn append_character_filter(&mut self, character_filter: BoxCharacterFilter) -> &mut Self {
58 | self.tokenizer.append_character_filter(character_filter);
59 |
60 | self
61 | }
62 |
63 | /// Append a token filter to the tokenizer.
64 | pub fn append_token_filter(&mut self, token_filter: BoxTokenFilter) -> &mut Self {
65 | self.tokenizer.token_filters.push(token_filter);
66 |
67 | self
68 | }
69 | }
70 |
71 | impl Tokenizer for LinderaTokenizer {
72 | type TokenStream<'a> = LinderaTokenStream<'a>;
73 |
74 | fn token_stream<'a>(&'a mut self, text: &'a str) -> LinderaTokenStream<'a> {
75 | self.token.reset();
76 | LinderaTokenStream {
77 | tokens: self.tokenizer.tokenize(text).unwrap(),
78 | token: &mut self.token,
79 | }
80 | }
81 | }
82 |
83 | #[cfg(test)]
84 | #[cfg(any(
85 | feature = "ipadic",
86 | feature = "unidic",
87 | feature = "ko-dic",
88 | feature = "cc-cedict"
89 | ))]
90 | mod tests {
91 | use lindera::segmenter::Segmenter;
92 | use tantivy_tokenizer_api::{Token, TokenStream, Tokenizer};
93 |
94 | use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
95 | use lindera::mode::Mode;
96 |
97 | use super::LinderaTokenizer;
98 |
99 |     fn token_stream_helper(text: &str, dictionary_kind: DictionaryKind) -> Vec<Token> {
100 | let mode = Mode::Normal;
101 | let dictionary = load_dictionary_from_kind(dictionary_kind).unwrap();
102 | let user_dictionary = None;
103 | let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
104 | let mut tokenizer = LinderaTokenizer::from_segmenter(segmenter);
105 |
106 | let mut token_stream = tokenizer.token_stream(text);
107 |         let mut tokens: Vec<Token> = vec![];
108 | let mut add_token = |token: &Token| {
109 | tokens.push(token.clone());
110 | };
111 | token_stream.process(&mut add_token);
112 |
113 | tokens
114 | }
115 |
116 | #[cfg(feature = "ipadic")]
117 |     fn token_stream_helper_ipadic(text: &str) -> Vec<Token> {
118 | token_stream_helper(text, DictionaryKind::IPADIC)
119 | }
120 |
121 | #[cfg(feature = "unidic")]
122 |     fn token_stream_helper_unidic(text: &str) -> Vec<Token> {
123 | token_stream_helper(text, DictionaryKind::UniDic)
124 | }
125 |
126 | #[cfg(feature = "ko-dic")]
127 |     fn token_stream_helper_kodic(text: &str) -> Vec<Token> {
128 | token_stream_helper(text, DictionaryKind::KoDic)
129 | }
130 |
131 | #[cfg(feature = "cc-cedict")]
132 |     fn token_stream_helper_cccedict(text: &str) -> Vec<Token> {
133 | token_stream_helper(text, DictionaryKind::CcCedict)
134 | }
135 |
136 | /// This is a function that can be used in tests and doc tests
137 | /// to assert a token's correctness.
138 | pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
139 | assert_eq!(
140 | token.position, position,
141 | "expected position {position} but {token:?}"
142 | );
143 | assert_eq!(token.text, text, "expected text {text} but {token:?}");
144 | assert_eq!(
145 | token.offset_from, from,
146 | "expected offset_from {from} but {token:?}"
147 | );
148 | assert_eq!(token.offset_to, to, "expected offset_to {to} but {token:?}");
149 | }
150 |
151 | #[test]
152 | #[cfg(feature = "ipadic")]
153 | fn test_tokenize_ipadic() {
154 | let tokens = token_stream_helper_ipadic("羽田空港限定トートバッグ");
155 | assert_eq!(tokens.len(), 3);
156 | assert_token(&tokens[0], 0, "羽田空港", 0, 12);
157 | assert_token(&tokens[1], 1, "限定", 12, 18);
158 | assert_token(&tokens[2], 2, "トートバッグ", 18, 36);
159 | }
160 |
161 | #[test]
162 | #[cfg(feature = "unidic")]
163 | fn test_tokenize_unidic() {
164 | let tokens = token_stream_helper_unidic("羽田空港限定トートバッグ");
165 | assert_eq!(tokens.len(), 5);
166 | assert_token(&tokens[0], 0, "羽田", 0, 6);
167 | assert_token(&tokens[1], 1, "空港", 6, 12);
168 | assert_token(&tokens[2], 2, "限定", 12, 18);
169 | assert_token(&tokens[3], 3, "トート", 18, 27);
170 | assert_token(&tokens[4], 4, "バッグ", 27, 36);
171 | }
172 |
173 | #[test]
174 | #[cfg(feature = "ko-dic")]
175 | fn test_tokenize_kodic() {
176 | let tokens = token_stream_helper_kodic("하네다공항한정토트백");
177 | assert_eq!(tokens.len(), 4);
178 | assert_token(&tokens[0], 0, "하네다", 0, 9);
179 | assert_token(&tokens[1], 1, "공항", 9, 15);
180 | assert_token(&tokens[2], 2, "한정", 15, 21);
181 | assert_token(&tokens[3], 3, "토트백", 21, 30);
182 | }
183 |
184 | #[test]
185 | #[cfg(feature = "cc-cedict")]
186 | fn test_tokenize_cccedict() {
187 | let tokens = token_stream_helper_cccedict("羽田机场限量版手提包");
188 | assert_eq!(tokens.len(), 6);
189 | assert_token(&tokens[0], 0, "羽田", 0, 6);
190 | assert_token(&tokens[1], 1, "机场", 6, 12);
191 | assert_token(&tokens[2], 2, "限", 12, 15);
192 | assert_token(&tokens[3], 3, "量", 15, 18);
193 | assert_token(&tokens[4], 4, "版", 18, 21);
194 | assert_token(&tokens[5], 5, "手提包", 21, 30);
195 | }
196 | }
197 |
--------------------------------------------------------------------------------