├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── actions.md │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── dependabot.yml └── workflows │ ├── coverage.yml │ ├── long_running.yml │ └── test.yml ├── .gitignore ├── ARCHITECTURE.md ├── AUTHORS ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── appveyor.yml ├── benches ├── alice.txt ├── analyzer.rs ├── hdfs.json └── index-bench.rs ├── bitpacker ├── Cargo.toml ├── benches │ └── bench.rs └── src │ ├── bitpacker.rs │ ├── blocked_bitpacker.rs │ └── lib.rs ├── ci ├── before_deploy.ps1 ├── before_deploy.sh ├── install.sh └── script.sh ├── common ├── Cargo.toml └── src │ ├── bitset.rs │ ├── lib.rs │ ├── serialize.rs │ ├── vint.rs │ └── writer.rs ├── doc ├── .gitignore ├── book.toml └── src │ ├── SUMMARY.md │ ├── avant-propos.md │ ├── basis.md │ ├── best_practise.md.rs │ ├── examples.md │ ├── facetting.md │ ├── faq.md │ ├── index_sorting.md │ ├── innerworkings.md │ ├── inverted_index.md │ ├── json.md │ └── schema.md ├── examples ├── aggregation.rs ├── basic_search.rs ├── custom_collector.rs ├── custom_tokenizer.rs ├── deleting_updating_documents.rs ├── faceted_search.rs ├── faceted_search_with_tweaked_score.rs ├── integer_range_search.rs ├── iterating_docs_and_positions.rs ├── json_field.rs ├── multiple_producer.rs ├── pre_tokenized_text.rs ├── snippet.rs ├── stop_words.rs ├── warmer.rs └── working_with_json.rs ├── fastfield_codecs ├── Cargo.toml ├── README.md ├── benches │ └── bench.rs └── src │ ├── bitpacked.rs │ ├── lib.rs │ ├── linearinterpol.rs │ ├── main.rs │ └── multilinearinterpol.rs ├── ownedbytes ├── Cargo.toml └── src │ └── lib.rs ├── query-grammar ├── Cargo.toml ├── README.md └── src │ ├── lib.rs │ ├── occur.rs │ ├── query_grammar.rs │ └── user_input_ast.rs ├── run-tests.sh ├── rustfmt.toml ├── src ├── aggregation │ ├── README.md │ ├── agg_req.rs │ ├── agg_req_with_accessor.rs │ ├── agg_result.rs │ ├── bucket │ │ ├── histogram │ │ │ ├── histogram.rs │ │ │ └── mod.rs │ │ ├── mod.rs │ │ └── range.rs │ ├── collector.rs │ ├── intermediate_agg_result.rs │ ├── metric │ │ ├── average.rs │ │ ├── mod.rs │ │ └── stats.rs │ ├── mod.rs │ └── segment_agg_result.rs ├── collector │ ├── count_collector.rs │ ├── custom_score_top_collector.rs │ ├── docset_collector.rs │ ├── facet_collector.rs │ ├── filter_collector_wrapper.rs │ ├── histogram_collector.rs │ ├── mod.rs │ ├── multi_collector.rs │ ├── tests.rs │ ├── top_collector.rs │ ├── top_score_collector.rs │ └── tweak_score_top_collector.rs ├── core │ ├── executor.rs │ ├── index.rs │ ├── index_meta.rs │ ├── inverted_index_reader.rs │ ├── mod.rs │ ├── searcher.rs │ ├── segment.rs │ ├── segment_component.rs │ ├── segment_id.rs │ └── segment_reader.rs ├── directory │ ├── composite_file.rs │ ├── directory.rs │ ├── directory_lock.rs │ ├── error.rs │ ├── file_slice.rs │ ├── file_watcher.rs │ ├── footer.rs │ ├── managed_directory.rs │ ├── mmap_directory.rs │ ├── mod.rs │ ├── ram_directory.rs │ ├── tests.rs │ └── watch_event_router.rs ├── docset.rs ├── error.rs ├── fastfield │ ├── alive_bitset.rs │ ├── bytes │ │ ├── mod.rs │ │ ├── reader.rs │ │ └── writer.rs │ ├── error.rs │ ├── facet_reader.rs │ ├── mod.rs │ ├── multivalued │ │ ├── mod.rs │ │ ├── reader.rs │ │ └── writer.rs │ ├── reader.rs │ ├── readers.rs │ ├── serializer │ │ └── mod.rs │ └── writer.rs ├── fieldnorm │ ├── code.rs │ ├── mod.rs │ ├── reader.rs │ ├── serializer.rs │ └── writer.rs ├── functional_test.rs ├── future_result.rs ├── indexer │ ├── delete_queue.rs │ ├── demuxer.rs │ ├── 
doc_id_mapping.rs │ ├── doc_opstamp_mapping.rs │ ├── index_writer.rs │ ├── index_writer_status.rs │ ├── json_term_writer.rs │ ├── log_merge_policy.rs │ ├── merge_operation.rs │ ├── merge_policy.rs │ ├── merger.rs │ ├── merger_sorted_index_test.rs │ ├── mod.rs │ ├── operation.rs │ ├── prepared_commit.rs │ ├── segment_entry.rs │ ├── segment_manager.rs │ ├── segment_register.rs │ ├── segment_serializer.rs │ ├── segment_updater.rs │ ├── segment_writer.rs │ └── stamper.rs ├── lib.rs ├── macros.rs ├── positions │ ├── mod.rs │ ├── reader.rs │ └── serializer.rs ├── postings │ ├── block_search.rs │ ├── block_segment_postings.rs │ ├── compression │ │ ├── mod.rs │ │ └── vint.rs │ ├── indexing_context.rs │ ├── json_postings_writer.rs │ ├── mod.rs │ ├── per_field_postings_writer.rs │ ├── postings.rs │ ├── postings_writer.rs │ ├── recorder.rs │ ├── segment_postings.rs │ ├── serializer.rs │ ├── skip.rs │ ├── stacker │ │ ├── expull.rs │ │ ├── memory_arena.rs │ │ ├── mod.rs │ │ └── term_hashmap.rs │ └── term_info.rs ├── query │ ├── all_query.rs │ ├── automaton_weight.rs │ ├── bitset │ │ └── mod.rs │ ├── bm25.rs │ ├── boolean_query │ │ ├── block_wand.rs │ │ ├── boolean_query.rs │ │ ├── boolean_weight.rs │ │ └── mod.rs │ ├── boost_query.rs │ ├── empty_query.rs │ ├── exclude.rs │ ├── explanation.rs │ ├── fuzzy_query.rs │ ├── intersection.rs │ ├── mod.rs │ ├── more_like_this │ │ ├── mod.rs │ │ ├── more_like_this.rs │ │ └── query.rs │ ├── phrase_query │ │ ├── mod.rs │ │ ├── phrase_query.rs │ │ ├── phrase_scorer.rs │ │ └── phrase_weight.rs │ ├── query.rs │ ├── query_parser │ │ ├── logical_ast.rs │ │ ├── mod.rs │ │ └── query_parser.rs │ ├── range_query.rs │ ├── regex_query.rs │ ├── reqopt_scorer.rs │ ├── score_combiner.rs │ ├── scorer.rs │ ├── term_query │ │ ├── mod.rs │ │ ├── term_query.rs │ │ ├── term_scorer.rs │ │ └── term_weight.rs │ ├── union.rs │ ├── vec_docset.rs │ └── weight.rs ├── reader │ ├── mod.rs │ ├── pool.rs │ └── warming.rs ├── schema │ ├── bytes_options.rs │ ├── document.rs │ ├── facet.rs │ ├── facet_options.rs │ ├── field.rs │ ├── field_entry.rs │ ├── field_type.rs │ ├── field_value.rs │ ├── flags.rs │ ├── index_record_option.rs │ ├── json_object_options.rs │ ├── mod.rs │ ├── named_field_document.rs │ ├── numeric_options.rs │ ├── schema.rs │ ├── term.rs │ ├── text_options.rs │ └── value.rs ├── snippet │ └── mod.rs ├── space_usage │ └── mod.rs ├── store │ ├── compression_brotli.rs │ ├── compression_lz4_block.rs │ ├── compression_snap.rs │ ├── compressors.rs │ ├── footer.rs │ ├── index │ │ ├── block.rs │ │ ├── mod.rs │ │ ├── skip_index.rs │ │ └── skip_index_builder.rs │ ├── mod.rs │ ├── reader.rs │ └── writer.rs ├── termdict │ ├── fst_termdict │ │ ├── merger.rs │ │ ├── mod.rs │ │ ├── streamer.rs │ │ ├── term_info_store.rs │ │ └── termdict.rs │ ├── mod.rs │ ├── sstable_termdict │ │ ├── merger.rs │ │ ├── mod.rs │ │ ├── sstable │ │ │ ├── block_reader.rs │ │ │ ├── delta.rs │ │ │ ├── merge │ │ │ │ ├── heap_merge.rs │ │ │ │ └── mod.rs │ │ │ ├── mod.rs │ │ │ ├── sstable_index.rs │ │ │ ├── value.rs │ │ │ └── vint.rs │ │ ├── streamer.rs │ │ └── termdict.rs │ └── tests.rs └── tokenizer │ ├── alphanum_only.rs │ ├── ascii_folding_filter.rs │ ├── empty_tokenizer.rs │ ├── facet_tokenizer.rs │ ├── lower_caser.rs │ ├── mod.rs │ ├── ngram_tokenizer.rs │ ├── raw_tokenizer.rs │ ├── remove_long.rs │ ├── simple_tokenizer.rs │ ├── stemmer.rs │ ├── stop_word_filter.rs │ ├── tokenized_string.rs │ ├── tokenizer.rs │ ├── tokenizer_manager.rs │ └── whitespace_tokenizer.rs └── tests ├── failpoints └── mod.rs └── mod.rs 
/.gitattributes: -------------------------------------------------------------------------------- 1 | cpp/* linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: fulmicoton 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/actions.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Actions 3 | about: Actions not directly related to producing code. 4 | 5 | --- 6 | 7 | # Actions title 8 | 9 | Action description. 10 | e.g. 11 | - benchmark 12 | - investigate and report 13 | - etc. 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | - What did you do? 9 | - What happened? 10 | - What was expected? 11 | 12 | **Which version of tantivy are you using?** 13 | If "master", ideally give the specific sha1 revision. 14 | 15 | **To Reproduce** 16 | 17 | If your bug is deterministic, can you give a minimal reproducing code? 18 | Some bugs are not deterministic. Can you describe with precision in which context it happened? 19 | If this is possible, can you share your code? 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **[Optional] describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask any question about tantivy's usage... 4 | 5 | --- 6 | 7 | Try to be specific about your use case... 
8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "20:00" 8 | open-pull-requests-limit: 10 9 | 10 | - package-ecosystem: "github-actions" 11 | directory: "/" 12 | schedule: 13 | interval: daily 14 | time: "20:00" 15 | open-pull-requests-limit: 10 16 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | coverage: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Install Rust 15 | run: rustup toolchain install nightly --component llvm-tools-preview 16 | - name: Install cargo-llvm-cov 17 | run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin 18 | - name: Generate code coverage 19 | run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info 20 | - name: Upload coverage to Codecov 21 | uses: codecov/codecov-action@v2 22 | with: 23 | token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos 24 | files: lcov.info 25 | fail_ci_if_error: true 26 | -------------------------------------------------------------------------------- /.github/workflows/long_running.yml: -------------------------------------------------------------------------------- 1 | name: Long running tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | env: 8 | CARGO_TERM_COLOR: always 9 | NUM_FUNCTIONAL_TEST_ITERATIONS: 20000 10 | 11 | jobs: 12 | functional_test_unsorted: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Run indexing_unsorted 17 | run: cargo test indexing_unsorted -- --ignored 18 | functional_test_sorted: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Run indexing_sorted 23 | run: cargo test indexing_sorted -- --ignored 24 | 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Unit tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | test: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose --workspace 21 | - name: Install latest nightly to test also against unstable feature flag 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | toolchain: nightly 25 | override: true 26 | components: rustfmt 27 | 28 | - name: Install latest nightly to test also against unstable feature flag 29 | uses: actions-rs/toolchain@v1 30 | with: 31 | toolchain: stable 32 | override: true 33 | components: rustfmt, clippy 34 | 35 | - name: Run tests 36 | run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace 37 | 38 | - name: Run tests quickwit feature 39 | run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace 40 | 41 | - name: Check Formatting 42 
| run: cargo +nightly fmt --all -- --check 43 | 44 | - uses: actions-rs/clippy-check@v1 45 | with: 46 | toolchain: stable 47 | token: ${{ secrets.GITHUB_TOKEN }} 48 | args: --tests 49 | 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tantivy.iml 2 | .cargo 3 | proptest-regressions 4 | *.swp 5 | target 6 | target/debug 7 | .vscode 8 | target/release 9 | Cargo.lock 10 | benchmark 11 | .DS_Store 12 | cpp/simdcomp/bitpackingbenchmark 13 | *.bk 14 | .idea 15 | trace.dat 16 | cargo-timing* 17 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of authors of tantivy for copyright purposes. 2 | Paul Masurel 3 | Laurentiu Nicola 4 | Dru Sellers 5 | Ashley Mannix 6 | Michael J. Curry 7 | Jason Wolfe 8 | # As an employee of Google I am required to add Google LLC 9 | # in the list of authors, but this project is not affiliated to Google 10 | # in any other way. 11 | Google LLC 12 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy" 3 | version = "0.17.0" 4 | authors = ["Paul Masurel "] 5 | license = "MIT" 6 | categories = ["database-implementations", "data-structures"] 7 | description = """Search engine library""" 8 | documentation = "https://docs.rs/tantivy/" 9 | homepage = "https://github.com/quickwit-oss/tantivy" 10 | repository = "https://github.com/quickwit-oss/tantivy" 11 | readme = "README.md" 12 | keywords = ["search", "information", "retrieval"] 13 | edition = "2018" 14 | 15 | [dependencies] 16 | oneshot = "0.1" 17 | base64 = "0.13" 18 | byteorder = "1.4.3" 19 | crc32fast = "1.2.1" 20 | once_cell = "1.7.2" 21 | regex ={ version = "1.5.4", default-features = false, features = ["std"] } 22 | tantivy-fst = "0.3" 23 | memmap2 = {version = "0.5", optional=true} 24 | lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true } 25 | brotli = { version = "3.3", optional = true } 26 | snap = { version = "1.0.5", optional = true } 27 | tempfile = { version = "3.2", optional = true } 28 | log = "0.4.14" 29 | serde = { version = "1.0.126", features = ["derive"] } 30 | serde_json = "1.0.64" 31 | num_cpus = "1.13" 32 | fs2={ version = "0.4.3", optional = true } 33 | levenshtein_automata = "0.2" 34 | uuid = { version = "0.8.2", features = ["v4", "serde"] } 35 | crossbeam = "0.8.1" 36 | tantivy-query-grammar = { version="0.15.0", path="./query-grammar" } 37 | tantivy-bitpacker = { version="0.1", path="./bitpacker" } 38 | common = { version = "0.2", path = "./common/", package = "tantivy-common" } 39 | fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false } 40 | ownedbytes = { version="0.2", path="./ownedbytes" } 41 | stable_deref_trait = "1.2" 42 | rust-stemmers = "1.2" 43 | downcast-rs = "1.2" 44 | bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] } 45 | census = "0.4" 46 | fnv = "1.0.7" 47 | thiserror = "1.0.24" 48 | htmlescape = "0.3.1" 49 | fail = "0.5" 50 | murmurhash32 = "0.2" 51 | chrono = "0.4.19" 52 | smallvec = "1.6.1" 53 | rayon = "1.5" 54 | lru = "0.7.0" 55 | fastdivide = "0.4" 56 | itertools = "0.10.0" 57 | measure_time = "0.8.0" 58 | 
pretty_assertions = "1.1.0" 59 | serde_cbor = {version="0.11", optional=true} 60 | async-trait = "0.1" 61 | 62 | [target.'cfg(windows)'.dependencies] 63 | winapi = "0.3.9" 64 | 65 | [dev-dependencies] 66 | rand = "0.8.3" 67 | maplit = "1.0.2" 68 | matches = "0.1.8" 69 | proptest = "1.0" 70 | criterion = "0.3.5" 71 | test-log = "0.2.8" 72 | env_logger = "0.9.0" 73 | pprof = {version= "0.7", features=["flamegraph", "criterion"]} 74 | futures = "0.3.15" 75 | 76 | [dev-dependencies.fail] 77 | version = "0.5" 78 | features = ["failpoints"] 79 | 80 | [profile.release] 81 | opt-level = 3 82 | debug = false 83 | debug-assertions = false 84 | 85 | [profile.test] 86 | debug-assertions = true 87 | overflow-checks = true 88 | 89 | [features] 90 | default = ["mmap", "lz4-compression" ] 91 | mmap = ["fs2", "tempfile", "memmap2"] 92 | 93 | brotli-compression = ["brotli"] 94 | lz4-compression = ["lz4_flex"] 95 | snappy-compression = ["snap"] 96 | 97 | failpoints = ["fail/failpoints"] 98 | unstable = [] # useful for benches. 99 | 100 | quickwit = ["serde_cbor"] 101 | 102 | [workspace] 103 | members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"] 104 | 105 | # Following the "fail" crate best practises, we isolate 106 | # tests that define specific behavior in fail check points 107 | # in a different binary. 108 | # 109 | # We do that because, fail rely on a global definition of 110 | # failpoints behavior and hence, it is incompatible with 111 | # multithreading. 112 | [[test]] 113 | name = "failpoints" 114 | path = "tests/failpoints/mod.rs" 115 | required-features = ["fail/failpoints"] 116 | 117 | [[bench]] 118 | name = "analyzer" 119 | harness = false 120 | 121 | [[bench]] 122 | name = "index-bench" 123 | harness = false 124 | 125 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 by the project authors, as listed in the AUTHORS file. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | echo "Run test only... No examples." 
3 | cargo test --tests --lib 4 | 5 | fmt: 6 | cargo +nightly fmt --all 7 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # Appveyor configuration template for Rust using rustup for Rust installation 2 | # https://github.com/starkat99/appveyor-rust 3 | 4 | os: Visual Studio 2015 5 | environment: 6 | matrix: 7 | - channel: stable 8 | target: x86_64-pc-windows-msvc 9 | 10 | install: 11 | - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe 12 | - rustup-init -yv --default-toolchain %channel% --default-host %target% 13 | - set PATH=%PATH%;%USERPROFILE%\.cargo\bin 14 | - if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin 15 | - rustc -vV 16 | - cargo -vV 17 | 18 | build: false 19 | 20 | test_script: 21 | - REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap 22 | - REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap 23 | - REM SET RUST_BACKTRACE=1 & cargo build --examples 24 | -------------------------------------------------------------------------------- /benches/analyzer.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use tantivy::tokenizer::TokenizerManager; 3 | 4 | const ALICE_TXT: &str = include_str!("alice.txt"); 5 | 6 | pub fn criterion_benchmark(c: &mut Criterion) { 7 | let tokenizer_manager = TokenizerManager::default(); 8 | let tokenizer = tokenizer_manager.get("default").unwrap(); 9 | c.bench_function("default-tokenize-alice", |b| { 10 | b.iter(|| { 11 | let mut word_count = 0; 12 | let mut token_stream = tokenizer.token_stream(ALICE_TXT); 13 | while token_stream.advance() { 14 | word_count += 1; 15 | } 16 | assert_eq!(word_count, 30_731); 17 | }) 18 | }); 19 | } 20 | 21 | criterion_group!(benches, criterion_benchmark); 22 | criterion_main!(benches); 23 | -------------------------------------------------------------------------------- /bitpacker/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy-bitpacker" 3 | version = "0.1.1" 4 | edition = "2018" 5 | authors = ["Paul Masurel "] 6 | license = "MIT" 7 | categories = [] 8 | description = """Tantivy-sub crate: bitpacking""" 9 | repository = "https://github.com/quickwit-oss/tantivy" 10 | keywords = [] 11 | 12 | 13 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 14 | 15 | [dependencies] 16 | -------------------------------------------------------------------------------- /bitpacker/benches/bench.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | #[cfg(test)] 6 | mod tests { 7 | use tantivy_bitpacker::BlockedBitpacker; 8 | use test::Bencher; 9 | 10 | #[bench] 11 | fn bench_blockedbitp_read(b: &mut Bencher) { 12 | let mut blocked_bitpacker = BlockedBitpacker::new(); 13 | for val in 0..=21500 { 14 | blocked_bitpacker.add(val * val); 15 | } 16 | b.iter(|| { 17 | let mut out = 0; 18 | for val in 0..=21500 { 19 | out = blocked_bitpacker.get(val); 20 | } 21 | out 22 | }); 23 | } 24 | 25 | #[bench] 26 | fn bench_blockedbitp_create(b: &mut Bencher) { 27 | 
b.iter(|| { 28 | let mut blocked_bitpacker = BlockedBitpacker::new(); 29 | for val in 0..=21500 { 30 | blocked_bitpacker.add(val * val); 31 | } 32 | blocked_bitpacker 33 | }); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /bitpacker/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod bitpacker; 2 | mod blocked_bitpacker; 3 | 4 | pub use crate::bitpacker::{BitPacker, BitUnpacker}; 5 | pub use crate::blocked_bitpacker::BlockedBitpacker; 6 | 7 | /// Computes the number of bits that will be used for bitpacking. 8 | /// 9 | /// In general the target is the minimum number of bits 10 | /// required to express the amplitude given in argument. 11 | /// 12 | /// e.g. If the amplitude is 10, we can store all ints on simply 4bits. 13 | /// 14 | /// The logic is slightly more convoluted here as for optimization 15 | /// reasons, we want to ensure that a value spawns over at most 8 bytes 16 | /// of aligned bytes. 17 | /// 18 | /// Spanning over 9 bytes is possible for instance, if we do 19 | /// bitpacking with an amplitude of 63 bits. 20 | /// In this case, the second int will start on bit 21 | /// 63 (which belongs to byte 7) and ends at byte 15; 22 | /// Hence 9 bytes (from byte 7 to byte 15 included). 23 | /// 24 | /// To avoid this, we force the number of bits to 64bits 25 | /// when the result is greater than `64-8 = 56 bits`. 26 | /// 27 | /// Note that this only affects rare use cases spawning over 28 | /// a very large range of values. Even in this case, it results 29 | /// in an extra cost of at most 12% compared to the optimal 30 | /// number of bits. 31 | pub fn compute_num_bits(n: u64) -> u8 { 32 | let amplitude = (64u32 - n.leading_zeros()) as u8; 33 | if amplitude <= 64 - 8 { 34 | amplitude 35 | } else { 36 | 64 37 | } 38 | } 39 | 40 | pub fn minmax(mut vals: I) -> Option<(T, T)> 41 | where 42 | I: Iterator, 43 | T: Copy + Ord, 44 | { 45 | if let Some(first_el) = vals.next() { 46 | return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| { 47 | (min_val.min(el), max_val.max(el)) 48 | })); 49 | } 50 | None 51 | } 52 | 53 | #[test] 54 | fn test_compute_num_bits() { 55 | assert_eq!(compute_num_bits(1), 1u8); 56 | assert_eq!(compute_num_bits(0), 0u8); 57 | assert_eq!(compute_num_bits(2), 2u8); 58 | assert_eq!(compute_num_bits(3), 2u8); 59 | assert_eq!(compute_num_bits(4), 3u8); 60 | assert_eq!(compute_num_bits(255), 8u8); 61 | assert_eq!(compute_num_bits(256), 9u8); 62 | assert_eq!(compute_num_bits(5_000_000_000), 33u8); 63 | } 64 | 65 | #[test] 66 | fn test_minmax_empty() { 67 | let vals: Vec = vec![]; 68 | assert_eq!(minmax(vals.into_iter()), None); 69 | } 70 | 71 | #[test] 72 | fn test_minmax_one() { 73 | assert_eq!(minmax(vec![1].into_iter()), Some((1, 1))); 74 | } 75 | 76 | #[test] 77 | fn test_minmax_two() { 78 | assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2))); 79 | assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2))); 80 | } 81 | -------------------------------------------------------------------------------- /ci/before_deploy.ps1: -------------------------------------------------------------------------------- 1 | # This script takes care of packaging the build artifacts that will go in the 2 | # release zipfile 3 | 4 | $SRC_DIR = $PWD.Path 5 | $STAGE = [System.Guid]::NewGuid().ToString() 6 | 7 | Set-Location $ENV:Temp 8 | New-Item -Type Directory -Name $STAGE 9 | Set-Location $STAGE 10 | 11 | $ZIP = 
"$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip" 12 | 13 | # TODO Update this to package the right artifacts 14 | Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\' 15 | 16 | 7z a "$ZIP" * 17 | 18 | Push-AppveyorArtifact "$ZIP" 19 | 20 | Remove-Item *.* -Force 21 | Set-Location .. 22 | Remove-Item $STAGE 23 | Set-Location $SRC_DIR 24 | -------------------------------------------------------------------------------- /ci/before_deploy.sh: -------------------------------------------------------------------------------- 1 | # This script takes care of building your crate and packaging it for release 2 | 3 | set -ex 4 | 5 | main() { 6 | local src=$(pwd) \ 7 | stage= 8 | 9 | case $TRAVIS_OS_NAME in 10 | linux) 11 | stage=$(mktemp -d) 12 | ;; 13 | osx) 14 | stage=$(mktemp -d -t tmp) 15 | ;; 16 | esac 17 | 18 | test -f Cargo.lock || cargo generate-lockfile 19 | 20 | # TODO Update this to build the artifacts that matter to you 21 | cross rustc --bin hello --target $TARGET --release -- -C lto 22 | 23 | # TODO Update this to package the right artifacts 24 | cp target/$TARGET/release/hello $stage/ 25 | 26 | cd $stage 27 | tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz * 28 | cd $src 29 | 30 | rm -rf $stage 31 | } 32 | 33 | main 34 | -------------------------------------------------------------------------------- /ci/install.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | main() { 4 | local target= 5 | if [ $TRAVIS_OS_NAME = linux ]; then 6 | target=x86_64-unknown-linux-musl 7 | sort=sort 8 | else 9 | target=x86_64-apple-darwin 10 | sort=gsort # for `sort --sort-version`, from brew's coreutils. 11 | fi 12 | 13 | # Builds for iOS are done on OSX, but require the specific target to be 14 | # installed. 15 | case $TARGET in 16 | aarch64-apple-ios) 17 | rustup target install aarch64-apple-ios 18 | ;; 19 | armv7-apple-ios) 20 | rustup target install armv7-apple-ios 21 | ;; 22 | armv7s-apple-ios) 23 | rustup target install armv7s-apple-ios 24 | ;; 25 | i386-apple-ios) 26 | rustup target install i386-apple-ios 27 | ;; 28 | x86_64-apple-ios) 29 | rustup target install x86_64-apple-ios 30 | ;; 31 | esac 32 | 33 | # This fetches latest stable release 34 | local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \ 35 | | cut -d/ -f3 \ 36 | | grep -E '^v[0.1.0-9.]+$' \ 37 | | $sort --version-sort \ 38 | | tail -n1) 39 | curl -LSfs https://japaric.github.io/trust/install.sh | \ 40 | sh -s -- \ 41 | --force \ 42 | --git japaric/cross \ 43 | --tag $tag \ 44 | --target $target 45 | } 46 | 47 | main 48 | -------------------------------------------------------------------------------- /ci/script.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script takes care of testing your crate 4 | 5 | set -ex 6 | 7 | main() { 8 | if [ ! -z $CODECOV ]; then 9 | echo "Codecov" 10 | cargo build --verbose && cargo coverage --verbose --all && bash <(curl -s https://codecov.io/bash) -s target/kcov 11 | else 12 | echo "Build" 13 | cross build --target $TARGET 14 | if [ ! 
-z $DISABLE_TESTS ]; then 15 | return 16 | fi 17 | echo "Test" 18 | cross test --target $TARGET --no-default-features --features mmap 19 | cross test --target $TARGET --no-default-features --features mmap query-grammar 20 | fi 21 | for example in $(ls examples/*.rs) 22 | do 23 | cargo run --example $(basename $example .rs) 24 | done 25 | } 26 | 27 | # we don't run the "test phase" when doing deploys 28 | if [ -z $TRAVIS_TAG ]; then 29 | main 30 | fi 31 | -------------------------------------------------------------------------------- /common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy-common" 3 | version = "0.2.0" 4 | authors = ["Paul Masurel ", "Pascal Seitz "] 5 | license = "MIT" 6 | edition = "2018" 7 | description = "common traits and utility functions used by multiple tantivy subcrates" 8 | 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | byteorder = "1.4.3" 13 | ownedbytes = { version="0.2", path="../ownedbytes" } 14 | 15 | [dev-dependencies] 16 | proptest = "1.0.0" 17 | rand = "0.8.4" 18 | -------------------------------------------------------------------------------- /common/src/writer.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufWriter, Write}; 2 | 3 | pub struct CountingWriter { 4 | underlying: W, 5 | written_bytes: u64, 6 | } 7 | 8 | impl CountingWriter { 9 | pub fn wrap(underlying: W) -> CountingWriter { 10 | CountingWriter { 11 | underlying, 12 | written_bytes: 0, 13 | } 14 | } 15 | 16 | #[inline] 17 | pub fn written_bytes(&self) -> u64 { 18 | self.written_bytes 19 | } 20 | 21 | /// Returns the underlying write object. 22 | /// Note that this method does not trigger any flushing. 23 | #[inline] 24 | pub fn finish(self) -> W { 25 | self.underlying 26 | } 27 | } 28 | 29 | impl Write for CountingWriter { 30 | #[inline] 31 | fn write(&mut self, buf: &[u8]) -> io::Result { 32 | let written_size = self.underlying.write(buf)?; 33 | self.written_bytes += written_size as u64; 34 | Ok(written_size) 35 | } 36 | 37 | #[inline] 38 | fn write_all(&mut self, buf: &[u8]) -> io::Result<()> { 39 | self.underlying.write_all(buf)?; 40 | self.written_bytes += buf.len() as u64; 41 | Ok(()) 42 | } 43 | 44 | #[inline] 45 | fn flush(&mut self) -> io::Result<()> { 46 | self.underlying.flush() 47 | } 48 | } 49 | 50 | impl TerminatingWrite for CountingWriter { 51 | #[inline] 52 | fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> { 53 | self.underlying.terminate_ref(token) 54 | } 55 | } 56 | 57 | /// Struct used to prevent from calling 58 | /// [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly 59 | /// 60 | /// The point is that while the type is public, it cannot be built by anyone 61 | /// outside of this module. 62 | pub struct AntiCallToken(()); 63 | 64 | /// Trait used to indicate when no more write need to be done on a writer 65 | pub trait TerminatingWrite: Write { 66 | /// Indicate that the writer will no longer be used. Internally call terminate_ref. 67 | fn terminate(mut self) -> io::Result<()> 68 | where Self: Sized { 69 | self.terminate_ref(AntiCallToken(())) 70 | } 71 | 72 | /// You should implement this function to define custom behavior. 73 | /// This function should flush any buffer it may hold. 
74 | fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>; 75 | } 76 | 77 | impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> { 78 | fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> { 79 | self.as_mut().terminate_ref(token) 80 | } 81 | } 82 | 83 | impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> { 84 | fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> { 85 | self.flush()?; 86 | self.get_mut().terminate_ref(a) 87 | } 88 | } 89 | 90 | impl<'a> TerminatingWrite for &'a mut Vec<u8> { 91 | fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> { 92 | self.flush() 93 | } 94 | } 95 | 96 | #[cfg(test)] 97 | mod test { 98 | 99 | use std::io::Write; 100 | 101 | use super::CountingWriter; 102 | 103 | #[test] 104 | fn test_counting_writer() { 105 | let buffer: Vec<u8> = vec![]; 106 | let mut counting_writer = CountingWriter::wrap(buffer); 107 | let bytes = (0u8..10u8).collect::<Vec<u8>>(); 108 | counting_writer.write_all(&bytes).unwrap(); 109 | let len = counting_writer.written_bytes(); 110 | let buffer_restituted: Vec<u8> = counting_writer.finish(); 111 | assert_eq!(len, 10u64); 112 | assert_eq!(buffer_restituted.len(), 10); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /doc/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Paul Masurel"] 3 | multilingual = false 4 | src = "src" 5 | title = "Tantivy, the user guide" 6 | -------------------------------------------------------------------------------- /doc/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | 4 | 5 | [Avant Propos](./avant-propos.md) 6 | 7 | - [Segments](./basis.md) 8 | - [Defining your schema](./schema.md) 9 | - [Facetting](./facetting.md) 10 | - [Index Sorting](./index_sorting.md) 11 | - [Innerworkings](./innerworkings.md) 12 | - [Inverted index](./inverted_index.md) 13 | - [Best practise](./inverted_index.md) 14 | 15 | [Frequently Asked Questions](./faq.md) 16 | [Examples](./examples.md) 17 | -------------------------------------------------------------------------------- /doc/src/avant-propos.md: -------------------------------------------------------------------------------- 1 | # Foreword, what is the scope of tantivy? 2 | 3 | > Tantivy is a **search** engine **library** for Rust. 4 | 5 | If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. tantivy is heavily inspired by Lucene's design and 6 | they both have the same scope and targeted use cases. 7 | 8 | If you are not familiar with Lucene, let's break down our little tagline. 9 | 10 | - **Search** here means full-text search: fundamentally, tantivy is here to help you 11 | efficiently identify the documents matching a given query in your corpus. 12 | But modern search UIs are so much more: text processing, facetting, autocomplete, fuzzy search, good 13 | relevancy, collapsing, highlighting, spatial search. 14 | 15 | While some of these features are not available in tantivy yet, all of these are relevant 16 | feature requests. Tantivy's objective is to offer a solid toolbox to create the best search 17 | experience. But keep in mind this is just a toolbox. 18 | Which brings us to the second keyword...
19 | 20 | - **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like Elasticsearch, for instance. 21 | 22 | Sometimes a feature will not be available in tantivy because it is too 23 | specific to your use case. By design, tantivy should make it possible to extend 24 | the available set of features using the existing rock-solid data structures. 25 | 26 | Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own 27 | `TokenFilter`... Some of your requirements may also be related to 28 | something closer to architecture or operations. For instance, you may 29 | want to build a large corpus on Hadoop, fine-tune the merge policy to keep your 30 | index sharded in a time-wise fashion, or you may want to convert an existing 31 | index from a different format. 32 | 33 | Tantivy exposes a lot of low-level APIs to do all of these things. 34 | 35 | -------------------------------------------------------------------------------- /doc/src/best_practise.md.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skiff-org/tantivy/46d5de920dd1ac86fa7a74baa0debd933bcb6574/doc/src/best_practise.md.rs -------------------------------------------------------------------------------- /doc/src/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | - [Basic search](/examples/basic_search.html) -------------------------------------------------------------------------------- /doc/src/facetting.md: -------------------------------------------------------------------------------- 1 | # Facetting 2 | 3 | wewew 4 | 5 | ## weeewe 6 | -------------------------------------------------------------------------------- /doc/src/faq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skiff-org/tantivy/46d5de920dd1ac86fa7a74baa0debd933bcb6574/doc/src/faq.md -------------------------------------------------------------------------------- /doc/src/index_sorting.md: -------------------------------------------------------------------------------- 1 | 2 | - [Index Sorting](#index-sorting) 3 | + [Why Sorting](#why-sorting) 4 | * [Compression](#compression) 5 | * [Top-N Optimization](#top-n-optimization) 6 | * [Pruning](#pruning) 7 | * [Other](#other) 8 | + [Usage](#usage) 9 | 10 | # Index Sorting 11 | 12 | Tantivy allows you to sort the index according to a property. 13 | 14 | ## Why Sorting 15 | 16 | Presorting an index has several advantages: 17 | 18 | ###### Compression 19 | 20 | When data is sorted, it is easier to compress. E.g. the number sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5]. 21 | With delta encoding, the unsorted list becomes [5, -3, 1, -2, 3], while the sorted list becomes [1, 1, 1, 1, 1]. 22 | The compression gain mainly affects the fast field of the sorted property; everything else is likely unaffected. 23 | ###### Top-N Optimization 24 | 25 | When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents. 26 | E.g. if the data is sorted by timestamp and we want the top-n newest docs containing a term, we can simply leverage the order of the doc ids. 27 | 28 | Note: Tantivy 0.16 does not do this optimization yet. 29 | 30 | ###### Pruning 31 | 32 | Let's say we want all documents matching the filter `>= 2010-08-11`.
When the data is sorted, we can do a lookup in the fast field to find the doc id range and use it as the filter. 33 | 34 | Note: Tantivy 0.16 does not do this optimization yet. 35 | 36 | ###### Other? 37 | 38 | In principle, there are many possible algorithms that could exploit the monotonically increasing nature of the data (aggregations, maybe?). 39 | 40 | ## Usage 41 | Index sorting can be configured by setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16, only fast fields are allowed to be used. 42 | 43 | ``` 44 | let settings = IndexSettings { 45 | sort_by_field: Some(IndexSortByField { 46 | field: "intval".to_string(), 47 | order: Order::Desc, 48 | }), 49 | ..Default::default() 50 | }; 51 | let mut index_builder = Index::builder().schema(schema); 52 | index_builder = index_builder.settings(settings); 53 | let index = index_builder.create_in_ram().unwrap(); 54 | ``` 55 | 56 | ## Implementation details 57 | 58 | Sorting an index is applied in the serialization step. In general, there are two serialization steps: [Finishing a single segment](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073). 59 | 60 | In both cases, we generate a doc id mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fast fields, posting lists, fieldnorms, facets). 61 | 62 | -------------------------------------------------------------------------------- /doc/src/innerworkings.md: -------------------------------------------------------------------------------- 1 | # Innerworkings 2 | -------------------------------------------------------------------------------- /doc/src/inverted_index.md: -------------------------------------------------------------------------------- 1 | # Inverted index 2 | -------------------------------------------------------------------------------- /doc/src/schema.md: -------------------------------------------------------------------------------- 1 | # Defining your schema 2 | -------------------------------------------------------------------------------- /examples/faceted_search_with_tweaked_score.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use tantivy::collector::TopDocs; 4 | use tantivy::query::BooleanQuery; 5 | use tantivy::schema::*; 6 | use tantivy::{doc, DocId, Index, Score, SegmentReader}; 7 | 8 | fn main() -> tantivy::Result<()> { 9 | let mut schema_builder = Schema::builder(); 10 | 11 | let title = schema_builder.add_text_field("title", STORED); 12 | let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default()); 13 | 14 | let schema = schema_builder.build(); 15 | let index = Index::create_in_ram(schema); 16 | 17 | let mut index_writer = index.writer(30_000_000)?; 18 | 19 | index_writer.add_document(doc!( 20 | title => "Fried egg", 21 | ingredient => Facet::from("/ingredient/egg"), 22 | ingredient => Facet::from("/ingredient/oil"), 23 | ))?; 24 | index_writer.add_document(doc!( 25 | title => "Scrambled egg", 26 | ingredient => Facet::from("/ingredient/egg"), 27 | ingredient => Facet::from("/ingredient/butter"), 28 | ingredient => Facet::from("/ingredient/milk"), 29 |
ingredient => Facet::from("/ingredient/salt"), 30 | ))?; 31 | index_writer.add_document(doc!( 32 | title => "Egg rolls", 33 | ingredient => Facet::from("/ingredient/egg"), 34 | ingredient => Facet::from("/ingredient/garlic"), 35 | ingredient => Facet::from("/ingredient/salt"), 36 | ingredient => Facet::from("/ingredient/oil"), 37 | ingredient => Facet::from("/ingredient/tortilla-wrap"), 38 | ingredient => Facet::from("/ingredient/mushroom"), 39 | ))?; 40 | index_writer.commit()?; 41 | 42 | let reader = index.reader()?; 43 | let searcher = reader.searcher(); 44 | { 45 | let facets = vec![ 46 | Facet::from("/ingredient/egg"), 47 | Facet::from("/ingredient/oil"), 48 | Facet::from("/ingredient/garlic"), 49 | Facet::from("/ingredient/mushroom"), 50 | ]; 51 | let query = BooleanQuery::new_multiterms_query( 52 | facets 53 | .iter() 54 | .map(|key| Term::from_facet(ingredient, key)) 55 | .collect(), 56 | ); 57 | let top_docs_by_custom_score = 58 | TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| { 59 | let ingredient_reader = segment_reader.facet_reader(ingredient).unwrap(); 60 | let facet_dict = ingredient_reader.facet_dict(); 61 | 62 | let query_ords: HashSet = facets 63 | .iter() 64 | .filter_map(|key| facet_dict.term_ord(key.encoded_str()).unwrap()) 65 | .collect(); 66 | 67 | let mut facet_ords_buffer: Vec = Vec::with_capacity(20); 68 | 69 | move |doc: DocId, original_score: Score| { 70 | ingredient_reader.facet_ords(doc, &mut facet_ords_buffer); 71 | let missing_ingredients = facet_ords_buffer 72 | .iter() 73 | .filter(|ord| !query_ords.contains(ord)) 74 | .count(); 75 | let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32); 76 | 77 | original_score * tweak 78 | } 79 | }); 80 | let top_docs = searcher.search(&query, &top_docs_by_custom_score)?; 81 | 82 | let titles: Vec = top_docs 83 | .iter() 84 | .map(|(_, doc_id)| { 85 | searcher 86 | .doc(*doc_id) 87 | .unwrap() 88 | .get_first(title) 89 | .unwrap() 90 | .as_text() 91 | .unwrap() 92 | .to_owned() 93 | }) 94 | .collect(); 95 | assert_eq!(titles, vec!["Fried egg", "Egg rolls"]); 96 | } 97 | Ok(()) 98 | } 99 | -------------------------------------------------------------------------------- /examples/integer_range_search.rs: -------------------------------------------------------------------------------- 1 | // # Searching a range on an indexed int field. 2 | // 3 | // Below is an example of creating an indexed integer field in your schema 4 | // You can use RangeQuery to get a Count of all occurrences in a given range. 5 | use tantivy::collector::Count; 6 | use tantivy::query::RangeQuery; 7 | use tantivy::schema::{Schema, INDEXED}; 8 | use tantivy::{doc, Index, Result}; 9 | 10 | fn main() -> Result<()> { 11 | // For the sake of simplicity, this schema will only have 1 field 12 | let mut schema_builder = Schema::builder(); 13 | 14 | // `INDEXED` is a short-hand to indicate that our field should be "searchable". 15 | let year_field = schema_builder.add_u64_field("year", INDEXED); 16 | let schema = schema_builder.build(); 17 | let index = Index::create_in_ram(schema); 18 | let reader = index.reader()?; 19 | { 20 | let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?; 21 | for year in 1950u64..2019u64 { 22 | index_writer.add_document(doc!(year_field => year))?; 23 | } 24 | index_writer.commit()?; 25 | // The index will be a range of years 26 | } 27 | reader.reload()?; 28 | let searcher = reader.searcher(); 29 | // The end is excluded i.e. 
here we are searching up to 1969 30 | let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970); 31 | // Uses a Count collector to sum the total number of docs in the range 32 | let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?; 33 | assert_eq!(num_60s_books, 10); 34 | Ok(()) 35 | } 36 | -------------------------------------------------------------------------------- /examples/json_field.rs: -------------------------------------------------------------------------------- 1 | // # Json field example 2 | // 3 | // This example shows how the json field can be used 4 | // to make tantivy partially schemaless. 5 | 6 | use tantivy::collector::{Count, TopDocs}; 7 | use tantivy::query::QueryParser; 8 | use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT}; 9 | use tantivy::Index; 10 | 11 | fn main() -> tantivy::Result<()> { 12 | // # Defining the schema 13 | // 14 | // We need two fields: 15 | // - a timestamp 16 | // - a json object field 17 | let mut schema_builder = Schema::builder(); 18 | schema_builder.add_date_field("timestamp", FAST | STORED); 19 | let event_type = schema_builder.add_text_field("event_type", STRING | STORED); 20 | let attributes = schema_builder.add_json_field("attributes", STORED | TEXT); 21 | let schema = schema_builder.build(); 22 | 23 | // # Indexing documents 24 | let index = Index::create_in_ram(schema.clone()); 25 | 26 | let mut index_writer = index.writer(50_000_000)?; 27 | let doc = schema.parse_document( 28 | r#"{ 29 | "timestamp": "2022-02-22T23:20:50.53Z", 30 | "event_type": "click", 31 | "attributes": { 32 | "target": "submit-button", 33 | "cart": {"product_id": 103}, 34 | "description": "the best vacuum cleaner ever" 35 | } 36 | }"#, 37 | )?; 38 | index_writer.add_document(doc)?; 39 | let doc = schema.parse_document( 40 | r#"{ 41 | "timestamp": "2022-02-22T23:20:51.53Z", 42 | "event_type": "click", 43 | "attributes": { 44 | "target": "submit-button", 45 | "cart": {"product_id": 133}, 46 | "description": "das keyboard" 47 | } 48 | }"#, 49 | )?; 50 | index_writer.add_document(doc)?; 51 | index_writer.commit()?; 52 | 53 | let reader = index.reader()?; 54 | let searcher = reader.searcher(); 55 | 56 | let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]); 57 | { 58 | let query = query_parser.parse_query("target:submit-button")?; 59 | let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?; 60 | assert_eq!(count_docs.len(), 2); 61 | } 62 | { 63 | let query = query_parser.parse_query("target:submit")?; 64 | let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?; 65 | assert_eq!(count_docs.len(), 2); 66 | } 67 | { 68 | let query = query_parser.parse_query("cart.product_id:103")?; 69 | let count_docs = searcher.search(&*query, &Count)?; 70 | assert_eq!(count_docs, 1); 71 | } 72 | { 73 | let query = query_parser 74 | .parse_query("event_type:click AND cart.product_id:133") 75 | .unwrap(); 76 | let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap(); 77 | assert_eq!(hits.len(), 1); 78 | } 79 | Ok(()) 80 | } 81 | -------------------------------------------------------------------------------- /examples/snippet.rs: -------------------------------------------------------------------------------- 1 | // # Snippet example 2 | // 3 | // This example shows how to return a representative snippet of 4 | // your hit result. 5 | // Snippet are an extracted of a target document, and returned in HTML format. 6 | // The keyword searched by the user are highlighted with a `` tag. 
7 | 8 | // --- 9 | // Importing tantivy... 10 | use tantivy::collector::TopDocs; 11 | use tantivy::query::QueryParser; 12 | use tantivy::schema::*; 13 | use tantivy::{doc, Index, Snippet, SnippetGenerator}; 14 | use tempfile::TempDir; 15 | 16 | fn main() -> tantivy::Result<()> { 17 | // Let's create a temporary directory for the 18 | // sake of this example 19 | let index_path = TempDir::new()?; 20 | 21 | // # Defining the schema 22 | let mut schema_builder = Schema::builder(); 23 | let title = schema_builder.add_text_field("title", TEXT | STORED); 24 | let body = schema_builder.add_text_field("body", TEXT | STORED); 25 | let schema = schema_builder.build(); 26 | 27 | // # Indexing documents 28 | let index = Index::create_in_dir(&index_path, schema)?; 29 | 30 | let mut index_writer = index.writer(50_000_000)?; 31 | 32 | // we'll only need one doc for this example. 33 | index_writer.add_document(doc!( 34 | title => "Of Mice and Men", 35 | body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ 36 | bank and runs deep and green. The water is warm too, for it has slipped twinkling \ 37 | over the yellow sands in the sunlight before reaching the narrow pool. On one \ 38 | side of the river the golden foothill slopes curve up to the strong and rocky \ 39 | Gabilan Mountains, but on the valley side the water is lined with trees—willows \ 40 | fresh and green with every spring, carrying in their lower leaf junctures the \ 41 | debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ 42 | limbs and branches that arch over the pool" 43 | ))?; 44 | // ... 45 | index_writer.commit()?; 46 | 47 | let reader = index.reader()?; 48 | let searcher = reader.searcher(); 49 | let query_parser = QueryParser::for_index(&index, vec![title, body]); 50 | let query = query_parser.parse_query("sycamore spring")?; 51 | 52 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; 53 | 54 | let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; 55 | 56 | for (score, doc_address) in top_docs { 57 | let doc = searcher.doc(doc_address)?; 58 | let snippet = snippet_generator.snippet_from_doc(&doc); 59 | println!("Document score {}:", score); 60 | println!( 61 | "title: {}", 62 | doc.get_first(title).unwrap().as_text().unwrap() 63 | ); 64 | println!("snippet: {}", snippet.to_html()); 65 | println!("custom highlighting: {}", highlight(snippet)); 66 | } 67 | 68 | Ok(()) 69 | } 70 | 71 | fn highlight(snippet: Snippet) -> String { 72 | let mut result = String::new(); 73 | let mut start_from = 0; 74 | 75 | for fragment_range in snippet.highlighted() { 76 | result.push_str(&snippet.fragment()[start_from..fragment_range.start]); 77 | result.push_str(" --> "); 78 | result.push_str(&snippet.fragment()[fragment_range.clone()]); 79 | result.push_str(" <-- "); 80 | start_from = fragment_range.end; 81 | } 82 | 83 | result.push_str(&snippet.fragment()[start_from..]); 84 | result 85 | } 86 | -------------------------------------------------------------------------------- /examples/working_with_json.rs: -------------------------------------------------------------------------------- 1 | use tantivy::schema::*; 2 | 3 | // # Document from json 4 | // 5 | // For convenience, `Document` can be parsed directly from json. 6 | fn main() -> tantivy::Result<()> { 7 | // Let's first define a schema and an index. 8 | // Check out the basic example if this is confusing to you. 9 | // 10 | // first we need to define a schema ... 
11 | let mut schema_builder = Schema::builder(); 12 | schema_builder.add_text_field("title", TEXT | STORED); 13 | schema_builder.add_text_field("body", TEXT); 14 | schema_builder.add_u64_field("year", INDEXED); 15 | let schema = schema_builder.build(); 16 | 17 | // Let's assume we have a json-serialized document. 18 | let mice_and_men_doc_json = r#"{ 19 | "title": "Of Mice and Men", 20 | "year": 1937 21 | }"#; 22 | 23 | // We can parse our document 24 | let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?; 25 | 26 | // Multi-valued field are allowed, they are 27 | // expressed in JSON by an array. 28 | // The following document has two titles. 29 | let frankenstein_json = r#"{ 30 | "title": ["Frankenstein", "The Modern Prometheus"], 31 | "year": 1818 32 | }"#; 33 | let _frankenstein_doc = schema.parse_document(frankenstein_json)?; 34 | 35 | // Note that the schema is saved in your index directory. 36 | // 37 | // As a result, Indexes are aware of their schema, and you can use this feature 38 | // just by opening an existing `Index`, and calling `index.schema()..parse_document(json)`. 39 | Ok(()) 40 | } 41 | -------------------------------------------------------------------------------- /fastfield_codecs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fastfield_codecs" 3 | version = "0.1.0" 4 | authors = ["Pascal Seitz "] 5 | license = "MIT" 6 | edition = "2018" 7 | description = "Fast field codecs used by tantivy" 8 | 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | common = { version = "0.2", path = "../common/", package = "tantivy-common" } 13 | tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" } 14 | prettytable-rs = {version="0.8.0", optional= true} 15 | rand = {version="0.8.3", optional= true} 16 | 17 | [dev-dependencies] 18 | more-asserts = "0.2.1" 19 | rand = "0.8.3" 20 | 21 | [features] 22 | bin = ["prettytable-rs", "rand"] 23 | default = ["bin"] 24 | 25 | -------------------------------------------------------------------------------- /fastfield_codecs/benches/bench.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | #[cfg(test)] 6 | mod tests { 7 | use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}; 8 | use fastfield_codecs::linearinterpol::{ 9 | LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, 10 | }; 11 | use fastfield_codecs::multilinearinterpol::{ 12 | MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, 13 | }; 14 | use fastfield_codecs::*; 15 | 16 | fn get_data() -> Vec { 17 | let mut data: Vec<_> = (100..55000_u64) 18 | .map(|num| num + rand::random::() as u64) 19 | .collect(); 20 | data.push(99_000); 21 | data.insert(1000, 2000); 22 | data.insert(2000, 100); 23 | data.insert(3000, 4100); 24 | data.insert(4000, 100); 25 | data.insert(5000, 800); 26 | data 27 | } 28 | 29 | fn value_iter() -> impl Iterator { 30 | 0..20_000 31 | } 32 | fn bench_get( 33 | b: &mut Bencher, 34 | data: &[u64], 35 | ) { 36 | let mut bytes = vec![]; 37 | S::serialize( 38 | &mut bytes, 39 | &data, 40 | stats_from_vec(data), 41 | data.iter().cloned(), 42 | data.iter().cloned(), 43 | ) 44 | .unwrap(); 45 | let reader = R::open_from_bytes(&bytes).unwrap(); 46 | b.iter(|| { 47 | for pos in value_iter() { 48 | reader.get_u64(pos as u64, &bytes); 49 | } 50 | 
}); 51 | } 52 | fn bench_create(b: &mut Bencher, data: &[u64]) { 53 | let mut bytes = vec![]; 54 | b.iter(|| { 55 | S::serialize( 56 | &mut bytes, 57 | &data, 58 | stats_from_vec(data), 59 | data.iter().cloned(), 60 | data.iter().cloned(), 61 | ) 62 | .unwrap(); 63 | }); 64 | } 65 | 66 | use test::Bencher; 67 | #[bench] 68 | fn bench_fastfield_bitpack_create(b: &mut Bencher) { 69 | let data: Vec<_> = get_data(); 70 | bench_create::(b, &data); 71 | } 72 | #[bench] 73 | fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { 74 | let data: Vec<_> = get_data(); 75 | bench_create::(b, &data); 76 | } 77 | #[bench] 78 | fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { 79 | let data: Vec<_> = get_data(); 80 | bench_create::(b, &data); 81 | } 82 | #[bench] 83 | fn bench_fastfield_bitpack_get(b: &mut Bencher) { 84 | let data: Vec<_> = get_data(); 85 | bench_get::(b, &data); 86 | } 87 | #[bench] 88 | fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { 89 | let data: Vec<_> = get_data(); 90 | bench_get::(b, &data); 91 | } 92 | #[bench] 93 | fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { 94 | let data: Vec<_> = get_data(); 95 | bench_get::( 96 | b, &data, 97 | ); 98 | } 99 | pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { 100 | let min_value = data.iter().cloned().min().unwrap_or(0); 101 | let max_value = data.iter().cloned().max().unwrap_or(0); 102 | FastFieldStats { 103 | min_value, 104 | max_value, 105 | num_vals: data.len() as u64, 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /ownedbytes/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Paul Masurel ", "Pascal Seitz "] 3 | name = "ownedbytes" 4 | version = "0.2.0" 5 | edition = "2018" 6 | description = "Expose data as static slice" 7 | license = "MIT" 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | stable_deref_trait = "1.2.0" 12 | -------------------------------------------------------------------------------- /query-grammar/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy-query-grammar" 3 | version = "0.15.0" 4 | authors = ["Paul Masurel "] 5 | license = "MIT" 6 | categories = ["database-implementations", "data-structures"] 7 | description = """Search engine library""" 8 | homepage = "https://github.com/quickwit-oss/tantivy" 9 | repository = "https://github.com/quickwit-oss/tantivy" 10 | readme = "README.md" 11 | keywords = ["search", "information", "retrieval"] 12 | edition = "2018" 13 | 14 | [dependencies] 15 | combine = {version="4", default-features=false, features=[] } 16 | once_cell = "1.7.2" 17 | regex ={ version = "1.5.4", default-features = false, features = ["std"] } 18 | -------------------------------------------------------------------------------- /query-grammar/README.md: -------------------------------------------------------------------------------- 1 | # Tantivy Query Grammar 2 | 3 | This crate is used by tantivy to parse queries. 
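A minimal usage sketch: `parse_query` and `UserInputAst` are the entry points re-exported from `src/lib.rs`, and the exact syntax accepted is defined by `query_grammar.rs`.

```rust
use tantivy_query_grammar::{parse_query, UserInputAst};

fn main() {
    // Turn a raw user query into the grammar's AST. The `+` / `-` prefixes map
    // onto the `Occur::Must` / `Occur::MustNot` modifiers defined in `occur.rs`.
    let result: Result<UserInputAst, _> = parse_query("+title:frankenstein -year:1818");
    assert!(result.is_ok());

    // Callers (tantivy's query parser, for instance) then lower the AST into an
    // executable query against a concrete schema.
}
```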
4 | -------------------------------------------------------------------------------- /query-grammar/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod occur; 2 | mod query_grammar; 3 | mod user_input_ast; 4 | use combine::parser::Parser; 5 | 6 | pub use crate::occur::Occur; 7 | use crate::query_grammar::parse_to_ast; 8 | pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; 9 | 10 | pub struct Error; 11 | 12 | pub fn parse_query(query: &str) -> Result { 13 | let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?; 14 | Ok(user_input_ast) 15 | } 16 | -------------------------------------------------------------------------------- /query-grammar/src/occur.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::fmt::Write; 3 | 4 | /// Defines whether a term in a query must be present, 5 | /// should be present or must be not present. 6 | #[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)] 7 | pub enum Occur { 8 | /// For a given document to be considered for scoring, 9 | /// at least one of the document with the Should or the Must 10 | /// Occur constraint must be within the document. 11 | Should, 12 | /// Document without the term are excluded from the search. 13 | Must, 14 | /// Document that contain the term are excluded from the 15 | /// search. 16 | MustNot, 17 | } 18 | 19 | impl Occur { 20 | /// Returns the one-char prefix symbol for this `Occur`. 21 | /// - `Should` => '?', 22 | /// - `Must` => '+' 23 | /// - `Not` => '-' 24 | fn to_char(self) -> char { 25 | match self { 26 | Occur::Should => '?', 27 | Occur::Must => '+', 28 | Occur::MustNot => '-', 29 | } 30 | } 31 | 32 | /// Compose two occur values. 
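    ///
    /// Composing gives the effective `Occur` of a clause nested inside another
    /// clause. For illustration (the same cases are covered by the unit test below):
    ///
    /// ```rust
    /// use tantivy_query_grammar::Occur;
    ///
    /// // A negation nested inside a negation becomes a requirement.
    /// assert_eq!(Occur::compose(Occur::MustNot, Occur::MustNot), Occur::Must);
    /// // `Should` never overrides the other operand.
    /// assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
    /// ```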
33 | pub fn compose(left: Occur, right: Occur) -> Occur { 34 | match (left, right) { 35 | (Occur::Should, _) => right, 36 | (Occur::Must, Occur::MustNot) => Occur::MustNot, 37 | (Occur::Must, _) => Occur::Must, 38 | (Occur::MustNot, Occur::MustNot) => Occur::Must, 39 | (Occur::MustNot, _) => Occur::MustNot, 40 | } 41 | } 42 | } 43 | 44 | impl fmt::Display for Occur { 45 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 46 | f.write_char(self.to_char()) 47 | } 48 | } 49 | 50 | #[cfg(test)] 51 | mod test { 52 | use crate::Occur; 53 | 54 | #[test] 55 | fn test_occur_compose() { 56 | assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should); 57 | assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must); 58 | assert_eq!( 59 | Occur::compose(Occur::Should, Occur::MustNot), 60 | Occur::MustNot 61 | ); 62 | assert_eq!(Occur::compose(Occur::Must, Occur::Should), Occur::Must); 63 | assert_eq!(Occur::compose(Occur::Must, Occur::Must), Occur::Must); 64 | assert_eq!(Occur::compose(Occur::Must, Occur::MustNot), Occur::MustNot); 65 | assert_eq!( 66 | Occur::compose(Occur::MustNot, Occur::Should), 67 | Occur::MustNot 68 | ); 69 | assert_eq!(Occur::compose(Occur::MustNot, Occur::Must), Occur::MustNot); 70 | assert_eq!(Occur::compose(Occur::MustNot, Occur::MustNot), Occur::Must); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cargo test 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | comment_width = 120 2 | format_strings = true 3 | group_imports = "StdExternalCrate" 4 | imports_granularity = "Module" 5 | normalize_comments = true 6 | where_single_line = true 7 | wrap_comments = true 8 | -------------------------------------------------------------------------------- /src/aggregation/README.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When adding new bucket aggregation make sure to extend the "test_aggregation_flushing" test for at least 2 levels. 4 | 5 | 6 | 7 | # Code Organization 8 | 9 | Tantivy's aggregations have been designed to mimic the 10 | [aggregations of elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html). 11 | 12 | The code is organized in submodules: 13 | 14 | ## bucket 15 | Contains all bucket aggregations, like range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggegations. 16 | 17 | ## metric 18 | Contains all metric aggregations, like average aggregation. Metric aggregations do not have sub aggregations. 19 | 20 | #### agg_req 21 | agg_req contains the users aggregation request. Deserialization from json is compatible with elasticsearch aggregation requests. 22 | 23 | #### agg_req_with_accessor 24 | agg_req_with_accessor contains the users aggregation request enriched with fast field accessors etc, which are 25 | used during collection. 26 | 27 | #### segment_agg_result 28 | segment_agg_result contains the aggregation result tree, which is used for collection of a segment. 29 | The tree from agg_req_with_accessor is passed during collection. 30 | 31 | #### intermediate_agg_result 32 | intermediate_agg_result contains the aggregation tree for merging with other trees. 
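For orientation, a request with two levels (a bucket aggregation holding one metric sub-aggregation) looks roughly like this; field and aggregation names are illustrative, and the format follows the elasticsearch-style JSON that `agg_req` deserializes:

```json
{
  "score_ranges": {
    "range": {
      "field": "score",
      "ranges": [
        { "from": 3.0, "to": 7.0 },
        { "from": 7.0, "to": 20.0 }
      ]
    },
    "aggs": {
      "average_score": { "avg": { "field": "score" } }
    }
  }
}
```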
33 | 34 | #### agg_result 35 | agg_result contains the final aggregation tree. 36 | 37 | -------------------------------------------------------------------------------- /src/aggregation/bucket/histogram/mod.rs: -------------------------------------------------------------------------------- 1 | mod histogram; 2 | pub use histogram::*; 3 | -------------------------------------------------------------------------------- /src/aggregation/bucket/mod.rs: -------------------------------------------------------------------------------- 1 | //! Module for all bucket aggregations. 2 | //! 3 | //! BucketAggregations create buckets of documents 4 | //! [BucketAggregation](super::agg_req::BucketAggregation). 5 | //! 6 | //! Results of final buckets are [BucketResult](super::agg_result::BucketResult). 7 | //! Results of intermediate buckets are 8 | //! [IntermediateBucketResult](super::intermediate_agg_result::IntermediateBucketResult) 9 | 10 | mod histogram; 11 | mod range; 12 | 13 | pub(crate) use histogram::SegmentHistogramCollector; 14 | pub use histogram::*; 15 | pub(crate) use range::SegmentRangeCollector; 16 | pub use range::*; 17 | -------------------------------------------------------------------------------- /src/aggregation/metric/average.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::aggregation::f64_from_fastfield_u64; 6 | use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; 7 | use crate::schema::Type; 8 | use crate::DocId; 9 | 10 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 11 | /// A single-value metric aggregation that computes the average of numeric values that are 12 | /// extracted from the aggregated documents. 13 | /// Supported field types are u64, i64, and f64. 14 | /// See [super::SingleMetricResult] for return value. 15 | /// 16 | /// # JSON Format 17 | /// ```json 18 | /// { 19 | /// "avg": { 20 | /// "field": "score", 21 | /// } 22 | /// } 23 | /// ``` 24 | pub struct AverageAggregation { 25 | /// The field name to compute the stats on. 26 | pub field: String, 27 | } 28 | impl AverageAggregation { 29 | /// Create new AverageAggregation from a field. 30 | pub fn from_field_name(field_name: String) -> Self { 31 | AverageAggregation { field: field_name } 32 | } 33 | /// Return the field name. 
34 | pub fn field_name(&self) -> &str { 35 | &self.field 36 | } 37 | } 38 | 39 | #[derive(Clone, PartialEq)] 40 | pub(crate) struct SegmentAverageCollector { 41 | pub data: IntermediateAverage, 42 | field_type: Type, 43 | } 44 | 45 | impl Debug for SegmentAverageCollector { 46 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 47 | f.debug_struct("AverageCollector") 48 | .field("data", &self.data) 49 | .finish() 50 | } 51 | } 52 | 53 | impl SegmentAverageCollector { 54 | pub fn from_req(field_type: Type) -> Self { 55 | Self { 56 | field_type, 57 | data: Default::default(), 58 | } 59 | } 60 | pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader) { 61 | let mut iter = doc.chunks_exact(4); 62 | for docs in iter.by_ref() { 63 | let val1 = field.get(docs[0]); 64 | let val2 = field.get(docs[1]); 65 | let val3 = field.get(docs[2]); 66 | let val4 = field.get(docs[3]); 67 | let val1 = f64_from_fastfield_u64(val1, &self.field_type); 68 | let val2 = f64_from_fastfield_u64(val2, &self.field_type); 69 | let val3 = f64_from_fastfield_u64(val3, &self.field_type); 70 | let val4 = f64_from_fastfield_u64(val4, &self.field_type); 71 | self.data.collect(val1); 72 | self.data.collect(val2); 73 | self.data.collect(val3); 74 | self.data.collect(val4); 75 | } 76 | for doc in iter.remainder() { 77 | let val = field.get(*doc); 78 | let val = f64_from_fastfield_u64(val, &self.field_type); 79 | self.data.collect(val); 80 | } 81 | } 82 | } 83 | 84 | /// Contains mergeable version of average data. 85 | #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] 86 | pub struct IntermediateAverage { 87 | pub(crate) sum: f64, 88 | pub(crate) doc_count: u64, 89 | } 90 | 91 | impl IntermediateAverage { 92 | pub(crate) fn from_collector(collector: SegmentAverageCollector) -> Self { 93 | collector.data 94 | } 95 | 96 | /// Merge average data into this instance. 97 | pub fn merge_fruits(&mut self, other: IntermediateAverage) { 98 | self.sum += other.sum; 99 | self.doc_count += other.doc_count; 100 | } 101 | /// compute final result 102 | pub fn finalize(&self) -> Option { 103 | if self.doc_count == 0 { 104 | None 105 | } else { 106 | Some(self.sum / self.doc_count as f64) 107 | } 108 | } 109 | #[inline] 110 | fn collect(&mut self, val: f64) { 111 | self.doc_count += 1; 112 | self.sum += val; 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/aggregation/metric/mod.rs: -------------------------------------------------------------------------------- 1 | //! Module for all metric aggregations. 2 | //! 3 | //! The aggregations in this family compute metrics, see [super::agg_req::MetricAggregation] for 4 | //! details. 5 | mod average; 6 | mod stats; 7 | pub use average::*; 8 | use serde::{Deserialize, Serialize}; 9 | pub use stats::*; 10 | 11 | /// Single-metric aggregations use this common result structure. 12 | /// 13 | /// Main reason to wrap it in value is to match elasticsearch output structure. 14 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 15 | pub struct SingleMetricResult { 16 | /// The value of the single value metric. 
17 | pub value: Option, 18 | } 19 | 20 | impl From for SingleMetricResult { 21 | fn from(value: f64) -> Self { 22 | Self { value: Some(value) } 23 | } 24 | } 25 | 26 | impl From> for SingleMetricResult { 27 | fn from(value: Option) -> Self { 28 | Self { value } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/collector/count_collector.rs: -------------------------------------------------------------------------------- 1 | use super::Collector; 2 | use crate::collector::SegmentCollector; 3 | use crate::{DocId, Score, SegmentOrdinal, SegmentReader}; 4 | 5 | /// `CountCollector` collector only counts how many 6 | /// documents match the query. 7 | /// 8 | /// ```rust 9 | /// use tantivy::collector::Count; 10 | /// use tantivy::query::QueryParser; 11 | /// use tantivy::schema::{Schema, TEXT}; 12 | /// use tantivy::{doc, Index}; 13 | /// 14 | /// let mut schema_builder = Schema::builder(); 15 | /// let title = schema_builder.add_text_field("title", TEXT); 16 | /// let schema = schema_builder.build(); 17 | /// let index = Index::create_in_ram(schema); 18 | /// 19 | /// let mut index_writer = index.writer(3_000_000).unwrap(); 20 | /// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap(); 21 | /// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap(); 22 | /// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap(); 23 | /// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap(); 24 | /// assert!(index_writer.commit().is_ok()); 25 | /// 26 | /// let reader = index.reader().unwrap(); 27 | /// let searcher = reader.searcher(); 28 | /// 29 | /// // Here comes the important part 30 | /// let query_parser = QueryParser::for_index(&index, vec![title]); 31 | /// let query = query_parser.parse_query("diary").unwrap(); 32 | /// let count = searcher.search(&query, &Count).unwrap(); 33 | /// 34 | /// assert_eq!(count, 2); 35 | /// ``` 36 | pub struct Count; 37 | 38 | impl Collector for Count { 39 | type Fruit = usize; 40 | 41 | type Child = SegmentCountCollector; 42 | 43 | fn for_segment( 44 | &self, 45 | _: SegmentOrdinal, 46 | _: &SegmentReader, 47 | ) -> crate::Result { 48 | Ok(SegmentCountCollector::default()) 49 | } 50 | 51 | fn requires_scoring(&self) -> bool { 52 | false 53 | } 54 | 55 | fn merge_fruits(&self, segment_counts: Vec) -> crate::Result { 56 | Ok(segment_counts.into_iter().sum()) 57 | } 58 | } 59 | 60 | #[derive(Default)] 61 | pub struct SegmentCountCollector { 62 | count: usize, 63 | } 64 | 65 | impl SegmentCollector for SegmentCountCollector { 66 | type Fruit = usize; 67 | 68 | fn collect(&mut self, _: DocId, _: Score) { 69 | self.count += 1; 70 | } 71 | 72 | fn harvest(self) -> usize { 73 | self.count 74 | } 75 | } 76 | 77 | #[cfg(test)] 78 | mod tests { 79 | use super::{Count, SegmentCountCollector}; 80 | use crate::collector::{Collector, SegmentCollector}; 81 | 82 | #[test] 83 | fn test_count_collect_does_not_requires_scoring() { 84 | assert!(!Count.requires_scoring()); 85 | } 86 | 87 | #[test] 88 | fn test_segment_count_collector() { 89 | { 90 | let count_collector = SegmentCountCollector::default(); 91 | assert_eq!(count_collector.harvest(), 0); 92 | } 93 | { 94 | let mut count_collector = SegmentCountCollector::default(); 95 | count_collector.collect(0u32, 1.0); 96 | assert_eq!(count_collector.harvest(), 1); 97 | } 98 | { 99 | let mut count_collector = SegmentCountCollector::default(); 100 | count_collector.collect(0u32, 1.0); 101 | 
assert_eq!(count_collector.harvest(), 1); 102 | } 103 | { 104 | let mut count_collector = SegmentCountCollector::default(); 105 | count_collector.collect(0u32, 1.0); 106 | count_collector.collect(1u32, 1.0); 107 | assert_eq!(count_collector.harvest(), 2); 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/collector/docset_collector.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use super::{Collector, SegmentCollector}; 4 | use crate::{DocAddress, DocId, Score}; 5 | 6 | /// Collectors that returns the set of DocAddress that matches the query. 7 | /// 8 | /// This collector is mostly useful for tests. 9 | pub struct DocSetCollector; 10 | 11 | impl Collector for DocSetCollector { 12 | type Fruit = HashSet; 13 | type Child = DocSetChildCollector; 14 | 15 | fn for_segment( 16 | &self, 17 | segment_local_id: crate::SegmentOrdinal, 18 | _segment: &crate::SegmentReader, 19 | ) -> crate::Result { 20 | Ok(DocSetChildCollector { 21 | segment_local_id, 22 | docs: HashSet::new(), 23 | }) 24 | } 25 | 26 | fn requires_scoring(&self) -> bool { 27 | false 28 | } 29 | 30 | fn merge_fruits( 31 | &self, 32 | segment_fruits: Vec<(u32, HashSet)>, 33 | ) -> crate::Result { 34 | let len: usize = segment_fruits.iter().map(|(_, docset)| docset.len()).sum(); 35 | let mut result = HashSet::with_capacity(len); 36 | for (segment_local_id, docs) in segment_fruits { 37 | for doc in docs { 38 | result.insert(DocAddress::new(segment_local_id, doc)); 39 | } 40 | } 41 | Ok(result) 42 | } 43 | } 44 | 45 | pub struct DocSetChildCollector { 46 | segment_local_id: u32, 47 | docs: HashSet, 48 | } 49 | 50 | impl SegmentCollector for DocSetChildCollector { 51 | type Fruit = (u32, HashSet); 52 | 53 | fn collect(&mut self, doc: crate::DocId, _score: Score) { 54 | self.docs.insert(doc); 55 | } 56 | 57 | fn harvest(self) -> (u32, HashSet) { 58 | (self.segment_local_id, self.docs) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/core/mod.rs: -------------------------------------------------------------------------------- 1 | mod executor; 2 | pub mod index; 3 | mod index_meta; 4 | mod inverted_index_reader; 5 | pub mod searcher; 6 | mod segment; 7 | mod segment_component; 8 | mod segment_id; 9 | mod segment_reader; 10 | 11 | use std::path::Path; 12 | 13 | use once_cell::sync::Lazy; 14 | 15 | pub use self::executor::Executor; 16 | pub use self::index::{Index, IndexBuilder}; 17 | pub use self::index_meta::{ 18 | IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory, 19 | }; 20 | pub use self::inverted_index_reader::InvertedIndexReader; 21 | pub use self::searcher::{Searcher, SearcherGeneration}; 22 | pub use self::segment::Segment; 23 | pub use self::segment_component::SegmentComponent; 24 | pub use self::segment_id::SegmentId; 25 | pub use self::segment_reader::SegmentReader; 26 | 27 | /// The meta file contains all the information about the list of segments and the schema 28 | /// of the index. 29 | pub static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json")); 30 | 31 | /// The managed file contains a list of files that were created by the tantivy 32 | /// and will therefore be garbage collected when they are deemed useless by tantivy. 
33 | /// 34 | /// Removing this file is safe, but will prevent the garbage collection of all of the file that 35 | /// are currently in the directory 36 | pub static MANAGED_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new(".managed.json")); 37 | -------------------------------------------------------------------------------- /src/core/segment.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::path::PathBuf; 3 | 4 | use super::SegmentComponent; 5 | use crate::core::{Index, SegmentId, SegmentMeta}; 6 | use crate::directory::error::{OpenReadError, OpenWriteError}; 7 | use crate::directory::{Directory, FileSlice, WritePtr}; 8 | use crate::schema::Schema; 9 | use crate::Opstamp; 10 | 11 | /// A segment is a piece of the index. 12 | #[derive(Clone)] 13 | pub struct Segment { 14 | index: Index, 15 | meta: SegmentMeta, 16 | } 17 | 18 | impl fmt::Debug for Segment { 19 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 20 | write!(f, "Segment({:?})", self.id().uuid_string()) 21 | } 22 | } 23 | 24 | impl Segment { 25 | /// Creates a new segment given an `Index` and a `SegmentId` 26 | pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment { 27 | Segment { index, meta } 28 | } 29 | 30 | /// Returns the index the segment belongs to. 31 | pub fn index(&self) -> &Index { 32 | &self.index 33 | } 34 | 35 | /// Returns our index's schema. 36 | pub fn schema(&self) -> Schema { 37 | self.index.schema() 38 | } 39 | 40 | /// Returns the segment meta-information 41 | pub fn meta(&self) -> &SegmentMeta { 42 | &self.meta 43 | } 44 | 45 | /// Updates the max_doc value from the `SegmentMeta`. 46 | /// 47 | /// This method is only used when updating `max_doc` from 0 48 | /// as we finalize a fresh new segment. 49 | pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment { 50 | Segment { 51 | index: self.index, 52 | meta: self.meta.with_max_doc(max_doc), 53 | } 54 | } 55 | 56 | #[doc(hidden)] 57 | #[must_use] 58 | pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment { 59 | Segment { 60 | index: self.index, 61 | meta: self.meta.with_delete_meta(num_deleted_docs, opstamp), 62 | } 63 | } 64 | 65 | /// Returns the segment's id. 66 | pub fn id(&self) -> SegmentId { 67 | self.meta.id() 68 | } 69 | 70 | /// Returns the relative path of a component of our segment. 71 | /// 72 | /// It just joins the segment id with the extension 73 | /// associated to a segment component. 74 | pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { 75 | self.meta.relative_path(component) 76 | } 77 | 78 | /// Open one of the component file for a *regular* read. 79 | pub fn open_read(&self, component: SegmentComponent) -> Result { 80 | let path = self.relative_path(component); 81 | self.index.directory().open_read(&path) 82 | } 83 | 84 | /// Open one of the component file for *regular* write. 85 | pub fn open_write(&mut self, component: SegmentComponent) -> Result { 86 | let path = self.relative_path(component); 87 | let write = self.index.directory_mut().open_write(&path)?; 88 | Ok(write) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/core/segment_component.rs: -------------------------------------------------------------------------------- 1 | use std::slice; 2 | 3 | /// Enum describing each component of a tantivy segment. 
4 | /// Each component is stored in its own file, 5 | /// using the pattern `segment_uuid`.`component_extension`, 6 | /// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension` 7 | #[derive(Copy, Clone, Eq, PartialEq)] 8 | pub enum SegmentComponent { 9 | /// Postings (or inverted list). Sorted lists of document ids, associated to terms 10 | Postings, 11 | /// Positions of terms in each document. 12 | Positions, 13 | /// Column-oriented random-access storage of fields. 14 | FastFields, 15 | /// Stores the sum of the length (in terms) of each field for each document. 16 | /// Field norms are stored as a special u64 fast field. 17 | FieldNorms, 18 | /// Dictionary associating `Term`s to `TermInfo`s which is 19 | /// simply an address into the `postings` file and the `positions` file. 20 | Terms, 21 | /// Row-oriented, compressed storage of the documents. 22 | /// Accessing a document from the store is relatively slow, as it 23 | /// requires to decompress the entire block it belongs to. 24 | Store, 25 | /// Temporary storage of the documents, before streamed to `Store`. 26 | TempStore, 27 | /// Bitset describing which document of the segment is deleted. 28 | Delete, 29 | } 30 | 31 | impl SegmentComponent { 32 | /// Iterates through the components. 33 | pub fn iterator() -> slice::Iter<'static, SegmentComponent> { 34 | static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [ 35 | SegmentComponent::Postings, 36 | SegmentComponent::Positions, 37 | SegmentComponent::FastFields, 38 | SegmentComponent::FieldNorms, 39 | SegmentComponent::Terms, 40 | SegmentComponent::Store, 41 | SegmentComponent::TempStore, 42 | SegmentComponent::Delete, 43 | ]; 44 | SEGMENT_COMPONENTS.iter() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/directory/directory_lock.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use once_cell::sync::Lazy; 4 | 5 | /// A directory lock. 6 | /// 7 | /// A lock is associated to a specific path and some 8 | /// [`LockParams`](./enum.LockParams.html). 9 | /// Tantivy itself uses only two locks but client application 10 | /// can use the directory facility to define their own locks. 11 | /// - [INDEX_WRITER_LOCK] 12 | /// - [META_LOCK] 13 | /// 14 | /// Check out these locks documentation for more information. 15 | #[derive(Debug)] 16 | pub struct Lock { 17 | /// The lock needs to be associated with its own file `path`. 18 | /// Depending on the platform, the lock might rely on the creation 19 | /// and deletion of this filepath. 20 | pub filepath: PathBuf, 21 | /// `lock_params` describes whether acquiring the lock is meant 22 | /// to be a blocking operation or a non-blocking. 23 | /// 24 | /// Acquiring a blocking lock blocks until the lock is 25 | /// available. 26 | /// Acquiring a blocking lock returns rapidly, either successfully 27 | /// or with an error signifying that someone is already holding 28 | /// the lock. 29 | pub is_blocking: bool, 30 | } 31 | 32 | /// Only one process should be able to write tantivy's index at a time. 33 | /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter. 34 | /// 35 | /// If the process is killed and this file remains, it is safe to remove it manually. 
36 | /// 37 | /// Failing to acquire this lock usually means a misuse of tantivy's API, 38 | /// (creating more than one instance of the `IndexWriter`), are a spurious 39 | /// lock file remaining after a crash. In the latter case, removing the file after 40 | /// checking no process running tantivy is running is safe. 41 | pub static INDEX_WRITER_LOCK: Lazy = Lazy::new(|| Lock { 42 | filepath: PathBuf::from(".tantivy-writer.lock"), 43 | is_blocking: false, 44 | }); 45 | /// The meta lock file is here to protect the segment files being opened by 46 | /// `IndexReader::reload()` from being garbage collected. 47 | /// It makes it possible for another process to safely consume 48 | /// our index in-writing. Ideally, we may have prefered `RWLock` semantics 49 | /// here, but it is difficult to achieve on Windows. 50 | /// 51 | /// Opening segment readers is a very fast process. 52 | pub static META_LOCK: Lazy = Lazy::new(|| Lock { 53 | filepath: PathBuf::from(".tantivy-meta.lock"), 54 | is_blocking: true, 55 | }); 56 | -------------------------------------------------------------------------------- /src/directory/mod.rs: -------------------------------------------------------------------------------- 1 | //! WORM (Write Once Read Many) directory abstraction. 2 | 3 | #[cfg(feature = "mmap")] 4 | mod mmap_directory; 5 | 6 | mod directory; 7 | mod directory_lock; 8 | mod file_slice; 9 | mod file_watcher; 10 | mod footer; 11 | mod managed_directory; 12 | mod ram_directory; 13 | mod watch_event_router; 14 | 15 | /// Errors specific to the directory module. 16 | pub mod error; 17 | 18 | mod composite_file; 19 | 20 | use std::io::BufWriter; 21 | use std::path::PathBuf; 22 | 23 | pub use common::{AntiCallToken, TerminatingWrite}; 24 | pub use ownedbytes::OwnedBytes; 25 | 26 | pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; 27 | pub use self::directory::{Directory, DirectoryClone, DirectoryLock}; 28 | pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK}; 29 | pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes}; 30 | pub use self::file_slice::{FileHandle, FileSlice}; 31 | pub use self::ram_directory::RamDirectory; 32 | pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle}; 33 | 34 | /// Outcome of the Garbage collection 35 | pub struct GarbageCollectionResult { 36 | /// List of files that were deleted in this cycle 37 | pub deleted_files: Vec, 38 | /// List of files that were schedule to be deleted in this cycle, 39 | /// but deletion did not work. This typically happens on windows, 40 | /// as deleting a memory mapped file is forbidden. 41 | /// 42 | /// If a searcher is still held, a file cannot be deleted. 43 | /// This is not considered a bug, the file will simply be deleted 44 | /// in the next GC. 45 | pub failed_to_delete_files: Vec, 46 | } 47 | 48 | pub use self::managed_directory::ManagedDirectory; 49 | #[cfg(feature = "mmap")] 50 | pub use self::mmap_directory::MmapDirectory; 51 | 52 | /// Write object for Directory. 53 | /// 54 | /// `WritePtr` are required to implement both Write 55 | /// and Seek. 
56 | pub type WritePtr = BufWriter>; 57 | 58 | #[cfg(test)] 59 | mod tests; 60 | -------------------------------------------------------------------------------- /src/fastfield/bytes/reader.rs: -------------------------------------------------------------------------------- 1 | use crate::directory::{FileSlice, OwnedBytes}; 2 | use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, MultiValueLength}; 3 | use crate::DocId; 4 | 5 | /// Reader for byte array fast fields 6 | /// 7 | /// The reader is implemented as a `u64` fast field and a separate collection of bytes. 8 | /// 9 | /// The `vals_reader` will access the concatenated list of all values for all documents. 10 | /// 11 | /// The `idx_reader` associates, for each document, the index of its first value. 12 | /// 13 | /// Reading the value for a document is done by reading the start index for it, 14 | /// and the start index for the next document, and keeping the bytes in between. 15 | #[derive(Clone)] 16 | pub struct BytesFastFieldReader { 17 | idx_reader: DynamicFastFieldReader, 18 | values: OwnedBytes, 19 | } 20 | 21 | impl BytesFastFieldReader { 22 | pub(crate) fn open( 23 | idx_reader: DynamicFastFieldReader, 24 | values_file: FileSlice, 25 | ) -> crate::Result { 26 | let values = values_file.read_bytes()?; 27 | Ok(BytesFastFieldReader { idx_reader, values }) 28 | } 29 | 30 | fn range(&self, doc: DocId) -> (usize, usize) { 31 | let start = self.idx_reader.get(doc) as usize; 32 | let stop = self.idx_reader.get(doc + 1) as usize; 33 | (start, stop) 34 | } 35 | 36 | /// Returns the bytes associated to the given `doc` 37 | pub fn get_bytes(&self, doc: DocId) -> &[u8] { 38 | let (start, stop) = self.range(doc); 39 | &self.values.as_slice()[start..stop] 40 | } 41 | 42 | /// Returns the length of the bytes associated to the given `doc` 43 | pub fn num_bytes(&self, doc: DocId) -> usize { 44 | let (start, stop) = self.range(doc); 45 | stop - start 46 | } 47 | 48 | /// Returns the overall number of bytes in this bytes fast field. 49 | pub fn total_num_bytes(&self) -> usize { 50 | self.values.len() 51 | } 52 | } 53 | 54 | impl MultiValueLength for BytesFastFieldReader { 55 | fn get_len(&self, doc_id: DocId) -> u64 { 56 | self.num_bytes(doc_id) as u64 57 | } 58 | fn get_total_len(&self) -> u64 { 59 | self.total_num_bytes() as u64 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/fastfield/error.rs: -------------------------------------------------------------------------------- 1 | use std::result; 2 | 3 | use crate::schema::FieldEntry; 4 | 5 | /// `FastFieldNotAvailableError` is returned when the 6 | /// user requested for a fast field reader, and the field was not 7 | /// defined in the schema as a fast field. 8 | #[derive(Debug, Error)] 9 | #[error("Fast field not available: '{field_name:?}'")] 10 | pub struct FastFieldNotAvailableError { 11 | field_name: String, 12 | } 13 | 14 | impl FastFieldNotAvailableError { 15 | /// Creates a `FastFieldNotAvailable` error. 16 | /// `field_entry` is the configuration of the field 17 | /// for which fast fields are not available. 18 | pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError { 19 | FastFieldNotAvailableError { 20 | field_name: field_entry.name().to_string(), 21 | } 22 | } 23 | } 24 | 25 | /// Result when trying to access a fast field reader. 
26 | pub type Result = result::Result; 27 | -------------------------------------------------------------------------------- /src/fieldnorm/serializer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::Write; 3 | 4 | use crate::directory::{CompositeWrite, WritePtr}; 5 | use crate::schema::Field; 6 | 7 | /// The fieldnorms serializer is in charge of 8 | /// the serialization of field norms for all fields. 9 | pub struct FieldNormsSerializer { 10 | composite_write: CompositeWrite, 11 | } 12 | 13 | impl FieldNormsSerializer { 14 | /// Constructor 15 | pub fn from_write(write: WritePtr) -> io::Result { 16 | // just making room for the pointer to header. 17 | let composite_write = CompositeWrite::wrap(write); 18 | Ok(FieldNormsSerializer { composite_write }) 19 | } 20 | 21 | /// Serialize the given field 22 | pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> { 23 | let write = self.composite_write.for_field(field); 24 | write.write_all(fieldnorms_data)?; 25 | write.flush()?; 26 | Ok(()) 27 | } 28 | 29 | /// Clean up / flush / close 30 | pub fn close(self) -> io::Result<()> { 31 | self.composite_write.close()?; 32 | Ok(()) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/indexer/doc_opstamp_mapping.rs: -------------------------------------------------------------------------------- 1 | use crate::{DocId, Opstamp}; 2 | 3 | // Doc to opstamp is used to identify which 4 | // document should be deleted. 5 | // 6 | // Since the docset matching the query of a delete operation 7 | // is not computed right when the delete operation is received, 8 | // we need to find a way to evaluate, for each document, 9 | // whether the document was added before or after 10 | // the delete operation. This anteriority is used by comparing 11 | // the docstamp of the document. 12 | // 13 | // The doc to opstamp mapping stores precisely an array 14 | // indexed by doc id and storing the opstamp of the document. 15 | // 16 | // This mapping is NOT necessarily increasing, because 17 | // we might be sorting documents according to a fast field. 18 | #[derive(Clone)] 19 | pub enum DocToOpstampMapping<'a> { 20 | WithMap(&'a [Opstamp]), 21 | None, 22 | } 23 | 24 | impl<'a> DocToOpstampMapping<'a> { 25 | /// Assess whether a document should be considered deleted given that it contains 26 | /// a deleted term that was deleted at the opstamp: `delete_opstamp`. 27 | /// 28 | /// This function returns true if the `DocToOpstamp` mapping is none or if 29 | /// the `doc_opstamp` is anterior to the delete opstamp. 
30 | pub fn is_deleted(&self, doc_id: DocId, delete_opstamp: Opstamp) -> bool { 31 | match self { 32 | Self::WithMap(doc_opstamps) => { 33 | let doc_opstamp = doc_opstamps[doc_id as usize]; 34 | doc_opstamp < delete_opstamp 35 | } 36 | Self::None => true, 37 | } 38 | } 39 | } 40 | 41 | #[cfg(test)] 42 | mod tests { 43 | 44 | use super::DocToOpstampMapping; 45 | 46 | #[test] 47 | fn test_doc_to_opstamp_mapping_none() { 48 | let doc_to_opstamp_mapping = DocToOpstampMapping::None; 49 | assert!(doc_to_opstamp_mapping.is_deleted(1u32, 0u64)); 50 | assert!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64)); 51 | } 52 | 53 | #[test] 54 | fn test_doc_to_opstamp_mapping_with_map() { 55 | let doc_to_opstamp_mapping = DocToOpstampMapping::WithMap(&[5u64, 1u64, 0u64, 4u64, 3u64]); 56 | assert_eq!(doc_to_opstamp_mapping.is_deleted(0u32, 2u64), false); 57 | assert_eq!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64), true); 58 | assert_eq!(doc_to_opstamp_mapping.is_deleted(2u32, 2u64), true); 59 | assert_eq!(doc_to_opstamp_mapping.is_deleted(3u32, 2u64), false); 60 | assert_eq!(doc_to_opstamp_mapping.is_deleted(4u32, 2u64), false); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/indexer/index_writer_status.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicBool, Ordering}; 2 | use std::sync::{Arc, RwLock}; 3 | 4 | use super::AddBatchReceiver; 5 | 6 | #[derive(Clone)] 7 | pub(crate) struct IndexWriterStatus { 8 | inner: Arc, 9 | } 10 | 11 | impl IndexWriterStatus { 12 | /// Returns true iff the index writer is alive. 13 | pub fn is_alive(&self) -> bool { 14 | self.inner.as_ref().is_alive() 15 | } 16 | 17 | /// Returns a copy of the operation receiver. 18 | /// If the index writer was killed, returns None. 19 | pub fn operation_receiver(&self) -> Option { 20 | let rlock = self 21 | .inner 22 | .receive_channel 23 | .read() 24 | .expect("This lock should never be poisoned"); 25 | rlock.as_ref().cloned() 26 | } 27 | 28 | /// Create an index writer bomb. 29 | /// If dropped, the index writer status will be killed. 30 | pub(crate) fn create_bomb(&self) -> IndexWriterBomb { 31 | IndexWriterBomb { 32 | inner: Some(self.inner.clone()), 33 | } 34 | } 35 | } 36 | 37 | struct Inner { 38 | is_alive: AtomicBool, 39 | receive_channel: RwLock>, 40 | } 41 | 42 | impl Inner { 43 | fn is_alive(&self) -> bool { 44 | self.is_alive.load(Ordering::Relaxed) 45 | } 46 | 47 | fn kill(&self) { 48 | self.is_alive.store(false, Ordering::Relaxed); 49 | self.receive_channel 50 | .write() 51 | .expect("This lock should never be poisoned") 52 | .take(); 53 | } 54 | } 55 | 56 | impl From for IndexWriterStatus { 57 | fn from(receiver: AddBatchReceiver) -> Self { 58 | IndexWriterStatus { 59 | inner: Arc::new(Inner { 60 | is_alive: AtomicBool::new(true), 61 | receive_channel: RwLock::new(Some(receiver)), 62 | }), 63 | } 64 | } 65 | } 66 | 67 | /// If dropped, the index writer will be killed. 68 | /// To prevent this, clients can call `.defuse()`. 69 | pub(crate) struct IndexWriterBomb { 70 | inner: Option>, 71 | } 72 | 73 | impl IndexWriterBomb { 74 | /// Defuses the bomb. 75 | /// 76 | /// This is the only way to drop the bomb without killing 77 | /// the index writer. 
78 | pub fn defuse(mut self) { 79 | self.inner = None; 80 | } 81 | } 82 | 83 | impl Drop for IndexWriterBomb { 84 | fn drop(&mut self) { 85 | if let Some(inner) = self.inner.take() { 86 | inner.kill(); 87 | } 88 | } 89 | } 90 | 91 | #[cfg(test)] 92 | mod tests { 93 | use std::mem; 94 | 95 | use crossbeam::channel; 96 | 97 | use super::IndexWriterStatus; 98 | 99 | #[test] 100 | fn test_bomb_goes_boom() { 101 | let (_tx, rx) = channel::bounded(10); 102 | let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx); 103 | assert!(index_writer_status.operation_receiver().is_some()); 104 | let bomb = index_writer_status.create_bomb(); 105 | assert!(index_writer_status.operation_receiver().is_some()); 106 | mem::drop(bomb); 107 | // boom! 108 | assert!(index_writer_status.operation_receiver().is_none()); 109 | } 110 | 111 | #[test] 112 | fn test_bomb_defused() { 113 | let (_tx, rx) = channel::bounded(10); 114 | let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx); 115 | assert!(index_writer_status.operation_receiver().is_some()); 116 | let bomb = index_writer_status.create_bomb(); 117 | bomb.defuse(); 118 | assert!(index_writer_status.operation_receiver().is_some()); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/indexer/merge_operation.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::ops::Deref; 3 | 4 | use crate::{Inventory, Opstamp, SegmentId, TrackedObject}; 5 | 6 | #[derive(Default)] 7 | pub(crate) struct MergeOperationInventory(Inventory); 8 | 9 | impl Deref for MergeOperationInventory { 10 | type Target = Inventory; 11 | 12 | fn deref(&self) -> &Self::Target { 13 | &self.0 14 | } 15 | } 16 | 17 | impl MergeOperationInventory { 18 | pub fn segment_in_merge(&self) -> HashSet { 19 | let mut segment_in_merge = HashSet::default(); 20 | for merge_op in self.list() { 21 | for &segment_id in &merge_op.segment_ids { 22 | segment_in_merge.insert(segment_id); 23 | } 24 | } 25 | segment_in_merge 26 | } 27 | } 28 | 29 | /// A `MergeOperation` has two roles. 30 | /// It carries all of the information required to describe a merge: 31 | /// - `target_opstamp` is the opstamp up to which we want to consume the 32 | /// delete queue and reflect their deletes. 33 | /// - `segment_ids` is the list of segment to be merged. 34 | /// 35 | /// The second role is to ensure keep track of the fact that these 36 | /// segments are in merge and avoid starting a merge operation that 37 | /// may conflict with this one. 38 | /// 39 | /// This works by tracking merge operations. When considering computing 40 | /// merge candidates, we simply list tracked merge operations and remove 41 | /// their segments from possible merge candidates. 42 | pub struct MergeOperation { 43 | inner: TrackedObject, 44 | } 45 | 46 | pub(crate) struct InnerMergeOperation { 47 | target_opstamp: Opstamp, 48 | segment_ids: Vec, 49 | } 50 | 51 | impl MergeOperation { 52 | pub(crate) fn new( 53 | inventory: &MergeOperationInventory, 54 | target_opstamp: Opstamp, 55 | segment_ids: Vec, 56 | ) -> MergeOperation { 57 | let inner_merge_operation = InnerMergeOperation { 58 | target_opstamp, 59 | segment_ids, 60 | }; 61 | MergeOperation { 62 | inner: inventory.track(inner_merge_operation), 63 | } 64 | } 65 | 66 | pub fn target_opstamp(&self) -> Opstamp { 67 | self.inner.target_opstamp 68 | } 69 | 70 | pub fn segment_ids(&self) -> &[SegmentId] { 71 | &self.inner.segment_ids[..] 
72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/indexer/merge_policy.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | use std::marker; 3 | 4 | use crate::core::{SegmentId, SegmentMeta}; 5 | 6 | /// Set of segment suggested for a merge. 7 | #[derive(Debug, Clone)] 8 | pub struct MergeCandidate(pub Vec); 9 | 10 | /// The `MergePolicy` defines which segments should be merged. 11 | /// 12 | /// Every time a the list of segments changes, the segment updater 13 | /// asks the merge policy if some segments should be merged. 14 | pub trait MergePolicy: marker::Send + marker::Sync + Debug { 15 | /// Given the list of segment metas, returns the list of merge candidates. 16 | /// 17 | /// This call happens on the segment updater thread, and will block 18 | /// other segment updates, so all implementations should happen rapidly. 19 | fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec; 20 | } 21 | 22 | /// Never merge segments. 23 | #[derive(Debug, Clone)] 24 | pub struct NoMergePolicy; 25 | 26 | impl Default for NoMergePolicy { 27 | fn default() -> NoMergePolicy { 28 | NoMergePolicy 29 | } 30 | } 31 | 32 | impl MergePolicy for NoMergePolicy { 33 | fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec { 34 | Vec::new() 35 | } 36 | } 37 | 38 | #[cfg(test)] 39 | pub mod tests { 40 | 41 | use super::*; 42 | use crate::core::{SegmentId, SegmentMeta}; 43 | 44 | /// `MergePolicy` useful for test purposes. 45 | /// 46 | /// Everytime there is more than one segment, 47 | /// it will suggest to merge them. 48 | #[derive(Debug, Clone)] 49 | pub struct MergeWheneverPossible; 50 | 51 | impl MergePolicy for MergeWheneverPossible { 52 | fn compute_merge_candidates(&self, segment_metas: &[SegmentMeta]) -> Vec { 53 | let segment_ids = segment_metas 54 | .iter() 55 | .map(|segment_meta| segment_meta.id()) 56 | .collect::>(); 57 | if segment_ids.len() > 1 { 58 | vec![MergeCandidate(segment_ids)] 59 | } else { 60 | vec![] 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/indexer/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod delete_queue; 2 | 3 | pub mod demuxer; 4 | pub mod doc_id_mapping; 5 | mod doc_opstamp_mapping; 6 | pub mod index_writer; 7 | mod index_writer_status; 8 | mod json_term_writer; 9 | mod log_merge_policy; 10 | mod merge_operation; 11 | pub mod merge_policy; 12 | pub mod merger; 13 | mod merger_sorted_index_test; 14 | pub mod operation; 15 | pub mod prepared_commit; 16 | mod segment_entry; 17 | mod segment_manager; 18 | mod segment_register; 19 | pub mod segment_serializer; 20 | pub mod segment_updater; 21 | mod segment_writer; 22 | mod stamper; 23 | 24 | use crossbeam::channel; 25 | use smallvec::SmallVec; 26 | 27 | pub use self::index_writer::IndexWriter; 28 | pub(crate) use self::json_term_writer::JsonTermWriter; 29 | pub use self::log_merge_policy::LogMergePolicy; 30 | pub use self::merge_operation::MergeOperation; 31 | pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; 32 | pub use self::prepared_commit::PreparedCommit; 33 | pub use self::segment_entry::SegmentEntry; 34 | pub use self::segment_manager::SegmentManager; 35 | pub use self::segment_serializer::SegmentSerializer; 36 | pub use self::segment_updater::{merge_filtered_segments, merge_indices}; 37 | pub use self::segment_writer::SegmentWriter; 38 
| use crate::indexer::operation::AddOperation; 39 | 40 | /// Alias for the default merge policy, which is the `LogMergePolicy`. 41 | pub type DefaultMergePolicy = LogMergePolicy; 42 | 43 | // Batch of documents. 44 | // Most of the time, users will send operation one-by-one, but it can be useful to 45 | // send them as a small block to ensure that 46 | // - all docs in the operation will happen on the same segment and continuous doc_ids. 47 | // - all operations in the group are committed at the same time, making the group 48 | // atomic. 49 | type AddBatch = SmallVec<[AddOperation; 4]>; 50 | type AddBatchSender = channel::Sender; 51 | type AddBatchReceiver = channel::Receiver; 52 | 53 | #[cfg(feature = "mmap")] 54 | #[cfg(test)] 55 | mod tests_mmap { 56 | use crate::schema::{self, Schema}; 57 | use crate::{Index, Term}; 58 | 59 | #[test] 60 | fn test_advance_delete_bug() -> crate::Result<()> { 61 | let mut schema_builder = Schema::builder(); 62 | let text_field = schema_builder.add_text_field("text", schema::TEXT); 63 | let index = Index::create_from_tempdir(schema_builder.build())?; 64 | let mut index_writer = index.writer_for_tests()?; 65 | // there must be one deleted document in the segment 66 | index_writer.add_document(doc!(text_field=>"b"))?; 67 | index_writer.delete_term(Term::from_field_text(text_field, "b")); 68 | // we need enough data to trigger the bug (at least 32 documents) 69 | for _ in 0..32 { 70 | index_writer.add_document(doc!(text_field=>"c"))?; 71 | } 72 | index_writer.commit()?; 73 | index_writer.commit()?; 74 | Ok(()) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/indexer/operation.rs: -------------------------------------------------------------------------------- 1 | use crate::schema::{Document, Term}; 2 | use crate::Opstamp; 3 | 4 | /// Timestamped Delete operation. 5 | #[derive(Clone, Eq, PartialEq, Debug)] 6 | pub struct DeleteOperation { 7 | pub opstamp: Opstamp, 8 | pub term: Term, 9 | } 10 | 11 | impl Default for DeleteOperation { 12 | fn default() -> Self { 13 | DeleteOperation { 14 | opstamp: 0u64, 15 | term: Term::new(), 16 | } 17 | } 18 | } 19 | 20 | /// Timestamped Add operation. 21 | #[derive(Eq, PartialEq, Debug)] 22 | pub struct AddOperation { 23 | pub opstamp: Opstamp, 24 | pub document: Document, 25 | } 26 | 27 | /// UserOperation is an enum type that encapsulates other operation types. 28 | #[derive(Eq, PartialEq, Debug)] 29 | pub enum UserOperation { 30 | /// Add operation 31 | Add(Document), 32 | /// Delete operation 33 | Delete(Term), 34 | } 35 | -------------------------------------------------------------------------------- /src/indexer/prepared_commit.rs: -------------------------------------------------------------------------------- 1 | use super::IndexWriter; 2 | use crate::{FutureResult, Opstamp}; 3 | 4 | /// A prepared commit 5 | pub struct PreparedCommit<'a> { 6 | index_writer: &'a mut IndexWriter, 7 | payload: Option, 8 | opstamp: Opstamp, 9 | } 10 | 11 | impl<'a> PreparedCommit<'a> { 12 | pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> { 13 | PreparedCommit { 14 | index_writer, 15 | payload: None, 16 | opstamp, 17 | } 18 | } 19 | 20 | /// Returns the opstamp associated to the prepared commit. 21 | pub fn opstamp(&self) -> Opstamp { 22 | self.opstamp 23 | } 24 | 25 | /// Adds an arbitrary payload to the commit. 
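    ///
    /// The payload is handed to the segment updater together with the commit
    /// opstamp (see `commit_future` below). A rough sketch of the intended flow,
    /// assuming the `IndexWriter::prepare_commit()` entry point (not shown in
    /// this file):
    ///
    /// ```rust,ignore
    /// let mut prepared = index_writer.prepare_commit()?;
    /// prepared.set_payload("checkpoint-42");
    /// prepared.commit()?;
    /// ```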
26 | pub fn set_payload(&mut self, payload: &str) { 27 | self.payload = Some(payload.to_string()) 28 | } 29 | 30 | /// Rollbacks any change. 31 | pub fn abort(self) -> crate::Result { 32 | self.index_writer.rollback() 33 | } 34 | 35 | /// Proceeds to commit. 36 | /// See `.commit_future()`. 37 | pub fn commit(self) -> crate::Result { 38 | self.commit_future().wait() 39 | } 40 | 41 | /// Proceeds to commit. 42 | /// 43 | /// Unfortunately, contrary to what `PrepareCommit` may suggests, 44 | /// this operation is not at all really light. 45 | /// At this point deletes have not been flushed yet. 46 | pub fn commit_future(self) -> FutureResult { 47 | info!("committing {}", self.opstamp); 48 | self.index_writer 49 | .segment_updater() 50 | .schedule_commit(self.opstamp, self.payload) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/indexer/segment_entry.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use common::BitSet; 4 | 5 | use crate::core::{SegmentId, SegmentMeta}; 6 | use crate::indexer::delete_queue::DeleteCursor; 7 | 8 | /// A segment entry describes the state of 9 | /// a given segment, at a given instant. 10 | /// 11 | /// In addition to segment `meta`, 12 | /// it contains a few transient states 13 | /// - `alive_bitset` is a bitset describing 14 | /// documents that were alive during the commit 15 | /// itself. 16 | /// - `delete_cursor` is the position in the delete queue. 17 | /// Deletes happening before the cursor are reflected either 18 | /// in the .del file or in the `alive_bitset`. 19 | #[derive(Clone)] 20 | pub struct SegmentEntry { 21 | meta: SegmentMeta, 22 | alive_bitset: Option, 23 | delete_cursor: DeleteCursor, 24 | } 25 | 26 | impl SegmentEntry { 27 | /// Create a new `SegmentEntry` 28 | pub fn new( 29 | segment_meta: SegmentMeta, 30 | delete_cursor: DeleteCursor, 31 | alive_bitset: Option, 32 | ) -> SegmentEntry { 33 | SegmentEntry { 34 | meta: segment_meta, 35 | alive_bitset, 36 | delete_cursor, 37 | } 38 | } 39 | 40 | /// Return a reference to the segment entry deleted bitset. 41 | /// 42 | /// `DocId` in this bitset are flagged as deleted. 43 | pub fn alive_bitset(&self) -> Option<&BitSet> { 44 | self.alive_bitset.as_ref() 45 | } 46 | 47 | /// Set the `SegmentMeta` for this segment. 48 | pub fn set_meta(&mut self, segment_meta: SegmentMeta) { 49 | self.meta = segment_meta; 50 | } 51 | 52 | /// Return a reference to the segment_entry's delete cursor 53 | pub fn delete_cursor(&mut self) -> &mut DeleteCursor { 54 | &mut self.delete_cursor 55 | } 56 | 57 | /// Returns the segment id. 
58 | pub fn segment_id(&self) -> SegmentId { 59 | self.meta.id() 60 | } 61 | 62 | /// Accessor to the `SegmentMeta` 63 | pub fn meta(&self) -> &SegmentMeta { 64 | &self.meta 65 | } 66 | } 67 | 68 | impl fmt::Debug for SegmentEntry { 69 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { 70 | write!(formatter, "SegmentEntry({:?})", self.meta) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/indexer/segment_serializer.rs: -------------------------------------------------------------------------------- 1 | use crate::core::{Segment, SegmentComponent}; 2 | use crate::fastfield::CompositeFastFieldSerializer; 3 | use crate::fieldnorm::FieldNormsSerializer; 4 | use crate::postings::InvertedIndexSerializer; 5 | use crate::store::StoreWriter; 6 | 7 | /// Segment serializer is in charge of laying out on disk 8 | /// the data accumulated and sorted by the `SegmentWriter`. 9 | pub struct SegmentSerializer { 10 | segment: Segment, 11 | pub(crate) store_writer: StoreWriter, 12 | fast_field_serializer: CompositeFastFieldSerializer, 13 | fieldnorms_serializer: Option, 14 | postings_serializer: InvertedIndexSerializer, 15 | } 16 | 17 | impl SegmentSerializer { 18 | /// Creates a new `SegmentSerializer`. 19 | pub fn for_segment( 20 | mut segment: Segment, 21 | is_in_merge: bool, 22 | ) -> crate::Result { 23 | // If the segment is going to be sorted, we stream the docs first to a temporary file. 24 | // In the merge case this is not necessary because we can kmerge the already sorted 25 | // segments 26 | let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge; 27 | let store_component = if remapping_required { 28 | SegmentComponent::TempStore 29 | } else { 30 | SegmentComponent::Store 31 | }; 32 | let store_write = segment.open_write(store_component)?; 33 | 34 | let fast_field_write = segment.open_write(SegmentComponent::FastFields)?; 35 | let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?; 36 | 37 | let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?; 38 | let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; 39 | 40 | let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; 41 | let compressor = segment.index().settings().docstore_compression; 42 | Ok(SegmentSerializer { 43 | segment, 44 | store_writer: StoreWriter::new(store_write, compressor), 45 | fast_field_serializer, 46 | fieldnorms_serializer: Some(fieldnorms_serializer), 47 | postings_serializer, 48 | }) 49 | } 50 | 51 | /// The memory used (inclusive childs) 52 | pub fn mem_usage(&self) -> usize { 53 | self.store_writer.mem_usage() 54 | } 55 | 56 | pub fn segment(&self) -> &Segment { 57 | &self.segment 58 | } 59 | 60 | pub fn segment_mut(&mut self) -> &mut Segment { 61 | &mut self.segment 62 | } 63 | 64 | /// Accessor to the `PostingsSerializer`. 65 | pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { 66 | &mut self.postings_serializer 67 | } 68 | 69 | /// Accessor to the `FastFieldSerializer`. 70 | pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer { 71 | &mut self.fast_field_serializer 72 | } 73 | 74 | /// Extract the field norm serializer. 75 | /// 76 | /// Note the fieldnorms serializer can only be extracted once. 77 | pub fn extract_fieldnorms_serializer(&mut self) -> Option { 78 | self.fieldnorms_serializer.take() 79 | } 80 | 81 | /// Accessor to the `StoreWriter`. 
82 | pub fn get_store_writer(&mut self) -> &mut StoreWriter { 83 | &mut self.store_writer 84 | } 85 | 86 | /// Finalize the segment serialization. 87 | pub fn close(mut self) -> crate::Result<()> { 88 | if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() { 89 | fieldnorms_serializer.close()?; 90 | } 91 | self.fast_field_serializer.close()?; 92 | self.postings_serializer.close()?; 93 | self.store_writer.close()?; 94 | Ok(()) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | /// `doc!` is a shortcut that helps building `Document` 2 | /// objects. 3 | /// 4 | /// Assuming that `field1` and `field2` are `Field` instances. 5 | /// You can create a document with a value of `value1` for `field1` 6 | /// `value2` for `field2`, as follows : 7 | /// 8 | /// ```c 9 | /// doc!( 10 | /// field1 => value1, 11 | /// field2 => value2, 12 | /// ) 13 | /// ``` 14 | /// 15 | /// The value can be a `u64`, a `&str`, a `i64`, or a `String`. 16 | /// 17 | /// # Warning 18 | /// 19 | /// The document hence created, is not yet validated against a schema. 20 | /// Nothing prevents its user from creating an invalid document missing a 21 | /// field, or associating a `String` to a `u64` field for instance. 22 | /// 23 | /// # Example 24 | /// 25 | /// ```rust 26 | /// use tantivy::schema::{Schema, TEXT, FAST}; 27 | /// use tantivy::doc; 28 | /// 29 | /// //... 30 | /// 31 | /// # fn main() { 32 | /// let mut schema_builder = Schema::builder(); 33 | /// let title = schema_builder.add_text_field("title", TEXT); 34 | /// let author = schema_builder.add_text_field("text", TEXT); 35 | /// let likes = schema_builder.add_u64_field("num_u64", FAST); 36 | /// let schema = schema_builder.build(); 37 | /// let doc = doc!( 38 | /// title => "Life Aquatic", 39 | /// author => "Wes Anderson", 40 | /// likes => 4u64 41 | /// ); 42 | /// # } 43 | /// ``` 44 | #[macro_export] 45 | macro_rules! doc( 46 | () => { 47 | { 48 | ($crate::Document::default()) 49 | } 50 | }; // avoids a warning due to the useless `mut`. 51 | ($($field:expr => $value:expr),*) => { 52 | { 53 | let mut document = $crate::Document::default(); 54 | $( 55 | document.add_field_value($field, $value); 56 | )* 57 | document 58 | } 59 | }; 60 | // if there is a trailing comma retry with the trailing comma stripped. 
61 | ($($field:expr => $value:expr),+ ,) => { 62 | doc!( $( $field => $value ), *) 63 | }; 64 | ); 65 | 66 | #[cfg(test)] 67 | mod test { 68 | use crate::schema::{Schema, FAST, TEXT}; 69 | 70 | #[test] 71 | fn test_doc_basic() { 72 | let mut schema_builder = Schema::builder(); 73 | let title = schema_builder.add_text_field("title", TEXT); 74 | let author = schema_builder.add_text_field("text", TEXT); 75 | let likes = schema_builder.add_u64_field("num_u64", FAST); 76 | let _schema = schema_builder.build(); 77 | let _doc = doc!( 78 | title => "Life Aquatic", 79 | author => "Wes Anderson", 80 | likes => 4u64 81 | ); 82 | } 83 | 84 | #[test] 85 | fn test_doc_trailing_comma() { 86 | let mut schema_builder = Schema::builder(); 87 | let title = schema_builder.add_text_field("title", TEXT); 88 | let author = schema_builder.add_text_field("text", TEXT); 89 | let likes = schema_builder.add_u64_field("num_u64", FAST); 90 | let _schema = schema_builder.build(); 91 | let _doc = doc!( 92 | title => "Life Aquatic", 93 | author => "Wes Anderson", 94 | likes => 4u64, 95 | ); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/positions/serializer.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 2 | 3 | use common::{BinarySerializable, CountingWriter, VInt}; 4 | 5 | use crate::positions::COMPRESSION_BLOCK_SIZE; 6 | use crate::postings::compression::{BlockEncoder, VIntEncoder}; 7 | 8 | /// The PositionSerializer is in charge of serializing all of the positions 9 | /// of all of the terms of a given field. 10 | /// 11 | /// It is valid to call write_position_delta more than once per term. 12 | pub struct PositionSerializer { 13 | block_encoder: BlockEncoder, 14 | positions_wrt: CountingWriter, 15 | positions_buffer: Vec, 16 | block: Vec, 17 | bit_widths: Vec, 18 | } 19 | 20 | impl PositionSerializer { 21 | /// Creates a new PositionSerializer writing into the given positions_wrt. 22 | pub fn new(positions_wrt: W) -> PositionSerializer { 23 | PositionSerializer { 24 | block_encoder: BlockEncoder::new(), 25 | positions_wrt: CountingWriter::wrap(positions_wrt), 26 | positions_buffer: Vec::with_capacity(128_000), 27 | block: Vec::with_capacity(128), 28 | bit_widths: Vec::new(), 29 | } 30 | } 31 | 32 | /// Returns the number of bytes written in the positions write object 33 | /// at this point. 34 | /// When called before writing the positions of a term, this value is used as 35 | /// start offset. 36 | /// When called after writing the positions of a term, this value is used as a 37 | /// end offset. 38 | pub fn written_bytes(&self) -> u64 { 39 | self.positions_wrt.written_bytes() 40 | } 41 | 42 | fn remaining_block_len(&self) -> usize { 43 | COMPRESSION_BLOCK_SIZE - self.block.len() 44 | } 45 | 46 | /// Writes all of the given positions delta. 
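///
/// A minimal usage sketch, assuming a `Vec<u8>` as the underlying writer (the values
/// passed in are deltas between consecutive positions, not absolute positions):
/// ```ignore
/// let mut serializer = PositionSerializer::new(Vec::new());
/// serializer.write_positions_delta(&[0, 3, 1, 4]); // encodes positions 0, 3, 4, 8
/// serializer.close_term().unwrap();
/// ```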
47 | pub fn write_positions_delta(&mut self, mut positions_delta: &[u32]) { 48 | while !positions_delta.is_empty() { 49 | let remaining_block_len = self.remaining_block_len(); 50 | let num_to_write = remaining_block_len.min(positions_delta.len()); 51 | self.block.extend(&positions_delta[..num_to_write]); 52 | positions_delta = &positions_delta[num_to_write..]; 53 | if self.remaining_block_len() == 0 { 54 | self.flush_block(); 55 | } 56 | } 57 | } 58 | 59 | fn flush_block(&mut self) { 60 | // encode the positions in the block 61 | if self.block.is_empty() { 62 | return; 63 | } 64 | if self.block.len() == COMPRESSION_BLOCK_SIZE { 65 | let (bit_width, block_encoded): (u8, &[u8]) = 66 | self.block_encoder.compress_block_unsorted(&self.block[..]); 67 | self.bit_widths.push(bit_width); 68 | self.positions_buffer.extend(block_encoded); 69 | } else { 70 | debug_assert!(self.block.len() < COMPRESSION_BLOCK_SIZE); 71 | let block_vint_encoded = self.block_encoder.compress_vint_unsorted(&self.block[..]); 72 | self.positions_buffer.extend_from_slice(block_vint_encoded); 73 | } 74 | self.block.clear(); 75 | } 76 | 77 | /// Close the positions for the given term. 78 | pub fn close_term(&mut self) -> io::Result<()> { 79 | self.flush_block(); 80 | VInt(self.bit_widths.len() as u64).serialize(&mut self.positions_wrt)?; 81 | self.positions_wrt.write_all(&self.bit_widths[..])?; 82 | self.positions_wrt.write_all(&self.positions_buffer)?; 83 | self.bit_widths.clear(); 84 | self.positions_buffer.clear(); 85 | Ok(()) 86 | } 87 | 88 | /// Close the positions for this term and flushes the data. 89 | pub fn close(mut self) -> io::Result<()> { 90 | self.positions_wrt.flush() 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/postings/block_search.rs: -------------------------------------------------------------------------------- 1 | use crate::postings::compression::COMPRESSION_BLOCK_SIZE; 2 | 3 | /// Search the first index containing an element greater or equal to 4 | /// the target. 5 | /// 6 | /// The results should be equivalent to 7 | /// ```compile_fail 8 | /// block[..] 9 | // .iter() 10 | // .take_while(|&&val| val < target) 11 | // .count() 12 | /// ``` 13 | /// 14 | /// the `start` argument is just used to hint that the response is 15 | /// greater than beyond `start`. the implementation may or may not use 16 | /// it for optimization. 17 | /// 18 | /// # Assumption 19 | /// 20 | /// - The block is sorted. Some elements may appear several times. This is the case at the 21 | /// end of the last block for instance. 22 | /// - The target is assumed smaller or equal to the last element of the block. 
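///
/// Seven halving steps are enough because `2^7 = COMPRESSION_BLOCK_SIZE = 128`.
/// A small sketch of the expected result (hypothetical block, mirroring the tests below):
/// ```ignore
/// // A sorted block of 128 values: 0, 2, 4, ...
/// let block: [u32; COMPRESSION_BLOCK_SIZE] = core::array::from_fn(|i| i as u32 * 2);
/// // The first index holding a value >= 5 is 3 (block[3] == 6).
/// assert_eq!(branchless_binary_search(&block, 5), 3);
/// ```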
23 | pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize { 24 | let mut start = 0; 25 | let mut len = arr.len(); 26 | for _ in 0..7 { 27 | len /= 2; 28 | let pivot = unsafe { *arr.get_unchecked(start + len - 1) }; 29 | if pivot < target { 30 | start += len; 31 | } 32 | } 33 | start 34 | } 35 | 36 | #[cfg(test)] 37 | mod tests { 38 | use std::collections::HashSet; 39 | 40 | use proptest::prelude::*; 41 | 42 | use super::branchless_binary_search; 43 | use crate::docset::TERMINATED; 44 | use crate::postings::compression::COMPRESSION_BLOCK_SIZE; 45 | 46 | fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize { 47 | block.iter().take_while(|&&val| val < target).count() 48 | } 49 | 50 | fn util_test_search_in_block(block: &[u32], target: u32) { 51 | let cursor = search_in_block_trivial_but_slow(block, target); 52 | assert!(cursor < COMPRESSION_BLOCK_SIZE); 53 | assert!(block[cursor] >= target); 54 | if cursor > 0 { 55 | assert!(block[cursor - 1] < target); 56 | } 57 | assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE); 58 | let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE]; 59 | output_buffer[..block.len()].copy_from_slice(block); 60 | assert_eq!(branchless_binary_search(&output_buffer, target), cursor); 61 | } 62 | 63 | fn util_test_search_in_block_all(block: &[u32]) { 64 | let mut targets = HashSet::new(); 65 | targets.insert(0); 66 | for &val in block { 67 | if val > 0 { 68 | targets.insert(val - 1); 69 | } 70 | targets.insert(val); 71 | } 72 | for target in targets { 73 | util_test_search_in_block(block, target); 74 | } 75 | } 76 | 77 | #[test] 78 | fn test_search_in_branchless_binary_search() { 79 | let v: Vec = (0..COMPRESSION_BLOCK_SIZE).map(|i| i as u32 * 2).collect(); 80 | util_test_search_in_block_all(&v[..]); 81 | } 82 | 83 | fn monotonous_block() -> impl Strategy> { 84 | prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| { 85 | let mut el = 0; 86 | for i in 0..COMPRESSION_BLOCK_SIZE { 87 | el += deltas[i]; 88 | deltas[i] = el; 89 | } 90 | deltas 91 | }) 92 | } 93 | 94 | proptest! 
{ 95 | #[test] 96 | fn test_proptest_branchless_binary_search(block in monotonous_block()) { 97 | util_test_search_in_block_all(&block[..]); 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/postings/compression/vint.rs: -------------------------------------------------------------------------------- 1 | #[inline] 2 | pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { 3 | let mut byte_written = 0; 4 | for &v in input { 5 | let mut to_encode: u32 = v - offset; 6 | offset = v; 7 | loop { 8 | let next_byte: u8 = (to_encode % 128u32) as u8; 9 | to_encode /= 128u32; 10 | if to_encode == 0u32 { 11 | output[byte_written] = next_byte | 128u8; 12 | byte_written += 1; 13 | break; 14 | } else { 15 | output[byte_written] = next_byte; 16 | byte_written += 1; 17 | } 18 | } 19 | } 20 | &output[..byte_written] 21 | } 22 | 23 | #[inline] 24 | pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { 25 | let mut byte_written = 0; 26 | for &v in input { 27 | let mut to_encode: u32 = v; 28 | loop { 29 | let next_byte: u8 = (to_encode % 128u32) as u8; 30 | to_encode /= 128u32; 31 | if to_encode == 0u32 { 32 | output[byte_written] = next_byte | 128u8; 33 | byte_written += 1; 34 | break; 35 | } else { 36 | output[byte_written] = next_byte; 37 | byte_written += 1; 38 | } 39 | } 40 | } 41 | &output[..byte_written] 42 | } 43 | 44 | #[inline] 45 | pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize { 46 | let mut read_byte = 0; 47 | let mut result = offset; 48 | for output_mut in output.iter_mut() { 49 | let mut shift = 0u32; 50 | loop { 51 | let cur_byte = compressed_data[read_byte]; 52 | read_byte += 1; 53 | result += u32::from(cur_byte % 128u8) << shift; 54 | if cur_byte & 128u8 != 0u8 { 55 | break; 56 | } 57 | shift += 7; 58 | } 59 | *output_mut = result; 60 | } 61 | read_byte 62 | } 63 | 64 | #[inline] 65 | pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize { 66 | let mut num_read_bytes = 0; 67 | for output_mut in output_arr.iter_mut() { 68 | let mut result = 0u32; 69 | let mut shift = 0u32; 70 | loop { 71 | let cur_byte = compressed_data[num_read_bytes]; 72 | num_read_bytes += 1; 73 | result += u32::from(cur_byte % 128u8) << shift; 74 | if cur_byte & 128u8 != 0u8 { 75 | break; 76 | } 77 | shift += 7; 78 | } 79 | *output_mut = result; 80 | } 81 | num_read_bytes 82 | } 83 | 84 | #[inline] 85 | pub(crate) fn uncompress_unsorted_until_end( 86 | compressed_data: &[u8], 87 | output_arr: &mut [u32], 88 | ) -> usize { 89 | let mut num_read_bytes = 0; 90 | for (num_ints_written, output_mut) in output_arr.iter_mut().enumerate() { 91 | if compressed_data.len() == num_read_bytes { 92 | return num_ints_written; 93 | } 94 | let mut result = 0u32; 95 | let mut shift = 0u32; 96 | loop { 97 | let cur_byte = compressed_data[num_read_bytes]; 98 | num_read_bytes += 1; 99 | result += u32::from(cur_byte % 128u8) << shift; 100 | if cur_byte & 128u8 != 0u8 { 101 | break; 102 | } 103 | shift += 7; 104 | } 105 | *output_mut = result; 106 | } 107 | output_arr.len() 108 | } 109 | -------------------------------------------------------------------------------- /src/postings/indexing_context.rs: -------------------------------------------------------------------------------- 1 | use crate::postings::stacker::{MemoryArena, TermHashMap}; 2 | 3 | /// IndexingContext contains all of the transient memory arenas 4 | /// required for 
building the inverted index. 5 | pub(crate) struct IndexingContext { 6 | /// The term index is an adhoc hashmap, 7 | /// itself backed by a dedicated memory arena. 8 | pub term_index: TermHashMap, 9 | /// Arena is a memory arena that stores posting lists / term frequencies / positions. 10 | pub arena: MemoryArena, 11 | } 12 | 13 | impl IndexingContext { 14 | /// Create a new IndexingContext given the size of the term hash map. 15 | pub(crate) fn new(table_size: usize) -> IndexingContext { 16 | let term_index = TermHashMap::new(table_size); 17 | IndexingContext { 18 | arena: MemoryArena::new(), 19 | term_index, 20 | } 21 | } 22 | 23 | /// Returns the memory usage for the inverted index memory arenas, in bytes. 24 | pub(crate) fn mem_usage(&self) -> usize { 25 | self.term_index.mem_usage() + self.arena.mem_usage() 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/postings/json_postings_writer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use crate::indexer::doc_id_mapping::DocIdMapping; 4 | use crate::postings::postings_writer::SpecializedPostingsWriter; 5 | use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder}; 6 | use crate::postings::stacker::Addr; 7 | use crate::postings::{ 8 | FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId, 9 | }; 10 | use crate::schema::term::as_json_path_type_value_bytes; 11 | use crate::schema::Type; 12 | use crate::tokenizer::TokenStream; 13 | use crate::{DocId, Term}; 14 | 15 | #[derive(Default)] 16 | pub(crate) struct JsonPostingsWriter { 17 | str_posting_writer: SpecializedPostingsWriter, 18 | non_str_posting_writer: SpecializedPostingsWriter, 19 | } 20 | 21 | impl From> for Box { 22 | fn from(json_postings_writer: JsonPostingsWriter) -> Box { 23 | Box::new(json_postings_writer) 24 | } 25 | } 26 | 27 | impl PostingsWriter for JsonPostingsWriter { 28 | fn subscribe( 29 | &mut self, 30 | doc: crate::DocId, 31 | pos: u32, 32 | term: &crate::Term, 33 | ctx: &mut IndexingContext, 34 | ) -> UnorderedTermId { 35 | self.non_str_posting_writer.subscribe(doc, pos, term, ctx) 36 | } 37 | 38 | fn index_text( 39 | &mut self, 40 | doc_id: DocId, 41 | token_stream: &mut dyn TokenStream, 42 | term_buffer: &mut Term, 43 | ctx: &mut IndexingContext, 44 | indexing_position: &mut IndexingPosition, 45 | ) { 46 | self.str_posting_writer.index_text( 47 | doc_id, 48 | token_stream, 49 | term_buffer, 50 | ctx, 51 | indexing_position, 52 | ); 53 | } 54 | 55 | /// The actual serialization format is handled by the `PostingsSerializer`. 56 | fn serialize( 57 | &self, 58 | term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], 59 | doc_id_map: Option<&DocIdMapping>, 60 | ctx: &IndexingContext, 61 | serializer: &mut FieldSerializer, 62 | ) -> io::Result<()> { 63 | let mut buffer_lender = BufferLender::default(); 64 | for (term, addr, _) in term_addrs { 65 | // TODO optimization opportunity here. 
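// Assumption based on the helper's name: `as_json_path_type_value_bytes` is expected to
// split a JSON term into (path, value type, value bytes); only the type is needed here to
// route the term to the recorder that was actually used while indexing it.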
66 | if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) { 67 | if typ == Type::Str { 68 | SpecializedPostingsWriter::::serialize_one_term( 69 | term, 70 | *addr, 71 | doc_id_map, 72 | &mut buffer_lender, 73 | ctx, 74 | serializer, 75 | )?; 76 | } else { 77 | SpecializedPostingsWriter::::serialize_one_term( 78 | term, 79 | *addr, 80 | doc_id_map, 81 | &mut buffer_lender, 82 | ctx, 83 | serializer, 84 | )?; 85 | } 86 | } 87 | } 88 | Ok(()) 89 | } 90 | 91 | fn total_num_tokens(&self) -> u64 { 92 | self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens() 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/postings/per_field_postings_writer.rs: -------------------------------------------------------------------------------- 1 | use crate::postings::json_postings_writer::JsonPostingsWriter; 2 | use crate::postings::postings_writer::SpecializedPostingsWriter; 3 | use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder}; 4 | use crate::postings::PostingsWriter; 5 | use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema}; 6 | 7 | pub(crate) struct PerFieldPostingsWriter { 8 | per_field_postings_writers: Vec>, 9 | } 10 | 11 | impl PerFieldPostingsWriter { 12 | pub fn for_schema(schema: &Schema) -> Self { 13 | let per_field_postings_writers = schema 14 | .fields() 15 | .map(|(_, field_entry)| posting_writer_from_field_entry(field_entry)) 16 | .collect(); 17 | PerFieldPostingsWriter { 18 | per_field_postings_writers, 19 | } 20 | } 21 | 22 | pub(crate) fn get_for_field(&self, field: Field) -> &dyn PostingsWriter { 23 | self.per_field_postings_writers[field.field_id() as usize].as_ref() 24 | } 25 | 26 | pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter { 27 | self.per_field_postings_writers[field.field_id() as usize].as_mut() 28 | } 29 | } 30 | 31 | fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box { 32 | match *field_entry.field_type() { 33 | FieldType::Str(ref text_options) => text_options 34 | .get_indexing_options() 35 | .map(|indexing_options| match indexing_options.index_option() { 36 | IndexRecordOption::Basic => { 37 | SpecializedPostingsWriter::::default().into() 38 | } 39 | IndexRecordOption::WithFreqs => { 40 | SpecializedPostingsWriter::::default().into() 41 | } 42 | IndexRecordOption::WithFreqsAndPositions => { 43 | SpecializedPostingsWriter::::default().into() 44 | } 45 | }) 46 | .unwrap_or_else(|| SpecializedPostingsWriter::::default().into()), 47 | FieldType::U64(_) 48 | | FieldType::I64(_) 49 | | FieldType::F64(_) 50 | | FieldType::Date(_) 51 | | FieldType::Bytes(_) 52 | | FieldType::Facet(_) => Box::new(SpecializedPostingsWriter::::default()), 53 | FieldType::JsonObject(ref json_object_options) => { 54 | if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { 55 | match text_indexing_option.index_option() { 56 | IndexRecordOption::Basic => { 57 | JsonPostingsWriter::::default().into() 58 | } 59 | IndexRecordOption::WithFreqs => { 60 | JsonPostingsWriter::::default().into() 61 | } 62 | IndexRecordOption::WithFreqsAndPositions => { 63 | JsonPostingsWriter::::default().into() 64 | } 65 | } 66 | } else { 67 | JsonPostingsWriter::::default().into() 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/postings/postings.rs: 
-------------------------------------------------------------------------------- 1 | use crate::docset::DocSet; 2 | 3 | /// Postings (also called an inverted list). 4 | /// 5 | /// For a given term, it is the list of doc ids of the documents 6 | /// containing the term. Optionally, for each document, 7 | /// it may also give access to the term frequency 8 | /// as well as the list of term positions. 9 | /// 10 | /// Its main implementation is `SegmentPostings`, 11 | /// but other implementations mocking `SegmentPostings` exist, 12 | /// for merging segments or for testing. 13 | pub trait Postings: DocSet + 'static { 14 | /// The number of times the term appears in the document. 15 | fn term_freq(&self) -> u32; 16 | 17 | /// Returns the positions offset by a given value. 18 | /// The output vector will be resized to the `term_freq`. 19 | fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>); 20 | 21 | /// Returns the positions of the term in the given document. 22 | /// The output vector will be resized to the `term_freq`. 23 | fn positions(&mut self, output: &mut Vec<u32>) { 24 | self.positions_with_offset(0u32, output); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/postings/stacker/mod.rs: -------------------------------------------------------------------------------- 1 | mod expull; 2 | mod memory_arena; 3 | mod term_hashmap; 4 | 5 | pub(crate) use self::expull::ExpUnrolledLinkedList; 6 | pub(crate) use self::memory_arena::{Addr, MemoryArena}; 7 | pub(crate) use self::term_hashmap::{compute_table_size, TermHashMap}; 8 | -------------------------------------------------------------------------------- /src/postings/term_info.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::iter::ExactSizeIterator; 3 | use std::ops::Range; 4 | 5 | use common::{BinarySerializable, FixedSize}; 6 | 7 | /// `TermInfo` wraps the metadata associated with a Term. 8 | /// It is segment-local. 9 | #[derive(Debug, Default, Eq, PartialEq, Clone)] 10 | pub struct TermInfo { 11 | /// Number of documents in the segment containing the term. 12 | pub doc_freq: u32, 13 | /// Byte range of the posting list within the postings (`.idx`) file. 14 | pub postings_range: Range<usize>, 15 | /// Byte range of this term's positions in the positions (`.pos`) file. 16 | pub positions_range: Range<usize>, 17 | } 18 | 19 | impl TermInfo { 20 | pub(crate) fn posting_num_bytes(&self) -> u32 { 21 | let num_bytes = self.postings_range.len(); 22 | assert!(num_bytes <= std::u32::MAX as usize); 23 | num_bytes as u32 24 | } 25 | 26 | pub(crate) fn positions_num_bytes(&self) -> u32 { 27 | let num_bytes = self.positions_range.len(); 28 | assert!(num_bytes <= std::u32::MAX as usize); 29 | num_bytes as u32 30 | } 31 | } 32 | 33 | impl FixedSize for TermInfo { 34 | /// Size required for the binary serialization of a `TermInfo` object. 35 | /// This is large, but in practice, `TermInfo`s are encoded in blocks and 36 | /// only the first `TermInfo` of a block is serialized uncompressed. 37 | /// The subsequent `TermInfo`s are delta-encoded and bitpacked.
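///
/// The layout mirrors `serialize` below: `doc_freq` (u32), postings start offset (u64),
/// postings length (u32), positions start offset (u64) and positions length (u32),
/// i.e. 3 * 4 + 2 * 8 = 28 bytes per uncompressed entry.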
38 | const SIZE_IN_BYTES: usize = 3 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES; 39 | } 40 | 41 | impl BinarySerializable for TermInfo { 42 | fn serialize(&self, writer: &mut W) -> io::Result<()> { 43 | self.doc_freq.serialize(writer)?; 44 | (self.postings_range.start as u64).serialize(writer)?; 45 | self.posting_num_bytes().serialize(writer)?; 46 | (self.positions_range.start as u64).serialize(writer)?; 47 | self.positions_num_bytes().serialize(writer)?; 48 | Ok(()) 49 | } 50 | 51 | fn deserialize(reader: &mut R) -> io::Result { 52 | let doc_freq = u32::deserialize(reader)?; 53 | let postings_start_offset = u64::deserialize(reader)? as usize; 54 | let postings_num_bytes = u32::deserialize(reader)? as usize; 55 | let postings_end_offset = postings_start_offset + postings_num_bytes; 56 | let positions_start_offset = u64::deserialize(reader)? as usize; 57 | let positions_num_bytes = u32::deserialize(reader)? as usize; 58 | let positions_end_offset = positions_start_offset + positions_num_bytes; 59 | Ok(TermInfo { 60 | doc_freq, 61 | postings_range: postings_start_offset..postings_end_offset, 62 | positions_range: positions_start_offset..positions_end_offset, 63 | }) 64 | } 65 | } 66 | 67 | #[cfg(test)] 68 | mod tests { 69 | 70 | use super::TermInfo; 71 | use crate::tests::fixed_size_test; 72 | 73 | // TODO add serialize/deserialize test for terminfo 74 | 75 | #[test] 76 | fn test_fixed_size() { 77 | fixed_size_test::(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/query/empty_query.rs: -------------------------------------------------------------------------------- 1 | use super::Scorer; 2 | use crate::docset::TERMINATED; 3 | use crate::query::explanation::does_not_match; 4 | use crate::query::{Explanation, Query, Weight}; 5 | use crate::{DocId, DocSet, Score, Searcher, SegmentReader}; 6 | 7 | /// `EmptyQuery` is a dummy `Query` in which no document matches. 8 | /// 9 | /// It is useful for tests and handling edge cases. 10 | #[derive(Clone, Debug)] 11 | pub struct EmptyQuery; 12 | 13 | impl Query for EmptyQuery { 14 | fn weight( 15 | &self, 16 | _searcher: &Searcher, 17 | _scoring_enabled: bool, 18 | ) -> crate::Result> { 19 | Ok(Box::new(EmptyWeight)) 20 | } 21 | 22 | fn count(&self, _searcher: &Searcher) -> crate::Result { 23 | Ok(0) 24 | } 25 | } 26 | 27 | /// `EmptyWeight` is a dummy `Weight` in which no document matches. 28 | /// 29 | /// It is useful for tests and handling edge cases. 30 | pub struct EmptyWeight; 31 | impl Weight for EmptyWeight { 32 | fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result> { 33 | Ok(Box::new(EmptyScorer)) 34 | } 35 | 36 | fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result { 37 | Err(does_not_match(doc)) 38 | } 39 | } 40 | 41 | /// `EmptyScorer` is a dummy `Scorer` in which no document matches. 42 | /// 43 | /// It is useful for tests and handling edge cases. 
44 | pub struct EmptyScorer; 45 | 46 | impl DocSet for EmptyScorer { 47 | fn advance(&mut self) -> DocId { 48 | TERMINATED 49 | } 50 | 51 | fn doc(&self) -> DocId { 52 | TERMINATED 53 | } 54 | 55 | fn size_hint(&self) -> u32 { 56 | 0 57 | } 58 | } 59 | 60 | impl Scorer for EmptyScorer { 61 | fn score(&mut self) -> Score { 62 | 0.0 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use crate::docset::TERMINATED; 69 | use crate::query::EmptyScorer; 70 | use crate::DocSet; 71 | 72 | #[test] 73 | fn test_empty_scorer() { 74 | let mut empty_scorer = EmptyScorer; 75 | assert_eq!(empty_scorer.doc(), TERMINATED); 76 | assert_eq!(empty_scorer.advance(), TERMINATED); 77 | assert_eq!(empty_scorer.doc(), TERMINATED); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/query/explanation.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use serde::Serialize; 4 | 5 | use crate::{DocId, Score, TantivyError}; 6 | 7 | pub(crate) fn does_not_match(doc: DocId) -> TantivyError { 8 | TantivyError::InvalidArgument(format!("Document #({}) does not match", doc)) 9 | } 10 | 11 | /// Object describing the score of a given document. 12 | /// It is organized in trees. 13 | /// 14 | /// `.to_pretty_json()` can be useful to print out a human readable 15 | /// representation of this tree when debugging a given score. 16 | #[derive(Clone, Serialize)] 17 | pub struct Explanation { 18 | value: Score, 19 | description: String, 20 | #[serde(skip_serializing_if = "Vec::is_empty")] 21 | details: Vec, 22 | context: Vec, 23 | } 24 | 25 | impl fmt::Debug for Explanation { 26 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 27 | write!(f, "Explanation({})", self.to_pretty_json()) 28 | } 29 | } 30 | 31 | impl Explanation { 32 | /// Creates a new explanation object. 33 | pub fn new(description: T, value: Score) -> Explanation { 34 | Explanation { 35 | value, 36 | description: description.to_string(), 37 | details: vec![], 38 | context: vec![], 39 | } 40 | } 41 | 42 | /// Returns the value associated to the current node. 43 | pub fn value(&self) -> Score { 44 | self.value 45 | } 46 | 47 | /// Add some detail, explaining some part of the current node formula. 48 | /// 49 | /// Details are treated as child of the current node. 50 | pub fn add_detail(&mut self, child_explanation: Explanation) { 51 | self.details.push(child_explanation); 52 | } 53 | 54 | /// Adds some extra context to the explanation. 55 | pub fn add_context(&mut self, context: String) { 56 | self.context.push(context); 57 | } 58 | 59 | /// Shortcut for `self.details.push(Explanation::new(name, value));` 60 | pub fn add_const(&mut self, name: T, value: Score) { 61 | self.details.push(Explanation::new(name, value)); 62 | } 63 | 64 | /// Returns an indented json representation of the explanation tree for debug usage. 
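///
/// A small sketch, with hypothetical values:
/// ```ignore
/// let mut explanation = Explanation::new("TermQuery, product of ...", 2.4);
/// explanation.add_const("idf", 1.2);
/// println!("{}", explanation.to_pretty_json());
/// ```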
65 | pub fn to_pretty_json(&self) -> String { 66 | serde_json::to_string_pretty(self).unwrap() 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/query/more_like_this/mod.rs: -------------------------------------------------------------------------------- 1 | mod more_like_this; 2 | mod query; 3 | 4 | pub use self::more_like_this::MoreLikeThis; 5 | pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder}; 6 | -------------------------------------------------------------------------------- /src/query/query_parser/logical_ast.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::ops::Bound; 3 | 4 | use crate::query::Occur; 5 | use crate::schema::{Field, Term, Type}; 6 | use crate::Score; 7 | 8 | #[derive(Clone)] 9 | pub enum LogicalLiteral { 10 | Term(Term), 11 | Phrase(Vec<(usize, Term)>), 12 | Range { 13 | field: Field, 14 | value_type: Type, 15 | lower: Bound, 16 | upper: Bound, 17 | }, 18 | All, 19 | } 20 | 21 | pub enum LogicalAst { 22 | Clause(Vec<(Occur, LogicalAst)>), 23 | Leaf(Box), 24 | Boost(Box, Score), 25 | } 26 | 27 | impl LogicalAst { 28 | pub fn boost(self, boost: Score) -> LogicalAst { 29 | if (boost - 1.0).abs() < Score::EPSILON { 30 | self 31 | } else { 32 | LogicalAst::Boost(Box::new(self), boost) 33 | } 34 | } 35 | } 36 | 37 | fn occur_letter(occur: Occur) -> &'static str { 38 | match occur { 39 | Occur::Must => "+", 40 | Occur::MustNot => "-", 41 | Occur::Should => "", 42 | } 43 | } 44 | 45 | impl fmt::Debug for LogicalAst { 46 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 47 | match *self { 48 | LogicalAst::Clause(ref clause) => { 49 | if clause.is_empty() { 50 | write!(formatter, "")?; 51 | } else { 52 | let (ref occur, ref subquery) = clause[0]; 53 | write!(formatter, "({}{:?}", occur_letter(*occur), subquery)?; 54 | for &(ref occur, ref subquery) in &clause[1..] { 55 | write!(formatter, " {}{:?}", occur_letter(*occur), subquery)?; 56 | } 57 | formatter.write_str(")")?; 58 | } 59 | Ok(()) 60 | } 61 | LogicalAst::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost), 62 | LogicalAst::Leaf(ref literal) => write!(formatter, "{:?}", literal), 63 | } 64 | } 65 | } 66 | 67 | impl From for LogicalAst { 68 | fn from(literal: LogicalLiteral) -> LogicalAst { 69 | LogicalAst::Leaf(Box::new(literal)) 70 | } 71 | } 72 | 73 | impl fmt::Debug for LogicalLiteral { 74 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 75 | match *self { 76 | LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term), 77 | LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms), 78 | LogicalLiteral::Range { 79 | ref lower, 80 | ref upper, 81 | .. 
82 | } => write!(formatter, "({:?} TO {:?})", lower, upper), 83 | LogicalLiteral::All => write!(formatter, "*"), 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/query/query_parser/mod.rs: -------------------------------------------------------------------------------- 1 | mod query_parser; 2 | 3 | pub mod logical_ast; 4 | pub use self::query_parser::{QueryParser, QueryParserError}; 5 | -------------------------------------------------------------------------------- /src/query/score_combiner.rs: -------------------------------------------------------------------------------- 1 | use crate::query::Scorer; 2 | use crate::Score; 3 | 4 | /// The `ScoreCombiner` trait defines how to compute 5 | /// an overall score given a list of scores. 6 | pub trait ScoreCombiner: Default + Clone + Send + Copy + 'static { 7 | /// Aggregates the score combiner with the given scorer. 8 | /// 9 | /// The `ScoreCombiner` may decide to call `.scorer.score()` 10 | /// or not. 11 | fn update(&mut self, scorer: &mut TScorer); 12 | 13 | /// Clears the score combiner state back to its initial state. 14 | fn clear(&mut self); 15 | 16 | /// Returns the aggregate score. 17 | fn score(&self) -> Score; 18 | } 19 | 20 | /// Just ignores scores. The `DoNothingCombiner` does not 21 | /// even call the scorers `.score()` function. 22 | /// 23 | /// It is useful to optimize the case when scoring is disabled. 24 | #[derive(Default, Clone, Copy)] //< these should not be too much work :) 25 | pub struct DoNothingCombiner; 26 | 27 | impl ScoreCombiner for DoNothingCombiner { 28 | fn update(&mut self, _scorer: &mut TScorer) {} 29 | 30 | fn clear(&mut self) {} 31 | 32 | fn score(&self) -> Score { 33 | 1.0 34 | } 35 | } 36 | 37 | /// Sums the score of different scorers. 38 | #[derive(Default, Clone, Copy)] 39 | pub struct SumCombiner { 40 | score: Score, 41 | } 42 | 43 | impl ScoreCombiner for SumCombiner { 44 | fn update(&mut self, scorer: &mut TScorer) { 45 | self.score += scorer.score(); 46 | } 47 | 48 | fn clear(&mut self) { 49 | self.score = 0.0; 50 | } 51 | 52 | fn score(&self) -> Score { 53 | self.score 54 | } 55 | } 56 | 57 | /// Sums the score of different scorers and keeps the count 58 | /// of scorers which matched. 59 | #[derive(Default, Clone, Copy)] 60 | pub struct SumWithCoordsCombiner { 61 | num_fields: usize, 62 | score: Score, 63 | } 64 | 65 | impl ScoreCombiner for SumWithCoordsCombiner { 66 | fn update(&mut self, scorer: &mut TScorer) { 67 | self.score += scorer.score(); 68 | self.num_fields += 1; 69 | } 70 | 71 | fn clear(&mut self) { 72 | self.score = 0.0; 73 | self.num_fields = 0; 74 | } 75 | 76 | fn score(&self) -> Score { 77 | self.score 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/query/scorer.rs: -------------------------------------------------------------------------------- 1 | use std::ops::DerefMut; 2 | 3 | use downcast_rs::impl_downcast; 4 | 5 | use crate::docset::DocSet; 6 | use crate::{DocId, Score}; 7 | 8 | /// Scored set of documents matching a query within a specific segment. 9 | /// 10 | /// See [`Query`](./trait.Query.html). 11 | pub trait Scorer: downcast_rs::Downcast + DocSet + 'static { 12 | /// Returns the score. 13 | /// 14 | /// This method will perform a bit of computation and is not cached. 
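///
/// Because the value is recomputed on every call, a caller that needs the score several
/// times for the same document should read it once and reuse the local copy, e.g.
/// `let score = scorer.score();`.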
15 | fn score(&mut self) -> Score; 16 | } 17 | 18 | impl_downcast!(Scorer); 19 | 20 | impl Scorer for Box { 21 | fn score(&mut self) -> Score { 22 | self.deref_mut().score() 23 | } 24 | } 25 | 26 | /// Wraps a `DocSet` and simply returns a constant `Scorer`. 27 | /// The `ConstScorer` is useful if you have a `DocSet` where 28 | /// you needed a scorer. 29 | /// 30 | /// The `ConstScorer`'s constant score can be set 31 | /// by calling `.set_score(...)`. 32 | pub struct ConstScorer { 33 | docset: TDocSet, 34 | score: Score, 35 | } 36 | 37 | impl ConstScorer { 38 | /// Creates a new `ConstScorer`. 39 | pub fn new(docset: TDocSet, score: Score) -> ConstScorer { 40 | ConstScorer { docset, score } 41 | } 42 | } 43 | 44 | impl From for ConstScorer { 45 | fn from(docset: TDocSet) -> Self { 46 | ConstScorer::new(docset, 1.0) 47 | } 48 | } 49 | 50 | impl DocSet for ConstScorer { 51 | fn advance(&mut self) -> DocId { 52 | self.docset.advance() 53 | } 54 | 55 | fn seek(&mut self, target: DocId) -> DocId { 56 | self.docset.seek(target) 57 | } 58 | 59 | fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize { 60 | self.docset.fill_buffer(buffer) 61 | } 62 | 63 | fn doc(&self) -> DocId { 64 | self.docset.doc() 65 | } 66 | 67 | fn size_hint(&self) -> u32 { 68 | self.docset.size_hint() 69 | } 70 | } 71 | 72 | impl Scorer for ConstScorer { 73 | fn score(&mut self) -> Score { 74 | self.score 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/query/vec_docset.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | use common::HasLen; 4 | 5 | use crate::docset::{DocSet, TERMINATED}; 6 | use crate::DocId; 7 | 8 | /// Simulate a `Postings` objects from a `VecPostings`. 9 | /// `VecPostings` only exist for testing purposes. 10 | /// 11 | /// Term frequencies always return 1. 12 | /// No positions are returned. 
13 | pub struct VecDocSet { 14 | doc_ids: Vec, 15 | cursor: usize, 16 | } 17 | 18 | impl From> for VecDocSet { 19 | fn from(doc_ids: Vec) -> VecDocSet { 20 | VecDocSet { doc_ids, cursor: 0 } 21 | } 22 | } 23 | 24 | impl DocSet for VecDocSet { 25 | fn advance(&mut self) -> DocId { 26 | self.cursor += 1; 27 | if self.cursor >= self.doc_ids.len() { 28 | self.cursor = self.doc_ids.len(); 29 | return TERMINATED; 30 | } 31 | self.doc() 32 | } 33 | 34 | fn doc(&self) -> DocId { 35 | if self.cursor == self.doc_ids.len() { 36 | return TERMINATED; 37 | } 38 | self.doc_ids[self.cursor] 39 | } 40 | 41 | fn size_hint(&self) -> u32 { 42 | self.len() as u32 43 | } 44 | } 45 | 46 | impl HasLen for VecDocSet { 47 | fn len(&self) -> usize { 48 | self.doc_ids.len() 49 | } 50 | } 51 | 52 | #[cfg(test)] 53 | pub mod tests { 54 | 55 | use super::*; 56 | use crate::docset::DocSet; 57 | use crate::DocId; 58 | 59 | #[test] 60 | pub fn test_vec_postings() { 61 | let doc_ids: Vec = (0u32..1024u32).map(|e| e * 3).collect(); 62 | let mut postings = VecDocSet::from(doc_ids); 63 | assert_eq!(postings.doc(), 0u32); 64 | assert_eq!(postings.advance(), 3u32); 65 | assert_eq!(postings.doc(), 3u32); 66 | assert_eq!(postings.seek(14u32), 15u32); 67 | assert_eq!(postings.doc(), 15u32); 68 | assert_eq!(postings.seek(300u32), 300u32); 69 | assert_eq!(postings.doc(), 300u32); 70 | assert_eq!(postings.seek(6000u32), TERMINATED); 71 | } 72 | 73 | #[test] 74 | pub fn test_fill_buffer() { 75 | let doc_ids: Vec = (1u32..210u32).collect(); 76 | let mut postings = VecDocSet::from(doc_ids); 77 | let mut buffer = vec![1000u32; 100]; 78 | assert_eq!(postings.fill_buffer(&mut buffer[..]), 100); 79 | for i in 0u32..100u32 { 80 | assert_eq!(buffer[i as usize], i + 1); 81 | } 82 | assert_eq!(postings.fill_buffer(&mut buffer[..]), 100); 83 | for i in 0u32..100u32 { 84 | assert_eq!(buffer[i as usize], i + 101); 85 | } 86 | assert_eq!(postings.fill_buffer(&mut buffer[..]), 9); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/query/weight.rs: -------------------------------------------------------------------------------- 1 | use super::Scorer; 2 | use crate::core::SegmentReader; 3 | use crate::query::Explanation; 4 | use crate::{DocId, Score, TERMINATED}; 5 | 6 | /// Iterates through all of the document matched by the DocSet 7 | /// `DocSet` and push the scored documents to the collector. 8 | pub(crate) fn for_each_scorer( 9 | scorer: &mut TScorer, 10 | callback: &mut dyn FnMut(DocId, Score), 11 | ) { 12 | let mut doc = scorer.doc(); 13 | while doc != TERMINATED { 14 | callback(doc, scorer.score()); 15 | doc = scorer.advance(); 16 | } 17 | } 18 | 19 | /// Calls `callback` with all of the `(doc, score)` for which score 20 | /// is exceeding a given threshold. 21 | /// 22 | /// This method is useful for the TopDocs collector. 23 | /// For all docsets, the blanket implementation has the benefit 24 | /// of prefiltering (doc, score) pairs, avoiding the 25 | /// virtual dispatch cost. 26 | /// 27 | /// More importantly, it makes it possible for scorers to implement 28 | /// important optimization (e.g. BlockWAND for union). 
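///
/// A sketch of the callback contract, with a hypothetical caller: the value returned by
/// the callback becomes the new threshold.
/// ```ignore
/// let mut best_score = 0.0f32;
/// for_each_pruning_scorer(&mut scorer, 0.0, &mut |_doc, score| {
///     best_score = best_score.max(score);
///     best_score // from now on, only documents beating this score are reported
/// });
/// ```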
29 | pub(crate) fn for_each_pruning_scorer( 30 | scorer: &mut TScorer, 31 | mut threshold: Score, 32 | callback: &mut dyn FnMut(DocId, Score) -> Score, 33 | ) { 34 | let mut doc = scorer.doc(); 35 | while doc != TERMINATED { 36 | let score = scorer.score(); 37 | if score > threshold { 38 | threshold = callback(doc, score); 39 | } 40 | doc = scorer.advance(); 41 | } 42 | } 43 | 44 | /// A Weight is the specialization of a Query 45 | /// for a given set of segments. 46 | /// 47 | /// See [`Query`](./trait.Query.html). 48 | pub trait Weight: Send + Sync + 'static { 49 | /// Returns the scorer for the given segment. 50 | /// 51 | /// `boost` is a multiplier to apply to the score. 52 | /// 53 | /// See [`Query`](./trait.Query.html). 54 | fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result>; 55 | 56 | /// Returns an `Explanation` for the given document. 57 | fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result; 58 | 59 | /// Returns the number documents within the given `SegmentReader`. 60 | fn count(&self, reader: &SegmentReader) -> crate::Result { 61 | let mut scorer = self.scorer(reader, 1.0)?; 62 | if let Some(alive_bitset) = reader.alive_bitset() { 63 | Ok(scorer.count(alive_bitset)) 64 | } else { 65 | Ok(scorer.count_including_deleted()) 66 | } 67 | } 68 | 69 | /// Iterates through all of the document matched by the DocSet 70 | /// `DocSet` and push the scored documents to the collector. 71 | fn for_each( 72 | &self, 73 | reader: &SegmentReader, 74 | callback: &mut dyn FnMut(DocId, Score), 75 | ) -> crate::Result<()> { 76 | let mut scorer = self.scorer(reader, 1.0)?; 77 | for_each_scorer(scorer.as_mut(), callback); 78 | Ok(()) 79 | } 80 | 81 | /// Calls `callback` with all of the `(doc, score)` for which score 82 | /// is exceeding a given threshold. 83 | /// 84 | /// This method is useful for the TopDocs collector. 85 | /// For all docsets, the blanket implementation has the benefit 86 | /// of prefiltering (doc, score) pairs, avoiding the 87 | /// virtual dispatch cost. 88 | /// 89 | /// More importantly, it makes it possible for scorers to implement 90 | /// important optimization (e.g. BlockWAND for union). 91 | fn for_each_pruning( 92 | &self, 93 | threshold: Score, 94 | reader: &SegmentReader, 95 | callback: &mut dyn FnMut(DocId, Score) -> Score, 96 | ) -> crate::Result<()> { 97 | let mut scorer = self.scorer(reader, 1.0)?; 98 | for_each_pruning_scorer(scorer.as_mut(), threshold, callback); 99 | Ok(()) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/schema/facet_options.rs: -------------------------------------------------------------------------------- 1 | use std::ops::BitOr; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::schema::flags::{IndexedFlag, SchemaFlagList, StoredFlag}; 6 | 7 | /// Define how a facet field should be handled by tantivy. 8 | /// 9 | /// Note that a Facet is always indexed and stored as a fastfield. 10 | #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] 11 | pub struct FacetOptions { 12 | stored: bool, 13 | } 14 | 15 | impl FacetOptions { 16 | /// Returns true if the value is stored. 17 | pub fn is_stored(&self) -> bool { 18 | self.stored 19 | } 20 | 21 | /// Set the field as stored. 22 | /// 23 | /// Only the fields that are set as *stored* are 24 | /// persisted into the Tantivy's store. 
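///
/// A minimal sketch: `let options = FacetOptions::default().set_stored();`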
25 | #[must_use] 26 | pub fn set_stored(mut self) -> FacetOptions { 27 | self.stored = true; 28 | self 29 | } 30 | } 31 | 32 | impl From<()> for FacetOptions { 33 | fn from(_: ()) -> FacetOptions { 34 | FacetOptions::default() 35 | } 36 | } 37 | 38 | impl From for FacetOptions { 39 | fn from(_: StoredFlag) -> Self { 40 | FacetOptions { stored: true } 41 | } 42 | } 43 | 44 | impl> BitOr for FacetOptions { 45 | type Output = FacetOptions; 46 | 47 | fn bitor(self, other: T) -> FacetOptions { 48 | let other = other.into(); 49 | FacetOptions { 50 | stored: self.stored | other.stored, 51 | } 52 | } 53 | } 54 | 55 | impl From> for FacetOptions 56 | where 57 | Head: Clone, 58 | Tail: Clone, 59 | Self: BitOr + From + From, 60 | { 61 | fn from(head_tail: SchemaFlagList) -> Self { 62 | Self::from(head_tail.head) | Self::from(head_tail.tail) 63 | } 64 | } 65 | 66 | impl From for FacetOptions { 67 | fn from(_: IndexedFlag) -> Self { 68 | FacetOptions { stored: false } 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | mod tests { 74 | use crate::schema::{FacetOptions, INDEXED}; 75 | 76 | #[test] 77 | fn test_from_index_flag() { 78 | let facet_option = FacetOptions::from(INDEXED); 79 | assert_eq!(facet_option, FacetOptions::default()); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/schema/field.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::{Read, Write}; 3 | 4 | use common::BinarySerializable; 5 | 6 | /// `Field` is represented by an unsigned 32-bit integer type 7 | /// The schema holds the mapping between field names and `Field` objects. 8 | #[derive( 9 | Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, serde::Serialize, serde::Deserialize, 10 | )] 11 | pub struct Field(u32); 12 | 13 | impl Field { 14 | /// Create a new field object for the given FieldId. 15 | pub const fn from_field_id(field_id: u32) -> Field { 16 | Field(field_id) 17 | } 18 | 19 | /// Returns a u32 identifying uniquely a field within a schema. 20 | pub const fn field_id(self) -> u32 { 21 | self.0 22 | } 23 | } 24 | 25 | impl BinarySerializable for Field { 26 | fn serialize(&self, writer: &mut W) -> io::Result<()> { 27 | self.0.serialize(writer) 28 | } 29 | 30 | fn deserialize(reader: &mut R) -> io::Result { 31 | u32::deserialize(reader).map(Field) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/schema/field_value.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read, Write}; 2 | 3 | use common::BinarySerializable; 4 | 5 | use crate::schema::{Field, Value}; 6 | 7 | /// `FieldValue` holds together a `Field` and its `Value`. 
8 | #[allow(missing_docs)] 9 | #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] 10 | pub struct FieldValue { 11 | pub field: Field, 12 | pub value: Value, 13 | } 14 | 15 | impl FieldValue { 16 | /// Constructor 17 | pub fn new(field: Field, value: Value) -> FieldValue { 18 | FieldValue { field, value } 19 | } 20 | 21 | /// Field accessor 22 | pub fn field(&self) -> Field { 23 | self.field 24 | } 25 | 26 | /// Value accessor 27 | pub fn value(&self) -> &Value { 28 | &self.value 29 | } 30 | } 31 | 32 | impl From<FieldValue> for Value { 33 | fn from(field_value: FieldValue) -> Self { 34 | field_value.value 35 | } 36 | } 37 | 38 | impl BinarySerializable for FieldValue { 39 | fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { 40 | self.field.serialize(writer)?; 41 | self.value.serialize(writer) 42 | } 43 | 44 | fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { 45 | let field = Field::deserialize(reader)?; 46 | let value = Value::deserialize(reader)?; 47 | Ok(FieldValue { field, value }) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/schema/flags.rs: -------------------------------------------------------------------------------- 1 | use std::ops::BitOr; 2 | 3 | use crate::schema::{NumericOptions, TextOptions}; 4 | 5 | #[derive(Clone)] 6 | pub struct StoredFlag; 7 | /// Flag to mark the field as stored. 8 | /// This flag can apply to any kind of field. 9 | /// 10 | /// The stored fields of a document can be retrieved given its `DocId`. 11 | /// Stored fields are stored together and compressed. 12 | /// Reading the stored fields of a document is relatively slow. 13 | /// (~ 100 microsecs) 14 | /// 15 | /// It should not be used during scoring or collection. 16 | pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList { 17 | head: StoredFlag, 18 | tail: (), 19 | }; 20 | 21 | #[derive(Clone)] 22 | pub struct IndexedFlag; 23 | /// Flag to mark the field as indexed. An indexed field is searchable and has a fieldnorm. 24 | /// 25 | /// The `INDEXED` flag can only be used when building `NumericOptions` (`u64`, `i64` and `f64` 26 | /// fields). Of course, text fields can also be indexed... But this is expressed by using either the 27 | /// `STRING` (untokenized) or `TEXT` (tokenized with the English tokenizer) flags. 28 | pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList { 29 | head: IndexedFlag, 30 | tail: (), 31 | }; 32 | 33 | #[derive(Clone)] 34 | pub struct FastFlag; 35 | /// Flag to mark the field as a fast field (similar to Lucene's DocValues). 36 | /// 37 | /// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering 38 | /// or collection should be marked as fast fields.
39 | /// The `FAST` flag can only be used when building `NumericOptions` (`u64`, `i64` and `f64` fields). 40 | pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList { 41 | head: FastFlag, 42 | tail: (), 43 | }; 44 | 45 | impl<Head, OldHead, OldTail> BitOr<SchemaFlagList<Head, ()>> for SchemaFlagList<OldHead, OldTail> 46 | where 47 | Head: Clone, 48 | OldHead: Clone, 49 | OldTail: Clone, 50 | { 51 | type Output = SchemaFlagList<Head, SchemaFlagList<OldHead, OldTail>>; 52 | 53 | fn bitor(self, head: SchemaFlagList<Head, ()>) -> Self::Output { 54 | SchemaFlagList { 55 | head: head.head, 56 | tail: self, 57 | } 58 | } 59 | } 60 | 61 | impl<T: Into<NumericOptions>> BitOr<NumericOptions> for SchemaFlagList<T, ()> { 62 | type Output = NumericOptions; 63 | 64 | fn bitor(self, rhs: NumericOptions) -> Self::Output { 65 | self.head.into() | rhs 66 | } 67 | } 68 | 69 | impl<T: Into<TextOptions>> BitOr<TextOptions> for SchemaFlagList<T, ()> { 70 | type Output = TextOptions; 71 | 72 | fn bitor(self, rhs: TextOptions) -> Self::Output { 73 | self.head.into() | rhs 74 | } 75 | } 76 | 77 | #[derive(Clone)] 78 | pub struct SchemaFlagList<Head, Tail> { 79 | pub head: Head, 80 | pub tail: Tail, 81 | } 82 | -------------------------------------------------------------------------------- /src/schema/index_record_option.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | /// `IndexRecordOption` describes the amount of information associated 4 | /// with a given indexed field. 5 | /// 6 | /// It is both used to: 7 | /// 8 | /// * describe in the schema the amount of information 9 | /// that should be retained during indexing (See 10 | /// [`TextFieldIndexing.html.set_index_option`]( 11 | /// ../schema/struct.TextFieldIndexing.html#method.set_index_option)) 12 | /// * request a given 13 | /// amount of information to be decoded as one goes through a posting list. 14 | /// (See [`InvertedIndexReader.read_postings`]( 15 | /// ../struct.InvertedIndexReader.html#method.read_postings)) 16 | #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)] 17 | pub enum IndexRecordOption { 18 | /// records only the `DocId`s 19 | #[serde(rename = "basic")] 20 | Basic, 21 | /// records the document ids as well as the term frequency. 22 | /// The term frequency can help give better scoring of the documents. 23 | #[serde(rename = "freq")] 24 | WithFreqs, 25 | /// records the document id, the term frequency and the positions of 26 | /// the occurrences in the document. 27 | /// Positions are required to run [PhraseQueries](../query/struct.PhraseQuery.html). 28 | #[serde(rename = "position")] 29 | WithFreqsAndPositions, 30 | } 31 | 32 | impl Default for IndexRecordOption { 33 | fn default() -> Self { 34 | IndexRecordOption::Basic 35 | } 36 | } 37 | 38 | impl IndexRecordOption { 39 | /// Returns true if this option includes encoding 40 | /// term frequencies. 41 | pub fn has_freq(self) -> bool { 42 | match self { 43 | IndexRecordOption::Basic => false, 44 | IndexRecordOption::WithFreqs | IndexRecordOption::WithFreqsAndPositions => true, 45 | } 46 | } 47 | 48 | /// Returns true if this option includes encoding 49 | /// term positions.
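///
/// For instance, `IndexRecordOption::WithFreqsAndPositions.has_positions()` is `true`,
/// while `IndexRecordOption::WithFreqs.has_positions()` is `false`.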
50 | pub fn has_positions(self) -> bool { 51 | match self { 52 | IndexRecordOption::Basic | IndexRecordOption::WithFreqs => false, 53 | IndexRecordOption::WithFreqsAndPositions => true, 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/schema/named_field_document.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::schema::Value; 6 | 7 | /// Internal representation of a document used for JSON 8 | /// serialization. 9 | /// 10 | /// A `NamedFieldDocument` is a simple representation of a document 11 | /// as a `BTreeMap>`. 12 | #[derive(Debug, Deserialize, Serialize)] 13 | pub struct NamedFieldDocument(pub BTreeMap>); 14 | -------------------------------------------------------------------------------- /src/store/compression_brotli.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | #[inline] 4 | pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { 5 | let params = brotli::enc::BrotliEncoderParams { 6 | quality: 5, 7 | ..Default::default() 8 | }; 9 | compressed.clear(); 10 | brotli::BrotliCompress(&mut uncompressed, compressed, ¶ms)?; 11 | Ok(()) 12 | } 13 | 14 | #[inline] 15 | pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec) -> io::Result<()> { 16 | decompressed.clear(); 17 | brotli::BrotliDecompress(&mut compressed, decompressed)?; 18 | Ok(()) 19 | } 20 | -------------------------------------------------------------------------------- /src/store/compression_lz4_block.rs: -------------------------------------------------------------------------------- 1 | use core::convert::TryInto; 2 | use std::io::{self}; 3 | use std::mem; 4 | 5 | use lz4_flex::{compress_into, decompress_into}; 6 | 7 | #[inline] 8 | #[allow(clippy::uninit_vec)] 9 | pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { 10 | compressed.clear(); 11 | let maximum_ouput_size = 12 | mem::size_of::() + lz4_flex::block::get_maximum_output_size(uncompressed.len()); 13 | compressed.reserve(maximum_ouput_size); 14 | unsafe { 15 | compressed.set_len(maximum_ouput_size); 16 | } 17 | let bytes_written = compress_into(uncompressed, &mut compressed[4..]) 18 | .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?; 19 | let num_bytes = uncompressed.len() as u32; 20 | compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes()); 21 | unsafe { 22 | compressed.set_len(bytes_written + mem::size_of::()); 23 | } 24 | Ok(()) 25 | } 26 | 27 | #[inline] 28 | #[allow(clippy::uninit_vec)] 29 | pub fn decompress(compressed: &[u8], decompressed: &mut Vec) -> io::Result<()> { 30 | decompressed.clear(); 31 | let uncompressed_size_bytes: &[u8; 4] = compressed 32 | .get(..4) 33 | .ok_or(io::ErrorKind::InvalidData)? 
34 | .try_into() 35 | .unwrap(); 36 | let uncompressed_size = u32::from_le_bytes(*uncompressed_size_bytes) as usize; 37 | decompressed.reserve(uncompressed_size); 38 | unsafe { 39 | decompressed.set_len(uncompressed_size); 40 | } 41 | let bytes_written = decompress_into(&compressed[4..], decompressed) 42 | .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?; 43 | if bytes_written != uncompressed_size { 44 | return Err(io::Error::new( 45 | io::ErrorKind::InvalidData, 46 | "doc store block not completely decompressed, data corruption".to_string(), 47 | )); 48 | } 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /src/store/compression_snap.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read, Write}; 2 | 3 | #[inline] 4 | pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { 5 | compressed.clear(); 6 | let mut encoder = snap::write::FrameEncoder::new(compressed); 7 | encoder.write_all(uncompressed)?; 8 | encoder.flush()?; 9 | Ok(()) 10 | } 11 | 12 | #[inline] 13 | pub fn decompress(compressed: &[u8], decompressed: &mut Vec) -> io::Result<()> { 14 | decompressed.clear(); 15 | snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?; 16 | Ok(()) 17 | } 18 | -------------------------------------------------------------------------------- /src/store/footer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use common::{BinarySerializable, FixedSize, HasLen}; 4 | 5 | use crate::directory::FileSlice; 6 | use crate::store::Compressor; 7 | 8 | #[derive(Debug, Clone, PartialEq)] 9 | pub struct DocStoreFooter { 10 | pub offset: u64, 11 | pub compressor: Compressor, 12 | } 13 | 14 | /// Serialises the footer to a byte-array 15 | /// - offset : 8 bytes 16 | /// - compressor id: 1 byte 17 | /// - reserved for future use: 15 bytes 18 | impl BinarySerializable for DocStoreFooter { 19 | fn serialize(&self, writer: &mut W) -> io::Result<()> { 20 | BinarySerializable::serialize(&self.offset, writer)?; 21 | BinarySerializable::serialize(&self.compressor.get_id(), writer)?; 22 | writer.write_all(&[0; 15])?; 23 | Ok(()) 24 | } 25 | 26 | fn deserialize(reader: &mut R) -> io::Result { 27 | let offset = u64::deserialize(reader)?; 28 | let compressor_id = u8::deserialize(reader)?; 29 | let mut skip_buf = [0; 15]; 30 | reader.read_exact(&mut skip_buf)?; 31 | Ok(DocStoreFooter { 32 | offset, 33 | compressor: Compressor::from_id(compressor_id), 34 | }) 35 | } 36 | } 37 | 38 | impl FixedSize for DocStoreFooter { 39 | const SIZE_IN_BYTES: usize = 24; 40 | } 41 | 42 | impl DocStoreFooter { 43 | pub fn new(offset: u64, compressor: Compressor) -> Self { 44 | DocStoreFooter { offset, compressor } 45 | } 46 | 47 | pub fn extract_footer(file: FileSlice) -> io::Result<(DocStoreFooter, FileSlice)> { 48 | if file.len() < DocStoreFooter::SIZE_IN_BYTES { 49 | return Err(io::Error::new( 50 | io::ErrorKind::UnexpectedEof, 51 | format!( 52 | "File corrupted. 
The file is smaller than Footer::SIZE_IN_BYTES (len={}).",
53 | file.len()
54 | ),
55 | ));
56 | }
57 | let (body, footer_slice) = file.split_from_end(DocStoreFooter::SIZE_IN_BYTES);
58 | let mut footer_bytes = footer_slice.read_bytes()?;
59 | let footer = DocStoreFooter::deserialize(&mut footer_bytes)?;
60 | Ok((footer, body))
61 | }
62 | }
63 |
64 | #[test]
65 | fn doc_store_footer_test() {
66 | // This test is just to safeguard changes on the footer.
67 | // When the doc store footer is updated, make sure to also update the serialize/deserialize
68 | // methods.
69 | assert_eq!(core::mem::size_of::<DocStoreFooter>(), 16);
70 | }
71 |
-------------------------------------------------------------------------------- /src/store/index/skip_index.rs: --------------------------------------------------------------------------------
1 | use common::{BinarySerializable, VInt};
2 |
3 | use crate::directory::OwnedBytes;
4 | use crate::store::index::block::CheckpointBlock;
5 | use crate::store::index::Checkpoint;
6 | use crate::DocId;
7 |
8 | pub struct LayerCursor<'a> {
9 | remaining: &'a [u8],
10 | block: CheckpointBlock,
11 | cursor: usize,
12 | }
13 |
14 | impl<'a> Iterator for LayerCursor<'a> {
15 | type Item = Checkpoint;
16 |
17 | fn next(&mut self) -> Option<Checkpoint> {
18 | if self.cursor == self.block.len() {
19 | if self.remaining.is_empty() {
20 | return None;
21 | }
22 | let (block_mut, remaining_mut) = (&mut self.block, &mut self.remaining);
23 | block_mut.deserialize(remaining_mut).ok()?;
24 | self.cursor = 0;
25 | }
26 | let res = Some(self.block.get(self.cursor));
27 | self.cursor += 1;
28 | res
29 | }
30 | }
31 |
32 | struct Layer {
33 | data: OwnedBytes,
34 | }
35 |
36 | impl Layer {
37 | fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
38 | self.cursor_at_offset(0)
39 | }
40 |
41 | fn cursor_at_offset(&self, start_offset: usize) -> impl Iterator<Item = Checkpoint> + '_ {
42 | let data = &self.data.as_slice();
43 | LayerCursor {
44 | remaining: &data[start_offset..],
45 | block: CheckpointBlock::default(),
46 | cursor: 0,
47 | }
48 | }
49 |
50 | fn seek_start_at_offset(&self, target: DocId, offset: usize) -> Option<Checkpoint> {
51 | self.cursor_at_offset(offset)
52 | .find(|checkpoint| checkpoint.doc_range.end > target)
53 | }
54 | }
55 |
56 | pub struct SkipIndex {
57 | layers: Vec<Layer>,
58 | }
59 |
60 | impl SkipIndex {
61 | pub fn open(mut data: OwnedBytes) -> SkipIndex {
62 | let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
63 | .unwrap()
64 | .into_iter()
65 | .map(|el| el.0)
66 | .collect();
67 | let mut start_offset = 0;
68 | let mut layers = Vec::new();
69 | for end_offset in offsets {
70 | let layer = Layer {
71 | data: data.slice(start_offset as usize..end_offset as usize),
72 | };
73 | layers.push(layer);
74 | start_offset = end_offset;
75 | }
76 | SkipIndex { layers }
77 | }
78 |
79 | pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
80 | self.layers
81 | .last()
82 | .into_iter()
83 | .flat_map(|layer| layer.cursor())
84 | }
85 |
86 | pub fn seek(&self, target: DocId) -> Option<Checkpoint> {
87 | let first_layer_len = self
88 | .layers
89 | .first()
90 | .map(|layer| layer.data.len())
91 | .unwrap_or(0);
92 | let mut cur_checkpoint = Checkpoint {
93 | doc_range: 0u32..1u32,
94 | byte_range: 0..first_layer_len,
95 | };
96 | for layer in &self.layers {
97 | if let Some(checkpoint) =
98 | layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start)
99 | {
100 | cur_checkpoint = checkpoint;
101 | } else {
102 | return None;
103 | }
104 | }
105 | Some(cur_checkpoint)
106 | }
107 | }
108 |
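Note (editorial sketch, not part of the repository): `SkipIndexBuilder` in the next file produces the bytes that `SkipIndex::open` reads back, and `seek` walks the layers until it finds the first checkpoint whose `doc_range.end` exceeds the target `DocId`. A minimal, hypothetical usage sketch of this crate-internal API, assuming `OwnedBytes::new` accepts a `Vec<u8>` (as the `ownedbytes` crate provides):

// Sketch: build a two-checkpoint skip index, serialize it, then seek into it.
let mut builder = SkipIndexBuilder::new();
builder.insert(Checkpoint { doc_range: 0..10, byte_range: 0..115 });
builder.insert(Checkpoint { doc_range: 10..25, byte_range: 115..230 });
let mut buffer: Vec<u8> = Vec::new();
builder.write(&mut buffer).unwrap();

let skip_index = SkipIndex::open(OwnedBytes::new(buffer));
// Doc 12 falls in the second block, so that block's checkpoint is returned.
assert_eq!(
    skip_index.seek(12u32).map(|checkpoint| checkpoint.byte_range),
    Some(115..230)
);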
-------------------------------------------------------------------------------- /src/store/index/skip_index_builder.rs: --------------------------------------------------------------------------------
1 | use std::io;
2 | use std::io::Write;
3 |
4 | use common::{BinarySerializable, VInt};
5 |
6 | use crate::store::index::block::CheckpointBlock;
7 | use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD};
8 |
9 | // Each skip layer contains an iterator over pairs (last doc in block, offset to start of block).
10 |
11 | struct LayerBuilder {
12 | buffer: Vec<u8>,
13 | pub block: CheckpointBlock,
14 | }
15 |
16 | impl LayerBuilder {
17 | fn finish(self) -> Vec<u8> {
18 | self.buffer
19 | }
20 |
21 | fn new() -> LayerBuilder {
22 | LayerBuilder {
23 | buffer: Vec::new(),
24 | block: CheckpointBlock::default(),
25 | }
26 | }
27 |
28 | /// Serializes the block and returns a checkpoint representing
29 | /// the entire block.
30 | ///
31 | /// If the block was empty to begin with, simply returns `None`.
32 | fn flush_block(&mut self) -> Option<Checkpoint> {
33 | if let Some(doc_range) = self.block.doc_interval() {
34 | let start_offset = self.buffer.len();
35 | self.block.serialize(&mut self.buffer);
36 | let end_offset = self.buffer.len();
37 | self.block.clear();
38 | Some(Checkpoint {
39 | doc_range,
40 | byte_range: start_offset..end_offset,
41 | })
42 | } else {
43 | None
44 | }
45 | }
46 |
47 | fn push(&mut self, checkpoint: Checkpoint) {
48 | self.block.push(checkpoint);
49 | }
50 |
51 | fn insert(&mut self, checkpoint: Checkpoint) -> Option<Checkpoint> {
52 | self.push(checkpoint);
53 | let emit_skip_info = self.block.len() >= CHECKPOINT_PERIOD;
54 | if emit_skip_info {
55 | self.flush_block()
56 | } else {
57 | None
58 | }
59 | }
60 | }
61 |
62 | pub struct SkipIndexBuilder {
63 | layers: Vec<LayerBuilder>,
64 | }
65 |
66 | impl SkipIndexBuilder {
67 | pub fn new() -> SkipIndexBuilder {
68 | SkipIndexBuilder { layers: Vec::new() }
69 | }
70 |
71 | fn get_layer(&mut self, layer_id: usize) -> &mut LayerBuilder {
72 | if layer_id == self.layers.len() {
73 | let layer_builder = LayerBuilder::new();
74 | self.layers.push(layer_builder);
75 | }
76 | &mut self.layers[layer_id]
77 | }
78 |
79 | pub fn insert(&mut self, checkpoint: Checkpoint) {
80 | let mut skip_pointer = Some(checkpoint);
81 | for layer_id in 0.. {
82 | if let Some(checkpoint) = skip_pointer {
83 | skip_pointer = self.get_layer(layer_id).insert(checkpoint);
84 | } else {
85 | break;
86 | }
87 | }
88 | }
89 |
90 | pub fn write<W: io::Write>(mut self, output: &mut W) -> io::Result<()> {
91 | let mut last_pointer = None;
92 | for skip_layer in self.layers.iter_mut() {
93 | if let Some(checkpoint) = last_pointer {
94 | skip_layer.push(checkpoint);
95 | }
96 | last_pointer = skip_layer.flush_block();
97 | }
98 | let layer_buffers: Vec<Vec<u8>> = self
99 | .layers
100 | .into_iter()
101 | .rev()
102 | .map(|layer| layer.finish())
103 | .collect();
104 |
105 | let mut layer_offset = 0;
106 | let mut layer_sizes = Vec::new();
107 | for layer_buffer in &layer_buffers {
108 | layer_offset += layer_buffer.len() as u64;
109 | layer_sizes.push(VInt(layer_offset));
110 | }
111 | layer_sizes.serialize(output)?;
112 | for layer_buffer in layer_buffers {
113 | output.write_all(&layer_buffer[..])?;
114 | }
115 | Ok(())
116 | }
117 | }
118 |
-------------------------------------------------------------------------------- /src/termdict/fst_termdict/mod.rs: --------------------------------------------------------------------------------
1 | //!
The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) with
2 | //! a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information
3 | //! about the term.
4 | //!
5 | //! Internally, the term dictionary relies on the `fst` crate to store
6 | //! a sorted mapping that associates each term with its rank in the lexicographical order.
7 | //! For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
8 | //! the `TermOrdinal`s are respectively `0`, `1`, `2`, and `3`.
9 | //!
10 | //! For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the
11 | //! lexicographical order matches the natural order of integers.
12 | //!
13 | //! `i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
14 | //! and then treated as a `u64`.
15 | //!
16 | //! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
17 | //! as `u64`.
18 | //!
19 | //! A second data structure makes it possible to access a
20 | //! [`TermInfo`](../postings/struct.TermInfo.html).
21 | mod merger;
22 | mod streamer;
23 | mod term_info_store;
24 | mod termdict;
25 |
26 | pub use self::merger::TermMerger;
27 | pub use self::streamer::{TermStreamer, TermStreamerBuilder};
28 | pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
29 |
-------------------------------------------------------------------------------- /src/termdict/mod.rs: --------------------------------------------------------------------------------
1 | //! The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) with
2 | //! a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information
3 | //! about the term.
4 | //!
5 | //! Internally, the term dictionary relies on the `fst` crate to store
6 | //! a sorted mapping that associates each term with its rank in the lexicographical order.
7 | //! For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
8 | //! the [TermOrdinal]s are respectively `0`, `1`, `2`, and `3`.
9 | //!
10 | //! For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the
11 | //! lexicographical order matches the natural order of integers.
12 | //!
13 | //! `i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
14 | //! and then treated as a `u64`.
15 | //!
16 | //! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
17 | //! as `u64`.
18 | //!
19 | //! A second data structure makes it possible to access a
20 | //! [`TermInfo`](../postings/struct.TermInfo.html).
21 |
22 | #[cfg(not(feature = "quickwit"))]
23 | mod fst_termdict;
24 | #[cfg(not(feature = "quickwit"))]
25 | use fst_termdict as termdict;
26 |
27 | #[cfg(feature = "quickwit")]
28 | mod sstable_termdict;
29 | #[cfg(feature = "quickwit")]
30 | use sstable_termdict as termdict;
31 | use tantivy_fst::automaton::AlwaysMatch;
32 |
33 | #[cfg(test)]
34 | mod tests;
35 |
36 | /// Position of the term in the sorted list of terms.
37 | pub type TermOrdinal = u64;
38 |
39 | /// The term dictionary contains all of the terms in
40 | /// `tantivy index` in a sorted manner.
41 | pub type TermDictionary = self::termdict::TermDictionary;
42 |
43 | /// Builder for the new term dictionary.
44 | ///
45 | /// Inserting must be done in the order of the `keys`.
46 | pub type TermDictionaryBuilder = self::termdict::TermDictionaryBuilder;
47 |
48 | /// Given a list of sorted term streams,
49 | /// returns an iterator over sorted unique terms.
50 | ///
51 | /// The item yielded is actually a pair with
52 | /// - the term
53 | /// - a slice with the ordinals of the segments containing
54 | /// the term.
55 | pub type TermMerger<'a> = self::termdict::TermMerger<'a>;
56 |
57 | /// `TermStreamer` acts as a cursor over a range of terms of a segment.
58 | /// Terms are guaranteed to be sorted.
59 | pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;
60 |
-------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/block_reader.rs: --------------------------------------------------------------------------------
1 | use std::io::{self, Read};
2 |
3 | use byteorder::{LittleEndian, ReadBytesExt};
4 |
5 | pub struct BlockReader<'a> {
6 | buffer: Vec<u8>,
7 | reader: Box<dyn io::Read + 'a>,
8 | offset: usize,
9 | }
10 |
11 | impl<'a> BlockReader<'a> {
12 | pub fn new(reader: Box<dyn io::Read + 'a>) -> BlockReader<'a> {
13 | BlockReader {
14 | buffer: Vec::new(),
15 | reader,
16 | offset: 0,
17 | }
18 | }
19 |
20 | pub fn deserialize_u64(&mut self) -> u64 {
21 | let (num_bytes, val) = super::vint::deserialize_read(self.buffer());
22 | self.advance(num_bytes);
23 | val
24 | }
25 |
26 | #[inline(always)]
27 | pub fn buffer_from_to(&self, start: usize, end: usize) -> &[u8] {
28 | &self.buffer[start..end]
29 | }
30 |
31 | pub fn read_block(&mut self) -> io::Result<bool> {
32 | self.offset = 0;
33 | let block_len_res = self.reader.read_u32::<LittleEndian>();
34 | if let Err(err) = &block_len_res {
35 | if err.kind() == io::ErrorKind::UnexpectedEof {
36 | return Ok(false);
37 | }
38 | }
39 | let block_len = block_len_res?;
40 | if block_len == 0u32 {
41 | self.buffer.clear();
42 | return Ok(false);
43 | }
44 | self.buffer.resize(block_len as usize, 0u8);
45 | self.reader.read_exact(&mut self.buffer[..])?;
46 | Ok(true)
47 | }
48 |
49 | pub fn offset(&self) -> usize {
50 | self.offset
51 | }
52 |
53 | pub fn advance(&mut self, num_bytes: usize) {
54 | self.offset += num_bytes;
55 | }
56 |
57 | pub fn buffer(&self) -> &[u8] {
58 | &self.buffer[self.offset..]
59 | } 60 | } 61 | 62 | impl<'a> io::Read for BlockReader<'a> { 63 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 64 | let len = self.buffer().read(buf)?; 65 | self.advance(len); 66 | Ok(len) 67 | } 68 | 69 | fn read_to_end(&mut self, buf: &mut Vec) -> io::Result { 70 | let len = self.buffer.len(); 71 | buf.extend_from_slice(self.buffer()); 72 | self.advance(len); 73 | Ok(len) 74 | } 75 | 76 | fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> { 77 | self.buffer().read_exact(buf)?; 78 | self.advance(buf.len()); 79 | Ok(()) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/merge/heap_merge.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::binary_heap::PeekMut; 3 | use std::collections::BinaryHeap; 4 | use std::io; 5 | 6 | use super::{SingleValueMerger, ValueMerger}; 7 | use crate::termdict::sstable_termdict::sstable::{Reader, SSTable, Writer}; 8 | 9 | struct HeapItem>(B); 10 | 11 | impl> Ord for HeapItem { 12 | fn cmp(&self, other: &Self) -> Ordering { 13 | other.0.as_ref().cmp(self.0.as_ref()) 14 | } 15 | } 16 | impl> PartialOrd for HeapItem { 17 | fn partial_cmp(&self, other: &Self) -> Option { 18 | Some(other.0.as_ref().cmp(self.0.as_ref())) 19 | } 20 | } 21 | 22 | impl> Eq for HeapItem {} 23 | impl> PartialEq for HeapItem { 24 | fn eq(&self, other: &Self) -> bool { 25 | self.0.as_ref() == other.0.as_ref() 26 | } 27 | } 28 | 29 | #[allow(dead_code)] 30 | pub fn merge_sstable>( 31 | readers: Vec>, 32 | mut writer: Writer, 33 | mut merger: M, 34 | ) -> io::Result<()> { 35 | let mut heap: BinaryHeap>> = 36 | BinaryHeap::with_capacity(readers.len()); 37 | for mut reader in readers { 38 | if reader.advance()? { 39 | heap.push(HeapItem(reader)); 40 | } 41 | } 42 | loop { 43 | let len = heap.len(); 44 | let mut value_merger; 45 | if let Some(mut head) = heap.peek_mut() { 46 | writer.write_key(head.0.key()); 47 | value_merger = merger.new_value(head.0.value()); 48 | if !head.0.advance()? { 49 | PeekMut::pop(head); 50 | } 51 | } else { 52 | break; 53 | } 54 | for _ in 0..len - 1 { 55 | if let Some(mut head) = heap.peek_mut() { 56 | if head.0.key() == writer.current_key() { 57 | value_merger.add(head.0.value()); 58 | if !head.0.advance()? { 59 | PeekMut::pop(head); 60 | } 61 | continue; 62 | } 63 | } 64 | break; 65 | } 66 | let value = value_merger.finish(); 67 | writer.write_value(&value)?; 68 | writer.flush_block_if_required()?; 69 | } 70 | writer.finalize()?; 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/sstable_index.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::ops::Range; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | 6 | use crate::error::DataCorruption; 7 | 8 | #[derive(Default, Debug, Serialize, Deserialize)] 9 | pub struct SSTableIndex { 10 | blocks: Vec, 11 | } 12 | 13 | impl SSTableIndex { 14 | pub(crate) fn load(data: &[u8]) -> Result { 15 | serde_cbor::de::from_slice(data) 16 | .map_err(|_| DataCorruption::comment_only("SSTable index is corrupted")) 17 | } 18 | 19 | pub fn search(&self, key: &[u8]) -> Option { 20 | self.blocks 21 | .iter() 22 | .find(|block| &block.last_key[..] 
>= key) 23 | .map(|block| block.block_addr.clone()) 24 | } 25 | } 26 | 27 | #[derive(Clone, Eq, PartialEq, Debug, Serialize, Deserialize)] 28 | pub struct BlockAddr { 29 | pub byte_range: Range, 30 | pub first_ordinal: u64, 31 | } 32 | 33 | #[derive(Debug, Serialize, Deserialize)] 34 | struct BlockMeta { 35 | pub last_key: Vec, 36 | pub block_addr: BlockAddr, 37 | } 38 | 39 | #[derive(Default)] 40 | pub struct SSTableIndexBuilder { 41 | index: SSTableIndex, 42 | } 43 | 44 | impl SSTableIndexBuilder { 45 | pub fn add_block(&mut self, last_key: &[u8], byte_range: Range, first_ordinal: u64) { 46 | self.index.blocks.push(BlockMeta { 47 | last_key: last_key.to_vec(), 48 | block_addr: BlockAddr { 49 | byte_range, 50 | first_ordinal, 51 | }, 52 | }) 53 | } 54 | 55 | pub fn serialize(&self, wrt: &mut dyn io::Write) -> io::Result<()> { 56 | serde_cbor::ser::to_writer(wrt, &self.index).unwrap(); 57 | Ok(()) 58 | } 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::{BlockAddr, SSTableIndex, SSTableIndexBuilder}; 64 | 65 | #[test] 66 | fn test_sstable_index() { 67 | let mut sstable_builder = SSTableIndexBuilder::default(); 68 | sstable_builder.add_block(b"aaa", 10..20, 0u64); 69 | sstable_builder.add_block(b"bbbbbbb", 20..30, 564); 70 | sstable_builder.add_block(b"ccc", 30..40, 10u64); 71 | sstable_builder.add_block(b"dddd", 40..50, 15u64); 72 | let mut buffer: Vec = Vec::new(); 73 | sstable_builder.serialize(&mut buffer).unwrap(); 74 | let sstable_index = SSTableIndex::load(&buffer[..]).unwrap(); 75 | assert_eq!( 76 | sstable_index.search(b"bbbde"), 77 | Some(BlockAddr { 78 | first_ordinal: 10u64, 79 | byte_range: 30..40 80 | }) 81 | ); 82 | } 83 | 84 | #[test] 85 | fn test_sstable_with_corrupted_data() { 86 | let mut sstable_builder = SSTableIndexBuilder::default(); 87 | sstable_builder.add_block(b"aaa", 10..20, 0u64); 88 | sstable_builder.add_block(b"bbbbbbb", 20..30, 564); 89 | sstable_builder.add_block(b"ccc", 30..40, 10u64); 90 | sstable_builder.add_block(b"dddd", 40..50, 15u64); 91 | let mut buffer: Vec = Vec::new(); 92 | sstable_builder.serialize(&mut buffer).unwrap(); 93 | buffer[1] = 9u8; 94 | let data_corruption_err = SSTableIndex::load(&buffer[..]).err().unwrap(); 95 | assert_eq!( 96 | format!("{data_corruption_err:?}"), 97 | "Data corruption: SSTable index is corrupted." 
98 | ); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/value.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use super::{vint, BlockReader}; 4 | 5 | pub trait ValueReader: Default { 6 | type Value; 7 | 8 | fn value(&self, idx: usize) -> &Self::Value; 9 | 10 | fn read(&mut self, reader: &mut BlockReader) -> io::Result<()>; 11 | } 12 | 13 | pub trait ValueWriter: Default { 14 | type Value; 15 | 16 | fn write(&mut self, val: &Self::Value); 17 | 18 | fn write_block(&mut self, writer: &mut Vec); 19 | } 20 | 21 | #[derive(Default)] 22 | pub struct VoidReader; 23 | 24 | impl ValueReader for VoidReader { 25 | type Value = (); 26 | 27 | fn value(&self, _idx: usize) -> &() { 28 | &() 29 | } 30 | 31 | fn read(&mut self, _reader: &mut BlockReader) -> io::Result<()> { 32 | Ok(()) 33 | } 34 | } 35 | 36 | #[derive(Default)] 37 | pub struct VoidWriter; 38 | 39 | impl ValueWriter for VoidWriter { 40 | type Value = (); 41 | 42 | fn write(&mut self, _val: &()) {} 43 | 44 | fn write_block(&mut self, _writer: &mut Vec) {} 45 | } 46 | 47 | #[derive(Default)] 48 | pub struct U64MonotonicWriter { 49 | vals: Vec, 50 | } 51 | 52 | impl ValueWriter for U64MonotonicWriter { 53 | type Value = u64; 54 | 55 | fn write(&mut self, val: &Self::Value) { 56 | self.vals.push(*val); 57 | } 58 | 59 | fn write_block(&mut self, writer: &mut Vec) { 60 | let mut prev_val = 0u64; 61 | vint::serialize_into_vec(self.vals.len() as u64, writer); 62 | for &val in &self.vals { 63 | let delta = val - prev_val; 64 | vint::serialize_into_vec(delta, writer); 65 | prev_val = val; 66 | } 67 | self.vals.clear(); 68 | } 69 | } 70 | 71 | #[derive(Default)] 72 | pub struct U64MonotonicReader { 73 | vals: Vec, 74 | } 75 | 76 | impl ValueReader for U64MonotonicReader { 77 | type Value = u64; 78 | 79 | fn value(&self, idx: usize) -> &Self::Value { 80 | &self.vals[idx] 81 | } 82 | 83 | fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> { 84 | let len = reader.deserialize_u64() as usize; 85 | self.vals.clear(); 86 | let mut prev_val = 0u64; 87 | for _ in 0..len { 88 | let delta = reader.deserialize_u64() as u64; 89 | let val = prev_val + delta; 90 | self.vals.push(val); 91 | prev_val = val; 92 | } 93 | Ok(()) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/vint.rs: -------------------------------------------------------------------------------- 1 | const CONTINUE_BIT: u8 = 128u8; 2 | 3 | pub fn serialize(mut val: u64, buffer: &mut [u8]) -> usize { 4 | for (i, b) in buffer.iter_mut().enumerate() { 5 | let next_byte: u8 = (val & 127u64) as u8; 6 | val >>= 7; 7 | if val == 0u64 { 8 | *b = next_byte; 9 | return i + 1; 10 | } else { 11 | *b = next_byte | CONTINUE_BIT; 12 | } 13 | } 14 | 10 //< actually unreachable 15 | } 16 | 17 | pub fn serialize_into_vec(val: u64, buffer: &mut Vec) { 18 | let mut buf = [0u8; 10]; 19 | let num_bytes = serialize(val, &mut buf[..]); 20 | buffer.extend_from_slice(&buf[..num_bytes]); 21 | } 22 | 23 | // super slow but we don't care 24 | pub fn deserialize_read(buf: &[u8]) -> (usize, u64) { 25 | let mut result = 0u64; 26 | let mut shift = 0u64; 27 | let mut consumed = 0; 28 | 29 | for &b in buf { 30 | consumed += 1; 31 | result |= u64::from(b % 128u8) << shift; 32 | if b < CONTINUE_BIT { 33 | break; 34 | } 35 | shift += 7; 36 | } 37 | (consumed, result) 38 | } 39 | 40 | 
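// Editorial note (not part of the original source): a worked example of the
// 7-bit varint scheme implemented above. Each byte carries 7 payload bits in
// its low bits; CONTINUE_BIT (the high bit) marks that another byte follows.
//
//   serialize(300, ...) writes two bytes:
//     300 = 0b1_0010_1100
//     byte 0: low 7 bits 0b0101100 (0x2C) | CONTINUE_BIT -> 0xAC
//     byte 1: remaining bits 300 >> 7 = 0b10             -> 0x02
//   deserialize_read(&[0xAC, 0x02]) returns (2, 300).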
#[cfg(test)] 41 | mod tests { 42 | use std::u64; 43 | 44 | use super::{deserialize_read, serialize}; 45 | 46 | fn aux_test_int(val: u64, expect_len: usize) { 47 | let mut buffer = [0u8; 14]; 48 | assert_eq!(serialize(val, &mut buffer[..]), expect_len); 49 | assert_eq!(deserialize_read(&buffer), (expect_len, val)); 50 | } 51 | 52 | #[test] 53 | fn test_vint() { 54 | aux_test_int(0u64, 1); 55 | aux_test_int(17u64, 1); 56 | aux_test_int(127u64, 1); 57 | aux_test_int(128u64, 2); 58 | aux_test_int(123423418u64, 4); 59 | for i in 1..63 { 60 | let power_of_two = 1u64 << i; 61 | aux_test_int(power_of_two + 1, (i / 7) + 1); 62 | aux_test_int(power_of_two, (i / 7) + 1); 63 | aux_test_int(power_of_two - 1, ((i - 1) / 7) + 1); 64 | } 65 | aux_test_int(u64::MAX, 10); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/tokenizer/alphanum_only.rs: -------------------------------------------------------------------------------- 1 | //! # Example 2 | //! ```rust 3 | //! use tantivy::tokenizer::*; 4 | //! 5 | //! let tokenizer = TextAnalyzer::from(RawTokenizer) 6 | //! .filter(AlphaNumOnlyFilter); 7 | //! 8 | //! let mut stream = tokenizer.token_stream("hello there"); 9 | //! // is none because the raw filter emits one token that 10 | //! // contains a space 11 | //! assert!(stream.next().is_none()); 12 | //! 13 | //! let tokenizer = TextAnalyzer::from(SimpleTokenizer) 14 | //! .filter(AlphaNumOnlyFilter); 15 | //! 16 | //! let mut stream = tokenizer.token_stream("hello there 💣"); 17 | //! assert!(stream.next().is_some()); 18 | //! assert!(stream.next().is_some()); 19 | //! // the "emoji" is dropped because its not an alphanum 20 | //! assert!(stream.next().is_none()); 21 | //! ``` 22 | use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; 23 | 24 | /// `TokenFilter` that removes all tokens that contain non 25 | /// ascii alphanumeric characters. 26 | #[derive(Clone)] 27 | pub struct AlphaNumOnlyFilter; 28 | 29 | pub struct AlphaNumOnlyFilterStream<'a> { 30 | tail: BoxTokenStream<'a>, 31 | } 32 | 33 | impl<'a> AlphaNumOnlyFilterStream<'a> { 34 | fn predicate(&self, token: &Token) -> bool { 35 | token.text.chars().all(|c| c.is_ascii_alphanumeric()) 36 | } 37 | } 38 | 39 | impl TokenFilter for AlphaNumOnlyFilter { 40 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 41 | BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) 42 | } 43 | } 44 | 45 | impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> { 46 | fn advance(&mut self) -> bool { 47 | while self.tail.advance() { 48 | if self.predicate(self.tail.token()) { 49 | return true; 50 | } 51 | } 52 | 53 | false 54 | } 55 | 56 | fn token(&self) -> &Token { 57 | self.tail.token() 58 | } 59 | 60 | fn token_mut(&mut self) -> &mut Token { 61 | self.tail.token_mut() 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod tests { 67 | use crate::tokenizer::tests::assert_token; 68 | use crate::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, TextAnalyzer, Token}; 69 | 70 | #[test] 71 | fn test_alphanum_only() { 72 | let tokens = token_stream_helper("I am a cat. 
我輩は猫である。(1906)"); 73 | assert_eq!(tokens.len(), 5); 74 | assert_token(&tokens[0], 0, "I", 0, 1); 75 | assert_token(&tokens[1], 1, "am", 2, 4); 76 | assert_token(&tokens[2], 2, "a", 5, 6); 77 | assert_token(&tokens[3], 3, "cat", 7, 10); 78 | assert_token(&tokens[4], 5, "1906", 37, 41); 79 | } 80 | 81 | fn token_stream_helper(text: &str) -> Vec { 82 | let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter); 83 | let mut token_stream = a.token_stream(text); 84 | let mut tokens: Vec = vec![]; 85 | let mut add_token = |token: &Token| { 86 | tokens.push(token.clone()); 87 | }; 88 | token_stream.process(&mut add_token); 89 | tokens 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/tokenizer/empty_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer}; 2 | 3 | #[derive(Clone)] 4 | pub(crate) struct EmptyTokenizer; 5 | 6 | impl Tokenizer for EmptyTokenizer { 7 | fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> { 8 | EmptyTokenStream::default().into() 9 | } 10 | } 11 | 12 | #[derive(Default)] 13 | struct EmptyTokenStream { 14 | token: Token, 15 | } 16 | 17 | impl TokenStream for EmptyTokenStream { 18 | fn advance(&mut self) -> bool { 19 | false 20 | } 21 | 22 | fn token(&self) -> &super::Token { 23 | &self.token 24 | } 25 | 26 | fn token_mut(&mut self) -> &mut super::Token { 27 | &mut self.token 28 | } 29 | } 30 | 31 | #[cfg(test)] 32 | mod tests { 33 | use crate::tokenizer::Tokenizer; 34 | 35 | #[test] 36 | fn test_empty_tokenizer() { 37 | let tokenizer = super::EmptyTokenizer; 38 | let mut empty = tokenizer.token_stream("whatever string"); 39 | assert!(!empty.advance()); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/tokenizer/lower_caser.rs: -------------------------------------------------------------------------------- 1 | use std::mem; 2 | 3 | use super::{Token, TokenFilter, TokenStream}; 4 | use crate::tokenizer::BoxTokenStream; 5 | 6 | impl TokenFilter for LowerCaser { 7 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 8 | BoxTokenStream::from(LowerCaserTokenStream { 9 | tail: token_stream, 10 | buffer: String::with_capacity(100), 11 | }) 12 | } 13 | } 14 | 15 | /// Token filter that lowercase terms. 16 | #[derive(Clone)] 17 | pub struct LowerCaser; 18 | 19 | pub struct LowerCaserTokenStream<'a> { 20 | buffer: String, 21 | tail: BoxTokenStream<'a>, 22 | } 23 | 24 | // writes a lowercased version of text into output. 25 | fn to_lowercase_unicode(text: &str, output: &mut String) { 26 | output.clear(); 27 | for c in text.chars() { 28 | // Contrary to the std, we do not take care of sigma special case. 29 | // This will have an normalizationo effect, which is ok for search. 30 | output.extend(c.to_lowercase()); 31 | } 32 | } 33 | 34 | impl<'a> TokenStream for LowerCaserTokenStream<'a> { 35 | fn advance(&mut self) -> bool { 36 | if !self.tail.advance() { 37 | return false; 38 | } 39 | if self.token_mut().text.is_ascii() { 40 | // fast track for ascii. 
41 | self.token_mut().text.make_ascii_lowercase(); 42 | } else { 43 | to_lowercase_unicode(&self.tail.token().text, &mut self.buffer); 44 | mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); 45 | } 46 | true 47 | } 48 | 49 | fn token(&self) -> &Token { 50 | self.tail.token() 51 | } 52 | 53 | fn token_mut(&mut self) -> &mut Token { 54 | self.tail.token_mut() 55 | } 56 | } 57 | 58 | #[cfg(test)] 59 | mod tests { 60 | use crate::tokenizer::tests::assert_token; 61 | use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, Token}; 62 | 63 | #[test] 64 | fn test_to_lower_case() { 65 | let tokens = token_stream_helper("Tree"); 66 | assert_eq!(tokens.len(), 1); 67 | assert_token(&tokens[0], 0, "tree", 0, 4); 68 | 69 | let tokens = token_stream_helper("Русский текст"); 70 | assert_eq!(tokens.len(), 2); 71 | assert_token(&tokens[0], 0, "русский", 0, 14); 72 | assert_token(&tokens[1], 1, "текст", 15, 25); 73 | } 74 | 75 | fn token_stream_helper(text: &str) -> Vec { 76 | let mut token_stream = TextAnalyzer::from(SimpleTokenizer) 77 | .filter(LowerCaser) 78 | .token_stream(text); 79 | let mut tokens = vec![]; 80 | let mut add_token = |token: &Token| { 81 | tokens.push(token.clone()); 82 | }; 83 | token_stream.process(&mut add_token); 84 | tokens 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/tokenizer/raw_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use super::{Token, TokenStream, Tokenizer}; 2 | use crate::tokenizer::BoxTokenStream; 3 | 4 | /// For each value of the field, emit a single unprocessed token. 5 | #[derive(Clone)] 6 | pub struct RawTokenizer; 7 | 8 | pub struct RawTokenStream { 9 | token: Token, 10 | has_token: bool, 11 | } 12 | 13 | impl Tokenizer for RawTokenizer { 14 | fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { 15 | let token = Token { 16 | offset_from: 0, 17 | offset_to: text.len(), 18 | position: 0, 19 | text: text.to_string(), 20 | position_length: 1, 21 | }; 22 | RawTokenStream { 23 | token, 24 | has_token: true, 25 | } 26 | .into() 27 | } 28 | } 29 | 30 | impl TokenStream for RawTokenStream { 31 | fn advance(&mut self) -> bool { 32 | let result = self.has_token; 33 | self.has_token = false; 34 | result 35 | } 36 | 37 | fn token(&self) -> &Token { 38 | &self.token 39 | } 40 | 41 | fn token_mut(&mut self) -> &mut Token { 42 | &mut self.token 43 | } 44 | } 45 | 46 | #[cfg(test)] 47 | mod tests { 48 | use crate::tokenizer::tests::assert_token; 49 | use crate::tokenizer::{RawTokenizer, TextAnalyzer, Token}; 50 | 51 | #[test] 52 | fn test_raw_tokenizer() { 53 | let tokens = token_stream_helper("Hello, happy tax payer!"); 54 | assert_eq!(tokens.len(), 1); 55 | assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23); 56 | } 57 | 58 | fn token_stream_helper(text: &str) -> Vec { 59 | let a = TextAnalyzer::from(RawTokenizer); 60 | let mut token_stream = a.token_stream(text); 61 | let mut tokens: Vec = vec![]; 62 | let mut add_token = |token: &Token| { 63 | tokens.push(token.clone()); 64 | }; 65 | token_stream.process(&mut add_token); 66 | tokens 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/tokenizer/remove_long.rs: -------------------------------------------------------------------------------- 1 | //! # Example 2 | //! ```rust 3 | //! use tantivy::tokenizer::*; 4 | //! 5 | //! let tokenizer = TextAnalyzer::from(SimpleTokenizer) 6 | //! 
.filter(RemoveLongFilter::limit(5)); 7 | //! 8 | //! let mut stream = tokenizer.token_stream("toolong nice"); 9 | //! // because `toolong` is more than 5 characters, it is filtered 10 | //! // out of the token stream. 11 | //! assert_eq!(stream.next().unwrap().text, "nice"); 12 | //! assert!(stream.next().is_none()); 13 | //! ``` 14 | use super::{Token, TokenFilter, TokenStream}; 15 | use crate::tokenizer::BoxTokenStream; 16 | 17 | /// `RemoveLongFilter` removes tokens that are longer 18 | /// than a given number of bytes (in UTF-8 representation). 19 | /// 20 | /// It is especially useful when indexing unconstrained content. 21 | /// e.g. Mail containing base-64 encoded pictures etc. 22 | #[derive(Clone)] 23 | pub struct RemoveLongFilter { 24 | length_limit: usize, 25 | } 26 | 27 | impl RemoveLongFilter { 28 | /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation. 29 | pub fn limit(length_limit: usize) -> RemoveLongFilter { 30 | RemoveLongFilter { length_limit } 31 | } 32 | } 33 | 34 | impl<'a> RemoveLongFilterStream<'a> { 35 | fn predicate(&self, token: &Token) -> bool { 36 | token.text.len() < self.token_length_limit 37 | } 38 | } 39 | 40 | impl TokenFilter for RemoveLongFilter { 41 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 42 | BoxTokenStream::from(RemoveLongFilterStream { 43 | token_length_limit: self.length_limit, 44 | tail: token_stream, 45 | }) 46 | } 47 | } 48 | 49 | pub struct RemoveLongFilterStream<'a> { 50 | token_length_limit: usize, 51 | tail: BoxTokenStream<'a>, 52 | } 53 | 54 | impl<'a> TokenStream for RemoveLongFilterStream<'a> { 55 | fn advance(&mut self) -> bool { 56 | while self.tail.advance() { 57 | if self.predicate(self.tail.token()) { 58 | return true; 59 | } 60 | } 61 | false 62 | } 63 | 64 | fn token(&self) -> &Token { 65 | self.tail.token() 66 | } 67 | 68 | fn token_mut(&mut self) -> &mut Token { 69 | self.tail.token_mut() 70 | } 71 | } 72 | 73 | #[cfg(test)] 74 | mod tests { 75 | use crate::tokenizer::tests::assert_token; 76 | use crate::tokenizer::{RemoveLongFilter, SimpleTokenizer, TextAnalyzer, Token}; 77 | 78 | #[test] 79 | fn test_remove_long() { 80 | let tokens = token_stream_helper("hello tantivy, happy searching!"); 81 | assert_eq!(tokens.len(), 2); 82 | assert_token(&tokens[0], 0, "hello", 0, 5); 83 | assert_token(&tokens[1], 2, "happy", 15, 20); 84 | } 85 | 86 | fn token_stream_helper(text: &str) -> Vec { 87 | let a = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(6)); 88 | let mut token_stream = a.token_stream(text); 89 | let mut tokens: Vec = vec![]; 90 | let mut add_token = |token: &Token| { 91 | tokens.push(token.clone()); 92 | }; 93 | token_stream.process(&mut add_token); 94 | tokens 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/tokenizer/simple_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use std::str::CharIndices; 2 | 3 | use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; 4 | 5 | /// Tokenize the text by splitting on whitespaces and punctuation. 
6 | #[derive(Clone)] 7 | pub struct SimpleTokenizer; 8 | 9 | pub struct SimpleTokenStream<'a> { 10 | text: &'a str, 11 | chars: CharIndices<'a>, 12 | token: Token, 13 | } 14 | 15 | impl Tokenizer for SimpleTokenizer { 16 | fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { 17 | BoxTokenStream::from(SimpleTokenStream { 18 | text, 19 | chars: text.char_indices(), 20 | token: Token::default(), 21 | }) 22 | } 23 | } 24 | 25 | impl<'a> SimpleTokenStream<'a> { 26 | // search for the end of the current token. 27 | fn search_token_end(&mut self) -> usize { 28 | (&mut self.chars) 29 | .filter(|&(_, ref c)| !c.is_alphanumeric()) 30 | .map(|(offset, _)| offset) 31 | .next() 32 | .unwrap_or(self.text.len()) 33 | } 34 | } 35 | 36 | impl<'a> TokenStream for SimpleTokenStream<'a> { 37 | fn advance(&mut self) -> bool { 38 | self.token.text.clear(); 39 | self.token.position = self.token.position.wrapping_add(1); 40 | while let Some((offset_from, c)) = self.chars.next() { 41 | if c.is_alphanumeric() { 42 | let offset_to = self.search_token_end(); 43 | self.token.offset_from = offset_from; 44 | self.token.offset_to = offset_to; 45 | self.token.text.push_str(&self.text[offset_from..offset_to]); 46 | return true; 47 | } 48 | } 49 | false 50 | } 51 | 52 | fn token(&self) -> &Token { 53 | &self.token 54 | } 55 | 56 | fn token_mut(&mut self) -> &mut Token { 57 | &mut self.token 58 | } 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use crate::tokenizer::tests::assert_token; 64 | use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, Token}; 65 | 66 | #[test] 67 | fn test_simple_tokenizer() { 68 | let tokens = token_stream_helper("Hello, happy tax payer!"); 69 | assert_eq!(tokens.len(), 4); 70 | assert_token(&tokens[0], 0, "Hello", 0, 5); 71 | assert_token(&tokens[1], 1, "happy", 7, 12); 72 | assert_token(&tokens[2], 2, "tax", 13, 16); 73 | assert_token(&tokens[3], 3, "payer", 17, 22); 74 | } 75 | 76 | fn token_stream_helper(text: &str) -> Vec { 77 | let a = TextAnalyzer::from(SimpleTokenizer); 78 | let mut token_stream = a.token_stream(text); 79 | let mut tokens: Vec = vec![]; 80 | let mut add_token = |token: &Token| { 81 | tokens.push(token.clone()); 82 | }; 83 | token_stream.process(&mut add_token); 84 | tokens 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/tokenizer/stemmer.rs: -------------------------------------------------------------------------------- 1 | use rust_stemmers::{self, Algorithm}; 2 | use serde::{Deserialize, Serialize}; 3 | 4 | use super::{Token, TokenFilter, TokenStream}; 5 | use crate::tokenizer::BoxTokenStream; 6 | 7 | /// Available stemmer languages. 
8 | #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)] 9 | #[allow(missing_docs)] 10 | pub enum Language { 11 | Arabic, 12 | Danish, 13 | Dutch, 14 | English, 15 | Finnish, 16 | French, 17 | German, 18 | Greek, 19 | Hungarian, 20 | Italian, 21 | Norwegian, 22 | Portuguese, 23 | Romanian, 24 | Russian, 25 | Spanish, 26 | Swedish, 27 | Tamil, 28 | Turkish, 29 | } 30 | 31 | impl Language { 32 | fn algorithm(self) -> Algorithm { 33 | use self::Language::*; 34 | match self { 35 | Arabic => Algorithm::Arabic, 36 | Danish => Algorithm::Danish, 37 | Dutch => Algorithm::Dutch, 38 | English => Algorithm::English, 39 | Finnish => Algorithm::Finnish, 40 | French => Algorithm::French, 41 | German => Algorithm::German, 42 | Greek => Algorithm::Greek, 43 | Hungarian => Algorithm::Hungarian, 44 | Italian => Algorithm::Italian, 45 | Norwegian => Algorithm::Norwegian, 46 | Portuguese => Algorithm::Portuguese, 47 | Romanian => Algorithm::Romanian, 48 | Russian => Algorithm::Russian, 49 | Spanish => Algorithm::Spanish, 50 | Swedish => Algorithm::Swedish, 51 | Tamil => Algorithm::Tamil, 52 | Turkish => Algorithm::Turkish, 53 | } 54 | } 55 | } 56 | 57 | /// `Stemmer` token filter. Several languages are supported, see `Language` for the available 58 | /// languages. 59 | /// Tokens are expected to be lowercased beforehand. 60 | #[derive(Clone)] 61 | pub struct Stemmer { 62 | stemmer_algorithm: Algorithm, 63 | } 64 | 65 | impl Stemmer { 66 | /// Creates a new Stemmer `TokenFilter` for a given language algorithm. 67 | pub fn new(language: Language) -> Stemmer { 68 | Stemmer { 69 | stemmer_algorithm: language.algorithm(), 70 | } 71 | } 72 | } 73 | 74 | impl Default for Stemmer { 75 | /// Creates a new Stemmer `TokenFilter` for English. 76 | fn default() -> Self { 77 | Stemmer::new(Language::English) 78 | } 79 | } 80 | 81 | impl TokenFilter for Stemmer { 82 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 83 | let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); 84 | BoxTokenStream::from(StemmerTokenStream { 85 | tail: token_stream, 86 | stemmer: inner_stemmer, 87 | }) 88 | } 89 | } 90 | 91 | pub struct StemmerTokenStream<'a> { 92 | tail: BoxTokenStream<'a>, 93 | stemmer: rust_stemmers::Stemmer, 94 | } 95 | 96 | impl<'a> TokenStream for StemmerTokenStream<'a> { 97 | fn advance(&mut self) -> bool { 98 | if !self.tail.advance() { 99 | return false; 100 | } 101 | // TODO remove allocation 102 | let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned(); 103 | self.token_mut().text.clear(); 104 | self.token_mut().text.push_str(&stemmed_str); 105 | true 106 | } 107 | 108 | fn token(&self) -> &Token { 109 | self.tail.token() 110 | } 111 | 112 | fn token_mut(&mut self) -> &mut Token { 113 | self.tail.token_mut() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/tokenizer/tokenized_string.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::tokenizer::{Token, TokenStream}; 6 | 7 | /// Struct representing pre-tokenized text 8 | #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] 9 | pub struct PreTokenizedString { 10 | /// Original text 11 | pub text: String, 12 | /// Tokens derived from the text 13 | pub tokens: Vec, 14 | } 15 | 16 | impl Ord for PreTokenizedString { 17 | fn cmp(&self, other: &Self) -> Ordering { 18 | 
self.text.cmp(&other.text) 19 | } 20 | } 21 | 22 | impl PartialOrd for PreTokenizedString { 23 | fn partial_cmp(&self, other: &Self) -> Option { 24 | Some(self.cmp(other)) 25 | } 26 | } 27 | 28 | /// TokenStream implementation which wraps PreTokenizedString 29 | pub struct PreTokenizedStream { 30 | tokenized_string: PreTokenizedString, 31 | current_token: i64, 32 | } 33 | 34 | impl From for PreTokenizedStream { 35 | fn from(s: PreTokenizedString) -> PreTokenizedStream { 36 | PreTokenizedStream { 37 | tokenized_string: s, 38 | current_token: -1, 39 | } 40 | } 41 | } 42 | 43 | impl TokenStream for PreTokenizedStream { 44 | fn advance(&mut self) -> bool { 45 | self.current_token += 1; 46 | self.current_token < self.tokenized_string.tokens.len() as i64 47 | } 48 | 49 | fn token(&self) -> &Token { 50 | assert!( 51 | self.current_token >= 0, 52 | "TokenStream not initialized. You should call advance() at least once." 53 | ); 54 | &self.tokenized_string.tokens[self.current_token as usize] 55 | } 56 | 57 | fn token_mut(&mut self) -> &mut Token { 58 | assert!( 59 | self.current_token >= 0, 60 | "TokenStream not initialized. You should call advance() at least once." 61 | ); 62 | &mut self.tokenized_string.tokens[self.current_token as usize] 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | 69 | use super::*; 70 | use crate::tokenizer::Token; 71 | 72 | #[test] 73 | fn test_tokenized_stream() { 74 | let tok_text = PreTokenizedString { 75 | text: String::from("A a"), 76 | tokens: vec![ 77 | Token { 78 | offset_from: 0, 79 | offset_to: 1, 80 | position: 0, 81 | text: String::from("A"), 82 | position_length: 1, 83 | }, 84 | Token { 85 | offset_from: 2, 86 | offset_to: 3, 87 | position: 1, 88 | text: String::from("a"), 89 | position_length: 1, 90 | }, 91 | ], 92 | }; 93 | 94 | let mut token_stream = PreTokenizedStream::from(tok_text.clone()); 95 | 96 | for expected_token in tok_text.tokens { 97 | assert!(token_stream.advance()); 98 | assert_eq!(token_stream.token(), &expected_token); 99 | } 100 | assert!(!token_stream.advance()); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/tokenizer/tokenizer_manager.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::{Arc, RwLock}; 3 | 4 | use crate::tokenizer::stemmer::Language; 5 | use crate::tokenizer::tokenizer::TextAnalyzer; 6 | use crate::tokenizer::{ 7 | LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer, 8 | }; 9 | 10 | /// The tokenizer manager serves as a store for 11 | /// all of the pre-configured tokenizer pipelines. 12 | /// 13 | /// By default, it is populated with the following managers. 14 | /// 15 | /// * `raw` : does not process nor tokenize the text. 16 | /// * `default` : Chops the text on according to whitespace and 17 | /// punctuation, removes tokens that are too long, and lowercases 18 | /// tokens 19 | /// * `en_stem` : Like `default`, but also applies stemming on the 20 | /// resulting tokens. Stemming can improve the recall of your 21 | /// search engine. 22 | #[derive(Clone)] 23 | pub struct TokenizerManager { 24 | tokenizers: Arc>>, 25 | } 26 | 27 | impl TokenizerManager { 28 | /// Registers a new tokenizer associated with a given name. 
29 | pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
30 | where TextAnalyzer: From<T> {
31 | let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
32 | self.tokenizers
33 | .write()
34 | .expect("Acquiring the lock should never fail")
35 | .insert(tokenizer_name.to_string(), boxed_tokenizer);
36 | }
37 |
38 | /// Accesses a tokenizer given its name.
39 | pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
40 | self.tokenizers
41 | .read()
42 | .expect("Acquiring the lock should never fail")
43 | .get(tokenizer_name)
44 | .cloned()
45 | }
46 | }
47 |
48 | impl Default for TokenizerManager {
49 | /// Creates a `TokenizerManager` prepopulated with
50 | /// the default pre-configured tokenizers of `tantivy`.
51 | /// - simple
52 | /// - en_stem
53 | /// - ja
54 | fn default() -> TokenizerManager {
55 | let manager = TokenizerManager {
56 | tokenizers: Arc::new(RwLock::new(HashMap::new())),
57 | };
58 | manager.register("raw", RawTokenizer);
59 | manager.register(
60 | "default",
61 | TextAnalyzer::from(SimpleTokenizer)
62 | .filter(RemoveLongFilter::limit(40))
63 | .filter(LowerCaser),
64 | );
65 | manager.register(
66 | "en_stem",
67 | TextAnalyzer::from(SimpleTokenizer)
68 | .filter(RemoveLongFilter::limit(40))
69 | .filter(LowerCaser)
70 | .filter(Stemmer::new(Language::English)),
71 | );
72 | manager.register("whitespace", WhitespaceTokenizer);
73 | manager
74 | }
75 | }
76 |
-------------------------------------------------------------------------------- /src/tokenizer/whitespace_tokenizer.rs: --------------------------------------------------------------------------------
1 | use std::str::CharIndices;
2 |
3 | use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
4 |
5 | /// Tokenize the text by splitting on whitespaces.
6 | #[derive(Clone)]
7 | pub struct WhitespaceTokenizer;
8 |
9 | pub struct WhitespaceTokenStream<'a> {
10 | text: &'a str,
11 | chars: CharIndices<'a>,
12 | token: Token,
13 | }
14 |
15 | impl Tokenizer for WhitespaceTokenizer {
16 | fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
17 | BoxTokenStream::from(WhitespaceTokenStream {
18 | text,
19 | chars: text.char_indices(),
20 | token: Token::default(),
21 | })
22 | }
23 | }
24 |
25 | impl<'a> WhitespaceTokenStream<'a> {
26 | // search for the end of the current token.
27 | fn search_token_end(&mut self) -> usize { 28 | (&mut self.chars) 29 | .filter(|&(_, ref c)| c.is_ascii_whitespace()) 30 | .map(|(offset, _)| offset) 31 | .next() 32 | .unwrap_or(self.text.len()) 33 | } 34 | } 35 | 36 | impl<'a> TokenStream for WhitespaceTokenStream<'a> { 37 | fn advance(&mut self) -> bool { 38 | self.token.text.clear(); 39 | self.token.position = self.token.position.wrapping_add(1); 40 | while let Some((offset_from, c)) = self.chars.next() { 41 | if !c.is_ascii_whitespace() { 42 | let offset_to = self.search_token_end(); 43 | self.token.offset_from = offset_from; 44 | self.token.offset_to = offset_to; 45 | self.token.text.push_str(&self.text[offset_from..offset_to]); 46 | return true; 47 | } 48 | } 49 | false 50 | } 51 | 52 | fn token(&self) -> &Token { 53 | &self.token 54 | } 55 | 56 | fn token_mut(&mut self) -> &mut Token { 57 | &mut self.token 58 | } 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use crate::tokenizer::tests::assert_token; 64 | use crate::tokenizer::{TextAnalyzer, Token, WhitespaceTokenizer}; 65 | 66 | #[test] 67 | fn test_whitespace_tokenizer() { 68 | let tokens = token_stream_helper("Hello, happy tax payer!"); 69 | assert_eq!(tokens.len(), 4); 70 | assert_token(&tokens[0], 0, "Hello,", 0, 6); 71 | assert_token(&tokens[1], 1, "happy", 7, 12); 72 | assert_token(&tokens[2], 2, "tax", 13, 16); 73 | assert_token(&tokens[3], 3, "payer!", 17, 23); 74 | } 75 | 76 | fn token_stream_helper(text: &str) -> Vec { 77 | let a = TextAnalyzer::from(WhitespaceTokenizer); 78 | let mut token_stream = a.token_stream(text); 79 | let mut tokens: Vec = vec![]; 80 | let mut add_token = |token: &Token| { 81 | tokens.push(token.clone()); 82 | }; 83 | token_stream.process(&mut add_token); 84 | tokens 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /tests/mod.rs: -------------------------------------------------------------------------------- 1 | mod failpoints; 2 | --------------------------------------------------------------------------------
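Note (editorial sketch, not part of the repository): the tokenizers and filters shown in this section compose into the pipelines that `TokenizerManager::default()` registers above. A minimal sketch of the `en_stem` chain, assuming the usual `tantivy::tokenizer` re-exports:

use tantivy::tokenizer::*;

// Same chain as the "en_stem" entry in TokenizerManager::default():
// split on whitespace/punctuation, drop very long tokens, lowercase, then stem.
let analyzer = TextAnalyzer::from(SimpleTokenizer)
    .filter(RemoveLongFilter::limit(40))
    .filter(LowerCaser)
    .filter(Stemmer::new(Language::English));

let mut stream = analyzer.token_stream("Searching SEARCHED searches");
let mut texts: Vec<String> = Vec::new();
while stream.advance() {
    texts.push(stream.token().text.clone());
}
// Lowercased and stemmed, all three variants should collapse to "search".
assert_eq!(texts, vec!["search", "search", "search"]);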