├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── actions.md │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── dependabot.yml └── workflows │ ├── coverage.yml │ ├── long_running.yml │ └── test.yml ├── .gitignore ├── ARCHITECTURE.md ├── AUTHORS ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── appveyor.yml ├── benches ├── alice.txt ├── analyzer.rs ├── hdfs.json └── index-bench.rs ├── bitpacker ├── Cargo.toml ├── benches │ └── bench.rs └── src │ ├── bitpacker.rs │ ├── blocked_bitpacker.rs │ └── lib.rs ├── ci ├── before_deploy.ps1 ├── before_deploy.sh ├── install.sh └── script.sh ├── common ├── Cargo.toml └── src │ ├── bitset.rs │ ├── lib.rs │ ├── serialize.rs │ ├── vint.rs │ └── writer.rs ├── doc ├── .gitignore ├── book.toml └── src │ ├── SUMMARY.md │ ├── avant-propos.md │ ├── basis.md │ ├── best_practise.md.rs │ ├── examples.md │ ├── facetting.md │ ├── faq.md │ ├── index_sorting.md │ ├── innerworkings.md │ ├── inverted_index.md │ ├── json.md │ └── schema.md ├── examples ├── aggregation.rs ├── basic_search.rs ├── custom_collector.rs ├── custom_tokenizer.rs ├── deleting_updating_documents.rs ├── faceted_search.rs ├── faceted_search_with_tweaked_score.rs ├── integer_range_search.rs ├── iterating_docs_and_positions.rs ├── json_field.rs ├── multiple_producer.rs ├── pre_tokenized_text.rs ├── snippet.rs ├── stop_words.rs ├── warmer.rs └── working_with_json.rs ├── fastfield_codecs ├── Cargo.toml ├── README.md ├── benches │ └── bench.rs └── src │ ├── bitpacked.rs │ ├── lib.rs │ ├── linearinterpol.rs │ ├── main.rs │ └── multilinearinterpol.rs ├── ownedbytes ├── Cargo.toml └── src │ └── lib.rs ├── query-grammar ├── Cargo.toml ├── README.md └── src │ ├── lib.rs │ ├── occur.rs │ ├── query_grammar.rs │ └── user_input_ast.rs ├── run-tests.sh ├── rustfmt.toml ├── src ├── aggregation │ ├── README.md │ ├── agg_req.rs │ ├── agg_req_with_accessor.rs │ ├── agg_result.rs │ ├── bucket │ │ ├── histogram │ │ │ ├── histogram.rs │ │ │ └── mod.rs │ │ ├── mod.rs │ │ └── range.rs │ ├── collector.rs │ ├── intermediate_agg_result.rs │ ├── metric │ │ ├── average.rs │ │ ├── mod.rs │ │ └── stats.rs │ ├── mod.rs │ └── segment_agg_result.rs ├── collector │ ├── count_collector.rs │ ├── custom_score_top_collector.rs │ ├── docset_collector.rs │ ├── facet_collector.rs │ ├── filter_collector_wrapper.rs │ ├── histogram_collector.rs │ ├── mod.rs │ ├── multi_collector.rs │ ├── tests.rs │ ├── top_collector.rs │ ├── top_score_collector.rs │ └── tweak_score_top_collector.rs ├── core │ ├── executor.rs │ ├── index.rs │ ├── index_meta.rs │ ├── inverted_index_reader.rs │ ├── mod.rs │ ├── searcher.rs │ ├── segment.rs │ ├── segment_component.rs │ ├── segment_id.rs │ └── segment_reader.rs ├── directory │ ├── composite_file.rs │ ├── directory.rs │ ├── directory_lock.rs │ ├── error.rs │ ├── file_slice.rs │ ├── file_watcher.rs │ ├── footer.rs │ ├── managed_directory.rs │ ├── mmap_directory.rs │ ├── mod.rs │ ├── ram_directory.rs │ ├── tests.rs │ └── watch_event_router.rs ├── docset.rs ├── error.rs ├── fastfield │ ├── alive_bitset.rs │ ├── bytes │ │ ├── mod.rs │ │ ├── reader.rs │ │ └── writer.rs │ ├── error.rs │ ├── facet_reader.rs │ ├── mod.rs │ ├── multivalued │ │ ├── mod.rs │ │ ├── reader.rs │ │ └── writer.rs │ ├── reader.rs │ ├── readers.rs │ ├── serializer │ │ └── mod.rs │ └── writer.rs ├── fieldnorm │ ├── code.rs │ ├── mod.rs │ ├── reader.rs │ ├── serializer.rs │ └── writer.rs ├── functional_test.rs ├── future_result.rs ├── indexer │ ├── delete_queue.rs │ ├── demuxer.rs │ ├── 
doc_id_mapping.rs │ ├── doc_opstamp_mapping.rs │ ├── index_writer.rs │ ├── index_writer_status.rs │ ├── json_term_writer.rs │ ├── log_merge_policy.rs │ ├── merge_operation.rs │ ├── merge_policy.rs │ ├── merger.rs │ ├── merger_sorted_index_test.rs │ ├── mod.rs │ ├── operation.rs │ ├── prepared_commit.rs │ ├── segment_entry.rs │ ├── segment_manager.rs │ ├── segment_register.rs │ ├── segment_serializer.rs │ ├── segment_updater.rs │ ├── segment_writer.rs │ └── stamper.rs ├── lib.rs ├── macros.rs ├── positions │ ├── mod.rs │ ├── reader.rs │ └── serializer.rs ├── postings │ ├── block_search.rs │ ├── block_segment_postings.rs │ ├── compression │ │ ├── mod.rs │ │ └── vint.rs │ ├── indexing_context.rs │ ├── json_postings_writer.rs │ ├── mod.rs │ ├── per_field_postings_writer.rs │ ├── postings.rs │ ├── postings_writer.rs │ ├── recorder.rs │ ├── segment_postings.rs │ ├── serializer.rs │ ├── skip.rs │ ├── stacker │ │ ├── expull.rs │ │ ├── memory_arena.rs │ │ ├── mod.rs │ │ └── term_hashmap.rs │ └── term_info.rs ├── query │ ├── all_query.rs │ ├── automaton_weight.rs │ ├── bitset │ │ └── mod.rs │ ├── bm25.rs │ ├── boolean_query │ │ ├── block_wand.rs │ │ ├── boolean_query.rs │ │ ├── boolean_weight.rs │ │ └── mod.rs │ ├── boost_query.rs │ ├── empty_query.rs │ ├── exclude.rs │ ├── explanation.rs │ ├── fuzzy_query.rs │ ├── intersection.rs │ ├── mod.rs │ ├── more_like_this │ │ ├── mod.rs │ │ ├── more_like_this.rs │ │ └── query.rs │ ├── phrase_query │ │ ├── mod.rs │ │ ├── phrase_query.rs │ │ ├── phrase_scorer.rs │ │ └── phrase_weight.rs │ ├── query.rs │ ├── query_parser │ │ ├── logical_ast.rs │ │ ├── mod.rs │ │ └── query_parser.rs │ ├── range_query.rs │ ├── regex_query.rs │ ├── reqopt_scorer.rs │ ├── score_combiner.rs │ ├── scorer.rs │ ├── term_query │ │ ├── mod.rs │ │ ├── term_query.rs │ │ ├── term_scorer.rs │ │ └── term_weight.rs │ ├── union.rs │ ├── vec_docset.rs │ └── weight.rs ├── reader │ ├── mod.rs │ ├── pool.rs │ └── warming.rs ├── schema │ ├── bytes_options.rs │ ├── document.rs │ ├── facet.rs │ ├── facet_options.rs │ ├── field.rs │ ├── field_entry.rs │ ├── field_type.rs │ ├── field_value.rs │ ├── flags.rs │ ├── index_record_option.rs │ ├── json_object_options.rs │ ├── mod.rs │ ├── named_field_document.rs │ ├── numeric_options.rs │ ├── schema.rs │ ├── term.rs │ ├── text_options.rs │ └── value.rs ├── snippet │ └── mod.rs ├── space_usage │ └── mod.rs ├── store │ ├── compression_brotli.rs │ ├── compression_lz4_block.rs │ ├── compression_snap.rs │ ├── compressors.rs │ ├── footer.rs │ ├── index │ │ ├── block.rs │ │ ├── mod.rs │ │ ├── skip_index.rs │ │ └── skip_index_builder.rs │ ├── mod.rs │ ├── reader.rs │ └── writer.rs ├── termdict │ ├── fst_termdict │ │ ├── merger.rs │ │ ├── mod.rs │ │ ├── streamer.rs │ │ ├── term_info_store.rs │ │ └── termdict.rs │ ├── mod.rs │ ├── sstable_termdict │ │ ├── merger.rs │ │ ├── mod.rs │ │ ├── sstable │ │ │ ├── block_reader.rs │ │ │ ├── delta.rs │ │ │ ├── merge │ │ │ │ ├── heap_merge.rs │ │ │ │ └── mod.rs │ │ │ ├── mod.rs │ │ │ ├── sstable_index.rs │ │ │ ├── value.rs │ │ │ └── vint.rs │ │ ├── streamer.rs │ │ └── termdict.rs │ └── tests.rs └── tokenizer │ ├── alphanum_only.rs │ ├── ascii_folding_filter.rs │ ├── empty_tokenizer.rs │ ├── facet_tokenizer.rs │ ├── lower_caser.rs │ ├── mod.rs │ ├── ngram_tokenizer.rs │ ├── raw_tokenizer.rs │ ├── remove_long.rs │ ├── simple_tokenizer.rs │ ├── stemmer.rs │ ├── stop_word_filter.rs │ ├── tokenized_string.rs │ ├── tokenizer.rs │ ├── tokenizer_manager.rs │ └── whitespace_tokenizer.rs └── tests ├── failpoints └── mod.rs └── mod.rs 
/.gitattributes: -------------------------------------------------------------------------------- 1 | cpp/* linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: fulmicoton 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/actions.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Actions 3 | about: Actions not directly related to producing code. 4 | 5 | --- 6 | 7 | # Actions title 8 | 9 | Action description. 10 | e.g. 11 | - benchmark 12 | - investigate and report 13 | - etc. 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | 5 | --- 6 | 7 | **Describe the bug** 8 | - What did you do? 9 | - What happened? 10 | - What was expected? 11 | 12 | **Which version of tantivy are you using?** 13 | If "master", ideally give the specific sha1 revision. 14 | 15 | **To Reproduce** 16 | 17 | If your bug is deterministic, can you give a minimal reproducing code? 18 | Some bugs are not deterministic. Can you describe with precision in which context it happened? 19 | If this is possible, can you share your code? 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | 5 | --- 6 | 7 | **Is your feature request related to a problem? Please describe.** 8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 9 | 10 | **Describe the solution you'd like** 11 | A clear and concise description of what you want to happen. 12 | 13 | **[Optional] describe alternatives you've considered** 14 | A clear and concise description of any alternative solutions or features you've considered. 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask any question about tantivy's usage... 4 | 5 | --- 6 | 7 | Try to be specific about your use case... 
8 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: cargo 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "20:00" 8 | open-pull-requests-limit: 10 9 | 10 | - package-ecosystem: "github-actions" 11 | directory: "/" 12 | schedule: 13 | interval: daily 14 | time: "20:00" 15 | open-pull-requests-limit: 10 16 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yml: -------------------------------------------------------------------------------- 1 | name: Coverage 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | coverage: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Install Rust 15 | run: rustup toolchain install nightly --component llvm-tools-preview 16 | - name: Install cargo-llvm-cov 17 | run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin 18 | - name: Generate code coverage 19 | run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info 20 | - name: Upload coverage to Codecov 21 | uses: codecov/codecov-action@v2 22 | with: 23 | token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos 24 | files: lcov.info 25 | fail_ci_if_error: true 26 | -------------------------------------------------------------------------------- /.github/workflows/long_running.yml: -------------------------------------------------------------------------------- 1 | name: Long running tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | env: 8 | CARGO_TERM_COLOR: always 9 | NUM_FUNCTIONAL_TEST_ITERATIONS: 20000 10 | 11 | jobs: 12 | functional_test_unsorted: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Run indexing_unsorted 17 | run: cargo test indexing_unsorted -- --ignored 18 | functional_test_sorted: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Run indexing_sorted 23 | run: cargo test indexing_sorted -- --ignored 24 | 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Unit tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | test: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose --workspace 21 | - name: Install latest nightly to test also against unstable feature flag 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | toolchain: nightly 25 | override: true 26 | components: rustfmt 27 | 28 | - name: Install latest nightly to test also against unstable feature flag 29 | uses: actions-rs/toolchain@v1 30 | with: 31 | toolchain: stable 32 | override: true 33 | components: rustfmt, clippy 34 | 35 | - name: Run tests 36 | run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace 37 | 38 | - name: Run tests quickwit feature 39 | run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace 40 | 41 | - name: Check Formatting 42 
| run: cargo +nightly fmt --all -- --check 43 | 44 | - uses: actions-rs/clippy-check@v1 45 | with: 46 | toolchain: stable 47 | token: ${{ secrets.GITHUB_TOKEN }} 48 | args: --tests 49 | 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tantivy.iml 2 | .cargo 3 | proptest-regressions 4 | *.swp 5 | target 6 | target/debug 7 | .vscode 8 | target/release 9 | Cargo.lock 10 | benchmark 11 | .DS_Store 12 | cpp/simdcomp/bitpackingbenchmark 13 | *.bk 14 | .idea 15 | trace.dat 16 | cargo-timing* 17 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of authors of tantivy for copyright purposes. 2 | Paul Masurel 3 | Laurentiu Nicola 4 | Dru Sellers 5 | Ashley Mannix 6 | Michael J. Curry 7 | Jason Wolfe 8 | # As an employee of Google I am required to add Google LLC 9 | # in the list of authors, but this project is not affiliated to Google 10 | # in any other way. 11 | Google LLC 12 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy" 3 | version = "0.17.0" 4 | authors = ["Paul Masurel "] 5 | license = "MIT" 6 | categories = ["database-implementations", "data-structures"] 7 | description = """Search engine library""" 8 | documentation = "https://docs.rs/tantivy/" 9 | homepage = "https://github.com/quickwit-oss/tantivy" 10 | repository = "https://github.com/quickwit-oss/tantivy" 11 | readme = "README.md" 12 | keywords = ["search", "information", "retrieval"] 13 | edition = "2018" 14 | 15 | [dependencies] 16 | oneshot = "0.1" 17 | base64 = "0.13" 18 | byteorder = "1.4.3" 19 | crc32fast = "1.2.1" 20 | once_cell = "1.7.2" 21 | regex ={ version = "1.5.4", default-features = false, features = ["std"] } 22 | tantivy-fst = "0.3" 23 | memmap2 = {version = "0.5", optional=true} 24 | lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true } 25 | brotli = { version = "3.3", optional = true } 26 | snap = { version = "1.0.5", optional = true } 27 | tempfile = { version = "3.2", optional = true } 28 | log = "0.4.14" 29 | serde = { version = "1.0.126", features = ["derive"] } 30 | serde_json = "1.0.64" 31 | num_cpus = "1.13" 32 | fs2={ version = "0.4.3", optional = true } 33 | levenshtein_automata = "0.2" 34 | uuid = { version = "0.8.2", features = ["v4", "serde"] } 35 | crossbeam = "0.8.1" 36 | tantivy-query-grammar = { version="0.15.0", path="./query-grammar" } 37 | tantivy-bitpacker = { version="0.1", path="./bitpacker" } 38 | common = { version = "0.2", path = "./common/", package = "tantivy-common" } 39 | fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false } 40 | ownedbytes = { version="0.2", path="./ownedbytes" } 41 | stable_deref_trait = "1.2" 42 | rust-stemmers = "1.2" 43 | downcast-rs = "1.2" 44 | bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] } 45 | census = "0.4" 46 | fnv = "1.0.7" 47 | thiserror = "1.0.24" 48 | htmlescape = "0.3.1" 49 | fail = "0.5" 50 | murmurhash32 = "0.2" 51 | chrono = "0.4.19" 52 | smallvec = "1.6.1" 53 | rayon = "1.5" 54 | lru = "0.7.0" 55 | fastdivide = "0.4" 56 | itertools = "0.10.0" 57 | measure_time = "0.8.0" 58 | 
pretty_assertions = "1.1.0" 59 | serde_cbor = {version="0.11", optional=true} 60 | async-trait = "0.1" 61 | 62 | [target.'cfg(windows)'.dependencies] 63 | winapi = "0.3.9" 64 | 65 | [dev-dependencies] 66 | rand = "0.8.3" 67 | maplit = "1.0.2" 68 | matches = "0.1.8" 69 | proptest = "1.0" 70 | criterion = "0.3.5" 71 | test-log = "0.2.8" 72 | env_logger = "0.9.0" 73 | pprof = {version= "0.7", features=["flamegraph", "criterion"]} 74 | futures = "0.3.15" 75 | 76 | [dev-dependencies.fail] 77 | version = "0.5" 78 | features = ["failpoints"] 79 | 80 | [profile.release] 81 | opt-level = 3 82 | debug = false 83 | debug-assertions = false 84 | 85 | [profile.test] 86 | debug-assertions = true 87 | overflow-checks = true 88 | 89 | [features] 90 | default = ["mmap", "lz4-compression" ] 91 | mmap = ["fs2", "tempfile", "memmap2"] 92 | 93 | brotli-compression = ["brotli"] 94 | lz4-compression = ["lz4_flex"] 95 | snappy-compression = ["snap"] 96 | 97 | failpoints = ["fail/failpoints"] 98 | unstable = [] # useful for benches. 99 | 100 | quickwit = ["serde_cbor"] 101 | 102 | [workspace] 103 | members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes"] 104 | 105 | # Following the "fail" crate best practises, we isolate 106 | # tests that define specific behavior in fail check points 107 | # in a different binary. 108 | # 109 | # We do that because, fail rely on a global definition of 110 | # failpoints behavior and hence, it is incompatible with 111 | # multithreading. 112 | [[test]] 113 | name = "failpoints" 114 | path = "tests/failpoints/mod.rs" 115 | required-features = ["fail/failpoints"] 116 | 117 | [[bench]] 118 | name = "analyzer" 119 | harness = false 120 | 121 | [[bench]] 122 | name = "index-bench" 123 | harness = false 124 | 125 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 by the project authors, as listed in the AUTHORS file. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | echo "Run test only... No examples." 
3 | cargo test --tests --lib 4 | 5 | fmt: 6 | cargo +nightly fmt --all 7 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # Appveyor configuration template for Rust using rustup for Rust installation 2 | # https://github.com/starkat99/appveyor-rust 3 | 4 | os: Visual Studio 2015 5 | environment: 6 | matrix: 7 | - channel: stable 8 | target: x86_64-pc-windows-msvc 9 | 10 | install: 11 | - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe 12 | - rustup-init -yv --default-toolchain %channel% --default-host %target% 13 | - set PATH=%PATH%;%USERPROFILE%\.cargo\bin 14 | - if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin 15 | - rustc -vV 16 | - cargo -vV 17 | 18 | build: false 19 | 20 | test_script: 21 | - REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap 22 | - REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap 23 | - REM SET RUST_BACKTRACE=1 & cargo build --examples 24 | -------------------------------------------------------------------------------- /benches/analyzer.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use tantivy::tokenizer::TokenizerManager; 3 | 4 | const ALICE_TXT: &str = include_str!("alice.txt"); 5 | 6 | pub fn criterion_benchmark(c: &mut Criterion) { 7 | let tokenizer_manager = TokenizerManager::default(); 8 | let tokenizer = tokenizer_manager.get("default").unwrap(); 9 | c.bench_function("default-tokenize-alice", |b| { 10 | b.iter(|| { 11 | let mut word_count = 0; 12 | let mut token_stream = tokenizer.token_stream(ALICE_TXT); 13 | while token_stream.advance() { 14 | word_count += 1; 15 | } 16 | assert_eq!(word_count, 30_731); 17 | }) 18 | }); 19 | } 20 | 21 | criterion_group!(benches, criterion_benchmark); 22 | criterion_main!(benches); 23 | -------------------------------------------------------------------------------- /bitpacker/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy-bitpacker" 3 | version = "0.1.1" 4 | edition = "2018" 5 | authors = ["Paul Masurel "] 6 | license = "MIT" 7 | categories = [] 8 | description = """Tantivy-sub crate: bitpacking""" 9 | repository = "https://github.com/quickwit-oss/tantivy" 10 | keywords = [] 11 | 12 | 13 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 14 | 15 | [dependencies] 16 | -------------------------------------------------------------------------------- /bitpacker/benches/bench.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | #[cfg(test)] 6 | mod tests { 7 | use tantivy_bitpacker::BlockedBitpacker; 8 | use test::Bencher; 9 | 10 | #[bench] 11 | fn bench_blockedbitp_read(b: &mut Bencher) { 12 | let mut blocked_bitpacker = BlockedBitpacker::new(); 13 | for val in 0..=21500 { 14 | blocked_bitpacker.add(val * val); 15 | } 16 | b.iter(|| { 17 | let mut out = 0; 18 | for val in 0..=21500 { 19 | out = blocked_bitpacker.get(val); 20 | } 21 | out 22 | }); 23 | } 24 | 25 | #[bench] 26 | fn bench_blockedbitp_create(b: &mut Bencher) { 27 | 
b.iter(|| { 28 | let mut blocked_bitpacker = BlockedBitpacker::new(); 29 | for val in 0..=21500 { 30 | blocked_bitpacker.add(val * val); 31 | } 32 | blocked_bitpacker 33 | }); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /bitpacker/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod bitpacker; 2 | mod blocked_bitpacker; 3 | 4 | pub use crate::bitpacker::{BitPacker, BitUnpacker}; 5 | pub use crate::blocked_bitpacker::BlockedBitpacker; 6 | 7 | /// Computes the number of bits that will be used for bitpacking. 8 | /// 9 | /// In general the target is the minimum number of bits 10 | /// required to express the amplitude given in argument. 11 | /// 12 | /// e.g. If the amplitude is 10, we can store all ints on simply 4bits. 13 | /// 14 | /// The logic is slightly more convoluted here as for optimization 15 | /// reasons, we want to ensure that a value spawns over at most 8 bytes 16 | /// of aligned bytes. 17 | /// 18 | /// Spanning over 9 bytes is possible for instance, if we do 19 | /// bitpacking with an amplitude of 63 bits. 20 | /// In this case, the second int will start on bit 21 | /// 63 (which belongs to byte 7) and ends at byte 15; 22 | /// Hence 9 bytes (from byte 7 to byte 15 included). 23 | /// 24 | /// To avoid this, we force the number of bits to 64bits 25 | /// when the result is greater than `64-8 = 56 bits`. 26 | /// 27 | /// Note that this only affects rare use cases spawning over 28 | /// a very large range of values. Even in this case, it results 29 | /// in an extra cost of at most 12% compared to the optimal 30 | /// number of bits. 31 | pub fn compute_num_bits(n: u64) -> u8 { 32 | let amplitude = (64u32 - n.leading_zeros()) as u8; 33 | if amplitude <= 64 - 8 { 34 | amplitude 35 | } else { 36 | 64 37 | } 38 | } 39 | 40 | pub fn minmax(mut vals: I) -> Option<(T, T)> 41 | where 42 | I: Iterator, 43 | T: Copy + Ord, 44 | { 45 | if let Some(first_el) = vals.next() { 46 | return Some(vals.fold((first_el, first_el), |(min_val, max_val), el| { 47 | (min_val.min(el), max_val.max(el)) 48 | })); 49 | } 50 | None 51 | } 52 | 53 | #[test] 54 | fn test_compute_num_bits() { 55 | assert_eq!(compute_num_bits(1), 1u8); 56 | assert_eq!(compute_num_bits(0), 0u8); 57 | assert_eq!(compute_num_bits(2), 2u8); 58 | assert_eq!(compute_num_bits(3), 2u8); 59 | assert_eq!(compute_num_bits(4), 3u8); 60 | assert_eq!(compute_num_bits(255), 8u8); 61 | assert_eq!(compute_num_bits(256), 9u8); 62 | assert_eq!(compute_num_bits(5_000_000_000), 33u8); 63 | } 64 | 65 | #[test] 66 | fn test_minmax_empty() { 67 | let vals: Vec = vec![]; 68 | assert_eq!(minmax(vals.into_iter()), None); 69 | } 70 | 71 | #[test] 72 | fn test_minmax_one() { 73 | assert_eq!(minmax(vec![1].into_iter()), Some((1, 1))); 74 | } 75 | 76 | #[test] 77 | fn test_minmax_two() { 78 | assert_eq!(minmax(vec![1, 2].into_iter()), Some((1, 2))); 79 | assert_eq!(minmax(vec![2, 1].into_iter()), Some((1, 2))); 80 | } 81 | -------------------------------------------------------------------------------- /ci/before_deploy.ps1: -------------------------------------------------------------------------------- 1 | # This script takes care of packaging the build artifacts that will go in the 2 | # release zipfile 3 | 4 | $SRC_DIR = $PWD.Path 5 | $STAGE = [System.Guid]::NewGuid().ToString() 6 | 7 | Set-Location $ENV:Temp 8 | New-Item -Type Directory -Name $STAGE 9 | Set-Location $STAGE 10 | 11 | $ZIP = 
"$SRC_DIR\$($Env:CRATE_NAME)-$($Env:APPVEYOR_REPO_TAG_NAME)-$($Env:TARGET).zip" 12 | 13 | # TODO Update this to package the right artifacts 14 | Copy-Item "$SRC_DIR\target\$($Env:TARGET)\release\hello.exe" '.\' 15 | 16 | 7z a "$ZIP" * 17 | 18 | Push-AppveyorArtifact "$ZIP" 19 | 20 | Remove-Item *.* -Force 21 | Set-Location .. 22 | Remove-Item $STAGE 23 | Set-Location $SRC_DIR 24 | -------------------------------------------------------------------------------- /ci/before_deploy.sh: -------------------------------------------------------------------------------- 1 | # This script takes care of building your crate and packaging it for release 2 | 3 | set -ex 4 | 5 | main() { 6 | local src=$(pwd) \ 7 | stage= 8 | 9 | case $TRAVIS_OS_NAME in 10 | linux) 11 | stage=$(mktemp -d) 12 | ;; 13 | osx) 14 | stage=$(mktemp -d -t tmp) 15 | ;; 16 | esac 17 | 18 | test -f Cargo.lock || cargo generate-lockfile 19 | 20 | # TODO Update this to build the artifacts that matter to you 21 | cross rustc --bin hello --target $TARGET --release -- -C lto 22 | 23 | # TODO Update this to package the right artifacts 24 | cp target/$TARGET/release/hello $stage/ 25 | 26 | cd $stage 27 | tar czf $src/$CRATE_NAME-$TRAVIS_TAG-$TARGET.tar.gz * 28 | cd $src 29 | 30 | rm -rf $stage 31 | } 32 | 33 | main 34 | -------------------------------------------------------------------------------- /ci/install.sh: -------------------------------------------------------------------------------- 1 | set -ex 2 | 3 | main() { 4 | local target= 5 | if [ $TRAVIS_OS_NAME = linux ]; then 6 | target=x86_64-unknown-linux-musl 7 | sort=sort 8 | else 9 | target=x86_64-apple-darwin 10 | sort=gsort # for `sort --sort-version`, from brew's coreutils. 11 | fi 12 | 13 | # Builds for iOS are done on OSX, but require the specific target to be 14 | # installed. 15 | case $TARGET in 16 | aarch64-apple-ios) 17 | rustup target install aarch64-apple-ios 18 | ;; 19 | armv7-apple-ios) 20 | rustup target install armv7-apple-ios 21 | ;; 22 | armv7s-apple-ios) 23 | rustup target install armv7s-apple-ios 24 | ;; 25 | i386-apple-ios) 26 | rustup target install i386-apple-ios 27 | ;; 28 | x86_64-apple-ios) 29 | rustup target install x86_64-apple-ios 30 | ;; 31 | esac 32 | 33 | # This fetches latest stable release 34 | local tag=$(git ls-remote --tags --refs --exit-code https://github.com/japaric/cross \ 35 | | cut -d/ -f3 \ 36 | | grep -E '^v[0.1.0-9.]+$' \ 37 | | $sort --version-sort \ 38 | | tail -n1) 39 | curl -LSfs https://japaric.github.io/trust/install.sh | \ 40 | sh -s -- \ 41 | --force \ 42 | --git japaric/cross \ 43 | --tag $tag \ 44 | --target $target 45 | } 46 | 47 | main 48 | -------------------------------------------------------------------------------- /ci/script.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script takes care of testing your crate 4 | 5 | set -ex 6 | 7 | main() { 8 | if [ ! -z $CODECOV ]; then 9 | echo "Codecov" 10 | cargo build --verbose && cargo coverage --verbose --all && bash <(curl -s https://codecov.io/bash) -s target/kcov 11 | else 12 | echo "Build" 13 | cross build --target $TARGET 14 | if [ ! 
-z $DISABLE_TESTS ]; then 15 | return 16 | fi 17 | echo "Test" 18 | cross test --target $TARGET --no-default-features --features mmap 19 | cross test --target $TARGET --no-default-features --features mmap query-grammar 20 | fi 21 | for example in $(ls examples/*.rs) 22 | do 23 | cargo run --example $(basename $example .rs) 24 | done 25 | } 26 | 27 | # we don't run the "test phase" when doing deploys 28 | if [ -z $TRAVIS_TAG ]; then 29 | main 30 | fi 31 | -------------------------------------------------------------------------------- /common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy-common" 3 | version = "0.2.0" 4 | authors = ["Paul Masurel ", "Pascal Seitz "] 5 | license = "MIT" 6 | edition = "2018" 7 | description = "common traits and utility functions used by multiple tantivy subcrates" 8 | 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | byteorder = "1.4.3" 13 | ownedbytes = { version="0.2", path="../ownedbytes" } 14 | 15 | [dev-dependencies] 16 | proptest = "1.0.0" 17 | rand = "0.8.4" 18 | -------------------------------------------------------------------------------- /common/src/writer.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, BufWriter, Write}; 2 | 3 | pub struct CountingWriter { 4 | underlying: W, 5 | written_bytes: u64, 6 | } 7 | 8 | impl CountingWriter { 9 | pub fn wrap(underlying: W) -> CountingWriter { 10 | CountingWriter { 11 | underlying, 12 | written_bytes: 0, 13 | } 14 | } 15 | 16 | #[inline] 17 | pub fn written_bytes(&self) -> u64 { 18 | self.written_bytes 19 | } 20 | 21 | /// Returns the underlying write object. 22 | /// Note that this method does not trigger any flushing. 23 | #[inline] 24 | pub fn finish(self) -> W { 25 | self.underlying 26 | } 27 | } 28 | 29 | impl Write for CountingWriter { 30 | #[inline] 31 | fn write(&mut self, buf: &[u8]) -> io::Result { 32 | let written_size = self.underlying.write(buf)?; 33 | self.written_bytes += written_size as u64; 34 | Ok(written_size) 35 | } 36 | 37 | #[inline] 38 | fn write_all(&mut self, buf: &[u8]) -> io::Result<()> { 39 | self.underlying.write_all(buf)?; 40 | self.written_bytes += buf.len() as u64; 41 | Ok(()) 42 | } 43 | 44 | #[inline] 45 | fn flush(&mut self) -> io::Result<()> { 46 | self.underlying.flush() 47 | } 48 | } 49 | 50 | impl TerminatingWrite for CountingWriter { 51 | #[inline] 52 | fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> { 53 | self.underlying.terminate_ref(token) 54 | } 55 | } 56 | 57 | /// Struct used to prevent from calling 58 | /// [`terminate_ref`](trait.TerminatingWrite.html#tymethod.terminate_ref) directly 59 | /// 60 | /// The point is that while the type is public, it cannot be built by anyone 61 | /// outside of this module. 62 | pub struct AntiCallToken(()); 63 | 64 | /// Trait used to indicate when no more write need to be done on a writer 65 | pub trait TerminatingWrite: Write { 66 | /// Indicate that the writer will no longer be used. Internally call terminate_ref. 67 | fn terminate(mut self) -> io::Result<()> 68 | where Self: Sized { 69 | self.terminate_ref(AntiCallToken(())) 70 | } 71 | 72 | /// You should implement this function to define custom behavior. 73 | /// This function should flush any buffer it may hold. 
74 | fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>; 75 | } 76 | 77 | impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> { 78 | fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> { 79 | self.as_mut().terminate_ref(token) 80 | } 81 | } 82 | 83 | impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> { 84 | fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> { 85 | self.flush()?; 86 | self.get_mut().terminate_ref(a) 87 | } 88 | } 89 | 90 | impl<'a> TerminatingWrite for &'a mut Vec<u8> { 91 | fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> { 92 | self.flush() 93 | } 94 | } 95 | 96 | #[cfg(test)] 97 | mod test { 98 | 99 | use std::io::Write; 100 | 101 | use super::CountingWriter; 102 | 103 | #[test] 104 | fn test_counting_writer() { 105 | let buffer: Vec<u8> = vec![]; 106 | let mut counting_writer = CountingWriter::wrap(buffer); 107 | let bytes = (0u8..10u8).collect::<Vec<u8>>(); 108 | counting_writer.write_all(&bytes).unwrap(); 109 | let len = counting_writer.written_bytes(); 110 | let buffer_restituted: Vec<u8> = counting_writer.finish(); 111 | assert_eq!(len, 10u64); 112 | assert_eq!(buffer_restituted.len(), 10); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | book 2 | -------------------------------------------------------------------------------- /doc/book.toml: -------------------------------------------------------------------------------- 1 | [book] 2 | authors = ["Paul Masurel"] 3 | multilingual = false 4 | src = "src" 5 | title = "Tantivy, the user guide" 6 | -------------------------------------------------------------------------------- /doc/src/SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | 4 | 5 | [Avant Propos](./avant-propos.md) 6 | 7 | - [Segments](./basis.md) 8 | - [Defining your schema](./schema.md) 9 | - [Facetting](./facetting.md) 10 | - [Index Sorting](./index_sorting.md) 11 | - [Innerworkings](./innerworkings.md) 12 | - [Inverted index](./inverted_index.md) 13 | - [Best practise](./inverted_index.md) 14 | 15 | [Frequently Asked Questions](./faq.md) 16 | [Examples](./examples.md) 17 | -------------------------------------------------------------------------------- /doc/src/avant-propos.md: -------------------------------------------------------------------------------- 1 | # Foreword, what is the scope of tantivy? 2 | 3 | > Tantivy is a **search** engine **library** for Rust. 4 | 5 | If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. tantivy is heavily inspired by Lucene's design and 6 | they both have the same scope and targeted use cases. 7 | 8 | If you are not familiar with Lucene, let's break down our little tagline. 9 | 10 | - **Search** here means full-text search: fundamentally, tantivy is here to help you 11 | efficiently identify the documents matching a given query in your corpus. 12 | But modern search UIs are so much more: text processing, facetting, autocomplete, fuzzy search, good 13 | relevancy, collapsing, highlighting, spatial search. 14 | 15 | While some of these features are not available in tantivy yet, all of these are relevant 16 | feature requests. Tantivy's objective is to offer a solid toolbox to create the best search 17 | experience. But keep in mind this is just a toolbox. 18 | Which brings us to the second keyword...
19 | 20 | - **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like Elasticsearch, for instance. 21 | 22 | Sometimes a feature will not be available in tantivy because it is too 23 | specific to your use case. By design, tantivy should make it possible to extend 24 | the available set of features using the existing rock-solid data structures. 25 | 26 | Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own 27 | `TokenFilter`... Some of your requirements may also be related to 28 | something closer to architecture or operations. For instance, you may 29 | want to build a large corpus on Hadoop, fine-tune the merge policy to keep your 30 | index sharded in a time-wise fashion, or you may want to convert an existing 31 | index from a different format. 32 | 33 | Tantivy exposes a lot of low-level APIs to do all of these things. 34 | 35 | -------------------------------------------------------------------------------- /doc/src/best_practise.md.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skiff-org/tantivy/46d5de920dd1ac86fa7a74baa0debd933bcb6574/doc/src/best_practise.md.rs -------------------------------------------------------------------------------- /doc/src/examples.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | - [Basic search](/examples/basic_search.html) -------------------------------------------------------------------------------- /doc/src/facetting.md: -------------------------------------------------------------------------------- 1 | # Facetting 2 | 3 | wewew 4 | 5 | ## weeewe 6 | -------------------------------------------------------------------------------- /doc/src/faq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skiff-org/tantivy/46d5de920dd1ac86fa7a74baa0debd933bcb6574/doc/src/faq.md -------------------------------------------------------------------------------- /doc/src/index_sorting.md: -------------------------------------------------------------------------------- 1 | 2 | - [Index Sorting](#index-sorting) 3 | + [Why Sorting](#why-sorting) 4 | * [Compression](#compression) 5 | * [Top-N Optimization](#top-n-optimization) 6 | * [Pruning](#pruning) 7 | * [Other](#other) 8 | + [Usage](#usage) 9 | 10 | # Index Sorting 11 | 12 | Tantivy allows you to sort the index according to a property. 13 | 14 | ## Why Sorting 15 | 16 | Presorting an index has several advantages: 17 | 18 | ###### Compression 19 | 20 | When data is sorted, it is easier to compress. E.g. the number sequence [5, 2, 3, 1, 4] would be sorted to [1, 2, 3, 4, 5]. 21 | With delta encoding, the unsorted list becomes [5, -3, 1, -2, 3], while the sorted list becomes [1, 1, 1, 1, 1]. 22 | The compression gain mainly affects the fast field of the sorted property; everything else is likely unaffected. 23 | ###### Top-N Optimization 24 | 25 | When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents. 26 | E.g. if the data is sorted by timestamp and we want the top-n newest docs containing a term, we can simply leverage the order of the doc ids. 27 | 28 | Note: Tantivy 0.16 does not do this optimization yet. 29 | 30 | ###### Pruning 31 | 32 | Let's say we want all documents matching the filter `>= 2010-08-11`.
When the data is sorted, we can do a lookup in the fast field to find the doc id range and use it as the filter. 33 | 34 | Note: Tantivy 0.16 does not do this optimization yet. 35 | 36 | ###### Other? 37 | 38 | In principle, there are many possible algorithms that could exploit the monotonically increasing nature of the data (aggregations, maybe?). 39 | 40 | ## Usage 41 | Index sorting can be configured by setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to an `IndexBuilder`. As of tantivy 0.16, only fast fields are allowed to be used. 42 | 43 | ``` 44 | let settings = IndexSettings { 45 | sort_by_field: Some(IndexSortByField { 46 | field: "intval".to_string(), 47 | order: Order::Desc, 48 | }), 49 | ..Default::default() 50 | }; 51 | let mut index_builder = Index::builder().schema(schema); 52 | index_builder = index_builder.settings(settings); 53 | let index = index_builder.create_in_ram().unwrap(); 54 | ``` 55 | 56 | ## Implementation details 57 | 58 | Sorting an index is applied in the serialization step. In general, there are two serialization steps: [Finishing a single segment](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/segment_writer.rs#L338) and [merging multiple segments](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/indexer/merger.rs#L1073). 59 | 60 | In both cases, we generate a doc id mapping reflecting the sort. This mapping is used when serializing the different components (doc store, fast fields, posting lists, fieldnorms, facets). 61 | 62 | -------------------------------------------------------------------------------- /doc/src/innerworkings.md: -------------------------------------------------------------------------------- 1 | # Innerworkings 2 | -------------------------------------------------------------------------------- /doc/src/inverted_index.md: -------------------------------------------------------------------------------- 1 | # Inverted index 2 | -------------------------------------------------------------------------------- /doc/src/schema.md: -------------------------------------------------------------------------------- 1 | # Defining your schema 2 | -------------------------------------------------------------------------------- /examples/faceted_search_with_tweaked_score.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use tantivy::collector::TopDocs; 4 | use tantivy::query::BooleanQuery; 5 | use tantivy::schema::*; 6 | use tantivy::{doc, DocId, Index, Score, SegmentReader}; 7 | 8 | fn main() -> tantivy::Result<()> { 9 | let mut schema_builder = Schema::builder(); 10 | 11 | let title = schema_builder.add_text_field("title", STORED); 12 | let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default()); 13 | 14 | let schema = schema_builder.build(); 15 | let index = Index::create_in_ram(schema); 16 | 17 | let mut index_writer = index.writer(30_000_000)?; 18 | 19 | index_writer.add_document(doc!( 20 | title => "Fried egg", 21 | ingredient => Facet::from("/ingredient/egg"), 22 | ingredient => Facet::from("/ingredient/oil"), 23 | ))?; 24 | index_writer.add_document(doc!( 25 | title => "Scrambled egg", 26 | ingredient => Facet::from("/ingredient/egg"), 27 | ingredient => Facet::from("/ingredient/butter"), 28 | ingredient => Facet::from("/ingredient/milk"), 29 |
ingredient => Facet::from("/ingredient/salt"), 30 | ))?; 31 | index_writer.add_document(doc!( 32 | title => "Egg rolls", 33 | ingredient => Facet::from("/ingredient/egg"), 34 | ingredient => Facet::from("/ingredient/garlic"), 35 | ingredient => Facet::from("/ingredient/salt"), 36 | ingredient => Facet::from("/ingredient/oil"), 37 | ingredient => Facet::from("/ingredient/tortilla-wrap"), 38 | ingredient => Facet::from("/ingredient/mushroom"), 39 | ))?; 40 | index_writer.commit()?; 41 | 42 | let reader = index.reader()?; 43 | let searcher = reader.searcher(); 44 | { 45 | let facets = vec![ 46 | Facet::from("/ingredient/egg"), 47 | Facet::from("/ingredient/oil"), 48 | Facet::from("/ingredient/garlic"), 49 | Facet::from("/ingredient/mushroom"), 50 | ]; 51 | let query = BooleanQuery::new_multiterms_query( 52 | facets 53 | .iter() 54 | .map(|key| Term::from_facet(ingredient, key)) 55 | .collect(), 56 | ); 57 | let top_docs_by_custom_score = 58 | TopDocs::with_limit(2).tweak_score(move |segment_reader: &SegmentReader| { 59 | let ingredient_reader = segment_reader.facet_reader(ingredient).unwrap(); 60 | let facet_dict = ingredient_reader.facet_dict(); 61 | 62 | let query_ords: HashSet = facets 63 | .iter() 64 | .filter_map(|key| facet_dict.term_ord(key.encoded_str()).unwrap()) 65 | .collect(); 66 | 67 | let mut facet_ords_buffer: Vec = Vec::with_capacity(20); 68 | 69 | move |doc: DocId, original_score: Score| { 70 | ingredient_reader.facet_ords(doc, &mut facet_ords_buffer); 71 | let missing_ingredients = facet_ords_buffer 72 | .iter() 73 | .filter(|ord| !query_ords.contains(ord)) 74 | .count(); 75 | let tweak = 1.0 / 4_f32.powi(missing_ingredients as i32); 76 | 77 | original_score * tweak 78 | } 79 | }); 80 | let top_docs = searcher.search(&query, &top_docs_by_custom_score)?; 81 | 82 | let titles: Vec = top_docs 83 | .iter() 84 | .map(|(_, doc_id)| { 85 | searcher 86 | .doc(*doc_id) 87 | .unwrap() 88 | .get_first(title) 89 | .unwrap() 90 | .as_text() 91 | .unwrap() 92 | .to_owned() 93 | }) 94 | .collect(); 95 | assert_eq!(titles, vec!["Fried egg", "Egg rolls"]); 96 | } 97 | Ok(()) 98 | } 99 | -------------------------------------------------------------------------------- /examples/integer_range_search.rs: -------------------------------------------------------------------------------- 1 | // # Searching a range on an indexed int field. 2 | // 3 | // Below is an example of creating an indexed integer field in your schema 4 | // You can use RangeQuery to get a Count of all occurrences in a given range. 5 | use tantivy::collector::Count; 6 | use tantivy::query::RangeQuery; 7 | use tantivy::schema::{Schema, INDEXED}; 8 | use tantivy::{doc, Index, Result}; 9 | 10 | fn main() -> Result<()> { 11 | // For the sake of simplicity, this schema will only have 1 field 12 | let mut schema_builder = Schema::builder(); 13 | 14 | // `INDEXED` is a short-hand to indicate that our field should be "searchable". 15 | let year_field = schema_builder.add_u64_field("year", INDEXED); 16 | let schema = schema_builder.build(); 17 | let index = Index::create_in_ram(schema); 18 | let reader = index.reader()?; 19 | { 20 | let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?; 21 | for year in 1950u64..2019u64 { 22 | index_writer.add_document(doc!(year_field => year))?; 23 | } 24 | index_writer.commit()?; 25 | // The index will be a range of years 26 | } 27 | reader.reload()?; 28 | let searcher = reader.searcher(); 29 | // The end is excluded i.e. 
here we are searching up to 1969 30 | let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970); 31 | // Uses a Count collector to sum the total number of docs in the range 32 | let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?; 33 | assert_eq!(num_60s_books, 10); 34 | Ok(()) 35 | } 36 | -------------------------------------------------------------------------------- /examples/json_field.rs: -------------------------------------------------------------------------------- 1 | // # Json field example 2 | // 3 | // This example shows how the json field can be used 4 | // to make tantivy partially schemaless. 5 | 6 | use tantivy::collector::{Count, TopDocs}; 7 | use tantivy::query::QueryParser; 8 | use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT}; 9 | use tantivy::Index; 10 | 11 | fn main() -> tantivy::Result<()> { 12 | // # Defining the schema 13 | // 14 | // We need two fields: 15 | // - a timestamp 16 | // - a json object field 17 | let mut schema_builder = Schema::builder(); 18 | schema_builder.add_date_field("timestamp", FAST | STORED); 19 | let event_type = schema_builder.add_text_field("event_type", STRING | STORED); 20 | let attributes = schema_builder.add_json_field("attributes", STORED | TEXT); 21 | let schema = schema_builder.build(); 22 | 23 | // # Indexing documents 24 | let index = Index::create_in_ram(schema.clone()); 25 | 26 | let mut index_writer = index.writer(50_000_000)?; 27 | let doc = schema.parse_document( 28 | r#"{ 29 | "timestamp": "2022-02-22T23:20:50.53Z", 30 | "event_type": "click", 31 | "attributes": { 32 | "target": "submit-button", 33 | "cart": {"product_id": 103}, 34 | "description": "the best vacuum cleaner ever" 35 | } 36 | }"#, 37 | )?; 38 | index_writer.add_document(doc)?; 39 | let doc = schema.parse_document( 40 | r#"{ 41 | "timestamp": "2022-02-22T23:20:51.53Z", 42 | "event_type": "click", 43 | "attributes": { 44 | "target": "submit-button", 45 | "cart": {"product_id": 133}, 46 | "description": "das keyboard" 47 | } 48 | }"#, 49 | )?; 50 | index_writer.add_document(doc)?; 51 | index_writer.commit()?; 52 | 53 | let reader = index.reader()?; 54 | let searcher = reader.searcher(); 55 | 56 | let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]); 57 | { 58 | let query = query_parser.parse_query("target:submit-button")?; 59 | let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?; 60 | assert_eq!(count_docs.len(), 2); 61 | } 62 | { 63 | let query = query_parser.parse_query("target:submit")?; 64 | let count_docs = searcher.search(&*query, &TopDocs::with_limit(2))?; 65 | assert_eq!(count_docs.len(), 2); 66 | } 67 | { 68 | let query = query_parser.parse_query("cart.product_id:103")?; 69 | let count_docs = searcher.search(&*query, &Count)?; 70 | assert_eq!(count_docs, 1); 71 | } 72 | { 73 | let query = query_parser 74 | .parse_query("event_type:click AND cart.product_id:133") 75 | .unwrap(); 76 | let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap(); 77 | assert_eq!(hits.len(), 1); 78 | } 79 | Ok(()) 80 | } 81 | -------------------------------------------------------------------------------- /examples/snippet.rs: -------------------------------------------------------------------------------- 1 | // # Snippet example 2 | // 3 | // This example shows how to return a representative snippet of 4 | // your hit result. 5 | // Snippet are an extracted of a target document, and returned in HTML format. 6 | // The keyword searched by the user are highlighted with a `` tag. 
7 | 8 | // --- 9 | // Importing tantivy... 10 | use tantivy::collector::TopDocs; 11 | use tantivy::query::QueryParser; 12 | use tantivy::schema::*; 13 | use tantivy::{doc, Index, Snippet, SnippetGenerator}; 14 | use tempfile::TempDir; 15 | 16 | fn main() -> tantivy::Result<()> { 17 | // Let's create a temporary directory for the 18 | // sake of this example 19 | let index_path = TempDir::new()?; 20 | 21 | // # Defining the schema 22 | let mut schema_builder = Schema::builder(); 23 | let title = schema_builder.add_text_field("title", TEXT | STORED); 24 | let body = schema_builder.add_text_field("body", TEXT | STORED); 25 | let schema = schema_builder.build(); 26 | 27 | // # Indexing documents 28 | let index = Index::create_in_dir(&index_path, schema)?; 29 | 30 | let mut index_writer = index.writer(50_000_000)?; 31 | 32 | // we'll only need one doc for this example. 33 | index_writer.add_document(doc!( 34 | title => "Of Mice and Men", 35 | body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ 36 | bank and runs deep and green. The water is warm too, for it has slipped twinkling \ 37 | over the yellow sands in the sunlight before reaching the narrow pool. On one \ 38 | side of the river the golden foothill slopes curve up to the strong and rocky \ 39 | Gabilan Mountains, but on the valley side the water is lined with trees—willows \ 40 | fresh and green with every spring, carrying in their lower leaf junctures the \ 41 | debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ 42 | limbs and branches that arch over the pool" 43 | ))?; 44 | // ... 45 | index_writer.commit()?; 46 | 47 | let reader = index.reader()?; 48 | let searcher = reader.searcher(); 49 | let query_parser = QueryParser::for_index(&index, vec![title, body]); 50 | let query = query_parser.parse_query("sycamore spring")?; 51 | 52 | let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; 53 | 54 | let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; 55 | 56 | for (score, doc_address) in top_docs { 57 | let doc = searcher.doc(doc_address)?; 58 | let snippet = snippet_generator.snippet_from_doc(&doc); 59 | println!("Document score {}:", score); 60 | println!( 61 | "title: {}", 62 | doc.get_first(title).unwrap().as_text().unwrap() 63 | ); 64 | println!("snippet: {}", snippet.to_html()); 65 | println!("custom highlighting: {}", highlight(snippet)); 66 | } 67 | 68 | Ok(()) 69 | } 70 | 71 | fn highlight(snippet: Snippet) -> String { 72 | let mut result = String::new(); 73 | let mut start_from = 0; 74 | 75 | for fragment_range in snippet.highlighted() { 76 | result.push_str(&snippet.fragment()[start_from..fragment_range.start]); 77 | result.push_str(" --> "); 78 | result.push_str(&snippet.fragment()[fragment_range.clone()]); 79 | result.push_str(" <-- "); 80 | start_from = fragment_range.end; 81 | } 82 | 83 | result.push_str(&snippet.fragment()[start_from..]); 84 | result 85 | } 86 | -------------------------------------------------------------------------------- /examples/working_with_json.rs: -------------------------------------------------------------------------------- 1 | use tantivy::schema::*; 2 | 3 | // # Document from json 4 | // 5 | // For convenience, `Document` can be parsed directly from json. 6 | fn main() -> tantivy::Result<()> { 7 | // Let's first define a schema and an index. 8 | // Check out the basic example if this is confusing to you. 9 | // 10 | // first we need to define a schema ... 
11 | let mut schema_builder = Schema::builder(); 12 | schema_builder.add_text_field("title", TEXT | STORED); 13 | schema_builder.add_text_field("body", TEXT); 14 | schema_builder.add_u64_field("year", INDEXED); 15 | let schema = schema_builder.build(); 16 | 17 | // Let's assume we have a json-serialized document. 18 | let mice_and_men_doc_json = r#"{ 19 | "title": "Of Mice and Men", 20 | "year": 1937 21 | }"#; 22 | 23 | // We can parse our document 24 | let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?; 25 | 26 | // Multi-valued field are allowed, they are 27 | // expressed in JSON by an array. 28 | // The following document has two titles. 29 | let frankenstein_json = r#"{ 30 | "title": ["Frankenstein", "The Modern Prometheus"], 31 | "year": 1818 32 | }"#; 33 | let _frankenstein_doc = schema.parse_document(frankenstein_json)?; 34 | 35 | // Note that the schema is saved in your index directory. 36 | // 37 | // As a result, Indexes are aware of their schema, and you can use this feature 38 | // just by opening an existing `Index`, and calling `index.schema()..parse_document(json)`. 39 | Ok(()) 40 | } 41 | -------------------------------------------------------------------------------- /fastfield_codecs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fastfield_codecs" 3 | version = "0.1.0" 4 | authors = ["Pascal Seitz "] 5 | license = "MIT" 6 | edition = "2018" 7 | description = "Fast field codecs used by tantivy" 8 | 9 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 10 | 11 | [dependencies] 12 | common = { version = "0.2", path = "../common/", package = "tantivy-common" } 13 | tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" } 14 | prettytable-rs = {version="0.8.0", optional= true} 15 | rand = {version="0.8.3", optional= true} 16 | 17 | [dev-dependencies] 18 | more-asserts = "0.2.1" 19 | rand = "0.8.3" 20 | 21 | [features] 22 | bin = ["prettytable-rs", "rand"] 23 | default = ["bin"] 24 | 25 | -------------------------------------------------------------------------------- /fastfield_codecs/benches/bench.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | 5 | #[cfg(test)] 6 | mod tests { 7 | use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer}; 8 | use fastfield_codecs::linearinterpol::{ 9 | LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer, 10 | }; 11 | use fastfield_codecs::multilinearinterpol::{ 12 | MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer, 13 | }; 14 | use fastfield_codecs::*; 15 | 16 | fn get_data() -> Vec { 17 | let mut data: Vec<_> = (100..55000_u64) 18 | .map(|num| num + rand::random::() as u64) 19 | .collect(); 20 | data.push(99_000); 21 | data.insert(1000, 2000); 22 | data.insert(2000, 100); 23 | data.insert(3000, 4100); 24 | data.insert(4000, 100); 25 | data.insert(5000, 800); 26 | data 27 | } 28 | 29 | fn value_iter() -> impl Iterator { 30 | 0..20_000 31 | } 32 | fn bench_get( 33 | b: &mut Bencher, 34 | data: &[u64], 35 | ) { 36 | let mut bytes = vec![]; 37 | S::serialize( 38 | &mut bytes, 39 | &data, 40 | stats_from_vec(data), 41 | data.iter().cloned(), 42 | data.iter().cloned(), 43 | ) 44 | .unwrap(); 45 | let reader = R::open_from_bytes(&bytes).unwrap(); 46 | b.iter(|| { 47 | for pos in value_iter() { 48 | reader.get_u64(pos as u64, &bytes); 49 | } 50 | 
}); 51 | } 52 | fn bench_create(b: &mut Bencher, data: &[u64]) { 53 | let mut bytes = vec![]; 54 | b.iter(|| { 55 | S::serialize( 56 | &mut bytes, 57 | &data, 58 | stats_from_vec(data), 59 | data.iter().cloned(), 60 | data.iter().cloned(), 61 | ) 62 | .unwrap(); 63 | }); 64 | } 65 | 66 | use test::Bencher; 67 | #[bench] 68 | fn bench_fastfield_bitpack_create(b: &mut Bencher) { 69 | let data: Vec<_> = get_data(); 70 | bench_create::(b, &data); 71 | } 72 | #[bench] 73 | fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { 74 | let data: Vec<_> = get_data(); 75 | bench_create::(b, &data); 76 | } 77 | #[bench] 78 | fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { 79 | let data: Vec<_> = get_data(); 80 | bench_create::(b, &data); 81 | } 82 | #[bench] 83 | fn bench_fastfield_bitpack_get(b: &mut Bencher) { 84 | let data: Vec<_> = get_data(); 85 | bench_get::(b, &data); 86 | } 87 | #[bench] 88 | fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { 89 | let data: Vec<_> = get_data(); 90 | bench_get::(b, &data); 91 | } 92 | #[bench] 93 | fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { 94 | let data: Vec<_> = get_data(); 95 | bench_get::( 96 | b, &data, 97 | ); 98 | } 99 | pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { 100 | let min_value = data.iter().cloned().min().unwrap_or(0); 101 | let max_value = data.iter().cloned().max().unwrap_or(0); 102 | FastFieldStats { 103 | min_value, 104 | max_value, 105 | num_vals: data.len() as u64, 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /ownedbytes/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Paul Masurel ", "Pascal Seitz "] 3 | name = "ownedbytes" 4 | version = "0.2.0" 5 | edition = "2018" 6 | description = "Expose data as static slice" 7 | license = "MIT" 8 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 9 | 10 | [dependencies] 11 | stable_deref_trait = "1.2.0" 12 | -------------------------------------------------------------------------------- /query-grammar/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "tantivy-query-grammar" 3 | version = "0.15.0" 4 | authors = ["Paul Masurel "] 5 | license = "MIT" 6 | categories = ["database-implementations", "data-structures"] 7 | description = """Search engine library""" 8 | homepage = "https://github.com/quickwit-oss/tantivy" 9 | repository = "https://github.com/quickwit-oss/tantivy" 10 | readme = "README.md" 11 | keywords = ["search", "information", "retrieval"] 12 | edition = "2018" 13 | 14 | [dependencies] 15 | combine = {version="4", default-features=false, features=[] } 16 | once_cell = "1.7.2" 17 | regex ={ version = "1.5.4", default-features = false, features = ["std"] } 18 | -------------------------------------------------------------------------------- /query-grammar/README.md: -------------------------------------------------------------------------------- 1 | # Tantivy Query Grammar 2 | 3 | This crate is used by tantivy to parse queries. 
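A minimal usage sketch: `parse_query` and `UserInputAst` are the entry points re-exported from `src/lib.rs`, and the exact syntax accepted is defined by `query_grammar.rs`.

```rust
use tantivy_query_grammar::{parse_query, UserInputAst};

fn main() {
    // Turn a raw user query into the grammar's AST. The `+` / `-` prefixes map
    // onto the `Occur::Must` / `Occur::MustNot` modifiers defined in `occur.rs`.
    let result: Result<UserInputAst, _> = parse_query("+title:frankenstein -year:1818");
    assert!(result.is_ok());

    // Callers (tantivy's query parser, for instance) then lower the AST into an
    // executable query against a concrete schema.
}
```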
4 | -------------------------------------------------------------------------------- /query-grammar/src/lib.rs: -------------------------------------------------------------------------------- 1 | mod occur; 2 | mod query_grammar; 3 | mod user_input_ast; 4 | use combine::parser::Parser; 5 | 6 | pub use crate::occur::Occur; 7 | use crate::query_grammar::parse_to_ast; 8 | pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; 9 | 10 | pub struct Error; 11 | 12 | pub fn parse_query(query: &str) -> Result { 13 | let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?; 14 | Ok(user_input_ast) 15 | } 16 | -------------------------------------------------------------------------------- /query-grammar/src/occur.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::fmt::Write; 3 | 4 | /// Defines whether a term in a query must be present, 5 | /// should be present or must be not present. 6 | #[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)] 7 | pub enum Occur { 8 | /// For a given document to be considered for scoring, 9 | /// at least one of the document with the Should or the Must 10 | /// Occur constraint must be within the document. 11 | Should, 12 | /// Document without the term are excluded from the search. 13 | Must, 14 | /// Document that contain the term are excluded from the 15 | /// search. 16 | MustNot, 17 | } 18 | 19 | impl Occur { 20 | /// Returns the one-char prefix symbol for this `Occur`. 21 | /// - `Should` => '?', 22 | /// - `Must` => '+' 23 | /// - `Not` => '-' 24 | fn to_char(self) -> char { 25 | match self { 26 | Occur::Should => '?', 27 | Occur::Must => '+', 28 | Occur::MustNot => '-', 29 | } 30 | } 31 | 32 | /// Compose two occur values. 
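    ///
    /// Composing gives the effective `Occur` of a clause nested inside another
    /// clause. For illustration (the same cases are covered by the unit test below):
    ///
    /// ```rust
    /// use tantivy_query_grammar::Occur;
    ///
    /// // A negation nested inside a negation becomes a requirement.
    /// assert_eq!(Occur::compose(Occur::MustNot, Occur::MustNot), Occur::Must);
    /// // `Should` never overrides the other operand.
    /// assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must);
    /// ```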
33 | pub fn compose(left: Occur, right: Occur) -> Occur { 34 | match (left, right) { 35 | (Occur::Should, _) => right, 36 | (Occur::Must, Occur::MustNot) => Occur::MustNot, 37 | (Occur::Must, _) => Occur::Must, 38 | (Occur::MustNot, Occur::MustNot) => Occur::Must, 39 | (Occur::MustNot, _) => Occur::MustNot, 40 | } 41 | } 42 | } 43 | 44 | impl fmt::Display for Occur { 45 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 46 | f.write_char(self.to_char()) 47 | } 48 | } 49 | 50 | #[cfg(test)] 51 | mod test { 52 | use crate::Occur; 53 | 54 | #[test] 55 | fn test_occur_compose() { 56 | assert_eq!(Occur::compose(Occur::Should, Occur::Should), Occur::Should); 57 | assert_eq!(Occur::compose(Occur::Should, Occur::Must), Occur::Must); 58 | assert_eq!( 59 | Occur::compose(Occur::Should, Occur::MustNot), 60 | Occur::MustNot 61 | ); 62 | assert_eq!(Occur::compose(Occur::Must, Occur::Should), Occur::Must); 63 | assert_eq!(Occur::compose(Occur::Must, Occur::Must), Occur::Must); 64 | assert_eq!(Occur::compose(Occur::Must, Occur::MustNot), Occur::MustNot); 65 | assert_eq!( 66 | Occur::compose(Occur::MustNot, Occur::Should), 67 | Occur::MustNot 68 | ); 69 | assert_eq!(Occur::compose(Occur::MustNot, Occur::Must), Occur::MustNot); 70 | assert_eq!(Occur::compose(Occur::MustNot, Occur::MustNot), Occur::Must); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cargo test 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | comment_width = 120 2 | format_strings = true 3 | group_imports = "StdExternalCrate" 4 | imports_granularity = "Module" 5 | normalize_comments = true 6 | where_single_line = true 7 | wrap_comments = true 8 | -------------------------------------------------------------------------------- /src/aggregation/README.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When adding new bucket aggregation make sure to extend the "test_aggregation_flushing" test for at least 2 levels. 4 | 5 | 6 | 7 | # Code Organization 8 | 9 | Tantivy's aggregations have been designed to mimic the 10 | [aggregations of elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations.html). 11 | 12 | The code is organized in submodules: 13 | 14 | ## bucket 15 | Contains all bucket aggregations, like range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggegations. 16 | 17 | ## metric 18 | Contains all metric aggregations, like average aggregation. Metric aggregations do not have sub aggregations. 19 | 20 | #### agg_req 21 | agg_req contains the users aggregation request. Deserialization from json is compatible with elasticsearch aggregation requests. 22 | 23 | #### agg_req_with_accessor 24 | agg_req_with_accessor contains the users aggregation request enriched with fast field accessors etc, which are 25 | used during collection. 26 | 27 | #### segment_agg_result 28 | segment_agg_result contains the aggregation result tree, which is used for collection of a segment. 29 | The tree from agg_req_with_accessor is passed during collection. 30 | 31 | #### intermediate_agg_result 32 | intermediate_agg_result contains the aggregation tree for merging with other trees. 
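For orientation, a request with two levels (a bucket aggregation holding one metric sub-aggregation) looks roughly like this; field and aggregation names are illustrative, and the format follows the elasticsearch-style JSON that `agg_req` deserializes:

```json
{
  "score_ranges": {
    "range": {
      "field": "score",
      "ranges": [
        { "from": 3.0, "to": 7.0 },
        { "from": 7.0, "to": 20.0 }
      ]
    },
    "aggs": {
      "average_score": { "avg": { "field": "score" } }
    }
  }
}
```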
33 | 34 | #### agg_result 35 | agg_result contains the final aggregation tree. 36 | 37 | -------------------------------------------------------------------------------- /src/aggregation/bucket/histogram/mod.rs: -------------------------------------------------------------------------------- 1 | mod histogram; 2 | pub use histogram::*; 3 | -------------------------------------------------------------------------------- /src/aggregation/bucket/mod.rs: -------------------------------------------------------------------------------- 1 | //! Module for all bucket aggregations. 2 | //! 3 | //! BucketAggregations create buckets of documents 4 | //! [BucketAggregation](super::agg_req::BucketAggregation). 5 | //! 6 | //! Results of final buckets are [BucketResult](super::agg_result::BucketResult). 7 | //! Results of intermediate buckets are 8 | //! [IntermediateBucketResult](super::intermediate_agg_result::IntermediateBucketResult) 9 | 10 | mod histogram; 11 | mod range; 12 | 13 | pub(crate) use histogram::SegmentHistogramCollector; 14 | pub use histogram::*; 15 | pub(crate) use range::SegmentRangeCollector; 16 | pub use range::*; 17 | -------------------------------------------------------------------------------- /src/aggregation/metric/average.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::aggregation::f64_from_fastfield_u64; 6 | use crate::fastfield::{DynamicFastFieldReader, FastFieldReader}; 7 | use crate::schema::Type; 8 | use crate::DocId; 9 | 10 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 11 | /// A single-value metric aggregation that computes the average of numeric values that are 12 | /// extracted from the aggregated documents. 13 | /// Supported field types are u64, i64, and f64. 14 | /// See [super::SingleMetricResult] for return value. 15 | /// 16 | /// # JSON Format 17 | /// ```json 18 | /// { 19 | /// "avg": { 20 | /// "field": "score", 21 | /// } 22 | /// } 23 | /// ``` 24 | pub struct AverageAggregation { 25 | /// The field name to compute the stats on. 26 | pub field: String, 27 | } 28 | impl AverageAggregation { 29 | /// Create new AverageAggregation from a field. 30 | pub fn from_field_name(field_name: String) -> Self { 31 | AverageAggregation { field: field_name } 32 | } 33 | /// Return the field name. 
34 | pub fn field_name(&self) -> &str { 35 | &self.field 36 | } 37 | } 38 | 39 | #[derive(Clone, PartialEq)] 40 | pub(crate) struct SegmentAverageCollector { 41 | pub data: IntermediateAverage, 42 | field_type: Type, 43 | } 44 | 45 | impl Debug for SegmentAverageCollector { 46 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 47 | f.debug_struct("AverageCollector") 48 | .field("data", &self.data) 49 | .finish() 50 | } 51 | } 52 | 53 | impl SegmentAverageCollector { 54 | pub fn from_req(field_type: Type) -> Self { 55 | Self { 56 | field_type, 57 | data: Default::default(), 58 | } 59 | } 60 | pub(crate) fn collect_block(&mut self, doc: &[DocId], field: &DynamicFastFieldReader) { 61 | let mut iter = doc.chunks_exact(4); 62 | for docs in iter.by_ref() { 63 | let val1 = field.get(docs[0]); 64 | let val2 = field.get(docs[1]); 65 | let val3 = field.get(docs[2]); 66 | let val4 = field.get(docs[3]); 67 | let val1 = f64_from_fastfield_u64(val1, &self.field_type); 68 | let val2 = f64_from_fastfield_u64(val2, &self.field_type); 69 | let val3 = f64_from_fastfield_u64(val3, &self.field_type); 70 | let val4 = f64_from_fastfield_u64(val4, &self.field_type); 71 | self.data.collect(val1); 72 | self.data.collect(val2); 73 | self.data.collect(val3); 74 | self.data.collect(val4); 75 | } 76 | for doc in iter.remainder() { 77 | let val = field.get(*doc); 78 | let val = f64_from_fastfield_u64(val, &self.field_type); 79 | self.data.collect(val); 80 | } 81 | } 82 | } 83 | 84 | /// Contains mergeable version of average data. 85 | #[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)] 86 | pub struct IntermediateAverage { 87 | pub(crate) sum: f64, 88 | pub(crate) doc_count: u64, 89 | } 90 | 91 | impl IntermediateAverage { 92 | pub(crate) fn from_collector(collector: SegmentAverageCollector) -> Self { 93 | collector.data 94 | } 95 | 96 | /// Merge average data into this instance. 97 | pub fn merge_fruits(&mut self, other: IntermediateAverage) { 98 | self.sum += other.sum; 99 | self.doc_count += other.doc_count; 100 | } 101 | /// compute final result 102 | pub fn finalize(&self) -> Option { 103 | if self.doc_count == 0 { 104 | None 105 | } else { 106 | Some(self.sum / self.doc_count as f64) 107 | } 108 | } 109 | #[inline] 110 | fn collect(&mut self, val: f64) { 111 | self.doc_count += 1; 112 | self.sum += val; 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/aggregation/metric/mod.rs: -------------------------------------------------------------------------------- 1 | //! Module for all metric aggregations. 2 | //! 3 | //! The aggregations in this family compute metrics, see [super::agg_req::MetricAggregation] for 4 | //! details. 5 | mod average; 6 | mod stats; 7 | pub use average::*; 8 | use serde::{Deserialize, Serialize}; 9 | pub use stats::*; 10 | 11 | /// Single-metric aggregations use this common result structure. 12 | /// 13 | /// Main reason to wrap it in value is to match elasticsearch output structure. 14 | #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] 15 | pub struct SingleMetricResult { 16 | /// The value of the single value metric. 
17 | pub value: Option, 18 | } 19 | 20 | impl From for SingleMetricResult { 21 | fn from(value: f64) -> Self { 22 | Self { value: Some(value) } 23 | } 24 | } 25 | 26 | impl From> for SingleMetricResult { 27 | fn from(value: Option) -> Self { 28 | Self { value } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/collector/count_collector.rs: -------------------------------------------------------------------------------- 1 | use super::Collector; 2 | use crate::collector::SegmentCollector; 3 | use crate::{DocId, Score, SegmentOrdinal, SegmentReader}; 4 | 5 | /// `CountCollector` collector only counts how many 6 | /// documents match the query. 7 | /// 8 | /// ```rust 9 | /// use tantivy::collector::Count; 10 | /// use tantivy::query::QueryParser; 11 | /// use tantivy::schema::{Schema, TEXT}; 12 | /// use tantivy::{doc, Index}; 13 | /// 14 | /// let mut schema_builder = Schema::builder(); 15 | /// let title = schema_builder.add_text_field("title", TEXT); 16 | /// let schema = schema_builder.build(); 17 | /// let index = Index::create_in_ram(schema); 18 | /// 19 | /// let mut index_writer = index.writer(3_000_000).unwrap(); 20 | /// index_writer.add_document(doc!(title => "The Name of the Wind")).unwrap(); 21 | /// index_writer.add_document(doc!(title => "The Diary of Muadib")).unwrap(); 22 | /// index_writer.add_document(doc!(title => "A Dairy Cow")).unwrap(); 23 | /// index_writer.add_document(doc!(title => "The Diary of a Young Girl")).unwrap(); 24 | /// assert!(index_writer.commit().is_ok()); 25 | /// 26 | /// let reader = index.reader().unwrap(); 27 | /// let searcher = reader.searcher(); 28 | /// 29 | /// // Here comes the important part 30 | /// let query_parser = QueryParser::for_index(&index, vec![title]); 31 | /// let query = query_parser.parse_query("diary").unwrap(); 32 | /// let count = searcher.search(&query, &Count).unwrap(); 33 | /// 34 | /// assert_eq!(count, 2); 35 | /// ``` 36 | pub struct Count; 37 | 38 | impl Collector for Count { 39 | type Fruit = usize; 40 | 41 | type Child = SegmentCountCollector; 42 | 43 | fn for_segment( 44 | &self, 45 | _: SegmentOrdinal, 46 | _: &SegmentReader, 47 | ) -> crate::Result { 48 | Ok(SegmentCountCollector::default()) 49 | } 50 | 51 | fn requires_scoring(&self) -> bool { 52 | false 53 | } 54 | 55 | fn merge_fruits(&self, segment_counts: Vec) -> crate::Result { 56 | Ok(segment_counts.into_iter().sum()) 57 | } 58 | } 59 | 60 | #[derive(Default)] 61 | pub struct SegmentCountCollector { 62 | count: usize, 63 | } 64 | 65 | impl SegmentCollector for SegmentCountCollector { 66 | type Fruit = usize; 67 | 68 | fn collect(&mut self, _: DocId, _: Score) { 69 | self.count += 1; 70 | } 71 | 72 | fn harvest(self) -> usize { 73 | self.count 74 | } 75 | } 76 | 77 | #[cfg(test)] 78 | mod tests { 79 | use super::{Count, SegmentCountCollector}; 80 | use crate::collector::{Collector, SegmentCollector}; 81 | 82 | #[test] 83 | fn test_count_collect_does_not_requires_scoring() { 84 | assert!(!Count.requires_scoring()); 85 | } 86 | 87 | #[test] 88 | fn test_segment_count_collector() { 89 | { 90 | let count_collector = SegmentCountCollector::default(); 91 | assert_eq!(count_collector.harvest(), 0); 92 | } 93 | { 94 | let mut count_collector = SegmentCountCollector::default(); 95 | count_collector.collect(0u32, 1.0); 96 | assert_eq!(count_collector.harvest(), 1); 97 | } 98 | { 99 | let mut count_collector = SegmentCountCollector::default(); 100 | count_collector.collect(0u32, 1.0); 101 | 
assert_eq!(count_collector.harvest(), 1); 102 | } 103 | { 104 | let mut count_collector = SegmentCountCollector::default(); 105 | count_collector.collect(0u32, 1.0); 106 | count_collector.collect(1u32, 1.0); 107 | assert_eq!(count_collector.harvest(), 2); 108 | } 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/collector/docset_collector.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | 3 | use super::{Collector, SegmentCollector}; 4 | use crate::{DocAddress, DocId, Score}; 5 | 6 | /// Collectors that returns the set of DocAddress that matches the query. 7 | /// 8 | /// This collector is mostly useful for tests. 9 | pub struct DocSetCollector; 10 | 11 | impl Collector for DocSetCollector { 12 | type Fruit = HashSet; 13 | type Child = DocSetChildCollector; 14 | 15 | fn for_segment( 16 | &self, 17 | segment_local_id: crate::SegmentOrdinal, 18 | _segment: &crate::SegmentReader, 19 | ) -> crate::Result { 20 | Ok(DocSetChildCollector { 21 | segment_local_id, 22 | docs: HashSet::new(), 23 | }) 24 | } 25 | 26 | fn requires_scoring(&self) -> bool { 27 | false 28 | } 29 | 30 | fn merge_fruits( 31 | &self, 32 | segment_fruits: Vec<(u32, HashSet)>, 33 | ) -> crate::Result { 34 | let len: usize = segment_fruits.iter().map(|(_, docset)| docset.len()).sum(); 35 | let mut result = HashSet::with_capacity(len); 36 | for (segment_local_id, docs) in segment_fruits { 37 | for doc in docs { 38 | result.insert(DocAddress::new(segment_local_id, doc)); 39 | } 40 | } 41 | Ok(result) 42 | } 43 | } 44 | 45 | pub struct DocSetChildCollector { 46 | segment_local_id: u32, 47 | docs: HashSet, 48 | } 49 | 50 | impl SegmentCollector for DocSetChildCollector { 51 | type Fruit = (u32, HashSet); 52 | 53 | fn collect(&mut self, doc: crate::DocId, _score: Score) { 54 | self.docs.insert(doc); 55 | } 56 | 57 | fn harvest(self) -> (u32, HashSet) { 58 | (self.segment_local_id, self.docs) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/core/mod.rs: -------------------------------------------------------------------------------- 1 | mod executor; 2 | pub mod index; 3 | mod index_meta; 4 | mod inverted_index_reader; 5 | pub mod searcher; 6 | mod segment; 7 | mod segment_component; 8 | mod segment_id; 9 | mod segment_reader; 10 | 11 | use std::path::Path; 12 | 13 | use once_cell::sync::Lazy; 14 | 15 | pub use self::executor::Executor; 16 | pub use self::index::{Index, IndexBuilder}; 17 | pub use self::index_meta::{ 18 | IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta, SegmentMetaInventory, 19 | }; 20 | pub use self::inverted_index_reader::InvertedIndexReader; 21 | pub use self::searcher::{Searcher, SearcherGeneration}; 22 | pub use self::segment::Segment; 23 | pub use self::segment_component::SegmentComponent; 24 | pub use self::segment_id::SegmentId; 25 | pub use self::segment_reader::SegmentReader; 26 | 27 | /// The meta file contains all the information about the list of segments and the schema 28 | /// of the index. 29 | pub static META_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new("meta.json")); 30 | 31 | /// The managed file contains a list of files that were created by the tantivy 32 | /// and will therefore be garbage collected when they are deemed useless by tantivy. 
33 | /// 34 | /// Removing this file is safe, but will prevent the garbage collection of all of the file that 35 | /// are currently in the directory 36 | pub static MANAGED_FILEPATH: Lazy<&'static Path> = Lazy::new(|| Path::new(".managed.json")); 37 | -------------------------------------------------------------------------------- /src/core/segment.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::path::PathBuf; 3 | 4 | use super::SegmentComponent; 5 | use crate::core::{Index, SegmentId, SegmentMeta}; 6 | use crate::directory::error::{OpenReadError, OpenWriteError}; 7 | use crate::directory::{Directory, FileSlice, WritePtr}; 8 | use crate::schema::Schema; 9 | use crate::Opstamp; 10 | 11 | /// A segment is a piece of the index. 12 | #[derive(Clone)] 13 | pub struct Segment { 14 | index: Index, 15 | meta: SegmentMeta, 16 | } 17 | 18 | impl fmt::Debug for Segment { 19 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 20 | write!(f, "Segment({:?})", self.id().uuid_string()) 21 | } 22 | } 23 | 24 | impl Segment { 25 | /// Creates a new segment given an `Index` and a `SegmentId` 26 | pub(crate) fn for_index(index: Index, meta: SegmentMeta) -> Segment { 27 | Segment { index, meta } 28 | } 29 | 30 | /// Returns the index the segment belongs to. 31 | pub fn index(&self) -> &Index { 32 | &self.index 33 | } 34 | 35 | /// Returns our index's schema. 36 | pub fn schema(&self) -> Schema { 37 | self.index.schema() 38 | } 39 | 40 | /// Returns the segment meta-information 41 | pub fn meta(&self) -> &SegmentMeta { 42 | &self.meta 43 | } 44 | 45 | /// Updates the max_doc value from the `SegmentMeta`. 46 | /// 47 | /// This method is only used when updating `max_doc` from 0 48 | /// as we finalize a fresh new segment. 49 | pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment { 50 | Segment { 51 | index: self.index, 52 | meta: self.meta.with_max_doc(max_doc), 53 | } 54 | } 55 | 56 | #[doc(hidden)] 57 | #[must_use] 58 | pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment { 59 | Segment { 60 | index: self.index, 61 | meta: self.meta.with_delete_meta(num_deleted_docs, opstamp), 62 | } 63 | } 64 | 65 | /// Returns the segment's id. 66 | pub fn id(&self) -> SegmentId { 67 | self.meta.id() 68 | } 69 | 70 | /// Returns the relative path of a component of our segment. 71 | /// 72 | /// It just joins the segment id with the extension 73 | /// associated to a segment component. 74 | pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { 75 | self.meta.relative_path(component) 76 | } 77 | 78 | /// Open one of the component file for a *regular* read. 79 | pub fn open_read(&self, component: SegmentComponent) -> Result { 80 | let path = self.relative_path(component); 81 | self.index.directory().open_read(&path) 82 | } 83 | 84 | /// Open one of the component file for *regular* write. 85 | pub fn open_write(&mut self, component: SegmentComponent) -> Result { 86 | let path = self.relative_path(component); 87 | let write = self.index.directory_mut().open_write(&path)?; 88 | Ok(write) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/core/segment_component.rs: -------------------------------------------------------------------------------- 1 | use std::slice; 2 | 3 | /// Enum describing each component of a tantivy segment. 
4 | /// Each component is stored in its own file, 5 | /// using the pattern `segment_uuid`.`component_extension`, 6 | /// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension` 7 | #[derive(Copy, Clone, Eq, PartialEq)] 8 | pub enum SegmentComponent { 9 | /// Postings (or inverted list). Sorted lists of document ids, associated to terms 10 | Postings, 11 | /// Positions of terms in each document. 12 | Positions, 13 | /// Column-oriented random-access storage of fields. 14 | FastFields, 15 | /// Stores the sum of the length (in terms) of each field for each document. 16 | /// Field norms are stored as a special u64 fast field. 17 | FieldNorms, 18 | /// Dictionary associating `Term`s to `TermInfo`s which is 19 | /// simply an address into the `postings` file and the `positions` file. 20 | Terms, 21 | /// Row-oriented, compressed storage of the documents. 22 | /// Accessing a document from the store is relatively slow, as it 23 | /// requires to decompress the entire block it belongs to. 24 | Store, 25 | /// Temporary storage of the documents, before streamed to `Store`. 26 | TempStore, 27 | /// Bitset describing which document of the segment is deleted. 28 | Delete, 29 | } 30 | 31 | impl SegmentComponent { 32 | /// Iterates through the components. 33 | pub fn iterator() -> slice::Iter<'static, SegmentComponent> { 34 | static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [ 35 | SegmentComponent::Postings, 36 | SegmentComponent::Positions, 37 | SegmentComponent::FastFields, 38 | SegmentComponent::FieldNorms, 39 | SegmentComponent::Terms, 40 | SegmentComponent::Store, 41 | SegmentComponent::TempStore, 42 | SegmentComponent::Delete, 43 | ]; 44 | SEGMENT_COMPONENTS.iter() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/directory/directory_lock.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use once_cell::sync::Lazy; 4 | 5 | /// A directory lock. 6 | /// 7 | /// A lock is associated to a specific path and some 8 | /// [`LockParams`](./enum.LockParams.html). 9 | /// Tantivy itself uses only two locks but client application 10 | /// can use the directory facility to define their own locks. 11 | /// - [INDEX_WRITER_LOCK] 12 | /// - [META_LOCK] 13 | /// 14 | /// Check out these locks documentation for more information. 15 | #[derive(Debug)] 16 | pub struct Lock { 17 | /// The lock needs to be associated with its own file `path`. 18 | /// Depending on the platform, the lock might rely on the creation 19 | /// and deletion of this filepath. 20 | pub filepath: PathBuf, 21 | /// `lock_params` describes whether acquiring the lock is meant 22 | /// to be a blocking operation or a non-blocking. 23 | /// 24 | /// Acquiring a blocking lock blocks until the lock is 25 | /// available. 26 | /// Acquiring a blocking lock returns rapidly, either successfully 27 | /// or with an error signifying that someone is already holding 28 | /// the lock. 29 | pub is_blocking: bool, 30 | } 31 | 32 | /// Only one process should be able to write tantivy's index at a time. 33 | /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter. 34 | /// 35 | /// If the process is killed and this file remains, it is safe to remove it manually. 
36 | /// 37 | /// Failing to acquire this lock usually means a misuse of tantivy's API, 38 | /// (creating more than one instance of the `IndexWriter`), are a spurious 39 | /// lock file remaining after a crash. In the latter case, removing the file after 40 | /// checking no process running tantivy is running is safe. 41 | pub static INDEX_WRITER_LOCK: Lazy = Lazy::new(|| Lock { 42 | filepath: PathBuf::from(".tantivy-writer.lock"), 43 | is_blocking: false, 44 | }); 45 | /// The meta lock file is here to protect the segment files being opened by 46 | /// `IndexReader::reload()` from being garbage collected. 47 | /// It makes it possible for another process to safely consume 48 | /// our index in-writing. Ideally, we may have prefered `RWLock` semantics 49 | /// here, but it is difficult to achieve on Windows. 50 | /// 51 | /// Opening segment readers is a very fast process. 52 | pub static META_LOCK: Lazy = Lazy::new(|| Lock { 53 | filepath: PathBuf::from(".tantivy-meta.lock"), 54 | is_blocking: true, 55 | }); 56 | -------------------------------------------------------------------------------- /src/directory/mod.rs: -------------------------------------------------------------------------------- 1 | //! WORM (Write Once Read Many) directory abstraction. 2 | 3 | #[cfg(feature = "mmap")] 4 | mod mmap_directory; 5 | 6 | mod directory; 7 | mod directory_lock; 8 | mod file_slice; 9 | mod file_watcher; 10 | mod footer; 11 | mod managed_directory; 12 | mod ram_directory; 13 | mod watch_event_router; 14 | 15 | /// Errors specific to the directory module. 16 | pub mod error; 17 | 18 | mod composite_file; 19 | 20 | use std::io::BufWriter; 21 | use std::path::PathBuf; 22 | 23 | pub use common::{AntiCallToken, TerminatingWrite}; 24 | pub use ownedbytes::OwnedBytes; 25 | 26 | pub(crate) use self::composite_file::{CompositeFile, CompositeWrite}; 27 | pub use self::directory::{Directory, DirectoryClone, DirectoryLock}; 28 | pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK}; 29 | pub(crate) use self::file_slice::{ArcBytes, WeakArcBytes}; 30 | pub use self::file_slice::{FileHandle, FileSlice}; 31 | pub use self::ram_directory::RamDirectory; 32 | pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle}; 33 | 34 | /// Outcome of the Garbage collection 35 | pub struct GarbageCollectionResult { 36 | /// List of files that were deleted in this cycle 37 | pub deleted_files: Vec, 38 | /// List of files that were schedule to be deleted in this cycle, 39 | /// but deletion did not work. This typically happens on windows, 40 | /// as deleting a memory mapped file is forbidden. 41 | /// 42 | /// If a searcher is still held, a file cannot be deleted. 43 | /// This is not considered a bug, the file will simply be deleted 44 | /// in the next GC. 45 | pub failed_to_delete_files: Vec, 46 | } 47 | 48 | pub use self::managed_directory::ManagedDirectory; 49 | #[cfg(feature = "mmap")] 50 | pub use self::mmap_directory::MmapDirectory; 51 | 52 | /// Write object for Directory. 53 | /// 54 | /// `WritePtr` are required to implement both Write 55 | /// and Seek. 
56 | pub type WritePtr = BufWriter>; 57 | 58 | #[cfg(test)] 59 | mod tests; 60 | -------------------------------------------------------------------------------- /src/fastfield/bytes/reader.rs: -------------------------------------------------------------------------------- 1 | use crate::directory::{FileSlice, OwnedBytes}; 2 | use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, MultiValueLength}; 3 | use crate::DocId; 4 | 5 | /// Reader for byte array fast fields 6 | /// 7 | /// The reader is implemented as a `u64` fast field and a separate collection of bytes. 8 | /// 9 | /// The `vals_reader` will access the concatenated list of all values for all documents. 10 | /// 11 | /// The `idx_reader` associates, for each document, the index of its first value. 12 | /// 13 | /// Reading the value for a document is done by reading the start index for it, 14 | /// and the start index for the next document, and keeping the bytes in between. 15 | #[derive(Clone)] 16 | pub struct BytesFastFieldReader { 17 | idx_reader: DynamicFastFieldReader, 18 | values: OwnedBytes, 19 | } 20 | 21 | impl BytesFastFieldReader { 22 | pub(crate) fn open( 23 | idx_reader: DynamicFastFieldReader, 24 | values_file: FileSlice, 25 | ) -> crate::Result { 26 | let values = values_file.read_bytes()?; 27 | Ok(BytesFastFieldReader { idx_reader, values }) 28 | } 29 | 30 | fn range(&self, doc: DocId) -> (usize, usize) { 31 | let start = self.idx_reader.get(doc) as usize; 32 | let stop = self.idx_reader.get(doc + 1) as usize; 33 | (start, stop) 34 | } 35 | 36 | /// Returns the bytes associated to the given `doc` 37 | pub fn get_bytes(&self, doc: DocId) -> &[u8] { 38 | let (start, stop) = self.range(doc); 39 | &self.values.as_slice()[start..stop] 40 | } 41 | 42 | /// Returns the length of the bytes associated to the given `doc` 43 | pub fn num_bytes(&self, doc: DocId) -> usize { 44 | let (start, stop) = self.range(doc); 45 | stop - start 46 | } 47 | 48 | /// Returns the overall number of bytes in this bytes fast field. 49 | pub fn total_num_bytes(&self) -> usize { 50 | self.values.len() 51 | } 52 | } 53 | 54 | impl MultiValueLength for BytesFastFieldReader { 55 | fn get_len(&self, doc_id: DocId) -> u64 { 56 | self.num_bytes(doc_id) as u64 57 | } 58 | fn get_total_len(&self) -> u64 { 59 | self.total_num_bytes() as u64 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/fastfield/error.rs: -------------------------------------------------------------------------------- 1 | use std::result; 2 | 3 | use crate::schema::FieldEntry; 4 | 5 | /// `FastFieldNotAvailableError` is returned when the 6 | /// user requested for a fast field reader, and the field was not 7 | /// defined in the schema as a fast field. 8 | #[derive(Debug, Error)] 9 | #[error("Fast field not available: '{field_name:?}'")] 10 | pub struct FastFieldNotAvailableError { 11 | field_name: String, 12 | } 13 | 14 | impl FastFieldNotAvailableError { 15 | /// Creates a `FastFieldNotAvailable` error. 16 | /// `field_entry` is the configuration of the field 17 | /// for which fast fields are not available. 18 | pub fn new(field_entry: &FieldEntry) -> FastFieldNotAvailableError { 19 | FastFieldNotAvailableError { 20 | field_name: field_entry.name().to_string(), 21 | } 22 | } 23 | } 24 | 25 | /// Result when trying to access a fast field reader. 
26 | pub type Result = result::Result; 27 | -------------------------------------------------------------------------------- /src/fieldnorm/serializer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::Write; 3 | 4 | use crate::directory::{CompositeWrite, WritePtr}; 5 | use crate::schema::Field; 6 | 7 | /// The fieldnorms serializer is in charge of 8 | /// the serialization of field norms for all fields. 9 | pub struct FieldNormsSerializer { 10 | composite_write: CompositeWrite, 11 | } 12 | 13 | impl FieldNormsSerializer { 14 | /// Constructor 15 | pub fn from_write(write: WritePtr) -> io::Result { 16 | // just making room for the pointer to header. 17 | let composite_write = CompositeWrite::wrap(write); 18 | Ok(FieldNormsSerializer { composite_write }) 19 | } 20 | 21 | /// Serialize the given field 22 | pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> { 23 | let write = self.composite_write.for_field(field); 24 | write.write_all(fieldnorms_data)?; 25 | write.flush()?; 26 | Ok(()) 27 | } 28 | 29 | /// Clean up / flush / close 30 | pub fn close(self) -> io::Result<()> { 31 | self.composite_write.close()?; 32 | Ok(()) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/indexer/doc_opstamp_mapping.rs: -------------------------------------------------------------------------------- 1 | use crate::{DocId, Opstamp}; 2 | 3 | // Doc to opstamp is used to identify which 4 | // document should be deleted. 5 | // 6 | // Since the docset matching the query of a delete operation 7 | // is not computed right when the delete operation is received, 8 | // we need to find a way to evaluate, for each document, 9 | // whether the document was added before or after 10 | // the delete operation. This anteriority is used by comparing 11 | // the docstamp of the document. 12 | // 13 | // The doc to opstamp mapping stores precisely an array 14 | // indexed by doc id and storing the opstamp of the document. 15 | // 16 | // This mapping is NOT necessarily increasing, because 17 | // we might be sorting documents according to a fast field. 18 | #[derive(Clone)] 19 | pub enum DocToOpstampMapping<'a> { 20 | WithMap(&'a [Opstamp]), 21 | None, 22 | } 23 | 24 | impl<'a> DocToOpstampMapping<'a> { 25 | /// Assess whether a document should be considered deleted given that it contains 26 | /// a deleted term that was deleted at the opstamp: `delete_opstamp`. 27 | /// 28 | /// This function returns true if the `DocToOpstamp` mapping is none or if 29 | /// the `doc_opstamp` is anterior to the delete opstamp. 
30 | pub fn is_deleted(&self, doc_id: DocId, delete_opstamp: Opstamp) -> bool { 31 | match self { 32 | Self::WithMap(doc_opstamps) => { 33 | let doc_opstamp = doc_opstamps[doc_id as usize]; 34 | doc_opstamp < delete_opstamp 35 | } 36 | Self::None => true, 37 | } 38 | } 39 | } 40 | 41 | #[cfg(test)] 42 | mod tests { 43 | 44 | use super::DocToOpstampMapping; 45 | 46 | #[test] 47 | fn test_doc_to_opstamp_mapping_none() { 48 | let doc_to_opstamp_mapping = DocToOpstampMapping::None; 49 | assert!(doc_to_opstamp_mapping.is_deleted(1u32, 0u64)); 50 | assert!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64)); 51 | } 52 | 53 | #[test] 54 | fn test_doc_to_opstamp_mapping_with_map() { 55 | let doc_to_opstamp_mapping = DocToOpstampMapping::WithMap(&[5u64, 1u64, 0u64, 4u64, 3u64]); 56 | assert_eq!(doc_to_opstamp_mapping.is_deleted(0u32, 2u64), false); 57 | assert_eq!(doc_to_opstamp_mapping.is_deleted(1u32, 2u64), true); 58 | assert_eq!(doc_to_opstamp_mapping.is_deleted(2u32, 2u64), true); 59 | assert_eq!(doc_to_opstamp_mapping.is_deleted(3u32, 2u64), false); 60 | assert_eq!(doc_to_opstamp_mapping.is_deleted(4u32, 2u64), false); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/indexer/index_writer_status.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::{AtomicBool, Ordering}; 2 | use std::sync::{Arc, RwLock}; 3 | 4 | use super::AddBatchReceiver; 5 | 6 | #[derive(Clone)] 7 | pub(crate) struct IndexWriterStatus { 8 | inner: Arc, 9 | } 10 | 11 | impl IndexWriterStatus { 12 | /// Returns true iff the index writer is alive. 13 | pub fn is_alive(&self) -> bool { 14 | self.inner.as_ref().is_alive() 15 | } 16 | 17 | /// Returns a copy of the operation receiver. 18 | /// If the index writer was killed, returns None. 19 | pub fn operation_receiver(&self) -> Option { 20 | let rlock = self 21 | .inner 22 | .receive_channel 23 | .read() 24 | .expect("This lock should never be poisoned"); 25 | rlock.as_ref().cloned() 26 | } 27 | 28 | /// Create an index writer bomb. 29 | /// If dropped, the index writer status will be killed. 30 | pub(crate) fn create_bomb(&self) -> IndexWriterBomb { 31 | IndexWriterBomb { 32 | inner: Some(self.inner.clone()), 33 | } 34 | } 35 | } 36 | 37 | struct Inner { 38 | is_alive: AtomicBool, 39 | receive_channel: RwLock>, 40 | } 41 | 42 | impl Inner { 43 | fn is_alive(&self) -> bool { 44 | self.is_alive.load(Ordering::Relaxed) 45 | } 46 | 47 | fn kill(&self) { 48 | self.is_alive.store(false, Ordering::Relaxed); 49 | self.receive_channel 50 | .write() 51 | .expect("This lock should never be poisoned") 52 | .take(); 53 | } 54 | } 55 | 56 | impl From for IndexWriterStatus { 57 | fn from(receiver: AddBatchReceiver) -> Self { 58 | IndexWriterStatus { 59 | inner: Arc::new(Inner { 60 | is_alive: AtomicBool::new(true), 61 | receive_channel: RwLock::new(Some(receiver)), 62 | }), 63 | } 64 | } 65 | } 66 | 67 | /// If dropped, the index writer will be killed. 68 | /// To prevent this, clients can call `.defuse()`. 69 | pub(crate) struct IndexWriterBomb { 70 | inner: Option>, 71 | } 72 | 73 | impl IndexWriterBomb { 74 | /// Defuses the bomb. 75 | /// 76 | /// This is the only way to drop the bomb without killing 77 | /// the index writer. 
78 | pub fn defuse(mut self) { 79 | self.inner = None; 80 | } 81 | } 82 | 83 | impl Drop for IndexWriterBomb { 84 | fn drop(&mut self) { 85 | if let Some(inner) = self.inner.take() { 86 | inner.kill(); 87 | } 88 | } 89 | } 90 | 91 | #[cfg(test)] 92 | mod tests { 93 | use std::mem; 94 | 95 | use crossbeam::channel; 96 | 97 | use super::IndexWriterStatus; 98 | 99 | #[test] 100 | fn test_bomb_goes_boom() { 101 | let (_tx, rx) = channel::bounded(10); 102 | let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx); 103 | assert!(index_writer_status.operation_receiver().is_some()); 104 | let bomb = index_writer_status.create_bomb(); 105 | assert!(index_writer_status.operation_receiver().is_some()); 106 | mem::drop(bomb); 107 | // boom! 108 | assert!(index_writer_status.operation_receiver().is_none()); 109 | } 110 | 111 | #[test] 112 | fn test_bomb_defused() { 113 | let (_tx, rx) = channel::bounded(10); 114 | let index_writer_status: IndexWriterStatus = IndexWriterStatus::from(rx); 115 | assert!(index_writer_status.operation_receiver().is_some()); 116 | let bomb = index_writer_status.create_bomb(); 117 | bomb.defuse(); 118 | assert!(index_writer_status.operation_receiver().is_some()); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/indexer/merge_operation.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::ops::Deref; 3 | 4 | use crate::{Inventory, Opstamp, SegmentId, TrackedObject}; 5 | 6 | #[derive(Default)] 7 | pub(crate) struct MergeOperationInventory(Inventory); 8 | 9 | impl Deref for MergeOperationInventory { 10 | type Target = Inventory; 11 | 12 | fn deref(&self) -> &Self::Target { 13 | &self.0 14 | } 15 | } 16 | 17 | impl MergeOperationInventory { 18 | pub fn segment_in_merge(&self) -> HashSet { 19 | let mut segment_in_merge = HashSet::default(); 20 | for merge_op in self.list() { 21 | for &segment_id in &merge_op.segment_ids { 22 | segment_in_merge.insert(segment_id); 23 | } 24 | } 25 | segment_in_merge 26 | } 27 | } 28 | 29 | /// A `MergeOperation` has two roles. 30 | /// It carries all of the information required to describe a merge: 31 | /// - `target_opstamp` is the opstamp up to which we want to consume the 32 | /// delete queue and reflect their deletes. 33 | /// - `segment_ids` is the list of segment to be merged. 34 | /// 35 | /// The second role is to ensure keep track of the fact that these 36 | /// segments are in merge and avoid starting a merge operation that 37 | /// may conflict with this one. 38 | /// 39 | /// This works by tracking merge operations. When considering computing 40 | /// merge candidates, we simply list tracked merge operations and remove 41 | /// their segments from possible merge candidates. 42 | pub struct MergeOperation { 43 | inner: TrackedObject, 44 | } 45 | 46 | pub(crate) struct InnerMergeOperation { 47 | target_opstamp: Opstamp, 48 | segment_ids: Vec, 49 | } 50 | 51 | impl MergeOperation { 52 | pub(crate) fn new( 53 | inventory: &MergeOperationInventory, 54 | target_opstamp: Opstamp, 55 | segment_ids: Vec, 56 | ) -> MergeOperation { 57 | let inner_merge_operation = InnerMergeOperation { 58 | target_opstamp, 59 | segment_ids, 60 | }; 61 | MergeOperation { 62 | inner: inventory.track(inner_merge_operation), 63 | } 64 | } 65 | 66 | pub fn target_opstamp(&self) -> Opstamp { 67 | self.inner.target_opstamp 68 | } 69 | 70 | pub fn segment_ids(&self) -> &[SegmentId] { 71 | &self.inner.segment_ids[..] 
72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/indexer/merge_policy.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | use std::marker; 3 | 4 | use crate::core::{SegmentId, SegmentMeta}; 5 | 6 | /// Set of segment suggested for a merge. 7 | #[derive(Debug, Clone)] 8 | pub struct MergeCandidate(pub Vec); 9 | 10 | /// The `MergePolicy` defines which segments should be merged. 11 | /// 12 | /// Every time a the list of segments changes, the segment updater 13 | /// asks the merge policy if some segments should be merged. 14 | pub trait MergePolicy: marker::Send + marker::Sync + Debug { 15 | /// Given the list of segment metas, returns the list of merge candidates. 16 | /// 17 | /// This call happens on the segment updater thread, and will block 18 | /// other segment updates, so all implementations should happen rapidly. 19 | fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec; 20 | } 21 | 22 | /// Never merge segments. 23 | #[derive(Debug, Clone)] 24 | pub struct NoMergePolicy; 25 | 26 | impl Default for NoMergePolicy { 27 | fn default() -> NoMergePolicy { 28 | NoMergePolicy 29 | } 30 | } 31 | 32 | impl MergePolicy for NoMergePolicy { 33 | fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec { 34 | Vec::new() 35 | } 36 | } 37 | 38 | #[cfg(test)] 39 | pub mod tests { 40 | 41 | use super::*; 42 | use crate::core::{SegmentId, SegmentMeta}; 43 | 44 | /// `MergePolicy` useful for test purposes. 45 | /// 46 | /// Everytime there is more than one segment, 47 | /// it will suggest to merge them. 48 | #[derive(Debug, Clone)] 49 | pub struct MergeWheneverPossible; 50 | 51 | impl MergePolicy for MergeWheneverPossible { 52 | fn compute_merge_candidates(&self, segment_metas: &[SegmentMeta]) -> Vec { 53 | let segment_ids = segment_metas 54 | .iter() 55 | .map(|segment_meta| segment_meta.id()) 56 | .collect::>(); 57 | if segment_ids.len() > 1 { 58 | vec![MergeCandidate(segment_ids)] 59 | } else { 60 | vec![] 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/indexer/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod delete_queue; 2 | 3 | pub mod demuxer; 4 | pub mod doc_id_mapping; 5 | mod doc_opstamp_mapping; 6 | pub mod index_writer; 7 | mod index_writer_status; 8 | mod json_term_writer; 9 | mod log_merge_policy; 10 | mod merge_operation; 11 | pub mod merge_policy; 12 | pub mod merger; 13 | mod merger_sorted_index_test; 14 | pub mod operation; 15 | pub mod prepared_commit; 16 | mod segment_entry; 17 | mod segment_manager; 18 | mod segment_register; 19 | pub mod segment_serializer; 20 | pub mod segment_updater; 21 | mod segment_writer; 22 | mod stamper; 23 | 24 | use crossbeam::channel; 25 | use smallvec::SmallVec; 26 | 27 | pub use self::index_writer::IndexWriter; 28 | pub(crate) use self::json_term_writer::JsonTermWriter; 29 | pub use self::log_merge_policy::LogMergePolicy; 30 | pub use self::merge_operation::MergeOperation; 31 | pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; 32 | pub use self::prepared_commit::PreparedCommit; 33 | pub use self::segment_entry::SegmentEntry; 34 | pub use self::segment_manager::SegmentManager; 35 | pub use self::segment_serializer::SegmentSerializer; 36 | pub use self::segment_updater::{merge_filtered_segments, merge_indices}; 37 | pub use self::segment_writer::SegmentWriter; 38 
| use crate::indexer::operation::AddOperation; 39 | 40 | /// Alias for the default merge policy, which is the `LogMergePolicy`. 41 | pub type DefaultMergePolicy = LogMergePolicy; 42 | 43 | // Batch of documents. 44 | // Most of the time, users will send operation one-by-one, but it can be useful to 45 | // send them as a small block to ensure that 46 | // - all docs in the operation will happen on the same segment and continuous doc_ids. 47 | // - all operations in the group are committed at the same time, making the group 48 | // atomic. 49 | type AddBatch = SmallVec<[AddOperation; 4]>; 50 | type AddBatchSender = channel::Sender; 51 | type AddBatchReceiver = channel::Receiver; 52 | 53 | #[cfg(feature = "mmap")] 54 | #[cfg(test)] 55 | mod tests_mmap { 56 | use crate::schema::{self, Schema}; 57 | use crate::{Index, Term}; 58 | 59 | #[test] 60 | fn test_advance_delete_bug() -> crate::Result<()> { 61 | let mut schema_builder = Schema::builder(); 62 | let text_field = schema_builder.add_text_field("text", schema::TEXT); 63 | let index = Index::create_from_tempdir(schema_builder.build())?; 64 | let mut index_writer = index.writer_for_tests()?; 65 | // there must be one deleted document in the segment 66 | index_writer.add_document(doc!(text_field=>"b"))?; 67 | index_writer.delete_term(Term::from_field_text(text_field, "b")); 68 | // we need enough data to trigger the bug (at least 32 documents) 69 | for _ in 0..32 { 70 | index_writer.add_document(doc!(text_field=>"c"))?; 71 | } 72 | index_writer.commit()?; 73 | index_writer.commit()?; 74 | Ok(()) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/indexer/operation.rs: -------------------------------------------------------------------------------- 1 | use crate::schema::{Document, Term}; 2 | use crate::Opstamp; 3 | 4 | /// Timestamped Delete operation. 5 | #[derive(Clone, Eq, PartialEq, Debug)] 6 | pub struct DeleteOperation { 7 | pub opstamp: Opstamp, 8 | pub term: Term, 9 | } 10 | 11 | impl Default for DeleteOperation { 12 | fn default() -> Self { 13 | DeleteOperation { 14 | opstamp: 0u64, 15 | term: Term::new(), 16 | } 17 | } 18 | } 19 | 20 | /// Timestamped Add operation. 21 | #[derive(Eq, PartialEq, Debug)] 22 | pub struct AddOperation { 23 | pub opstamp: Opstamp, 24 | pub document: Document, 25 | } 26 | 27 | /// UserOperation is an enum type that encapsulates other operation types. 28 | #[derive(Eq, PartialEq, Debug)] 29 | pub enum UserOperation { 30 | /// Add operation 31 | Add(Document), 32 | /// Delete operation 33 | Delete(Term), 34 | } 35 | -------------------------------------------------------------------------------- /src/indexer/prepared_commit.rs: -------------------------------------------------------------------------------- 1 | use super::IndexWriter; 2 | use crate::{FutureResult, Opstamp}; 3 | 4 | /// A prepared commit 5 | pub struct PreparedCommit<'a> { 6 | index_writer: &'a mut IndexWriter, 7 | payload: Option, 8 | opstamp: Opstamp, 9 | } 10 | 11 | impl<'a> PreparedCommit<'a> { 12 | pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit<'_> { 13 | PreparedCommit { 14 | index_writer, 15 | payload: None, 16 | opstamp, 17 | } 18 | } 19 | 20 | /// Returns the opstamp associated to the prepared commit. 21 | pub fn opstamp(&self) -> Opstamp { 22 | self.opstamp 23 | } 24 | 25 | /// Adds an arbitrary payload to the commit. 
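    ///
    /// The payload is handed to the segment updater together with the commit
    /// opstamp (see `commit_future` below). A rough sketch of the intended flow,
    /// assuming the `IndexWriter::prepare_commit()` entry point (not shown in
    /// this file):
    ///
    /// ```rust,ignore
    /// let mut prepared = index_writer.prepare_commit()?;
    /// prepared.set_payload("checkpoint-42");
    /// prepared.commit()?;
    /// ```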
26 | pub fn set_payload(&mut self, payload: &str) { 27 | self.payload = Some(payload.to_string()) 28 | } 29 | 30 | /// Rollbacks any change. 31 | pub fn abort(self) -> crate::Result { 32 | self.index_writer.rollback() 33 | } 34 | 35 | /// Proceeds to commit. 36 | /// See `.commit_future()`. 37 | pub fn commit(self) -> crate::Result { 38 | self.commit_future().wait() 39 | } 40 | 41 | /// Proceeds to commit. 42 | /// 43 | /// Unfortunately, contrary to what `PrepareCommit` may suggests, 44 | /// this operation is not at all really light. 45 | /// At this point deletes have not been flushed yet. 46 | pub fn commit_future(self) -> FutureResult { 47 | info!("committing {}", self.opstamp); 48 | self.index_writer 49 | .segment_updater() 50 | .schedule_commit(self.opstamp, self.payload) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/indexer/segment_entry.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use common::BitSet; 4 | 5 | use crate::core::{SegmentId, SegmentMeta}; 6 | use crate::indexer::delete_queue::DeleteCursor; 7 | 8 | /// A segment entry describes the state of 9 | /// a given segment, at a given instant. 10 | /// 11 | /// In addition to segment `meta`, 12 | /// it contains a few transient states 13 | /// - `alive_bitset` is a bitset describing 14 | /// documents that were alive during the commit 15 | /// itself. 16 | /// - `delete_cursor` is the position in the delete queue. 17 | /// Deletes happening before the cursor are reflected either 18 | /// in the .del file or in the `alive_bitset`. 19 | #[derive(Clone)] 20 | pub struct SegmentEntry { 21 | meta: SegmentMeta, 22 | alive_bitset: Option, 23 | delete_cursor: DeleteCursor, 24 | } 25 | 26 | impl SegmentEntry { 27 | /// Create a new `SegmentEntry` 28 | pub fn new( 29 | segment_meta: SegmentMeta, 30 | delete_cursor: DeleteCursor, 31 | alive_bitset: Option, 32 | ) -> SegmentEntry { 33 | SegmentEntry { 34 | meta: segment_meta, 35 | alive_bitset, 36 | delete_cursor, 37 | } 38 | } 39 | 40 | /// Return a reference to the segment entry deleted bitset. 41 | /// 42 | /// `DocId` in this bitset are flagged as deleted. 43 | pub fn alive_bitset(&self) -> Option<&BitSet> { 44 | self.alive_bitset.as_ref() 45 | } 46 | 47 | /// Set the `SegmentMeta` for this segment. 48 | pub fn set_meta(&mut self, segment_meta: SegmentMeta) { 49 | self.meta = segment_meta; 50 | } 51 | 52 | /// Return a reference to the segment_entry's delete cursor 53 | pub fn delete_cursor(&mut self) -> &mut DeleteCursor { 54 | &mut self.delete_cursor 55 | } 56 | 57 | /// Returns the segment id. 
58 | pub fn segment_id(&self) -> SegmentId { 59 | self.meta.id() 60 | } 61 | 62 | /// Accessor to the `SegmentMeta` 63 | pub fn meta(&self) -> &SegmentMeta { 64 | &self.meta 65 | } 66 | } 67 | 68 | impl fmt::Debug for SegmentEntry { 69 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { 70 | write!(formatter, "SegmentEntry({:?})", self.meta) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/indexer/segment_serializer.rs: -------------------------------------------------------------------------------- 1 | use crate::core::{Segment, SegmentComponent}; 2 | use crate::fastfield::CompositeFastFieldSerializer; 3 | use crate::fieldnorm::FieldNormsSerializer; 4 | use crate::postings::InvertedIndexSerializer; 5 | use crate::store::StoreWriter; 6 | 7 | /// Segment serializer is in charge of laying out on disk 8 | /// the data accumulated and sorted by the `SegmentWriter`. 9 | pub struct SegmentSerializer { 10 | segment: Segment, 11 | pub(crate) store_writer: StoreWriter, 12 | fast_field_serializer: CompositeFastFieldSerializer, 13 | fieldnorms_serializer: Option, 14 | postings_serializer: InvertedIndexSerializer, 15 | } 16 | 17 | impl SegmentSerializer { 18 | /// Creates a new `SegmentSerializer`. 19 | pub fn for_segment( 20 | mut segment: Segment, 21 | is_in_merge: bool, 22 | ) -> crate::Result { 23 | // If the segment is going to be sorted, we stream the docs first to a temporary file. 24 | // In the merge case this is not necessary because we can kmerge the already sorted 25 | // segments 26 | let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge; 27 | let store_component = if remapping_required { 28 | SegmentComponent::TempStore 29 | } else { 30 | SegmentComponent::Store 31 | }; 32 | let store_write = segment.open_write(store_component)?; 33 | 34 | let fast_field_write = segment.open_write(SegmentComponent::FastFields)?; 35 | let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?; 36 | 37 | let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?; 38 | let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?; 39 | 40 | let postings_serializer = InvertedIndexSerializer::open(&mut segment)?; 41 | let compressor = segment.index().settings().docstore_compression; 42 | Ok(SegmentSerializer { 43 | segment, 44 | store_writer: StoreWriter::new(store_write, compressor), 45 | fast_field_serializer, 46 | fieldnorms_serializer: Some(fieldnorms_serializer), 47 | postings_serializer, 48 | }) 49 | } 50 | 51 | /// The memory used (inclusive childs) 52 | pub fn mem_usage(&self) -> usize { 53 | self.store_writer.mem_usage() 54 | } 55 | 56 | pub fn segment(&self) -> &Segment { 57 | &self.segment 58 | } 59 | 60 | pub fn segment_mut(&mut self) -> &mut Segment { 61 | &mut self.segment 62 | } 63 | 64 | /// Accessor to the `PostingsSerializer`. 65 | pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { 66 | &mut self.postings_serializer 67 | } 68 | 69 | /// Accessor to the `FastFieldSerializer`. 70 | pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer { 71 | &mut self.fast_field_serializer 72 | } 73 | 74 | /// Extract the field norm serializer. 75 | /// 76 | /// Note the fieldnorms serializer can only be extracted once. 77 | pub fn extract_fieldnorms_serializer(&mut self) -> Option { 78 | self.fieldnorms_serializer.take() 79 | } 80 | 81 | /// Accessor to the `StoreWriter`. 
82 | pub fn get_store_writer(&mut self) -> &mut StoreWriter { 83 | &mut self.store_writer 84 | } 85 | 86 | /// Finalize the segment serialization. 87 | pub fn close(mut self) -> crate::Result<()> { 88 | if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() { 89 | fieldnorms_serializer.close()?; 90 | } 91 | self.fast_field_serializer.close()?; 92 | self.postings_serializer.close()?; 93 | self.store_writer.close()?; 94 | Ok(()) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/macros.rs: -------------------------------------------------------------------------------- 1 | /// `doc!` is a shortcut that helps building `Document` 2 | /// objects. 3 | /// 4 | /// Assuming that `field1` and `field2` are `Field` instances. 5 | /// You can create a document with a value of `value1` for `field1` 6 | /// `value2` for `field2`, as follows : 7 | /// 8 | /// ```c 9 | /// doc!( 10 | /// field1 => value1, 11 | /// field2 => value2, 12 | /// ) 13 | /// ``` 14 | /// 15 | /// The value can be a `u64`, a `&str`, a `i64`, or a `String`. 16 | /// 17 | /// # Warning 18 | /// 19 | /// The document hence created, is not yet validated against a schema. 20 | /// Nothing prevents its user from creating an invalid document missing a 21 | /// field, or associating a `String` to a `u64` field for instance. 22 | /// 23 | /// # Example 24 | /// 25 | /// ```rust 26 | /// use tantivy::schema::{Schema, TEXT, FAST}; 27 | /// use tantivy::doc; 28 | /// 29 | /// //... 30 | /// 31 | /// # fn main() { 32 | /// let mut schema_builder = Schema::builder(); 33 | /// let title = schema_builder.add_text_field("title", TEXT); 34 | /// let author = schema_builder.add_text_field("text", TEXT); 35 | /// let likes = schema_builder.add_u64_field("num_u64", FAST); 36 | /// let schema = schema_builder.build(); 37 | /// let doc = doc!( 38 | /// title => "Life Aquatic", 39 | /// author => "Wes Anderson", 40 | /// likes => 4u64 41 | /// ); 42 | /// # } 43 | /// ``` 44 | #[macro_export] 45 | macro_rules! doc( 46 | () => { 47 | { 48 | ($crate::Document::default()) 49 | } 50 | }; // avoids a warning due to the useless `mut`. 51 | ($($field:expr => $value:expr),*) => { 52 | { 53 | let mut document = $crate::Document::default(); 54 | $( 55 | document.add_field_value($field, $value); 56 | )* 57 | document 58 | } 59 | }; 60 | // if there is a trailing comma retry with the trailing comma stripped. 
61 | ($($field:expr => $value:expr),+ ,) => { 62 | doc!( $( $field => $value ), *) 63 | }; 64 | ); 65 | 66 | #[cfg(test)] 67 | mod test { 68 | use crate::schema::{Schema, FAST, TEXT}; 69 | 70 | #[test] 71 | fn test_doc_basic() { 72 | let mut schema_builder = Schema::builder(); 73 | let title = schema_builder.add_text_field("title", TEXT); 74 | let author = schema_builder.add_text_field("text", TEXT); 75 | let likes = schema_builder.add_u64_field("num_u64", FAST); 76 | let _schema = schema_builder.build(); 77 | let _doc = doc!( 78 | title => "Life Aquatic", 79 | author => "Wes Anderson", 80 | likes => 4u64 81 | ); 82 | } 83 | 84 | #[test] 85 | fn test_doc_trailing_comma() { 86 | let mut schema_builder = Schema::builder(); 87 | let title = schema_builder.add_text_field("title", TEXT); 88 | let author = schema_builder.add_text_field("text", TEXT); 89 | let likes = schema_builder.add_u64_field("num_u64", FAST); 90 | let _schema = schema_builder.build(); 91 | let _doc = doc!( 92 | title => "Life Aquatic", 93 | author => "Wes Anderson", 94 | likes => 4u64, 95 | ); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/positions/serializer.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Write}; 2 | 3 | use common::{BinarySerializable, CountingWriter, VInt}; 4 | 5 | use crate::positions::COMPRESSION_BLOCK_SIZE; 6 | use crate::postings::compression::{BlockEncoder, VIntEncoder}; 7 | 8 | /// The PositionSerializer is in charge of serializing all of the positions 9 | /// of all of the terms of a given field. 10 | /// 11 | /// It is valid to call write_position_delta more than once per term. 12 | pub struct PositionSerializer { 13 | block_encoder: BlockEncoder, 14 | positions_wrt: CountingWriter, 15 | positions_buffer: Vec, 16 | block: Vec, 17 | bit_widths: Vec, 18 | } 19 | 20 | impl PositionSerializer { 21 | /// Creates a new PositionSerializer writing into the given positions_wrt. 22 | pub fn new(positions_wrt: W) -> PositionSerializer { 23 | PositionSerializer { 24 | block_encoder: BlockEncoder::new(), 25 | positions_wrt: CountingWriter::wrap(positions_wrt), 26 | positions_buffer: Vec::with_capacity(128_000), 27 | block: Vec::with_capacity(128), 28 | bit_widths: Vec::new(), 29 | } 30 | } 31 | 32 | /// Returns the number of bytes written in the positions write object 33 | /// at this point. 34 | /// When called before writing the positions of a term, this value is used as 35 | /// start offset. 36 | /// When called after writing the positions of a term, this value is used as a 37 | /// end offset. 38 | pub fn written_bytes(&self) -> u64 { 39 | self.positions_wrt.written_bytes() 40 | } 41 | 42 | fn remaining_block_len(&self) -> usize { 43 | COMPRESSION_BLOCK_SIZE - self.block.len() 44 | } 45 | 46 | /// Writes all of the given positions delta. 
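///
/// A minimal usage sketch, assuming a `Vec<u8>` as the underlying writer (the values
/// passed in are deltas between consecutive positions, not absolute positions):
/// ```ignore
/// let mut serializer = PositionSerializer::new(Vec::new());
/// serializer.write_positions_delta(&[0, 3, 1, 4]); // encodes positions 0, 3, 4, 8
/// serializer.close_term().unwrap();
/// ```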
47 | pub fn write_positions_delta(&mut self, mut positions_delta: &[u32]) { 48 | while !positions_delta.is_empty() { 49 | let remaining_block_len = self.remaining_block_len(); 50 | let num_to_write = remaining_block_len.min(positions_delta.len()); 51 | self.block.extend(&positions_delta[..num_to_write]); 52 | positions_delta = &positions_delta[num_to_write..]; 53 | if self.remaining_block_len() == 0 { 54 | self.flush_block(); 55 | } 56 | } 57 | } 58 | 59 | fn flush_block(&mut self) { 60 | // encode the positions in the block 61 | if self.block.is_empty() { 62 | return; 63 | } 64 | if self.block.len() == COMPRESSION_BLOCK_SIZE { 65 | let (bit_width, block_encoded): (u8, &[u8]) = 66 | self.block_encoder.compress_block_unsorted(&self.block[..]); 67 | self.bit_widths.push(bit_width); 68 | self.positions_buffer.extend(block_encoded); 69 | } else { 70 | debug_assert!(self.block.len() < COMPRESSION_BLOCK_SIZE); 71 | let block_vint_encoded = self.block_encoder.compress_vint_unsorted(&self.block[..]); 72 | self.positions_buffer.extend_from_slice(block_vint_encoded); 73 | } 74 | self.block.clear(); 75 | } 76 | 77 | /// Close the positions for the given term. 78 | pub fn close_term(&mut self) -> io::Result<()> { 79 | self.flush_block(); 80 | VInt(self.bit_widths.len() as u64).serialize(&mut self.positions_wrt)?; 81 | self.positions_wrt.write_all(&self.bit_widths[..])?; 82 | self.positions_wrt.write_all(&self.positions_buffer)?; 83 | self.bit_widths.clear(); 84 | self.positions_buffer.clear(); 85 | Ok(()) 86 | } 87 | 88 | /// Close the positions for this term and flushes the data. 89 | pub fn close(mut self) -> io::Result<()> { 90 | self.positions_wrt.flush() 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/postings/block_search.rs: -------------------------------------------------------------------------------- 1 | use crate::postings::compression::COMPRESSION_BLOCK_SIZE; 2 | 3 | /// Search the first index containing an element greater or equal to 4 | /// the target. 5 | /// 6 | /// The results should be equivalent to 7 | /// ```compile_fail 8 | /// block[..] 9 | // .iter() 10 | // .take_while(|&&val| val < target) 11 | // .count() 12 | /// ``` 13 | /// 14 | /// the `start` argument is just used to hint that the response is 15 | /// greater than beyond `start`. the implementation may or may not use 16 | /// it for optimization. 17 | /// 18 | /// # Assumption 19 | /// 20 | /// - The block is sorted. Some elements may appear several times. This is the case at the 21 | /// end of the last block for instance. 22 | /// - The target is assumed smaller or equal to the last element of the block. 
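///
/// Seven halving steps are enough because `2^7 = COMPRESSION_BLOCK_SIZE = 128`.
/// A small sketch of the expected result (hypothetical block, mirroring the tests below):
/// ```ignore
/// // A sorted block of 128 values: 0, 2, 4, ...
/// let block: [u32; COMPRESSION_BLOCK_SIZE] = core::array::from_fn(|i| i as u32 * 2);
/// // The first index holding a value >= 5 is 3 (block[3] == 6).
/// assert_eq!(branchless_binary_search(&block, 5), 3);
/// ```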
23 | pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize { 24 | let mut start = 0; 25 | let mut len = arr.len(); 26 | for _ in 0..7 { 27 | len /= 2; 28 | let pivot = unsafe { *arr.get_unchecked(start + len - 1) }; 29 | if pivot < target { 30 | start += len; 31 | } 32 | } 33 | start 34 | } 35 | 36 | #[cfg(test)] 37 | mod tests { 38 | use std::collections::HashSet; 39 | 40 | use proptest::prelude::*; 41 | 42 | use super::branchless_binary_search; 43 | use crate::docset::TERMINATED; 44 | use crate::postings::compression::COMPRESSION_BLOCK_SIZE; 45 | 46 | fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize { 47 | block.iter().take_while(|&&val| val < target).count() 48 | } 49 | 50 | fn util_test_search_in_block(block: &[u32], target: u32) { 51 | let cursor = search_in_block_trivial_but_slow(block, target); 52 | assert!(cursor < COMPRESSION_BLOCK_SIZE); 53 | assert!(block[cursor] >= target); 54 | if cursor > 0 { 55 | assert!(block[cursor - 1] < target); 56 | } 57 | assert_eq!(block.len(), COMPRESSION_BLOCK_SIZE); 58 | let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE]; 59 | output_buffer[..block.len()].copy_from_slice(block); 60 | assert_eq!(branchless_binary_search(&output_buffer, target), cursor); 61 | } 62 | 63 | fn util_test_search_in_block_all(block: &[u32]) { 64 | let mut targets = HashSet::new(); 65 | targets.insert(0); 66 | for &val in block { 67 | if val > 0 { 68 | targets.insert(val - 1); 69 | } 70 | targets.insert(val); 71 | } 72 | for target in targets { 73 | util_test_search_in_block(block, target); 74 | } 75 | } 76 | 77 | #[test] 78 | fn test_search_in_branchless_binary_search() { 79 | let v: Vec = (0..COMPRESSION_BLOCK_SIZE).map(|i| i as u32 * 2).collect(); 80 | util_test_search_in_block_all(&v[..]); 81 | } 82 | 83 | fn monotonous_block() -> impl Strategy> { 84 | prop::collection::vec(0u32..5u32, COMPRESSION_BLOCK_SIZE).prop_map(|mut deltas| { 85 | let mut el = 0; 86 | for i in 0..COMPRESSION_BLOCK_SIZE { 87 | el += deltas[i]; 88 | deltas[i] = el; 89 | } 90 | deltas 91 | }) 92 | } 93 | 94 | proptest! 
{ 95 | #[test] 96 | fn test_proptest_branchless_binary_search(block in monotonous_block()) { 97 | util_test_search_in_block_all(&block[..]); 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/postings/compression/vint.rs: -------------------------------------------------------------------------------- 1 | #[inline] 2 | pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { 3 | let mut byte_written = 0; 4 | for &v in input { 5 | let mut to_encode: u32 = v - offset; 6 | offset = v; 7 | loop { 8 | let next_byte: u8 = (to_encode % 128u32) as u8; 9 | to_encode /= 128u32; 10 | if to_encode == 0u32 { 11 | output[byte_written] = next_byte | 128u8; 12 | byte_written += 1; 13 | break; 14 | } else { 15 | output[byte_written] = next_byte; 16 | byte_written += 1; 17 | } 18 | } 19 | } 20 | &output[..byte_written] 21 | } 22 | 23 | #[inline] 24 | pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] { 25 | let mut byte_written = 0; 26 | for &v in input { 27 | let mut to_encode: u32 = v; 28 | loop { 29 | let next_byte: u8 = (to_encode % 128u32) as u8; 30 | to_encode /= 128u32; 31 | if to_encode == 0u32 { 32 | output[byte_written] = next_byte | 128u8; 33 | byte_written += 1; 34 | break; 35 | } else { 36 | output[byte_written] = next_byte; 37 | byte_written += 1; 38 | } 39 | } 40 | } 41 | &output[..byte_written] 42 | } 43 | 44 | #[inline] 45 | pub fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize { 46 | let mut read_byte = 0; 47 | let mut result = offset; 48 | for output_mut in output.iter_mut() { 49 | let mut shift = 0u32; 50 | loop { 51 | let cur_byte = compressed_data[read_byte]; 52 | read_byte += 1; 53 | result += u32::from(cur_byte % 128u8) << shift; 54 | if cur_byte & 128u8 != 0u8 { 55 | break; 56 | } 57 | shift += 7; 58 | } 59 | *output_mut = result; 60 | } 61 | read_byte 62 | } 63 | 64 | #[inline] 65 | pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize { 66 | let mut num_read_bytes = 0; 67 | for output_mut in output_arr.iter_mut() { 68 | let mut result = 0u32; 69 | let mut shift = 0u32; 70 | loop { 71 | let cur_byte = compressed_data[num_read_bytes]; 72 | num_read_bytes += 1; 73 | result += u32::from(cur_byte % 128u8) << shift; 74 | if cur_byte & 128u8 != 0u8 { 75 | break; 76 | } 77 | shift += 7; 78 | } 79 | *output_mut = result; 80 | } 81 | num_read_bytes 82 | } 83 | 84 | #[inline] 85 | pub(crate) fn uncompress_unsorted_until_end( 86 | compressed_data: &[u8], 87 | output_arr: &mut [u32], 88 | ) -> usize { 89 | let mut num_read_bytes = 0; 90 | for (num_ints_written, output_mut) in output_arr.iter_mut().enumerate() { 91 | if compressed_data.len() == num_read_bytes { 92 | return num_ints_written; 93 | } 94 | let mut result = 0u32; 95 | let mut shift = 0u32; 96 | loop { 97 | let cur_byte = compressed_data[num_read_bytes]; 98 | num_read_bytes += 1; 99 | result += u32::from(cur_byte % 128u8) << shift; 100 | if cur_byte & 128u8 != 0u8 { 101 | break; 102 | } 103 | shift += 7; 104 | } 105 | *output_mut = result; 106 | } 107 | output_arr.len() 108 | } 109 | -------------------------------------------------------------------------------- /src/postings/indexing_context.rs: -------------------------------------------------------------------------------- 1 | use crate::postings::stacker::{MemoryArena, TermHashMap}; 2 | 3 | /// IndexingContext contains all of the transient memory arenas 4 | /// required for 
building the inverted index. 5 | pub(crate) struct IndexingContext { 6 | /// The term index is an adhoc hashmap, 7 | /// itself backed by a dedicated memory arena. 8 | pub term_index: TermHashMap, 9 | /// Arena is a memory arena that stores posting lists / term frequencies / positions. 10 | pub arena: MemoryArena, 11 | } 12 | 13 | impl IndexingContext { 14 | /// Create a new IndexingContext given the size of the term hash map. 15 | pub(crate) fn new(table_size: usize) -> IndexingContext { 16 | let term_index = TermHashMap::new(table_size); 17 | IndexingContext { 18 | arena: MemoryArena::new(), 19 | term_index, 20 | } 21 | } 22 | 23 | /// Returns the memory usage for the inverted index memory arenas, in bytes. 24 | pub(crate) fn mem_usage(&self) -> usize { 25 | self.term_index.mem_usage() + self.arena.mem_usage() 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/postings/json_postings_writer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use crate::indexer::doc_id_mapping::DocIdMapping; 4 | use crate::postings::postings_writer::SpecializedPostingsWriter; 5 | use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder}; 6 | use crate::postings::stacker::Addr; 7 | use crate::postings::{ 8 | FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId, 9 | }; 10 | use crate::schema::term::as_json_path_type_value_bytes; 11 | use crate::schema::Type; 12 | use crate::tokenizer::TokenStream; 13 | use crate::{DocId, Term}; 14 | 15 | #[derive(Default)] 16 | pub(crate) struct JsonPostingsWriter { 17 | str_posting_writer: SpecializedPostingsWriter, 18 | non_str_posting_writer: SpecializedPostingsWriter, 19 | } 20 | 21 | impl From> for Box { 22 | fn from(json_postings_writer: JsonPostingsWriter) -> Box { 23 | Box::new(json_postings_writer) 24 | } 25 | } 26 | 27 | impl PostingsWriter for JsonPostingsWriter { 28 | fn subscribe( 29 | &mut self, 30 | doc: crate::DocId, 31 | pos: u32, 32 | term: &crate::Term, 33 | ctx: &mut IndexingContext, 34 | ) -> UnorderedTermId { 35 | self.non_str_posting_writer.subscribe(doc, pos, term, ctx) 36 | } 37 | 38 | fn index_text( 39 | &mut self, 40 | doc_id: DocId, 41 | token_stream: &mut dyn TokenStream, 42 | term_buffer: &mut Term, 43 | ctx: &mut IndexingContext, 44 | indexing_position: &mut IndexingPosition, 45 | ) { 46 | self.str_posting_writer.index_text( 47 | doc_id, 48 | token_stream, 49 | term_buffer, 50 | ctx, 51 | indexing_position, 52 | ); 53 | } 54 | 55 | /// The actual serialization format is handled by the `PostingsSerializer`. 56 | fn serialize( 57 | &self, 58 | term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)], 59 | doc_id_map: Option<&DocIdMapping>, 60 | ctx: &IndexingContext, 61 | serializer: &mut FieldSerializer, 62 | ) -> io::Result<()> { 63 | let mut buffer_lender = BufferLender::default(); 64 | for (term, addr, _) in term_addrs { 65 | // TODO optimization opportunity here. 
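// Assumption based on the helper's name: `as_json_path_type_value_bytes` is expected to
// split a JSON term into (path, value type, value bytes); only the type is needed here to
// route the term to the recorder that was actually used while indexing it.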
66 | if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) { 67 | if typ == Type::Str { 68 | SpecializedPostingsWriter::::serialize_one_term( 69 | term, 70 | *addr, 71 | doc_id_map, 72 | &mut buffer_lender, 73 | ctx, 74 | serializer, 75 | )?; 76 | } else { 77 | SpecializedPostingsWriter::::serialize_one_term( 78 | term, 79 | *addr, 80 | doc_id_map, 81 | &mut buffer_lender, 82 | ctx, 83 | serializer, 84 | )?; 85 | } 86 | } 87 | } 88 | Ok(()) 89 | } 90 | 91 | fn total_num_tokens(&self) -> u64 { 92 | self.str_posting_writer.total_num_tokens() + self.non_str_posting_writer.total_num_tokens() 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/postings/per_field_postings_writer.rs: -------------------------------------------------------------------------------- 1 | use crate::postings::json_postings_writer::JsonPostingsWriter; 2 | use crate::postings::postings_writer::SpecializedPostingsWriter; 3 | use crate::postings::recorder::{NothingRecorder, TermFrequencyRecorder, TfAndPositionRecorder}; 4 | use crate::postings::PostingsWriter; 5 | use crate::schema::{Field, FieldEntry, FieldType, IndexRecordOption, Schema}; 6 | 7 | pub(crate) struct PerFieldPostingsWriter { 8 | per_field_postings_writers: Vec>, 9 | } 10 | 11 | impl PerFieldPostingsWriter { 12 | pub fn for_schema(schema: &Schema) -> Self { 13 | let per_field_postings_writers = schema 14 | .fields() 15 | .map(|(_, field_entry)| posting_writer_from_field_entry(field_entry)) 16 | .collect(); 17 | PerFieldPostingsWriter { 18 | per_field_postings_writers, 19 | } 20 | } 21 | 22 | pub(crate) fn get_for_field(&self, field: Field) -> &dyn PostingsWriter { 23 | self.per_field_postings_writers[field.field_id() as usize].as_ref() 24 | } 25 | 26 | pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter { 27 | self.per_field_postings_writers[field.field_id() as usize].as_mut() 28 | } 29 | } 30 | 31 | fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box { 32 | match *field_entry.field_type() { 33 | FieldType::Str(ref text_options) => text_options 34 | .get_indexing_options() 35 | .map(|indexing_options| match indexing_options.index_option() { 36 | IndexRecordOption::Basic => { 37 | SpecializedPostingsWriter::::default().into() 38 | } 39 | IndexRecordOption::WithFreqs => { 40 | SpecializedPostingsWriter::::default().into() 41 | } 42 | IndexRecordOption::WithFreqsAndPositions => { 43 | SpecializedPostingsWriter::::default().into() 44 | } 45 | }) 46 | .unwrap_or_else(|| SpecializedPostingsWriter::::default().into()), 47 | FieldType::U64(_) 48 | | FieldType::I64(_) 49 | | FieldType::F64(_) 50 | | FieldType::Date(_) 51 | | FieldType::Bytes(_) 52 | | FieldType::Facet(_) => Box::new(SpecializedPostingsWriter::::default()), 53 | FieldType::JsonObject(ref json_object_options) => { 54 | if let Some(text_indexing_option) = json_object_options.get_text_indexing_options() { 55 | match text_indexing_option.index_option() { 56 | IndexRecordOption::Basic => { 57 | JsonPostingsWriter::::default().into() 58 | } 59 | IndexRecordOption::WithFreqs => { 60 | JsonPostingsWriter::::default().into() 61 | } 62 | IndexRecordOption::WithFreqsAndPositions => { 63 | JsonPostingsWriter::::default().into() 64 | } 65 | } 66 | } else { 67 | JsonPostingsWriter::::default().into() 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/postings/postings.rs: 
-------------------------------------------------------------------------------- 1 | use crate::docset::DocSet; 2 | 3 | /// Postings (also called an inverted list). 4 | /// 5 | /// For a given term, it is the list of doc ids of the documents 6 | /// containing the term. Optionally, for each document, 7 | /// it may also give access to the term frequency 8 | /// as well as the list of term positions. 9 | /// 10 | /// Its main implementation is `SegmentPostings`, 11 | /// but other implementations mocking `SegmentPostings` exist, 12 | /// for merging segments or for testing. 13 | pub trait Postings: DocSet + 'static { 14 | /// The number of times the term appears in the document. 15 | fn term_freq(&self) -> u32; 16 | 17 | /// Returns the positions offset by a given value. 18 | /// The output vector will be resized to the `term_freq`. 19 | fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>); 20 | 21 | /// Returns the positions of the term in the given document. 22 | /// The output vector will be resized to the `term_freq`. 23 | fn positions(&mut self, output: &mut Vec<u32>) { 24 | self.positions_with_offset(0u32, output); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/postings/stacker/mod.rs: -------------------------------------------------------------------------------- 1 | mod expull; 2 | mod memory_arena; 3 | mod term_hashmap; 4 | 5 | pub(crate) use self::expull::ExpUnrolledLinkedList; 6 | pub(crate) use self::memory_arena::{Addr, MemoryArena}; 7 | pub(crate) use self::term_hashmap::{compute_table_size, TermHashMap}; 8 | -------------------------------------------------------------------------------- /src/postings/term_info.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::iter::ExactSizeIterator; 3 | use std::ops::Range; 4 | 5 | use common::{BinarySerializable, FixedSize}; 6 | 7 | /// `TermInfo` wraps the metadata associated with a Term. 8 | /// It is segment-local. 9 | #[derive(Debug, Default, Eq, PartialEq, Clone)] 10 | pub struct TermInfo { 11 | /// Number of documents in the segment containing the term. 12 | pub doc_freq: u32, 13 | /// Byte range of the posting list within the postings (`.idx`) file. 14 | pub postings_range: Range<usize>, 15 | /// Byte range of this term's positions in the positions (`.pos`) file. 16 | pub positions_range: Range<usize>, 17 | } 18 | 19 | impl TermInfo { 20 | pub(crate) fn posting_num_bytes(&self) -> u32 { 21 | let num_bytes = self.postings_range.len(); 22 | assert!(num_bytes <= std::u32::MAX as usize); 23 | num_bytes as u32 24 | } 25 | 26 | pub(crate) fn positions_num_bytes(&self) -> u32 { 27 | let num_bytes = self.positions_range.len(); 28 | assert!(num_bytes <= std::u32::MAX as usize); 29 | num_bytes as u32 30 | } 31 | } 32 | 33 | impl FixedSize for TermInfo { 34 | /// Size required for the binary serialization of a `TermInfo` object. 35 | /// This is large, but in practice, `TermInfo`s are encoded in blocks and 36 | /// only the first `TermInfo` of a block is serialized uncompressed. 37 | /// The subsequent `TermInfo`s are delta-encoded and bitpacked.
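///
/// The layout mirrors `serialize` below: `doc_freq` (u32), postings start offset (u64),
/// postings length (u32), positions start offset (u64) and positions length (u32),
/// i.e. 3 * 4 + 2 * 8 = 28 bytes per uncompressed entry.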
38 | const SIZE_IN_BYTES: usize = 3 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES; 39 | } 40 | 41 | impl BinarySerializable for TermInfo { 42 | fn serialize(&self, writer: &mut W) -> io::Result<()> { 43 | self.doc_freq.serialize(writer)?; 44 | (self.postings_range.start as u64).serialize(writer)?; 45 | self.posting_num_bytes().serialize(writer)?; 46 | (self.positions_range.start as u64).serialize(writer)?; 47 | self.positions_num_bytes().serialize(writer)?; 48 | Ok(()) 49 | } 50 | 51 | fn deserialize(reader: &mut R) -> io::Result { 52 | let doc_freq = u32::deserialize(reader)?; 53 | let postings_start_offset = u64::deserialize(reader)? as usize; 54 | let postings_num_bytes = u32::deserialize(reader)? as usize; 55 | let postings_end_offset = postings_start_offset + postings_num_bytes; 56 | let positions_start_offset = u64::deserialize(reader)? as usize; 57 | let positions_num_bytes = u32::deserialize(reader)? as usize; 58 | let positions_end_offset = positions_start_offset + positions_num_bytes; 59 | Ok(TermInfo { 60 | doc_freq, 61 | postings_range: postings_start_offset..postings_end_offset, 62 | positions_range: positions_start_offset..positions_end_offset, 63 | }) 64 | } 65 | } 66 | 67 | #[cfg(test)] 68 | mod tests { 69 | 70 | use super::TermInfo; 71 | use crate::tests::fixed_size_test; 72 | 73 | // TODO add serialize/deserialize test for terminfo 74 | 75 | #[test] 76 | fn test_fixed_size() { 77 | fixed_size_test::(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/query/empty_query.rs: -------------------------------------------------------------------------------- 1 | use super::Scorer; 2 | use crate::docset::TERMINATED; 3 | use crate::query::explanation::does_not_match; 4 | use crate::query::{Explanation, Query, Weight}; 5 | use crate::{DocId, DocSet, Score, Searcher, SegmentReader}; 6 | 7 | /// `EmptyQuery` is a dummy `Query` in which no document matches. 8 | /// 9 | /// It is useful for tests and handling edge cases. 10 | #[derive(Clone, Debug)] 11 | pub struct EmptyQuery; 12 | 13 | impl Query for EmptyQuery { 14 | fn weight( 15 | &self, 16 | _searcher: &Searcher, 17 | _scoring_enabled: bool, 18 | ) -> crate::Result> { 19 | Ok(Box::new(EmptyWeight)) 20 | } 21 | 22 | fn count(&self, _searcher: &Searcher) -> crate::Result { 23 | Ok(0) 24 | } 25 | } 26 | 27 | /// `EmptyWeight` is a dummy `Weight` in which no document matches. 28 | /// 29 | /// It is useful for tests and handling edge cases. 30 | pub struct EmptyWeight; 31 | impl Weight for EmptyWeight { 32 | fn scorer(&self, _reader: &SegmentReader, _boost: Score) -> crate::Result> { 33 | Ok(Box::new(EmptyScorer)) 34 | } 35 | 36 | fn explain(&self, _reader: &SegmentReader, doc: DocId) -> crate::Result { 37 | Err(does_not_match(doc)) 38 | } 39 | } 40 | 41 | /// `EmptyScorer` is a dummy `Scorer` in which no document matches. 42 | /// 43 | /// It is useful for tests and handling edge cases. 
44 | pub struct EmptyScorer; 45 | 46 | impl DocSet for EmptyScorer { 47 | fn advance(&mut self) -> DocId { 48 | TERMINATED 49 | } 50 | 51 | fn doc(&self) -> DocId { 52 | TERMINATED 53 | } 54 | 55 | fn size_hint(&self) -> u32 { 56 | 0 57 | } 58 | } 59 | 60 | impl Scorer for EmptyScorer { 61 | fn score(&mut self) -> Score { 62 | 0.0 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | use crate::docset::TERMINATED; 69 | use crate::query::EmptyScorer; 70 | use crate::DocSet; 71 | 72 | #[test] 73 | fn test_empty_scorer() { 74 | let mut empty_scorer = EmptyScorer; 75 | assert_eq!(empty_scorer.doc(), TERMINATED); 76 | assert_eq!(empty_scorer.advance(), TERMINATED); 77 | assert_eq!(empty_scorer.doc(), TERMINATED); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/query/explanation.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | use serde::Serialize; 4 | 5 | use crate::{DocId, Score, TantivyError}; 6 | 7 | pub(crate) fn does_not_match(doc: DocId) -> TantivyError { 8 | TantivyError::InvalidArgument(format!("Document #({}) does not match", doc)) 9 | } 10 | 11 | /// Object describing the score of a given document. 12 | /// It is organized in trees. 13 | /// 14 | /// `.to_pretty_json()` can be useful to print out a human readable 15 | /// representation of this tree when debugging a given score. 16 | #[derive(Clone, Serialize)] 17 | pub struct Explanation { 18 | value: Score, 19 | description: String, 20 | #[serde(skip_serializing_if = "Vec::is_empty")] 21 | details: Vec, 22 | context: Vec, 23 | } 24 | 25 | impl fmt::Debug for Explanation { 26 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 27 | write!(f, "Explanation({})", self.to_pretty_json()) 28 | } 29 | } 30 | 31 | impl Explanation { 32 | /// Creates a new explanation object. 33 | pub fn new(description: T, value: Score) -> Explanation { 34 | Explanation { 35 | value, 36 | description: description.to_string(), 37 | details: vec![], 38 | context: vec![], 39 | } 40 | } 41 | 42 | /// Returns the value associated to the current node. 43 | pub fn value(&self) -> Score { 44 | self.value 45 | } 46 | 47 | /// Add some detail, explaining some part of the current node formula. 48 | /// 49 | /// Details are treated as child of the current node. 50 | pub fn add_detail(&mut self, child_explanation: Explanation) { 51 | self.details.push(child_explanation); 52 | } 53 | 54 | /// Adds some extra context to the explanation. 55 | pub fn add_context(&mut self, context: String) { 56 | self.context.push(context); 57 | } 58 | 59 | /// Shortcut for `self.details.push(Explanation::new(name, value));` 60 | pub fn add_const(&mut self, name: T, value: Score) { 61 | self.details.push(Explanation::new(name, value)); 62 | } 63 | 64 | /// Returns an indented json representation of the explanation tree for debug usage. 
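///
/// A small sketch, with hypothetical values:
/// ```ignore
/// let mut explanation = Explanation::new("TermQuery, product of ...", 2.4);
/// explanation.add_const("idf", 1.2);
/// println!("{}", explanation.to_pretty_json());
/// ```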
65 | pub fn to_pretty_json(&self) -> String { 66 | serde_json::to_string_pretty(self).unwrap() 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/query/more_like_this/mod.rs: -------------------------------------------------------------------------------- 1 | mod more_like_this; 2 | mod query; 3 | 4 | pub use self::more_like_this::MoreLikeThis; 5 | pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder}; 6 | -------------------------------------------------------------------------------- /src/query/query_parser/logical_ast.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::ops::Bound; 3 | 4 | use crate::query::Occur; 5 | use crate::schema::{Field, Term, Type}; 6 | use crate::Score; 7 | 8 | #[derive(Clone)] 9 | pub enum LogicalLiteral { 10 | Term(Term), 11 | Phrase(Vec<(usize, Term)>), 12 | Range { 13 | field: Field, 14 | value_type: Type, 15 | lower: Bound, 16 | upper: Bound, 17 | }, 18 | All, 19 | } 20 | 21 | pub enum LogicalAst { 22 | Clause(Vec<(Occur, LogicalAst)>), 23 | Leaf(Box), 24 | Boost(Box, Score), 25 | } 26 | 27 | impl LogicalAst { 28 | pub fn boost(self, boost: Score) -> LogicalAst { 29 | if (boost - 1.0).abs() < Score::EPSILON { 30 | self 31 | } else { 32 | LogicalAst::Boost(Box::new(self), boost) 33 | } 34 | } 35 | } 36 | 37 | fn occur_letter(occur: Occur) -> &'static str { 38 | match occur { 39 | Occur::Must => "+", 40 | Occur::MustNot => "-", 41 | Occur::Should => "", 42 | } 43 | } 44 | 45 | impl fmt::Debug for LogicalAst { 46 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 47 | match *self { 48 | LogicalAst::Clause(ref clause) => { 49 | if clause.is_empty() { 50 | write!(formatter, "")?; 51 | } else { 52 | let (ref occur, ref subquery) = clause[0]; 53 | write!(formatter, "({}{:?}", occur_letter(*occur), subquery)?; 54 | for &(ref occur, ref subquery) in &clause[1..] { 55 | write!(formatter, " {}{:?}", occur_letter(*occur), subquery)?; 56 | } 57 | formatter.write_str(")")?; 58 | } 59 | Ok(()) 60 | } 61 | LogicalAst::Boost(ref ast, boost) => write!(formatter, "{:?}^{}", ast, boost), 62 | LogicalAst::Leaf(ref literal) => write!(formatter, "{:?}", literal), 63 | } 64 | } 65 | } 66 | 67 | impl From for LogicalAst { 68 | fn from(literal: LogicalLiteral) -> LogicalAst { 69 | LogicalAst::Leaf(Box::new(literal)) 70 | } 71 | } 72 | 73 | impl fmt::Debug for LogicalLiteral { 74 | fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { 75 | match *self { 76 | LogicalLiteral::Term(ref term) => write!(formatter, "{:?}", term), 77 | LogicalLiteral::Phrase(ref terms) => write!(formatter, "\"{:?}\"", terms), 78 | LogicalLiteral::Range { 79 | ref lower, 80 | ref upper, 81 | .. 
82 | } => write!(formatter, "({:?} TO {:?})", lower, upper), 83 | LogicalLiteral::All => write!(formatter, "*"), 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/query/query_parser/mod.rs: -------------------------------------------------------------------------------- 1 | mod query_parser; 2 | 3 | pub mod logical_ast; 4 | pub use self::query_parser::{QueryParser, QueryParserError}; 5 | -------------------------------------------------------------------------------- /src/query/score_combiner.rs: -------------------------------------------------------------------------------- 1 | use crate::query::Scorer; 2 | use crate::Score; 3 | 4 | /// The `ScoreCombiner` trait defines how to compute 5 | /// an overall score given a list of scores. 6 | pub trait ScoreCombiner: Default + Clone + Send + Copy + 'static { 7 | /// Aggregates the score combiner with the given scorer. 8 | /// 9 | /// The `ScoreCombiner` may decide to call `.scorer.score()` 10 | /// or not. 11 | fn update(&mut self, scorer: &mut TScorer); 12 | 13 | /// Clears the score combiner state back to its initial state. 14 | fn clear(&mut self); 15 | 16 | /// Returns the aggregate score. 17 | fn score(&self) -> Score; 18 | } 19 | 20 | /// Just ignores scores. The `DoNothingCombiner` does not 21 | /// even call the scorers `.score()` function. 22 | /// 23 | /// It is useful to optimize the case when scoring is disabled. 24 | #[derive(Default, Clone, Copy)] //< these should not be too much work :) 25 | pub struct DoNothingCombiner; 26 | 27 | impl ScoreCombiner for DoNothingCombiner { 28 | fn update(&mut self, _scorer: &mut TScorer) {} 29 | 30 | fn clear(&mut self) {} 31 | 32 | fn score(&self) -> Score { 33 | 1.0 34 | } 35 | } 36 | 37 | /// Sums the score of different scorers. 38 | #[derive(Default, Clone, Copy)] 39 | pub struct SumCombiner { 40 | score: Score, 41 | } 42 | 43 | impl ScoreCombiner for SumCombiner { 44 | fn update(&mut self, scorer: &mut TScorer) { 45 | self.score += scorer.score(); 46 | } 47 | 48 | fn clear(&mut self) { 49 | self.score = 0.0; 50 | } 51 | 52 | fn score(&self) -> Score { 53 | self.score 54 | } 55 | } 56 | 57 | /// Sums the score of different scorers and keeps the count 58 | /// of scorers which matched. 59 | #[derive(Default, Clone, Copy)] 60 | pub struct SumWithCoordsCombiner { 61 | num_fields: usize, 62 | score: Score, 63 | } 64 | 65 | impl ScoreCombiner for SumWithCoordsCombiner { 66 | fn update(&mut self, scorer: &mut TScorer) { 67 | self.score += scorer.score(); 68 | self.num_fields += 1; 69 | } 70 | 71 | fn clear(&mut self) { 72 | self.score = 0.0; 73 | self.num_fields = 0; 74 | } 75 | 76 | fn score(&self) -> Score { 77 | self.score 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/query/scorer.rs: -------------------------------------------------------------------------------- 1 | use std::ops::DerefMut; 2 | 3 | use downcast_rs::impl_downcast; 4 | 5 | use crate::docset::DocSet; 6 | use crate::{DocId, Score}; 7 | 8 | /// Scored set of documents matching a query within a specific segment. 9 | /// 10 | /// See [`Query`](./trait.Query.html). 11 | pub trait Scorer: downcast_rs::Downcast + DocSet + 'static { 12 | /// Returns the score. 13 | /// 14 | /// This method will perform a bit of computation and is not cached. 
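///
/// Because the value is recomputed on every call, a caller that needs the score several
/// times for the same document should read it once and reuse the local copy, e.g.
/// `let score = scorer.score();`.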
15 | fn score(&mut self) -> Score; 16 | } 17 | 18 | impl_downcast!(Scorer); 19 | 20 | impl Scorer for Box { 21 | fn score(&mut self) -> Score { 22 | self.deref_mut().score() 23 | } 24 | } 25 | 26 | /// Wraps a `DocSet` and simply returns a constant `Scorer`. 27 | /// The `ConstScorer` is useful if you have a `DocSet` where 28 | /// you needed a scorer. 29 | /// 30 | /// The `ConstScorer`'s constant score can be set 31 | /// by calling `.set_score(...)`. 32 | pub struct ConstScorer { 33 | docset: TDocSet, 34 | score: Score, 35 | } 36 | 37 | impl ConstScorer { 38 | /// Creates a new `ConstScorer`. 39 | pub fn new(docset: TDocSet, score: Score) -> ConstScorer { 40 | ConstScorer { docset, score } 41 | } 42 | } 43 | 44 | impl From for ConstScorer { 45 | fn from(docset: TDocSet) -> Self { 46 | ConstScorer::new(docset, 1.0) 47 | } 48 | } 49 | 50 | impl DocSet for ConstScorer { 51 | fn advance(&mut self) -> DocId { 52 | self.docset.advance() 53 | } 54 | 55 | fn seek(&mut self, target: DocId) -> DocId { 56 | self.docset.seek(target) 57 | } 58 | 59 | fn fill_buffer(&mut self, buffer: &mut [DocId]) -> usize { 60 | self.docset.fill_buffer(buffer) 61 | } 62 | 63 | fn doc(&self) -> DocId { 64 | self.docset.doc() 65 | } 66 | 67 | fn size_hint(&self) -> u32 { 68 | self.docset.size_hint() 69 | } 70 | } 71 | 72 | impl Scorer for ConstScorer { 73 | fn score(&mut self) -> Score { 74 | self.score 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/query/vec_docset.rs: -------------------------------------------------------------------------------- 1 | #![allow(dead_code)] 2 | 3 | use common::HasLen; 4 | 5 | use crate::docset::{DocSet, TERMINATED}; 6 | use crate::DocId; 7 | 8 | /// Simulate a `Postings` objects from a `VecPostings`. 9 | /// `VecPostings` only exist for testing purposes. 10 | /// 11 | /// Term frequencies always return 1. 12 | /// No positions are returned. 
13 | pub struct VecDocSet { 14 | doc_ids: Vec, 15 | cursor: usize, 16 | } 17 | 18 | impl From> for VecDocSet { 19 | fn from(doc_ids: Vec) -> VecDocSet { 20 | VecDocSet { doc_ids, cursor: 0 } 21 | } 22 | } 23 | 24 | impl DocSet for VecDocSet { 25 | fn advance(&mut self) -> DocId { 26 | self.cursor += 1; 27 | if self.cursor >= self.doc_ids.len() { 28 | self.cursor = self.doc_ids.len(); 29 | return TERMINATED; 30 | } 31 | self.doc() 32 | } 33 | 34 | fn doc(&self) -> DocId { 35 | if self.cursor == self.doc_ids.len() { 36 | return TERMINATED; 37 | } 38 | self.doc_ids[self.cursor] 39 | } 40 | 41 | fn size_hint(&self) -> u32 { 42 | self.len() as u32 43 | } 44 | } 45 | 46 | impl HasLen for VecDocSet { 47 | fn len(&self) -> usize { 48 | self.doc_ids.len() 49 | } 50 | } 51 | 52 | #[cfg(test)] 53 | pub mod tests { 54 | 55 | use super::*; 56 | use crate::docset::DocSet; 57 | use crate::DocId; 58 | 59 | #[test] 60 | pub fn test_vec_postings() { 61 | let doc_ids: Vec = (0u32..1024u32).map(|e| e * 3).collect(); 62 | let mut postings = VecDocSet::from(doc_ids); 63 | assert_eq!(postings.doc(), 0u32); 64 | assert_eq!(postings.advance(), 3u32); 65 | assert_eq!(postings.doc(), 3u32); 66 | assert_eq!(postings.seek(14u32), 15u32); 67 | assert_eq!(postings.doc(), 15u32); 68 | assert_eq!(postings.seek(300u32), 300u32); 69 | assert_eq!(postings.doc(), 300u32); 70 | assert_eq!(postings.seek(6000u32), TERMINATED); 71 | } 72 | 73 | #[test] 74 | pub fn test_fill_buffer() { 75 | let doc_ids: Vec = (1u32..210u32).collect(); 76 | let mut postings = VecDocSet::from(doc_ids); 77 | let mut buffer = vec![1000u32; 100]; 78 | assert_eq!(postings.fill_buffer(&mut buffer[..]), 100); 79 | for i in 0u32..100u32 { 80 | assert_eq!(buffer[i as usize], i + 1); 81 | } 82 | assert_eq!(postings.fill_buffer(&mut buffer[..]), 100); 83 | for i in 0u32..100u32 { 84 | assert_eq!(buffer[i as usize], i + 101); 85 | } 86 | assert_eq!(postings.fill_buffer(&mut buffer[..]), 9); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/query/weight.rs: -------------------------------------------------------------------------------- 1 | use super::Scorer; 2 | use crate::core::SegmentReader; 3 | use crate::query::Explanation; 4 | use crate::{DocId, Score, TERMINATED}; 5 | 6 | /// Iterates through all of the document matched by the DocSet 7 | /// `DocSet` and push the scored documents to the collector. 8 | pub(crate) fn for_each_scorer( 9 | scorer: &mut TScorer, 10 | callback: &mut dyn FnMut(DocId, Score), 11 | ) { 12 | let mut doc = scorer.doc(); 13 | while doc != TERMINATED { 14 | callback(doc, scorer.score()); 15 | doc = scorer.advance(); 16 | } 17 | } 18 | 19 | /// Calls `callback` with all of the `(doc, score)` for which score 20 | /// is exceeding a given threshold. 21 | /// 22 | /// This method is useful for the TopDocs collector. 23 | /// For all docsets, the blanket implementation has the benefit 24 | /// of prefiltering (doc, score) pairs, avoiding the 25 | /// virtual dispatch cost. 26 | /// 27 | /// More importantly, it makes it possible for scorers to implement 28 | /// important optimization (e.g. BlockWAND for union). 
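///
/// A sketch of the callback contract, with a hypothetical caller: the value returned by
/// the callback becomes the new threshold.
/// ```ignore
/// let mut best_score = 0.0f32;
/// for_each_pruning_scorer(&mut scorer, 0.0, &mut |_doc, score| {
///     best_score = best_score.max(score);
///     best_score // from now on, only documents beating this score are reported
/// });
/// ```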
29 | pub(crate) fn for_each_pruning_scorer( 30 | scorer: &mut TScorer, 31 | mut threshold: Score, 32 | callback: &mut dyn FnMut(DocId, Score) -> Score, 33 | ) { 34 | let mut doc = scorer.doc(); 35 | while doc != TERMINATED { 36 | let score = scorer.score(); 37 | if score > threshold { 38 | threshold = callback(doc, score); 39 | } 40 | doc = scorer.advance(); 41 | } 42 | } 43 | 44 | /// A Weight is the specialization of a Query 45 | /// for a given set of segments. 46 | /// 47 | /// See [`Query`](./trait.Query.html). 48 | pub trait Weight: Send + Sync + 'static { 49 | /// Returns the scorer for the given segment. 50 | /// 51 | /// `boost` is a multiplier to apply to the score. 52 | /// 53 | /// See [`Query`](./trait.Query.html). 54 | fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result>; 55 | 56 | /// Returns an `Explanation` for the given document. 57 | fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result; 58 | 59 | /// Returns the number documents within the given `SegmentReader`. 60 | fn count(&self, reader: &SegmentReader) -> crate::Result { 61 | let mut scorer = self.scorer(reader, 1.0)?; 62 | if let Some(alive_bitset) = reader.alive_bitset() { 63 | Ok(scorer.count(alive_bitset)) 64 | } else { 65 | Ok(scorer.count_including_deleted()) 66 | } 67 | } 68 | 69 | /// Iterates through all of the document matched by the DocSet 70 | /// `DocSet` and push the scored documents to the collector. 71 | fn for_each( 72 | &self, 73 | reader: &SegmentReader, 74 | callback: &mut dyn FnMut(DocId, Score), 75 | ) -> crate::Result<()> { 76 | let mut scorer = self.scorer(reader, 1.0)?; 77 | for_each_scorer(scorer.as_mut(), callback); 78 | Ok(()) 79 | } 80 | 81 | /// Calls `callback` with all of the `(doc, score)` for which score 82 | /// is exceeding a given threshold. 83 | /// 84 | /// This method is useful for the TopDocs collector. 85 | /// For all docsets, the blanket implementation has the benefit 86 | /// of prefiltering (doc, score) pairs, avoiding the 87 | /// virtual dispatch cost. 88 | /// 89 | /// More importantly, it makes it possible for scorers to implement 90 | /// important optimization (e.g. BlockWAND for union). 91 | fn for_each_pruning( 92 | &self, 93 | threshold: Score, 94 | reader: &SegmentReader, 95 | callback: &mut dyn FnMut(DocId, Score) -> Score, 96 | ) -> crate::Result<()> { 97 | let mut scorer = self.scorer(reader, 1.0)?; 98 | for_each_pruning_scorer(scorer.as_mut(), threshold, callback); 99 | Ok(()) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/schema/facet_options.rs: -------------------------------------------------------------------------------- 1 | use std::ops::BitOr; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::schema::flags::{IndexedFlag, SchemaFlagList, StoredFlag}; 6 | 7 | /// Define how a facet field should be handled by tantivy. 8 | /// 9 | /// Note that a Facet is always indexed and stored as a fastfield. 10 | #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] 11 | pub struct FacetOptions { 12 | stored: bool, 13 | } 14 | 15 | impl FacetOptions { 16 | /// Returns true if the value is stored. 17 | pub fn is_stored(&self) -> bool { 18 | self.stored 19 | } 20 | 21 | /// Set the field as stored. 22 | /// 23 | /// Only the fields that are set as *stored* are 24 | /// persisted into the Tantivy's store. 
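///
/// A minimal sketch: `let options = FacetOptions::default().set_stored();`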
25 | #[must_use] 26 | pub fn set_stored(mut self) -> FacetOptions { 27 | self.stored = true; 28 | self 29 | } 30 | } 31 | 32 | impl From<()> for FacetOptions { 33 | fn from(_: ()) -> FacetOptions { 34 | FacetOptions::default() 35 | } 36 | } 37 | 38 | impl From for FacetOptions { 39 | fn from(_: StoredFlag) -> Self { 40 | FacetOptions { stored: true } 41 | } 42 | } 43 | 44 | impl> BitOr for FacetOptions { 45 | type Output = FacetOptions; 46 | 47 | fn bitor(self, other: T) -> FacetOptions { 48 | let other = other.into(); 49 | FacetOptions { 50 | stored: self.stored | other.stored, 51 | } 52 | } 53 | } 54 | 55 | impl From> for FacetOptions 56 | where 57 | Head: Clone, 58 | Tail: Clone, 59 | Self: BitOr + From + From, 60 | { 61 | fn from(head_tail: SchemaFlagList) -> Self { 62 | Self::from(head_tail.head) | Self::from(head_tail.tail) 63 | } 64 | } 65 | 66 | impl From for FacetOptions { 67 | fn from(_: IndexedFlag) -> Self { 68 | FacetOptions { stored: false } 69 | } 70 | } 71 | 72 | #[cfg(test)] 73 | mod tests { 74 | use crate::schema::{FacetOptions, INDEXED}; 75 | 76 | #[test] 77 | fn test_from_index_flag() { 78 | let facet_option = FacetOptions::from(INDEXED); 79 | assert_eq!(facet_option, FacetOptions::default()); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/schema/field.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::{Read, Write}; 3 | 4 | use common::BinarySerializable; 5 | 6 | /// `Field` is represented by an unsigned 32-bit integer type 7 | /// The schema holds the mapping between field names and `Field` objects. 8 | #[derive( 9 | Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, serde::Serialize, serde::Deserialize, 10 | )] 11 | pub struct Field(u32); 12 | 13 | impl Field { 14 | /// Create a new field object for the given FieldId. 15 | pub const fn from_field_id(field_id: u32) -> Field { 16 | Field(field_id) 17 | } 18 | 19 | /// Returns a u32 identifying uniquely a field within a schema. 20 | pub const fn field_id(self) -> u32 { 21 | self.0 22 | } 23 | } 24 | 25 | impl BinarySerializable for Field { 26 | fn serialize(&self, writer: &mut W) -> io::Result<()> { 27 | self.0.serialize(writer) 28 | } 29 | 30 | fn deserialize(reader: &mut R) -> io::Result { 31 | u32::deserialize(reader).map(Field) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/schema/field_value.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read, Write}; 2 | 3 | use common::BinarySerializable; 4 | 5 | use crate::schema::{Field, Value}; 6 | 7 | /// `FieldValue` holds together a `Field` and its `Value`. 
8 | #[allow(missing_docs)] 9 | #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] 10 | pub struct FieldValue { 11 | pub field: Field, 12 | pub value: Value, 13 | } 14 | 15 | impl FieldValue { 16 | /// Constructor 17 | pub fn new(field: Field, value: Value) -> FieldValue { 18 | FieldValue { field, value } 19 | } 20 | 21 | /// Field accessor 22 | pub fn field(&self) -> Field { 23 | self.field 24 | } 25 | 26 | /// Value accessor 27 | pub fn value(&self) -> &Value { 28 | &self.value 29 | } 30 | } 31 | 32 | impl From<FieldValue> for Value { 33 | fn from(field_value: FieldValue) -> Self { 34 | field_value.value 35 | } 36 | } 37 | 38 | impl BinarySerializable for FieldValue { 39 | fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> { 40 | self.field.serialize(writer)?; 41 | self.value.serialize(writer) 42 | } 43 | 44 | fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> { 45 | let field = Field::deserialize(reader)?; 46 | let value = Value::deserialize(reader)?; 47 | Ok(FieldValue { field, value }) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/schema/flags.rs: -------------------------------------------------------------------------------- 1 | use std::ops::BitOr; 2 | 3 | use crate::schema::{NumericOptions, TextOptions}; 4 | 5 | #[derive(Clone)] 6 | pub struct StoredFlag; 7 | /// Flag to mark the field as stored. 8 | /// This flag can apply to any kind of field. 9 | /// 10 | /// The stored fields of a document can be retrieved given its `DocId`. 11 | /// Stored fields are stored together and compressed. 12 | /// Reading the stored fields of a document is relatively slow. 13 | /// (~ 100 microsecs) 14 | /// 15 | /// It should not be used during scoring or collection. 16 | pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList { 17 | head: StoredFlag, 18 | tail: (), 19 | }; 20 | 21 | #[derive(Clone)] 22 | pub struct IndexedFlag; 23 | /// Flag to mark the field as indexed. An indexed field is searchable and has a fieldnorm. 24 | /// 25 | /// The `INDEXED` flag can only be used when building `NumericOptions` (`u64`, `i64` and `f64` 26 | /// fields). Of course, text fields can also be indexed... But this is expressed by using either the 27 | /// `STRING` (untokenized) or `TEXT` (tokenized with the English tokenizer) flags. 28 | pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList { 29 | head: IndexedFlag, 30 | tail: (), 31 | }; 32 | 33 | #[derive(Clone)] 34 | pub struct FastFlag; 35 | /// Flag to mark the field as a fast field (similar to Lucene's DocValues). 36 | /// 37 | /// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering 38 | /// or collection should be marked as fast fields.
39 | /// The `FAST` flag can only be used when building `NumericOptions` (`u64`, `i64` and `f64` fields). 40 | pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList { 41 | head: FastFlag, 42 | tail: (), 43 | }; 44 | 45 | impl<Head, OldHead, OldTail> BitOr<SchemaFlagList<Head, ()>> for SchemaFlagList<OldHead, OldTail> 46 | where 47 | Head: Clone, 48 | OldHead: Clone, 49 | OldTail: Clone, 50 | { 51 | type Output = SchemaFlagList<Head, SchemaFlagList<OldHead, OldTail>>; 52 | 53 | fn bitor(self, head: SchemaFlagList<Head, ()>) -> Self::Output { 54 | SchemaFlagList { 55 | head: head.head, 56 | tail: self, 57 | } 58 | } 59 | } 60 | 61 | impl<T: Into<NumericOptions>> BitOr<NumericOptions> for SchemaFlagList<T, ()> { 62 | type Output = NumericOptions; 63 | 64 | fn bitor(self, rhs: NumericOptions) -> Self::Output { 65 | self.head.into() | rhs 66 | } 67 | } 68 | 69 | impl<T: Into<TextOptions>> BitOr<TextOptions> for SchemaFlagList<T, ()> { 70 | type Output = TextOptions; 71 | 72 | fn bitor(self, rhs: TextOptions) -> Self::Output { 73 | self.head.into() | rhs 74 | } 75 | } 76 | 77 | #[derive(Clone)] 78 | pub struct SchemaFlagList<Head, Tail> { 79 | pub head: Head, 80 | pub tail: Tail, 81 | } 82 | -------------------------------------------------------------------------------- /src/schema/index_record_option.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | /// `IndexRecordOption` describes the amount of information associated 4 | /// with a given indexed field. 5 | /// 6 | /// It is both used to: 7 | /// 8 | /// * describe in the schema the amount of information 9 | /// that should be retained during indexing (See 10 | /// [`TextFieldIndexing.html.set_index_option`]( 11 | /// ../schema/struct.TextFieldIndexing.html#method.set_index_option)) 12 | /// * request a given 13 | /// amount of information to be decoded as one goes through a posting list. 14 | /// (See [`InvertedIndexReader.read_postings`]( 15 | /// ../struct.InvertedIndexReader.html#method.read_postings)) 16 | #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)] 17 | pub enum IndexRecordOption { 18 | /// records only the `DocId`s 19 | #[serde(rename = "basic")] 20 | Basic, 21 | /// records the document ids as well as the term frequency. 22 | /// The term frequency can help give better scoring of the documents. 23 | #[serde(rename = "freq")] 24 | WithFreqs, 25 | /// records the document id, the term frequency and the positions of 26 | /// the occurrences in the document. 27 | /// Positions are required to run [PhraseQueries](../query/struct.PhraseQuery.html). 28 | #[serde(rename = "position")] 29 | WithFreqsAndPositions, 30 | } 31 | 32 | impl Default for IndexRecordOption { 33 | fn default() -> Self { 34 | IndexRecordOption::Basic 35 | } 36 | } 37 | 38 | impl IndexRecordOption { 39 | /// Returns true if this option includes encoding 40 | /// term frequencies. 41 | pub fn has_freq(self) -> bool { 42 | match self { 43 | IndexRecordOption::Basic => false, 44 | IndexRecordOption::WithFreqs | IndexRecordOption::WithFreqsAndPositions => true, 45 | } 46 | } 47 | 48 | /// Returns true if this option includes encoding 49 | /// term positions.
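///
/// For instance, `IndexRecordOption::WithFreqsAndPositions.has_positions()` is `true`,
/// while `IndexRecordOption::WithFreqs.has_positions()` is `false`.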
50 | pub fn has_positions(self) -> bool { 51 | match self { 52 | IndexRecordOption::Basic | IndexRecordOption::WithFreqs => false, 53 | IndexRecordOption::WithFreqsAndPositions => true, 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/schema/named_field_document.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::schema::Value; 6 | 7 | /// Internal representation of a document used for JSON 8 | /// serialization. 9 | /// 10 | /// A `NamedFieldDocument` is a simple representation of a document 11 | /// as a `BTreeMap>`. 12 | #[derive(Debug, Deserialize, Serialize)] 13 | pub struct NamedFieldDocument(pub BTreeMap>); 14 | -------------------------------------------------------------------------------- /src/store/compression_brotli.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | #[inline] 4 | pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { 5 | let params = brotli::enc::BrotliEncoderParams { 6 | quality: 5, 7 | ..Default::default() 8 | }; 9 | compressed.clear(); 10 | brotli::BrotliCompress(&mut uncompressed, compressed, ¶ms)?; 11 | Ok(()) 12 | } 13 | 14 | #[inline] 15 | pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec) -> io::Result<()> { 16 | decompressed.clear(); 17 | brotli::BrotliDecompress(&mut compressed, decompressed)?; 18 | Ok(()) 19 | } 20 | -------------------------------------------------------------------------------- /src/store/compression_lz4_block.rs: -------------------------------------------------------------------------------- 1 | use core::convert::TryInto; 2 | use std::io::{self}; 3 | use std::mem; 4 | 5 | use lz4_flex::{compress_into, decompress_into}; 6 | 7 | #[inline] 8 | #[allow(clippy::uninit_vec)] 9 | pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { 10 | compressed.clear(); 11 | let maximum_ouput_size = 12 | mem::size_of::() + lz4_flex::block::get_maximum_output_size(uncompressed.len()); 13 | compressed.reserve(maximum_ouput_size); 14 | unsafe { 15 | compressed.set_len(maximum_ouput_size); 16 | } 17 | let bytes_written = compress_into(uncompressed, &mut compressed[4..]) 18 | .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?; 19 | let num_bytes = uncompressed.len() as u32; 20 | compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes()); 21 | unsafe { 22 | compressed.set_len(bytes_written + mem::size_of::()); 23 | } 24 | Ok(()) 25 | } 26 | 27 | #[inline] 28 | #[allow(clippy::uninit_vec)] 29 | pub fn decompress(compressed: &[u8], decompressed: &mut Vec) -> io::Result<()> { 30 | decompressed.clear(); 31 | let uncompressed_size_bytes: &[u8; 4] = compressed 32 | .get(..4) 33 | .ok_or(io::ErrorKind::InvalidData)? 
34 | .try_into() 35 | .unwrap(); 36 | let uncompressed_size = u32::from_le_bytes(*uncompressed_size_bytes) as usize; 37 | decompressed.reserve(uncompressed_size); 38 | unsafe { 39 | decompressed.set_len(uncompressed_size); 40 | } 41 | let bytes_written = decompress_into(&compressed[4..], decompressed) 42 | .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?; 43 | if bytes_written != uncompressed_size { 44 | return Err(io::Error::new( 45 | io::ErrorKind::InvalidData, 46 | "doc store block not completely decompressed, data corruption".to_string(), 47 | )); 48 | } 49 | Ok(()) 50 | } 51 | -------------------------------------------------------------------------------- /src/store/compression_snap.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read, Write}; 2 | 3 | #[inline] 4 | pub fn compress(uncompressed: &[u8], compressed: &mut Vec) -> io::Result<()> { 5 | compressed.clear(); 6 | let mut encoder = snap::write::FrameEncoder::new(compressed); 7 | encoder.write_all(uncompressed)?; 8 | encoder.flush()?; 9 | Ok(()) 10 | } 11 | 12 | #[inline] 13 | pub fn decompress(compressed: &[u8], decompressed: &mut Vec) -> io::Result<()> { 14 | decompressed.clear(); 15 | snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?; 16 | Ok(()) 17 | } 18 | -------------------------------------------------------------------------------- /src/store/footer.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use common::{BinarySerializable, FixedSize, HasLen}; 4 | 5 | use crate::directory::FileSlice; 6 | use crate::store::Compressor; 7 | 8 | #[derive(Debug, Clone, PartialEq)] 9 | pub struct DocStoreFooter { 10 | pub offset: u64, 11 | pub compressor: Compressor, 12 | } 13 | 14 | /// Serialises the footer to a byte-array 15 | /// - offset : 8 bytes 16 | /// - compressor id: 1 byte 17 | /// - reserved for future use: 15 bytes 18 | impl BinarySerializable for DocStoreFooter { 19 | fn serialize(&self, writer: &mut W) -> io::Result<()> { 20 | BinarySerializable::serialize(&self.offset, writer)?; 21 | BinarySerializable::serialize(&self.compressor.get_id(), writer)?; 22 | writer.write_all(&[0; 15])?; 23 | Ok(()) 24 | } 25 | 26 | fn deserialize(reader: &mut R) -> io::Result { 27 | let offset = u64::deserialize(reader)?; 28 | let compressor_id = u8::deserialize(reader)?; 29 | let mut skip_buf = [0; 15]; 30 | reader.read_exact(&mut skip_buf)?; 31 | Ok(DocStoreFooter { 32 | offset, 33 | compressor: Compressor::from_id(compressor_id), 34 | }) 35 | } 36 | } 37 | 38 | impl FixedSize for DocStoreFooter { 39 | const SIZE_IN_BYTES: usize = 24; 40 | } 41 | 42 | impl DocStoreFooter { 43 | pub fn new(offset: u64, compressor: Compressor) -> Self { 44 | DocStoreFooter { offset, compressor } 45 | } 46 | 47 | pub fn extract_footer(file: FileSlice) -> io::Result<(DocStoreFooter, FileSlice)> { 48 | if file.len() < DocStoreFooter::SIZE_IN_BYTES { 49 | return Err(io::Error::new( 50 | io::ErrorKind::UnexpectedEof, 51 | format!( 52 | "File corrupted. 
The file is smaller than Footer::SIZE_IN_BYTES (len={}).",
53 | file.len()
54 | ),
55 | ));
56 | }
57 | let (body, footer_slice) = file.split_from_end(DocStoreFooter::SIZE_IN_BYTES);
58 | let mut footer_bytes = footer_slice.read_bytes()?;
59 | let footer = DocStoreFooter::deserialize(&mut footer_bytes)?;
60 | Ok((footer, body))
61 | }
62 | }
63 |
64 | #[test]
65 | fn doc_store_footer_test() {
66 | // This test is just to safeguard changes on the footer.
67 | // When the doc store footer is updated, make sure to also update the serialize/deserialize
68 | // methods.
69 | assert_eq!(core::mem::size_of::<DocStoreFooter>(), 16);
70 | }
71 |
-------------------------------------------------------------------------------- /src/store/index/skip_index.rs: --------------------------------------------------------------------------------
1 | use common::{BinarySerializable, VInt};
2 |
3 | use crate::directory::OwnedBytes;
4 | use crate::store::index::block::CheckpointBlock;
5 | use crate::store::index::Checkpoint;
6 | use crate::DocId;
7 |
8 | pub struct LayerCursor<'a> {
9 | remaining: &'a [u8],
10 | block: CheckpointBlock,
11 | cursor: usize,
12 | }
13 |
14 | impl<'a> Iterator for LayerCursor<'a> {
15 | type Item = Checkpoint;
16 |
17 | fn next(&mut self) -> Option<Checkpoint> {
18 | if self.cursor == self.block.len() {
19 | if self.remaining.is_empty() {
20 | return None;
21 | }
22 | let (block_mut, remaining_mut) = (&mut self.block, &mut self.remaining);
23 | block_mut.deserialize(remaining_mut).ok()?;
24 | self.cursor = 0;
25 | }
26 | let res = Some(self.block.get(self.cursor));
27 | self.cursor += 1;
28 | res
29 | }
30 | }
31 |
32 | struct Layer {
33 | data: OwnedBytes,
34 | }
35 |
36 | impl Layer {
37 | fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
38 | self.cursor_at_offset(0)
39 | }
40 |
41 | fn cursor_at_offset(&self, start_offset: usize) -> impl Iterator<Item = Checkpoint> + '_ {
42 | let data = &self.data.as_slice();
43 | LayerCursor {
44 | remaining: &data[start_offset..],
45 | block: CheckpointBlock::default(),
46 | cursor: 0,
47 | }
48 | }
49 |
50 | fn seek_start_at_offset(&self, target: DocId, offset: usize) -> Option<Checkpoint> {
51 | self.cursor_at_offset(offset)
52 | .find(|checkpoint| checkpoint.doc_range.end > target)
53 | }
54 | }
55 |
56 | pub struct SkipIndex {
57 | layers: Vec<Layer>,
58 | }
59 |
60 | impl SkipIndex {
61 | pub fn open(mut data: OwnedBytes) -> SkipIndex {
62 | let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
63 | .unwrap()
64 | .into_iter()
65 | .map(|el| el.0)
66 | .collect();
67 | let mut start_offset = 0;
68 | let mut layers = Vec::new();
69 | for end_offset in offsets {
70 | let layer = Layer {
71 | data: data.slice(start_offset as usize..end_offset as usize),
72 | };
73 | layers.push(layer);
74 | start_offset = end_offset;
75 | }
76 | SkipIndex { layers }
77 | }
78 |
79 | pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
80 | self.layers
81 | .last()
82 | .into_iter()
83 | .flat_map(|layer| layer.cursor())
84 | }
85 |
86 | pub fn seek(&self, target: DocId) -> Option<Checkpoint> {
87 | let first_layer_len = self
88 | .layers
89 | .first()
90 | .map(|layer| layer.data.len())
91 | .unwrap_or(0);
92 | let mut cur_checkpoint = Checkpoint {
93 | doc_range: 0u32..1u32,
94 | byte_range: 0..first_layer_len,
95 | };
96 | for layer in &self.layers {
97 | if let Some(checkpoint) =
98 | layer.seek_start_at_offset(target, cur_checkpoint.byte_range.start)
99 | {
100 | cur_checkpoint = checkpoint;
101 | } else {
102 | return None;
103 | }
104 | }
105 | Some(cur_checkpoint)
106 | }
107 | }
108 |
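Note (editorial sketch, not part of the repository): `SkipIndexBuilder` in the next file produces the bytes that `SkipIndex::open` reads back, and `seek` walks the layers until it finds the first checkpoint whose `doc_range.end` exceeds the target `DocId`. A minimal, hypothetical usage sketch of this crate-internal API, assuming `OwnedBytes::new` accepts a `Vec<u8>` (as the `ownedbytes` crate provides):

// Sketch: build a two-checkpoint skip index, serialize it, then seek into it.
let mut builder = SkipIndexBuilder::new();
builder.insert(Checkpoint { doc_range: 0..10, byte_range: 0..115 });
builder.insert(Checkpoint { doc_range: 10..25, byte_range: 115..230 });
let mut buffer: Vec<u8> = Vec::new();
builder.write(&mut buffer).unwrap();

let skip_index = SkipIndex::open(OwnedBytes::new(buffer));
// Doc 12 falls in the second block, so that block's checkpoint is returned.
assert_eq!(
    skip_index.seek(12u32).map(|checkpoint| checkpoint.byte_range),
    Some(115..230)
);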
-------------------------------------------------------------------------------- /src/store/index/skip_index_builder.rs: --------------------------------------------------------------------------------
1 | use std::io;
2 | use std::io::Write;
3 |
4 | use common::{BinarySerializable, VInt};
5 |
6 | use crate::store::index::block::CheckpointBlock;
7 | use crate::store::index::{Checkpoint, CHECKPOINT_PERIOD};
8 |
9 | // Each skip layer contains an iterator over pairs (last doc in block, offset to start of block).
10 |
11 | struct LayerBuilder {
12 | buffer: Vec<u8>,
13 | pub block: CheckpointBlock,
14 | }
15 |
16 | impl LayerBuilder {
17 | fn finish(self) -> Vec<u8> {
18 | self.buffer
19 | }
20 |
21 | fn new() -> LayerBuilder {
22 | LayerBuilder {
23 | buffer: Vec::new(),
24 | block: CheckpointBlock::default(),
25 | }
26 | }
27 |
28 | /// Serializes the block and returns a checkpoint representing
29 | /// the entire block.
30 | ///
31 | /// If the block was empty to begin with, simply returns `None`.
32 | fn flush_block(&mut self) -> Option<Checkpoint> {
33 | if let Some(doc_range) = self.block.doc_interval() {
34 | let start_offset = self.buffer.len();
35 | self.block.serialize(&mut self.buffer);
36 | let end_offset = self.buffer.len();
37 | self.block.clear();
38 | Some(Checkpoint {
39 | doc_range,
40 | byte_range: start_offset..end_offset,
41 | })
42 | } else {
43 | None
44 | }
45 | }
46 |
47 | fn push(&mut self, checkpoint: Checkpoint) {
48 | self.block.push(checkpoint);
49 | }
50 |
51 | fn insert(&mut self, checkpoint: Checkpoint) -> Option<Checkpoint> {
52 | self.push(checkpoint);
53 | let emit_skip_info = self.block.len() >= CHECKPOINT_PERIOD;
54 | if emit_skip_info {
55 | self.flush_block()
56 | } else {
57 | None
58 | }
59 | }
60 | }
61 |
62 | pub struct SkipIndexBuilder {
63 | layers: Vec<LayerBuilder>,
64 | }
65 |
66 | impl SkipIndexBuilder {
67 | pub fn new() -> SkipIndexBuilder {
68 | SkipIndexBuilder { layers: Vec::new() }
69 | }
70 |
71 | fn get_layer(&mut self, layer_id: usize) -> &mut LayerBuilder {
72 | if layer_id == self.layers.len() {
73 | let layer_builder = LayerBuilder::new();
74 | self.layers.push(layer_builder);
75 | }
76 | &mut self.layers[layer_id]
77 | }
78 |
79 | pub fn insert(&mut self, checkpoint: Checkpoint) {
80 | let mut skip_pointer = Some(checkpoint);
81 | for layer_id in 0.. {
82 | if let Some(checkpoint) = skip_pointer {
83 | skip_pointer = self.get_layer(layer_id).insert(checkpoint);
84 | } else {
85 | break;
86 | }
87 | }
88 | }
89 |
90 | pub fn write<W: io::Write>(mut self, output: &mut W) -> io::Result<()> {
91 | let mut last_pointer = None;
92 | for skip_layer in self.layers.iter_mut() {
93 | if let Some(checkpoint) = last_pointer {
94 | skip_layer.push(checkpoint);
95 | }
96 | last_pointer = skip_layer.flush_block();
97 | }
98 | let layer_buffers: Vec<Vec<u8>> = self
99 | .layers
100 | .into_iter()
101 | .rev()
102 | .map(|layer| layer.finish())
103 | .collect();
104 |
105 | let mut layer_offset = 0;
106 | let mut layer_sizes = Vec::new();
107 | for layer_buffer in &layer_buffers {
108 | layer_offset += layer_buffer.len() as u64;
109 | layer_sizes.push(VInt(layer_offset));
110 | }
111 | layer_sizes.serialize(output)?;
112 | for layer_buffer in layer_buffers {
113 | output.write_all(&layer_buffer[..])?;
114 | }
115 | Ok(())
116 | }
117 | }
118 |
-------------------------------------------------------------------------------- /src/termdict/fst_termdict/mod.rs: --------------------------------------------------------------------------------
1 | //!
The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) with
2 | //! a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information
3 | //! about the term.
4 | //!
5 | //! Internally, the term dictionary relies on the `fst` crate to store
6 | //! a sorted mapping that associates each term with its rank in the lexicographical order.
7 | //! For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
8 | //! the `TermOrdinal`s are respectively `0`, `1`, `2`, and `3`.
9 | //!
10 | //! For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the
11 | //! lexicographical order matches the natural order of integers.
12 | //!
13 | //! `i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
14 | //! and then treated as a `u64`.
15 | //!
16 | //! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
17 | //! as `u64`.
18 | //!
19 | //! A second data structure makes it possible to access a
20 | //! [`TermInfo`](../postings/struct.TermInfo.html).
21 | mod merger;
22 | mod streamer;
23 | mod term_info_store;
24 | mod termdict;
25 |
26 | pub use self::merger::TermMerger;
27 | pub use self::streamer::{TermStreamer, TermStreamerBuilder};
28 | pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
29 |
-------------------------------------------------------------------------------- /src/termdict/mod.rs: --------------------------------------------------------------------------------
1 | //! The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) with
2 | //! a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information
3 | //! about the term.
4 | //!
5 | //! Internally, the term dictionary relies on the `fst` crate to store
6 | //! a sorted mapping that associates each term with its rank in the lexicographical order.
7 | //! For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan",
8 | //! the [TermOrdinal]s are respectively `0`, `1`, `2`, and `3`.
9 | //!
10 | //! For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the
11 | //! lexicographical order matches the natural order of integers.
12 | //!
13 | //! `i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
14 | //! and then treated as a `u64`.
15 | //!
16 | //! `f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
17 | //! as `u64`.
18 | //!
19 | //! A second data structure makes it possible to access a
20 | //! [`TermInfo`](../postings/struct.TermInfo.html).
21 |
22 | #[cfg(not(feature = "quickwit"))]
23 | mod fst_termdict;
24 | #[cfg(not(feature = "quickwit"))]
25 | use fst_termdict as termdict;
26 |
27 | #[cfg(feature = "quickwit")]
28 | mod sstable_termdict;
29 | #[cfg(feature = "quickwit")]
30 | use sstable_termdict as termdict;
31 | use tantivy_fst::automaton::AlwaysMatch;
32 |
33 | #[cfg(test)]
34 | mod tests;
35 |
36 | /// Position of the term in the sorted list of terms.
37 | pub type TermOrdinal = u64;
38 |
39 | /// The term dictionary contains all of the terms in
40 | /// `tantivy index` in a sorted manner.
41 | pub type TermDictionary = self::termdict::TermDictionary;
42 |
43 | /// Builder for the new term dictionary.
44 | ///
45 | /// Inserting must be done in the order of the `keys`.
46 | pub type TermDictionaryBuilder = self::termdict::TermDictionaryBuilder;
47 |
48 | /// Given a list of sorted term streams,
49 | /// returns an iterator over sorted unique terms.
50 | ///
51 | /// The item yielded is actually a pair with
52 | /// - the term
53 | /// - a slice with the ordinals of the segments containing
54 | /// the term.
55 | pub type TermMerger<'a> = self::termdict::TermMerger<'a>;
56 |
57 | /// `TermStreamer` acts as a cursor over a range of terms of a segment.
58 | /// Terms are guaranteed to be sorted.
59 | pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;
60 |
-------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/block_reader.rs: --------------------------------------------------------------------------------
1 | use std::io::{self, Read};
2 |
3 | use byteorder::{LittleEndian, ReadBytesExt};
4 |
5 | pub struct BlockReader<'a> {
6 | buffer: Vec<u8>,
7 | reader: Box<dyn io::Read + 'a>,
8 | offset: usize,
9 | }
10 |
11 | impl<'a> BlockReader<'a> {
12 | pub fn new(reader: Box<dyn io::Read + 'a>) -> BlockReader<'a> {
13 | BlockReader {
14 | buffer: Vec::new(),
15 | reader,
16 | offset: 0,
17 | }
18 | }
19 |
20 | pub fn deserialize_u64(&mut self) -> u64 {
21 | let (num_bytes, val) = super::vint::deserialize_read(self.buffer());
22 | self.advance(num_bytes);
23 | val
24 | }
25 |
26 | #[inline(always)]
27 | pub fn buffer_from_to(&self, start: usize, end: usize) -> &[u8] {
28 | &self.buffer[start..end]
29 | }
30 |
31 | pub fn read_block(&mut self) -> io::Result<bool> {
32 | self.offset = 0;
33 | let block_len_res = self.reader.read_u32::<LittleEndian>();
34 | if let Err(err) = &block_len_res {
35 | if err.kind() == io::ErrorKind::UnexpectedEof {
36 | return Ok(false);
37 | }
38 | }
39 | let block_len = block_len_res?;
40 | if block_len == 0u32 {
41 | self.buffer.clear();
42 | return Ok(false);
43 | }
44 | self.buffer.resize(block_len as usize, 0u8);
45 | self.reader.read_exact(&mut self.buffer[..])?;
46 | Ok(true)
47 | }
48 |
49 | pub fn offset(&self) -> usize {
50 | self.offset
51 | }
52 |
53 | pub fn advance(&mut self, num_bytes: usize) {
54 | self.offset += num_bytes;
55 | }
56 |
57 | pub fn buffer(&self) -> &[u8] {
58 | &self.buffer[self.offset..]
59 | } 60 | } 61 | 62 | impl<'a> io::Read for BlockReader<'a> { 63 | fn read(&mut self, buf: &mut [u8]) -> io::Result { 64 | let len = self.buffer().read(buf)?; 65 | self.advance(len); 66 | Ok(len) 67 | } 68 | 69 | fn read_to_end(&mut self, buf: &mut Vec) -> io::Result { 70 | let len = self.buffer.len(); 71 | buf.extend_from_slice(self.buffer()); 72 | self.advance(len); 73 | Ok(len) 74 | } 75 | 76 | fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> { 77 | self.buffer().read_exact(buf)?; 78 | self.advance(buf.len()); 79 | Ok(()) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/merge/heap_merge.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::binary_heap::PeekMut; 3 | use std::collections::BinaryHeap; 4 | use std::io; 5 | 6 | use super::{SingleValueMerger, ValueMerger}; 7 | use crate::termdict::sstable_termdict::sstable::{Reader, SSTable, Writer}; 8 | 9 | struct HeapItem>(B); 10 | 11 | impl> Ord for HeapItem { 12 | fn cmp(&self, other: &Self) -> Ordering { 13 | other.0.as_ref().cmp(self.0.as_ref()) 14 | } 15 | } 16 | impl> PartialOrd for HeapItem { 17 | fn partial_cmp(&self, other: &Self) -> Option { 18 | Some(other.0.as_ref().cmp(self.0.as_ref())) 19 | } 20 | } 21 | 22 | impl> Eq for HeapItem {} 23 | impl> PartialEq for HeapItem { 24 | fn eq(&self, other: &Self) -> bool { 25 | self.0.as_ref() == other.0.as_ref() 26 | } 27 | } 28 | 29 | #[allow(dead_code)] 30 | pub fn merge_sstable>( 31 | readers: Vec>, 32 | mut writer: Writer, 33 | mut merger: M, 34 | ) -> io::Result<()> { 35 | let mut heap: BinaryHeap>> = 36 | BinaryHeap::with_capacity(readers.len()); 37 | for mut reader in readers { 38 | if reader.advance()? { 39 | heap.push(HeapItem(reader)); 40 | } 41 | } 42 | loop { 43 | let len = heap.len(); 44 | let mut value_merger; 45 | if let Some(mut head) = heap.peek_mut() { 46 | writer.write_key(head.0.key()); 47 | value_merger = merger.new_value(head.0.value()); 48 | if !head.0.advance()? { 49 | PeekMut::pop(head); 50 | } 51 | } else { 52 | break; 53 | } 54 | for _ in 0..len - 1 { 55 | if let Some(mut head) = heap.peek_mut() { 56 | if head.0.key() == writer.current_key() { 57 | value_merger.add(head.0.value()); 58 | if !head.0.advance()? { 59 | PeekMut::pop(head); 60 | } 61 | continue; 62 | } 63 | } 64 | break; 65 | } 66 | let value = value_merger.finish(); 67 | writer.write_value(&value)?; 68 | writer.flush_block_if_required()?; 69 | } 70 | writer.finalize()?; 71 | Ok(()) 72 | } 73 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/sstable_index.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::ops::Range; 3 | 4 | use serde::{Deserialize, Serialize}; 5 | 6 | use crate::error::DataCorruption; 7 | 8 | #[derive(Default, Debug, Serialize, Deserialize)] 9 | pub struct SSTableIndex { 10 | blocks: Vec, 11 | } 12 | 13 | impl SSTableIndex { 14 | pub(crate) fn load(data: &[u8]) -> Result { 15 | serde_cbor::de::from_slice(data) 16 | .map_err(|_| DataCorruption::comment_only("SSTable index is corrupted")) 17 | } 18 | 19 | pub fn search(&self, key: &[u8]) -> Option { 20 | self.blocks 21 | .iter() 22 | .find(|block| &block.last_key[..] 
>= key) 23 | .map(|block| block.block_addr.clone()) 24 | } 25 | } 26 | 27 | #[derive(Clone, Eq, PartialEq, Debug, Serialize, Deserialize)] 28 | pub struct BlockAddr { 29 | pub byte_range: Range, 30 | pub first_ordinal: u64, 31 | } 32 | 33 | #[derive(Debug, Serialize, Deserialize)] 34 | struct BlockMeta { 35 | pub last_key: Vec, 36 | pub block_addr: BlockAddr, 37 | } 38 | 39 | #[derive(Default)] 40 | pub struct SSTableIndexBuilder { 41 | index: SSTableIndex, 42 | } 43 | 44 | impl SSTableIndexBuilder { 45 | pub fn add_block(&mut self, last_key: &[u8], byte_range: Range, first_ordinal: u64) { 46 | self.index.blocks.push(BlockMeta { 47 | last_key: last_key.to_vec(), 48 | block_addr: BlockAddr { 49 | byte_range, 50 | first_ordinal, 51 | }, 52 | }) 53 | } 54 | 55 | pub fn serialize(&self, wrt: &mut dyn io::Write) -> io::Result<()> { 56 | serde_cbor::ser::to_writer(wrt, &self.index).unwrap(); 57 | Ok(()) 58 | } 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use super::{BlockAddr, SSTableIndex, SSTableIndexBuilder}; 64 | 65 | #[test] 66 | fn test_sstable_index() { 67 | let mut sstable_builder = SSTableIndexBuilder::default(); 68 | sstable_builder.add_block(b"aaa", 10..20, 0u64); 69 | sstable_builder.add_block(b"bbbbbbb", 20..30, 564); 70 | sstable_builder.add_block(b"ccc", 30..40, 10u64); 71 | sstable_builder.add_block(b"dddd", 40..50, 15u64); 72 | let mut buffer: Vec = Vec::new(); 73 | sstable_builder.serialize(&mut buffer).unwrap(); 74 | let sstable_index = SSTableIndex::load(&buffer[..]).unwrap(); 75 | assert_eq!( 76 | sstable_index.search(b"bbbde"), 77 | Some(BlockAddr { 78 | first_ordinal: 10u64, 79 | byte_range: 30..40 80 | }) 81 | ); 82 | } 83 | 84 | #[test] 85 | fn test_sstable_with_corrupted_data() { 86 | let mut sstable_builder = SSTableIndexBuilder::default(); 87 | sstable_builder.add_block(b"aaa", 10..20, 0u64); 88 | sstable_builder.add_block(b"bbbbbbb", 20..30, 564); 89 | sstable_builder.add_block(b"ccc", 30..40, 10u64); 90 | sstable_builder.add_block(b"dddd", 40..50, 15u64); 91 | let mut buffer: Vec = Vec::new(); 92 | sstable_builder.serialize(&mut buffer).unwrap(); 93 | buffer[1] = 9u8; 94 | let data_corruption_err = SSTableIndex::load(&buffer[..]).err().unwrap(); 95 | assert_eq!( 96 | format!("{data_corruption_err:?}"), 97 | "Data corruption: SSTable index is corrupted." 
98 | ); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/value.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | 3 | use super::{vint, BlockReader}; 4 | 5 | pub trait ValueReader: Default { 6 | type Value; 7 | 8 | fn value(&self, idx: usize) -> &Self::Value; 9 | 10 | fn read(&mut self, reader: &mut BlockReader) -> io::Result<()>; 11 | } 12 | 13 | pub trait ValueWriter: Default { 14 | type Value; 15 | 16 | fn write(&mut self, val: &Self::Value); 17 | 18 | fn write_block(&mut self, writer: &mut Vec); 19 | } 20 | 21 | #[derive(Default)] 22 | pub struct VoidReader; 23 | 24 | impl ValueReader for VoidReader { 25 | type Value = (); 26 | 27 | fn value(&self, _idx: usize) -> &() { 28 | &() 29 | } 30 | 31 | fn read(&mut self, _reader: &mut BlockReader) -> io::Result<()> { 32 | Ok(()) 33 | } 34 | } 35 | 36 | #[derive(Default)] 37 | pub struct VoidWriter; 38 | 39 | impl ValueWriter for VoidWriter { 40 | type Value = (); 41 | 42 | fn write(&mut self, _val: &()) {} 43 | 44 | fn write_block(&mut self, _writer: &mut Vec) {} 45 | } 46 | 47 | #[derive(Default)] 48 | pub struct U64MonotonicWriter { 49 | vals: Vec, 50 | } 51 | 52 | impl ValueWriter for U64MonotonicWriter { 53 | type Value = u64; 54 | 55 | fn write(&mut self, val: &Self::Value) { 56 | self.vals.push(*val); 57 | } 58 | 59 | fn write_block(&mut self, writer: &mut Vec) { 60 | let mut prev_val = 0u64; 61 | vint::serialize_into_vec(self.vals.len() as u64, writer); 62 | for &val in &self.vals { 63 | let delta = val - prev_val; 64 | vint::serialize_into_vec(delta, writer); 65 | prev_val = val; 66 | } 67 | self.vals.clear(); 68 | } 69 | } 70 | 71 | #[derive(Default)] 72 | pub struct U64MonotonicReader { 73 | vals: Vec, 74 | } 75 | 76 | impl ValueReader for U64MonotonicReader { 77 | type Value = u64; 78 | 79 | fn value(&self, idx: usize) -> &Self::Value { 80 | &self.vals[idx] 81 | } 82 | 83 | fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> { 84 | let len = reader.deserialize_u64() as usize; 85 | self.vals.clear(); 86 | let mut prev_val = 0u64; 87 | for _ in 0..len { 88 | let delta = reader.deserialize_u64() as u64; 89 | let val = prev_val + delta; 90 | self.vals.push(val); 91 | prev_val = val; 92 | } 93 | Ok(()) 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/termdict/sstable_termdict/sstable/vint.rs: -------------------------------------------------------------------------------- 1 | const CONTINUE_BIT: u8 = 128u8; 2 | 3 | pub fn serialize(mut val: u64, buffer: &mut [u8]) -> usize { 4 | for (i, b) in buffer.iter_mut().enumerate() { 5 | let next_byte: u8 = (val & 127u64) as u8; 6 | val >>= 7; 7 | if val == 0u64 { 8 | *b = next_byte; 9 | return i + 1; 10 | } else { 11 | *b = next_byte | CONTINUE_BIT; 12 | } 13 | } 14 | 10 //< actually unreachable 15 | } 16 | 17 | pub fn serialize_into_vec(val: u64, buffer: &mut Vec) { 18 | let mut buf = [0u8; 10]; 19 | let num_bytes = serialize(val, &mut buf[..]); 20 | buffer.extend_from_slice(&buf[..num_bytes]); 21 | } 22 | 23 | // super slow but we don't care 24 | pub fn deserialize_read(buf: &[u8]) -> (usize, u64) { 25 | let mut result = 0u64; 26 | let mut shift = 0u64; 27 | let mut consumed = 0; 28 | 29 | for &b in buf { 30 | consumed += 1; 31 | result |= u64::from(b % 128u8) << shift; 32 | if b < CONTINUE_BIT { 33 | break; 34 | } 35 | shift += 7; 36 | } 37 | (consumed, result) 38 | } 39 | 40 | 
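// Editorial note (not part of the original source): a worked example of the
// 7-bit varint scheme implemented above. Each byte carries 7 payload bits in
// its low bits; CONTINUE_BIT (the high bit) marks that another byte follows.
//
//   serialize(300, ...) writes two bytes:
//     300 = 0b1_0010_1100
//     byte 0: low 7 bits 0b0101100 (0x2C) | CONTINUE_BIT -> 0xAC
//     byte 1: remaining bits 300 >> 7 = 0b10             -> 0x02
//   deserialize_read(&[0xAC, 0x02]) returns (2, 300).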
#[cfg(test)] 41 | mod tests { 42 | use std::u64; 43 | 44 | use super::{deserialize_read, serialize}; 45 | 46 | fn aux_test_int(val: u64, expect_len: usize) { 47 | let mut buffer = [0u8; 14]; 48 | assert_eq!(serialize(val, &mut buffer[..]), expect_len); 49 | assert_eq!(deserialize_read(&buffer), (expect_len, val)); 50 | } 51 | 52 | #[test] 53 | fn test_vint() { 54 | aux_test_int(0u64, 1); 55 | aux_test_int(17u64, 1); 56 | aux_test_int(127u64, 1); 57 | aux_test_int(128u64, 2); 58 | aux_test_int(123423418u64, 4); 59 | for i in 1..63 { 60 | let power_of_two = 1u64 << i; 61 | aux_test_int(power_of_two + 1, (i / 7) + 1); 62 | aux_test_int(power_of_two, (i / 7) + 1); 63 | aux_test_int(power_of_two - 1, ((i - 1) / 7) + 1); 64 | } 65 | aux_test_int(u64::MAX, 10); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/tokenizer/alphanum_only.rs: -------------------------------------------------------------------------------- 1 | //! # Example 2 | //! ```rust 3 | //! use tantivy::tokenizer::*; 4 | //! 5 | //! let tokenizer = TextAnalyzer::from(RawTokenizer) 6 | //! .filter(AlphaNumOnlyFilter); 7 | //! 8 | //! let mut stream = tokenizer.token_stream("hello there"); 9 | //! // is none because the raw filter emits one token that 10 | //! // contains a space 11 | //! assert!(stream.next().is_none()); 12 | //! 13 | //! let tokenizer = TextAnalyzer::from(SimpleTokenizer) 14 | //! .filter(AlphaNumOnlyFilter); 15 | //! 16 | //! let mut stream = tokenizer.token_stream("hello there 💣"); 17 | //! assert!(stream.next().is_some()); 18 | //! assert!(stream.next().is_some()); 19 | //! // the "emoji" is dropped because its not an alphanum 20 | //! assert!(stream.next().is_none()); 21 | //! ``` 22 | use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; 23 | 24 | /// `TokenFilter` that removes all tokens that contain non 25 | /// ascii alphanumeric characters. 26 | #[derive(Clone)] 27 | pub struct AlphaNumOnlyFilter; 28 | 29 | pub struct AlphaNumOnlyFilterStream<'a> { 30 | tail: BoxTokenStream<'a>, 31 | } 32 | 33 | impl<'a> AlphaNumOnlyFilterStream<'a> { 34 | fn predicate(&self, token: &Token) -> bool { 35 | token.text.chars().all(|c| c.is_ascii_alphanumeric()) 36 | } 37 | } 38 | 39 | impl TokenFilter for AlphaNumOnlyFilter { 40 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 41 | BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) 42 | } 43 | } 44 | 45 | impl<'a> TokenStream for AlphaNumOnlyFilterStream<'a> { 46 | fn advance(&mut self) -> bool { 47 | while self.tail.advance() { 48 | if self.predicate(self.tail.token()) { 49 | return true; 50 | } 51 | } 52 | 53 | false 54 | } 55 | 56 | fn token(&self) -> &Token { 57 | self.tail.token() 58 | } 59 | 60 | fn token_mut(&mut self) -> &mut Token { 61 | self.tail.token_mut() 62 | } 63 | } 64 | 65 | #[cfg(test)] 66 | mod tests { 67 | use crate::tokenizer::tests::assert_token; 68 | use crate::tokenizer::{AlphaNumOnlyFilter, SimpleTokenizer, TextAnalyzer, Token}; 69 | 70 | #[test] 71 | fn test_alphanum_only() { 72 | let tokens = token_stream_helper("I am a cat. 
我輩は猫である。(1906)"); 73 | assert_eq!(tokens.len(), 5); 74 | assert_token(&tokens[0], 0, "I", 0, 1); 75 | assert_token(&tokens[1], 1, "am", 2, 4); 76 | assert_token(&tokens[2], 2, "a", 5, 6); 77 | assert_token(&tokens[3], 3, "cat", 7, 10); 78 | assert_token(&tokens[4], 5, "1906", 37, 41); 79 | } 80 | 81 | fn token_stream_helper(text: &str) -> Vec { 82 | let a = TextAnalyzer::from(SimpleTokenizer).filter(AlphaNumOnlyFilter); 83 | let mut token_stream = a.token_stream(text); 84 | let mut tokens: Vec = vec![]; 85 | let mut add_token = |token: &Token| { 86 | tokens.push(token.clone()); 87 | }; 88 | token_stream.process(&mut add_token); 89 | tokens 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/tokenizer/empty_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use crate::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer}; 2 | 3 | #[derive(Clone)] 4 | pub(crate) struct EmptyTokenizer; 5 | 6 | impl Tokenizer for EmptyTokenizer { 7 | fn token_stream<'a>(&self, _text: &'a str) -> BoxTokenStream<'a> { 8 | EmptyTokenStream::default().into() 9 | } 10 | } 11 | 12 | #[derive(Default)] 13 | struct EmptyTokenStream { 14 | token: Token, 15 | } 16 | 17 | impl TokenStream for EmptyTokenStream { 18 | fn advance(&mut self) -> bool { 19 | false 20 | } 21 | 22 | fn token(&self) -> &super::Token { 23 | &self.token 24 | } 25 | 26 | fn token_mut(&mut self) -> &mut super::Token { 27 | &mut self.token 28 | } 29 | } 30 | 31 | #[cfg(test)] 32 | mod tests { 33 | use crate::tokenizer::Tokenizer; 34 | 35 | #[test] 36 | fn test_empty_tokenizer() { 37 | let tokenizer = super::EmptyTokenizer; 38 | let mut empty = tokenizer.token_stream("whatever string"); 39 | assert!(!empty.advance()); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/tokenizer/lower_caser.rs: -------------------------------------------------------------------------------- 1 | use std::mem; 2 | 3 | use super::{Token, TokenFilter, TokenStream}; 4 | use crate::tokenizer::BoxTokenStream; 5 | 6 | impl TokenFilter for LowerCaser { 7 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 8 | BoxTokenStream::from(LowerCaserTokenStream { 9 | tail: token_stream, 10 | buffer: String::with_capacity(100), 11 | }) 12 | } 13 | } 14 | 15 | /// Token filter that lowercase terms. 16 | #[derive(Clone)] 17 | pub struct LowerCaser; 18 | 19 | pub struct LowerCaserTokenStream<'a> { 20 | buffer: String, 21 | tail: BoxTokenStream<'a>, 22 | } 23 | 24 | // writes a lowercased version of text into output. 25 | fn to_lowercase_unicode(text: &str, output: &mut String) { 26 | output.clear(); 27 | for c in text.chars() { 28 | // Contrary to the std, we do not take care of sigma special case. 29 | // This will have an normalizationo effect, which is ok for search. 30 | output.extend(c.to_lowercase()); 31 | } 32 | } 33 | 34 | impl<'a> TokenStream for LowerCaserTokenStream<'a> { 35 | fn advance(&mut self) -> bool { 36 | if !self.tail.advance() { 37 | return false; 38 | } 39 | if self.token_mut().text.is_ascii() { 40 | // fast track for ascii. 
41 | self.token_mut().text.make_ascii_lowercase(); 42 | } else { 43 | to_lowercase_unicode(&self.tail.token().text, &mut self.buffer); 44 | mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); 45 | } 46 | true 47 | } 48 | 49 | fn token(&self) -> &Token { 50 | self.tail.token() 51 | } 52 | 53 | fn token_mut(&mut self) -> &mut Token { 54 | self.tail.token_mut() 55 | } 56 | } 57 | 58 | #[cfg(test)] 59 | mod tests { 60 | use crate::tokenizer::tests::assert_token; 61 | use crate::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, Token}; 62 | 63 | #[test] 64 | fn test_to_lower_case() { 65 | let tokens = token_stream_helper("Tree"); 66 | assert_eq!(tokens.len(), 1); 67 | assert_token(&tokens[0], 0, "tree", 0, 4); 68 | 69 | let tokens = token_stream_helper("Русский текст"); 70 | assert_eq!(tokens.len(), 2); 71 | assert_token(&tokens[0], 0, "русский", 0, 14); 72 | assert_token(&tokens[1], 1, "текст", 15, 25); 73 | } 74 | 75 | fn token_stream_helper(text: &str) -> Vec { 76 | let mut token_stream = TextAnalyzer::from(SimpleTokenizer) 77 | .filter(LowerCaser) 78 | .token_stream(text); 79 | let mut tokens = vec![]; 80 | let mut add_token = |token: &Token| { 81 | tokens.push(token.clone()); 82 | }; 83 | token_stream.process(&mut add_token); 84 | tokens 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/tokenizer/raw_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use super::{Token, TokenStream, Tokenizer}; 2 | use crate::tokenizer::BoxTokenStream; 3 | 4 | /// For each value of the field, emit a single unprocessed token. 5 | #[derive(Clone)] 6 | pub struct RawTokenizer; 7 | 8 | pub struct RawTokenStream { 9 | token: Token, 10 | has_token: bool, 11 | } 12 | 13 | impl Tokenizer for RawTokenizer { 14 | fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { 15 | let token = Token { 16 | offset_from: 0, 17 | offset_to: text.len(), 18 | position: 0, 19 | text: text.to_string(), 20 | position_length: 1, 21 | }; 22 | RawTokenStream { 23 | token, 24 | has_token: true, 25 | } 26 | .into() 27 | } 28 | } 29 | 30 | impl TokenStream for RawTokenStream { 31 | fn advance(&mut self) -> bool { 32 | let result = self.has_token; 33 | self.has_token = false; 34 | result 35 | } 36 | 37 | fn token(&self) -> &Token { 38 | &self.token 39 | } 40 | 41 | fn token_mut(&mut self) -> &mut Token { 42 | &mut self.token 43 | } 44 | } 45 | 46 | #[cfg(test)] 47 | mod tests { 48 | use crate::tokenizer::tests::assert_token; 49 | use crate::tokenizer::{RawTokenizer, TextAnalyzer, Token}; 50 | 51 | #[test] 52 | fn test_raw_tokenizer() { 53 | let tokens = token_stream_helper("Hello, happy tax payer!"); 54 | assert_eq!(tokens.len(), 1); 55 | assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23); 56 | } 57 | 58 | fn token_stream_helper(text: &str) -> Vec { 59 | let a = TextAnalyzer::from(RawTokenizer); 60 | let mut token_stream = a.token_stream(text); 61 | let mut tokens: Vec = vec![]; 62 | let mut add_token = |token: &Token| { 63 | tokens.push(token.clone()); 64 | }; 65 | token_stream.process(&mut add_token); 66 | tokens 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/tokenizer/remove_long.rs: -------------------------------------------------------------------------------- 1 | //! # Example 2 | //! ```rust 3 | //! use tantivy::tokenizer::*; 4 | //! 5 | //! let tokenizer = TextAnalyzer::from(SimpleTokenizer) 6 | //! 
.filter(RemoveLongFilter::limit(5)); 7 | //! 8 | //! let mut stream = tokenizer.token_stream("toolong nice"); 9 | //! // because `toolong` is more than 5 characters, it is filtered 10 | //! // out of the token stream. 11 | //! assert_eq!(stream.next().unwrap().text, "nice"); 12 | //! assert!(stream.next().is_none()); 13 | //! ``` 14 | use super::{Token, TokenFilter, TokenStream}; 15 | use crate::tokenizer::BoxTokenStream; 16 | 17 | /// `RemoveLongFilter` removes tokens that are longer 18 | /// than a given number of bytes (in UTF-8 representation). 19 | /// 20 | /// It is especially useful when indexing unconstrained content. 21 | /// e.g. Mail containing base-64 encoded pictures etc. 22 | #[derive(Clone)] 23 | pub struct RemoveLongFilter { 24 | length_limit: usize, 25 | } 26 | 27 | impl RemoveLongFilter { 28 | /// Creates a `RemoveLongFilter` given a limit in bytes of the UTF-8 representation. 29 | pub fn limit(length_limit: usize) -> RemoveLongFilter { 30 | RemoveLongFilter { length_limit } 31 | } 32 | } 33 | 34 | impl<'a> RemoveLongFilterStream<'a> { 35 | fn predicate(&self, token: &Token) -> bool { 36 | token.text.len() < self.token_length_limit 37 | } 38 | } 39 | 40 | impl TokenFilter for RemoveLongFilter { 41 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 42 | BoxTokenStream::from(RemoveLongFilterStream { 43 | token_length_limit: self.length_limit, 44 | tail: token_stream, 45 | }) 46 | } 47 | } 48 | 49 | pub struct RemoveLongFilterStream<'a> { 50 | token_length_limit: usize, 51 | tail: BoxTokenStream<'a>, 52 | } 53 | 54 | impl<'a> TokenStream for RemoveLongFilterStream<'a> { 55 | fn advance(&mut self) -> bool { 56 | while self.tail.advance() { 57 | if self.predicate(self.tail.token()) { 58 | return true; 59 | } 60 | } 61 | false 62 | } 63 | 64 | fn token(&self) -> &Token { 65 | self.tail.token() 66 | } 67 | 68 | fn token_mut(&mut self) -> &mut Token { 69 | self.tail.token_mut() 70 | } 71 | } 72 | 73 | #[cfg(test)] 74 | mod tests { 75 | use crate::tokenizer::tests::assert_token; 76 | use crate::tokenizer::{RemoveLongFilter, SimpleTokenizer, TextAnalyzer, Token}; 77 | 78 | #[test] 79 | fn test_remove_long() { 80 | let tokens = token_stream_helper("hello tantivy, happy searching!"); 81 | assert_eq!(tokens.len(), 2); 82 | assert_token(&tokens[0], 0, "hello", 0, 5); 83 | assert_token(&tokens[1], 2, "happy", 15, 20); 84 | } 85 | 86 | fn token_stream_helper(text: &str) -> Vec { 87 | let a = TextAnalyzer::from(SimpleTokenizer).filter(RemoveLongFilter::limit(6)); 88 | let mut token_stream = a.token_stream(text); 89 | let mut tokens: Vec = vec![]; 90 | let mut add_token = |token: &Token| { 91 | tokens.push(token.clone()); 92 | }; 93 | token_stream.process(&mut add_token); 94 | tokens 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/tokenizer/simple_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use std::str::CharIndices; 2 | 3 | use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; 4 | 5 | /// Tokenize the text by splitting on whitespaces and punctuation. 
6 | #[derive(Clone)] 7 | pub struct SimpleTokenizer; 8 | 9 | pub struct SimpleTokenStream<'a> { 10 | text: &'a str, 11 | chars: CharIndices<'a>, 12 | token: Token, 13 | } 14 | 15 | impl Tokenizer for SimpleTokenizer { 16 | fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { 17 | BoxTokenStream::from(SimpleTokenStream { 18 | text, 19 | chars: text.char_indices(), 20 | token: Token::default(), 21 | }) 22 | } 23 | } 24 | 25 | impl<'a> SimpleTokenStream<'a> { 26 | // search for the end of the current token. 27 | fn search_token_end(&mut self) -> usize { 28 | (&mut self.chars) 29 | .filter(|&(_, ref c)| !c.is_alphanumeric()) 30 | .map(|(offset, _)| offset) 31 | .next() 32 | .unwrap_or(self.text.len()) 33 | } 34 | } 35 | 36 | impl<'a> TokenStream for SimpleTokenStream<'a> { 37 | fn advance(&mut self) -> bool { 38 | self.token.text.clear(); 39 | self.token.position = self.token.position.wrapping_add(1); 40 | while let Some((offset_from, c)) = self.chars.next() { 41 | if c.is_alphanumeric() { 42 | let offset_to = self.search_token_end(); 43 | self.token.offset_from = offset_from; 44 | self.token.offset_to = offset_to; 45 | self.token.text.push_str(&self.text[offset_from..offset_to]); 46 | return true; 47 | } 48 | } 49 | false 50 | } 51 | 52 | fn token(&self) -> &Token { 53 | &self.token 54 | } 55 | 56 | fn token_mut(&mut self) -> &mut Token { 57 | &mut self.token 58 | } 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use crate::tokenizer::tests::assert_token; 64 | use crate::tokenizer::{SimpleTokenizer, TextAnalyzer, Token}; 65 | 66 | #[test] 67 | fn test_simple_tokenizer() { 68 | let tokens = token_stream_helper("Hello, happy tax payer!"); 69 | assert_eq!(tokens.len(), 4); 70 | assert_token(&tokens[0], 0, "Hello", 0, 5); 71 | assert_token(&tokens[1], 1, "happy", 7, 12); 72 | assert_token(&tokens[2], 2, "tax", 13, 16); 73 | assert_token(&tokens[3], 3, "payer", 17, 22); 74 | } 75 | 76 | fn token_stream_helper(text: &str) -> Vec { 77 | let a = TextAnalyzer::from(SimpleTokenizer); 78 | let mut token_stream = a.token_stream(text); 79 | let mut tokens: Vec = vec![]; 80 | let mut add_token = |token: &Token| { 81 | tokens.push(token.clone()); 82 | }; 83 | token_stream.process(&mut add_token); 84 | tokens 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/tokenizer/stemmer.rs: -------------------------------------------------------------------------------- 1 | use rust_stemmers::{self, Algorithm}; 2 | use serde::{Deserialize, Serialize}; 3 | 4 | use super::{Token, TokenFilter, TokenStream}; 5 | use crate::tokenizer::BoxTokenStream; 6 | 7 | /// Available stemmer languages. 
8 | #[derive(Debug, Serialize, Deserialize, Eq, PartialEq, Copy, Clone)] 9 | #[allow(missing_docs)] 10 | pub enum Language { 11 | Arabic, 12 | Danish, 13 | Dutch, 14 | English, 15 | Finnish, 16 | French, 17 | German, 18 | Greek, 19 | Hungarian, 20 | Italian, 21 | Norwegian, 22 | Portuguese, 23 | Romanian, 24 | Russian, 25 | Spanish, 26 | Swedish, 27 | Tamil, 28 | Turkish, 29 | } 30 | 31 | impl Language { 32 | fn algorithm(self) -> Algorithm { 33 | use self::Language::*; 34 | match self { 35 | Arabic => Algorithm::Arabic, 36 | Danish => Algorithm::Danish, 37 | Dutch => Algorithm::Dutch, 38 | English => Algorithm::English, 39 | Finnish => Algorithm::Finnish, 40 | French => Algorithm::French, 41 | German => Algorithm::German, 42 | Greek => Algorithm::Greek, 43 | Hungarian => Algorithm::Hungarian, 44 | Italian => Algorithm::Italian, 45 | Norwegian => Algorithm::Norwegian, 46 | Portuguese => Algorithm::Portuguese, 47 | Romanian => Algorithm::Romanian, 48 | Russian => Algorithm::Russian, 49 | Spanish => Algorithm::Spanish, 50 | Swedish => Algorithm::Swedish, 51 | Tamil => Algorithm::Tamil, 52 | Turkish => Algorithm::Turkish, 53 | } 54 | } 55 | } 56 | 57 | /// `Stemmer` token filter. Several languages are supported, see `Language` for the available 58 | /// languages. 59 | /// Tokens are expected to be lowercased beforehand. 60 | #[derive(Clone)] 61 | pub struct Stemmer { 62 | stemmer_algorithm: Algorithm, 63 | } 64 | 65 | impl Stemmer { 66 | /// Creates a new Stemmer `TokenFilter` for a given language algorithm. 67 | pub fn new(language: Language) -> Stemmer { 68 | Stemmer { 69 | stemmer_algorithm: language.algorithm(), 70 | } 71 | } 72 | } 73 | 74 | impl Default for Stemmer { 75 | /// Creates a new Stemmer `TokenFilter` for English. 76 | fn default() -> Self { 77 | Stemmer::new(Language::English) 78 | } 79 | } 80 | 81 | impl TokenFilter for Stemmer { 82 | fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { 83 | let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); 84 | BoxTokenStream::from(StemmerTokenStream { 85 | tail: token_stream, 86 | stemmer: inner_stemmer, 87 | }) 88 | } 89 | } 90 | 91 | pub struct StemmerTokenStream<'a> { 92 | tail: BoxTokenStream<'a>, 93 | stemmer: rust_stemmers::Stemmer, 94 | } 95 | 96 | impl<'a> TokenStream for StemmerTokenStream<'a> { 97 | fn advance(&mut self) -> bool { 98 | if !self.tail.advance() { 99 | return false; 100 | } 101 | // TODO remove allocation 102 | let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned(); 103 | self.token_mut().text.clear(); 104 | self.token_mut().text.push_str(&stemmed_str); 105 | true 106 | } 107 | 108 | fn token(&self) -> &Token { 109 | self.tail.token() 110 | } 111 | 112 | fn token_mut(&mut self) -> &mut Token { 113 | self.tail.token_mut() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/tokenizer/tokenized_string.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | 3 | use serde::{Deserialize, Serialize}; 4 | 5 | use crate::tokenizer::{Token, TokenStream}; 6 | 7 | /// Struct representing pre-tokenized text 8 | #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] 9 | pub struct PreTokenizedString { 10 | /// Original text 11 | pub text: String, 12 | /// Tokens derived from the text 13 | pub tokens: Vec, 14 | } 15 | 16 | impl Ord for PreTokenizedString { 17 | fn cmp(&self, other: &Self) -> Ordering { 18 | 
self.text.cmp(&other.text) 19 | } 20 | } 21 | 22 | impl PartialOrd for PreTokenizedString { 23 | fn partial_cmp(&self, other: &Self) -> Option { 24 | Some(self.cmp(other)) 25 | } 26 | } 27 | 28 | /// TokenStream implementation which wraps PreTokenizedString 29 | pub struct PreTokenizedStream { 30 | tokenized_string: PreTokenizedString, 31 | current_token: i64, 32 | } 33 | 34 | impl From for PreTokenizedStream { 35 | fn from(s: PreTokenizedString) -> PreTokenizedStream { 36 | PreTokenizedStream { 37 | tokenized_string: s, 38 | current_token: -1, 39 | } 40 | } 41 | } 42 | 43 | impl TokenStream for PreTokenizedStream { 44 | fn advance(&mut self) -> bool { 45 | self.current_token += 1; 46 | self.current_token < self.tokenized_string.tokens.len() as i64 47 | } 48 | 49 | fn token(&self) -> &Token { 50 | assert!( 51 | self.current_token >= 0, 52 | "TokenStream not initialized. You should call advance() at least once." 53 | ); 54 | &self.tokenized_string.tokens[self.current_token as usize] 55 | } 56 | 57 | fn token_mut(&mut self) -> &mut Token { 58 | assert!( 59 | self.current_token >= 0, 60 | "TokenStream not initialized. You should call advance() at least once." 61 | ); 62 | &mut self.tokenized_string.tokens[self.current_token as usize] 63 | } 64 | } 65 | 66 | #[cfg(test)] 67 | mod tests { 68 | 69 | use super::*; 70 | use crate::tokenizer::Token; 71 | 72 | #[test] 73 | fn test_tokenized_stream() { 74 | let tok_text = PreTokenizedString { 75 | text: String::from("A a"), 76 | tokens: vec![ 77 | Token { 78 | offset_from: 0, 79 | offset_to: 1, 80 | position: 0, 81 | text: String::from("A"), 82 | position_length: 1, 83 | }, 84 | Token { 85 | offset_from: 2, 86 | offset_to: 3, 87 | position: 1, 88 | text: String::from("a"), 89 | position_length: 1, 90 | }, 91 | ], 92 | }; 93 | 94 | let mut token_stream = PreTokenizedStream::from(tok_text.clone()); 95 | 96 | for expected_token in tok_text.tokens { 97 | assert!(token_stream.advance()); 98 | assert_eq!(token_stream.token(), &expected_token); 99 | } 100 | assert!(!token_stream.advance()); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/tokenizer/tokenizer_manager.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::sync::{Arc, RwLock}; 3 | 4 | use crate::tokenizer::stemmer::Language; 5 | use crate::tokenizer::tokenizer::TextAnalyzer; 6 | use crate::tokenizer::{ 7 | LowerCaser, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer, WhitespaceTokenizer, 8 | }; 9 | 10 | /// The tokenizer manager serves as a store for 11 | /// all of the pre-configured tokenizer pipelines. 12 | /// 13 | /// By default, it is populated with the following managers. 14 | /// 15 | /// * `raw` : does not process nor tokenize the text. 16 | /// * `default` : Chops the text on according to whitespace and 17 | /// punctuation, removes tokens that are too long, and lowercases 18 | /// tokens 19 | /// * `en_stem` : Like `default`, but also applies stemming on the 20 | /// resulting tokens. Stemming can improve the recall of your 21 | /// search engine. 22 | #[derive(Clone)] 23 | pub struct TokenizerManager { 24 | tokenizers: Arc>>, 25 | } 26 | 27 | impl TokenizerManager { 28 | /// Registers a new tokenizer associated with a given name. 
29 | pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
30 | where TextAnalyzer: From<T> {
31 | let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
32 | self.tokenizers
33 | .write()
34 | .expect("Acquiring the lock should never fail")
35 | .insert(tokenizer_name.to_string(), boxed_tokenizer);
36 | }
37 |
38 | /// Accesses a tokenizer given its name.
39 | pub fn get(&self, tokenizer_name: &str) -> Option<TextAnalyzer> {
40 | self.tokenizers
41 | .read()
42 | .expect("Acquiring the lock should never fail")
43 | .get(tokenizer_name)
44 | .cloned()
45 | }
46 | }
47 |
48 | impl Default for TokenizerManager {
49 | /// Creates a `TokenizerManager` prepopulated with
50 | /// the default pre-configured tokenizers of `tantivy`.
51 | /// - simple
52 | /// - en_stem
53 | /// - ja
54 | fn default() -> TokenizerManager {
55 | let manager = TokenizerManager {
56 | tokenizers: Arc::new(RwLock::new(HashMap::new())),
57 | };
58 | manager.register("raw", RawTokenizer);
59 | manager.register(
60 | "default",
61 | TextAnalyzer::from(SimpleTokenizer)
62 | .filter(RemoveLongFilter::limit(40))
63 | .filter(LowerCaser),
64 | );
65 | manager.register(
66 | "en_stem",
67 | TextAnalyzer::from(SimpleTokenizer)
68 | .filter(RemoveLongFilter::limit(40))
69 | .filter(LowerCaser)
70 | .filter(Stemmer::new(Language::English)),
71 | );
72 | manager.register("whitespace", WhitespaceTokenizer);
73 | manager
74 | }
75 | }
76 |
-------------------------------------------------------------------------------- /src/tokenizer/whitespace_tokenizer.rs: --------------------------------------------------------------------------------
1 | use std::str::CharIndices;
2 |
3 | use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
4 |
5 | /// Tokenize the text by splitting on whitespaces.
6 | #[derive(Clone)]
7 | pub struct WhitespaceTokenizer;
8 |
9 | pub struct WhitespaceTokenStream<'a> {
10 | text: &'a str,
11 | chars: CharIndices<'a>,
12 | token: Token,
13 | }
14 |
15 | impl Tokenizer for WhitespaceTokenizer {
16 | fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
17 | BoxTokenStream::from(WhitespaceTokenStream {
18 | text,
19 | chars: text.char_indices(),
20 | token: Token::default(),
21 | })
22 | }
23 | }
24 |
25 | impl<'a> WhitespaceTokenStream<'a> {
26 | // search for the end of the current token.
27 | fn search_token_end(&mut self) -> usize { 28 | (&mut self.chars) 29 | .filter(|&(_, ref c)| c.is_ascii_whitespace()) 30 | .map(|(offset, _)| offset) 31 | .next() 32 | .unwrap_or(self.text.len()) 33 | } 34 | } 35 | 36 | impl<'a> TokenStream for WhitespaceTokenStream<'a> { 37 | fn advance(&mut self) -> bool { 38 | self.token.text.clear(); 39 | self.token.position = self.token.position.wrapping_add(1); 40 | while let Some((offset_from, c)) = self.chars.next() { 41 | if !c.is_ascii_whitespace() { 42 | let offset_to = self.search_token_end(); 43 | self.token.offset_from = offset_from; 44 | self.token.offset_to = offset_to; 45 | self.token.text.push_str(&self.text[offset_from..offset_to]); 46 | return true; 47 | } 48 | } 49 | false 50 | } 51 | 52 | fn token(&self) -> &Token { 53 | &self.token 54 | } 55 | 56 | fn token_mut(&mut self) -> &mut Token { 57 | &mut self.token 58 | } 59 | } 60 | 61 | #[cfg(test)] 62 | mod tests { 63 | use crate::tokenizer::tests::assert_token; 64 | use crate::tokenizer::{TextAnalyzer, Token, WhitespaceTokenizer}; 65 | 66 | #[test] 67 | fn test_whitespace_tokenizer() { 68 | let tokens = token_stream_helper("Hello, happy tax payer!"); 69 | assert_eq!(tokens.len(), 4); 70 | assert_token(&tokens[0], 0, "Hello,", 0, 6); 71 | assert_token(&tokens[1], 1, "happy", 7, 12); 72 | assert_token(&tokens[2], 2, "tax", 13, 16); 73 | assert_token(&tokens[3], 3, "payer!", 17, 23); 74 | } 75 | 76 | fn token_stream_helper(text: &str) -> Vec { 77 | let a = TextAnalyzer::from(WhitespaceTokenizer); 78 | let mut token_stream = a.token_stream(text); 79 | let mut tokens: Vec = vec![]; 80 | let mut add_token = |token: &Token| { 81 | tokens.push(token.clone()); 82 | }; 83 | token_stream.process(&mut add_token); 84 | tokens 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /tests/mod.rs: -------------------------------------------------------------------------------- 1 | mod failpoints; 2 | --------------------------------------------------------------------------------
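Note (editorial sketch, not part of the repository): the tokenizers and filters shown in this section compose into the pipelines that `TokenizerManager::default()` registers above. A minimal sketch of the `en_stem` chain, assuming the usual `tantivy::tokenizer` re-exports:

use tantivy::tokenizer::*;

// Same chain as the "en_stem" entry in TokenizerManager::default():
// split on whitespace/punctuation, drop very long tokens, lowercase, then stem.
let analyzer = TextAnalyzer::from(SimpleTokenizer)
    .filter(RemoveLongFilter::limit(40))
    .filter(LowerCaser)
    .filter(Stemmer::new(Language::English));

let mut stream = analyzer.token_stream("Searching SEARCHED searches");
let mut texts: Vec<String> = Vec::new();
while stream.advance() {
    texts.push(stream.token().text.clone());
}
// Lowercased and stemmed, all three variants should collapse to "search".
assert_eq!(texts, vec!["search", "search", "search"]);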