├── .cargo
│   └── config.toml
├── .github
│   ├── dependabot.yaml
│   └── workflows
│       ├── ci.yml
│       ├── codeql.yml
│       ├── publish.yaml
│       └── scorecards.yml
├── .gitignore
├── .readthedocs.yaml
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── MANIFEST.in
├── README.md
├── build.rs
├── ci
│   ├── deploy.sh
│   └── deploy_mac.sh
├── docs
│   ├── about.md
│   ├── explanation.md
│   ├── howto.md
│   ├── index.md
│   ├── reference.md
│   ├── requirements.txt
│   └── tutorials.md
├── mkdocs.yml
├── noxfile.py
├── pyproject.toml
├── requirements-dev.txt
├── rust-toolchain.toml
├── rustfmt.toml
├── src
│   ├── document.rs
│   ├── facet.rs
│   ├── index.rs
│   ├── lib.rs
│   ├── parser_error.rs
│   ├── query.rs
│   ├── schema.rs
│   ├── schemabuilder.rs
│   ├── searcher.rs
│   ├── snippet.rs
│   └── tokenizer.rs
├── tantivy
│   ├── __init__.py
│   ├── py.typed
│   └── tantivy.pyi
└── tests
    ├── conftest.py
    ├── tantivy_test.py
    ├── test_docs.py
    ├── test_escapes.py
    └── test_json_bug.py

/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 |     "-C", "link-arg=-undefined",
4 |     "-C", "link-arg=dynamic_lookup",
5 | ]
--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 | 
8 |   - package-ecosystem: cargo
9 |     directory: "/"
10 |     schedule:
11 |       interval: "weekly"
12 | 
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - master
7 |   pull_request:
8 |     branches:
9 |       - master
10 | 
11 | concurrency:
12 |   group: ${{ github.ref }}
13 |   cancel-in-progress: true
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   Lint:
20 |     runs-on: ubuntu-latest
21 |     steps:
22 |       - name: Harden Runner
23 |         uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0
24 |         with:
25 |           disable-sudo: true
26 |           egress-policy: block
27 |           allowed-endpoints: >
28 |             github.com:443
29 |             static.rust-lang.org:443
30 | 
31 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
32 |         with:
33 |           fetch-depth: 0
34 | 
35 |       - name: Rust toolchain
36 |         uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
37 |         with:
38 |           toolchain: "stable"
39 |           components: rustfmt
40 | 
41 |       - name: Check Formatting
42 |         run: cargo fmt --check
43 | 
44 |   Test:
45 |     env:
46 |       UNSAFE_PYO3_SKIP_VERSION_CHECK: ${{ matrix.unsafe-pyo3-skip-version-check }}
47 |     strategy:
48 |       matrix:
49 |         os: [ubuntu-latest, macos-latest, windows-latest]
50 |         python-version: ["3.12"]
51 |         allow-prereleases: [false]
52 |         include:
53 |           - os: ubuntu-latest
54 |             python-version: "3.13"
55 |             allow-prereleases: false
56 |           - os: ubuntu-latest
57 |             python-version: "3.12"
58 |             allow-prereleases: false
59 |           - os: ubuntu-latest
60 |             python-version: "3.11"
61 |             allow-prereleases: false
62 |           - os: ubuntu-latest
63 |             python-version: "3.10"
64 |             allow-prereleases: false
65 |           - os: ubuntu-latest
66 |             python-version: "3.9"
67 |             allow-prereleases: false
68 |     runs-on: "${{ matrix.os }}"
69 |     steps:
70 |       - name: Harden Runner
71 |         uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0
72 |         with:
73 |           disable-sudo: true
74 |           egress-policy: block
75 |           allowed-endpoints: >
76 |             api.github.com:443
77 |             crates.io:443
78 | files.pythonhosted.org:443 79 | github.com:443 80 | pypi.org:443 81 | static.crates.io:443 82 | index.crates.io:443 83 | static.rust-lang.org:443 84 | objects.githubusercontent.com:443 85 | 86 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 87 | with: 88 | fetch-depth: 0 89 | 90 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # 5.6.0 91 | with: 92 | python-version: ${{ matrix.python-version }} 93 | allow-prereleases: ${{ matrix.allow-prereleases }} 94 | 95 | - uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 96 | with: 97 | toolchain: "stable" 98 | 99 | #- uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # 2.7.3 100 | - run: python -m pip install nox 101 | - run: nox -s test-${{ matrix.python-version }} 102 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: ["master"] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: ["master"] 9 | schedule: 10 | - cron: "0 0 * * 1" 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | analyze: 17 | name: Analyze 18 | runs-on: ubuntu-latest 19 | permissions: 20 | actions: read 21 | contents: read 22 | security-events: write 23 | 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | language: ["python"] 28 | # CodeQL supports [ $supported-codeql-languages ] 29 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 30 | 31 | steps: 32 | - name: Harden Runner 33 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.2.1 34 | with: 35 | disable-sudo: true 36 | egress-policy: block 37 | allowed-endpoints: > 38 | api.github.com:443 39 | files.pythonhosted.org:443 40 | objects.githubusercontent.com:443 41 | github.com:443 42 | pypi.org:443 43 | uploads.github.com:443 44 | 45 | - name: Checkout repository 46 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 47 | 48 | # Initializes the CodeQL tools for scanning. 49 | - name: Initialize CodeQL 50 | uses: github/codeql-action/init@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v2.2.5 51 | with: 52 | languages: ${{ matrix.language }} 53 | # If you wish to specify custom queries, you can do so here or in a config file. 54 | # By default, queries listed here will override any specified in a config file. 55 | # Prefix the list here with "+" to use these queries and those in the config file. 56 | 57 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 58 | # queries: security-extended,security-and-quality 59 | 60 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 61 | # If this step fails, then you should remove it and run the build manually (see below) 62 | - name: Autobuild 63 | uses: github/codeql-action/autobuild@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v2.2.5 64 | 65 | # ℹ️ Command-line programs to run using the OS shell. 66 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 67 | 68 | # If the Autobuild fails above, remove it and uncomment the following three lines. 
69 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 70 | 71 | # - run: | 72 | # echo "Run, Build Application using script" 73 | # ./location_of_script_within_repo/buildscript.sh 74 | 75 | - name: Perform CodeQL Analysis 76 | uses: github/codeql-action/analyze@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v2.2.5 77 | with: 78 | category: "/language:${{matrix.language}}" -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Test & Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | # pull_request: 8 | # branches: 9 | # - master 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | linux: 16 | runs-on: ubuntu-latest 17 | permissions: 18 | id-token: write # ability to mint the OIDC token permission is necessary to persist the attestation 19 | contents: read 20 | attestations: write # persist the attestation 21 | strategy: 22 | matrix: 23 | platform: [ 'x86_64-unknown-linux-gnu', 'aarch64-unknown-linux-gnu' ] 24 | steps: 25 | - name: Harden Runner 26 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 27 | with: 28 | egress-policy: block 29 | allowed-endpoints: > 30 | api.github.com:443 31 | astral.sh:443 32 | cdn.quay.io:443 33 | cdn01.quay.io:443 34 | cdn02.quay.io:443 35 | cdn03.quay.io:443 36 | crates.io:443 37 | files.pythonhosted.org:443 38 | ghcr.io:443 39 | github.com:443 40 | index.crates.io:443 41 | objects.githubusercontent.com:443 42 | pkg-containers.githubusercontent.com:443 43 | pypi.org:443 44 | quay.io:443 45 | sh.rustup.rs:443 46 | static.crates.io:443 47 | static.rust-lang.org:443 48 | uploads.github.com:443 49 | 50 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 51 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 52 | with: 53 | python-version: 3.9 54 | architecture: x64 55 | 56 | - uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 57 | with: 58 | manylinux: auto 59 | target: ${{ matrix.platform }} 60 | command: build 61 | args: --release --sdist -o dist -i 3.9 3.10 3.11 3.12 3.13 62 | 63 | - name: Upload wheels 64 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 65 | with: 66 | name: wheels-linux-${{ matrix.platform }} 67 | path: dist 68 | 69 | windows: 70 | runs-on: windows-latest 71 | permissions: 72 | id-token: write # ability to mint the OIDC token permission is necessary to persist the attestation 73 | contents: read 74 | attestations: write # persist the attestation 75 | strategy: 76 | matrix: 77 | target: [x64] 78 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 79 | steps: 80 | - name: Harden Runner 81 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 82 | with: 83 | egress-policy: audit 84 | 85 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 86 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 87 | with: 88 | python-version: ${{ matrix.python-version }} 89 | 90 | - uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 91 | env: 92 | PYO3_PYTHON: python${{ matrix.python-version }} 93 | with: 94 | command: build 95 | args: --release -o dist 96 | 97 | - name: Upload wheels 98 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 99 | with: 100 | name: wheels-windows-${{ matrix.python-version }}-${{ 
matrix.target }} 101 | path: dist 102 | 103 | macos: 104 | runs-on: macos-latest 105 | permissions: 106 | id-token: write # ability to mint the OIDC token permission is necessary to persist the attestation 107 | contents: read 108 | attestations: write # persist the attestation 109 | strategy: 110 | matrix: 111 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 112 | target: ['universal2', 'x86_64-apple-darwin'] 113 | steps: 114 | - name: Harden Runner 115 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 116 | with: 117 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 118 | 119 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 120 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 121 | with: 122 | python-version: ${{ matrix.python-version }} 123 | 124 | - name: Build wheels - ${{ matrix.target }} 125 | uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 126 | env: 127 | PYO3_PYTHON: python${{ matrix.python-version }} 128 | with: 129 | target: ${{ matrix.target }} 130 | command: build 131 | args: --release -o dist 132 | 133 | - name: Upload wheels 134 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 135 | with: 136 | name: wheels-macos-${{ matrix.python-version }}-${{ matrix.target }} 137 | path: dist 138 | 139 | python-release-github: 140 | runs-on: ubuntu-latest 141 | needs: [ macos, windows, linux ] 142 | permissions: 143 | contents: write # To add assets to a release. 144 | checks: write 145 | packages: write 146 | steps: 147 | - name: Harden Runner 148 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.1.0 149 | with: 150 | disable-sudo: true 151 | egress-policy: block 152 | allowed-endpoints: > 153 | api.github.com:443 154 | github.com:443 155 | uploads.github.com:443 156 | static.rust-lang.org:443 157 | 158 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 159 | with: 160 | ref: ${{ github.head_ref }} 161 | 162 | - uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 163 | with: 164 | toolchain: "stable" 165 | 166 | - name: Set up Python 3.9 167 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 168 | with: 169 | python-version: 3.9 170 | 171 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 172 | with: 173 | path: wheels 174 | pattern: wheels-* 175 | merge-multiple: true 176 | 177 | - name: Upload release binaries 178 | uses: alexellis/upload-assets@13926a61cdb2cb35f5fdef1c06b8b591523236d3 # 0.4.1 179 | env: 180 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 181 | with: 182 | asset_paths: '["./wheels/tantivy-*"]' 183 | 184 | release-pypy: 185 | name: Release 186 | runs-on: ubuntu-latest 187 | needs: [ macos, windows, linux ] 188 | permissions: 189 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 190 | steps: 191 | - name: Harden Runner 192 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 193 | with: 194 | egress-policy: audit 195 | 196 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 197 | with: 198 | path: wheels 199 | pattern: wheels-* 200 | merge-multiple: true 201 | 202 | - name: Publish package distributions to Test PyPI 203 | uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3 204 | with: 205 | repository-url: https://test.pypi.org/legacy/ 206 | packages-dir: 
wheels/ 207 | skip-existing: true 208 | attestations: false 209 | 210 | - name: Publish package distributions to PyPI 211 | if: always() 212 | uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3 213 | with: 214 | packages-dir: wheels/ 215 | skip-existing: true 216 | -------------------------------------------------------------------------------- /.github/workflows/scorecards.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. They are provided 2 | # by a third-party and are governed by separate terms of service, privacy 3 | # policy, and support documentation. 4 | 5 | name: Scorecard supply-chain security 6 | on: 7 | # For Branch-Protection check. Only the default branch is supported. See 8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 9 | branch_protection_rule: 10 | # To guarantee Maintained check is occasionally updated. See 11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 12 | schedule: 13 | - cron: '35 8 * * 5' 14 | push: 15 | branches: [ "master" ] 16 | 17 | # Declare default permissions as read only. 18 | permissions: read-all 19 | 20 | jobs: 21 | analysis: 22 | name: Scorecard analysis 23 | runs-on: ubuntu-latest 24 | permissions: 25 | # Needed to upload the results to code-scanning dashboard. 26 | security-events: write 27 | # Needed to publish results and get a badge (see publish_results below). 28 | id-token: write 29 | 30 | steps: 31 | - name: "Checkout code" 32 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 33 | with: 34 | persist-credentials: false 35 | 36 | - name: "Run analysis" 37 | uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 38 | with: 39 | results_file: results.sarif 40 | results_format: sarif 41 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: 42 | # - you want to enable the Branch-Protection check on a *public* repository, or 43 | # - you are installing Scorecard on a *private* repository 44 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. 45 | # repo_token: ${{ secrets.SCORECARD_TOKEN }} 46 | 47 | # Public repositories: 48 | # - Publish results to OpenSSF REST API for easy access by consumers 49 | # - Allows the repository to include the Scorecard badge. 50 | # - See https://github.com/ossf/scorecard-action#publishing-results. 51 | # For private repositories: 52 | # - `publish_results` will always be set to `false`, regardless 53 | # of the value entered here. 54 | publish_results: true 55 | 56 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF 57 | # format to the repository Actions tab. 58 | - name: "Upload artifact" 59 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 60 | with: 61 | name: SARIF file 62 | path: results.sarif 63 | retention-days: 5 64 | 65 | # Upload the results to GitHub's code scanning dashboard. 
66 |       - name: "Upload to code-scanning"
67 |         uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18
68 |         with:
69 |           sarif_file: results.sarif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | **/*.pyc
3 | build
4 | /target
5 | **/*.rs.bk
6 | dist/
7 | __pycache__/
8 | tantivy.so
9 | tantivy.dylib
10 | tantivy/tantivy.cpython*.so
11 | tantivy.egg-info/
12 | .venv
13 | .envrc
14 | site/
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 | 
5 | # Required
6 | version: 2
7 | 
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.11"
13 |     # You can also specify other tool versions:
14 |     # nodejs: "19"
15 |     # rust: "1.64"
16 |     # golang: "1.19"
17 | 
18 | # Build documentation in the "docs/" directory with MkDocs
19 | mkdocs:
20 |   configuration: mkdocs.yml
21 |   fail_on_warning: false
22 | 
23 | # Optionally build your docs in additional formats such as PDF and ePub
24 | # formats:
25 | #   - pdf
26 | #   - epub
27 | 
28 | # Optional but recommended, declare the Python requirements required
29 | # to build your documentation
30 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
31 | python:
32 |   install:
33 |     - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "tantivy"
3 | version = "0.24.0"
4 | readme = "README.md"
5 | authors = ["Damir Jelić <poljar@termina.org.uk>"]
6 | edition = "2021"
7 | license = "MIT"
8 | 
9 | [lib]
10 | name = "tantivy"
11 | crate-type = ["cdylib"]
12 | 
13 | [build-dependencies]
14 | pyo3-build-config = "0.25.0"
15 | 
16 | [dependencies]
17 | base64 = "0.22"
18 | chrono = "0.4.41"
19 | tantivy = "0.24.1"
20 | itertools = "0.14.0"
21 | futures = "0.3.31"
22 | pythonize = "0.24.0"
23 | serde = "1.0"
24 | serde_json = "1.0.140"
25 | 
26 | [dependencies.pyo3]
27 | version = "0.24.2"
28 | features = ["chrono", "extension-module"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 The Matrix.org Foundation CIC
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Cargo.toml
2 | include Makefile
3 | include rust-toolchain
4 | recursive-include src *
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/quickwit-inc/tantivy-py.svg?branch=master)](https://travis-ci.org/quickwit-inc/tantivy-py)
2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3 | [![Docs](https://readthedocs.org/projects/tantivy-py/badge/?version=latest&style=flat-default)](https://tantivy-py.readthedocs.io/en/latest/)
4 | 
5 | tantivy-py
6 | ==========
7 | 
8 | Python bindings for [Tantivy](https://github.com/quickwit-oss/tantivy), the full-text search engine library written in Rust.
9 | 
10 | # Installation
11 | 
12 | The bindings can be installed from PyPI using pip:
13 | 
14 |     pip install tantivy
15 | 
16 | If no binary wheel is present for your operating system, the bindings will be
17 | built from source, which means that Rust needs to be installed before the build
18 | can succeed.
19 | 
20 | # Documentation
21 | 
22 | Please see [the documentation](https://tantivy-py.readthedocs.io/en/latest/) for more information.
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 |     pyo3_build_config::add_extension_module_link_args();
3 | }
--------------------------------------------------------------------------------
/ci/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | docker run \
4 |     --env MATURIN_PASSWORD="$MATURIN_PASSWORD" \
5 |     --rm \
6 |     -v "$(pwd)":/io \
7 |     konstin2/maturin \
8 |     publish \
9 |     --username __token__ \
10 |     --password "$MATURIN_PASSWORD"
--------------------------------------------------------------------------------
/ci/deploy_mac.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | maturin publish \
4 |     --interpreter python3.9 \
5 |     --username __token__ \
6 |     --password "$MATURIN_PASSWORD" \
7 |     --no-sdist
--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # About
--------------------------------------------------------------------------------
/docs/explanation.md:
--------------------------------------------------------------------------------
1 | # Explanation
2 | 
3 | ## Merge policy
4 | 
5 | When adding documents to a tantivy index, the indexed data will be recorded in multiple
6 | sections, called _segments_. There is more information about the [Life of a Segment](https://github.com/quickwit-oss/tantivy/wiki/Life-of-a-Segment)
7 | on the [tantivy wiki at Github](https://github.com/quickwit-oss/tantivy/wiki).
8 | 
9 | Currently, tantivy-py does not offer a way to customize the merge policy, but fortunately
10 | the default merge policy is the [`LogMergePolicy`](https://docs.rs/tantivy/latest/tantivy/merge_policy/struct.LogMergePolicy.html),
11 | which is a good choice for most use cases. It is aliased as the [default merge policy here](https://docs.rs/tantivy/latest/tantivy/merge_policy/type.DefaultMergePolicy.html).
12 | 
13 | Segment merging is performed in background threads. After adding documents to an index,
14 | it is important to allow time for those threads to complete merges. This is done by calling
15 | `writer.wait_merging_threads()` as the final step after adding data. This method
16 | consumes the writer, and the writer object will no longer be usable afterwards.
17 | 
18 | Here is a short description of the steps in pseudocode:
19 | 
20 | ```
21 | schema = Schema(...)
22 | index = Index(schema)
23 | writer = index.writer()
24 | for ... in data:
25 |     document = Document(...)
26 |     writer.add_document(document)
27 | writer.commit()
28 | writer.wait_merging_threads()
29 | ```
30 | 
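
For reference, the same steps as a small runnable sketch. The schema and the
data here are illustrative only, not part of the pseudocode above:

```python
import tantivy

schema = (
    tantivy.SchemaBuilder()
    .add_text_field("body", stored=True)
    .build()
)
index = tantivy.Index(schema)  # in-memory index
writer = index.writer()
for body in ["first document", "second document"]:
    writer.add_document(tantivy.Document(body=[body]))
writer.commit()
# Allow background segment merges to finish; this consumes the writer.
writer.wait_merging_threads()
```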
--------------------------------------------------------------------------------
/docs/howto.md:
--------------------------------------------------------------------------------
1 | # How-to Guides
2 | 
3 | ## Installation
4 | 
5 | tantivy-py can be installed from [PyPI](https://pypi.org) using pip:
6 | 
7 |     pip install tantivy
8 | 
9 | If no binary wheel is present for your operating system, the bindings will be
10 | built from source, which means that Rust needs to be installed before the build
11 | can succeed.
12 | 
13 | Note that the bindings are using [PyO3](https://github.com/PyO3/pyo3), which
14 | only supports python3.
15 | 
16 | ## Set up a development environment to work on tantivy-py itself
17 | 
18 | A development environment can be set up either in a virtual environment using
19 | [`nox`](https://nox.thea.codes), or against local packages using the provided `Makefile`.
20 | 
21 | For the `nox` setup, install the virtual environment and build the bindings using:
22 | 
23 |     python3 -m pip install nox
24 |     nox
25 | 
26 | For the `Makefile` based setup, run:
27 | 
28 |     make
29 | 
30 | Running the tests is done using:
31 | 
32 |     make test
33 | 
34 | The `nox` test session will pass pytest arguments through. For example,
35 | to run only the tests including "simple_search" in the test name, and only
36 | on Python 3.11:
37 | 
38 |     nox -s test-3.11 -- -k simple_search
39 | 
40 | ## Doctests
41 | 
42 | [Doctests](https://docs.python.org/3/library/doctest.html) are automatically
43 | enabled for all docstrings in the `tantivy` module. Here is a very basic
44 | introduction. Consider the following hypothetical Rust `struct`:
45 | 
46 | ```rust
47 | /// Tantivy's Document is the object that can be indexed and then searched for.
48 | ///
49 | /// Documents are fundamentally a collection of unordered tuples
50 | /// (field_name, value). In this list, one field may appear more than once.
51 | ///
52 | /// Example:
53 | ///     >>> doc = tantivy.Document()
54 | ///     >>> doc.add_text("title", "The Old Man and the Sea")
55 | ///     >>> doc.add_text("body", ("He was an old man who fished alone in a "
56 | ///     ...                       "skiff in the Gulf Stream and he had gone "
57 | ///     ...                       "eighty-four days now without taking a fish."))
58 | ///     >>> doc
59 | ///     Document(body=[He was an ],title=[The Old Ma])
60 | ///
61 | #[pyclass(module = "tantivy")]
62 | #[derive(Clone, Default, PartialEq)]
63 | pub(crate) struct Document {
64 |     pub(crate) field_values: BTreeMap<String, Vec<Value>>,
65 | }
66 | ```
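
For comparison, here is the same mechanism in a plain Python docstring. The
function below is a minimal, made-up illustration and is not part of
tantivy-py:

```python
def add(a: int, b: int) -> int:
    """Add two integers.

    >>> add(2, 3)
    5
    """
    return a + b
```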
"eighty-four days now without taking a fish.")) 58 | /// >>> doc 59 | /// Document(body=[He was an ],title=[The Old Ma]) 60 | /// 61 | #[pyclass(module = "tantivy")] 62 | #[derive(Clone, Default, PartialEq)] 63 | pub(crate) struct Document { 64 | pub(crate) field_values: BTreeMap>, 65 | } 66 | ``` 67 | 68 | When the tests are executed, pytest will automatically search all the docstrings 69 | for `>>>` and `...` and execute the code in the docstring. The output of the 70 | code is compared to the text that follows the code. If the output matches, the 71 | test passes. If the output does not match, the test fails. 72 | 73 | In the above example, a Tantivy document object is created, and then the 74 | representation of the document is printed. This representation, and indeed any 75 | output that manual typing would produce, is compared to the text that follows 76 | and this is how doctests work. 77 | 78 | Doctests are a great way to ensure that the documentation is accurate and up to 79 | date, and doctests are therefore encouraged be present on every public 80 | interface that users will interact with. However, doctest are not suitable 81 | for coverage testing and other more advanced testing methods so you must 82 | judge when to use them. 83 | 84 | ## Working on tantivy-py documentation 85 | 86 | Please be aware that this documentation is structured using the [Diátaxis](https://diataxis.fr/) framework. In very simple terms, this framework will suggest the correct location for different kinds of documentation. Please make sure you gain a basic understanding of the goals of the framework before making large pull requests with new documentation. 87 | 88 | This documentation uses the [MkDocs](https://mkdocs.readthedocs.io/en/stable/) framework. This package is specified as an optional dependency in the `pyproject.toml` file. To install all optional dev dependencies into your virtual env, run the following command: 89 | 90 | pip install .[dev] 91 | 92 | The [MkDocs](https://mkdocs.readthedocs.io/en/stable/) documentation itself is comprehensive. MkDocs provides some additional context and help around [writing with markdown](https://mkdocs.readthedocs.io/en/stable/user-guide/writing-your-docs/#writing-with-markdown). 93 | 94 | If all you want to do is make a few edits right away, the documentation content is in the `/docs` directory and consists of [Markdown](https://www.markdownguide.org/) files, which can be edited with any text editor. 95 | 96 | The most efficient way to work is to run a MkDocs livereload server in the background. This will launch a local web server on your dev machine, serve the docs (by default at `http://localhost:8000`), and automatically reload the page after you save any changes to the documentation files. 97 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to tantivy-py 2 | 3 | tantivy-py is a wrapper for the [tantivy](https://github.com/quickwit-oss/tantivy) full-text search engine, which is inspired by Apache Lucene. 4 | 5 | tantivy-py is [licensed](https://github.com/quickwit-oss/tantivy-py/blob/master/LICENSE) under the [MIT License](https://www.tldrlegal.com/license/mit-license). 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to tantivy-py
2 | 
3 | tantivy-py is a wrapper for the [tantivy](https://github.com/quickwit-oss/tantivy) full-text search engine, which is inspired by Apache Lucene.
4 | 
5 | tantivy-py is [licensed](https://github.com/quickwit-oss/tantivy-py/blob/master/LICENSE) under the [MIT License](https://www.tldrlegal.com/license/mit-license).
6 | 
7 | ## Important links
8 | 
9 | - [tantivy-py code repository](https://github.com/quickwit-oss/tantivy-py)
10 | - [tantivy code repository](https://github.com/quickwit-oss/tantivy)
11 | - [tantivy Documentation](https://docs.rs/crate/tantivy/latest)
12 | - [tantivy query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query)
13 | 
14 | ## How to use this documentation
15 | 
16 | This documentation uses the [Diátaxis](https://diataxis.fr/) framework. The following sections are clearly separated:
17 | 
18 | - [Tutorials](tutorials.md): when you want to learn
19 | - [How-to Guides](howto.md): when you need to accomplish a task
20 | - [Explanation](explanation.md): when you need a broader understanding and the thinking behind why certain things are set up in a particular way
21 | - [Reference](reference.md): when you need precise, detailed information
22 | 
23 | 
--------------------------------------------------------------------------------
/docs/reference.md:
--------------------------------------------------------------------------------
1 | # Reference
2 | 
3 | ## Setup
4 | 
5 | We'll use a test index for the examples that follow.
6 | 
7 | ```python
8 | import os
9 | from tantivy import SchemaBuilder, Index, Document
10 | schema = (
11 |     SchemaBuilder()
12 |     .add_integer_field("doc_id", indexed=True, stored=True)
13 |     .add_text_field("title", stored=True)
14 |     .add_text_field("body")
15 |     .build()
16 | )
17 | index = Index(schema=schema, path=None)
18 | writer = index.writer(heap_size=15_000_000, num_threads=1)
19 | doc = Document()
20 | doc.add_integer("doc_id", 1)
21 | doc.add_text("title", "The Old Man and the Sea")
22 | doc.add_text(
23 |     "body",
24 |     (
25 |         "He was an old man who fished alone in a skiff in "
26 |         "the Gulf Stream and he had gone eighty-four days "
27 |         "now without taking a fish."
28 |     ),
29 | )
30 | writer.add_document(doc)
31 | 
32 | doc = Document()
33 | doc.add_integer("doc_id", 2)
34 | doc.add_text("title", "The Old Man and the Sea II")
35 | doc.add_text("body", "He was an old man who sailed alone.")
36 | 
37 | writer.add_document(doc)
38 | writer.commit()
39 | index.reload()
40 | ```
41 | 
42 | ## Valid Query Formats
43 | 
44 | tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
45 | Below, a few basic query formats are shown:
46 | 
47 | - AND and OR conjunctions.
48 | ```python
49 | searcher = index.searcher()
50 | query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
51 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
52 | best_doc = searcher.doc(best_doc_address)
53 | ```
54 | 
55 | - `+` (include) and `-` (exclude) operators.
56 | ```python
57 | query = index.parse_query('+Old +Man chef -fished', ["title", "body"])
58 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
59 | best_doc = searcher.doc(best_doc_address)
60 | ```
61 | Note: in a query like the one above, a word with no `+`/`-` prefix acts like an OR.
62 | 
63 | - phrase search.
64 | ```python
65 | query = index.parse_query('"eighty-four days"', ["title", "body"])
66 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
67 | best_doc = searcher.doc(best_doc_address)
68 | ```
69 | 
70 | - integer search
71 | ```python
72 | query = index.parse_query('1', ["doc_id"])
73 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
74 | best_doc = searcher.doc(best_doc_address)
75 | ```
76 | Note: for integer search, the integer field should be indexed.
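
- field-scoped terms. Restricting a term to a single field inline is also
  part of the upstream tantivy query language (a brief illustrative example,
  using the same index as above):
```python
query = index.parse_query('title:Sea AND body:fished', ["title", "body"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
```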
77 | 
78 | For more query formats and query options, see the [Tantivy Query Parser Docs](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
79 | 
80 | ## Escape quotes inside a query string
81 | 
82 | The tantivy docs for the query parser say that special characters like quotes can be
83 | escaped inside query values. However, it will also be necessary to surround
84 | the search query in additional quotes, as if a phrase query were being used.
85 | 
86 | The following will NOT work:
87 | 
88 | ```python
89 | try:
90 |     index.parse_query(r'sea\"', ["title", "body"])
91 | except ValueError as e:
92 |     assert str(e) == r'Syntax Error: sea\"'
93 | ```
94 | 
95 | However, the following will succeed:
96 | 
97 | ```python
98 | # Works!
99 | index.parse_query(r'"sea\""', ["title", "body"])
100 | ```
101 | 
102 | Note that whether the included (and escaped) quote actually gets used
103 | to match documents depends on the tokenizer used for the field. For example,
104 | the default tokenizer will not match the document "sea\"s" with the query
105 | "sea\"", because this tokenizer discards punctuation.
106 | 
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | mkdocs==1.4.3
2 | mktestdocs==0.2.1
--------------------------------------------------------------------------------
/docs/tutorials.md:
--------------------------------------------------------------------------------
1 | # Tutorials
2 | 
3 | ## Building an index and populating it
4 | 
5 | ```python
6 | import tempfile
7 | import pathlib
8 | import tantivy
9 | 
10 | # Declaring our schema.
11 | schema_builder = tantivy.SchemaBuilder()
12 | schema_builder.add_text_field("title", stored=True)
13 | schema_builder.add_text_field("body", stored=True)
14 | schema_builder.add_integer_field("doc_id", stored=True)
15 | schema = schema_builder.build()
16 | 
17 | # Creating our index (in memory)
18 | index = tantivy.Index(schema)
19 | ```
20 | 
21 | To have a persistent index, use the path
22 | parameter to store the index on the disk, e.g:
23 | 
24 | ```python
25 | tmpdir = tempfile.TemporaryDirectory()
26 | index_path = pathlib.Path(tmpdir.name) / "index"
27 | index_path.mkdir()
28 | persistent_index = tantivy.Index(schema, path=str(index_path))
29 | ```
30 | 
31 | By default, tantivy offers the following tokenizers,
32 | which can be used in tantivy-py:
33 | - `default`
34 | `default` is the tokenizer that will be used if you do not
35 | assign a specific tokenizer to your text field.
36 | It will chop your text on punctuation and whitespace,
37 | remove tokens that are longer than 40 characters, and lowercase your text.
38 | 
39 | - `raw`
40 | Does not actually tokenize your text. It keeps it entirely unprocessed.
41 | It can be useful for indexing uuids or urls, for instance.
42 | 
43 | - `en_stem`
44 | 
45 | In addition to what `default` does, the `en_stem` tokenizer also
46 | applies stemming to your tokens. Stemming consists of trimming words to
Stemming consists in trimming words to 47 | remove their inflection. This tokenizer is slower than the default one, 48 | but is recommended to improve recall. 49 | 50 | to use the above tokenizers, simply provide them as a parameter to `add_text_field`. e.g. 51 | ```python 52 | schema_builder_tok = tantivy.SchemaBuilder() 53 | schema_builder_tok.add_text_field("body", stored=True, tokenizer_name='en_stem') 54 | ``` 55 | 56 | ## Adding one document. 57 | 58 | ```python 59 | writer = index.writer() 60 | writer.add_document(tantivy.Document( 61 | doc_id=1, 62 | title=["The Old Man and the Sea"], 63 | body=["""He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."""], 64 | )) 65 | # ... and committing 66 | writer.commit() 67 | writer.wait_merging_threads() 68 | ``` 69 | 70 | Note that `wait_merging_threads()` must come at the end, because 71 | the `writer` object will not be usable after this call. 72 | 73 | ## Building and Executing Queries with the Query Parser 74 | 75 | With the Query Parser, you can easily build simple queries for your index. 76 | 77 | First you need to get a searcher for the index 78 | 79 | ```python 80 | # Reload the index to ensure it points to the last commit. 81 | index.reload() 82 | searcher = index.searcher() 83 | ``` 84 | 85 | Then you need to get a valid query object by parsing your query on the index. 86 | 87 | ```python 88 | query = index.parse_query("fish days", ["title", "body"]) 89 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0] 90 | best_doc = searcher.doc(best_doc_address) 91 | assert best_doc["title"] == ["The Old Man and the Sea"] 92 | ``` 93 | 94 | The `parse_query` method takes in a query string (visit [reference](reference.md#valid-query-formats) for more details on the syntax) and create a `Query` object that can be used to search the index. 95 | 96 | In Tantivy, hit documents during search will return a `DocAddress` object that can be used to retrieve the document from the searcher, rather than returning the document directly. 97 | 98 | ## Building and Executing Queries with Query Objects 99 | 100 | > *This is an advanced topic. Only consider this if you need very fine-grained control over your queries, or existing query parsers do not meet your needs.* 101 | 102 | If you have a Lucene / ElasticSearch background, you might be more comfortable building nested queries programmatically. Also, some queries (e.g. ConstQuery, DisjunctionMaxQuery) are not supported by the query parser due to their complexity in expression. 103 | 104 | Consider the following query in ElasticSearch: 105 | 106 | ```json 107 | { 108 | "query": { 109 | "bool": { 110 | "must": [ 111 | { 112 | "dis_max": { 113 | "queries": [ 114 | { 115 | "match": { 116 | "title": { 117 | "query": "fish", 118 | "boost": 2 119 | } 120 | } 121 | }, 122 | { 123 | "match": { 124 | "body": { 125 | "query": "eighty-four days", 126 | "boost": 1.5 127 | } 128 | } 129 | } 130 | ], 131 | "tie_breaker": 0.3 132 | } 133 | } 134 | ] 135 | } 136 | } 137 | } 138 | ``` 139 | 140 | It is impossible to express this query using the query parser. Instead, you can build the query programmatically mixing with the query parser: 141 | 142 | ```python 143 | from tantivy import Query, Occur, Index 144 | 145 | ... 
171 | 
172 | 
173 | 
174 | ## Using the snippet generator
175 | 
176 | Let's revisit the query `"fish days"` in our [example](#building-and-executing-queries-with-the-query-parser):
177 | 
178 | ```python
179 | hit_text = best_doc["body"][0]
180 | print(f"{hit_text=}")
181 | assert hit_text == (
182 |     "He was an old man who fished alone in a skiff in the "
183 |     "Gulf Stream and he had gone eighty-four days now "
184 |     "without taking a fish."
185 | )
186 | 
187 | from tantivy import SnippetGenerator
188 | snippet_generator = SnippetGenerator.create(
189 |     searcher, query, schema, "body"
190 | )
191 | snippet = snippet_generator.snippet_from_doc(best_doc)
192 | ```
193 | 
194 | The snippet object provides the hit ranges. These are the marker
195 | offsets in the text that match the query.
196 | 
197 | ```python
198 | highlights = snippet.highlighted()
199 | first_highlight = highlights[0]
200 | assert first_highlight.start == 93
201 | assert first_highlight.end == 97
202 | assert hit_text[first_highlight.start:first_highlight.end] == "days"
203 | ```
204 | 
205 | The snippet object can also generate a marked-up HTML snippet:
206 | 
207 | ```python
208 | html_snippet = snippet.to_html()
209 | assert html_snippet == (
210 |     "He was an old man who fished alone in a skiff in the "
211 |     "Gulf Stream and he had gone eighty-four <b>days</b> now "
212 |     "without taking a <b>fish</b>"
213 | )
214 | ```
215 | 
216 | 
217 | ## Create a Custom Tokenizer (Text Analyzer)
218 | 
219 | Tantivy provides several built-in tokenizers and filters that
220 | can be chained together to create new tokenizers (or
221 | 'text analyzers') that better fit your needs.
222 | 
223 | Tantivy-py lets you access these components, assemble them,
224 | and register the result with an index.
225 | 
226 | Let's walk through creating and registering a custom text analyzer
227 | to see how everything fits together.
228 | 
229 | ### Example
230 | 
231 | First, let's create a text analyzer. As explained further down,
232 | a text analyzer is a pipeline consisting of one tokenizer and
233 | any number of token filters.
234 | 
235 | ```python
236 | from tantivy import (
237 |     TextAnalyzer,
238 |     TextAnalyzerBuilder,
239 |     Tokenizer,
240 |     Filter,
241 |     Index,
242 |     SchemaBuilder
243 | )
244 | 
245 | my_analyzer: TextAnalyzer = (
246 |     TextAnalyzerBuilder(
247 |         # Create a `Tokenizer` instance.
248 |         # It instructs the builder about which type of tokenizer
249 |         # to create internally and with which arguments.
250 |         Tokenizer.regex(r"(?i)([a-z]+)")
251 |     )
252 |     .filter(
253 |         # Create a `Filter` instance.
254 |         # Like `Tokenizer`, this object provides instructions
255 |         # to the builder.
256 |         Filter.lowercase()
257 |     )
258 |     .filter(
259 |         # Define custom stopwords.
260 |         Filter.custom_stopword(["www", "com"])
261 |     )
262 |     # Finally, build a TextAnalyzer,
263 |     # chaining all tokenizer > [filter, ...] steps together.
264 |     .build()
265 | )
266 | ```
267 | 
268 | We can check that our new analyzer is working as expected
269 | by passing some text to its `.analyze()` method.
270 | 
271 | ```python
272 | # Prints: ['this', 'website', 'might', 'exist']
273 | print(my_analyzer.analyze('www.this1website1might1exist.com'))
274 | ```
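
Note that filters run in the order they were added: because
`Filter.lowercase()` comes before the stopword filter, uppercase variants of
the stopwords are also removed. An illustrative check (the input string is
made up):

```python
# "WWW" and "COM" are lowercased first, then dropped as stopwords.
print(my_analyzer.analyze('WWW.Example.COM'))  # ['example']
```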
275 | 
276 | The next step is to register our analyzer with an index. Let's
277 | assume we already have one.
278 | 
279 | ```python
280 | index.register_tokenizer("custom_analyzer", my_analyzer)
281 | ```
282 | 
283 | To link an analyzer to a field in the index, pass the
284 | analyzer name to the `tokenizer_name=` parameter of
285 | the `SchemaBuilder`'s `add_text_field()` method.
286 | 
287 | Here is the schema that was used to construct our index:
288 | 
289 | ```python
290 | schema = (
291 |     tantivy.SchemaBuilder()
292 |     .add_text_field("content", tokenizer_name="custom_analyzer")
293 |     .build()
294 | )
295 | index = Index(schema)
296 | ```
297 | 
298 | Summary:
299 | 
300 | 1. Use `TextAnalyzerBuilder`, `Tokenizer`, and `Filter` to build a `TextAnalyzer`.
301 | 2. The analyzer's `.analyze()` method lets you use your analyzer as a tokenizer from Python.
302 | 3. Refer to your analyzer's name when building the index schema.
303 | 4. Use the same name when registering your analyzer on the index.
304 | 
305 | 
306 | ### On terminology: Tokenizer vs. Text Analyzer
307 | 
308 | Tantivy-py mimics Tantivy's interface as closely as possible.
309 | This includes minor terminological inconsistencies, one of
310 | which is how Tantivy distinguishes between 'tokenizers' and
311 | 'text analyzers'.
312 | 
313 | Quite simply, a 'tokenizer' segments text into tokens.
314 | A 'text analyzer' is a pipeline consisting of one tokenizer
315 | and zero or more token filters. The `TextAnalyzer` is the
316 | primary object of interest when talking about how to
317 | change Tantivy's tokenization behavior.
318 | 
319 | Slightly confusingly, though, the `Index` and `SchemaBuilder`
320 | interfaces use 'tokenizer' to mean 'text analyzer'.
321 | 
322 | This inconsistency can be observed in `SchemaBuilder.add_text_field`, e.g. --
323 | 
324 | ```
325 | SchemaBuilder.add_text_field(..., tokenizer_name=<analyzer_name>)
326 | ```
327 | 
328 | -- and in the name of the `Index.register_tokenizer(...)` method, which actually
329 | serves to register a *text analyzer*.
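
Putting the pieces together, here is an end-to-end sketch. The field name,
document contents, and query are illustrative only:

```python
import tantivy
from tantivy import TextAnalyzerBuilder, Tokenizer, Filter

# Schema refers to the analyzer by name.
schema = (
    tantivy.SchemaBuilder()
    .add_text_field("content", stored=True, tokenizer_name="custom_analyzer")
    .build()
)
index = tantivy.Index(schema)

analyzer = (
    TextAnalyzerBuilder(Tokenizer.regex(r"(?i)([a-z]+)"))
    .filter(Filter.lowercase())
    .filter(Filter.custom_stopword(["www", "com"]))
    .build()
)
# The registered name must match the tokenizer_name used in the schema.
index.register_tokenizer("custom_analyzer", analyzer)

writer = index.writer()
writer.add_document(tantivy.Document(content=["www.some1website.com"]))
writer.commit()
writer.wait_merging_threads()

index.reload()
searcher = index.searcher()
query = index.parse_query("website", ["content"])
assert searcher.search(query, 1).hits  # "website" was indexed; "www" was not
```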
330 | 
331 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: tantivy-py
2 | # site_url: https://example.com
3 | nav:
4 |   - Home: index.md
5 |   - Tutorials: tutorials.md
6 |   - How-to Guides: howto.md
7 |   - Explanation: explanation.md
8 |   - Reference: reference.md
9 |   - About: about.md
10 | theme: readthedocs
11 | 
12 | # Can nest documents under above sections
13 | #  - 'User Guide':
14 | #      - 'Writing your docs': 'writing-your-docs.md'
15 | #      - 'Styling your docs': 'styling-your-docs.md'
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
1 | import nox
2 | 
3 | 
4 | @nox.session(python=["3.9", "3.10", "3.11", "3.12", "3.13"])
5 | def test(session):
6 |     session.install("-rrequirements-dev.txt")
7 |     session.install("-e", ".", "--no-build-isolation")
8 |     session.run("pytest", *session.posargs)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin<=1.3.2"]
3 | build-backend = "maturin"
4 | 
5 | [project]
6 | name = "tantivy"
7 | version = "0.24.0"
8 | description = "Official Python bindings for the Tantivy search engine"
9 | requires-python = ">=3.9"
10 | authors = [
11 |     { name = "Damir Jelić", email="poljar@termina.org.uk" },
12 |     { name = "Caleb Hattingh", email = "code@cjrh.info" },
13 |     { name = "Cam Parry", email = "cam.parry@kapiche.com"}
14 | ]
15 | optional-dependencies = { dev = ["nox"] }
16 | 
17 | [tool.maturin]
18 | bindings = "pyo3"
19 | 
20 | [tool.pytest.ini_options]
21 | # Set the durations option and doctest modules
22 | # See https://docs.pytest.org/en/latest/usage.html#durations
23 | addopts = "--doctest-modules --durations=10"
24 | # Use the `--ignore-glob` setting to exclude the `noxfile.py` module from the doctests
25 | # See https://docs.pytest.org/en/latest/reference.html#confval-ignore_glob
26 | testpaths = [
27 |     "tests",
28 |     "tantivy",
29 |     "src",
30 | ]
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | maturin
2 | pytest>=4.0
3 | mktestdocs==0.2.1
--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "stable"
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 80
--------------------------------------------------------------------------------
/src/facet.rs:
--------------------------------------------------------------------------------
1 | use crate::to_pyerr;
2 | use pyo3::{
3 |     basic::CompareOp,
4 |     prelude::*,
5 |     types::{PyTuple, PyType},
6 |     IntoPyObjectExt,
7 | };
8 | use serde::{Deserialize, Serialize};
9 | use tantivy::schema;
10 | 
11 | /// A Facet represents a point in a given hierarchy.
12 | ///
13 | /// They are typically represented similarly to a filepath. For instance, an
14 | /// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv.
15 | ///
16 | /// A document can be associated with any number of facets. The hierarchy
17 | /// implicitly implies that a document belonging to a facet also belongs to
18 | /// the ancestors of that facet. In the example above, the document would also
19 | /// belong to /electronics/tv_and_video/ and /electronics.
20 | #[pyclass(frozen, module = "tantivy.tantivy")]
21 | #[derive(Clone, Deserialize, PartialEq, Serialize)]
22 | pub(crate) struct Facet {
23 |     pub(crate) inner: schema::Facet,
24 | }
25 | 
26 | #[pymethods]
27 | impl Facet {
28 |     /// Creates a `Facet` from its binary representation.
29 |     #[staticmethod]
30 |     fn from_encoded(encoded_bytes: Vec<u8>) -> PyResult<Self> {
31 |         let inner =
32 |             schema::Facet::from_encoded(encoded_bytes).map_err(to_pyerr)?;
33 |         Ok(Self { inner })
34 |     }
35 | 
36 |     /// Create a new instance of the "root facet", equivalent to /.
37 |     #[classmethod]
38 |     fn root(_cls: &Bound<PyType>) -> Facet {
39 |         Facet {
40 |             inner: schema::Facet::root(),
41 |         }
42 |     }
43 | 
44 |     /// Returns true if the facet is the root facet /.
45 |     #[getter]
46 |     fn is_root(&self) -> bool {
47 |         self.inner.is_root()
48 |     }
49 | 
50 |     /// Returns true if another Facet is a subfacet of this facet.
51 |     /// Args:
52 |     ///     other (Facet): The Facet to check; returns true if this facet
53 |     ///         is a prefix of it.
54 |     fn is_prefix_of(&self, other: &Facet) -> bool {
55 |         self.inner.is_prefix_of(&other.inner)
56 |     }
57 | 
58 |     /// Create a Facet object from a string.
59 |     /// Args:
60 |     ///     facet_string (str): The string that contains a facet.
61 |     ///
62 |     /// Returns the created Facet.
63 |     #[classmethod]
64 |     fn from_string(_cls: &Bound<PyType>, facet_string: &str) -> Facet {
65 |         Facet {
66 |             inner: schema::Facet::from(facet_string),
67 |         }
68 |     }
69 | 
70 |     /// Returns the list of `segments` that forms a facet path.
71 |     ///
72 |     /// For instance `/europe/france` becomes `["europe", "france"]`.
73 |     fn to_path(&self) -> Vec<&str> {
74 |         self.inner.to_path()
75 |     }
76 | 
77 |     /// Returns the facet string representation.
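    ///
    /// For example (illustrative): a facet built from `/europe/france` is
    /// rendered back as the string `/europe/france`, whereas `to_path()`
    /// yields `["europe", "france"]`.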
78 |     fn to_path_str(&self) -> String {
79 |         self.inner.to_string()
80 |     }
81 | 
82 |     fn __repr__(&self) -> PyResult<String> {
83 |         Ok(format!("Facet({})", self.to_path_str()))
84 |     }
85 | 
86 |     fn __richcmp__(
87 |         &self,
88 |         other: &Self,
89 |         op: CompareOp,
90 |         py: Python<'_>,
91 |     ) -> PyResult<PyObject> {
92 |         match op {
93 |             CompareOp::Eq => (self == other).into_py_any(py),
94 |             CompareOp::Ne => (self != other).into_py_any(py),
95 |             _ => Ok(py.NotImplemented()),
96 |         }
97 |     }
98 | 
99 |     fn __reduce__<'a>(
100 |         slf: PyRef<'a, Self>,
101 |         py: Python<'a>,
102 |     ) -> PyResult<Bound<'a, PyTuple>> {
103 |         let encoded_bytes = slf.inner.encoded_str().as_bytes().to_vec();
104 |         let deserializer = slf.into_pyobject(py)?.getattr("from_encoded")?;
105 |         PyTuple::new(
106 |             py,
107 |             [deserializer, PyTuple::new(py, [encoded_bytes])?.into_any()],
108 |         )
109 |     }
110 | }
111 | 
--------------------------------------------------------------------------------
/src/index.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::new_ret_no_self)]
2 | 
3 | use std::collections::HashMap;
4 | 
5 | use pyo3::{exceptions, prelude::*, types::PyAny};
6 | 
7 | use crate::{
8 |     document::{extract_value, Document},
9 |     get_field,
10 |     parser_error::QueryParserErrorIntoPy,
11 |     query::Query,
12 |     schema::Schema,
13 |     searcher::Searcher,
14 |     to_pyerr,
15 |     tokenizer::TextAnalyzer as PyTextAnalyzer,
16 | };
17 | use tantivy as tv;
18 | use tantivy::{
19 |     directory::MmapDirectory,
20 |     schema::{
21 |         document::TantivyDocument, NamedFieldDocument, OwnedValue as Value,
22 |         Term,
23 |     },
24 |     tokenizer::{
25 |         Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
26 |         TextAnalyzer,
27 |     },
28 | };
29 | 
30 | const RELOAD_POLICY: &str = "commit";
31 | 
32 | /// IndexWriter is the user entry-point to add documents to the index.
33 | ///
34 | /// To create an IndexWriter, first create an Index and call the writer()
35 | /// method on the index object.
36 | #[pyclass(module = "tantivy.tantivy")]
37 | pub(crate) struct IndexWriter {
38 |     inner_index_writer: Option<tv::IndexWriter>,
39 |     schema: tv::schema::Schema,
40 | }
41 | 
42 | impl IndexWriter {
43 |     fn inner(&self) -> PyResult<&tv::IndexWriter> {
44 |         self.inner_index_writer.as_ref().ok_or_else(|| {
45 |             exceptions::PyRuntimeError::new_err(
46 |                 "IndexWriter was consumed and no longer in a valid state",
47 |             )
48 |         })
49 |     }
50 | 
51 |     fn inner_mut(&mut self) -> PyResult<&mut tv::IndexWriter> {
52 |         self.inner_index_writer.as_mut().ok_or_else(|| {
53 |             exceptions::PyRuntimeError::new_err(
54 |                 "IndexWriter was consumed and no longer in a valid state",
55 |             )
56 |         })
57 |     }
58 | 
59 |     fn take_inner(&mut self) -> PyResult<tv::IndexWriter> {
60 |         self.inner_index_writer.take().ok_or_else(|| {
61 |             exceptions::PyRuntimeError::new_err(
62 |                 "IndexWriter was consumed and no longer in a valid state",
63 |             )
64 |         })
65 |     }
66 | }
67 | 
68 | #[pymethods]
69 | impl IndexWriter {
70 |     /// Add a document to the index.
71 |     ///
72 |     /// If the indexing pipeline is full, this call may block.
73 |     ///
74 |     /// Returns an `opstamp`, which is an increasing integer that can be used
75 |     /// by the client to align commits with its own document queue.
76 |     /// The `opstamp` represents the number of documents that have been added
77 |     /// since the creation of the index.
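    ///
    /// Example (an illustrative Python sketch, not a doctest; it assumes an
    /// `index` whose schema has a "title" text field):
    ///
    ///     writer = index.writer()
    ///     opstamp = writer.add_document(tantivy.Document(title=["Frankenstein"]))
    ///     writer.commit()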
78 |     pub fn add_document(&mut self, doc: &Document) -> PyResult<u64> {
79 |         let named_doc = NamedFieldDocument(doc.field_values.clone());
80 |         let doc = TantivyDocument::convert_named_doc(&self.schema, named_doc)
81 |             .map_err(to_pyerr)?;
82 |         self.inner()?.add_document(doc).map_err(to_pyerr)
83 |     }
84 | 
85 |     /// Helper for the `add_document` method, but passing a json string.
86 |     ///
87 |     /// If the indexing pipeline is full, this call may block.
88 |     ///
89 |     /// Returns an `opstamp`, which is an increasing integer that can be used
90 |     /// by the client to align commits with its own document queue.
91 |     /// The `opstamp` represents the number of documents that have been added
92 |     /// since the creation of the index.
93 |     pub fn add_json(&mut self, json: &str) -> PyResult<u64> {
94 |         let doc = TantivyDocument::parse_json(&self.schema, json)
95 |             .map_err(to_pyerr)?;
96 |         let opstamp = self.inner()?.add_document(doc);
97 |         opstamp.map_err(to_pyerr)
98 |     }
99 | 
100 |     /// Commits all of the pending changes.
101 |     ///
102 |     /// A call to commit blocks. After it returns, all of the documents that
103 |     /// were added since the last commit are published and persisted.
104 |     ///
105 |     /// In case of a crash or a hardware failure (as long as the hard disk is
106 |     /// spared), it will be possible to resume indexing from this point.
107 |     ///
108 |     /// Returns the `opstamp` of the last document that made it in the commit.
109 |     fn commit(&mut self) -> PyResult<u64> {
110 |         self.inner_mut()?.commit().map_err(to_pyerr)
111 |     }
112 | 
113 |     /// Rollback to the last commit.
114 |     ///
115 |     /// This cancels all of the updates that happened after the last
116 |     /// commit. After calling rollback, the index is in the same state as it
117 |     /// was after the last commit.
118 |     fn rollback(&mut self) -> PyResult<u64> {
119 |         self.inner_mut()?.rollback().map_err(to_pyerr)
120 |     }
121 | 
122 |     /// Detects and removes the files that are not used by the index anymore.
123 |     fn garbage_collect_files(&mut self) -> PyResult<()> {
124 |         use futures::executor::block_on;
125 |         block_on(self.inner()?.garbage_collect_files()).map_err(to_pyerr)?;
126 |         Ok(())
127 |     }
128 | 
129 |     /// Deletes all documents from the index.
130 |     fn delete_all_documents(&mut self) -> PyResult<()> {
131 |         self.inner()?.delete_all_documents().map_err(to_pyerr)?;
132 |         Ok(())
133 |     }
134 | 
135 |     /// The opstamp of the last successful commit.
136 |     ///
137 |     /// This is the opstamp the index will roll back to if there is a failure
138 |     /// like a power surge.
139 |     ///
140 |     /// This is also the opstamp of the commit that is currently available
141 |     /// for searchers.
142 |     #[getter]
143 |     fn commit_opstamp(&self) -> PyResult<u64> {
144 |         Ok(self.inner()?.commit_opstamp())
145 |     }
146 | 
147 |     #[deprecated(
148 |         note = "This method is deprecated and will be removed in the future. Use either delete_documents_by_term, or delete_documents_by_query."
149 |     )]
150 |     fn delete_documents(
151 |         &mut self,
152 |         field_name: &str,
153 |         field_value: &Bound<PyAny>,
154 |     ) -> PyResult<u64> {
155 |         self.delete_documents_by_term(field_name, field_value)
156 |     }
157 | 
158 |     /// Delete all documents containing a given term.
159 |     ///
160 |     /// This method does not parse the given term and it expects the term to be
161 |     /// already tokenized according to any tokenizers attached to the field. This
162 |     /// can often result in surprising behaviour.
For example, if you want to store
163 | /// UUIDs as text in a field, and those values have hyphens, and you use the
164 | /// default tokenizer which removes punctuation, you will not be able to delete
165 | /// a document added with a particular UUID by passing the same UUID to this
166 | /// method. In such workflows where deletions are required, particularly with
167 | /// string values, it is strongly recommended to use the
168 | /// "raw" tokenizer as this will match exactly. In situations where you do
169 | /// want tokenization to be applied, it is recommended to use the
170 | /// `delete_documents_by_query` method instead, which will delete documents
171 | /// matching the given query using the same query parser as used in search queries.
172 | ///
173 | /// Args:
174 | /// field_name (str): The field name for which we want to filter deleted docs.
175 | /// field_value (PyAny): Python object with the value we want to filter.
176 | ///
177 | /// If the field_name is not on the schema raises ValueError exception.
178 | /// If the field_value is not supported raises Exception.
179 | fn delete_documents_by_term(
180 | &mut self,
181 | field_name: &str,
182 | field_value: &Bound<PyAny>,
183 | ) -> PyResult<u64> {
184 | let field = get_field(&self.schema, field_name)?;
185 | let value = extract_value(field_value)?;
186 | let term = match value {
187 | Value::Null => {
188 | return Err(exceptions::PyValueError::new_err(format!(
189 | "Field `{field_name}` is null type not deletable."
190 | )))
191 | },
192 | Value::Str(text) => Term::from_field_text(field, &text),
193 | Value::U64(num) => Term::from_field_u64(field, num),
194 | Value::I64(num) => Term::from_field_i64(field, num),
195 | Value::F64(num) => Term::from_field_f64(field, num),
196 | Value::Date(d) => Term::from_field_date(field, d),
197 | Value::Facet(facet) => Term::from_facet(field, &facet),
198 | Value::Bytes(_) => {
199 | return Err(exceptions::PyValueError::new_err(format!(
200 | "Field `{field_name}` is bytes type not deletable."
201 | )))
202 | }
203 | Value::PreTokStr(_pretok) => {
204 | return Err(exceptions::PyValueError::new_err(format!(
205 | "Field `{field_name}` is pretokenized. This is not authorized for delete."
206 | )))
207 | }
208 | Value::Array(_) => {
209 | return Err(exceptions::PyValueError::new_err(format!(
210 | "Field `{field_name}` is array type not deletable."
211 | )))
212 | }
213 | Value::Object(_) => {
214 | return Err(exceptions::PyValueError::new_err(format!(
215 | "Field `{field_name}` is json object type not deletable."
216 | )))
217 | },
218 | Value::Bool(b) => Term::from_field_bool(field, b),
219 | Value::IpAddr(i) => Term::from_field_ip_addr(field, i)
220 | };
221 | Ok(self.inner()?.delete_term(term))
222 | }
223 |
224 | /// Delete all documents matching a given query.
225 | ///
226 | /// Example:
227 | ///
228 | /// ```python
229 | /// schema_builder = SchemaBuilder()
230 | /// schema_builder.add_text_field("title", fast=True)
231 | /// schema = schema_builder.build()
232 | /// index = Index(schema)
233 | /// writer = index.writer()
234 | /// source_doc = {
235 | /// "title": "Here is some text"
236 | /// }
237 | /// writer.add_json(json.dumps(source_doc))
238 | /// writer.commit()
239 | /// writer.wait_merging_threads()
240 | ///
241 | /// query = index.parse_query("title:text")
242 | /// writer = index.writer()
243 | /// writer.delete_documents_by_query(query)
244 | /// writer.commit()
245 | /// writer.wait_merging_threads()
246 | /// ```
247 | ///
248 | /// Args:
249 | /// query (Query): The query to filter the deleted documents.
250 | ///
251 | /// If the query is not valid raises ValueError exception.
252 | /// If the query is not supported raises Exception.
253 | fn delete_documents_by_query(&mut self, query: &Query) -> PyResult<u64> {
254 | self.inner()?
255 | .delete_query(query.inner.box_clone())
256 | .map_err(to_pyerr)
257 | }
258 |
259 | /// If there are some merging threads, blocks until they all finish
260 | /// their work and then drops the `IndexWriter`.
261 | ///
262 | /// This will consume the `IndexWriter`. Further accesses to the
263 | /// object will result in an error.
264 | pub fn wait_merging_threads(&mut self) -> PyResult<()> {
265 | self.take_inner()?.wait_merging_threads().map_err(to_pyerr)
266 | }
267 | }
268 |
269 | /// Create a new index object.
270 | ///
271 | /// Args:
272 | /// schema (Schema): The schema of the index.
273 | /// path (str, optional): The path where the index should be stored. If
274 | /// no path is provided, the index will be stored in memory.
275 | /// reuse (bool, optional): Should we open an existing index if one exists
276 | /// or always create a new one.
277 | ///
278 | /// If an index already exists it will be opened and reused. Raises OSError
279 | /// if there was a problem during the opening or creation of the index.
280 | #[pyclass(module = "tantivy.tantivy")]
281 | pub(crate) struct Index {
282 | pub(crate) index: tv::Index,
283 | reader: tv::IndexReader,
284 | }
285 |
286 | #[pymethods]
287 | impl Index {
288 | #[staticmethod]
289 | fn open(path: &str) -> PyResult<Index> {
290 | let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
291 |
292 | Index::register_custom_text_analyzers(&index);
293 |
294 | let reader = index.reader().map_err(to_pyerr)?;
295 | Ok(Index { index, reader })
296 | }
297 |
298 | #[new]
299 | #[pyo3(signature = (schema, path = None, reuse = true))]
300 | fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Index> {
301 | let index = match path {
302 | Some(p) => {
303 | let directory = MmapDirectory::open(p).map_err(to_pyerr)?;
304 | if reuse {
305 | tv::Index::open_or_create(directory, schema.inner.clone())
306 | } else {
307 | tv::Index::create(
308 | directory,
309 | schema.inner.clone(),
310 | tv::IndexSettings::default(),
311 | )
312 | }
313 | .map_err(to_pyerr)?
314 | }
315 | None => tv::Index::create_in_ram(schema.inner.clone()),
316 | };
317 |
318 | Index::register_custom_text_analyzers(&index);
319 |
320 | let reader = index.reader().map_err(to_pyerr)?;
321 | Ok(Index { index, reader })
322 | }
323 |
324 | /// Create an `IndexWriter` for the index.
325 | ///
326 | /// The writer will be multithreaded and the provided heap size will be
327 | /// split between the given number of threads.
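/// Example (a minimal sketch; `schema` stands for an already built
/// Schema object):
///
/// ```python
/// index = Index(schema)
/// # Both arguments are optional; shown here with explicit values.
/// writer = index.writer(heap_size=15_000_000, num_threads=1)
/// ```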
328 | ///
329 | /// Args:
330 | /// heap_size (int, optional): The total target heap memory usage of
331 | /// the writer. Tantivy requires that this can't be less
332 | /// than 3000000 *per thread*. Lower values will result in more
333 | /// frequent internal commits when adding documents (slowing down
334 | /// write progress), and larger values will result in fewer
335 | /// commits but greater memory usage. The best value will depend
336 | /// on your specific use case.
337 | /// num_threads (int, optional): The number of threads that the writer
338 | /// should use. If this value is 0, tantivy will automatically
339 | /// choose the number of threads.
340 | ///
341 | /// Raises ValueError if there was an error while creating the writer.
342 | #[pyo3(signature = (heap_size = 128_000_000, num_threads = 0))]
343 | fn writer(
344 | &self,
345 | heap_size: usize,
346 | num_threads: usize,
347 | ) -> PyResult<IndexWriter> {
348 | let writer = match num_threads {
349 | 0 => self.index.writer(heap_size),
350 | _ => self.index.writer_with_num_threads(num_threads, heap_size),
351 | }
352 | .map_err(to_pyerr)?;
353 | let schema = self.index.schema();
354 | Ok(IndexWriter {
355 | inner_index_writer: Some(writer),
356 | schema,
357 | })
358 | }
359 |
360 | /// Configure the index reader.
361 | ///
362 | /// Args:
363 | /// reload_policy (str, optional): The reload policy that the
364 | /// IndexReader should use. Can be `Manual` or `OnCommit`.
365 | /// num_warmers (int, optional): The number of searchers that the
366 | /// reader should create.
367 | #[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
368 | fn config_reader(
369 | &mut self,
370 | reload_policy: &str,
371 | num_warmers: usize,
372 | ) -> Result<(), PyErr> {
373 | let reload_policy = reload_policy.to_lowercase();
374 | let reload_policy = match reload_policy.as_ref() {
375 | "commit" => tv::ReloadPolicy::OnCommitWithDelay,
376 | "on-commit" => tv::ReloadPolicy::OnCommitWithDelay,
377 | "oncommit" => tv::ReloadPolicy::OnCommitWithDelay,
378 | "manual" => tv::ReloadPolicy::Manual,
379 | _ => return Err(exceptions::PyValueError::new_err(
380 | "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
381 | ))
382 | };
383 | let builder = self.index.reader_builder();
384 | let builder = builder.reload_policy(reload_policy);
385 | let builder = if num_warmers > 0 {
386 | builder.num_warming_threads(num_warmers)
387 | } else {
388 | builder
389 | };
390 |
391 | self.reader = builder.try_into().map_err(to_pyerr)?;
392 | Ok(())
393 | }
394 |
395 | /// Returns a searcher
396 | ///
397 | /// This method should be called every single time a search query is performed.
398 | /// The same searcher must be used for a given query, as it ensures the use of a consistent segment set.
399 | fn searcher(&self) -> Searcher {
400 | Searcher {
401 | inner: self.reader.searcher(),
402 | }
403 | }
404 |
405 | /// Check if the given path contains an existing index.
406 | /// Args:
407 | /// path: The path where tantivy will search for an index.
408 | ///
409 | /// Returns True if an index exists at the given path, False otherwise.
410 | ///
411 | /// Raises OSError if the directory cannot be opened.
412 | #[staticmethod]
413 | fn exists(path: &str) -> PyResult<bool> {
414 | let directory = MmapDirectory::open(path).map_err(to_pyerr)?;
415 | tv::Index::exists(&directory).map_err(to_pyerr)
416 | }
417 |
418 | /// The schema of the current index.
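/// Example (a minimal sketch):
///
/// ```python
/// schema = index.schema  # accessed as a property, not called
/// ```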
419 | #[getter]
420 | fn schema(&self) -> Schema {
421 | let schema = self.index.schema();
422 | Schema { inner: schema }
423 | }
424 |
425 | /// Update searchers so that they reflect the state of the last .commit().
426 | ///
427 | /// If you set up the reload policy to be on 'commit' (which is the
428 | /// default) every commit should be rapidly reflected on your IndexReader
429 | /// and you should not need to call reload() at all.
430 | fn reload(&self) -> PyResult<()> {
431 | self.reader.reload().map_err(to_pyerr)
432 | }
433 |
434 | /// Parse a query
435 | ///
436 | /// Args:
437 | /// query: the query, following the tantivy query language.
438 | ///
439 | /// default_field_names (List[str]): A list of field names used to search if no
440 | /// field is specified in the query.
441 | ///
442 | /// field_boosts: A dictionary keyed on field names which provides default boosts
443 | /// for the query constructed by this method.
444 | ///
445 | /// fuzzy_fields: A dictionary keyed on field names which provides (prefix, distance, transpose_cost_one)
446 | /// triples making queries constructed by this method fuzzy against the given fields
447 | /// and using the given parameters.
448 | /// `prefix` determines if terms which are prefixes of the given term match the query.
449 | /// `distance` determines the maximum Levenshtein distance between terms matching the query and the given term.
450 | /// `transpose_cost_one` determines if transpositions of neighbouring characters are counted only once against the Levenshtein distance.
451 | #[pyo3(signature = (query, default_field_names = None, field_boosts = HashMap::new(), fuzzy_fields = HashMap::new()))]
452 | pub fn parse_query(
453 | &self,
454 | query: &str,
455 | default_field_names: Option<Vec<String>>,
456 | field_boosts: HashMap<String, f32>,
457 | fuzzy_fields: HashMap<String, (bool, u8, bool)>,
458 | ) -> PyResult<Query> {
459 | let parser = self.prepare_query_parser(
460 | default_field_names,
461 | field_boosts,
462 | fuzzy_fields,
463 | )?;
464 |
465 | let query = parser.parse_query(query).map_err(to_pyerr)?;
466 |
467 | Ok(Query { inner: query })
468 | }
469 |
470 | /// Parse a query leniently.
471 | ///
472 | /// This variant parses invalid queries on a best-effort basis. If some part of the query can't
473 | /// reasonably be executed (a range query without a field, searching on a non-existing field,
474 | /// searching without specifying a field when no default field is provided...), it may get turned
475 | /// into a "match-nothing" subquery.
476 | ///
477 | /// Args:
478 | /// query: the query, following the tantivy query language.
479 | ///
480 | /// default_field_names (List[str]): A list of field names used to search if no
481 | /// field is specified in the query.
482 | ///
483 | /// field_boosts: A dictionary keyed on field names which provides default boosts
484 | /// for the query constructed by this method.
485 | ///
486 | /// fuzzy_fields: A dictionary keyed on field names which provides (prefix, distance, transpose_cost_one)
487 | /// triples making queries constructed by this method fuzzy against the given fields
488 | /// and using the given parameters.
489 | /// `prefix` determines if terms which are prefixes of the given term match the query.
490 | /// `distance` determines the maximum Levenshtein distance between terms matching the query and the given term.
491 | /// `transpose_cost_one` determines if transpositions of neighbouring characters are counted only once against the Levenshtein distance.
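/// Example (a minimal sketch; assumes an index whose schema has an
/// indexed text field named "title"):
///
/// ```python
/// query, errors = index.parse_query_lenient(
///     "title:hllo",
///     fuzzy_fields={"title": (False, 1, True)},
/// )
/// # With an edit distance of 1, "hllo" can still match "hello".
/// ```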
492 | ///
493 | /// Returns a tuple containing the parsed query and a list of errors.
494 | ///
495 | /// Raises ValueError if a field in `default_field_names` is not defined or marked as indexed.
496 | #[pyo3(signature = (query, default_field_names = None, field_boosts = HashMap::new(), fuzzy_fields = HashMap::new()))]
497 | pub fn parse_query_lenient(
498 | &self,
499 | query: &str,
500 | default_field_names: Option<Vec<String>>,
501 | field_boosts: HashMap<String, f32>,
502 | fuzzy_fields: HashMap<String, (bool, u8, bool)>,
503 | py: Python,
504 | ) -> PyResult<(Query, Vec<PyObject>)> {
505 | let parser = self.prepare_query_parser(
506 | default_field_names,
507 | field_boosts,
508 | fuzzy_fields,
509 | )?;
510 |
511 | let (query, errors) = parser.parse_query_lenient(query);
512 | let errors = errors
513 | .into_iter()
514 | .map(|err| err.into_py(py))
515 | // This is a rust idiom, but just in case you're not familiar
516 | // with it, we're converting from an iterator of PyResult<PyObject>
517 | // into a PyResult<Vec<PyObject>>, by specifying the `PyResult`
518 | // on the outside of the turbofish type signature.
519 | .collect::<PyResult<Vec<_>>>()?;
520 |
521 | Ok((Query { inner: query }, errors))
522 | }
523 |
524 | /// Register a custom text analyzer by name. (Confusingly,
525 | /// this is one of the places where Tantivy uses 'tokenizer' to refer to a
526 | /// TextAnalyzer instance.)
527 | ///
528 | // Implementation notes: Skipped indirection of TokenizerManager.
529 | pub fn register_tokenizer(&self, name: &str, analyzer: PyTextAnalyzer) {
530 | self.index.tokenizers().register(name, analyzer.analyzer);
531 | }
532 | }
533 |
534 | impl Index {
535 | fn prepare_query_parser(
536 | &self,
537 | default_field_names: Option<Vec<String>>,
538 | field_boosts: HashMap<String, f32>,
539 | fuzzy_fields: HashMap<String, (bool, u8, bool)>,
540 | ) -> PyResult<tv::query::QueryParser> {
541 | let schema = self.index.schema();
542 |
543 | let default_fields = if let Some(default_field_names) =
544 | default_field_names
545 | {
546 | default_field_names.iter().map(|field_name| {
547 | let field = schema.get_field(field_name).map_err(|_err| {
548 | exceptions::PyValueError::new_err(format!(
549 | "Field `{field_name}` is not defined in the schema."
550 | ))
551 | })?;
552 |
553 | let field_entry = schema.get_field_entry(field);
554 | if !field_entry.is_indexed() {
555 | return Err(exceptions::PyValueError::new_err(
556 | format!("Field `{field_name}` is not set as indexed in the schema.")
557 | ));
558 | }
559 |
560 | Ok(field)
561 | }).collect::<PyResult<Vec<_>>>()?
562 | } else {
563 | schema
564 | .fields()
565 | .filter(|(_, field_entry)| field_entry.is_indexed())
566 | .map(|(field, _)| field)
567 | .collect()
568 | };
569 |
570 | let mut parser =
571 | tv::query::QueryParser::for_index(&self.index, default_fields);
572 |
573 | for (field_name, boost) in field_boosts {
574 | let field = schema.get_field(&field_name).map_err(|_err| {
575 | exceptions::PyValueError::new_err(format!(
576 | "Field `{field_name}` is not defined in the schema."
577 | ))
578 | })?;
579 | parser.set_field_boost(field, boost);
580 | }
581 |
582 | for (field_name, (prefix, distance, transpose_cost_one)) in fuzzy_fields
583 | {
584 | let field = schema.get_field(&field_name).map_err(|_err| {
585 | exceptions::PyValueError::new_err(format!(
586 | "Field `{field_name}` is not defined in the schema."
587 | ))
588 | })?;
589 | parser.set_field_fuzzy(field, prefix, distance, transpose_cost_one);
590 | }
591 |
592 | Ok(parser)
593 | }
594 |
595 | fn register_custom_text_analyzers(index: &tv::Index) {
596 | let analyzers = [
597 | ("ar_stem", Language::Arabic),
598 | ("da_stem", Language::Danish),
599 | ("nl_stem", Language::Dutch),
600 | ("fi_stem", Language::Finnish),
601 | ("fr_stem", Language::French),
602 | ("de_stem", Language::German),
603 | ("el_stem", Language::Greek),
604 | ("hu_stem", Language::Hungarian),
605 | ("it_stem", Language::Italian),
606 | ("no_stem", Language::Norwegian),
607 | ("pt_stem", Language::Portuguese),
608 | ("ro_stem", Language::Romanian),
609 | ("ru_stem", Language::Russian),
610 | ("es_stem", Language::Spanish),
611 | ("sv_stem", Language::Swedish),
612 | ("ta_stem", Language::Tamil),
613 | ("tr_stem", Language::Turkish),
614 | ];
615 |
616 | for (name, lang) in &analyzers {
617 | let an = TextAnalyzer::builder(SimpleTokenizer::default())
618 | .filter(RemoveLongFilter::limit(40))
619 | .filter(LowerCaser)
620 | .filter(Stemmer::new(*lang))
621 | .build();
622 | index.tokenizers().register(name, an);
623 | }
624 | }
625 | }
626 |
-------------------------------------------------------------------------------- /src/lib.rs: --------------------------------------------------------------------------------
1 | use ::tantivy as tv;
2 | use ::tantivy::schema::{OwnedValue as Value, Term};
3 | use pyo3::{exceptions, prelude::*, wrap_pymodule};
4 |
5 | mod document;
6 | mod facet;
7 | mod index;
8 | mod parser_error;
9 | mod query;
10 | mod schema;
11 | mod schemabuilder;
12 | mod searcher;
13 | mod snippet;
14 | mod tokenizer;
15 |
16 | use document::{extract_value, extract_value_for_type, Document};
17 | use facet::Facet;
18 | use index::Index;
19 | use query::{Occur, Query};
20 | use schema::{FieldType, Schema};
21 | use schemabuilder::SchemaBuilder;
22 | use searcher::{DocAddress, Order, SearchResult, Searcher};
23 | use snippet::{Snippet, SnippetGenerator};
24 | use tokenizer::{Filter, TextAnalyzer, TextAnalyzerBuilder, Tokenizer};
25 |
26 | /// Python bindings for the search engine library Tantivy.
27 | ///
28 | /// Tantivy is a full text search engine library written in rust.
29 | ///
30 | /// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in
31 | /// the sense that it is not an off-the-shelf search engine server, but rather
32 | /// a library that can be used to build such a search engine.
33 | /// Tantivy is, in fact, strongly inspired by Lucene's design.
34 | ///
35 | /// Example:
36 | /// >>> import json
37 | /// >>> import tantivy
38 | ///
39 | /// >>> builder = tantivy.SchemaBuilder()
40 | ///
41 | /// >>> title = builder.add_text_field("title", stored=True)
42 | /// >>> body = builder.add_text_field("body")
43 | ///
44 | /// >>> schema = builder.build()
45 | /// >>> index = tantivy.Index(schema)
46 | /// >>> doc = tantivy.Document()
47 | /// >>> doc.add_text(title, "The Old Man and the Sea")
48 | /// >>> doc.add_text(body, ("He was an old man who fished alone in a "
49 | /// "skiff in the Gulf Stream and he had gone "
50 | /// "eighty-four days now without taking a fish."))
51 | /// >>> writer = index.writer()
52 | /// >>> writer.add_document(doc)
53 | ///
54 | /// >>> doc = schema.parse_document(json.dumps({
55 | /// "title": ["Frankenstein", "The Modern Prometheus"],
56 | /// "body": ("You will rejoice to hear that no disaster has "
57 | /// "accompanied the commencement of an enterprise which "
58 | /// "you have regarded with such evil forebodings. 
" 59 | /// "I arrived here yesterday, and my first task is to " 60 | /// "assure my dear sister of my welfare and increasing " 61 | /// "confidence in the success of my undertaking.") 62 | /// })) 63 | /// 64 | /// >>> writer.add_document(doc) 65 | /// >>> writer.commit() 66 | /// 67 | /// >>> reader = index.reader() 68 | /// >>> searcher = reader.searcher() 69 | /// 70 | /// >>> query = index.parse_query("sea whale", [title, body]) 71 | /// 72 | /// >>> result = searcher.search(query, 10) 73 | /// 74 | /// >>> assert len(result) == 1 75 | /// 76 | #[pymodule] 77 | fn tantivy(_py: Python, m: &Bound) -> PyResult<()> { 78 | m.add_class::()?; 79 | m.add_class::()?; 80 | m.add_class::()?; 81 | m.add_class::()?; 82 | m.add_class::()?; 83 | m.add_class::()?; 84 | m.add_class::()?; 85 | m.add_class::()?; 86 | m.add_class::()?; 87 | m.add_class::()?; 88 | m.add_class::()?; 89 | m.add_class::()?; 90 | m.add_class::()?; 91 | m.add_class::()?; 92 | m.add_class::()?; 93 | m.add_class::()?; 94 | m.add_class::()?; 95 | m.add_class::()?; 96 | 97 | m.add_wrapped(wrap_pymodule!(query_parser_error))?; 98 | 99 | m.add("__version__", tv::version_string())?; 100 | 101 | Ok(()) 102 | } 103 | 104 | /// Submodule containing all the possible errors that can be raised during 105 | /// query parsing. 106 | /// 107 | /// Example: 108 | /// >>> import tantivy 109 | /// >>> from tantivy import query_parser_error 110 | /// 111 | /// >>> builder = tantivy.SchemaBuilder() 112 | /// 113 | /// >>> title = builder.add_text_field("title", stored=True) 114 | /// >>> body = builder.add_text_field("body") 115 | /// >>> id = builder.add_unsigned_field("id") 116 | /// >>> rating = builder.add_float_field("rating") 117 | /// 118 | /// >>> schema = builder.build() 119 | /// >>> index = tantivy.Index(schema) 120 | /// 121 | /// >>> query, errors = index.parse_query_lenient( 122 | /// "bod:'world' AND id:<3.5 AND rating:5.0" 123 | /// ) 124 | /// 125 | /// >>> assert len(errors) == 2 126 | /// >>> assert isinstance(errors[0], query_parser_error.FieldDoesNotExistError) 127 | /// >>> assert isinstance(errors[1], query_parser_error.ExpectedIntError) 128 | #[pymodule] 129 | fn query_parser_error(_py: Python, m: &Bound) -> PyResult<()> { 130 | m.add_class::()?; 131 | m.add_class::()?; 132 | m.add_class::()?; 133 | m.add_class::()?; 134 | m.add_class::()?; 135 | m.add_class::()?; 136 | m.add_class::()?; 137 | m.add_class::()?; 138 | m.add_class::()?; 139 | m.add_class::()?; 140 | m.add_class::()?; 141 | m.add_class::()?; 142 | m.add_class::()?; 143 | m.add_class::()?; 144 | m.add_class::()?; 145 | m.add_class::()?; 146 | m.add_class::()?; 147 | 148 | Ok(()) 149 | } 150 | 151 | pub(crate) fn to_pyerr(err: E) -> PyErr { 152 | exceptions::PyValueError::new_err(err.to_string()) 153 | } 154 | 155 | pub(crate) fn get_field( 156 | schema: &tv::schema::Schema, 157 | field_name: &str, 158 | ) -> PyResult { 159 | let field = schema.get_field(field_name).map_err(|_err| { 160 | exceptions::PyValueError::new_err(format!( 161 | "Field `{field_name}` is not defined in the schema." 
162 | )) 163 | })?; 164 | 165 | Ok(field) 166 | } 167 | 168 | pub(crate) fn make_term( 169 | schema: &tv::schema::Schema, 170 | field_name: &str, 171 | field_value: &Bound, 172 | ) -> PyResult { 173 | let field = get_field(schema, field_name)?; 174 | let value = extract_value(field_value)?; 175 | let term = match value { 176 | Value::Str(text) => Term::from_field_text(field, &text), 177 | Value::U64(num) => Term::from_field_u64(field, num), 178 | Value::I64(num) => Term::from_field_i64(field, num), 179 | Value::F64(num) => Term::from_field_f64(field, num), 180 | Value::Date(d) => Term::from_field_date(field, d), 181 | Value::Facet(facet) => Term::from_facet(field, &facet), 182 | Value::Bool(b) => Term::from_field_bool(field, b), 183 | Value::IpAddr(i) => Term::from_field_ip_addr(field, i), 184 | _ => { 185 | return Err(exceptions::PyValueError::new_err(format!( 186 | "Can't create a term for Field `{field_name}` with value `{field_value}`." 187 | ))) 188 | } 189 | }; 190 | 191 | Ok(term) 192 | } 193 | 194 | pub(crate) fn make_term_for_type( 195 | schema: &tv::schema::Schema, 196 | field_name: &str, 197 | field_type: FieldType, 198 | field_value: &Bound, 199 | ) -> PyResult { 200 | let field = get_field(schema, field_name)?; 201 | let value = 202 | extract_value_for_type(field_value, field_type.into(), field_name)?; 203 | let term = match value { 204 | Value::Str(text) => Term::from_field_text(field, &text), 205 | Value::U64(num) => Term::from_field_u64(field, num), 206 | Value::I64(num) => Term::from_field_i64(field, num), 207 | Value::F64(num) => Term::from_field_f64(field, num), 208 | Value::Date(d) => Term::from_field_date(field, d), 209 | Value::Facet(facet) => Term::from_facet(field, &facet), 210 | Value::Bool(b) => Term::from_field_bool(field, b), 211 | Value::IpAddr(i) => Term::from_field_ip_addr(field, i), 212 | _ => { 213 | return Err(exceptions::PyValueError::new_err(format!( 214 | "Can't create a term for Field `{field_name}` with value `{field_value}`." 215 | ))) 216 | } 217 | }; 218 | 219 | Ok(term) 220 | } 221 | -------------------------------------------------------------------------------- /src/parser_error.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | convert::TryFrom, 3 | net::AddrParseError, 4 | num::{IntErrorKind, ParseFloatError, ParseIntError}, 5 | str::ParseBoolError, 6 | }; 7 | 8 | use pyo3::prelude::*; 9 | use pyo3::IntoPyObjectExt; 10 | use tantivy::{self as tv, schema::FacetParseError}; 11 | 12 | // TODO(https://github.com/PyO3/pyo3/issues/1190): Expose this to bindings once trait <-> ABC is 13 | // supported in PyO3. 14 | pub(crate) trait QueryParserError { 15 | fn full_message(&self) -> String; 16 | } 17 | 18 | /// A crate local version of the [`IntoPy`] trait to implement for 19 | /// [`QueryParserError`](tv::query::QueryParserError). 
20 | pub(crate) trait QueryParserErrorIntoPy { 21 | fn into_py(self, py: Python) -> PyResult; 22 | } 23 | 24 | impl QueryParserErrorIntoPy for tv::query::QueryParserError { 25 | fn into_py(self, py: Python) -> PyResult { 26 | match self { 27 | tv::query::QueryParserError::SyntaxError(message) => { 28 | SyntaxError { message }.into_py_any(py) 29 | } 30 | tv::query::QueryParserError::UnsupportedQuery(message) => { 31 | UnsupportedQueryError { message }.into_py_any(py) 32 | } 33 | tv::query::QueryParserError::FieldDoesNotExist(field) => { 34 | FieldDoesNotExistError { field }.into_py_any(py) 35 | } 36 | tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( 37 | field, 38 | ) => FieldDoesNotHavePositionsIndexedError { field }.into_py_any(py), 39 | tv::query::QueryParserError::ExpectedInt(parse_int_error) => { 40 | ExpectedIntError { parse_int_error }.into_py_any(py) 41 | } 42 | tv::query::QueryParserError::ExpectedFloat(parse_float_error) => { 43 | ExpectedFloatError { parse_float_error }.into_py_any(py) 44 | } 45 | tv::query::QueryParserError::ExpectedBool(parse_bool_error) => { 46 | ExpectedBoolError { parse_bool_error }.into_py_any(py) 47 | } 48 | tv::query::QueryParserError::ExpectedBase64(decode_error) => { 49 | ExpectedBase64Error { decode_error }.into_py_any(py) 50 | } 51 | tv::query::QueryParserError::AllButQueryForbidden => { 52 | AllButQueryForbiddenError.into_py_any(py) 53 | } 54 | tv::query::QueryParserError::NoDefaultFieldDeclared => { 55 | NoDefaultFieldDeclaredError.into_py_any(py) 56 | } 57 | tv::query::QueryParserError::FieldNotIndexed(field) => { 58 | FieldNotIndexedError { field }.into_py_any(py) 59 | } 60 | tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { 61 | phrase, 62 | tokenizer, 63 | } => { 64 | PhrasePrefixRequiresAtLeastTwoTermsError { phrase, tokenizer }.into_py_any(py) 65 | } 66 | tv::query::QueryParserError::UnknownTokenizer { tokenizer, field } => { 67 | UnknownTokenizerError { tokenizer, field }.into_py_any(py) 68 | } 69 | tv::query::QueryParserError::RangeMustNotHavePhrase => { 70 | RangeMustNotHavePhraseError.into_py_any(py) 71 | } 72 | tv::query::QueryParserError::DateFormatError(_) => { 73 | DateFormatError { inner: self }.into_py_any(py) 74 | } 75 | tv::query::QueryParserError::FacetFormatError(facet_parse_error) => { 76 | FacetFormatError { facet_parse_error }.into_py_any(py) 77 | } 78 | tv::query::QueryParserError::IpFormatError(addr_parse_error) => { 79 | IpFormatError { addr_parse_error }.into_py_any(py) 80 | } 81 | } 82 | } 83 | } 84 | 85 | /// Error in the query syntax. 
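/// Example (a minimal sketch; `index` stands for an existing Index and
/// `query_parser_error` for the submodule of the same name):
///
/// ```python
/// query, errors = index.parse_query_lenient('title:"unclosed phrase')
/// # A malformed query string such as an unterminated quote is typically
/// # reported as a SyntaxError in the returned errors list.
/// ```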
86 | #[pyclass(frozen, module = "tantivy.tantivy")] 87 | pub(crate) struct SyntaxError { 88 | message: String, 89 | } 90 | 91 | #[pymethods] 92 | impl SyntaxError { 93 | #[getter] 94 | fn inner_message(&self) -> &str { 95 | self.message.as_str() 96 | } 97 | 98 | fn __repr__(&self) -> String { 99 | self.full_message() 100 | } 101 | 102 | fn __str__(&self) -> String { 103 | self.full_message() 104 | } 105 | } 106 | 107 | impl QueryParserError for SyntaxError { 108 | fn full_message(&self) -> String { 109 | format!("Syntax Error: {0}", self.message) 110 | } 111 | } 112 | 113 | impl From for tv::query::QueryParserError { 114 | fn from(error: SyntaxError) -> Self { 115 | tv::query::QueryParserError::SyntaxError(error.message) 116 | } 117 | } 118 | 119 | impl TryFrom for SyntaxError { 120 | type Error = String; 121 | 122 | fn try_from( 123 | error: tv::query::QueryParserError, 124 | ) -> Result { 125 | match error { 126 | tv::query::QueryParserError::SyntaxError(message) => { 127 | Ok(Self { message }) 128 | } 129 | _ => Err(format!("{error} is not a SyntaxError")), 130 | } 131 | } 132 | } 133 | 134 | /// This query is unsupported. 135 | #[pyclass(frozen, module = "tantivy.tantivy")] 136 | pub(crate) struct UnsupportedQueryError { 137 | message: String, 138 | } 139 | 140 | #[pymethods] 141 | impl UnsupportedQueryError { 142 | #[getter] 143 | fn inner_message(&self) -> &str { 144 | self.message.as_str() 145 | } 146 | 147 | fn __repr__(&self) -> String { 148 | self.full_message() 149 | } 150 | 151 | fn __str__(&self) -> String { 152 | self.full_message() 153 | } 154 | } 155 | 156 | impl QueryParserError for UnsupportedQueryError { 157 | fn full_message(&self) -> String { 158 | format!("Unsupported query: {0}", self.message) 159 | } 160 | } 161 | 162 | impl From for tv::query::QueryParserError { 163 | fn from(error: UnsupportedQueryError) -> Self { 164 | tv::query::QueryParserError::SyntaxError(error.message) 165 | } 166 | } 167 | 168 | impl TryFrom for UnsupportedQueryError { 169 | type Error = String; 170 | 171 | fn try_from( 172 | error: tv::query::QueryParserError, 173 | ) -> Result { 174 | match error { 175 | tv::query::QueryParserError::UnsupportedQuery(message) => { 176 | Ok(Self { message }) 177 | } 178 | _ => Err(format!("{error} is not an UnsupportedQuery error")), 179 | } 180 | } 181 | } 182 | 183 | /// The query references a field that is not in the schema. 184 | #[pyclass(frozen, module = "tantivy.tantivy")] 185 | pub struct FieldDoesNotExistError { 186 | field: String, 187 | } 188 | 189 | #[pymethods] 190 | impl FieldDoesNotExistError { 191 | /// The name of the field causing the error. 
192 | #[getter] 193 | fn field(&self) -> &str { 194 | self.field.as_str() 195 | } 196 | 197 | fn __repr__(&self) -> String { 198 | self.full_message() 199 | } 200 | 201 | fn __str__(&self) -> String { 202 | self.full_message() 203 | } 204 | } 205 | 206 | impl QueryParserError for FieldDoesNotExistError { 207 | fn full_message(&self) -> String { 208 | format!("Field does not exist: '{0}'", self.field) 209 | } 210 | } 211 | 212 | impl From for tv::query::QueryParserError { 213 | fn from(error: FieldDoesNotExistError) -> Self { 214 | tv::query::QueryParserError::FieldDoesNotExist(error.field) 215 | } 216 | } 217 | 218 | impl TryFrom for FieldDoesNotExistError { 219 | type Error = String; 220 | 221 | fn try_from( 222 | error: tv::query::QueryParserError, 223 | ) -> Result { 224 | match error { 225 | tv::query::QueryParserError::FieldDoesNotExist(field) => { 226 | Ok(Self { field }) 227 | } 228 | _ => Err(format!("{error} is not a FieldDoesNotExist error")), 229 | } 230 | } 231 | } 232 | 233 | /// The query contains a term for a `u64` or `i64`-field, but the value is neither. 234 | #[pyclass(frozen, module = "tantivy.tantivy")] 235 | pub(crate) struct ExpectedIntError { 236 | parse_int_error: ParseIntError, 237 | } 238 | 239 | #[pymethods] 240 | impl ExpectedIntError { 241 | /// If `true`, the value being parsed was empty. 242 | fn caused_by_empty(&self) -> bool { 243 | self.parse_int_error.kind() == &IntErrorKind::Empty 244 | } 245 | 246 | /// If `true`, an invalid digit was found. 247 | fn caused_by_invalid_digit(&self) -> bool { 248 | self.parse_int_error.kind() == &IntErrorKind::InvalidDigit 249 | } 250 | 251 | /// If `true`, the value being parsed was too large. 252 | fn caused_by_pos_overflow(&self) -> bool { 253 | self.parse_int_error.kind() == &IntErrorKind::PosOverflow 254 | } 255 | 256 | /// If `true`, the value being parsed was too small. 257 | fn caused_by_neg_overflow(&self) -> bool { 258 | self.parse_int_error.kind() == &IntErrorKind::NegOverflow 259 | } 260 | 261 | fn __repr__(&self) -> String { 262 | self.full_message() 263 | } 264 | 265 | fn __str__(&self) -> String { 266 | self.full_message() 267 | } 268 | } 269 | 270 | impl QueryParserError for ExpectedIntError { 271 | fn full_message(&self) -> String { 272 | format!("Expected a valid integer: '{0:?}'", self.parse_int_error) 273 | } 274 | } 275 | 276 | impl From for tv::query::QueryParserError { 277 | fn from(error: ExpectedIntError) -> Self { 278 | tv::query::QueryParserError::ExpectedInt(error.parse_int_error) 279 | } 280 | } 281 | 282 | impl TryFrom for ExpectedIntError { 283 | type Error = String; 284 | 285 | fn try_from( 286 | error: tv::query::QueryParserError, 287 | ) -> Result { 288 | match error { 289 | tv::query::QueryParserError::ExpectedInt(parse_int_error) => { 290 | Ok(Self { parse_int_error }) 291 | } 292 | _ => Err(format!("{error} is not an ExpectedInt error")), 293 | } 294 | } 295 | } 296 | 297 | /// The query contains a term for a bytes field, but the value is not valid base64. 298 | #[pyclass(frozen, module = "tantivy.tantivy")] 299 | pub(crate) struct ExpectedBase64Error { 300 | decode_error: base64::DecodeError, 301 | } 302 | 303 | #[pymethods] 304 | impl ExpectedBase64Error { 305 | /// If `true`, an invalid byte was found in the query. Padding characters (`=`) interspersed in 306 | /// the encoded form will be treated as invalid bytes. 307 | fn caused_by_invalid_byte(&self) -> bool { 308 | matches!(self.decode_error, base64::DecodeError::InvalidByte { .. 
}) 309 | } 310 | 311 | /// If the error was caused by an invalid byte, returns the offset and offending byte. 312 | fn invalid_byte_info(&self) -> Option<(usize, u8)> { 313 | match self.decode_error { 314 | base64::DecodeError::InvalidByte(position, byte) => { 315 | Some((position, byte)) 316 | } 317 | _ => None, 318 | } 319 | } 320 | 321 | /// If `true`, the length of the base64 string was invalid. 322 | fn caused_by_invalid_length(&self) -> bool { 323 | matches!(self.decode_error, base64::DecodeError::InvalidLength(_)) 324 | } 325 | 326 | /// The last non-padding input symbol's encoded 6 bits have nonzero bits that will be discarded. 327 | /// If `true`, this is indicative of corrupted or truncated Base64. 328 | fn caused_by_invalid_last_symbol(&self) -> bool { 329 | matches!( 330 | self.decode_error, 331 | base64::DecodeError::InvalidLastSymbol { .. } 332 | ) 333 | } 334 | 335 | /// If the error was caused by an invalid last symbol, returns the offset and offending byte. 336 | fn invalid_last_symbol_info(&self) -> Option<(usize, u8)> { 337 | match self.decode_error { 338 | base64::DecodeError::InvalidLastSymbol(position, byte) => { 339 | Some((position, byte)) 340 | } 341 | _ => None, 342 | } 343 | } 344 | 345 | /// The nature of the padding was not as configured: absent or incorrect when it must be 346 | /// canonical, or present when it must be absent, etc. 347 | fn caused_by_invalid_padding(&self) -> bool { 348 | matches!(self.decode_error, base64::DecodeError::InvalidPadding) 349 | } 350 | 351 | fn __repr__(&self) -> String { 352 | self.full_message() 353 | } 354 | 355 | fn __str__(&self) -> String { 356 | self.full_message() 357 | } 358 | } 359 | 360 | impl QueryParserError for ExpectedBase64Error { 361 | fn full_message(&self) -> String { 362 | format!("Expected base64: {0:?}", self.decode_error) 363 | } 364 | } 365 | 366 | impl From for tv::query::QueryParserError { 367 | fn from(error: ExpectedBase64Error) -> Self { 368 | tv::query::QueryParserError::ExpectedBase64(error.decode_error) 369 | } 370 | } 371 | 372 | impl TryFrom for ExpectedBase64Error { 373 | type Error = String; 374 | 375 | fn try_from( 376 | error: tv::query::QueryParserError, 377 | ) -> Result { 378 | match error { 379 | tv::query::QueryParserError::ExpectedBase64(decode_error) => { 380 | Ok(Self { decode_error }) 381 | } 382 | _ => Err(format!("{error} is not an ExpectedBase64 error")), 383 | } 384 | } 385 | } 386 | 387 | /// The query contains a term for a `f64`-field, but the value is not a f64. 
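/// Example (a minimal sketch; assumes a schema with a float field named
/// "rating", as in the module-level example):
///
/// ```python
/// query, errors = index.parse_query_lenient("rating:high")
/// # "high" cannot be parsed as a float, so the errors list should
/// # contain an ExpectedFloatError.
/// ```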
388 | #[pyclass(frozen, module = "tantivy.tantivy")]
389 | pub(crate) struct ExpectedFloatError {
390 | parse_float_error: ParseFloatError,
391 | }
392 |
393 | #[pymethods]
394 | impl ExpectedFloatError {
395 | fn __repr__(&self) -> String {
396 | self.full_message()
397 | }
398 |
399 | fn __str__(&self) -> String {
400 | self.full_message()
401 | }
402 | }
403 |
404 | impl QueryParserError for ExpectedFloatError {
405 | fn full_message(&self) -> String {
406 | format!("Expected a float value: '{0:?}'", self.parse_float_error)
407 | }
408 | }
409 |
410 | impl From<ExpectedFloatError> for tv::query::QueryParserError {
411 | fn from(error: ExpectedFloatError) -> Self {
412 | tv::query::QueryParserError::ExpectedFloat(error.parse_float_error)
413 | }
414 | }
415 |
416 | impl TryFrom<tv::query::QueryParserError> for ExpectedFloatError {
417 | type Error = String;
418 |
419 | fn try_from(
420 | error: tv::query::QueryParserError,
421 | ) -> Result<Self, Self::Error> {
422 | match error {
423 | tv::query::QueryParserError::ExpectedFloat(parse_float_error) => {
424 | Ok(Self { parse_float_error })
425 | }
426 | _ => Err(format!("{error} is not an ExpectedFloat error")),
427 | }
428 | }
429 | }
430 |
431 | /// The query contains a term for a `bool`-field, but the value is not a bool.
432 | #[pyclass(frozen, module = "tantivy.tantivy")]
433 | pub(crate) struct ExpectedBoolError {
434 | parse_bool_error: ParseBoolError,
435 | }
436 |
437 | #[pymethods]
438 | impl ExpectedBoolError {
439 | fn __repr__(&self) -> String {
440 | self.full_message()
441 | }
442 |
443 | fn __str__(&self) -> String {
444 | self.full_message()
445 | }
446 | }
447 |
448 | impl QueryParserError for ExpectedBoolError {
449 | fn full_message(&self) -> String {
450 | format!("Expected a bool value: '{0:?}'", self.parse_bool_error)
451 | }
452 | }
453 |
454 | impl From<ExpectedBoolError> for tv::query::QueryParserError {
455 | fn from(error: ExpectedBoolError) -> Self {
456 | tv::query::QueryParserError::ExpectedBool(error.parse_bool_error)
457 | }
458 | }
459 |
460 | impl TryFrom<tv::query::QueryParserError> for ExpectedBoolError {
461 | type Error = String;
462 |
463 | fn try_from(
464 | error: tv::query::QueryParserError,
465 | ) -> Result<Self, Self::Error> {
466 | match error {
467 | tv::query::QueryParserError::ExpectedBool(parse_bool_error) => {
468 | Ok(Self { parse_bool_error })
469 | }
470 | _ => Err(format!("{error} is not an ExpectedBool error")),
471 | }
472 | }
473 | }
474 |
475 | /// Queries that are only "excluding" (e.g. -title:pop) are forbidden.
476 | #[pyclass(frozen, module = "tantivy.tantivy")]
477 | pub(crate) struct AllButQueryForbiddenError;
478 |
479 | #[pymethods]
480 | impl AllButQueryForbiddenError {
481 | fn __repr__(&self) -> String {
482 | self.full_message()
483 | }
484 |
485 | fn __str__(&self) -> String {
486 | self.full_message()
487 | }
488 | }
489 |
490 | impl QueryParserError for AllButQueryForbiddenError {
491 | fn full_message(&self) -> String {
492 | "Invalid query: Only excluding terms given".to_string()
493 | }
494 | }
495 |
496 | impl From<AllButQueryForbiddenError> for tv::query::QueryParserError {
497 | fn from(_error: AllButQueryForbiddenError) -> Self {
498 | tv::query::QueryParserError::AllButQueryForbidden
499 | }
500 | }
501 |
502 | impl TryFrom<tv::query::QueryParserError> for AllButQueryForbiddenError {
503 | type Error = String;
504 |
505 | fn try_from(
506 | error: tv::query::QueryParserError,
507 | ) -> Result<Self, Self::Error> {
508 | match error {
509 | tv::query::QueryParserError::AllButQueryForbidden => Ok(Self {}),
510 | _ => Err(format!("{error} is not an AllButQueryForbidden error")),
511 | }
512 | }
513 | }
514 |
515 | /// If no default field is declared, running a query without any field specified is forbidden.
516 | #[pyclass(frozen, module = "tantivy.tantivy")]
517 | pub(crate) struct NoDefaultFieldDeclaredError;
518 |
519 | #[pymethods]
520 | impl NoDefaultFieldDeclaredError {
521 | fn __repr__(&self) -> String {
522 | self.full_message()
523 | }
524 |
525 | fn __str__(&self) -> String {
526 | self.full_message()
527 | }
528 | }
529 |
530 | impl QueryParserError for NoDefaultFieldDeclaredError {
531 | fn full_message(&self) -> String {
532 | "No default field declared and no field specified in query".to_string()
533 | }
534 | }
535 |
536 | impl From<NoDefaultFieldDeclaredError> for tv::query::QueryParserError {
537 | fn from(_error: NoDefaultFieldDeclaredError) -> Self {
538 | tv::query::QueryParserError::NoDefaultFieldDeclared
539 | }
540 | }
541 |
542 | impl TryFrom<tv::query::QueryParserError> for NoDefaultFieldDeclaredError {
543 | type Error = String;
544 |
545 | fn try_from(
546 | error: tv::query::QueryParserError,
547 | ) -> Result<Self, Self::Error> {
548 | match error {
549 | tv::query::QueryParserError::NoDefaultFieldDeclared => Ok(Self {}),
550 | _ => Err(format!("{error} is not a NoDefaultFieldDeclared error")),
551 | }
552 | }
553 | }
554 |
555 | /// The field searched for is not declared as indexed in the schema.
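/// Example (a minimal sketch; assumes an unsigned field "id" added to
/// the schema with stored=True but indexed=False):
///
/// ```python
/// query, errors = index.parse_query_lenient("id:5")
/// # Targeting a stored-but-unindexed field is typically reported as a
/// # FieldNotIndexedError instead of raising immediately.
/// ```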
556 | #[pyclass(frozen, module = "tantivy.tantivy")] 557 | pub(crate) struct FieldNotIndexedError { 558 | field: String, 559 | } 560 | 561 | #[pymethods] 562 | impl FieldNotIndexedError { 563 | fn field(&self) -> &str { 564 | self.field.as_str() 565 | } 566 | 567 | fn __repr__(&self) -> String { 568 | self.full_message() 569 | } 570 | 571 | fn __str__(&self) -> String { 572 | self.full_message() 573 | } 574 | } 575 | 576 | impl QueryParserError for FieldNotIndexedError { 577 | fn full_message(&self) -> String { 578 | format!("The field '{0}' is not declared as indexed", self.field) 579 | } 580 | } 581 | 582 | impl From for tv::query::QueryParserError { 583 | fn from(error: FieldNotIndexedError) -> Self { 584 | tv::query::QueryParserError::FieldNotIndexed(error.field) 585 | } 586 | } 587 | 588 | impl TryFrom for FieldNotIndexedError { 589 | type Error = String; 590 | 591 | fn try_from( 592 | error: tv::query::QueryParserError, 593 | ) -> Result { 594 | match error { 595 | tv::query::QueryParserError::FieldNotIndexed(field) => { 596 | Ok(Self { field }) 597 | } 598 | _ => Err(format!("{error} is not an FieldNotIndexed error")), 599 | } 600 | } 601 | } 602 | 603 | /// A phrase query was requested for a field that does not have any positions indexed. 604 | #[pyclass(frozen, module = "tantivy.tantivy")] 605 | pub(crate) struct FieldDoesNotHavePositionsIndexedError { 606 | field: String, 607 | } 608 | 609 | #[pymethods] 610 | impl FieldDoesNotHavePositionsIndexedError { 611 | fn field(&self) -> &str { 612 | self.field.as_str() 613 | } 614 | 615 | fn __repr__(&self) -> String { 616 | self.full_message() 617 | } 618 | 619 | fn __str__(&self) -> String { 620 | self.full_message() 621 | } 622 | } 623 | 624 | impl QueryParserError for FieldDoesNotHavePositionsIndexedError { 625 | fn full_message(&self) -> String { 626 | format!( 627 | "The field '{0}' does not have positions indexed", 628 | self.field 629 | ) 630 | } 631 | } 632 | 633 | impl From 634 | for tv::query::QueryParserError 635 | { 636 | fn from(error: FieldDoesNotHavePositionsIndexedError) -> Self { 637 | tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( 638 | error.field, 639 | ) 640 | } 641 | } 642 | 643 | impl TryFrom 644 | for FieldDoesNotHavePositionsIndexedError 645 | { 646 | type Error = String; 647 | 648 | fn try_from( 649 | error: tv::query::QueryParserError, 650 | ) -> Result { 651 | match error { 652 | tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( 653 | field, 654 | ) => Ok(Self { field }), 655 | _ => Err(format!( 656 | "{error} is not a FieldDoesNotHavePositionsIndexed error" 657 | )), 658 | } 659 | } 660 | } 661 | 662 | /// A phrase-prefix query requires at least two terms 663 | #[pyclass(frozen, module = "tantivy.tantivy")] 664 | pub(crate) struct PhrasePrefixRequiresAtLeastTwoTermsError { 665 | /// The phrase which triggered the issue. 666 | phrase: String, 667 | /// The tokenizer configured for the field. 
668 | tokenizer: String, 669 | } 670 | 671 | #[pymethods] 672 | impl PhrasePrefixRequiresAtLeastTwoTermsError { 673 | fn phrase(&self) -> &str { 674 | self.phrase.as_str() 675 | } 676 | 677 | fn tokenizer(&self) -> &str { 678 | self.tokenizer.as_str() 679 | } 680 | 681 | fn __repr__(&self) -> String { 682 | self.full_message() 683 | } 684 | 685 | fn __str__(&self) -> String { 686 | self.full_message() 687 | } 688 | } 689 | 690 | impl QueryParserError for PhrasePrefixRequiresAtLeastTwoTermsError { 691 | fn full_message(&self) -> String { 692 | format!( 693 | "The phrase '{0:?}' does not produce at least two terms using the tokenizer '{1:?}'", 694 | self.phrase, self.tokenizer 695 | ) 696 | } 697 | } 698 | 699 | impl From 700 | for tv::query::QueryParserError 701 | { 702 | fn from(error: PhrasePrefixRequiresAtLeastTwoTermsError) -> Self { 703 | tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { 704 | phrase: error.phrase, 705 | tokenizer: error.tokenizer, 706 | } 707 | } 708 | } 709 | 710 | impl TryFrom 711 | for PhrasePrefixRequiresAtLeastTwoTermsError 712 | { 713 | type Error = String; 714 | 715 | fn try_from( 716 | error: tv::query::QueryParserError, 717 | ) -> Result { 718 | match error { 719 | tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { 720 | phrase, 721 | tokenizer, 722 | } => Ok(Self { phrase, tokenizer }), 723 | _ => Err(format!( 724 | "{error} is not a PhrasePrefixRequiresAtLeastTwoTerms error" 725 | )), 726 | } 727 | } 728 | } 729 | 730 | /// The tokenizer for the given field is unknown. 731 | #[pyclass(frozen, module = "tantivy.tantivy")] 732 | pub(crate) struct UnknownTokenizerError { 733 | /// The name of the tokenizer. 734 | tokenizer: String, 735 | /// The field name. 736 | field: String, 737 | } 738 | 739 | #[pymethods] 740 | impl UnknownTokenizerError { 741 | fn tokenizer(&self) -> &str { 742 | self.tokenizer.as_str() 743 | } 744 | 745 | fn field(&self) -> &str { 746 | self.field.as_str() 747 | } 748 | 749 | fn __repr__(&self) -> String { 750 | self.full_message() 751 | } 752 | 753 | fn __str__(&self) -> String { 754 | self.full_message() 755 | } 756 | } 757 | 758 | impl QueryParserError for UnknownTokenizerError { 759 | fn full_message(&self) -> String { 760 | format!( 761 | "The tokenizer '{0:?}' for the field '{1:?}' is unknown", 762 | self.tokenizer, self.field 763 | ) 764 | } 765 | } 766 | 767 | impl From for tv::query::QueryParserError { 768 | fn from(error: UnknownTokenizerError) -> Self { 769 | tv::query::QueryParserError::UnknownTokenizer { 770 | tokenizer: error.tokenizer, 771 | field: error.field, 772 | } 773 | } 774 | } 775 | 776 | impl TryFrom for UnknownTokenizerError { 777 | type Error = String; 778 | 779 | fn try_from( 780 | error: tv::query::QueryParserError, 781 | ) -> Result { 782 | match error { 783 | tv::query::QueryParserError::UnknownTokenizer { 784 | tokenizer, 785 | field, 786 | } => Ok(Self { tokenizer, field }), 787 | _ => Err(format!("{error} is not an UnknownTokenizer error")), 788 | } 789 | } 790 | } 791 | 792 | /// The query contains a range query with a phrase as one of the bounds. Only terms can be used as 793 | /// bounds. 
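/// Example (a minimal sketch; assumes a default text field whose
/// tokenizer splits a quoted bound into several terms):
///
/// ```python
/// query, errors = index.parse_query_lenient('title:["big bad" TO "wolf"]')
/// # The multi-word lower bound tokenizes into a phrase, which is not
/// # accepted as a range bound.
/// ```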
794 | #[pyclass(frozen, module = "tantivy.tantivy")] 795 | pub(crate) struct RangeMustNotHavePhraseError; 796 | 797 | #[pymethods] 798 | impl RangeMustNotHavePhraseError { 799 | fn __repr__(&self) -> String { 800 | self.full_message() 801 | } 802 | 803 | fn __str__(&self) -> String { 804 | self.full_message() 805 | } 806 | } 807 | 808 | impl QueryParserError for RangeMustNotHavePhraseError { 809 | fn full_message(&self) -> String { 810 | "A range query cannot have a phrase as one of the bounds".to_string() 811 | } 812 | } 813 | 814 | impl From for tv::query::QueryParserError { 815 | fn from(_error: RangeMustNotHavePhraseError) -> Self { 816 | tv::query::QueryParserError::RangeMustNotHavePhrase 817 | } 818 | } 819 | 820 | impl TryFrom for RangeMustNotHavePhraseError { 821 | type Error = String; 822 | 823 | fn try_from( 824 | error: tv::query::QueryParserError, 825 | ) -> Result { 826 | match error { 827 | tv::query::QueryParserError::RangeMustNotHavePhrase => Ok(Self {}), 828 | _ => Err(format!("{error} is not a RangeMustNotHavePhrase error")), 829 | } 830 | } 831 | } 832 | 833 | /// The format for the date field is not RFC 3339 compliant. 834 | #[pyclass(frozen, module = "tantivy.tantivy")] 835 | pub(crate) struct DateFormatError { 836 | // Keep around the entire `QueryParserError` to avoid importing the `time` crate. 837 | inner: tv::query::QueryParserError, 838 | } 839 | 840 | #[pymethods] 841 | impl DateFormatError { 842 | fn __repr__(&self) -> String { 843 | self.full_message() 844 | } 845 | 846 | fn __str__(&self) -> String { 847 | self.full_message() 848 | } 849 | } 850 | 851 | impl QueryParserError for DateFormatError { 852 | fn full_message(&self) -> String { 853 | "The date field has an invalid format".to_string() 854 | } 855 | } 856 | 857 | impl From for tv::query::QueryParserError { 858 | fn from(error: DateFormatError) -> Self { 859 | error.inner 860 | } 861 | } 862 | 863 | impl TryFrom for DateFormatError { 864 | type Error = String; 865 | 866 | fn try_from( 867 | error: tv::query::QueryParserError, 868 | ) -> Result { 869 | match error { 870 | tv::query::QueryParserError::DateFormatError { .. } => { 871 | Ok(Self { inner: error }) 872 | } 873 | _ => Err(format!("{error} is not a DateFormatError")), 874 | } 875 | } 876 | } 877 | 878 | /// The format for the facet field is invalid. 879 | #[pyclass(frozen, module = "tantivy.tantivy")] 880 | pub(crate) struct FacetFormatError { 881 | facet_parse_error: FacetParseError, 882 | } 883 | 884 | #[pymethods] 885 | impl FacetFormatError { 886 | fn __repr__(&self) -> String { 887 | self.full_message() 888 | } 889 | 890 | fn __str__(&self) -> String { 891 | self.full_message() 892 | } 893 | } 894 | 895 | impl QueryParserError for FacetFormatError { 896 | fn full_message(&self) -> String { 897 | format!("The facet field is malformed: {0}", self.facet_parse_error) 898 | } 899 | } 900 | 901 | impl From for tv::query::QueryParserError { 902 | fn from(error: FacetFormatError) -> Self { 903 | tv::query::QueryParserError::FacetFormatError(error.facet_parse_error) 904 | } 905 | } 906 | 907 | impl TryFrom for FacetFormatError { 908 | type Error = String; 909 | 910 | fn try_from( 911 | error: tv::query::QueryParserError, 912 | ) -> Result { 913 | match error { 914 | tv::query::QueryParserError::FacetFormatError( 915 | facet_parse_error, 916 | ) => Ok(Self { facet_parse_error }), 917 | _ => Err(format!("{error} is not a FacetFormatError")), 918 | } 919 | } 920 | } 921 | 922 | /// The format for the ip field is invalid. 
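/// Example (a minimal sketch; assumes a schema with an ip-address field
/// named "ip"):
///
/// ```python
/// query, errors = index.parse_query_lenient("ip:999.0.0.1")
/// # 999.0.0.1 is not a valid IP address, so an IpFormatError is
/// # expected among the reported errors.
/// ```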
923 | #[pyclass(frozen, module = "tantivy.tantivy")]
924 | pub(crate) struct IpFormatError {
925 | addr_parse_error: AddrParseError,
926 | }
927 |
928 | #[pymethods]
929 | impl IpFormatError {
930 | fn __repr__(&self) -> String {
931 | self.full_message()
932 | }
933 |
934 | fn __str__(&self) -> String {
935 | self.full_message()
936 | }
937 | }
938 |
939 | impl QueryParserError for IpFormatError {
940 | fn full_message(&self) -> String {
941 | format!("The ip field is malformed: {0}", self.addr_parse_error)
942 | }
943 | }
944 |
945 | impl From<IpFormatError> for tv::query::QueryParserError {
946 | fn from(error: IpFormatError) -> Self {
947 | tv::query::QueryParserError::IpFormatError(error.addr_parse_error)
948 | }
949 | }
950 |
951 | impl TryFrom<tv::query::QueryParserError> for IpFormatError {
952 | type Error = String;
953 |
954 | fn try_from(
955 | error: tv::query::QueryParserError,
956 | ) -> Result<Self, Self::Error> {
957 | match error {
958 | tv::query::QueryParserError::IpFormatError(addr_parse_error) => {
959 | Ok(Self { addr_parse_error })
960 | }
961 | _ => Err(format!("{error} is not an IpFormatError")),
962 | }
963 | }
964 | }
965 |
-------------------------------------------------------------------------------- /src/query.rs: --------------------------------------------------------------------------------
1 | use crate::{
2 | get_field, make_term, make_term_for_type, schema::FieldType, to_pyerr,
3 | DocAddress, Schema,
4 | };
5 | use core::ops::Bound as OpsBound;
6 | use pyo3::{
7 | exceptions,
8 | prelude::*,
9 | types::{PyAny, PyFloat, PyString},
10 | };
11 | use tantivy as tv;
12 |
13 | /// Custom Tuple struct to represent a pair of Occur and Query
14 | /// for the BooleanQuery
15 | struct OccurQueryPair(Occur, Query);
16 |
17 | impl<'source> FromPyObject<'source> for OccurQueryPair {
18 | fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult<Self> {
19 | let (occur, query): (Occur, Query) = ob.extract()?;
20 |
21 | Ok(OccurQueryPair(occur, query))
22 | }
23 | }
24 |
25 | /// Tantivy's Occur
26 | #[pyclass(frozen, module = "tantivy.tantivy")]
27 | #[derive(Clone)]
28 | pub enum Occur {
29 | Must,
30 | Should,
31 | MustNot,
32 | }
33 |
34 | impl From<Occur> for tv::query::Occur {
35 | fn from(occur: Occur) -> tv::query::Occur {
36 | match occur {
37 | Occur::Must => tv::query::Occur::Must,
38 | Occur::Should => tv::query::Occur::Should,
39 | Occur::MustNot => tv::query::Occur::MustNot,
40 | }
41 | }
42 | }
43 |
44 | /// Tantivy's Query
45 | #[pyclass(frozen, module = "tantivy.tantivy")]
46 | pub(crate) struct Query {
47 | pub(crate) inner: Box<dyn tv::query::Query>,
48 | }
49 |
50 | impl Clone for Query {
51 | fn clone(&self) -> Self {
52 | Query {
53 | inner: self.inner.box_clone(),
54 | }
55 | }
56 | }
57 |
58 | impl Query {
59 | pub(crate) fn get(&self) -> &dyn tv::query::Query {
60 | &self.inner
61 | }
62 | }
63 |
64 | #[pymethods]
65 | impl Query {
66 | fn __repr__(&self) -> PyResult<String> {
67 | Ok(format!("Query({:?})", self.get()))
68 | }
69 |
70 | /// Construct a Tantivy TermQuery
71 | #[staticmethod]
72 | #[pyo3(signature = (schema, field_name, field_value, index_option = "position"))]
73 | pub(crate) fn term_query(
74 | schema: &Schema,
75 | field_name: &str,
76 | field_value: &Bound<PyAny>,
77 | index_option: &str,
78 | ) -> PyResult<Query> {
79 | let term = make_term(&schema.inner, field_name, field_value)?;
80 | let index_option = match index_option {
81 | "position" => tv::schema::IndexRecordOption::WithFreqsAndPositions,
82 | "freq" => tv::schema::IndexRecordOption::WithFreqs,
83 | "basic" => tv::schema::IndexRecordOption::Basic,
84 | _ => return
Err(exceptions::PyValueError::new_err(
85 | "Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
86 | ))
87 | };
88 | let inner = tv::query::TermQuery::new(term, index_option);
89 | Ok(Query {
90 | inner: Box::new(inner),
91 | })
92 | }
93 |
94 | /// Construct a Tantivy TermSetQuery
95 | #[staticmethod]
96 | #[pyo3(signature = (schema, field_name, field_values))]
97 | pub(crate) fn term_set_query(
98 | schema: &Schema,
99 | field_name: &str,
100 | field_values: Vec<Bound<PyAny>>,
101 | ) -> PyResult<Query> {
102 | let terms = field_values
103 | .into_iter()
104 | .map(|field_value| {
105 | make_term(&schema.inner, field_name, &field_value)
106 | })
107 | .collect::<Result<Vec<_>, _>>()?;
108 | let inner = tv::query::TermSetQuery::new(terms);
109 | Ok(Query {
110 | inner: Box::new(inner),
111 | })
112 | }
113 |
114 | /// Construct a Tantivy AllQuery
115 | #[staticmethod]
116 | pub(crate) fn all_query() -> PyResult<Query> {
117 | let inner = tv::query::AllQuery {};
118 | Ok(Query {
119 | inner: Box::new(inner),
120 | })
121 | }
122 |
123 | /// Construct a Tantivy FuzzyTermQuery
124 | ///
125 | /// # Arguments
126 | ///
127 | /// * `schema` - Schema of the target index.
128 | /// * `field_name` - Field name to be searched.
129 | /// * `text` - String representation of the query term.
130 | /// * `distance` - (Optional) Edit distance you are going to allow. When not specified, the default is 1.
131 | /// * `transposition_cost_one` - (Optional) If true, a transposition (swapping) cost will be 1; otherwise it will be 2. When not specified, the default is true.
132 | /// * `prefix` - (Optional) If true, prefix Levenshtein distance is applied. When not specified, the default is false.
133 | #[staticmethod]
134 | #[pyo3(signature = (schema, field_name, text, distance = 1, transposition_cost_one = true, prefix = false))]
135 | pub(crate) fn fuzzy_term_query(
136 | schema: &Schema,
137 | field_name: &str,
138 | text: &Bound<PyAny>,
139 | distance: u8,
140 | transposition_cost_one: bool,
141 | prefix: bool,
142 | ) -> PyResult<Query> {
143 | let term = make_term(&schema.inner, field_name, text)?;
144 | let inner = if prefix {
145 | tv::query::FuzzyTermQuery::new_prefix(
146 | term,
147 | distance,
148 | transposition_cost_one,
149 | )
150 | } else {
151 | tv::query::FuzzyTermQuery::new(
152 | term,
153 | distance,
154 | transposition_cost_one,
155 | )
156 | };
157 | Ok(Query {
158 | inner: Box::new(inner),
159 | })
160 | }
161 |
162 | /// Construct a Tantivy PhraseQuery with custom offsets and slop
163 | ///
164 | /// # Arguments
165 | ///
166 | /// * `schema` - Schema of the target index.
167 | /// * `field_name` - Field name to be searched.
168 | /// * `words` - Word list that constructs the phrase. A word can be a term text or a pair of term text and its offset in the phrase.
169 | /// * `slop` - (Optional) The number of gaps permitted between the words in the query phrase. Default is 0.
170 | #[staticmethod]
171 | #[pyo3(signature = (schema, field_name, words, slop = 0))]
172 | pub(crate) fn phrase_query(
173 | schema: &Schema,
174 | field_name: &str,
175 | words: Vec<Bound<PyAny>>,
176 | slop: u32,
177 | ) -> PyResult<Query> {
178 | let mut terms_with_offset = Vec::with_capacity(words.len());
179 | for (idx, word) in words.into_iter().enumerate() {
180 | if let Ok((offset, value)) = word.extract() {
181 | // Custom offset is provided.
182 | let term = make_term(&schema.inner, field_name, &value)?;
183 | terms_with_offset.push((offset, term));
184 | } else {
185 | // Custom offset is not provided. Use the list index as the offset.
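// For example, with words = ["old", (2, "man")], "old" receives
// offset 0 (its list index) and "man" receives the explicit offset 2,
// so the phrase matches "old <one term> man" even with slop = 0.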
186 |                 let term = make_term(&schema.inner, field_name, &word)?;
187 |                 terms_with_offset.push((idx, term));
188 |             };
189 |         }
190 |         if terms_with_offset.is_empty() {
191 |             return Err(exceptions::PyValueError::new_err(
192 |                 "words must not be empty.",
193 |             ));
194 |         }
195 |         let inner = tv::query::PhraseQuery::new_with_offset_and_slop(
196 |             terms_with_offset,
197 |             slop,
198 |         );
199 |         Ok(Query {
200 |             inner: Box::new(inner),
201 |         })
202 |     }
203 |
204 |     /// Construct a Tantivy's BooleanQuery
205 |     #[staticmethod]
206 |     #[pyo3(signature = (subqueries))]
207 |     pub(crate) fn boolean_query(
208 |         subqueries: Vec<(Occur, Query)>,
209 |     ) -> PyResult<Query> {
210 |         let dyn_subqueries = subqueries
211 |             .into_iter()
212 |             .map(|(occur, query)| (occur.into(), query.inner.box_clone()))
213 |             .collect::<Vec<_>>();
214 |
215 |         let inner = tv::query::BooleanQuery::from(dyn_subqueries);
216 |
217 |         Ok(Query {
218 |             inner: Box::new(inner),
219 |         })
220 |     }
221 |
222 |     /// Construct a Tantivy's DisjunctionMaxQuery
223 |     #[staticmethod]
224 |     #[pyo3(signature = (subqueries, tie_breaker=None))]
225 |     pub(crate) fn disjunction_max_query(
226 |         subqueries: Vec<Query>,
227 |         tie_breaker: Option<Bound<PyFloat>>,
228 |     ) -> PyResult<Query> {
229 |         let inner_queries: Vec<Box<dyn tv::query::Query>> = subqueries
230 |             .iter()
231 |             .map(|query| query.inner.box_clone())
232 |             .collect();
233 |
234 |         let dismax_query = if let Some(tie_breaker) = tie_breaker {
235 |             tv::query::DisjunctionMaxQuery::with_tie_breaker(
236 |                 inner_queries,
237 |                 tie_breaker.extract::<f32>()?,
238 |             )
239 |         } else {
240 |             tv::query::DisjunctionMaxQuery::new(inner_queries)
241 |         };
242 |
243 |         Ok(Query {
244 |             inner: Box::new(dismax_query),
245 |         })
246 |     }
247 |
248 |     /// Construct a Tantivy's BoostQuery
249 |     #[staticmethod]
250 |     #[pyo3(signature = (query, boost))]
251 |     pub(crate) fn boost_query(query: Query, boost: f32) -> PyResult<Query> {
252 |         let inner = tv::query::BoostQuery::new(query.inner, boost);
253 |         Ok(Query {
254 |             inner: Box::new(inner),
255 |         })
256 |     }
257 |
258 |     /// Construct a Tantivy's RegexQuery
259 |     #[staticmethod]
260 |     #[pyo3(signature = (schema, field_name, regex_pattern))]
261 |     pub(crate) fn regex_query(
262 |         schema: &Schema,
263 |         field_name: &str,
264 |         regex_pattern: &str,
265 |     ) -> PyResult<Query> {
266 |         let field = get_field(&schema.inner, field_name)?;
267 |
268 |         let inner_result =
269 |             tv::query::RegexQuery::from_pattern(regex_pattern, field);
270 |         match inner_result {
271 |             Ok(inner) => Ok(Query {
272 |                 inner: Box::new(inner),
273 |             }),
274 |             Err(e) => Err(to_pyerr(e)),
275 |         }
276 |     }
277 |
278 |     #[staticmethod]
279 |     #[pyo3(signature = (doc_address, min_doc_frequency = Some(5), max_doc_frequency = None, min_term_frequency = Some(2), max_query_terms = Some(25), min_word_length = None, max_word_length = None, boost_factor = Some(1.0), stop_words = vec![]))]
280 |     #[allow(clippy::too_many_arguments)]
281 |     pub(crate) fn more_like_this_query(
282 |         doc_address: &DocAddress,
283 |         min_doc_frequency: Option<u64>,
284 |         max_doc_frequency: Option<u64>,
285 |         min_term_frequency: Option<usize>,
286 |         max_query_terms: Option<usize>,
287 |         min_word_length: Option<usize>,
288 |         max_word_length: Option<usize>,
289 |         boost_factor: Option<f32>,
290 |         stop_words: Vec<String>,
291 |     ) -> PyResult<Query> {
292 |         let mut builder = tv::query::MoreLikeThisQuery::builder();
293 |         if let Some(value) = min_doc_frequency {
294 |             builder = builder.with_min_doc_frequency(value);
295 |         }
296 |         if let Some(value) = max_doc_frequency {
297 |             builder = builder.with_max_doc_frequency(value);
298 |         }
299 |         if let Some(value) = min_term_frequency {
300 |             builder = builder.with_min_term_frequency(value);
301 |         }
302 |         if let Some(value) = max_query_terms {
303 |             builder = builder.with_max_query_terms(value);
304 |         }
305 |         if let Some(value) = min_word_length {
306 |             builder = builder.with_min_word_length(value);
307 |         }
308 |         if let Some(value) = max_word_length {
309 |             builder = builder.with_max_word_length(value);
310 |         }
311 |         if let Some(value) = boost_factor {
312 |             builder = builder.with_boost_factor(value);
313 |         }
314 |         builder = builder.with_stop_words(stop_words);
315 |
316 |         let inner = builder.with_document(tv::DocAddress::from(doc_address));
317 |         Ok(Query {
318 |             inner: Box::new(inner),
319 |         })
320 |     }
321 |
322 |     /// Construct a Tantivy's ConstScoreQuery
323 |     #[staticmethod]
324 |     #[pyo3(signature = (query, score))]
325 |     pub(crate) fn const_score_query(
326 |         query: Query,
327 |         score: f32,
328 |     ) -> PyResult<Query> {
329 |         let inner = tv::query::ConstScoreQuery::new(query.inner, score);
330 |         Ok(Query {
331 |             inner: Box::new(inner),
332 |         })
333 |     }
334 |
335 |     #[staticmethod]
336 |     #[pyo3(signature = (schema, field_name, field_type, lower_bound, upper_bound, include_lower = true, include_upper = true))]
337 |     pub(crate) fn range_query(
338 |         schema: &Schema,
339 |         field_name: &str,
340 |         field_type: FieldType,
341 |         lower_bound: &Bound<PyAny>,
342 |         upper_bound: &Bound<PyAny>,
343 |         include_lower: bool,
344 |         include_upper: bool,
345 |     ) -> PyResult<Query> {
346 |         match field_type {
347 |             FieldType::Text => {
348 |                 return Err(exceptions::PyValueError::new_err(
349 |                     "Text fields are not supported for range queries.",
350 |                 ))
351 |             }
352 |             FieldType::Boolean => {
353 |                 return Err(exceptions::PyValueError::new_err(
354 |                     "Boolean fields are not supported for range queries.",
355 |                 ))
356 |             }
357 |             FieldType::Facet => {
358 |                 return Err(exceptions::PyValueError::new_err(
359 |                     "Facet fields are not supported for range queries.",
360 |                 ))
361 |             }
362 |             FieldType::Bytes => {
363 |                 return Err(exceptions::PyValueError::new_err(
364 |                     "Bytes fields are not supported for range queries.",
365 |                 ))
366 |             }
367 |             FieldType::Json => {
368 |                 return Err(exceptions::PyValueError::new_err(
369 |                     "Json fields are not supported for range queries.",
370 |                 ))
371 |             }
372 |             _ => {}
373 |         }
374 |
375 |         // Look up the field in the schema. The given type must match the
376 |         // field type in the schema.
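        // A hedged Python-side illustration of this check (names assumed):
        //
        //     # Raises ValueError if the schema declares "rating" as, say, Float:
        //     Query.range_query(schema, "rating", FieldType.Integer, 1, 5)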
377 |         let field = get_field(&schema.inner, field_name)?;
378 |         let actual_field_entry = schema.inner.get_field_entry(field);
379 |         let actual_field_type = actual_field_entry.field_type().value_type(); // Reduce the field's tv::schema::FieldType to a tv::schema::Type
380 |         let given_field_type: tv::schema::Type = field_type.clone().into(); // Convert the local FieldType to a tv::schema::Type
381 |
382 |         if actual_field_type != given_field_type {
383 |             return Err(exceptions::PyValueError::new_err(format!(
384 |                 "Field type mismatch: field '{}' is type {:?}, but got {:?}",
385 |                 field_name, actual_field_type, given_field_type
386 |             )));
387 |         }
388 |
389 |         let lower_bound_term = make_term_for_type(
390 |             &schema.inner,
391 |             field_name,
392 |             field_type.clone(),
393 |             lower_bound,
394 |         )?;
395 |         let upper_bound_term = make_term_for_type(
396 |             &schema.inner,
397 |             field_name,
398 |             field_type.clone(),
399 |             upper_bound,
400 |         )?;
401 |
402 |         let lower_bound = if include_lower {
403 |             OpsBound::Included(lower_bound_term)
404 |         } else {
405 |             OpsBound::Excluded(lower_bound_term)
406 |         };
407 |
408 |         let upper_bound = if include_upper {
409 |             OpsBound::Included(upper_bound_term)
410 |         } else {
411 |             OpsBound::Excluded(upper_bound_term)
412 |         };
413 |
414 |         let inner = tv::query::RangeQuery::new(lower_bound, upper_bound);
415 |
416 |         Ok(Query {
417 |             inner: Box::new(inner),
418 |         })
419 |     }
420 | }
421 |
--------------------------------------------------------------------------------
/src/schema.rs:
--------------------------------------------------------------------------------
1 | use crate::to_pyerr;
2 | use pyo3::IntoPyObjectExt;
3 | use pyo3::{basic::CompareOp, prelude::*, types::PyTuple};
4 | use serde::{Deserialize, Serialize};
5 | use tantivy as tv;
6 |
7 | /// Tantivy's Type
8 | #[pyclass(frozen, module = "tantivy.tantivy")]
9 | #[derive(Clone, PartialEq)]
10 | pub(crate) enum FieldType {
11 |     Text,
12 |     Unsigned,
13 |     Integer,
14 |     Float,
15 |     Boolean,
16 |     Date,
17 |     Facet,
18 |     Bytes,
19 |     Json,
20 |     IpAddr,
21 | }
22 |
23 | impl From<FieldType> for tv::schema::Type {
24 |     fn from(field_type: FieldType) -> tv::schema::Type {
25 |         match field_type {
26 |             FieldType::Text => tv::schema::Type::Str,
27 |             FieldType::Unsigned => tv::schema::Type::U64,
28 |             FieldType::Integer => tv::schema::Type::I64,
29 |             FieldType::Float => tv::schema::Type::F64,
30 |             FieldType::Boolean => tv::schema::Type::Bool,
31 |             FieldType::Date => tv::schema::Type::Date,
32 |             FieldType::Facet => tv::schema::Type::Facet,
33 |             FieldType::Bytes => tv::schema::Type::Bytes,
34 |             FieldType::Json => tv::schema::Type::Json,
35 |             FieldType::IpAddr => tv::schema::Type::IpAddr,
36 |         }
37 |     }
38 | }
39 |
40 | /// Tantivy schema.
41 | ///
42 | /// The schema is very strict. To build the schema the `SchemaBuilder` class is
43 | /// provided.
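///
/// A minimal, hedged sketch of obtaining one (field names are illustrative):
///
/// >>> builder = tantivy.SchemaBuilder()
/// >>> builder.add_text_field("title", stored=True)
/// >>> schema = builder.build()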
44 | #[pyclass(frozen, module = "tantivy.tantivy")] 45 | #[derive(Deserialize, PartialEq, Serialize)] 46 | pub(crate) struct Schema { 47 | pub(crate) inner: tv::schema::Schema, 48 | } 49 | 50 | #[pymethods] 51 | impl Schema { 52 | fn __richcmp__( 53 | &self, 54 | other: &Self, 55 | op: CompareOp, 56 | py: Python<'_>, 57 | ) -> PyResult { 58 | match op { 59 | CompareOp::Eq => (self == other).into_py_any(py), 60 | CompareOp::Ne => (self != other).into_py_any(py), 61 | _ => Ok(py.NotImplemented()), 62 | } 63 | } 64 | 65 | #[staticmethod] 66 | fn _internal_from_pythonized(serialized: &Bound) -> PyResult { 67 | pythonize::depythonize(serialized).map_err(to_pyerr) 68 | } 69 | 70 | fn __reduce__<'a>( 71 | slf: PyRef<'a, Self>, 72 | py: Python<'a>, 73 | ) -> PyResult> { 74 | let serialized = pythonize::pythonize(py, &*slf).map_err(to_pyerr)?; 75 | let deserializer = slf 76 | .into_pyobject(py)? 77 | .getattr("_internal_from_pythonized")?; 78 | PyTuple::new( 79 | py, 80 | [deserializer, PyTuple::new(py, [serialized])?.into_any()], 81 | ) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/schemabuilder.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::new_ret_no_self)] 2 | 3 | use pyo3::{exceptions, prelude::*}; 4 | 5 | use crate::schema::Schema; 6 | use std::sync::{Arc, RwLock}; 7 | use tantivy::schema::{ 8 | self, BytesOptions, DateOptions, IpAddrOptions, INDEXED, 9 | }; 10 | 11 | /// Tantivy has a very strict schema. 12 | /// You need to specify in advance whether a field is indexed or not, 13 | /// stored or not. 14 | /// 15 | /// This is done by creating a schema object, and 16 | /// setting up the fields one by one. 17 | /// 18 | /// Examples: 19 | /// 20 | /// >>> builder = tantivy.SchemaBuilder() 21 | /// 22 | /// >>> title = builder.add_text_field("title", stored=True) 23 | /// >>> body = builder.add_text_field("body") 24 | /// 25 | /// >>> schema = builder.build() 26 | #[pyclass(module = "tantivy.tantivy")] 27 | #[derive(Clone)] 28 | pub(crate) struct SchemaBuilder { 29 | pub(crate) builder: Arc>>, 30 | } 31 | 32 | const NO_TOKENIZER_NAME: &str = "raw"; 33 | const TOKENIZER: &str = "default"; 34 | const RECORD: &str = "position"; 35 | 36 | #[pymethods] 37 | impl SchemaBuilder { 38 | #[new] 39 | fn new() -> Self { 40 | SchemaBuilder { 41 | builder: Arc::new(From::from(Some(schema::Schema::builder()))), 42 | } 43 | } 44 | 45 | #[staticmethod] 46 | fn is_valid_field_name(name: &str) -> bool { 47 | schema::is_valid_field_name(name) 48 | } 49 | 50 | /// Add a new text field to the schema. 51 | /// 52 | /// Args: 53 | /// name (str): The name of the field. 54 | /// stored (bool, optional): If true sets the field as stored, the 55 | /// content of the field can be later restored from a Searcher. 56 | /// Defaults to False. 57 | /// fast (bool, optional): Set the text options as a fast field. A 58 | /// fast field is a column-oriented fashion storage for tantivy. 59 | /// Text fast fields will have the term ids stored in the fast 60 | /// field. The fast field will be a multivalued fast field. 61 | /// It is recommended to use the "raw" tokenizer, since it will 62 | /// store the original text unchanged. The "default" tokenizer will 63 | /// store the terms as lower case and this will be reflected in the 64 | /// dictionary. 65 | /// tokenizer_name (str, optional): The name of the tokenizer that 66 | /// should be used to process the field. 
Defaults to 'default' 67 | /// index_option (str, optional): Sets which information should be 68 | /// indexed with the tokens. Can be one of 'position', 'freq' or 69 | /// 'basic'. Defaults to 'position'. The 'basic' index_option 70 | /// records only the document ID, the 'freq' option records the 71 | /// document id and the term frequency, while the 'position' option 72 | /// records the document id, term frequency and the positions of 73 | /// the term occurrences in the document. 74 | /// 75 | /// Returns the associated field handle. 76 | /// Raises a ValueError if there was an error with the field creation. 77 | #[pyo3(signature = ( 78 | name, 79 | stored = false, 80 | fast = false, 81 | tokenizer_name = TOKENIZER, 82 | index_option = RECORD 83 | ))] 84 | fn add_text_field( 85 | &mut self, 86 | name: &str, 87 | stored: bool, 88 | fast: bool, 89 | tokenizer_name: &str, 90 | index_option: &str, 91 | ) -> PyResult { 92 | let builder = &mut self.builder; 93 | let options = SchemaBuilder::build_text_option( 94 | stored, 95 | fast, 96 | tokenizer_name, 97 | index_option, 98 | )?; 99 | 100 | if let Some(builder) = builder.write().unwrap().as_mut() { 101 | builder.add_text_field(name, options); 102 | } else { 103 | return Err(exceptions::PyValueError::new_err( 104 | "Schema builder object isn't valid anymore.", 105 | )); 106 | } 107 | Ok(self.clone()) 108 | } 109 | 110 | /// Add a new signed integer field to the schema. 111 | /// 112 | /// Args: 113 | /// name (str): The name of the field. 114 | /// stored (bool, optional): If true sets the field as stored, the 115 | /// content of the field can be later restored from a Searcher. 116 | /// Defaults to False. 117 | /// indexed (bool, optional): If true sets the field to be indexed. 118 | /// fast (bool, optional): Set the numeric options as a fast field. A 119 | /// fast field is a column-oriented fashion storage for tantivy. 120 | /// It is designed for the fast random access of some document 121 | /// fields given a document id. 122 | /// 123 | /// Returns the associated field handle. 124 | /// Raises a ValueError if there was an error with the field creation. 125 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 126 | fn add_integer_field( 127 | &mut self, 128 | name: &str, 129 | stored: bool, 130 | indexed: bool, 131 | fast: bool, 132 | ) -> PyResult { 133 | let builder = &mut self.builder; 134 | 135 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 136 | 137 | if let Some(builder) = builder.write().unwrap().as_mut() { 138 | builder.add_i64_field(name, opts); 139 | } else { 140 | return Err(exceptions::PyValueError::new_err( 141 | "Schema builder object isn't valid anymore.", 142 | )); 143 | } 144 | Ok(self.clone()) 145 | } 146 | 147 | /// Add a new float field to the schema. 148 | /// 149 | /// Args: 150 | /// name (str): The name of the field. 151 | /// stored (bool, optional): If true sets the field as stored, the 152 | /// content of the field can be later restored from a Searcher. 153 | /// Defaults to False. 154 | /// indexed (bool, optional): If true sets the field to be indexed. 155 | /// fast (bool, optional): Set the numeric options as a fast field. A 156 | /// fast field is a column-oriented fashion storage for tantivy. 157 | /// It is designed for the fast random access of some document 158 | /// fields given a document id. 159 | /// 160 | /// Returns the associated field handle. 161 | /// Raises a ValueError if there was an error with the field creation. 
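    ///
    /// A hedged example (field name is illustrative):
    ///
    /// >>> builder.add_float_field("price", stored=True, indexed=True, fast=True)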
162 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 163 | fn add_float_field( 164 | &mut self, 165 | name: &str, 166 | stored: bool, 167 | indexed: bool, 168 | fast: bool, 169 | ) -> PyResult { 170 | let builder = &mut self.builder; 171 | 172 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 173 | 174 | if let Some(builder) = builder.write().unwrap().as_mut() { 175 | builder.add_f64_field(name, opts); 176 | } else { 177 | return Err(exceptions::PyValueError::new_err( 178 | "Schema builder object isn't valid anymore.", 179 | )); 180 | } 181 | Ok(self.clone()) 182 | } 183 | 184 | /// Add a new unsigned integer field to the schema. 185 | /// 186 | /// Args: 187 | /// name (str): The name of the field. 188 | /// stored (bool, optional): If true sets the field as stored, the 189 | /// content of the field can be later restored from a Searcher. 190 | /// Defaults to False. 191 | /// indexed (bool, optional): If true sets the field to be indexed. 192 | /// fast (bool, optional): Set the numeric options as a fast field. A 193 | /// fast field is a column-oriented fashion storage for tantivy. 194 | /// It is designed for the fast random access of some document 195 | /// fields given a document id. 196 | /// 197 | /// Returns the associated field handle. 198 | /// Raises a ValueError if there was an error with the field creation. 199 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 200 | fn add_unsigned_field( 201 | &mut self, 202 | name: &str, 203 | stored: bool, 204 | indexed: bool, 205 | fast: bool, 206 | ) -> PyResult { 207 | let builder = &mut self.builder; 208 | 209 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 210 | 211 | if let Some(builder) = builder.write().unwrap().as_mut() { 212 | builder.add_u64_field(name, opts); 213 | } else { 214 | return Err(exceptions::PyValueError::new_err( 215 | "Schema builder object isn't valid anymore.", 216 | )); 217 | } 218 | Ok(self.clone()) 219 | } 220 | 221 | /// Add a new boolean field to the schema. 222 | /// 223 | /// Args: 224 | /// name (str): The name of the field. 225 | /// stored (bool, optional): If true sets the field as stored, the 226 | /// content of the field can be later restored from a Searcher. 227 | /// Defaults to False. 228 | /// indexed (bool, optional): If true sets the field to be indexed. 229 | /// fast (bool, optional): Set the numeric options as a fast field. A 230 | /// fast field is a column-oriented fashion storage for tantivy. 231 | /// It is designed for the fast random access of some document 232 | /// fields given a document id. 233 | /// 234 | /// Returns the associated field handle. 235 | /// Raises a ValueError if there was an error with the field creation. 236 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 237 | fn add_boolean_field( 238 | &mut self, 239 | name: &str, 240 | stored: bool, 241 | indexed: bool, 242 | fast: bool, 243 | ) -> PyResult { 244 | let builder = &mut self.builder; 245 | 246 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 247 | 248 | if let Some(builder) = builder.write().unwrap().as_mut() { 249 | builder.add_bool_field(name, opts); 250 | } else { 251 | return Err(exceptions::PyValueError::new_err( 252 | "Schema builder object isn't valid anymore.", 253 | )); 254 | } 255 | Ok(self.clone()) 256 | } 257 | 258 | /// Add a new date field to the schema. 259 | /// 260 | /// Args: 261 | /// name (str): The name of the field. 
262 | /// stored (bool, optional): If true sets the field as stored, the 263 | /// content of the field can be later restored from a Searcher. 264 | /// Defaults to False. 265 | /// indexed (bool, optional): If true sets the field to be indexed. 266 | /// fast (bool, optional): Set the date options as a fast field. A fast 267 | /// field is a column-oriented fashion storage for tantivy. It is 268 | /// designed for the fast random access of some document fields 269 | /// given a document id. 270 | /// 271 | /// Returns the associated field handle. 272 | /// Raises a ValueError if there was an error with the field creation. 273 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 274 | fn add_date_field( 275 | &mut self, 276 | name: &str, 277 | stored: bool, 278 | indexed: bool, 279 | fast: bool, 280 | ) -> PyResult { 281 | let builder = &mut self.builder; 282 | 283 | let mut opts = DateOptions::default(); 284 | if stored { 285 | opts = opts.set_stored(); 286 | } 287 | if indexed { 288 | opts = opts.set_indexed(); 289 | } 290 | if fast { 291 | opts = opts.set_fast(); 292 | } 293 | 294 | if let Some(builder) = builder.write().unwrap().as_mut() { 295 | builder.add_date_field(name, opts); 296 | } else { 297 | return Err(exceptions::PyValueError::new_err( 298 | "Schema builder object isn't valid anymore.", 299 | )); 300 | } 301 | Ok(self.clone()) 302 | } 303 | 304 | /// Add a new json field to the schema. 305 | /// 306 | /// Args: 307 | /// name (str): the name of the field. 308 | /// stored (bool, optional): If true sets the field as stored, the 309 | /// content of the field can be later restored from a Searcher. 310 | /// Defaults to False. 311 | /// fast (bool, optional): Set the text options as a fast field. A 312 | /// fast field is a column-oriented fashion storage for tantivy. 313 | /// Text fast fields will have the term ids stored in the fast 314 | /// field. The fast field will be a multivalued fast field. 315 | /// It is recommended to use the "raw" tokenizer, since it will 316 | /// store the original text unchanged. The "default" tokenizer will 317 | /// store the terms as lower case and this will be reflected in the 318 | /// dictionary. 319 | /// tokenizer_name (str, optional): The name of the tokenizer that 320 | /// should be used to process the field. Defaults to 'default' 321 | /// index_option (str, optional): Sets which information should be 322 | /// indexed with the tokens. Can be one of 'position', 'freq' or 323 | /// 'basic'. Defaults to 'position'. The 'basic' index_option 324 | /// records only the document ID, the 'freq' option records the 325 | /// document id and the term frequency, while the 'position' option 326 | /// records the document id, term frequency and the positions of 327 | /// the term occurrences in the document. 328 | /// 329 | /// Returns the associated field handle. 330 | /// Raises a ValueError if there was an error with the field creation. 
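    ///
    /// A hedged example (field name and JSON keys are illustrative):
    ///
    /// >>> builder.add_json_field("attributes", stored=True)
    /// >>> # later, when building a document:
    /// >>> # doc.add_json("attributes", {"color": "red", "size": 4})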
331 | #[pyo3(signature = ( 332 | name, 333 | stored = false, 334 | fast = false, 335 | tokenizer_name = TOKENIZER, 336 | index_option = RECORD 337 | ))] 338 | fn add_json_field( 339 | &mut self, 340 | name: &str, 341 | stored: bool, 342 | fast: bool, 343 | tokenizer_name: &str, 344 | index_option: &str, 345 | ) -> PyResult { 346 | let builder = &mut self.builder; 347 | let options = SchemaBuilder::build_text_option( 348 | stored, 349 | fast, 350 | tokenizer_name, 351 | index_option, 352 | )?; 353 | 354 | if let Some(builder) = builder.write().unwrap().as_mut() { 355 | builder.add_json_field(name, options); 356 | } else { 357 | return Err(exceptions::PyValueError::new_err( 358 | "Schema builder object isn't valid anymore.", 359 | )); 360 | } 361 | 362 | Ok(self.clone()) 363 | } 364 | 365 | /// Add a Facet field to the schema. 366 | /// Args: 367 | /// name (str): The name of the field. 368 | fn add_facet_field(&mut self, name: &str) -> PyResult { 369 | let builder = &mut self.builder; 370 | 371 | if let Some(builder) = builder.write().unwrap().as_mut() { 372 | builder.add_facet_field(name, INDEXED); 373 | } else { 374 | return Err(exceptions::PyValueError::new_err( 375 | "Schema builder object isn't valid anymore.", 376 | )); 377 | } 378 | Ok(self.clone()) 379 | } 380 | 381 | /// Add a fast bytes field to the schema. 382 | /// 383 | /// Args: 384 | /// name (str): The name of the field. 385 | /// stored (bool, optional): If true sets the field as stored, the 386 | /// content of the field can be later restored from a Searcher. 387 | /// Defaults to False. 388 | /// indexed (bool, optional): If true sets the field to be indexed. 389 | /// fast (bool, optional): Set the bytes options as a fast field. A fast 390 | /// field is a column-oriented fashion storage for tantivy. It is 391 | /// designed for the fast random access of some document fields 392 | /// given a document id. 393 | #[pyo3(signature = ( 394 | name, 395 | stored = false, 396 | indexed = false, 397 | fast = false 398 | ))] 399 | fn add_bytes_field( 400 | &mut self, 401 | name: &str, 402 | stored: bool, 403 | indexed: bool, 404 | fast: bool, 405 | ) -> PyResult { 406 | let builder = &mut self.builder; 407 | let mut opts = BytesOptions::default(); 408 | if stored { 409 | opts = opts.set_stored(); 410 | } 411 | if indexed { 412 | opts = opts.set_indexed(); 413 | } 414 | if fast { 415 | opts = opts.set_fast(); 416 | } 417 | 418 | if let Some(builder) = builder.write().unwrap().as_mut() { 419 | builder.add_bytes_field(name, opts); 420 | } else { 421 | return Err(exceptions::PyValueError::new_err( 422 | "Schema builder object isn't valid anymore.", 423 | )); 424 | } 425 | Ok(self.clone()) 426 | } 427 | 428 | /// Add an IP address field to the schema. 429 | /// 430 | /// Args: 431 | /// name (str): The name of the field. 432 | /// stored (bool, optional): If true sets the field as stored, the 433 | /// content of the field can be later restored from a Searcher. 434 | /// Defaults to False. 435 | /// indexed (bool, optional): If true sets the field to be indexed. 436 | /// fast (bool, optional): Set the IP address options as a fast field. A 437 | /// fast field is a column-oriented fashion storage for tantivy. It 438 | /// is designed for the fast random access of some document fields 439 | /// given a document id. 
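    ///
    /// A hedged example (field name and address are illustrative):
    ///
    /// >>> builder.add_ip_addr_field("client_ip", stored=True, indexed=True, fast=True)
    /// >>> # later, when building a document:
    /// >>> # doc.add_ip_addr("client_ip", "10.0.0.1")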
440 | #[pyo3(signature = ( 441 | name, 442 | stored = false, 443 | indexed = false, 444 | fast = false 445 | ))] 446 | fn add_ip_addr_field( 447 | &mut self, 448 | name: &str, 449 | stored: bool, 450 | indexed: bool, 451 | fast: bool, 452 | ) -> PyResult { 453 | let builder = &mut self.builder; 454 | let mut opts = IpAddrOptions::default(); 455 | if stored { 456 | opts = opts.set_stored(); 457 | } 458 | if indexed { 459 | opts = opts.set_indexed(); 460 | } 461 | if fast { 462 | opts = opts.set_fast(); 463 | } 464 | 465 | if let Some(builder) = builder.write().unwrap().as_mut() { 466 | builder.add_ip_addr_field(name, opts); 467 | } else { 468 | return Err(exceptions::PyValueError::new_err( 469 | "Schema builder object isn't valid anymore.", 470 | )); 471 | } 472 | 473 | Ok(self.clone()) 474 | } 475 | 476 | /// Finalize the creation of a Schema. 477 | /// 478 | /// Returns a Schema object. After this is called the SchemaBuilder cannot 479 | /// be used anymore. 480 | fn build(&mut self) -> PyResult { 481 | let builder = self.builder.write().unwrap().take(); 482 | if let Some(builder) = builder { 483 | let schema = builder.build(); 484 | Ok(Schema { inner: schema }) 485 | } else { 486 | Err(exceptions::PyValueError::new_err( 487 | "Schema builder object isn't valid anymore.", 488 | )) 489 | } 490 | } 491 | } 492 | 493 | impl SchemaBuilder { 494 | fn build_numeric_option( 495 | stored: bool, 496 | indexed: bool, 497 | fast: bool, 498 | ) -> PyResult { 499 | let opts = schema::NumericOptions::default(); 500 | let opts = if stored { opts.set_stored() } else { opts }; 501 | let opts = if indexed { opts.set_indexed() } else { opts }; 502 | let opts = if fast { opts.set_fast() } else { opts }; 503 | Ok(opts) 504 | } 505 | 506 | fn build_text_option( 507 | stored: bool, 508 | fast: bool, 509 | tokenizer_name: &str, 510 | index_option: &str, 511 | ) -> PyResult { 512 | let index_option = match index_option { 513 | "position" => schema::IndexRecordOption::WithFreqsAndPositions, 514 | "freq" => schema::IndexRecordOption::WithFreqs, 515 | "basic" => schema::IndexRecordOption::Basic, 516 | _ => return Err(exceptions::PyValueError::new_err( 517 | "Invalid index option, valid choices are: 'basic', 'freq' and 'position'" 518 | )) 519 | }; 520 | 521 | let indexing = schema::TextFieldIndexing::default() 522 | .set_tokenizer(tokenizer_name) 523 | .set_index_option(index_option); 524 | 525 | let options = 526 | schema::TextOptions::default().set_indexing_options(indexing); 527 | let options = if stored { 528 | options.set_stored() 529 | } else { 530 | options 531 | }; 532 | 533 | let options = if fast { 534 | let text_tokenizer = if tokenizer_name != NO_TOKENIZER_NAME { 535 | Some(tokenizer_name) 536 | } else { 537 | None 538 | }; 539 | options.set_fast(text_tokenizer) 540 | } else { 541 | options 542 | }; 543 | 544 | Ok(options) 545 | } 546 | } 547 | -------------------------------------------------------------------------------- /src/searcher.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::new_ret_no_self)] 2 | 3 | use crate::{document::Document, query::Query, to_pyerr}; 4 | use pyo3::types::PyDict; 5 | use pyo3::IntoPyObjectExt; 6 | use pyo3::{basic::CompareOp, exceptions::PyValueError, prelude::*}; 7 | use serde::{Deserialize, Serialize}; 8 | use tantivy as tv; 9 | use tantivy::aggregation::AggregationCollector; 10 | use tantivy::collector::{Count, MultiCollector, TopDocs}; 11 | use tantivy::TantivyDocument; 12 | // Bring the trait into scope. 
This is required for the `to_named_doc` method.
13 | // However, tantivy-py declares its own `Document` class, so we need to avoid
14 | // introducing the `Document` trait into the namespace.
15 | use tantivy::Document as _;
16 |
17 | /// Tantivy's Searcher class
18 | ///
19 | /// A Searcher is used to search the index given a prepared Query.
20 | #[pyclass(module = "tantivy.tantivy")]
21 | pub(crate) struct Searcher {
22 |     pub(crate) inner: tv::Searcher,
23 | }
24 |
25 | #[derive(
26 |     Clone, Deserialize, PartialEq, Serialize, FromPyObject, IntoPyObject,
27 | )]
28 | enum Fruit {
29 |     #[pyo3(transparent)]
30 |     Score(f32),
31 |     #[pyo3(transparent)]
32 |     Order(u64),
33 | }
34 |
35 | impl std::fmt::Debug for Fruit {
36 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37 |         match self {
38 |             Fruit::Score(s) => f.write_str(&format!("{s}")),
39 |             Fruit::Order(o) => f.write_str(&format!("{o}")),
40 |         }
41 |     }
42 | }
43 |
44 | #[pyclass(frozen, module = "tantivy.tantivy")]
45 | #[derive(Clone, Copy, Deserialize, PartialEq, Serialize)]
46 | /// Enum representing the direction in which something should be sorted.
47 | pub(crate) enum Order {
48 |     /// Ascending. Smaller values appear first.
49 |     Asc,
50 |
51 |     /// Descending. Larger values appear first.
52 |     Desc,
53 | }
54 |
55 | impl From<Order> for tv::Order {
56 |     fn from(order: Order) -> Self {
57 |         match order {
58 |             Order::Asc => tv::Order::Asc,
59 |             Order::Desc => tv::Order::Desc,
60 |         }
61 |     }
62 | }
63 |
64 | #[pyclass(frozen, module = "tantivy.tantivy")]
65 | #[derive(Clone, Default, Deserialize, PartialEq, Serialize)]
66 | /// Object holding the results of a successful search.
67 | pub(crate) struct SearchResult {
68 |     hits: Vec<(Fruit, DocAddress)>,
69 |     #[pyo3(get)]
70 |     /// How many documents matched the query. Only available if `count` was set
71 |     /// to true during the search.
72 |     count: Option<usize>,
73 | }
74 |
75 | #[pymethods]
76 | impl SearchResult {
77 |     #[new]
78 |     fn new(
79 |         py: Python,
80 |         hits: Vec<(PyObject, DocAddress)>,
81 |         count: Option<usize>,
82 |     ) -> PyResult<Self> {
83 |         let hits = hits
84 |             .iter()
85 |             .map(|(f, d)| Ok((f.extract(py)?, d.clone())))
86 |             .collect::<PyResult<Vec<_>>>()?;
87 |         Ok(Self { hits, count })
88 |     }
89 |
90 |     fn __repr__(&self) -> PyResult<String> {
91 |         if let Some(count) = self.count {
92 |             Ok(format!(
93 |                 "SearchResult(hits: {:?}, count: {})",
94 |                 self.hits, count
95 |             ))
96 |         } else {
97 |             Ok(format!("SearchResult(hits: {:?})", self.hits))
98 |         }
99 |     }
100 |
101 |     fn __richcmp__(
102 |         &self,
103 |         other: &Self,
104 |         op: CompareOp,
105 |         py: Python<'_>,
106 |     ) -> PyResult<PyObject> {
107 |         match op {
108 |             CompareOp::Eq => (self == other).into_py_any(py),
109 |             CompareOp::Ne => (self != other).into_py_any(py),
110 |             _ => Ok(py.NotImplemented()),
111 |         }
112 |     }
113 |
114 |     fn __getnewargs__(
115 |         &self,
116 |         py: Python,
117 |     ) -> PyResult<(Vec<(PyObject, DocAddress)>, Option<usize>)> {
118 |         Ok((self.hits(py)?, self.count))
119 |     }
120 |
121 |     #[getter]
122 |     /// The list of tuples that contains the scores and DocAddress of the
123 |     /// search results.
124 |     fn hits(&self, py: Python) -> PyResult<Vec<(PyObject, DocAddress)>> {
125 |         let ret = self
126 |             .hits
127 |             .iter()
128 |             .map(|(result, address)| -> PyResult<_> {
129 |                 Ok((result.clone().into_py_any(py)?, address.clone()))
130 |             })
131 |             .collect::<PyResult<Vec<_>>>()?;
132 |         Ok(ret)
133 |     }
134 | }
135 |
136 | #[pymethods]
137 | impl Searcher {
138 |     /// Search the index with the given query and collect results.
139 |     ///
140 |     /// Args:
141 |     ///     query (Query): The query that will be used for the search.
142 | /// limit (int, optional): The maximum number of search results to 143 | /// return. Defaults to 10. 144 | /// count (bool, optional): Should the number of documents that match 145 | /// the query be returned as well. Defaults to true. 146 | /// order_by_field (Field, optional): A schema field that the results 147 | /// should be ordered by. The field must be declared as a fast field 148 | /// when building the schema. Note, this only works for unsigned 149 | /// fields. 150 | /// offset (Field, optional): The offset from which the results have 151 | /// to be returned. 152 | /// order (Order, optional): The order in which the results 153 | /// should be sorted. If not specified, defaults to descending. 154 | /// 155 | /// Returns `SearchResult` object. 156 | /// 157 | /// Raises a ValueError if there was an error with the search. 158 | #[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0, order = Order::Desc))] 159 | #[allow(clippy::too_many_arguments)] 160 | fn search( 161 | &self, 162 | py: Python, 163 | query: &Query, 164 | limit: usize, 165 | count: bool, 166 | order_by_field: Option<&str>, 167 | offset: usize, 168 | order: Order, 169 | ) -> PyResult { 170 | py.allow_threads(move || { 171 | let mut multicollector = MultiCollector::new(); 172 | 173 | let count_handle = if count { 174 | Some(multicollector.add_collector(Count)) 175 | } else { 176 | None 177 | }; 178 | 179 | let (mut multifruit, hits) = { 180 | if let Some(order_by) = order_by_field { 181 | let collector = TopDocs::with_limit(limit) 182 | .and_offset(offset) 183 | .order_by_u64_field(order_by, order.into()); 184 | let top_docs_handle = 185 | multicollector.add_collector(collector); 186 | let ret = self.inner.search(query.get(), &multicollector); 187 | 188 | match ret { 189 | Ok(mut r) => { 190 | let top_docs = top_docs_handle.extract(&mut r); 191 | let result: Vec<(Fruit, DocAddress)> = top_docs 192 | .iter() 193 | .map(|(f, d)| { 194 | (Fruit::Order(*f), DocAddress::from(d)) 195 | }) 196 | .collect(); 197 | (r, result) 198 | } 199 | Err(e) => { 200 | return Err(PyValueError::new_err(e.to_string())) 201 | } 202 | } 203 | } else { 204 | let collector = 205 | TopDocs::with_limit(limit).and_offset(offset); 206 | let top_docs_handle = 207 | multicollector.add_collector(collector); 208 | let ret = self.inner.search(query.get(), &multicollector); 209 | 210 | match ret { 211 | Ok(mut r) => { 212 | let top_docs = top_docs_handle.extract(&mut r); 213 | let result: Vec<(Fruit, DocAddress)> = top_docs 214 | .iter() 215 | .map(|(f, d)| { 216 | (Fruit::Score(*f), DocAddress::from(d)) 217 | }) 218 | .collect(); 219 | (r, result) 220 | } 221 | Err(e) => { 222 | return Err(PyValueError::new_err(e.to_string())) 223 | } 224 | } 225 | } 226 | }; 227 | 228 | let count = count_handle.map(|h| h.extract(&mut multifruit)); 229 | 230 | Ok(SearchResult { hits, count }) 231 | }) 232 | } 233 | 234 | #[pyo3(signature = (query, agg))] 235 | fn aggregate( 236 | &self, 237 | py: Python, 238 | query: &Query, 239 | agg: Py, 240 | ) -> PyResult> { 241 | let py_json = py.import("json")?; 242 | let agg_query_str = py_json.call_method1("dumps", (agg,))?.to_string(); 243 | 244 | let agg_str = py.allow_threads(move || { 245 | let agg_collector = AggregationCollector::from_aggs( 246 | serde_json::from_str(&agg_query_str).map_err(to_pyerr)?, 247 | Default::default(), 248 | ); 249 | let agg_res = self 250 | .inner 251 | .search(query.get(), &agg_collector) 252 | .map_err(to_pyerr)?; 253 | 254 | 
serde_json::to_string(&agg_res).map_err(to_pyerr)
255 |         })?;
256 |
257 |         let agg_dict = py_json.call_method1("loads", (agg_str,))?;
258 |         let agg_dict = agg_dict.downcast::<PyDict>()?;
259 |
260 |         Ok(agg_dict.clone().unbind())
261 |     }
262 |
263 |     /// Returns the overall number of documents in the index.
264 |     #[getter]
265 |     fn num_docs(&self) -> u64 {
266 |         self.inner.num_docs()
267 |     }
268 |
269 |     /// Returns the number of segments in the index.
270 |     #[getter]
271 |     fn num_segments(&self) -> usize {
272 |         self.inner.segment_readers().len()
273 |     }
274 |
275 |     /// Return the overall number of documents containing
276 |     /// the given term.
277 |     #[pyo3(signature = (field_name, field_value))]
278 |     fn doc_freq(
279 |         &self,
280 |         field_name: &str,
281 |         field_value: &Bound<PyAny>,
282 |     ) -> PyResult<u64> {
283 |         // Wrap the tantivy Searcher `doc_freq` method to return a PyResult.
284 |         let schema = self.inner.schema();
285 |         let term = crate::make_term(schema, field_name, field_value)?;
286 |         self.inner.doc_freq(&term).map_err(to_pyerr)
287 |     }
288 |
289 |     /// Fetches a document from Tantivy's store given a DocAddress.
290 |     ///
291 |     /// Args:
292 |     ///     doc_address (DocAddress): The DocAddress that is associated with
293 |     ///         the document that we wish to fetch.
294 |     ///
295 |     /// Returns the Document, raises ValueError if the document can't be found.
296 |     fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
297 |         let doc: TantivyDocument =
298 |             self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
299 |         let named_doc = doc.to_named_doc(self.inner.schema());
300 |         Ok(crate::document::Document {
301 |             field_values: named_doc.0,
302 |         })
303 |     }
304 |
305 |     fn __repr__(&self) -> PyResult<String> {
306 |         Ok(format!(
307 |             "Searcher(num_docs={}, num_segments={})",
308 |             self.inner.num_docs(),
309 |             self.inner.segment_readers().len()
310 |         ))
311 |     }
312 | }
313 |
314 | /// DocAddress contains all the necessary information to identify a document
315 | /// given a Searcher object.
316 | ///
317 | /// It consists of an id identifying its segment, and its segment-local DocId.
318 | /// The id used for the segment is actually an ordinal in the list of segments
319 | /// held by a Searcher.
320 | #[pyclass(frozen, module = "tantivy.tantivy")]
321 | #[derive(
322 |     Clone, Debug, Deserialize, PartialEq, PartialOrd, Eq, Ord, Serialize,
323 | )]
324 | pub(crate) struct DocAddress {
325 |     pub(crate) segment_ord: tv::SegmentOrdinal,
326 |     pub(crate) doc: tv::DocId,
327 | }
328 |
329 | #[pymethods]
330 | impl DocAddress {
331 |     #[new]
332 |     fn new(segment_ord: tv::SegmentOrdinal, doc: tv::DocId) -> Self {
333 |         DocAddress { segment_ord, doc }
334 |     }
335 |
336 |     /// The segment ordinal is an id identifying the segment hosting the
337 |     /// document. It is only meaningful in the context of a searcher.
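    ///
    /// A hedged sketch of typical use (index and query setup omitted):
    ///
    /// >>> result = searcher.search(query, limit=10)
    /// >>> for _score, doc_address in result.hits:
    /// ...     doc = searcher.doc(doc_address)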
338 | #[getter] 339 | fn segment_ord(&self) -> u32 { 340 | self.segment_ord 341 | } 342 | 343 | /// The segment local DocId 344 | #[getter] 345 | fn doc(&self) -> u32 { 346 | self.doc 347 | } 348 | 349 | fn __richcmp__( 350 | &self, 351 | other: &Self, 352 | op: CompareOp, 353 | _py: Python<'_>, 354 | ) -> bool { 355 | op.matches(self.cmp(other)) 356 | } 357 | 358 | fn __getnewargs__(&self) -> PyResult<(tv::SegmentOrdinal, tv::DocId)> { 359 | Ok((self.segment_ord, self.doc)) 360 | } 361 | } 362 | 363 | impl From<&tv::DocAddress> for DocAddress { 364 | fn from(doc_address: &tv::DocAddress) -> Self { 365 | DocAddress { 366 | segment_ord: doc_address.segment_ord, 367 | doc: doc_address.doc_id, 368 | } 369 | } 370 | } 371 | 372 | impl From<&DocAddress> for tv::DocAddress { 373 | fn from(val: &DocAddress) -> Self { 374 | tv::DocAddress { 375 | segment_ord: val.segment_ord(), 376 | doc_id: val.doc(), 377 | } 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/snippet.rs: -------------------------------------------------------------------------------- 1 | use crate::to_pyerr; 2 | use pyo3::prelude::*; 3 | use tantivy as tv; 4 | // Bring the trait into scope to use methods like `as_str()` on `OwnedValue`. 5 | use tantivy::schema::Value; 6 | 7 | /// Tantivy Snippet 8 | /// 9 | /// Snippet contains a fragment of a document, and some highlighted 10 | /// parts inside it. 11 | #[pyclass(module = "tantivy.tantivy")] 12 | pub(crate) struct Snippet { 13 | pub(crate) inner: tv::snippet::Snippet, 14 | } 15 | 16 | #[pyclass(module = "tantivy.tantivy")] 17 | pub(crate) struct Range { 18 | #[pyo3(get)] 19 | start: usize, 20 | #[pyo3(get)] 21 | end: usize, 22 | } 23 | 24 | #[pymethods] 25 | impl Snippet { 26 | pub fn to_html(&self) -> PyResult { 27 | Ok(self.inner.to_html()) 28 | } 29 | 30 | pub fn highlighted(&self) -> Vec { 31 | let highlighted = self.inner.highlighted(); 32 | let results = highlighted 33 | .iter() 34 | .map(|r| Range { 35 | start: r.start, 36 | end: r.end, 37 | }) 38 | .collect::>(); 39 | results 40 | } 41 | 42 | pub fn fragment(&self) -> PyResult { 43 | Ok(self.inner.fragment().to_string()) 44 | } 45 | } 46 | 47 | #[pyclass(module = "tantivy.tantivy")] 48 | pub(crate) struct SnippetGenerator { 49 | pub(crate) field_name: String, 50 | pub(crate) inner: tv::snippet::SnippetGenerator, 51 | } 52 | 53 | #[pymethods] 54 | impl SnippetGenerator { 55 | #[staticmethod] 56 | pub fn create( 57 | searcher: &crate::Searcher, 58 | query: &crate::Query, 59 | schema: &crate::Schema, 60 | field_name: &str, 61 | ) -> PyResult { 62 | let field = schema 63 | .inner 64 | .get_field(field_name) 65 | .or(Err("field not found")) 66 | .map_err(to_pyerr)?; 67 | let generator = tv::snippet::SnippetGenerator::create( 68 | &searcher.inner, 69 | query.get(), 70 | field, 71 | ) 72 | .map_err(to_pyerr)?; 73 | 74 | Ok(SnippetGenerator { 75 | field_name: field_name.to_string(), 76 | inner: generator, 77 | }) 78 | } 79 | 80 | pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet { 81 | let text: String = doc 82 | .iter_values_for_field(&self.field_name) 83 | .flat_map(|ov| ov.as_str()) 84 | .collect::>() 85 | .join(" "); 86 | 87 | let result = self.inner.snippet(&text); 88 | Snippet { inner: result } 89 | } 90 | 91 | pub fn set_max_num_chars(&mut self, max_num_chars: usize) { 92 | self.inner.set_max_num_chars(max_num_chars); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/tokenizer.rs: 
--------------------------------------------------------------------------------
1 | use pyo3::{exceptions::PyValueError, prelude::*};
2 | use tantivy::tokenizer as tvt;
3 |
4 | /// All Tantivy's built-in tokenizers in one place.
5 | /// Each static method, e.g. Tokenizer.simple(),
6 | /// creates a wrapper around a Tantivy tokenizer.
7 | ///
8 | /// ## Example:
9 | ///
10 | /// ```python
11 | /// tokenizer = Tokenizer.regex(r"\w+")
12 | /// ```
13 | ///
14 | /// ## Usage
15 | ///
16 | /// In general, tokenizer objects' only reason
17 | /// for existing is to be passed to
18 | /// TextAnalyzerBuilder(tokenizer=...)
19 | ///
20 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/index.html
21 | ///
22 | // ## Implementation details:
23 | //
24 | // This is a complex enum. Each variant is a struct
25 | // that defines the arguments accepted by the
26 | // corresponding tokenizer's constructor.
27 | // The enum members, e.g. _Raw, are not instantiated
28 | // directly because our version of pyo3 (0.21.0)
29 | // does not have the #[pyo3(constructor = ...)]
30 | // attribute yet, making it more sensible to
31 | // define constructor signatures using a separate method.
32 | #[pyclass(module = "tantivy.tokenizer")]
33 | #[derive(Debug)]
34 | pub enum Tokenizer {
35 |     _Raw {},
36 |     _Simple {},
37 |     _Whitespace {},
38 |     _Regex {
39 |         pattern: String,
40 |     },
41 |     _Ngram {
42 |         min_gram: usize,
43 |         max_gram: usize,
44 |         prefix_only: bool,
45 |     },
46 |     _Facet {},
47 | }
48 |
49 | #[pymethods]
50 | impl Tokenizer {
51 |     /// SimpleTokenizer
52 |     #[staticmethod]
53 |     fn simple() -> PyResult<Tokenizer> {
54 |         Ok(Tokenizer::_Simple {})
55 |     }
56 |
57 |     /// WhitespaceTokenizer
58 |     #[staticmethod]
59 |     fn whitespace() -> PyResult<Tokenizer> {
60 |         Ok(Tokenizer::_Whitespace {})
61 |     }
62 |
63 |     /// Raw Tokenizer
64 |     #[staticmethod]
65 |     fn raw() -> PyResult<Tokenizer> {
66 |         Ok(Tokenizer::_Raw {})
67 |     }
68 |
69 |     /// FacetTokenizer
70 |     #[staticmethod]
71 |     fn facet() -> PyResult<Tokenizer> {
72 |         Ok(Tokenizer::_Facet {})
73 |     }
74 |
75 |     /// RegexTokenizer
76 |     #[staticmethod]
77 |     fn regex(pattern: String) -> PyResult<Tokenizer> {
78 |         Ok(Tokenizer::_Regex { pattern })
79 |     }
80 |
81 |     /// NgramTokenizer
82 |     ///
83 |     /// Args:
84 |     ///   - min_gram (int): Minimum character length of each ngram.
85 |     ///   - max_gram (int): Maximum character length of each ngram.
86 |     ///   - prefix_only (bool, optional): If true, ngrams must count from the start of the word.
87 |     #[pyo3(signature=(min_gram=2,max_gram=3,prefix_only=false))]
88 |     #[staticmethod]
89 |     fn ngram(
90 |         min_gram: usize,
91 |         max_gram: usize,
92 |         prefix_only: bool,
93 |     ) -> PyResult<Tokenizer> {
94 |         Ok(Tokenizer::_Ngram {
95 |             min_gram,
96 |             max_gram,
97 |             prefix_only,
98 |         })
99 |     }
100 |
101 |     fn __repr__(&self) -> String {
102 |         format!("tantivy.Tokenizer({:?})", &self)
103 |     }
104 | }
105 |
106 | /// All Tantivy's builtin TokenFilters.
107 | ///
108 | /// ## Example
109 | ///
110 | /// ```python
111 | /// filter = Filter.alphanum_only()
112 | /// ```
113 | ///
114 | /// ## Usage
115 | ///
116 | /// In general, filter objects exist to
117 | /// be passed to the filter() method
118 | /// of a TextAnalyzerBuilder instance.
119 | ///
120 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/index.html
121 | ///
122 | // ## Implementation details:
123 | //
124 | // This is a complex enum. Each variant is a struct
125 | // that defines the arguments accepted by the
126 | // corresponding tokenizer's constructor.
127 | // The enum members, e.g.
_AlphaNum, are not instantiated 128 | // directly because our version of pyo3 (0.21.0) 129 | // does not have the #[pyo3(constructor = ...)], 130 | // attribute yet, making it more sensible to 131 | // define constructor signatures using a separate method. 132 | #[pyclass(module = "tantivy.tokenizer")] 133 | #[derive(Debug)] 134 | pub enum Filter { 135 | _AlphaNumOnly {}, 136 | _AsciiFolding {}, 137 | _LowerCaser {}, 138 | _RemoveLong { length_limit: usize }, 139 | _Stemmer { language: String }, 140 | _StopWord { language: String }, 141 | _CustomStopWord { stopwords: Vec }, 142 | _SplitCompound { constituent_words: Vec }, 143 | } 144 | 145 | #[pymethods] 146 | impl Filter { 147 | /// AlphaNumOnlyFilter 148 | #[staticmethod] 149 | fn alphanum_only() -> PyResult { 150 | Ok(Filter::_AlphaNumOnly {}) 151 | } 152 | 153 | /// AsciiFoldingFilter 154 | #[staticmethod] 155 | fn ascii_fold() -> PyResult { 156 | Ok(Filter::_AsciiFolding {}) 157 | } 158 | 159 | #[staticmethod] 160 | fn lowercase() -> PyResult { 161 | Ok(Filter::_LowerCaser {}) 162 | } 163 | 164 | /// RemoveLongFilter 165 | /// 166 | /// Args: 167 | /// - length_limit (int): max character length of token. 168 | #[staticmethod] 169 | fn remove_long(length_limit: usize) -> PyResult { 170 | Ok(Filter::_RemoveLong { length_limit }) 171 | } 172 | 173 | /// Stemmer 174 | #[staticmethod] 175 | fn stemmer(language: String) -> PyResult { 176 | Ok(Filter::_Stemmer { language }) 177 | } 178 | 179 | /// StopWordFilter (builtin stop word list) 180 | /// 181 | /// Args: 182 | /// - language (string): Stop words list language. 183 | /// Valid values: { 184 | /// "arabic", "danish", "dutch", "english", "finnish", "french", "german", "greek", 185 | /// "hungarian", "italian", "norwegian", "portuguese", "romanian", "russian", 186 | /// "spanish", "swedish", "tamil", "turkish" 187 | /// } 188 | // ## Implementation notes: 189 | // An enum would make more sense for `language`, but I'm not sure if it's worth it. 190 | #[staticmethod] 191 | fn stopword(language: String) -> PyResult { 192 | Ok(Filter::_StopWord { language }) 193 | } 194 | 195 | /// StopWordFilter (user-provided stop word list) 196 | /// 197 | /// This variant of Filter.stopword() lets you provide 198 | /// your own custom list of stopwords. 199 | /// 200 | /// Args: 201 | /// - stopwords (list(str)): a list of words to be removed. 202 | #[staticmethod] 203 | fn custom_stopword(stopwords: Vec) -> PyResult { 204 | Ok(Filter::_CustomStopWord { stopwords }) 205 | } 206 | 207 | /// SplitCompoundWords 208 | /// 209 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/struct.SplitCompoundWords.html 210 | /// 211 | /// Args: 212 | /// - constituent_words (list(string)): words that make up compound word (must be in order). 
213 |     ///
214 |     /// Example:
215 |     ///
216 |     /// ```python
217 |     /// # useless, contrived example:
218 |     /// compound_splitter = Filter.split_compound(['butter', 'fly'])
219 |     /// # Will split 'butterfly' -> ['butter', 'fly'],
220 |     /// # but won't split 'buttering' or 'buttercupfly'
221 |     /// ```
222 |     #[staticmethod]
223 |     fn split_compound(constituent_words: Vec<String>) -> PyResult<Filter> {
224 |         Ok(Filter::_SplitCompound { constituent_words })
225 |     }
226 |
227 |     fn __repr__(&self) -> String {
228 |         format!("tantivy.Filter(kind={:?})", &self)
229 |     }
230 | }
231 |
232 | fn parse_language(lang: &str) -> Result<tvt::Language, String> {
233 |     match lang.to_lowercase().as_str() {
234 |         "arabic" => Ok(tvt::Language::Arabic),
235 |         "danish" => Ok(tvt::Language::Danish),
236 |         "dutch" => Ok(tvt::Language::Dutch),
237 |         "english" => Ok(tvt::Language::English),
238 |         "finnish" => Ok(tvt::Language::Finnish),
239 |         "french" => Ok(tvt::Language::French),
240 |         "german" => Ok(tvt::Language::German),
241 |         "greek" => Ok(tvt::Language::Greek),
242 |         "hungarian" => Ok(tvt::Language::Hungarian),
243 |         "italian" => Ok(tvt::Language::Italian),
244 |         "norwegian" => Ok(tvt::Language::Norwegian),
245 |         "portuguese" => Ok(tvt::Language::Portuguese),
246 |         "romanian" => Ok(tvt::Language::Romanian),
247 |         "russian" => Ok(tvt::Language::Russian),
248 |         "spanish" => Ok(tvt::Language::Spanish),
249 |         "swedish" => Ok(tvt::Language::Swedish),
250 |         "tamil" => Ok(tvt::Language::Tamil),
251 |         "turkish" => Ok(tvt::Language::Turkish),
252 |         _ => Err(format!("Unsupported language: {}", lang)),
253 |     }
254 | }
255 |
256 | /// Tantivy's TextAnalyzer
257 | ///
258 | /// Do not instantiate this class directly.
259 | /// Use the `TextAnalyzerBuilder` class instead.
260 | #[derive(Clone)]
261 | #[pyclass(module = "tantivy.tantivy")]
262 | pub(crate) struct TextAnalyzer {
263 |     pub(crate) analyzer: tvt::TextAnalyzer,
264 | }
265 |
266 | #[pymethods]
267 | impl TextAnalyzer {
268 |     /// Tokenize a string
269 |     /// Args:
270 |     ///   - text (string): text to tokenize.
271 |     /// Returns:
272 |     ///   - list(string): a list of tokens/words.
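    ///
    /// A hedged example (token output assumed from the simple tokenizer
    /// plus a lowercase filter):
    ///
    /// ```python
    /// analyzer = (
    ///     TextAnalyzerBuilder(Tokenizer.simple())
    ///     .filter(Filter.lowercase())
    ///     .build()
    /// )
    /// analyzer.analyze("Hello, World!")  # ["hello", "world"]
    /// ```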
273 | fn analyze(&mut self, text: &str) -> Vec { 274 | let mut token_stream = self.analyzer.token_stream(text); 275 | let mut tokens = Vec::new(); 276 | 277 | while token_stream.advance() { 278 | tokens.push(token_stream.token().text.clone()); 279 | } 280 | tokens 281 | } 282 | } 283 | 284 | /// Tantivy's TextAnalyzerBuilder 285 | /// 286 | /// # Example 287 | /// 288 | /// ```python 289 | /// my_analyzer: TextAnalyzer = ( 290 | /// TextAnalyzerBuilder(Tokenizer.simple()) 291 | /// .filter(Filter.lowercase()) 292 | /// .filter(Filter.ngram()) 293 | /// .build() 294 | /// ) 295 | /// ``` 296 | /// 297 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/struct.TextAnalyzerBuilder.html 298 | #[pyclass] 299 | pub struct TextAnalyzerBuilder { 300 | builder: Option, 301 | } 302 | 303 | #[pymethods] 304 | impl TextAnalyzerBuilder { 305 | #[new] 306 | fn new(tokenizer: &Tokenizer) -> PyResult { 307 | let builder: tvt::TextAnalyzerBuilder = match tokenizer { 308 | Tokenizer::_Raw {} => { 309 | tvt::TextAnalyzer::builder(tvt::RawTokenizer::default()) 310 | .dynamic() 311 | } 312 | Tokenizer::_Simple {} => { 313 | tvt::TextAnalyzer::builder(tvt::SimpleTokenizer::default()) 314 | .dynamic() 315 | } 316 | Tokenizer::_Whitespace {} => { 317 | tvt::TextAnalyzer::builder(tvt::WhitespaceTokenizer::default()) 318 | .dynamic() 319 | } 320 | Tokenizer::_Regex { pattern } => tvt::TextAnalyzer::builder( 321 | tvt::RegexTokenizer::new(pattern).map_err(|e| { 322 | PyErr::new::(format!( 323 | "Invalid regex pattern: {}", 324 | e 325 | )) 326 | })?, // tvt::RegexTokenizer::new(pattern) .unwrap(), 327 | ) 328 | .dynamic(), 329 | Tokenizer::_Ngram { 330 | min_gram, 331 | max_gram, 332 | prefix_only, 333 | } => tvt::TextAnalyzer::builder( 334 | tvt::NgramTokenizer::new(*min_gram, *max_gram, *prefix_only) 335 | .unwrap(), 336 | ) 337 | .dynamic(), 338 | Tokenizer::_Facet {} => { 339 | tvt::TextAnalyzer::builder(tvt::FacetTokenizer::default()) 340 | .dynamic() 341 | } 342 | }; 343 | 344 | Ok(TextAnalyzerBuilder { 345 | builder: Some(builder.dynamic()), 346 | }) 347 | } 348 | 349 | /// Add filter to the builder. 350 | /// 351 | /// Args: 352 | /// - filter (Filter): a Filter object. 353 | /// Returns: 354 | /// - TextAnalyzerBuilder: A new instance of the builder 355 | /// 356 | /// Note: The builder is _not_ mutated in-place. 
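    ///
    /// A hedged sketch; note the rebinding, since each call returns a
    /// new builder rather than mutating the receiver:
    ///
    /// ```python
    /// builder = TextAnalyzerBuilder(Tokenizer.whitespace())
    /// builder = builder.filter(Filter.lowercase())
    /// builder = builder.filter(Filter.custom_stopword(["the", "a"]))
    /// ```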
357 | fn filter(&mut self, filter: &Filter) -> PyResult { 358 | if let Some(builder) = self.builder.take() { 359 | let new_builder: tvt::TextAnalyzerBuilder = match filter { 360 | Filter::_AlphaNumOnly {} => { 361 | builder.filter_dynamic(tvt::AlphaNumOnlyFilter {}) 362 | } 363 | Filter::_AsciiFolding {} => { 364 | builder.filter_dynamic(tvt::AsciiFoldingFilter) 365 | } 366 | Filter::_LowerCaser {} => { 367 | builder.filter_dynamic(tvt::LowerCaser) 368 | } 369 | Filter::_RemoveLong { length_limit } => builder.filter_dynamic( 370 | tvt::RemoveLongFilter::limit(*length_limit), 371 | ), 372 | Filter::_Stemmer { language } => { 373 | match parse_language(language) { 374 | Ok(lang) => { 375 | builder.filter_dynamic(tvt::Stemmer::new(lang)) 376 | } 377 | Err(e) => { 378 | return Err(PyErr::new::< 379 | pyo3::exceptions::PyValueError, 380 | _, 381 | >(e)) 382 | } 383 | } 384 | } 385 | Filter::_StopWord { language } => { 386 | match parse_language(language) { 387 | Ok(lang) => builder.filter_dynamic( 388 | tvt::StopWordFilter::new(lang).unwrap(), 389 | ), 390 | Err(e) => { 391 | return Err(PyErr::new::< 392 | pyo3::exceptions::PyValueError, 393 | _, 394 | >(e)) 395 | } 396 | } 397 | } 398 | Filter::_CustomStopWord { stopwords } => builder 399 | .filter_dynamic(tvt::StopWordFilter::remove( 400 | stopwords.clone(), 401 | )), 402 | Filter::_SplitCompound { constituent_words } => builder 403 | .filter_dynamic( 404 | tvt::SplitCompoundWords::from_dictionary( 405 | constituent_words, 406 | ) 407 | .unwrap(), 408 | ), 409 | }; 410 | Ok(TextAnalyzerBuilder { 411 | builder: Some(new_builder), 412 | }) 413 | } else { 414 | Err(PyErr::new::( 415 | "Builder has already been consumed", 416 | )) 417 | } 418 | } 419 | 420 | /// Build final TextAnalyzer object. 421 | /// 422 | /// Returns: 423 | /// - TextAnalyzer with tokenizer and filters baked in. 424 | /// 425 | /// Tip: TextAnalyzer's `analyze(text) -> tokens` method lets you 426 | /// easily check if your analyzer is working as expected. 
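    ///
    /// A hedged continuation of the builder sketch above (output assumed):
    ///
    /// ```python
    /// analyzer = builder.build()
    /// analyzer.analyze("The quick brown fox")  # ["quick", "brown", "fox"]
    /// ```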
427 | fn build(&mut self) -> PyResult { 428 | if let Some(builder) = self.builder.take() { 429 | Ok(TextAnalyzer { 430 | analyzer: builder.build(), 431 | }) 432 | } else { 433 | Err(PyErr::new::( 434 | "Builder has already been consumed", 435 | )) 436 | } 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /tantivy/__init__.py: -------------------------------------------------------------------------------- 1 | from .tantivy import * -------------------------------------------------------------------------------- /tantivy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quickwit-oss/tantivy-py/23871c1aa2752010b34df117405ccb5da37e94ba/tantivy/py.typed -------------------------------------------------------------------------------- /tantivy/tantivy.pyi: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | from typing import Any, Optional, Sequence, TypeVar, Union 4 | 5 | 6 | class Schema: 7 | pass 8 | 9 | 10 | class SchemaBuilder: 11 | @staticmethod 12 | def is_valid_field_name(name: str) -> bool: 13 | pass 14 | 15 | def add_text_field( 16 | self, 17 | name: str, 18 | stored: bool = False, 19 | fast: bool = False, 20 | tokenizer_name: str = "default", 21 | index_option: str = "position", 22 | ) -> SchemaBuilder: 23 | pass 24 | 25 | def add_integer_field( 26 | self, 27 | name: str, 28 | stored: bool = False, 29 | indexed: bool = False, 30 | fast: bool = False, 31 | ) -> SchemaBuilder: 32 | pass 33 | 34 | def add_float_field( 35 | self, 36 | name: str, 37 | stored: bool = False, 38 | indexed: bool = False, 39 | fast: bool = False, 40 | ) -> SchemaBuilder: 41 | pass 42 | 43 | def add_unsigned_field( 44 | self, 45 | name: str, 46 | stored: bool = False, 47 | indexed: bool = False, 48 | fast: bool = False, 49 | ) -> SchemaBuilder: 50 | pass 51 | 52 | def add_boolean_field( 53 | self, 54 | name: str, 55 | stored: bool = False, 56 | indexed: bool = False, 57 | fast: bool = False, 58 | ) -> SchemaBuilder: 59 | pass 60 | 61 | def add_date_field( 62 | self, 63 | name: str, 64 | stored: bool = False, 65 | indexed: bool = False, 66 | fast: bool = False, 67 | ) -> SchemaBuilder: 68 | pass 69 | 70 | def add_json_field( 71 | self, 72 | name: str, 73 | stored: bool = False, 74 | tokenizer_name: str = "default", 75 | index_option: str = "position", 76 | ) -> SchemaBuilder: 77 | pass 78 | 79 | def add_facet_field( 80 | self, 81 | name: str, 82 | ) -> SchemaBuilder: 83 | pass 84 | 85 | def add_bytes_field( 86 | self, 87 | name: str, 88 | stored: bool = False, 89 | indexed: bool = False, 90 | fast: bool = False, 91 | index_option: str = "position", 92 | ) -> SchemaBuilder: 93 | pass 94 | 95 | def add_ip_addr_field( 96 | self, 97 | name: str, 98 | stored: bool = False, 99 | indexed: bool = False, 100 | fast: bool = False, 101 | ) -> SchemaBuilder: 102 | pass 103 | 104 | def build(self) -> Schema: 105 | pass 106 | 107 | 108 | class Facet: 109 | @staticmethod 110 | def from_encoded(encoded_bytes: bytes) -> Facet: 111 | pass 112 | 113 | @classmethod 114 | def root(cls) -> Facet: 115 | pass 116 | 117 | @classmethod 118 | def from_string(cls, facet_string: str) -> Facet: 119 | pass 120 | 121 | @property 122 | def is_root(self) -> bool: 123 | pass 124 | 125 | def is_prefix_of(self, other: Facet) -> bool: 126 | pass 127 | 128 | def to_path(self) -> list[str]: 129 | pass 130 | 131 | def to_path_str(self) -> str: 132 | 
pass 133 | 134 | 135 | class Document: 136 | def __new__(cls, **kwargs) -> Document: 137 | pass 138 | 139 | def __getitem__(self, key: str) -> list[Any]: 140 | pass 141 | 142 | def extend(self, py_dict: dict, schema: Optional[Schema]) -> None: 143 | pass 144 | 145 | @staticmethod 146 | def from_dict(py_dict: dict, schema: Optional[Schema] = None) -> Document: 147 | pass 148 | 149 | def to_dict(self) -> dict[str, list[Any]]: 150 | pass 151 | 152 | def add_text(self, field_name: str, text: str) -> None: 153 | pass 154 | 155 | def add_unsigned(self, field_name: str, value: int) -> None: 156 | pass 157 | 158 | def add_integer(self, field_name: str, value: int) -> None: 159 | pass 160 | 161 | def add_float(self, field_name: str, value: float) -> None: 162 | pass 163 | 164 | def add_boolean(self, field_name: str, value: bool) -> None: 165 | pass 166 | 167 | def add_date(self, field_name: str, value: datetime.datetime) -> None: 168 | pass 169 | 170 | def add_facet(self, field_name: str, facet: Facet) -> None: 171 | pass 172 | 173 | def add_bytes(self, field_name: str, bytes: bytes) -> None: 174 | pass 175 | 176 | def add_json(self, field_name: str, value: Any) -> None: 177 | pass 178 | 179 | def add_ip_addr(self, field_name: str, ip_addr: str) -> None: 180 | pass 181 | 182 | @property 183 | def num_fields(self) -> int: 184 | pass 185 | 186 | @property 187 | def is_empty(self) -> bool: 188 | pass 189 | 190 | def get_first(self, field_name: str) -> Optional[Any]: 191 | pass 192 | 193 | def get_all(self, field_name: str) -> list[Any]: 194 | pass 195 | 196 | 197 | class Occur(Enum): 198 | Must = 1 199 | Should = 2 200 | MustNot = 3 201 | 202 | 203 | class FieldType(Enum): 204 | Text = 1 205 | Unsigned = 2 206 | Integer = 3 207 | Float = 4 208 | Boolean = 5 209 | Date = 6 210 | Facet = 7 211 | Bytes = 8 212 | Json = 9 213 | IpAddr = 10 214 | 215 | 216 | _RangeType = TypeVar( 217 | "_RangeType", bound=int | float | datetime.datetime | bool | str | bytes 218 | ) 219 | 220 | 221 | class Query: 222 | @staticmethod 223 | def term_query( 224 | schema: Schema, 225 | field_name: str, 226 | field_value: Any, 227 | index_option: str = "position", 228 | ) -> Query: 229 | pass 230 | 231 | @staticmethod 232 | def term_set_query( 233 | schema: Schema, field_name: str, field_values: Sequence[Any] 234 | ) -> Query: 235 | pass 236 | 237 | @staticmethod 238 | def all_query() -> Query: 239 | pass 240 | 241 | @staticmethod 242 | def fuzzy_term_query( 243 | schema: Schema, 244 | field_name: str, 245 | text: str, 246 | distance: int = 1, 247 | transposition_cost_one: bool = True, 248 | prefix=False, 249 | ) -> Query: 250 | pass 251 | 252 | @staticmethod 253 | def phrase_query( 254 | schema: Schema, 255 | field_name: str, 256 | words: list[Union[str, tuple[int, str]]], 257 | slop: int = 0, 258 | ) -> Query: 259 | pass 260 | 261 | @staticmethod 262 | def boolean_query(subqueries: Sequence[tuple[Occur, Query]]) -> Query: 263 | pass 264 | 265 | @staticmethod 266 | def disjunction_max_query( 267 | subqueries: Sequence[Query], tie_breaker: Optional[float] = None 268 | ) -> Query: 269 | pass 270 | 271 | @staticmethod 272 | def boost_query(query: Query, boost: float) -> Query: 273 | pass 274 | 275 | @staticmethod 276 | def regex_query(schema: Schema, field_name: str, regex_pattern: str) -> Query: 277 | pass 278 | 279 | @staticmethod 280 | def more_like_this_query( 281 | doc_address: DocAddress, 282 | min_doc_frequency: Optional[int] = 5, 283 | max_doc_frequency: Optional[int] = None, 284 | min_term_frequency: Optional[int] = 2, 
285 | max_query_terms: Optional[int] = 25, 286 | min_word_length: Optional[int] = None, 287 | max_word_length: Optional[int] = None, 288 | boost_factor: Optional[float] = 1.0, 289 | stop_words: list[str] = [], 290 | ) -> Query: 291 | pass 292 | 293 | @staticmethod 294 | def const_score_query(query: Query, score: float) -> Query: 295 | pass 296 | 297 | @staticmethod 298 | def range_query( 299 | schema: Schema, 300 | field_name: str, 301 | field_type: FieldType, 302 | lower_bound: _RangeType, 303 | upper_bound: _RangeType, 304 | include_lower: bool = True, 305 | include_upper: bool = True, 306 | ) -> Query: 307 | pass 308 | 309 | 310 | class Order(Enum): 311 | Asc = 1 312 | Desc = 2 313 | 314 | 315 | class DocAddress: 316 | def __new__(cls, segment_ord: int, doc: int) -> DocAddress: 317 | pass 318 | 319 | @property 320 | def segment_ord(self) -> int: 321 | pass 322 | 323 | @property 324 | def doc(self) -> int: 325 | pass 326 | 327 | 328 | class SearchResult: 329 | @property 330 | def hits(self) -> list[tuple[Any, DocAddress]]: 331 | pass 332 | 333 | 334 | class Searcher: 335 | def search( 336 | self, 337 | query: Query, 338 | limit: int = 10, 339 | count: bool = True, 340 | order_by_field: Optional[str] = None, 341 | offset: int = 0, 342 | order: Order = Order.Desc, 343 | ) -> SearchResult: 344 | pass 345 | 346 | def aggregate( 347 | self, 348 | search_query: Query, 349 | agg_query: dict, 350 | ) -> dict: 351 | pass 352 | 353 | @property 354 | def num_docs(self) -> int: 355 | pass 356 | 357 | @property 358 | def num_segments(self) -> int: 359 | pass 360 | 361 | def doc(self, doc_address: DocAddress) -> Document: 362 | pass 363 | 364 | def doc_freq(self, field_name: str, field_value: Any) -> int: 365 | pass 366 | 367 | 368 | class IndexWriter: 369 | def add_document(self, doc: Document) -> int: 370 | pass 371 | 372 | def add_json(self, json: str) -> int: 373 | pass 374 | 375 | def commit(self) -> int: 376 | pass 377 | 378 | def rollback(self) -> int: 379 | pass 380 | 381 | def garbage_collect_files(self) -> None: 382 | pass 383 | 384 | def delete_all_documents(self) -> None: 385 | pass 386 | 387 | @property 388 | def commit_opstamp(self) -> int: 389 | pass 390 | 391 | def delete_documents(self, field_name: str, field_value: Any) -> int: 392 | pass 393 | 394 | def delete_documents_by_term(self, field_name: str, field_value: Any) -> int: 395 | pass 396 | 397 | def delete_documents_by_query(self, query: Query) -> int: 398 | pass 399 | 400 | def wait_merging_threads(self) -> None: 401 | pass 402 | 403 | 404 | class Index: 405 | def __new__( 406 | cls, schema: Schema, path: Optional[str] = None, reuse: bool = True 407 | ) -> Index: 408 | pass 409 | 410 | @staticmethod 411 | def open(path: str) -> Index: 412 | pass 413 | 414 | def writer(self, heap_size: int = 128_000_000, num_threads: int = 0) -> IndexWriter: 415 | pass 416 | 417 | def config_reader( 418 | self, reload_policy: str = "commit", num_warmers: int = 0 419 | ) -> None: 420 | pass 421 | 422 | def searcher(self) -> Searcher: 423 | pass 424 | 425 | @staticmethod 426 | def exists(path: str) -> bool: 427 | pass 428 | 429 | @property 430 | def schema(self) -> Schema: 431 | pass 432 | 433 | def reload(self) -> None: 434 | pass 435 | 436 | def parse_query( 437 | self, 438 | query: str, 439 | default_field_names: Optional[list[str]] = None, 440 | field_boosts: Optional[dict[str, float]] = None, 441 | fuzzy_fields: Optional[dict[str, tuple[bool, int, bool]]] = None, 442 | ) -> Query: 443 | pass 444 | 445 | def parse_query_lenient( 446 | self, 447 
| query: str, 448 | default_field_names: Optional[list[str]] = None, 449 | field_boosts: Optional[dict[str, float]] = None, 450 | fuzzy_fields: Optional[dict[str, tuple[bool, int, bool]]] = None, 451 | ) -> tuple[Query, list[Any]]: 452 | pass 453 | 454 | def register_tokenizer( 455 | self, name: str, text_analyzer: TextAnalyzer 456 | ) -> None: ... 457 | 458 | 459 | class Range: 460 | @property 461 | def start(self) -> int: 462 | pass 463 | 464 | @property 465 | def end(self) -> int: 466 | pass 467 | 468 | 469 | class Snippet: 470 | def to_html(self) -> str: 471 | pass 472 | 473 | def highlighted(self) -> list[Range]: 474 | pass 475 | 476 | def fragment(self) -> str: 477 | pass 478 | 479 | class SnippetGenerator: 480 | @staticmethod 481 | def create( 482 | searcher: Searcher, query: Query, schema: Schema, field_name: str 483 | ) -> SnippetGenerator: 484 | pass 485 | 486 | def snippet_from_doc(self, doc: Document) -> Snippet: 487 | pass 488 | 489 | def set_max_num_chars(self, max_num_chars: int) -> None: 490 | pass 491 | 492 | 493 | class Tokenizer: 494 | @staticmethod 495 | def raw() -> Tokenizer: 496 | pass 497 | 498 | @staticmethod 499 | def simple() -> Tokenizer: 500 | pass 501 | 502 | @staticmethod 503 | def whitespace() -> Tokenizer: 504 | pass 505 | 506 | @staticmethod 507 | def regex(pattern: str) -> Tokenizer: 508 | pass 509 | 510 | @staticmethod 511 | def ngram( 512 | min_gram: int = 2, max_gram: int = 3, prefix_only: bool = False 513 | ) -> Tokenizer: 514 | pass 515 | 516 | @staticmethod 517 | def facet() -> Tokenizer: 518 | pass 519 | 520 | 521 | class Filter: 522 | 523 | @staticmethod 524 | def alphanum_only() -> Filter: 525 | pass 526 | 527 | @staticmethod 528 | def ascii_fold() -> Filter: 529 | pass 530 | 531 | @staticmethod 532 | def lowercase() -> Filter: 533 | pass 534 | 535 | @staticmethod 536 | def remove_long(length_limit: int) -> Filter: 537 | pass 538 | 539 | @staticmethod 540 | def stemmer(language: str) -> Filter: 541 | pass 542 | 543 | @staticmethod 544 | def stopword(language: str) -> Filter: 545 | pass 546 | 547 | @staticmethod 548 | def custom_stopword(stopwords: list[str]) -> Filter: 549 | pass 550 | 551 | @staticmethod 552 | def split_compound(constituent_words: list[str]) -> Filter: 553 | pass 554 | 555 | 556 | class TextAnalyzer: 557 | 558 | def analyze(self, text: str) -> list[str]: 559 | pass 560 | 561 | 562 | class TextAnalyzerBuilder: 563 | 564 | def __init__(self, tokenizer: Tokenizer): 565 | pass 566 | 567 | def filter(self, filter: Filter) -> TextAnalyzerBuilder: 568 | pass 569 | 570 | def build(self) -> TextAnalyzer: 571 | pass 572 | 573 | 574 | __version__: str 575 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pytest 3 | 4 | from tantivy import SchemaBuilder, Index, Document 5 | 6 | 7 | def schema(): 8 | return ( 9 | SchemaBuilder() 10 | .add_text_field("title", stored=True) 11 | .add_text_field("body") 12 | .build() 13 | ) 14 | 15 | 16 | def schema_numeric_fields(): 17 | return ( 18 | SchemaBuilder() 19 | .add_integer_field("id", stored=True, indexed=True, fast=True) 20 | .add_float_field("rating", stored=True, indexed=True, fast=True) 21 | .add_boolean_field("is_good", stored=True, indexed=True) 22 | .add_text_field("body", stored=True, fast=True) 23 | .build() 24 | ) 25 | 26 | def schema_with_date_field(): 27 | return ( 28 | SchemaBuilder() 29 | 
.add_integer_field("id", stored=True, indexed=True) 30 | .add_float_field("rating", stored=True, indexed=True) 31 | .add_date_field("date", stored=True, indexed=True) 32 | .build() 33 | ) 34 | 35 | def schema_with_ip_addr_field(): 36 | return ( 37 | SchemaBuilder() 38 | .add_integer_field("id", stored=True, indexed=True) 39 | .add_float_field("rating", stored=True, indexed=True) 40 | .add_ip_addr_field("ip_addr", stored=True, indexed=True) 41 | .build() 42 | ) 43 | 44 | def create_index(dir=None): 45 | # assume all tests will use the same documents for now 46 | # other methods may set up function-local indexes 47 | index = Index(schema(), dir) 48 | writer = index.writer(15_000_000, 1) 49 | 50 | # 2 ways of adding documents 51 | # 1 52 | doc = Document() 53 | # create a document instance 54 | # add field-value pairs 55 | doc.add_text("title", "The Old Man and the Sea") 56 | doc.add_text( 57 | "body", 58 | ( 59 | "He was an old man who fished alone in a skiff in" 60 | "the Gulf Stream and he had gone eighty-four days " 61 | "now without taking a fish." 62 | ), 63 | ) 64 | writer.add_document(doc) 65 | # 2 use the built-in json support 66 | # keys need to coincide with field names 67 | doc = Document.from_dict( 68 | { 69 | "title": "Of Mice and Men", 70 | "body": ( 71 | "A few miles south of Soledad, the Salinas River drops " 72 | "in close to the hillside bank and runs deep and " 73 | "green. The water is warm too, for it has slipped " 74 | "twinkling over the yellow sands in the sunlight " 75 | "before reaching the narrow pool. On one side of the " 76 | "river the golden foothill slopes curve up to the " 77 | "strong and rocky Gabilan Mountains, but on the valley " 78 | "side the water is lined with trees—willows fresh and " 79 | "green with every spring, carrying in their lower leaf " 80 | "junctures the debris of the winter’s flooding; and " 81 | "sycamores with mottled, white, recumbent limbs and " 82 | "branches that arch over the pool" 83 | ), 84 | } 85 | ) 86 | writer.add_document(doc) 87 | writer.add_json( 88 | """{ 89 | "title": ["Frankenstein", "The Modern Prometheus"], 90 | "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." 91 | }""" 92 | ) 93 | writer.commit() 94 | writer.wait_merging_threads() 95 | index.reload() 96 | return index 97 | 98 | 99 | def create_index_with_numeric_fields(dir=None): 100 | index = Index(schema_numeric_fields(), dir) 101 | writer = index.writer(15_000_000, 1) 102 | 103 | doc = Document() 104 | doc.add_integer("id", 1) 105 | doc.add_float("rating", 3.5) 106 | doc.add_boolean("is_good", True) 107 | doc.add_text( 108 | "body", 109 | ( 110 | "He was an old man who fished alone in a skiff in" 111 | "the Gulf Stream and he had gone eighty-four days " 112 | "now without taking a fish." 113 | ), 114 | ) 115 | writer.add_document(doc) 116 | doc = Document.from_dict( 117 | { 118 | "id": 2, 119 | "rating": 4.5, 120 | "is_good": False, 121 | "body": ( 122 | "A few miles south of Soledad, the Salinas River drops " 123 | "in close to the hillside bank and runs deep and " 124 | "green. The water is warm too, for it has slipped " 125 | "twinkling over the yellow sands in the sunlight " 126 | "before reaching the narrow pool. 
On one side of the " 127 | "river the golden foothill slopes curve up to the " 128 | "strong and rocky Gabilan Mountains, but on the valley " 129 | "side the water is lined with trees—willows fresh and " 130 | "green with every spring, carrying in their lower leaf " 131 | "junctures the debris of the winter’s flooding; and " 132 | "sycamores with mottled, white, recumbent limbs and " 133 | "branches that arch over the pool" 134 | ), 135 | }, 136 | ) 137 | writer.add_document(doc) 138 | writer.commit() 139 | writer.wait_merging_threads() 140 | index.reload() 141 | return index 142 | 143 | def create_index_with_date_field(dir=None): 144 | index = Index(schema_with_date_field(), dir) 145 | writer = index.writer(15_000_000, 1) 146 | 147 | doc = Document() 148 | doc.add_integer("id", 1) 149 | doc.add_float("rating", 3.5) 150 | doc.add_date("date", datetime(2021, 1, 1)) 151 | 152 | writer.add_document(doc) 153 | doc = Document.from_dict( 154 | { 155 | "id": 2, 156 | "rating": 4.5, 157 | "date": datetime(2021, 1, 2), 158 | }, 159 | ) 160 | writer.add_document(doc) 161 | writer.commit() 162 | writer.wait_merging_threads() 163 | index.reload() 164 | return index 165 | 166 | def create_index_with_ip_addr_field(dir=None): 167 | schema = schema_with_ip_addr_field() 168 | index = Index(schema, dir) 169 | writer = index.writer(15_000_000, 1) 170 | 171 | doc = Document() 172 | doc.add_integer("id", 1) 173 | doc.add_float("rating", 3.5) 174 | doc.add_ip_addr("ip_addr", "10.0.0.1") 175 | writer.add_document(doc) 176 | 177 | doc = Document.from_dict( 178 | { 179 | "id": 2, 180 | "rating": 4.5, 181 | "ip_addr": "127.0.0.1", 182 | }, 183 | schema 184 | ) 185 | writer.add_document(doc) 186 | doc = Document.from_dict( 187 | { 188 | "id": 2, 189 | "rating": 4.5, 190 | "ip_addr": "::1", 191 | }, 192 | schema 193 | ) 194 | writer.add_document(doc) 195 | writer.commit() 196 | writer.wait_merging_threads() 197 | index.reload() 198 | return index 199 | 200 | def spanish_schema(): 201 | return ( 202 | SchemaBuilder() 203 | .add_text_field("title", stored=True, tokenizer_name="es_stem") 204 | .add_text_field("body", tokenizer_name="es_stem") 205 | .build() 206 | ) 207 | 208 | 209 | def create_spanish_index(): 210 | # assume all tests will use the same documents for now 211 | # other methods may set up function-local indexes 212 | index = Index(spanish_schema(), None) 213 | writer = index.writer() 214 | 215 | # 2 ways of adding documents 216 | # 1 217 | doc = Document() 218 | # create a document instance 219 | # add field-value pairs 220 | doc.add_text("title", "El viejo y el mar") 221 | doc.add_text( 222 | "body", 223 | ( 224 | "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. " 225 | ), 226 | ) 227 | writer.add_document(doc) 228 | # 2 use the built-in json support 229 | # keys need to coincide with field names 230 | doc = Document.from_dict( 231 | { 232 | "title": "De ratones y hombres", 233 | "body": ( 234 | "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. 
A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en las junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas que se arquean sobre el estanque"
235 | ),
236 | }
237 | )
238 | writer.add_document(doc)
239 | writer.add_json(
240 | """{
241 | "title": ["Frankenstein", "El moderno Prometeo"],
242 | "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
243 | }"""
244 | )
245 | writer.commit()
246 | writer.wait_merging_threads()
247 | index.reload()
248 | return index
249 |
250 |
251 | @pytest.fixture()
252 | def dir_index(tmpdir):
253 | return (tmpdir, create_index(str(tmpdir)))
254 |
255 |
256 | @pytest.fixture(scope="class")
257 | def ram_index():
258 | return create_index()
259 |
260 |
261 | @pytest.fixture(scope="class")
262 | def ram_index_numeric_fields():
263 | return create_index_with_numeric_fields()
264 |
265 | @pytest.fixture(scope="class")
266 | def ram_index_with_date_field():
267 | return create_index_with_date_field()
268 |
269 | @pytest.fixture(scope="class")
270 | def ram_index_with_ip_addr_field():
271 | return create_index_with_ip_addr_field()
272 |
273 | @pytest.fixture(scope="class")
274 | def spanish_index():
275 | return create_spanish_index()
276 |
--------------------------------------------------------------------------------
/tests/test_docs.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pytest
3 |
4 | from mktestdocs import check_md_file
5 |
6 |
7 | @pytest.mark.parametrize("filepath", Path("docs").glob("**/*.md"), ids=str)
8 | def test_docs(filepath):
9 | check_md_file(filepath, memory=True)
10 |
--------------------------------------------------------------------------------
/tests/test_escapes.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from tantivy import Query
4 |
5 |
6 | def test_escape_quote_parse_query(ram_index):
7 | index = ram_index
8 | # We verify only that `parse_query` doesn't raise. This was a change
9 | # from tantivy versions prior to 0.24.0 in which the following would
10 | # raise a `ValueError`.
11 | q = index.parse_query(r'sea\"', ["title", "body"])
12 | print(q)
13 |
14 |
15 | def test_escape_quote_parse_query_with_quotes(ram_index):
16 | index = ram_index
17 | # We verify only that `parse_query` doesn't raise. We are not testing
18 | # whether tantivy's `parse_query` is correct.
19 | query = index.parse_query(r'"sea\""', ["title", "body"])
20 |
21 |
22 | def test_escape_quote_parse_query_quoted(ram_index):
23 | index = ram_index
24 | # We verify only that `parse_query` doesn't raise. We are not testing
25 | # whether tantivy's `parse_query` is correct.
26 | query = index.parse_query(r'title:"sea \"whale"')
27 |
28 |
29 | def test_escape_quote_term_query(ram_index):
30 | index = ram_index
31 | # We verify only that `term_query` doesn't raise. We are not testing
32 | # whether tantivy's `term_query` is correct.
33 | query = Query.term_query(index.schema, "title", "sea\" whale") 34 | -------------------------------------------------------------------------------- /tests/test_json_bug.py: -------------------------------------------------------------------------------- 1 | def test_json_bug(): 2 | import tantivy 3 | 4 | schema_builder = tantivy.SchemaBuilder() 5 | schema_builder.add_json_field("data", stored=True) 6 | schema = schema_builder.build() 7 | 8 | index = tantivy.Index(schema) 9 | 10 | index_writer = index.writer() 11 | 12 | data = { 13 | "name": "John Doe", 14 | "age": 30, 15 | "email": "john.doe@example.com", 16 | "interests": ["reading", "hiking", "coding"], 17 | } 18 | import json 19 | json_data = json.dumps(data) 20 | 21 | doc = tantivy.Document() 22 | doc.add_json("data", json_data) 23 | index_writer.add_document(doc) 24 | index_writer.commit() 25 | index_writer.wait_merging_threads() 26 | index.reload() 27 | 28 | searcher = index.searcher() 29 | 30 | query = "*" 31 | q = index.parse_query(query) 32 | top_docs = searcher.search(q, limit=10) 33 | 34 | print(f"Total hits: {top_docs}") 35 | for score, hit in top_docs.hits: 36 | doc = searcher.doc(hit) 37 | print(doc["data"]) 38 | assert doc["data"] == [{'age': 30, 39 | 'email': 'john.doe@example.com', 40 | 'interests': ['reading', 'hiking', 'coding'], 41 | 'name': 'John Doe' 42 | }] 43 | --------------------------------------------------------------------------------
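A note on the analyzer pipeline exposed above: per the build() method in src/tokenizer.rs, TextAnalyzerBuilder.build() consumes the underlying Rust builder, so a given builder instance can be built only once. The following is a minimal, hypothetical usage sketch of the Tokenizer / Filter / TextAnalyzerBuilder stubs from tantivy/tantivy.pyi. It is not a file in this repository; the analyzer name "custom_en" and the "english" stemmer argument are illustrative assumptions, and only methods declared in the stubs are used.

from tantivy import (
    Document,
    Filter,
    Index,
    SchemaBuilder,
    TextAnalyzerBuilder,
    Tokenizer,
)

# Chain a tokenizer and filters, then build the analyzer. Calling
# build() a second time on the same builder raises an error, because
# the Rust-side builder has already been consumed.
analyzer = (
    TextAnalyzerBuilder(Tokenizer.simple())
    .filter(Filter.lowercase())
    .filter(Filter.stemmer("english"))  # assumed stemmer language name
    .build()
)
print(analyzer.analyze("Running fishes"))  # lowercased, stemmed tokens

# Point a text field at the analyzer by name and register it with the
# index; "custom_en" is an arbitrary, hypothetical tokenizer name.
schema = (
    SchemaBuilder()
    .add_text_field("body", stored=True, tokenizer_name="custom_en")
    .build()
)
index = Index(schema)  # in-memory index (path=None)
index.register_tokenizer("custom_en", analyzer)

writer = index.writer()
doc = Document()
doc.add_text("body", "He was fishing alone in a skiff")
writer.add_document(doc)
writer.commit()
writer.wait_merging_threads()
index.reload()

# The query text goes through the same analysis pipeline, so the
# stemmed query term "fish" matches the stemmed form of "fishing".
searcher = index.searcher()
hits = searcher.search(index.parse_query("fish", ["body"])).hits
assert len(hits) == 1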