├── .cargo
│   └── config.toml
├── .github
│   ├── dependabot.yaml
│   └── workflows
│       ├── ci.yml
│       ├── codeql.yml
│       ├── publish.yaml
│       └── scorecards.yml
├── .gitignore
├── .readthedocs.yaml
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── MANIFEST.in
├── README.md
├── build.rs
├── ci
│   ├── deploy.sh
│   └── deploy_mac.sh
├── docs
│   ├── about.md
│   ├── explanation.md
│   ├── howto.md
│   ├── index.md
│   ├── reference.md
│   ├── requirements.txt
│   └── tutorials.md
├── mkdocs.yml
├── noxfile.py
├── pyproject.toml
├── requirements-dev.txt
├── rust-toolchain.toml
├── rustfmt.toml
├── src
│   ├── document.rs
│   ├── facet.rs
│   ├── index.rs
│   ├── lib.rs
│   ├── parser_error.rs
│   ├── query.rs
│   ├── schema.rs
│   ├── schemabuilder.rs
│   ├── searcher.rs
│   ├── snippet.rs
│   └── tokenizer.rs
├── tantivy
│   ├── __init__.py
│   ├── py.typed
│   └── tantivy.pyi
└── tests
    ├── conftest.py
    ├── tantivy_test.py
    ├── test_docs.py
    ├── test_escapes.py
    └── test_json_bug.py

/.cargo/config.toml:
--------------------------------------------------------------------------------
1 | [target.x86_64-apple-darwin]
2 | rustflags = [
3 |     "-C", "link-arg=-undefined",
4 |     "-C", "link-arg=dynamic_lookup",
5 | ]
--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 | 
8 |   - package-ecosystem: cargo
9 |     directory: "/"
10 |     schedule:
11 |       interval: "weekly"
12 | 
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on:
4 |   push:
5 |     branches:
6 |       - master
7 |   pull_request:
8 |     branches:
9 |       - master
10 | 
11 | concurrency:
12 |   group: ${{ github.ref }}
13 |   cancel-in-progress: true
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   Lint:
20 |     runs-on: ubuntu-latest
21 |     steps:
22 |       - name: Harden Runner
23 |         uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0
24 |         with:
25 |           disable-sudo: true
26 |           egress-policy: block
27 |           allowed-endpoints: >
28 |             github.com:443
29 |             static.rust-lang.org:443
30 | 
31 |       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
32 |         with:
33 |           fetch-depth: 0
34 | 
35 |       - name: Rust toolchain
36 |         uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1
37 |         with:
38 |           toolchain: "stable"
39 |           components: rustfmt
40 | 
41 |       - name: Check Formatting
42 |         run: cargo fmt --check
43 | 
44 |   Test:
45 |     env:
46 |       UNSAFE_PYO3_SKIP_VERSION_CHECK: ${{ matrix.unsafe-pyo3-skip-version-check }}
47 |     strategy:
48 |       matrix:
49 |         os: [ubuntu-latest, macos-latest, windows-latest]
50 |         python-version: ["3.12"]
51 |         allow-prereleases: [false]
52 |         include:
53 |           - os: ubuntu-latest
54 |             python-version: "3.13"
55 |             allow-prereleases: false
56 |           - os: ubuntu-latest
57 |             python-version: "3.12"
58 |             allow-prereleases: false
59 |           - os: ubuntu-latest
60 |             python-version: "3.11"
61 |             allow-prereleases: false
62 |           - os: ubuntu-latest
63 |             python-version: "3.10"
64 |             allow-prereleases: false
65 |           - os: ubuntu-latest
66 |             python-version: "3.9"
67 |             allow-prereleases: false
68 |     runs-on: "${{ matrix.os }}"
69 |     steps:
70 |       - name: Harden Runner
71 |         uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0
72 |         with:
73 |           disable-sudo: true
74 |           egress-policy: block
75 |           allowed-endpoints: >
76 |             api.github.com:443
77 |             crates.io:443
78 | files.pythonhosted.org:443 79 | github.com:443 80 | pypi.org:443 81 | static.crates.io:443 82 | index.crates.io:443 83 | static.rust-lang.org:443 84 | objects.githubusercontent.com:443 85 | 86 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 87 | with: 88 | fetch-depth: 0 89 | 90 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # 5.6.0 91 | with: 92 | python-version: ${{ matrix.python-version }} 93 | allow-prereleases: ${{ matrix.allow-prereleases }} 94 | 95 | - uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 96 | with: 97 | toolchain: "stable" 98 | 99 | #- uses: Swatinem/rust-cache@23bce251a8cd2ffc3c1075eaa2367cf899916d84 # 2.7.3 100 | - run: python -m pip install nox 101 | - run: nox -s test-${{ matrix.python-version }} 102 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: ["master"] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: ["master"] 9 | schedule: 10 | - cron: "0 0 * * 1" 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | analyze: 17 | name: Analyze 18 | runs-on: ubuntu-latest 19 | permissions: 20 | actions: read 21 | contents: read 22 | security-events: write 23 | 24 | strategy: 25 | fail-fast: false 26 | matrix: 27 | language: ["python"] 28 | # CodeQL supports [ $supported-codeql-languages ] 29 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 30 | 31 | steps: 32 | - name: Harden Runner 33 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.2.1 34 | with: 35 | disable-sudo: true 36 | egress-policy: block 37 | allowed-endpoints: > 38 | api.github.com:443 39 | files.pythonhosted.org:443 40 | objects.githubusercontent.com:443 41 | github.com:443 42 | pypi.org:443 43 | uploads.github.com:443 44 | 45 | - name: Checkout repository 46 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 47 | 48 | # Initializes the CodeQL tools for scanning. 49 | - name: Initialize CodeQL 50 | uses: github/codeql-action/init@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v2.2.5 51 | with: 52 | languages: ${{ matrix.language }} 53 | # If you wish to specify custom queries, you can do so here or in a config file. 54 | # By default, queries listed here will override any specified in a config file. 55 | # Prefix the list here with "+" to use these queries and those in the config file. 56 | 57 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 58 | # queries: security-extended,security-and-quality 59 | 60 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 61 | # If this step fails, then you should remove it and run the build manually (see below) 62 | - name: Autobuild 63 | uses: github/codeql-action/autobuild@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v2.2.5 64 | 65 | # ℹ️ Command-line programs to run using the OS shell. 66 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 67 | 68 | # If the Autobuild fails above, remove it and uncomment the following three lines. 
69 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 70 | 71 | # - run: | 72 | # echo "Run, Build Application using script" 73 | # ./location_of_script_within_repo/buildscript.sh 74 | 75 | - name: Perform CodeQL Analysis 76 | uses: github/codeql-action/analyze@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v2.2.5 77 | with: 78 | category: "/language:${{matrix.language}}" -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Test & Release 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | # pull_request: 8 | # branches: 9 | # - master 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | linux: 16 | runs-on: ubuntu-latest 17 | permissions: 18 | id-token: write # ability to mint the OIDC token permission is necessary to persist the attestation 19 | contents: read 20 | attestations: write # persist the attestation 21 | strategy: 22 | matrix: 23 | platform: [ 'x86_64-unknown-linux-gnu', 'aarch64-unknown-linux-gnu' ] 24 | steps: 25 | - name: Harden Runner 26 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 27 | with: 28 | egress-policy: block 29 | allowed-endpoints: > 30 | api.github.com:443 31 | astral.sh:443 32 | cdn.quay.io:443 33 | cdn01.quay.io:443 34 | cdn02.quay.io:443 35 | cdn03.quay.io:443 36 | crates.io:443 37 | files.pythonhosted.org:443 38 | ghcr.io:443 39 | github.com:443 40 | index.crates.io:443 41 | objects.githubusercontent.com:443 42 | pkg-containers.githubusercontent.com:443 43 | pypi.org:443 44 | quay.io:443 45 | sh.rustup.rs:443 46 | static.crates.io:443 47 | static.rust-lang.org:443 48 | uploads.github.com:443 49 | 50 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 51 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 52 | with: 53 | python-version: 3.9 54 | architecture: x64 55 | 56 | - uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 57 | with: 58 | manylinux: auto 59 | target: ${{ matrix.platform }} 60 | command: build 61 | args: --release --sdist -o dist -i 3.9 3.10 3.11 3.12 3.13 62 | 63 | - name: Upload wheels 64 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 65 | with: 66 | name: wheels-linux-${{ matrix.platform }} 67 | path: dist 68 | 69 | windows: 70 | runs-on: windows-latest 71 | permissions: 72 | id-token: write # ability to mint the OIDC token permission is necessary to persist the attestation 73 | contents: read 74 | attestations: write # persist the attestation 75 | strategy: 76 | matrix: 77 | target: [x64] 78 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 79 | steps: 80 | - name: Harden Runner 81 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 82 | with: 83 | egress-policy: audit 84 | 85 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 86 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 87 | with: 88 | python-version: ${{ matrix.python-version }} 89 | 90 | - uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 91 | env: 92 | PYO3_PYTHON: python${{ matrix.python-version }} 93 | with: 94 | command: build 95 | args: --release -o dist 96 | 97 | - name: Upload wheels 98 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 99 | with: 100 | name: wheels-windows-${{ matrix.python-version }}-${{ 
matrix.target }} 101 | path: dist 102 | 103 | macos: 104 | runs-on: macos-latest 105 | permissions: 106 | id-token: write # ability to mint the OIDC token permission is necessary to persist the attestation 107 | contents: read 108 | attestations: write # persist the attestation 109 | strategy: 110 | matrix: 111 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 112 | target: ['universal2', 'x86_64-apple-darwin'] 113 | steps: 114 | - name: Harden Runner 115 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 116 | with: 117 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 118 | 119 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 120 | - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 121 | with: 122 | python-version: ${{ matrix.python-version }} 123 | 124 | - name: Build wheels - ${{ matrix.target }} 125 | uses: PyO3/maturin-action@aef21716ff3dcae8a1c301d23ec3e4446972a6e3 126 | env: 127 | PYO3_PYTHON: python${{ matrix.python-version }} 128 | with: 129 | target: ${{ matrix.target }} 130 | command: build 131 | args: --release -o dist 132 | 133 | - name: Upload wheels 134 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 135 | with: 136 | name: wheels-macos-${{ matrix.python-version }}-${{ matrix.target }} 137 | path: dist 138 | 139 | python-release-github: 140 | runs-on: ubuntu-latest 141 | needs: [ macos, windows, linux ] 142 | permissions: 143 | contents: write # To add assets to a release. 144 | checks: write 145 | packages: write 146 | steps: 147 | - name: Harden Runner 148 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.1.0 149 | with: 150 | disable-sudo: true 151 | egress-policy: block 152 | allowed-endpoints: > 153 | api.github.com:443 154 | github.com:443 155 | uploads.github.com:443 156 | static.rust-lang.org:443 157 | 158 | - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 159 | with: 160 | ref: ${{ github.head_ref }} 161 | 162 | - uses: dtolnay/rust-toolchain@888c2e1ea69ab0d4330cbf0af1ecc7b68f368cc1 163 | with: 164 | toolchain: "stable" 165 | 166 | - name: Set up Python 3.9 167 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 168 | with: 169 | python-version: 3.9 170 | 171 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 172 | with: 173 | path: wheels 174 | pattern: wheels-* 175 | merge-multiple: true 176 | 177 | - name: Upload release binaries 178 | uses: alexellis/upload-assets@13926a61cdb2cb35f5fdef1c06b8b591523236d3 # 0.4.1 179 | env: 180 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 181 | with: 182 | asset_paths: '["./wheels/tantivy-*"]' 183 | 184 | release-pypy: 185 | name: Release 186 | runs-on: ubuntu-latest 187 | needs: [ macos, windows, linux ] 188 | permissions: 189 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 190 | steps: 191 | - name: Harden Runner 192 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 193 | with: 194 | egress-policy: audit 195 | 196 | - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 197 | with: 198 | path: wheels 199 | pattern: wheels-* 200 | merge-multiple: true 201 | 202 | - name: Publish package distributions to Test PyPI 203 | uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3 204 | with: 205 | repository-url: https://test.pypi.org/legacy/ 206 | packages-dir: 
wheels/ 207 | skip-existing: true 208 | attestations: false 209 | 210 | - name: Publish package distributions to PyPI 211 | if: always() 212 | uses: pypa/gh-action-pypi-publish@67339c736fd9354cd4f8cb0b744f2b82a74b5c70 # v1.12.3 213 | with: 214 | packages-dir: wheels/ 215 | skip-existing: true 216 | -------------------------------------------------------------------------------- /.github/workflows/scorecards.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses actions that are not certified by GitHub. They are provided 2 | # by a third-party and are governed by separate terms of service, privacy 3 | # policy, and support documentation. 4 | 5 | name: Scorecard supply-chain security 6 | on: 7 | # For Branch-Protection check. Only the default branch is supported. See 8 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 9 | branch_protection_rule: 10 | # To guarantee Maintained check is occasionally updated. See 11 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 12 | schedule: 13 | - cron: '35 8 * * 5' 14 | push: 15 | branches: [ "master" ] 16 | 17 | # Declare default permissions as read only. 18 | permissions: read-all 19 | 20 | jobs: 21 | analysis: 22 | name: Scorecard analysis 23 | runs-on: ubuntu-latest 24 | permissions: 25 | # Needed to upload the results to code-scanning dashboard. 26 | security-events: write 27 | # Needed to publish results and get a badge (see publish_results below). 28 | id-token: write 29 | 30 | steps: 31 | - name: "Checkout code" 32 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 33 | with: 34 | persist-credentials: false 35 | 36 | - name: "Run analysis" 37 | uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 38 | with: 39 | results_file: results.sarif 40 | results_format: sarif 41 | # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: 42 | # - you want to enable the Branch-Protection check on a *public* repository, or 43 | # - you are installing Scorecard on a *private* repository 44 | # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. 45 | # repo_token: ${{ secrets.SCORECARD_TOKEN }} 46 | 47 | # Public repositories: 48 | # - Publish results to OpenSSF REST API for easy access by consumers 49 | # - Allows the repository to include the Scorecard badge. 50 | # - See https://github.com/ossf/scorecard-action#publishing-results. 51 | # For private repositories: 52 | # - `publish_results` will always be set to `false`, regardless 53 | # of the value entered here. 54 | publish_results: true 55 | 56 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF 57 | # format to the repository Actions tab. 58 | - name: "Upload artifact" 59 | uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 60 | with: 61 | name: SARIF file 62 | path: results.sarif 63 | retention-days: 5 64 | 65 | # Upload the results to GitHub's code scanning dashboard. 
66 |       - name: "Upload to code-scanning"
67 |         uses: github/codeql-action/upload-sarif@ff0a06e83cb2de871e5a09832bc6a81e7276941f # v3.28.18
68 |         with:
69 |           sarif_file: results.sarif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | **/*.pyc
3 | build
4 | /target
5 | **/*.rs.bk
6 | dist/
7 | __pycache__/
8 | tantivy.so
9 | tantivy.dylib
10 | tantivy/tantivy.cpython*.so
11 | tantivy.egg-info/
12 | .venv
13 | .envrc
14 | site/
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 | 
5 | # Required
6 | version: 2
7 | 
8 | # Set the OS, Python version and other tools you might need
9 | build:
10 |   os: ubuntu-22.04
11 |   tools:
12 |     python: "3.11"
13 |     # You can also specify other tool versions:
14 |     # nodejs: "19"
15 |     # rust: "1.64"
16 |     # golang: "1.19"
17 | 
18 | # Build documentation in the "docs/" directory with MkDocs
19 | mkdocs:
20 |   configuration: mkdocs.yml
21 |   fail_on_warning: false
22 | 
23 | # Optionally build your docs in additional formats such as PDF and ePub
24 | # formats:
25 | #   - pdf
26 | #   - epub
27 | 
28 | # Optional but recommended, declare the Python requirements required
29 | # to build your documentation
30 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
31 | python:
32 |   install:
33 |     - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "tantivy"
3 | version = "0.24.0"
4 | readme = "README.md"
5 | authors = ["Damir Jelić <poljar@termina.org.uk>"]
6 | edition = "2021"
7 | license = "MIT"
8 | 
9 | [lib]
10 | name = "tantivy"
11 | crate-type = ["cdylib"]
12 | 
13 | [build-dependencies]
14 | pyo3-build-config = "0.25.0"
15 | 
16 | [dependencies]
17 | base64 = "0.22"
18 | chrono = "0.4.41"
19 | tantivy = "0.24.1"
20 | itertools = "0.14.0"
21 | futures = "0.3.31"
22 | pythonize = "0.24.0"
23 | serde = "1.0"
24 | serde_json = "1.0.140"
25 | 
26 | [dependencies.pyo3]
27 | version = "0.24.2"
28 | features = ["chrono", "extension-module"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 The Matrix.org Foundation CIC
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Cargo.toml
2 | include Makefile
3 | include rust-toolchain
4 | recursive-include src *
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Build Status](https://travis-ci.org/quickwit-inc/tantivy-py.svg?branch=master)](https://travis-ci.org/quickwit-inc/tantivy-py)
2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3 | [![Docs](https://readthedocs.org/projects/tantivy-py/badge/?version=latest&style=flat-default)](https://tantivy-py.readthedocs.io/en/latest/)
4 | 
5 | tantivy-py
6 | ==========
7 | 
8 | Python bindings for [Tantivy](https://github.com/quickwit-oss/tantivy), the full-text search engine library written in Rust.
9 | 
10 | # Installation
11 | 
12 | The bindings can be installed from PyPI using pip:
13 | 
14 |     pip install tantivy
15 | 
16 | If no binary wheel is present for your operating system, the bindings will be
17 | built from source, which means that Rust needs to be installed before the build
18 | can succeed.
19 | 
20 | # Documentation
21 | 
22 | Please see [the documentation](https://tantivy-py.readthedocs.io/en/latest/) for more information.
--------------------------------------------------------------------------------
/build.rs:
--------------------------------------------------------------------------------
1 | fn main() {
2 |     pyo3_build_config::add_extension_module_link_args();
3 | }
--------------------------------------------------------------------------------
/ci/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | docker run \
4 |     --env MATURIN_PASSWORD="$MATURIN_PASSWORD" \
5 |     --rm \
6 |     -v "$(pwd)":/io \
7 |     konstin2/maturin \
8 |     publish \
9 |     --username __token__ \
10 |     --password "$MATURIN_PASSWORD"
--------------------------------------------------------------------------------
/ci/deploy_mac.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | maturin publish \
4 |     --interpreter python3.9 \
5 |     --username __token__ \
6 |     --password "$MATURIN_PASSWORD" \
7 |     --no-sdist
--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # About
--------------------------------------------------------------------------------
/docs/explanation.md:
--------------------------------------------------------------------------------
1 | # Explanation
2 | 
3 | ## Merge policy
4 | 
5 | When adding documents to a tantivy index, the indexed data will be recorded in multiple
6 | sections, called _segments_. There is more information about the [Life of a Segment](https://github.com/quickwit-oss/tantivy/wiki/Life-of-a-Segment)
7 | on the [tantivy wiki at Github](https://github.com/quickwit-oss/tantivy/wiki).
8 | 
9 | Currently, tantivy-py does not offer a way to customize the merge policy, but fortunately
10 | the default merge policy is the [`LogMergePolicy`](https://docs.rs/tantivy/latest/tantivy/merge_policy/struct.LogMergePolicy.html),
11 | which is a good choice for most use cases. It is aliased as the [default merge policy here](https://docs.rs/tantivy/latest/tantivy/merge_policy/type.DefaultMergePolicy.html).
12 | 
13 | Segment merging is performed in background threads. After adding documents to an index,
14 | it is important to allow time for those threads to complete merges. This is done by calling
15 | `writer.wait_merging_threads()` as the final step after adding data. This method
16 | consumes the writer, and the writer object will no longer be usable afterwards.
17 | 
18 | Here is a short description of the steps in pseudocode:
19 | 
20 | ```
21 | schema = Schema(...)
22 | index = Index(schema)
23 | writer = index.writer()
24 | for ... in data:
25 |     document = Document(...)
26 |     writer.add_document(document)
27 | writer.commit()
28 | writer.wait_merging_threads()
29 | ```
30 | 
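
For reference, the same steps as a small runnable sketch. The schema and the
data here are illustrative only, not part of the pseudocode above:

```python
import tantivy

schema = (
    tantivy.SchemaBuilder()
    .add_text_field("body", stored=True)
    .build()
)
index = tantivy.Index(schema)  # in-memory index
writer = index.writer()
for body in ["first document", "second document"]:
    writer.add_document(tantivy.Document(body=[body]))
writer.commit()
# Allow background segment merges to finish; this consumes the writer.
writer.wait_merging_threads()
```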
--------------------------------------------------------------------------------
/docs/howto.md:
--------------------------------------------------------------------------------
1 | # How-to Guides
2 | 
3 | ## Installation
4 | 
5 | tantivy-py can be installed from [PyPI](https://pypi.org) using pip:
6 | 
7 |     pip install tantivy
8 | 
9 | If no binary wheel is present for your operating system, the bindings will be
10 | built from source, which means that Rust needs to be installed before the build
11 | can succeed.
12 | 
13 | Note that the bindings are using [PyO3](https://github.com/PyO3/pyo3), which
14 | only supports python3.
15 | 
16 | ## Set up a development environment to work on tantivy-py itself
17 | 
18 | A development environment can be set up either in a virtual environment using
19 | [`nox`](https://nox.thea.codes), or against local packages using the provided `Makefile`.
20 | 
21 | For the `nox` setup, install the virtual environment and build the bindings using:
22 | 
23 |     python3 -m pip install nox
24 |     nox
25 | 
26 | For the `Makefile` based setup, run:
27 | 
28 |     make
29 | 
30 | Running the tests is done using:
31 | 
32 |     make test
33 | 
34 | The `nox` test session will pass pytest arguments through. For example,
35 | to run only the tests including "simple_search" in the test name, and only
36 | on Python 3.11:
37 | 
38 |     nox -s test-3.11 -- -k simple_search
39 | 
40 | ## Doctests
41 | 
42 | [Doctests](https://docs.python.org/3/library/doctest.html) are automatically
43 | enabled for all docstrings in the `tantivy` module. Here is a very basic
44 | introduction. Consider the following hypothetical Rust `struct`:
45 | 
46 | ```rust
47 | /// Tantivy's Document is the object that can be indexed and then searched for.
48 | ///
49 | /// Documents are fundamentally a collection of unordered tuples
50 | /// (field_name, value). In this list, one field may appear more than once.
51 | ///
52 | /// Example:
53 | ///     >>> doc = tantivy.Document()
54 | ///     >>> doc.add_text("title", "The Old Man and the Sea")
55 | ///     >>> doc.add_text("body", ("He was an old man who fished alone in a "
56 | ///     ...                       "skiff in the Gulf Stream and he had gone "
57 | ///     ...                       "eighty-four days now without taking a fish."))
58 | ///     >>> doc
59 | ///     Document(body=[He was an ],title=[The Old Ma])
60 | ///
61 | #[pyclass(module = "tantivy")]
62 | #[derive(Clone, Default, PartialEq)]
63 | pub(crate) struct Document {
64 |     pub(crate) field_values: BTreeMap<String, Vec<Value>>,
65 | }
66 | ```
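
For comparison, here is the same mechanism in a plain Python docstring. The
function below is a minimal, made-up illustration and is not part of
tantivy-py:

```python
def add(a: int, b: int) -> int:
    """Add two integers.

    >>> add(2, 3)
    5
    """
    return a + b
```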
"eighty-four days now without taking a fish.")) 58 | /// >>> doc 59 | /// Document(body=[He was an ],title=[The Old Ma]) 60 | /// 61 | #[pyclass(module = "tantivy")] 62 | #[derive(Clone, Default, PartialEq)] 63 | pub(crate) struct Document { 64 | pub(crate) field_values: BTreeMap>, 65 | } 66 | ``` 67 | 68 | When the tests are executed, pytest will automatically search all the docstrings 69 | for `>>>` and `...` and execute the code in the docstring. The output of the 70 | code is compared to the text that follows the code. If the output matches, the 71 | test passes. If the output does not match, the test fails. 72 | 73 | In the above example, a Tantivy document object is created, and then the 74 | representation of the document is printed. This representation, and indeed any 75 | output that manual typing would produce, is compared to the text that follows 76 | and this is how doctests work. 77 | 78 | Doctests are a great way to ensure that the documentation is accurate and up to 79 | date, and doctests are therefore encouraged be present on every public 80 | interface that users will interact with. However, doctest are not suitable 81 | for coverage testing and other more advanced testing methods so you must 82 | judge when to use them. 83 | 84 | ## Working on tantivy-py documentation 85 | 86 | Please be aware that this documentation is structured using the [Diátaxis](https://diataxis.fr/) framework. In very simple terms, this framework will suggest the correct location for different kinds of documentation. Please make sure you gain a basic understanding of the goals of the framework before making large pull requests with new documentation. 87 | 88 | This documentation uses the [MkDocs](https://mkdocs.readthedocs.io/en/stable/) framework. This package is specified as an optional dependency in the `pyproject.toml` file. To install all optional dev dependencies into your virtual env, run the following command: 89 | 90 | pip install .[dev] 91 | 92 | The [MkDocs](https://mkdocs.readthedocs.io/en/stable/) documentation itself is comprehensive. MkDocs provides some additional context and help around [writing with markdown](https://mkdocs.readthedocs.io/en/stable/user-guide/writing-your-docs/#writing-with-markdown). 93 | 94 | If all you want to do is make a few edits right away, the documentation content is in the `/docs` directory and consists of [Markdown](https://www.markdownguide.org/) files, which can be edited with any text editor. 95 | 96 | The most efficient way to work is to run a MkDocs livereload server in the background. This will launch a local web server on your dev machine, serve the docs (by default at `http://localhost:8000`), and automatically reload the page after you save any changes to the documentation files. 97 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to tantivy-py 2 | 3 | tantivy-py is a wrapper for the [tantivy](https://github.com/quickwit-oss/tantivy) full-text search engine, which is inspired by Apache Lucene. 4 | 5 | tantivy-py is [licensed](https://github.com/quickwit-oss/tantivy-py/blob/master/LICENSE) under the [MIT License](https://www.tldrlegal.com/license/mit-license). 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to tantivy-py
2 | 
3 | tantivy-py is a wrapper for the [tantivy](https://github.com/quickwit-oss/tantivy) full-text search engine, which is inspired by Apache Lucene.
4 | 
5 | tantivy-py is [licensed](https://github.com/quickwit-oss/tantivy-py/blob/master/LICENSE) under the [MIT License](https://www.tldrlegal.com/license/mit-license).
6 | 
7 | ## Important links
8 | 
9 | - [tantivy-py code repository](https://github.com/quickwit-oss/tantivy-py)
10 | - [tantivy code repository](https://github.com/quickwit-oss/tantivy)
11 | - [tantivy Documentation](https://docs.rs/crate/tantivy/latest)
12 | - [tantivy query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query)
13 | 
14 | ## How to use this documentation
15 | 
16 | This documentation uses the [Diátaxis](https://diataxis.fr/) framework. The following sections are clearly separated:
17 | 
18 | - [Tutorials](tutorials.md): when you want to learn
19 | - [How-to Guides](howto.md): when you need to accomplish a task
20 | - [Explanation](explanation.md): when you need a broader understanding and the thinking behind why certain things are set up in a particular way
21 | - [Reference](reference.md): when you need precise, detailed information
22 | 
23 | 
--------------------------------------------------------------------------------
/docs/reference.md:
--------------------------------------------------------------------------------
1 | # Reference
2 | 
3 | ## Setup
4 | 
5 | We'll use a test index for the examples that follow.
6 | 
7 | ```python
8 | import os
9 | from tantivy import SchemaBuilder, Index, Document
10 | schema = (
11 |     SchemaBuilder()
12 |     .add_integer_field("doc_id", indexed=True, stored=True)
13 |     .add_text_field("title", stored=True)
14 |     .add_text_field("body")
15 |     .build()
16 | )
17 | index = Index(schema=schema, path=None)
18 | writer = index.writer(heap_size=15_000_000, num_threads=1)
19 | doc = Document()
20 | doc.add_integer("doc_id", 1)
21 | doc.add_text("title", "The Old Man and the Sea")
22 | doc.add_text(
23 |     "body",
24 |     (
25 |         "He was an old man who fished alone in a skiff in "
26 |         "the Gulf Stream and he had gone eighty-four days "
27 |         "now without taking a fish."
28 |     ),
29 | )
30 | writer.add_document(doc)
31 | 
32 | doc = Document()
33 | doc.add_integer("doc_id", 2)
34 | doc.add_text("title", "The Old Man and the Sea II")
35 | doc.add_text("body", "He was an old man who sailed alone.")
36 | 
37 | writer.add_document(doc)
38 | writer.commit()
39 | index.reload()
40 | ```
41 | 
42 | ## Valid Query Formats
43 | 
44 | tantivy-py supports the [query language](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html#method.parse_query) used in tantivy.
45 | Below, a few basic query formats are shown:
46 | 
47 | - AND and OR conjunctions.
48 | ```python
49 | searcher = index.searcher()
50 | query = index.parse_query('(Old AND Man) OR Stream', ["title", "body"])
51 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
52 | best_doc = searcher.doc(best_doc_address)
53 | ```
54 | 
55 | - `+` (include) and `-` (exclude) operators.
56 | ```python
57 | query = index.parse_query('+Old +Man chef -fished', ["title", "body"])
58 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
59 | best_doc = searcher.doc(best_doc_address)
60 | ```
61 | Note: in a query like the one above, a word with no `+`/`-` prefix acts like an OR.
62 | 
63 | - phrase search.
64 | ```python
65 | query = index.parse_query('"eighty-four days"', ["title", "body"])
66 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
67 | best_doc = searcher.doc(best_doc_address)
68 | ```
69 | 
70 | - integer search
71 | ```python
72 | query = index.parse_query('1', ["doc_id"])
73 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0]
74 | best_doc = searcher.doc(best_doc_address)
75 | ```
76 | Note: for integer search, the integer field should be indexed.
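
- field-scoped terms. Restricting a term to a single field inline is also
  part of the upstream tantivy query language (a brief illustrative example,
  using the same index as above):
```python
query = index.parse_query('title:Sea AND body:fished', ["title", "body"])
(best_score, best_doc_address) = searcher.search(query, 3).hits[0]
best_doc = searcher.doc(best_doc_address)
```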
77 | 
78 | For more query formats and query options, see the [Tantivy Query Parser Docs](https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html).
79 | 
80 | ## Escape quotes inside a query string
81 | 
82 | The tantivy docs for the query parser say that special characters like quotes can be
83 | escaped inside query values. However, it will also be necessary to surround
84 | the search query in additional quotes, as if a phrase query were being used.
85 | 
86 | The following will NOT work:
87 | 
88 | ```python
89 | try:
90 |     index.parse_query(r'sea\"', ["title", "body"])
91 | except ValueError as e:
92 |     assert str(e) == r'Syntax Error: sea\"'
93 | ```
94 | 
95 | However, the following will succeed:
96 | 
97 | ```python
98 | # Works!
99 | index.parse_query(r'"sea\""', ["title", "body"])
100 | ```
101 | 
102 | Note that whether the included (and escaped) quote actually gets used
103 | to match documents depends on the tokenizer used for the field. For example,
104 | the default tokenizer will not match the document "sea\"s" with the query
105 | "sea\"", because this tokenizer discards punctuation.
106 | 
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | mkdocs==1.4.3
2 | mktestdocs==0.2.1
--------------------------------------------------------------------------------
/docs/tutorials.md:
--------------------------------------------------------------------------------
1 | # Tutorials
2 | 
3 | ## Building an index and populating it
4 | 
5 | ```python
6 | import tempfile
7 | import pathlib
8 | import tantivy
9 | 
10 | # Declaring our schema.
11 | schema_builder = tantivy.SchemaBuilder()
12 | schema_builder.add_text_field("title", stored=True)
13 | schema_builder.add_text_field("body", stored=True)
14 | schema_builder.add_integer_field("doc_id", stored=True)
15 | schema = schema_builder.build()
16 | 
17 | # Creating our index (in memory)
18 | index = tantivy.Index(schema)
19 | ```
20 | 
21 | To have a persistent index, use the path
22 | parameter to store the index on the disk, e.g:
23 | 
24 | ```python
25 | tmpdir = tempfile.TemporaryDirectory()
26 | index_path = pathlib.Path(tmpdir.name) / "index"
27 | index_path.mkdir()
28 | persistent_index = tantivy.Index(schema, path=str(index_path))
29 | ```
30 | 
31 | By default, tantivy offers the following tokenizers,
32 | which can be used in tantivy-py:
33 | - `default`
34 | `default` is the tokenizer that will be used if you do not
35 | assign a specific tokenizer to your text field.
36 | It will chop your text on punctuation and whitespace,
37 | remove tokens that are longer than 40 characters, and lowercase your text.
38 | 
39 | - `raw`
40 | Does not actually tokenize your text. It keeps it entirely unprocessed.
41 | It can be useful for indexing uuids or urls, for instance.
42 | 
43 | - `en_stem`
44 | 
45 | In addition to what `default` does, the `en_stem` tokenizer also
46 | applies stemming to your tokens. Stemming consists of trimming words to
Stemming consists in trimming words to 47 | remove their inflection. This tokenizer is slower than the default one, 48 | but is recommended to improve recall. 49 | 50 | to use the above tokenizers, simply provide them as a parameter to `add_text_field`. e.g. 51 | ```python 52 | schema_builder_tok = tantivy.SchemaBuilder() 53 | schema_builder_tok.add_text_field("body", stored=True, tokenizer_name='en_stem') 54 | ``` 55 | 56 | ## Adding one document. 57 | 58 | ```python 59 | writer = index.writer() 60 | writer.add_document(tantivy.Document( 61 | doc_id=1, 62 | title=["The Old Man and the Sea"], 63 | body=["""He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."""], 64 | )) 65 | # ... and committing 66 | writer.commit() 67 | writer.wait_merging_threads() 68 | ``` 69 | 70 | Note that `wait_merging_threads()` must come at the end, because 71 | the `writer` object will not be usable after this call. 72 | 73 | ## Building and Executing Queries with the Query Parser 74 | 75 | With the Query Parser, you can easily build simple queries for your index. 76 | 77 | First you need to get a searcher for the index 78 | 79 | ```python 80 | # Reload the index to ensure it points to the last commit. 81 | index.reload() 82 | searcher = index.searcher() 83 | ``` 84 | 85 | Then you need to get a valid query object by parsing your query on the index. 86 | 87 | ```python 88 | query = index.parse_query("fish days", ["title", "body"]) 89 | (best_score, best_doc_address) = searcher.search(query, 3).hits[0] 90 | best_doc = searcher.doc(best_doc_address) 91 | assert best_doc["title"] == ["The Old Man and the Sea"] 92 | ``` 93 | 94 | The `parse_query` method takes in a query string (visit [reference](reference.md#valid-query-formats) for more details on the syntax) and create a `Query` object that can be used to search the index. 95 | 96 | In Tantivy, hit documents during search will return a `DocAddress` object that can be used to retrieve the document from the searcher, rather than returning the document directly. 97 | 98 | ## Building and Executing Queries with Query Objects 99 | 100 | > *This is an advanced topic. Only consider this if you need very fine-grained control over your queries, or existing query parsers do not meet your needs.* 101 | 102 | If you have a Lucene / ElasticSearch background, you might be more comfortable building nested queries programmatically. Also, some queries (e.g. ConstQuery, DisjunctionMaxQuery) are not supported by the query parser due to their complexity in expression. 103 | 104 | Consider the following query in ElasticSearch: 105 | 106 | ```json 107 | { 108 | "query": { 109 | "bool": { 110 | "must": [ 111 | { 112 | "dis_max": { 113 | "queries": [ 114 | { 115 | "match": { 116 | "title": { 117 | "query": "fish", 118 | "boost": 2 119 | } 120 | } 121 | }, 122 | { 123 | "match": { 124 | "body": { 125 | "query": "eighty-four days", 126 | "boost": 1.5 127 | } 128 | } 129 | } 130 | ], 131 | "tie_breaker": 0.3 132 | } 133 | } 134 | ] 135 | } 136 | } 137 | } 138 | ``` 139 | 140 | It is impossible to express this query using the query parser. Instead, you can build the query programmatically mixing with the query parser: 141 | 142 | ```python 143 | from tantivy import Query, Occur, Index 144 | 145 | ... 
171 | 
172 | 
173 | 
174 | ## Using the snippet generator
175 | 
176 | Let's revisit the query `"fish days"` in our [example](#building-and-executing-queries-with-the-query-parser):
177 | 
178 | ```python
179 | hit_text = best_doc["body"][0]
180 | print(f"{hit_text=}")
181 | assert hit_text == (
182 |     "He was an old man who fished alone in a skiff in the "
183 |     "Gulf Stream and he had gone eighty-four days now "
184 |     "without taking a fish."
185 | )
186 | 
187 | from tantivy import SnippetGenerator
188 | snippet_generator = SnippetGenerator.create(
189 |     searcher, query, schema, "body"
190 | )
191 | snippet = snippet_generator.snippet_from_doc(best_doc)
192 | ```
193 | 
194 | The snippet object provides the hit ranges. These are the marker
195 | offsets in the text that match the query.
196 | 
197 | ```python
198 | highlights = snippet.highlighted()
199 | first_highlight = highlights[0]
200 | assert first_highlight.start == 93
201 | assert first_highlight.end == 97
202 | assert hit_text[first_highlight.start:first_highlight.end] == "days"
203 | ```
204 | 
205 | The snippet object can also generate a marked-up HTML snippet:
206 | 
207 | ```python
208 | html_snippet = snippet.to_html()
209 | assert html_snippet == (
210 |     "He was an old man who fished alone in a skiff in the "
211 |     "Gulf Stream and he had gone eighty-four <b>days</b> now "
212 |     "without taking a <b>fish</b>"
213 | )
214 | ```
215 | 
216 | 
217 | ## Create a Custom Tokenizer (Text Analyzer)
218 | 
219 | Tantivy provides several built-in tokenizers and filters that
220 | can be chained together to create new tokenizers (or
221 | 'text analyzers') that better fit your needs.
222 | 
223 | Tantivy-py lets you access these components, assemble them,
224 | and register the result with an index.
225 | 
226 | Let's walk through creating and registering a custom text analyzer
227 | to see how everything fits together.
228 | 
229 | ### Example
230 | 
231 | First, let's create a text analyzer. As explained further down,
232 | a text analyzer is a pipeline consisting of one tokenizer and
233 | any number of token filters.
234 | 
235 | ```python
236 | from tantivy import (
237 |     TextAnalyzer,
238 |     TextAnalyzerBuilder,
239 |     Tokenizer,
240 |     Filter,
241 |     Index,
242 |     SchemaBuilder
243 | )
244 | 
245 | my_analyzer: TextAnalyzer = (
246 |     TextAnalyzerBuilder(
247 |         # Create a `Tokenizer` instance.
248 |         # It instructs the builder about which type of tokenizer
249 |         # to create internally and with which arguments.
250 |         Tokenizer.regex(r"(?i)([a-z]+)")
251 |     )
252 |     .filter(
253 |         # Create a `Filter` instance.
254 |         # Like `Tokenizer`, this object provides instructions
255 |         # to the builder.
256 |         Filter.lowercase()
257 |     )
258 |     .filter(
259 |         # Define custom stopwords.
260 |         Filter.custom_stopword(["www", "com"])
261 |     )
262 |     # Finally, build a TextAnalyzer,
263 |     # chaining all tokenizer > [filter, ...] steps together.
264 |     .build()
265 | )
266 | ```
267 | 
268 | We can check that our new analyzer is working as expected
269 | by passing some text to its `.analyze()` method.
270 | 
271 | ```python
272 | # Prints: ['this', 'website', 'might', 'exist']
273 | print(my_analyzer.analyze('www.this1website1might1exist.com'))
274 | ```
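
Note that filters run in the order they were added: because
`Filter.lowercase()` comes before the stopword filter, uppercase variants of
the stopwords are also removed. An illustrative check (the input string is
made up):

```python
# "WWW" and "COM" are lowercased first, then dropped as stopwords.
print(my_analyzer.analyze('WWW.Example.COM'))  # ['example']
```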
275 | 
276 | The next step is to register our analyzer with an index. Let's
277 | assume we already have one.
278 | 
279 | ```python
280 | index.register_tokenizer("custom_analyzer", my_analyzer)
281 | ```
282 | 
283 | To link an analyzer to a field in the index, pass the
284 | analyzer name to the `tokenizer_name=` parameter of
285 | the `SchemaBuilder`'s `add_text_field()` method.
286 | 
287 | Here is the schema that was used to construct our index:
288 | 
289 | ```python
290 | schema = (
291 |     tantivy.SchemaBuilder()
292 |     .add_text_field("content", tokenizer_name="custom_analyzer")
293 |     .build()
294 | )
295 | index = Index(schema)
296 | ```
297 | 
298 | Summary:
299 | 
300 | 1. Use `TextAnalyzerBuilder`, `Tokenizer`, and `Filter` to build a `TextAnalyzer`.
301 | 2. The analyzer's `.analyze()` method lets you use your analyzer as a tokenizer from Python.
302 | 3. Refer to your analyzer's name when building the index schema.
303 | 4. Use the same name when registering your analyzer on the index.
304 | 
305 | 
306 | ### On terminology: Tokenizer vs. Text Analyzer
307 | 
308 | Tantivy-py mimics Tantivy's interface as closely as possible.
309 | This includes minor terminological inconsistencies, one of
310 | which is how Tantivy distinguishes between 'tokenizers' and
311 | 'text analyzers'.
312 | 
313 | Quite simply, a 'tokenizer' segments text into tokens.
314 | A 'text analyzer' is a pipeline consisting of one tokenizer
315 | and zero or more token filters. The `TextAnalyzer` is the
316 | primary object of interest when talking about how to
317 | change Tantivy's tokenization behavior.
318 | 
319 | Slightly confusingly, though, the `Index` and `SchemaBuilder`
320 | interfaces use 'tokenizer' to mean 'text analyzer'.
321 | 
322 | This inconsistency can be observed in `SchemaBuilder.add_text_field`, e.g. --
323 | 
324 | ```
325 | SchemaBuilder.add_text_field(..., tokenizer_name=<analyzer_name>)
326 | ```
327 | 
328 | -- and in the name of the `Index.register_tokenizer(...)` method, which actually
329 | serves to register a *text analyzer*.
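
Putting the pieces together, here is an end-to-end sketch. The field name,
document contents, and query are illustrative only:

```python
import tantivy
from tantivy import TextAnalyzerBuilder, Tokenizer, Filter

# Schema refers to the analyzer by name.
schema = (
    tantivy.SchemaBuilder()
    .add_text_field("content", stored=True, tokenizer_name="custom_analyzer")
    .build()
)
index = tantivy.Index(schema)

analyzer = (
    TextAnalyzerBuilder(Tokenizer.regex(r"(?i)([a-z]+)"))
    .filter(Filter.lowercase())
    .filter(Filter.custom_stopword(["www", "com"]))
    .build()
)
# The registered name must match the tokenizer_name used in the schema.
index.register_tokenizer("custom_analyzer", analyzer)

writer = index.writer()
writer.add_document(tantivy.Document(content=["www.some1website.com"]))
writer.commit()
writer.wait_merging_threads()

index.reload()
searcher = index.searcher()
query = index.parse_query("website", ["content"])
assert searcher.search(query, 1).hits  # "website" was indexed; "www" was not
```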
330 | 
331 | 
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: tantivy-py
2 | # site_url: https://example.com
3 | nav:
4 |   - Home: index.md
5 |   - Tutorials: tutorials.md
6 |   - How-to Guides: howto.md
7 |   - Explanation: explanation.md
8 |   - Reference: reference.md
9 |   - About: about.md
10 | theme: readthedocs
11 | 
12 | # Can nest documents under above sections
13 | #  - 'User Guide':
14 | #      - 'Writing your docs': 'writing-your-docs.md'
15 | #      - 'Styling your docs': 'styling-your-docs.md'
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
1 | import nox
2 | 
3 | 
4 | @nox.session(python=["3.9", "3.10", "3.11", "3.12", "3.13"])
5 | def test(session):
6 |     session.install("-rrequirements-dev.txt")
7 |     session.install("-e", ".", "--no-build-isolation")
8 |     session.run("pytest", *session.posargs)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["maturin<=1.3.2"]
3 | build-backend = "maturin"
4 | 
5 | [project]
6 | name = "tantivy"
7 | version = "0.24.0"
8 | description = "Official Python bindings for the Tantivy search engine"
9 | requires-python = ">=3.9"
10 | authors = [
11 |     { name = "Damir Jelić", email="poljar@termina.org.uk" },
12 |     { name = "Caleb Hattingh", email = "code@cjrh.info" },
13 |     { name = "Cam Parry", email = "cam.parry@kapiche.com"}
14 | ]
15 | optional-dependencies = { dev = ["nox"] }
16 | 
17 | [tool.maturin]
18 | bindings = "pyo3"
19 | 
20 | [tool.pytest.ini_options]
21 | # Set the durations option and doctest modules
22 | # See https://docs.pytest.org/en/latest/usage.html#durations
23 | addopts = "--doctest-modules --durations=10"
24 | # Use the `--ignore-glob` setting to exclude the `noxfile.py` module from the doctests
25 | # See https://docs.pytest.org/en/latest/reference.html#confval-ignore_glob
26 | testpaths = [
27 |     "tests",
28 |     "tantivy",
29 |     "src",
30 | ]
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | maturin
2 | pytest>=4.0
3 | mktestdocs==0.2.1
--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "stable"
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 80
--------------------------------------------------------------------------------
/src/facet.rs:
--------------------------------------------------------------------------------
1 | use crate::to_pyerr;
2 | use pyo3::{
3 |     basic::CompareOp,
4 |     prelude::*,
5 |     types::{PyTuple, PyType},
6 |     IntoPyObjectExt,
7 | };
8 | use serde::{Deserialize, Serialize};
9 | use tantivy::schema;
10 | 
11 | /// A Facet represents a point in a given hierarchy.
12 | ///
13 | /// They are typically represented similarly to a filepath. For instance, an
14 | /// e-commerce website could have a Facet for /electronics/tv_and_video/led_tv.
15 | ///
16 | /// A document can be associated with any number of facets. The hierarchy
17 | /// implicitly implies that a document belonging to a facet also belongs to
18 | /// the ancestors of that facet. In the example above, the document would also
19 | /// belong to /electronics/tv_and_video/ and /electronics.
20 | #[pyclass(frozen, module = "tantivy.tantivy")]
21 | #[derive(Clone, Deserialize, PartialEq, Serialize)]
22 | pub(crate) struct Facet {
23 |     pub(crate) inner: schema::Facet,
24 | }
25 | 
26 | #[pymethods]
27 | impl Facet {
28 |     /// Creates a `Facet` from its binary representation.
29 |     #[staticmethod]
30 |     fn from_encoded(encoded_bytes: Vec<u8>) -> PyResult<Self> {
31 |         let inner =
32 |             schema::Facet::from_encoded(encoded_bytes).map_err(to_pyerr)?;
33 |         Ok(Self { inner })
34 |     }
35 | 
36 |     /// Create a new instance of the "root facet", equivalent to /.
37 |     #[classmethod]
38 |     fn root(_cls: &Bound<PyType>) -> Facet {
39 |         Facet {
40 |             inner: schema::Facet::root(),
41 |         }
42 |     }
43 | 
44 |     /// Returns true if the facet is the root facet /.
45 |     #[getter]
46 |     fn is_root(&self) -> bool {
47 |         self.inner.is_root()
48 |     }
49 | 
50 |     /// Returns true if another Facet is a subfacet of this facet.
51 |     /// Args:
52 |     ///     other (Facet): The Facet to check; returns true if this facet
53 |     ///         is a prefix of it.
54 |     fn is_prefix_of(&self, other: &Facet) -> bool {
55 |         self.inner.is_prefix_of(&other.inner)
56 |     }
57 | 
58 |     /// Create a Facet object from a string.
59 |     /// Args:
60 |     ///     facet_string (str): The string that contains a facet.
61 |     ///
62 |     /// Returns the created Facet.
63 |     #[classmethod]
64 |     fn from_string(_cls: &Bound<PyType>, facet_string: &str) -> Facet {
65 |         Facet {
66 |             inner: schema::Facet::from(facet_string),
67 |         }
68 |     }
69 | 
70 |     /// Returns the list of `segments` that forms a facet path.
71 |     ///
72 |     /// For instance `/europe/france` becomes `["europe", "france"]`.
73 |     fn to_path(&self) -> Vec<&str> {
74 |         self.inner.to_path()
75 |     }
76 | 
77 |     /// Returns the facet string representation.
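    ///
    /// For example (illustrative): a facet built from `/europe/france` is
    /// rendered back as the string `/europe/france`, whereas `to_path()`
    /// yields `["europe", "france"]`.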
78 |     fn to_path_str(&self) -> String {
79 |         self.inner.to_string()
80 |     }
81 | 
82 |     fn __repr__(&self) -> PyResult<String> {
83 |         Ok(format!("Facet({})", self.to_path_str()))
84 |     }
85 | 
86 |     fn __richcmp__(
87 |         &self,
88 |         other: &Self,
89 |         op: CompareOp,
90 |         py: Python<'_>,
91 |     ) -> PyResult<PyObject> {
92 |         match op {
93 |             CompareOp::Eq => (self == other).into_py_any(py),
94 |             CompareOp::Ne => (self != other).into_py_any(py),
95 |             _ => Ok(py.NotImplemented()),
96 |         }
97 |     }
98 | 
99 |     fn __reduce__<'a>(
100 |         slf: PyRef<'a, Self>,
101 |         py: Python<'a>,
102 |     ) -> PyResult<Bound<'a, PyTuple>> {
103 |         let encoded_bytes = slf.inner.encoded_str().as_bytes().to_vec();
104 |         let deserializer = slf.into_pyobject(py)?.getattr("from_encoded")?;
105 |         PyTuple::new(
106 |             py,
107 |             [deserializer, PyTuple::new(py, [encoded_bytes])?.into_any()],
108 |         )
109 |     }
110 | }
111 | 
--------------------------------------------------------------------------------
/src/index.rs:
--------------------------------------------------------------------------------
1 | #![allow(clippy::new_ret_no_self)]
2 | 
3 | use std::collections::HashMap;
4 | 
5 | use pyo3::{exceptions, prelude::*, types::PyAny};
6 | 
7 | use crate::{
8 |     document::{extract_value, Document},
9 |     get_field,
10 |     parser_error::QueryParserErrorIntoPy,
11 |     query::Query,
12 |     schema::Schema,
13 |     searcher::Searcher,
14 |     to_pyerr,
15 |     tokenizer::TextAnalyzer as PyTextAnalyzer,
16 | };
17 | use tantivy as tv;
18 | use tantivy::{
19 |     directory::MmapDirectory,
20 |     schema::{
21 |         document::TantivyDocument, NamedFieldDocument, OwnedValue as Value,
22 |         Term,
23 |     },
24 |     tokenizer::{
25 |         Language, LowerCaser, RemoveLongFilter, SimpleTokenizer, Stemmer,
26 |         TextAnalyzer,
27 |     },
28 | };
29 | 
30 | const RELOAD_POLICY: &str = "commit";
31 | 
32 | /// IndexWriter is the user entry-point to add documents to the index.
33 | ///
34 | /// To create an IndexWriter, first create an Index and call the writer()
35 | /// method on the index object.
36 | #[pyclass(module = "tantivy.tantivy")]
37 | pub(crate) struct IndexWriter {
38 |     inner_index_writer: Option<tv::IndexWriter>,
39 |     schema: tv::schema::Schema,
40 | }
41 | 
42 | impl IndexWriter {
43 |     fn inner(&self) -> PyResult<&tv::IndexWriter> {
44 |         self.inner_index_writer.as_ref().ok_or_else(|| {
45 |             exceptions::PyRuntimeError::new_err(
46 |                 "IndexWriter was consumed and no longer in a valid state",
47 |             )
48 |         })
49 |     }
50 | 
51 |     fn inner_mut(&mut self) -> PyResult<&mut tv::IndexWriter> {
52 |         self.inner_index_writer.as_mut().ok_or_else(|| {
53 |             exceptions::PyRuntimeError::new_err(
54 |                 "IndexWriter was consumed and no longer in a valid state",
55 |             )
56 |         })
57 |     }
58 | 
59 |     fn take_inner(&mut self) -> PyResult<tv::IndexWriter> {
60 |         self.inner_index_writer.take().ok_or_else(|| {
61 |             exceptions::PyRuntimeError::new_err(
62 |                 "IndexWriter was consumed and no longer in a valid state",
63 |             )
64 |         })
65 |     }
66 | }
67 | 
68 | #[pymethods]
69 | impl IndexWriter {
70 |     /// Add a document to the index.
71 |     ///
72 |     /// If the indexing pipeline is full, this call may block.
73 |     ///
74 |     /// Returns an `opstamp`, which is an increasing integer that can be used
75 |     /// by the client to align commits with its own document queue.
76 |     /// The `opstamp` represents the number of documents that have been added
77 |     /// since the creation of the index.
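    ///
    /// Example (an illustrative Python sketch, not a doctest; it assumes an
    /// `index` whose schema has a "title" text field):
    ///
    ///     writer = index.writer()
    ///     opstamp = writer.add_document(tantivy.Document(title=["Frankenstein"]))
    ///     writer.commit()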
78 |     pub fn add_document(&mut self, doc: &Document) -> PyResult<u64> {
79 |         let named_doc = NamedFieldDocument(doc.field_values.clone());
80 |         let doc = TantivyDocument::convert_named_doc(&self.schema, named_doc)
81 |             .map_err(to_pyerr)?;
82 |         self.inner()?.add_document(doc).map_err(to_pyerr)
83 |     }
84 | 
85 |     /// Helper for the `add_document` method, but passing a json string.
86 |     ///
87 |     /// If the indexing pipeline is full, this call may block.
88 |     ///
89 |     /// Returns an `opstamp`, which is an increasing integer that can be used
90 |     /// by the client to align commits with its own document queue.
91 |     /// The `opstamp` represents the number of documents that have been added
92 |     /// since the creation of the index.
93 |     pub fn add_json(&mut self, json: &str) -> PyResult<u64> {
94 |         let doc = TantivyDocument::parse_json(&self.schema, json)
95 |             .map_err(to_pyerr)?;
96 |         let opstamp = self.inner()?.add_document(doc);
97 |         opstamp.map_err(to_pyerr)
98 |     }
99 | 
100 |     /// Commits all of the pending changes.
101 |     ///
102 |     /// A call to commit blocks. After it returns, all of the documents that
103 |     /// were added since the last commit are published and persisted.
104 |     ///
105 |     /// In case of a crash or a hardware failure (as long as the hard disk is
106 |     /// spared), it will be possible to resume indexing from this point.
107 |     ///
108 |     /// Returns the `opstamp` of the last document that made it in the commit.
109 |     fn commit(&mut self) -> PyResult<u64> {
110 |         self.inner_mut()?.commit().map_err(to_pyerr)
111 |     }
112 | 
113 |     /// Rollback to the last commit.
114 |     ///
115 |     /// This cancels all of the updates that happened after the last
116 |     /// commit. After calling rollback, the index is in the same state as it
117 |     /// was after the last commit.
118 |     fn rollback(&mut self) -> PyResult<u64> {
119 |         self.inner_mut()?.rollback().map_err(to_pyerr)
120 |     }
121 | 
122 |     /// Detects and removes the files that are not used by the index anymore.
123 |     fn garbage_collect_files(&mut self) -> PyResult<()> {
124 |         use futures::executor::block_on;
125 |         block_on(self.inner()?.garbage_collect_files()).map_err(to_pyerr)?;
126 |         Ok(())
127 |     }
128 | 
129 |     /// Deletes all documents from the index.
130 |     fn delete_all_documents(&mut self) -> PyResult<()> {
131 |         self.inner()?.delete_all_documents().map_err(to_pyerr)?;
132 |         Ok(())
133 |     }
134 | 
135 |     /// The opstamp of the last successful commit.
136 |     ///
137 |     /// This is the opstamp the index will roll back to if there is a failure
138 |     /// like a power surge.
139 |     ///
140 |     /// This is also the opstamp of the commit that is currently available
141 |     /// for searchers.
142 |     #[getter]
143 |     fn commit_opstamp(&self) -> PyResult<u64> {
144 |         Ok(self.inner()?.commit_opstamp())
145 |     }
146 | 
147 |     #[deprecated(
148 |         note = "This method is deprecated and will be removed in the future. Use either delete_documents_by_term, or delete_documents_by_query."
149 |     )]
150 |     fn delete_documents(
151 |         &mut self,
152 |         field_name: &str,
153 |         field_value: &Bound<PyAny>,
154 |     ) -> PyResult<u64> {
155 |         self.delete_documents_by_term(field_name, field_value)
156 |     }
157 | 
158 |     /// Delete all documents containing a given term.
159 |     ///
160 |     /// This method does not parse the given term and it expects the term to be
161 |     /// already tokenized according to any tokenizers attached to the field. This
162 |     /// can often result in surprising behaviour.
For example, if you want to store
163 | /// UUIDs as text in a field, and those values have hyphens, and you use the
164 | /// default tokenizer which removes punctuation, you will not be able to delete
165 | /// a document added with a particular UUID by passing the same UUID to this
166 | /// method. In such workflows where deletions are required, particularly with
167 | /// string values, it is strongly recommended to use the
168 | /// "raw" tokenizer as this will match exactly. In situations where you do
169 | /// want tokenization to be applied, it is recommended to use the
170 | /// `delete_documents_by_query` method instead, which will delete documents
171 | /// matching the given query using the same query parser as used in search queries.
172 | ///
173 | /// Args:
174 | /// field_name (str): The field name for which we want to filter deleted docs.
175 | /// field_value (PyAny): Python object with the value we want to filter.
176 | ///
177 | /// If the field_name is not on the schema raises ValueError exception.
178 | /// If the field_value is not supported raises Exception.
179 | fn delete_documents_by_term(
180 | &mut self,
181 | field_name: &str,
182 | field_value: &Bound<PyAny>,
183 | ) -> PyResult<u64> {
184 | let field = get_field(&self.schema, field_name)?;
185 | let value = extract_value(field_value)?;
186 | let term = match value {
187 | Value::Null => {
188 | return Err(exceptions::PyValueError::new_err(format!(
189 | "Field `{field_name}` is null type not deletable."
190 | )))
191 | },
192 | Value::Str(text) => Term::from_field_text(field, &text),
193 | Value::U64(num) => Term::from_field_u64(field, num),
194 | Value::I64(num) => Term::from_field_i64(field, num),
195 | Value::F64(num) => Term::from_field_f64(field, num),
196 | Value::Date(d) => Term::from_field_date(field, d),
197 | Value::Facet(facet) => Term::from_facet(field, &facet),
198 | Value::Bytes(_) => {
199 | return Err(exceptions::PyValueError::new_err(format!(
200 | "Field `{field_name}` is bytes type not deletable."
201 | )))
202 | }
203 | Value::PreTokStr(_pretok) => {
204 | return Err(exceptions::PyValueError::new_err(format!(
205 | "Field `{field_name}` is pretokenized. This is not authorized for delete."
206 | )))
207 | }
208 | Value::Array(_) => {
209 | return Err(exceptions::PyValueError::new_err(format!(
210 | "Field `{field_name}` is array type not deletable."
211 | )))
212 | }
213 | Value::Object(_) => {
214 | return Err(exceptions::PyValueError::new_err(format!(
215 | "Field `{field_name}` is json object type not deletable."
216 | )))
217 | },
218 | Value::Bool(b) => Term::from_field_bool(field, b),
219 | Value::IpAddr(i) => Term::from_field_ip_addr(field, i)
220 | };
221 | Ok(self.inner()?.delete_term(term))
222 | }
223 |
224 | /// Delete all documents matching a given query.
225 | ///
226 | /// Example:
227 | ///
228 | /// ```python
229 | /// schema_builder = SchemaBuilder()
230 | /// schema_builder.add_text_field("title", fast=True)
231 | /// schema = schema_builder.build()
232 | /// index = Index(schema)
233 | /// writer = index.writer()
234 | /// source_doc = {
235 | /// "title": "Here is some text"
236 | /// }
237 | /// writer.add_json(json.dumps(source_doc))
238 | /// writer.commit()
239 | /// writer.wait_merging_threads()
240 | ///
241 | /// query = index.parse_query("title:text")
242 | /// writer = index.writer()
243 | /// writer.delete_documents_by_query(query)
244 | /// writer.commit()
245 | /// writer.wait_merging_threads()
246 | /// ```
247 | ///
248 | /// Args:
249 | /// query (Query): The query to filter the deleted documents.
250 | ///
251 | /// If the query is not valid raises ValueError exception.
252 | /// If the query is not supported raises Exception.
253 | fn delete_documents_by_query(&mut self, query: &Query) -> PyResult<u64> {
254 | self.inner()?
255 | .delete_query(query.inner.box_clone())
256 | .map_err(to_pyerr)
257 | }
258 |
259 | /// If there are some merging threads, blocks until they all finish
260 | /// their work and then drops the `IndexWriter`.
261 | ///
262 | /// This will consume the `IndexWriter`. Further accesses to the
263 | /// object will result in an error.
264 | pub fn wait_merging_threads(&mut self) -> PyResult<()> {
265 | self.take_inner()?.wait_merging_threads().map_err(to_pyerr)
266 | }
267 | }
268 |
269 | /// Create a new index object.
270 | ///
271 | /// Args:
272 | /// schema (Schema): The schema of the index.
273 | /// path (str, optional): The path where the index should be stored. If
274 | /// no path is provided, the index will be stored in memory.
275 | /// reuse (bool, optional): Should we open an existing index if one exists
276 | /// or always create a new one.
277 | ///
278 | /// If an index already exists it will be opened and reused. Raises OSError
279 | /// if there was a problem during the opening or creation of the index.
280 | #[pyclass(module = "tantivy.tantivy")]
281 | pub(crate) struct Index {
282 | pub(crate) index: tv::Index,
283 | reader: tv::IndexReader,
284 | }
285 |
286 | #[pymethods]
287 | impl Index {
288 | #[staticmethod]
289 | fn open(path: &str) -> PyResult<Index> {
290 | let index = tv::Index::open_in_dir(path).map_err(to_pyerr)?;
291 |
292 | Index::register_custom_text_analyzers(&index);
293 |
294 | let reader = index.reader().map_err(to_pyerr)?;
295 | Ok(Index { index, reader })
296 | }
297 |
298 | #[new]
299 | #[pyo3(signature = (schema, path = None, reuse = true))]
300 | fn new(schema: &Schema, path: Option<&str>, reuse: bool) -> PyResult<Index> {
301 | let index = match path {
302 | Some(p) => {
303 | let directory = MmapDirectory::open(p).map_err(to_pyerr)?;
304 | if reuse {
305 | tv::Index::open_or_create(directory, schema.inner.clone())
306 | } else {
307 | tv::Index::create(
308 | directory,
309 | schema.inner.clone(),
310 | tv::IndexSettings::default(),
311 | )
312 | }
313 | .map_err(to_pyerr)?
314 | }
315 | None => tv::Index::create_in_ram(schema.inner.clone()),
316 | };
317 |
318 | Index::register_custom_text_analyzers(&index);
319 |
320 | let reader = index.reader().map_err(to_pyerr)?;
321 | Ok(Index { index, reader })
322 | }
323 |
324 | /// Create an `IndexWriter` for the index.
325 | ///
326 | /// The writer will be multithreaded and the provided heap size will be
327 | /// split between the given number of threads.
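/// Example (a minimal sketch; `schema` stands for an already built
/// Schema object):
///
/// ```python
/// index = Index(schema)
/// # Both arguments are optional; shown here with explicit values.
/// writer = index.writer(heap_size=15_000_000, num_threads=1)
/// ```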
328 | ///
329 | /// Args:
330 | /// heap_size (int, optional): The total target heap memory usage of
331 | /// the writer. Tantivy requires that this can't be less
332 | /// than 3000000 *per thread*. Lower values will result in more
333 | /// frequent internal commits when adding documents (slowing down
334 | /// write progress), and larger values will result in fewer
335 | /// commits but greater memory usage. The best value will depend
336 | /// on your specific use case.
337 | /// num_threads (int, optional): The number of threads that the writer
338 | /// should use. If this value is 0, tantivy will automatically
339 | /// choose the number of threads.
340 | ///
341 | /// Raises ValueError if there was an error while creating the writer.
342 | #[pyo3(signature = (heap_size = 128_000_000, num_threads = 0))]
343 | fn writer(
344 | &self,
345 | heap_size: usize,
346 | num_threads: usize,
347 | ) -> PyResult<IndexWriter> {
348 | let writer = match num_threads {
349 | 0 => self.index.writer(heap_size),
350 | _ => self.index.writer_with_num_threads(num_threads, heap_size),
351 | }
352 | .map_err(to_pyerr)?;
353 | let schema = self.index.schema();
354 | Ok(IndexWriter {
355 | inner_index_writer: Some(writer),
356 | schema,
357 | })
358 | }
359 |
360 | /// Configure the index reader.
361 | ///
362 | /// Args:
363 | /// reload_policy (str, optional): The reload policy that the
364 | /// IndexReader should use. Can be `Manual` or `OnCommit`.
365 | /// num_warmers (int, optional): The number of searchers that the
366 | /// reader should create.
367 | #[pyo3(signature = (reload_policy = RELOAD_POLICY, num_warmers = 0))]
368 | fn config_reader(
369 | &mut self,
370 | reload_policy: &str,
371 | num_warmers: usize,
372 | ) -> Result<(), PyErr> {
373 | let reload_policy = reload_policy.to_lowercase();
374 | let reload_policy = match reload_policy.as_ref() {
375 | "commit" => tv::ReloadPolicy::OnCommitWithDelay,
376 | "on-commit" => tv::ReloadPolicy::OnCommitWithDelay,
377 | "oncommit" => tv::ReloadPolicy::OnCommitWithDelay,
378 | "manual" => tv::ReloadPolicy::Manual,
379 | _ => return Err(exceptions::PyValueError::new_err(
380 | "Invalid reload policy, valid choices are: 'manual' and 'OnCommit'"
381 | ))
382 | };
383 | let builder = self.index.reader_builder();
384 | let builder = builder.reload_policy(reload_policy);
385 | let builder = if num_warmers > 0 {
386 | builder.num_warming_threads(num_warmers)
387 | } else {
388 | builder
389 | };
390 |
391 | self.reader = builder.try_into().map_err(to_pyerr)?;
392 | Ok(())
393 | }
394 |
395 | /// Returns a searcher
396 | ///
397 | /// This method should be called every single time a search query is performed.
398 | /// The same searcher must be used for a given query, as it ensures the use of a consistent segment set.
399 | fn searcher(&self) -> Searcher {
400 | Searcher {
401 | inner: self.reader.searcher(),
402 | }
403 | }
404 |
405 | /// Check if the given path contains an existing index.
406 | /// Args:
407 | /// path: The path where tantivy will search for an index.
408 | ///
409 | /// Returns True if an index exists at the given path, False otherwise.
410 | ///
411 | /// Raises OSError if the directory cannot be opened.
412 | #[staticmethod]
413 | fn exists(path: &str) -> PyResult<bool> {
414 | let directory = MmapDirectory::open(path).map_err(to_pyerr)?;
415 | tv::Index::exists(&directory).map_err(to_pyerr)
416 | }
417 |
418 | /// The schema of the current index.
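/// Example (a minimal sketch):
///
/// ```python
/// schema = index.schema  # accessed as a property, not called
/// ```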
419 | #[getter]
420 | fn schema(&self) -> Schema {
421 | let schema = self.index.schema();
422 | Schema { inner: schema }
423 | }
424 |
425 | /// Update searchers so that they reflect the state of the last .commit().
426 | ///
427 | /// If you set up the reload policy to be on 'commit' (which is the
428 | /// default) every commit should be rapidly reflected on your IndexReader
429 | /// and you should not need to call reload() at all.
430 | fn reload(&self) -> PyResult<()> {
431 | self.reader.reload().map_err(to_pyerr)
432 | }
433 |
434 | /// Parse a query
435 | ///
436 | /// Args:
437 | /// query: the query, following the tantivy query language.
438 | ///
439 | /// default_field_names (List[str]): A list of field names used to search if no
440 | /// field is specified in the query.
441 | ///
442 | /// field_boosts: A dictionary keyed on field names which provides default boosts
443 | /// for the query constructed by this method.
444 | ///
445 | /// fuzzy_fields: A dictionary keyed on field names which provides (prefix, distance, transpose_cost_one)
446 | /// triples making queries constructed by this method fuzzy against the given fields
447 | /// and using the given parameters.
448 | /// `prefix` determines if terms which are prefixes of the given term match the query.
449 | /// `distance` determines the maximum Levenshtein distance between terms matching the query and the given term.
450 | /// `transpose_cost_one` determines if transpositions of neighbouring characters are counted only once against the Levenshtein distance.
451 | #[pyo3(signature = (query, default_field_names = None, field_boosts = HashMap::new(), fuzzy_fields = HashMap::new()))]
452 | pub fn parse_query(
453 | &self,
454 | query: &str,
455 | default_field_names: Option<Vec<String>>,
456 | field_boosts: HashMap<String, f32>,
457 | fuzzy_fields: HashMap<String, (bool, u8, bool)>,
458 | ) -> PyResult<Query> {
459 | let parser = self.prepare_query_parser(
460 | default_field_names,
461 | field_boosts,
462 | fuzzy_fields,
463 | )?;
464 |
465 | let query = parser.parse_query(query).map_err(to_pyerr)?;
466 |
467 | Ok(Query { inner: query })
468 | }
469 |
470 | /// Parse a query leniently.
471 | ///
472 | /// This variant parses invalid queries on a best-effort basis. If some part of the query can't
473 | /// reasonably be executed (a range query without a field, searching on a non-existing field,
474 | /// searching without specifying a field when no default field is provided...), it may get turned
475 | /// into a "match-nothing" subquery.
476 | ///
477 | /// Args:
478 | /// query: the query, following the tantivy query language.
479 | ///
480 | /// default_field_names (List[str]): A list of field names used to search if no
481 | /// field is specified in the query.
482 | ///
483 | /// field_boosts: A dictionary keyed on field names which provides default boosts
484 | /// for the query constructed by this method.
485 | ///
486 | /// fuzzy_fields: A dictionary keyed on field names which provides (prefix, distance, transpose_cost_one)
487 | /// triples making queries constructed by this method fuzzy against the given fields
488 | /// and using the given parameters.
489 | /// `prefix` determines if terms which are prefixes of the given term match the query.
490 | /// `distance` determines the maximum Levenshtein distance between terms matching the query and the given term.
491 | /// `transpose_cost_one` determines if transpositions of neighbouring characters are counted only once against the Levenshtein distance.
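/// Example (a minimal sketch; assumes an index whose schema has an
/// indexed text field named "title"):
///
/// ```python
/// query, errors = index.parse_query_lenient(
///     "title:hllo",
///     fuzzy_fields={"title": (False, 1, True)},
/// )
/// # With an edit distance of 1, "hllo" can still match "hello".
/// ```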
492 | ///
493 | /// Returns a tuple containing the parsed query and a list of errors.
494 | ///
495 | /// Raises ValueError if a field in `default_field_names` is not defined or marked as indexed.
496 | #[pyo3(signature = (query, default_field_names = None, field_boosts = HashMap::new(), fuzzy_fields = HashMap::new()))]
497 | pub fn parse_query_lenient(
498 | &self,
499 | query: &str,
500 | default_field_names: Option<Vec<String>>,
501 | field_boosts: HashMap<String, f32>,
502 | fuzzy_fields: HashMap<String, (bool, u8, bool)>,
503 | py: Python,
504 | ) -> PyResult<(Query, Vec<PyObject>)> {
505 | let parser = self.prepare_query_parser(
506 | default_field_names,
507 | field_boosts,
508 | fuzzy_fields,
509 | )?;
510 |
511 | let (query, errors) = parser.parse_query_lenient(query);
512 | let errors = errors
513 | .into_iter()
514 | .map(|err| err.into_py(py))
515 | // This is a rust idiom, but just in case you're not familiar
516 | // with it, we're converting from an iterator of PyResult<PyObject>
517 | // into a PyResult<Vec<PyObject>>, by specifying the `PyResult`
518 | // on the outside of the turbofish type signature.
519 | .collect::<PyResult<Vec<_>>>()?;
520 |
521 | Ok((Query { inner: query }, errors))
522 | }
523 |
524 | /// Register a custom text analyzer by name. (Confusingly,
525 | /// this is one of the places where Tantivy uses 'tokenizer' to refer to a
526 | /// TextAnalyzer instance.)
527 | ///
528 | // Implementation notes: Skipped indirection of TokenizerManager.
529 | pub fn register_tokenizer(&self, name: &str, analyzer: PyTextAnalyzer) {
530 | self.index.tokenizers().register(name, analyzer.analyzer);
531 | }
532 | }
533 |
534 | impl Index {
535 | fn prepare_query_parser(
536 | &self,
537 | default_field_names: Option<Vec<String>>,
538 | field_boosts: HashMap<String, f32>,
539 | fuzzy_fields: HashMap<String, (bool, u8, bool)>,
540 | ) -> PyResult<tv::query::QueryParser> {
541 | let schema = self.index.schema();
542 |
543 | let default_fields = if let Some(default_field_names) =
544 | default_field_names
545 | {
546 | default_field_names.iter().map(|field_name| {
547 | let field = schema.get_field(field_name).map_err(|_err| {
548 | exceptions::PyValueError::new_err(format!(
549 | "Field `{field_name}` is not defined in the schema."
550 | ))
551 | })?;
552 |
553 | let field_entry = schema.get_field_entry(field);
554 | if !field_entry.is_indexed() {
555 | return Err(exceptions::PyValueError::new_err(
556 | format!("Field `{field_name}` is not set as indexed in the schema.")
557 | ));
558 | }
559 |
560 | Ok(field)
561 | }).collect::<PyResult<Vec<_>>>()?
562 | } else {
563 | schema
564 | .fields()
565 | .filter(|(_, field_entry)| field_entry.is_indexed())
566 | .map(|(field, _)| field)
567 | .collect()
568 | };
569 |
570 | let mut parser =
571 | tv::query::QueryParser::for_index(&self.index, default_fields);
572 |
573 | for (field_name, boost) in field_boosts {
574 | let field = schema.get_field(&field_name).map_err(|_err| {
575 | exceptions::PyValueError::new_err(format!(
576 | "Field `{field_name}` is not defined in the schema."
577 | ))
578 | })?;
579 | parser.set_field_boost(field, boost);
580 | }
581 |
582 | for (field_name, (prefix, distance, transpose_cost_one)) in fuzzy_fields
583 | {
584 | let field = schema.get_field(&field_name).map_err(|_err| {
585 | exceptions::PyValueError::new_err(format!(
586 | "Field `{field_name}` is not defined in the schema."
587 | ))
588 | })?;
589 | parser.set_field_fuzzy(field, prefix, distance, transpose_cost_one);
590 | }
591 |
592 | Ok(parser)
593 | }
594 |
595 | fn register_custom_text_analyzers(index: &tv::Index) {
596 | let analyzers = [
597 | ("ar_stem", Language::Arabic),
598 | ("da_stem", Language::Danish),
599 | ("nl_stem", Language::Dutch),
600 | ("fi_stem", Language::Finnish),
601 | ("fr_stem", Language::French),
602 | ("de_stem", Language::German),
603 | ("el_stem", Language::Greek),
604 | ("hu_stem", Language::Hungarian),
605 | ("it_stem", Language::Italian),
606 | ("no_stem", Language::Norwegian),
607 | ("pt_stem", Language::Portuguese),
608 | ("ro_stem", Language::Romanian),
609 | ("ru_stem", Language::Russian),
610 | ("es_stem", Language::Spanish),
611 | ("sv_stem", Language::Swedish),
612 | ("ta_stem", Language::Tamil),
613 | ("tr_stem", Language::Turkish),
614 | ];
615 |
616 | for (name, lang) in &analyzers {
617 | let an = TextAnalyzer::builder(SimpleTokenizer::default())
618 | .filter(RemoveLongFilter::limit(40))
619 | .filter(LowerCaser)
620 | .filter(Stemmer::new(*lang))
621 | .build();
622 | index.tokenizers().register(name, an);
623 | }
624 | }
625 | }
626 |
-------------------------------------------------------------------------------- /src/lib.rs: --------------------------------------------------------------------------------
1 | use ::tantivy as tv;
2 | use ::tantivy::schema::{OwnedValue as Value, Term};
3 | use pyo3::{exceptions, prelude::*, wrap_pymodule};
4 |
5 | mod document;
6 | mod facet;
7 | mod index;
8 | mod parser_error;
9 | mod query;
10 | mod schema;
11 | mod schemabuilder;
12 | mod searcher;
13 | mod snippet;
14 | mod tokenizer;
15 |
16 | use document::{extract_value, extract_value_for_type, Document};
17 | use facet::Facet;
18 | use index::Index;
19 | use query::{Occur, Query};
20 | use schema::{FieldType, Schema};
21 | use schemabuilder::SchemaBuilder;
22 | use searcher::{DocAddress, Order, SearchResult, Searcher};
23 | use snippet::{Snippet, SnippetGenerator};
24 | use tokenizer::{Filter, TextAnalyzer, TextAnalyzerBuilder, Tokenizer};
25 |
26 | /// Python bindings for the search engine library Tantivy.
27 | ///
28 | /// Tantivy is a full text search engine library written in rust.
29 | ///
30 | /// It is closer to Apache Lucene than to Elasticsearch and Apache Solr in
31 | /// the sense that it is not an off-the-shelf search engine server, but rather
32 | /// a library that can be used to build such a search engine.
33 | /// Tantivy is, in fact, strongly inspired by Lucene's design.
34 | ///
35 | /// Example:
36 | /// >>> import json
37 | /// >>> import tantivy
38 | ///
39 | /// >>> builder = tantivy.SchemaBuilder()
40 | ///
41 | /// >>> title = builder.add_text_field("title", stored=True)
42 | /// >>> body = builder.add_text_field("body")
43 | ///
44 | /// >>> schema = builder.build()
45 | /// >>> index = tantivy.Index(schema)
46 | /// >>> doc = tantivy.Document()
47 | /// >>> doc.add_text(title, "The Old Man and the Sea")
48 | /// >>> doc.add_text(body, ("He was an old man who fished alone in a "
49 | /// "skiff in the Gulf Stream and he had gone "
50 | /// "eighty-four days now without taking a fish."))
51 | /// >>> writer = index.writer()
52 | /// >>> writer.add_document(doc)
53 | ///
54 | /// >>> doc = schema.parse_document(json.dumps({
55 | /// "title": ["Frankenstein", "The Modern Prometheus"],
56 | /// "body": ("You will rejoice to hear that no disaster has "
57 | /// "accompanied the commencement of an enterprise which "
58 | /// "you have regarded with such evil forebodings. 
" 59 | /// "I arrived here yesterday, and my first task is to " 60 | /// "assure my dear sister of my welfare and increasing " 61 | /// "confidence in the success of my undertaking.") 62 | /// })) 63 | /// 64 | /// >>> writer.add_document(doc) 65 | /// >>> writer.commit() 66 | /// 67 | /// >>> reader = index.reader() 68 | /// >>> searcher = reader.searcher() 69 | /// 70 | /// >>> query = index.parse_query("sea whale", [title, body]) 71 | /// 72 | /// >>> result = searcher.search(query, 10) 73 | /// 74 | /// >>> assert len(result) == 1 75 | /// 76 | #[pymodule] 77 | fn tantivy(_py: Python, m: &Bound) -> PyResult<()> { 78 | m.add_class::()?; 79 | m.add_class::()?; 80 | m.add_class::()?; 81 | m.add_class::()?; 82 | m.add_class::()?; 83 | m.add_class::()?; 84 | m.add_class::()?; 85 | m.add_class::()?; 86 | m.add_class::()?; 87 | m.add_class::()?; 88 | m.add_class::()?; 89 | m.add_class::()?; 90 | m.add_class::()?; 91 | m.add_class::()?; 92 | m.add_class::()?; 93 | m.add_class::()?; 94 | m.add_class::()?; 95 | m.add_class::()?; 96 | 97 | m.add_wrapped(wrap_pymodule!(query_parser_error))?; 98 | 99 | m.add("__version__", tv::version_string())?; 100 | 101 | Ok(()) 102 | } 103 | 104 | /// Submodule containing all the possible errors that can be raised during 105 | /// query parsing. 106 | /// 107 | /// Example: 108 | /// >>> import tantivy 109 | /// >>> from tantivy import query_parser_error 110 | /// 111 | /// >>> builder = tantivy.SchemaBuilder() 112 | /// 113 | /// >>> title = builder.add_text_field("title", stored=True) 114 | /// >>> body = builder.add_text_field("body") 115 | /// >>> id = builder.add_unsigned_field("id") 116 | /// >>> rating = builder.add_float_field("rating") 117 | /// 118 | /// >>> schema = builder.build() 119 | /// >>> index = tantivy.Index(schema) 120 | /// 121 | /// >>> query, errors = index.parse_query_lenient( 122 | /// "bod:'world' AND id:<3.5 AND rating:5.0" 123 | /// ) 124 | /// 125 | /// >>> assert len(errors) == 2 126 | /// >>> assert isinstance(errors[0], query_parser_error.FieldDoesNotExistError) 127 | /// >>> assert isinstance(errors[1], query_parser_error.ExpectedIntError) 128 | #[pymodule] 129 | fn query_parser_error(_py: Python, m: &Bound) -> PyResult<()> { 130 | m.add_class::()?; 131 | m.add_class::()?; 132 | m.add_class::()?; 133 | m.add_class::()?; 134 | m.add_class::()?; 135 | m.add_class::()?; 136 | m.add_class::()?; 137 | m.add_class::()?; 138 | m.add_class::()?; 139 | m.add_class::()?; 140 | m.add_class::()?; 141 | m.add_class::()?; 142 | m.add_class::()?; 143 | m.add_class::()?; 144 | m.add_class::()?; 145 | m.add_class::()?; 146 | m.add_class::()?; 147 | 148 | Ok(()) 149 | } 150 | 151 | pub(crate) fn to_pyerr(err: E) -> PyErr { 152 | exceptions::PyValueError::new_err(err.to_string()) 153 | } 154 | 155 | pub(crate) fn get_field( 156 | schema: &tv::schema::Schema, 157 | field_name: &str, 158 | ) -> PyResult { 159 | let field = schema.get_field(field_name).map_err(|_err| { 160 | exceptions::PyValueError::new_err(format!( 161 | "Field `{field_name}` is not defined in the schema." 
162 | )) 163 | })?; 164 | 165 | Ok(field) 166 | } 167 | 168 | pub(crate) fn make_term( 169 | schema: &tv::schema::Schema, 170 | field_name: &str, 171 | field_value: &Bound, 172 | ) -> PyResult { 173 | let field = get_field(schema, field_name)?; 174 | let value = extract_value(field_value)?; 175 | let term = match value { 176 | Value::Str(text) => Term::from_field_text(field, &text), 177 | Value::U64(num) => Term::from_field_u64(field, num), 178 | Value::I64(num) => Term::from_field_i64(field, num), 179 | Value::F64(num) => Term::from_field_f64(field, num), 180 | Value::Date(d) => Term::from_field_date(field, d), 181 | Value::Facet(facet) => Term::from_facet(field, &facet), 182 | Value::Bool(b) => Term::from_field_bool(field, b), 183 | Value::IpAddr(i) => Term::from_field_ip_addr(field, i), 184 | _ => { 185 | return Err(exceptions::PyValueError::new_err(format!( 186 | "Can't create a term for Field `{field_name}` with value `{field_value}`." 187 | ))) 188 | } 189 | }; 190 | 191 | Ok(term) 192 | } 193 | 194 | pub(crate) fn make_term_for_type( 195 | schema: &tv::schema::Schema, 196 | field_name: &str, 197 | field_type: FieldType, 198 | field_value: &Bound, 199 | ) -> PyResult { 200 | let field = get_field(schema, field_name)?; 201 | let value = 202 | extract_value_for_type(field_value, field_type.into(), field_name)?; 203 | let term = match value { 204 | Value::Str(text) => Term::from_field_text(field, &text), 205 | Value::U64(num) => Term::from_field_u64(field, num), 206 | Value::I64(num) => Term::from_field_i64(field, num), 207 | Value::F64(num) => Term::from_field_f64(field, num), 208 | Value::Date(d) => Term::from_field_date(field, d), 209 | Value::Facet(facet) => Term::from_facet(field, &facet), 210 | Value::Bool(b) => Term::from_field_bool(field, b), 211 | Value::IpAddr(i) => Term::from_field_ip_addr(field, i), 212 | _ => { 213 | return Err(exceptions::PyValueError::new_err(format!( 214 | "Can't create a term for Field `{field_name}` with value `{field_value}`." 215 | ))) 216 | } 217 | }; 218 | 219 | Ok(term) 220 | } 221 | -------------------------------------------------------------------------------- /src/parser_error.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | convert::TryFrom, 3 | net::AddrParseError, 4 | num::{IntErrorKind, ParseFloatError, ParseIntError}, 5 | str::ParseBoolError, 6 | }; 7 | 8 | use pyo3::prelude::*; 9 | use pyo3::IntoPyObjectExt; 10 | use tantivy::{self as tv, schema::FacetParseError}; 11 | 12 | // TODO(https://github.com/PyO3/pyo3/issues/1190): Expose this to bindings once trait <-> ABC is 13 | // supported in PyO3. 14 | pub(crate) trait QueryParserError { 15 | fn full_message(&self) -> String; 16 | } 17 | 18 | /// A crate local version of the [`IntoPy`] trait to implement for 19 | /// [`QueryParserError`](tv::query::QueryParserError). 
20 | pub(crate) trait QueryParserErrorIntoPy { 21 | fn into_py(self, py: Python) -> PyResult; 22 | } 23 | 24 | impl QueryParserErrorIntoPy for tv::query::QueryParserError { 25 | fn into_py(self, py: Python) -> PyResult { 26 | match self { 27 | tv::query::QueryParserError::SyntaxError(message) => { 28 | SyntaxError { message }.into_py_any(py) 29 | } 30 | tv::query::QueryParserError::UnsupportedQuery(message) => { 31 | UnsupportedQueryError { message }.into_py_any(py) 32 | } 33 | tv::query::QueryParserError::FieldDoesNotExist(field) => { 34 | FieldDoesNotExistError { field }.into_py_any(py) 35 | } 36 | tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( 37 | field, 38 | ) => FieldDoesNotHavePositionsIndexedError { field }.into_py_any(py), 39 | tv::query::QueryParserError::ExpectedInt(parse_int_error) => { 40 | ExpectedIntError { parse_int_error }.into_py_any(py) 41 | } 42 | tv::query::QueryParserError::ExpectedFloat(parse_float_error) => { 43 | ExpectedFloatError { parse_float_error }.into_py_any(py) 44 | } 45 | tv::query::QueryParserError::ExpectedBool(parse_bool_error) => { 46 | ExpectedBoolError { parse_bool_error }.into_py_any(py) 47 | } 48 | tv::query::QueryParserError::ExpectedBase64(decode_error) => { 49 | ExpectedBase64Error { decode_error }.into_py_any(py) 50 | } 51 | tv::query::QueryParserError::AllButQueryForbidden => { 52 | AllButQueryForbiddenError.into_py_any(py) 53 | } 54 | tv::query::QueryParserError::NoDefaultFieldDeclared => { 55 | NoDefaultFieldDeclaredError.into_py_any(py) 56 | } 57 | tv::query::QueryParserError::FieldNotIndexed(field) => { 58 | FieldNotIndexedError { field }.into_py_any(py) 59 | } 60 | tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { 61 | phrase, 62 | tokenizer, 63 | } => { 64 | PhrasePrefixRequiresAtLeastTwoTermsError { phrase, tokenizer }.into_py_any(py) 65 | } 66 | tv::query::QueryParserError::UnknownTokenizer { tokenizer, field } => { 67 | UnknownTokenizerError { tokenizer, field }.into_py_any(py) 68 | } 69 | tv::query::QueryParserError::RangeMustNotHavePhrase => { 70 | RangeMustNotHavePhraseError.into_py_any(py) 71 | } 72 | tv::query::QueryParserError::DateFormatError(_) => { 73 | DateFormatError { inner: self }.into_py_any(py) 74 | } 75 | tv::query::QueryParserError::FacetFormatError(facet_parse_error) => { 76 | FacetFormatError { facet_parse_error }.into_py_any(py) 77 | } 78 | tv::query::QueryParserError::IpFormatError(addr_parse_error) => { 79 | IpFormatError { addr_parse_error }.into_py_any(py) 80 | } 81 | } 82 | } 83 | } 84 | 85 | /// Error in the query syntax. 
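/// Example (a minimal sketch; `index` stands for an existing Index and
/// `query_parser_error` for the submodule of the same name):
///
/// ```python
/// query, errors = index.parse_query_lenient('title:"unclosed phrase')
/// # A malformed query string such as an unterminated quote is typically
/// # reported as a SyntaxError in the returned errors list.
/// ```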
86 | #[pyclass(frozen, module = "tantivy.tantivy")] 87 | pub(crate) struct SyntaxError { 88 | message: String, 89 | } 90 | 91 | #[pymethods] 92 | impl SyntaxError { 93 | #[getter] 94 | fn inner_message(&self) -> &str { 95 | self.message.as_str() 96 | } 97 | 98 | fn __repr__(&self) -> String { 99 | self.full_message() 100 | } 101 | 102 | fn __str__(&self) -> String { 103 | self.full_message() 104 | } 105 | } 106 | 107 | impl QueryParserError for SyntaxError { 108 | fn full_message(&self) -> String { 109 | format!("Syntax Error: {0}", self.message) 110 | } 111 | } 112 | 113 | impl From for tv::query::QueryParserError { 114 | fn from(error: SyntaxError) -> Self { 115 | tv::query::QueryParserError::SyntaxError(error.message) 116 | } 117 | } 118 | 119 | impl TryFrom for SyntaxError { 120 | type Error = String; 121 | 122 | fn try_from( 123 | error: tv::query::QueryParserError, 124 | ) -> Result { 125 | match error { 126 | tv::query::QueryParserError::SyntaxError(message) => { 127 | Ok(Self { message }) 128 | } 129 | _ => Err(format!("{error} is not a SyntaxError")), 130 | } 131 | } 132 | } 133 | 134 | /// This query is unsupported. 135 | #[pyclass(frozen, module = "tantivy.tantivy")] 136 | pub(crate) struct UnsupportedQueryError { 137 | message: String, 138 | } 139 | 140 | #[pymethods] 141 | impl UnsupportedQueryError { 142 | #[getter] 143 | fn inner_message(&self) -> &str { 144 | self.message.as_str() 145 | } 146 | 147 | fn __repr__(&self) -> String { 148 | self.full_message() 149 | } 150 | 151 | fn __str__(&self) -> String { 152 | self.full_message() 153 | } 154 | } 155 | 156 | impl QueryParserError for UnsupportedQueryError { 157 | fn full_message(&self) -> String { 158 | format!("Unsupported query: {0}", self.message) 159 | } 160 | } 161 | 162 | impl From for tv::query::QueryParserError { 163 | fn from(error: UnsupportedQueryError) -> Self { 164 | tv::query::QueryParserError::SyntaxError(error.message) 165 | } 166 | } 167 | 168 | impl TryFrom for UnsupportedQueryError { 169 | type Error = String; 170 | 171 | fn try_from( 172 | error: tv::query::QueryParserError, 173 | ) -> Result { 174 | match error { 175 | tv::query::QueryParserError::UnsupportedQuery(message) => { 176 | Ok(Self { message }) 177 | } 178 | _ => Err(format!("{error} is not an UnsupportedQuery error")), 179 | } 180 | } 181 | } 182 | 183 | /// The query references a field that is not in the schema. 184 | #[pyclass(frozen, module = "tantivy.tantivy")] 185 | pub struct FieldDoesNotExistError { 186 | field: String, 187 | } 188 | 189 | #[pymethods] 190 | impl FieldDoesNotExistError { 191 | /// The name of the field causing the error. 
192 | #[getter] 193 | fn field(&self) -> &str { 194 | self.field.as_str() 195 | } 196 | 197 | fn __repr__(&self) -> String { 198 | self.full_message() 199 | } 200 | 201 | fn __str__(&self) -> String { 202 | self.full_message() 203 | } 204 | } 205 | 206 | impl QueryParserError for FieldDoesNotExistError { 207 | fn full_message(&self) -> String { 208 | format!("Field does not exist: '{0}'", self.field) 209 | } 210 | } 211 | 212 | impl From for tv::query::QueryParserError { 213 | fn from(error: FieldDoesNotExistError) -> Self { 214 | tv::query::QueryParserError::FieldDoesNotExist(error.field) 215 | } 216 | } 217 | 218 | impl TryFrom for FieldDoesNotExistError { 219 | type Error = String; 220 | 221 | fn try_from( 222 | error: tv::query::QueryParserError, 223 | ) -> Result { 224 | match error { 225 | tv::query::QueryParserError::FieldDoesNotExist(field) => { 226 | Ok(Self { field }) 227 | } 228 | _ => Err(format!("{error} is not a FieldDoesNotExist error")), 229 | } 230 | } 231 | } 232 | 233 | /// The query contains a term for a `u64` or `i64`-field, but the value is neither. 234 | #[pyclass(frozen, module = "tantivy.tantivy")] 235 | pub(crate) struct ExpectedIntError { 236 | parse_int_error: ParseIntError, 237 | } 238 | 239 | #[pymethods] 240 | impl ExpectedIntError { 241 | /// If `true`, the value being parsed was empty. 242 | fn caused_by_empty(&self) -> bool { 243 | self.parse_int_error.kind() == &IntErrorKind::Empty 244 | } 245 | 246 | /// If `true`, an invalid digit was found. 247 | fn caused_by_invalid_digit(&self) -> bool { 248 | self.parse_int_error.kind() == &IntErrorKind::InvalidDigit 249 | } 250 | 251 | /// If `true`, the value being parsed was too large. 252 | fn caused_by_pos_overflow(&self) -> bool { 253 | self.parse_int_error.kind() == &IntErrorKind::PosOverflow 254 | } 255 | 256 | /// If `true`, the value being parsed was too small. 257 | fn caused_by_neg_overflow(&self) -> bool { 258 | self.parse_int_error.kind() == &IntErrorKind::NegOverflow 259 | } 260 | 261 | fn __repr__(&self) -> String { 262 | self.full_message() 263 | } 264 | 265 | fn __str__(&self) -> String { 266 | self.full_message() 267 | } 268 | } 269 | 270 | impl QueryParserError for ExpectedIntError { 271 | fn full_message(&self) -> String { 272 | format!("Expected a valid integer: '{0:?}'", self.parse_int_error) 273 | } 274 | } 275 | 276 | impl From for tv::query::QueryParserError { 277 | fn from(error: ExpectedIntError) -> Self { 278 | tv::query::QueryParserError::ExpectedInt(error.parse_int_error) 279 | } 280 | } 281 | 282 | impl TryFrom for ExpectedIntError { 283 | type Error = String; 284 | 285 | fn try_from( 286 | error: tv::query::QueryParserError, 287 | ) -> Result { 288 | match error { 289 | tv::query::QueryParserError::ExpectedInt(parse_int_error) => { 290 | Ok(Self { parse_int_error }) 291 | } 292 | _ => Err(format!("{error} is not an ExpectedInt error")), 293 | } 294 | } 295 | } 296 | 297 | /// The query contains a term for a bytes field, but the value is not valid base64. 298 | #[pyclass(frozen, module = "tantivy.tantivy")] 299 | pub(crate) struct ExpectedBase64Error { 300 | decode_error: base64::DecodeError, 301 | } 302 | 303 | #[pymethods] 304 | impl ExpectedBase64Error { 305 | /// If `true`, an invalid byte was found in the query. Padding characters (`=`) interspersed in 306 | /// the encoded form will be treated as invalid bytes. 307 | fn caused_by_invalid_byte(&self) -> bool { 308 | matches!(self.decode_error, base64::DecodeError::InvalidByte { .. 
}) 309 | } 310 | 311 | /// If the error was caused by an invalid byte, returns the offset and offending byte. 312 | fn invalid_byte_info(&self) -> Option<(usize, u8)> { 313 | match self.decode_error { 314 | base64::DecodeError::InvalidByte(position, byte) => { 315 | Some((position, byte)) 316 | } 317 | _ => None, 318 | } 319 | } 320 | 321 | /// If `true`, the length of the base64 string was invalid. 322 | fn caused_by_invalid_length(&self) -> bool { 323 | matches!(self.decode_error, base64::DecodeError::InvalidLength(_)) 324 | } 325 | 326 | /// The last non-padding input symbol's encoded 6 bits have nonzero bits that will be discarded. 327 | /// If `true`, this is indicative of corrupted or truncated Base64. 328 | fn caused_by_invalid_last_symbol(&self) -> bool { 329 | matches!( 330 | self.decode_error, 331 | base64::DecodeError::InvalidLastSymbol { .. } 332 | ) 333 | } 334 | 335 | /// If the error was caused by an invalid last symbol, returns the offset and offending byte. 336 | fn invalid_last_symbol_info(&self) -> Option<(usize, u8)> { 337 | match self.decode_error { 338 | base64::DecodeError::InvalidLastSymbol(position, byte) => { 339 | Some((position, byte)) 340 | } 341 | _ => None, 342 | } 343 | } 344 | 345 | /// The nature of the padding was not as configured: absent or incorrect when it must be 346 | /// canonical, or present when it must be absent, etc. 347 | fn caused_by_invalid_padding(&self) -> bool { 348 | matches!(self.decode_error, base64::DecodeError::InvalidPadding) 349 | } 350 | 351 | fn __repr__(&self) -> String { 352 | self.full_message() 353 | } 354 | 355 | fn __str__(&self) -> String { 356 | self.full_message() 357 | } 358 | } 359 | 360 | impl QueryParserError for ExpectedBase64Error { 361 | fn full_message(&self) -> String { 362 | format!("Expected base64: {0:?}", self.decode_error) 363 | } 364 | } 365 | 366 | impl From for tv::query::QueryParserError { 367 | fn from(error: ExpectedBase64Error) -> Self { 368 | tv::query::QueryParserError::ExpectedBase64(error.decode_error) 369 | } 370 | } 371 | 372 | impl TryFrom for ExpectedBase64Error { 373 | type Error = String; 374 | 375 | fn try_from( 376 | error: tv::query::QueryParserError, 377 | ) -> Result { 378 | match error { 379 | tv::query::QueryParserError::ExpectedBase64(decode_error) => { 380 | Ok(Self { decode_error }) 381 | } 382 | _ => Err(format!("{error} is not an ExpectedBase64 error")), 383 | } 384 | } 385 | } 386 | 387 | /// The query contains a term for a `f64`-field, but the value is not a f64. 
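/// Example (a minimal sketch; assumes a schema with a float field named
/// "rating", as in the module-level example):
///
/// ```python
/// query, errors = index.parse_query_lenient("rating:high")
/// # "high" cannot be parsed as a float, so the errors list should
/// # contain an ExpectedFloatError.
/// ```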
388 | #[pyclass(frozen, module = "tantivy.tantivy")]
389 | pub(crate) struct ExpectedFloatError {
390 | parse_float_error: ParseFloatError,
391 | }
392 |
393 | #[pymethods]
394 | impl ExpectedFloatError {
395 | fn __repr__(&self) -> String {
396 | self.full_message()
397 | }
398 |
399 | fn __str__(&self) -> String {
400 | self.full_message()
401 | }
402 | }
403 |
404 | impl QueryParserError for ExpectedFloatError {
405 | fn full_message(&self) -> String {
406 | format!("Expected a float value: '{0:?}'", self.parse_float_error)
407 | }
408 | }
409 |
410 | impl From<ExpectedFloatError> for tv::query::QueryParserError {
411 | fn from(error: ExpectedFloatError) -> Self {
412 | tv::query::QueryParserError::ExpectedFloat(error.parse_float_error)
413 | }
414 | }
415 |
416 | impl TryFrom<tv::query::QueryParserError> for ExpectedFloatError {
417 | type Error = String;
418 |
419 | fn try_from(
420 | error: tv::query::QueryParserError,
421 | ) -> Result<Self, Self::Error> {
422 | match error {
423 | tv::query::QueryParserError::ExpectedFloat(parse_float_error) => {
424 | Ok(Self { parse_float_error })
425 | }
426 | _ => Err(format!("{error} is not an ExpectedFloat error")),
427 | }
428 | }
429 | }
430 |
431 | /// The query contains a term for a `bool`-field, but the value is not a bool.
432 | #[pyclass(frozen, module = "tantivy.tantivy")]
433 | pub(crate) struct ExpectedBoolError {
434 | parse_bool_error: ParseBoolError,
435 | }
436 |
437 | #[pymethods]
438 | impl ExpectedBoolError {
439 | fn __repr__(&self) -> String {
440 | self.full_message()
441 | }
442 |
443 | fn __str__(&self) -> String {
444 | self.full_message()
445 | }
446 | }
447 |
448 | impl QueryParserError for ExpectedBoolError {
449 | fn full_message(&self) -> String {
450 | format!("Expected a bool value: '{0:?}'", self.parse_bool_error)
451 | }
452 | }
453 |
454 | impl From<ExpectedBoolError> for tv::query::QueryParserError {
455 | fn from(error: ExpectedBoolError) -> Self {
456 | tv::query::QueryParserError::ExpectedBool(error.parse_bool_error)
457 | }
458 | }
459 |
460 | impl TryFrom<tv::query::QueryParserError> for ExpectedBoolError {
461 | type Error = String;
462 |
463 | fn try_from(
464 | error: tv::query::QueryParserError,
465 | ) -> Result<Self, Self::Error> {
466 | match error {
467 | tv::query::QueryParserError::ExpectedBool(parse_bool_error) => {
468 | Ok(Self { parse_bool_error })
469 | }
470 | _ => Err(format!("{error} is not an ExpectedBool error")),
471 | }
472 | }
473 | }
474 |
475 | /// Queries that are only "excluding" (e.g. -title:pop) are forbidden.
476 | #[pyclass(frozen, module = "tantivy.tantivy")]
477 | pub(crate) struct AllButQueryForbiddenError;
478 |
479 | #[pymethods]
480 | impl AllButQueryForbiddenError {
481 | fn __repr__(&self) -> String {
482 | self.full_message()
483 | }
484 |
485 | fn __str__(&self) -> String {
486 | self.full_message()
487 | }
488 | }
489 |
490 | impl QueryParserError for AllButQueryForbiddenError {
491 | fn full_message(&self) -> String {
492 | "Invalid query: Only excluding terms given".to_string()
493 | }
494 | }
495 |
496 | impl From<AllButQueryForbiddenError> for tv::query::QueryParserError {
497 | fn from(_error: AllButQueryForbiddenError) -> Self {
498 | tv::query::QueryParserError::AllButQueryForbidden
499 | }
500 | }
501 |
502 | impl TryFrom<tv::query::QueryParserError> for AllButQueryForbiddenError {
503 | type Error = String;
504 |
505 | fn try_from(
506 | error: tv::query::QueryParserError,
507 | ) -> Result<Self, Self::Error> {
508 | match error {
509 | tv::query::QueryParserError::AllButQueryForbidden => Ok(Self {}),
510 | _ => Err(format!("{error} is not an AllButQueryForbidden error")),
511 | }
512 | }
513 | }
514 |
515 | /// If no default field is declared, running a query without any field specified is forbidden.
516 | #[pyclass(frozen, module = "tantivy.tantivy")]
517 | pub(crate) struct NoDefaultFieldDeclaredError;
518 |
519 | #[pymethods]
520 | impl NoDefaultFieldDeclaredError {
521 | fn __repr__(&self) -> String {
522 | self.full_message()
523 | }
524 |
525 | fn __str__(&self) -> String {
526 | self.full_message()
527 | }
528 | }
529 |
530 | impl QueryParserError for NoDefaultFieldDeclaredError {
531 | fn full_message(&self) -> String {
532 | "No default field declared and no field specified in query".to_string()
533 | }
534 | }
535 |
536 | impl From<NoDefaultFieldDeclaredError> for tv::query::QueryParserError {
537 | fn from(_error: NoDefaultFieldDeclaredError) -> Self {
538 | tv::query::QueryParserError::NoDefaultFieldDeclared
539 | }
540 | }
541 |
542 | impl TryFrom<tv::query::QueryParserError> for NoDefaultFieldDeclaredError {
543 | type Error = String;
544 |
545 | fn try_from(
546 | error: tv::query::QueryParserError,
547 | ) -> Result<Self, Self::Error> {
548 | match error {
549 | tv::query::QueryParserError::NoDefaultFieldDeclared => Ok(Self {}),
550 | _ => Err(format!("{error} is not a NoDefaultFieldDeclared error")),
551 | }
552 | }
553 | }
554 |
555 | /// The field searched for is not declared as indexed in the schema.
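/// Example (a minimal sketch; assumes an unsigned field "id" added to
/// the schema with stored=True but indexed=False):
///
/// ```python
/// query, errors = index.parse_query_lenient("id:5")
/// # Targeting a stored-but-unindexed field is typically reported as a
/// # FieldNotIndexedError instead of raising immediately.
/// ```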
556 | #[pyclass(frozen, module = "tantivy.tantivy")] 557 | pub(crate) struct FieldNotIndexedError { 558 | field: String, 559 | } 560 | 561 | #[pymethods] 562 | impl FieldNotIndexedError { 563 | fn field(&self) -> &str { 564 | self.field.as_str() 565 | } 566 | 567 | fn __repr__(&self) -> String { 568 | self.full_message() 569 | } 570 | 571 | fn __str__(&self) -> String { 572 | self.full_message() 573 | } 574 | } 575 | 576 | impl QueryParserError for FieldNotIndexedError { 577 | fn full_message(&self) -> String { 578 | format!("The field '{0}' is not declared as indexed", self.field) 579 | } 580 | } 581 | 582 | impl From for tv::query::QueryParserError { 583 | fn from(error: FieldNotIndexedError) -> Self { 584 | tv::query::QueryParserError::FieldNotIndexed(error.field) 585 | } 586 | } 587 | 588 | impl TryFrom for FieldNotIndexedError { 589 | type Error = String; 590 | 591 | fn try_from( 592 | error: tv::query::QueryParserError, 593 | ) -> Result { 594 | match error { 595 | tv::query::QueryParserError::FieldNotIndexed(field) => { 596 | Ok(Self { field }) 597 | } 598 | _ => Err(format!("{error} is not an FieldNotIndexed error")), 599 | } 600 | } 601 | } 602 | 603 | /// A phrase query was requested for a field that does not have any positions indexed. 604 | #[pyclass(frozen, module = "tantivy.tantivy")] 605 | pub(crate) struct FieldDoesNotHavePositionsIndexedError { 606 | field: String, 607 | } 608 | 609 | #[pymethods] 610 | impl FieldDoesNotHavePositionsIndexedError { 611 | fn field(&self) -> &str { 612 | self.field.as_str() 613 | } 614 | 615 | fn __repr__(&self) -> String { 616 | self.full_message() 617 | } 618 | 619 | fn __str__(&self) -> String { 620 | self.full_message() 621 | } 622 | } 623 | 624 | impl QueryParserError for FieldDoesNotHavePositionsIndexedError { 625 | fn full_message(&self) -> String { 626 | format!( 627 | "The field '{0}' does not have positions indexed", 628 | self.field 629 | ) 630 | } 631 | } 632 | 633 | impl From 634 | for tv::query::QueryParserError 635 | { 636 | fn from(error: FieldDoesNotHavePositionsIndexedError) -> Self { 637 | tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( 638 | error.field, 639 | ) 640 | } 641 | } 642 | 643 | impl TryFrom 644 | for FieldDoesNotHavePositionsIndexedError 645 | { 646 | type Error = String; 647 | 648 | fn try_from( 649 | error: tv::query::QueryParserError, 650 | ) -> Result { 651 | match error { 652 | tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( 653 | field, 654 | ) => Ok(Self { field }), 655 | _ => Err(format!( 656 | "{error} is not a FieldDoesNotHavePositionsIndexed error" 657 | )), 658 | } 659 | } 660 | } 661 | 662 | /// A phrase-prefix query requires at least two terms 663 | #[pyclass(frozen, module = "tantivy.tantivy")] 664 | pub(crate) struct PhrasePrefixRequiresAtLeastTwoTermsError { 665 | /// The phrase which triggered the issue. 666 | phrase: String, 667 | /// The tokenizer configured for the field. 
668 | tokenizer: String, 669 | } 670 | 671 | #[pymethods] 672 | impl PhrasePrefixRequiresAtLeastTwoTermsError { 673 | fn phrase(&self) -> &str { 674 | self.phrase.as_str() 675 | } 676 | 677 | fn tokenizer(&self) -> &str { 678 | self.tokenizer.as_str() 679 | } 680 | 681 | fn __repr__(&self) -> String { 682 | self.full_message() 683 | } 684 | 685 | fn __str__(&self) -> String { 686 | self.full_message() 687 | } 688 | } 689 | 690 | impl QueryParserError for PhrasePrefixRequiresAtLeastTwoTermsError { 691 | fn full_message(&self) -> String { 692 | format!( 693 | "The phrase '{0:?}' does not produce at least two terms using the tokenizer '{1:?}'", 694 | self.phrase, self.tokenizer 695 | ) 696 | } 697 | } 698 | 699 | impl From 700 | for tv::query::QueryParserError 701 | { 702 | fn from(error: PhrasePrefixRequiresAtLeastTwoTermsError) -> Self { 703 | tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { 704 | phrase: error.phrase, 705 | tokenizer: error.tokenizer, 706 | } 707 | } 708 | } 709 | 710 | impl TryFrom 711 | for PhrasePrefixRequiresAtLeastTwoTermsError 712 | { 713 | type Error = String; 714 | 715 | fn try_from( 716 | error: tv::query::QueryParserError, 717 | ) -> Result { 718 | match error { 719 | tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { 720 | phrase, 721 | tokenizer, 722 | } => Ok(Self { phrase, tokenizer }), 723 | _ => Err(format!( 724 | "{error} is not a PhrasePrefixRequiresAtLeastTwoTerms error" 725 | )), 726 | } 727 | } 728 | } 729 | 730 | /// The tokenizer for the given field is unknown. 731 | #[pyclass(frozen, module = "tantivy.tantivy")] 732 | pub(crate) struct UnknownTokenizerError { 733 | /// The name of the tokenizer. 734 | tokenizer: String, 735 | /// The field name. 736 | field: String, 737 | } 738 | 739 | #[pymethods] 740 | impl UnknownTokenizerError { 741 | fn tokenizer(&self) -> &str { 742 | self.tokenizer.as_str() 743 | } 744 | 745 | fn field(&self) -> &str { 746 | self.field.as_str() 747 | } 748 | 749 | fn __repr__(&self) -> String { 750 | self.full_message() 751 | } 752 | 753 | fn __str__(&self) -> String { 754 | self.full_message() 755 | } 756 | } 757 | 758 | impl QueryParserError for UnknownTokenizerError { 759 | fn full_message(&self) -> String { 760 | format!( 761 | "The tokenizer '{0:?}' for the field '{1:?}' is unknown", 762 | self.tokenizer, self.field 763 | ) 764 | } 765 | } 766 | 767 | impl From for tv::query::QueryParserError { 768 | fn from(error: UnknownTokenizerError) -> Self { 769 | tv::query::QueryParserError::UnknownTokenizer { 770 | tokenizer: error.tokenizer, 771 | field: error.field, 772 | } 773 | } 774 | } 775 | 776 | impl TryFrom for UnknownTokenizerError { 777 | type Error = String; 778 | 779 | fn try_from( 780 | error: tv::query::QueryParserError, 781 | ) -> Result { 782 | match error { 783 | tv::query::QueryParserError::UnknownTokenizer { 784 | tokenizer, 785 | field, 786 | } => Ok(Self { tokenizer, field }), 787 | _ => Err(format!("{error} is not an UnknownTokenizer error")), 788 | } 789 | } 790 | } 791 | 792 | /// The query contains a range query with a phrase as one of the bounds. Only terms can be used as 793 | /// bounds. 
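/// Example (a minimal sketch; assumes a default text field whose
/// tokenizer splits a quoted bound into several terms):
///
/// ```python
/// query, errors = index.parse_query_lenient('title:["big bad" TO "wolf"]')
/// # The multi-word lower bound tokenizes into a phrase, which is not
/// # accepted as a range bound.
/// ```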
794 | #[pyclass(frozen, module = "tantivy.tantivy")] 795 | pub(crate) struct RangeMustNotHavePhraseError; 796 | 797 | #[pymethods] 798 | impl RangeMustNotHavePhraseError { 799 | fn __repr__(&self) -> String { 800 | self.full_message() 801 | } 802 | 803 | fn __str__(&self) -> String { 804 | self.full_message() 805 | } 806 | } 807 | 808 | impl QueryParserError for RangeMustNotHavePhraseError { 809 | fn full_message(&self) -> String { 810 | "A range query cannot have a phrase as one of the bounds".to_string() 811 | } 812 | } 813 | 814 | impl From for tv::query::QueryParserError { 815 | fn from(_error: RangeMustNotHavePhraseError) -> Self { 816 | tv::query::QueryParserError::RangeMustNotHavePhrase 817 | } 818 | } 819 | 820 | impl TryFrom for RangeMustNotHavePhraseError { 821 | type Error = String; 822 | 823 | fn try_from( 824 | error: tv::query::QueryParserError, 825 | ) -> Result { 826 | match error { 827 | tv::query::QueryParserError::RangeMustNotHavePhrase => Ok(Self {}), 828 | _ => Err(format!("{error} is not a RangeMustNotHavePhrase error")), 829 | } 830 | } 831 | } 832 | 833 | /// The format for the date field is not RFC 3339 compliant. 834 | #[pyclass(frozen, module = "tantivy.tantivy")] 835 | pub(crate) struct DateFormatError { 836 | // Keep around the entire `QueryParserError` to avoid importing the `time` crate. 837 | inner: tv::query::QueryParserError, 838 | } 839 | 840 | #[pymethods] 841 | impl DateFormatError { 842 | fn __repr__(&self) -> String { 843 | self.full_message() 844 | } 845 | 846 | fn __str__(&self) -> String { 847 | self.full_message() 848 | } 849 | } 850 | 851 | impl QueryParserError for DateFormatError { 852 | fn full_message(&self) -> String { 853 | "The date field has an invalid format".to_string() 854 | } 855 | } 856 | 857 | impl From for tv::query::QueryParserError { 858 | fn from(error: DateFormatError) -> Self { 859 | error.inner 860 | } 861 | } 862 | 863 | impl TryFrom for DateFormatError { 864 | type Error = String; 865 | 866 | fn try_from( 867 | error: tv::query::QueryParserError, 868 | ) -> Result { 869 | match error { 870 | tv::query::QueryParserError::DateFormatError { .. } => { 871 | Ok(Self { inner: error }) 872 | } 873 | _ => Err(format!("{error} is not a DateFormatError")), 874 | } 875 | } 876 | } 877 | 878 | /// The format for the facet field is invalid. 879 | #[pyclass(frozen, module = "tantivy.tantivy")] 880 | pub(crate) struct FacetFormatError { 881 | facet_parse_error: FacetParseError, 882 | } 883 | 884 | #[pymethods] 885 | impl FacetFormatError { 886 | fn __repr__(&self) -> String { 887 | self.full_message() 888 | } 889 | 890 | fn __str__(&self) -> String { 891 | self.full_message() 892 | } 893 | } 894 | 895 | impl QueryParserError for FacetFormatError { 896 | fn full_message(&self) -> String { 897 | format!("The facet field is malformed: {0}", self.facet_parse_error) 898 | } 899 | } 900 | 901 | impl From for tv::query::QueryParserError { 902 | fn from(error: FacetFormatError) -> Self { 903 | tv::query::QueryParserError::FacetFormatError(error.facet_parse_error) 904 | } 905 | } 906 | 907 | impl TryFrom for FacetFormatError { 908 | type Error = String; 909 | 910 | fn try_from( 911 | error: tv::query::QueryParserError, 912 | ) -> Result { 913 | match error { 914 | tv::query::QueryParserError::FacetFormatError( 915 | facet_parse_error, 916 | ) => Ok(Self { facet_parse_error }), 917 | _ => Err(format!("{error} is not a FacetFormatError")), 918 | } 919 | } 920 | } 921 | 922 | /// The format for the ip field is invalid. 
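/// Example (a minimal sketch; assumes a schema with an ip-address field
/// named "ip"):
///
/// ```python
/// query, errors = index.parse_query_lenient("ip:999.0.0.1")
/// # 999.0.0.1 is not a valid IP address, so an IpFormatError is
/// # expected among the reported errors.
/// ```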
923 | #[pyclass(frozen, module = "tantivy.tantivy")]
924 | pub(crate) struct IpFormatError {
925 | addr_parse_error: AddrParseError,
926 | }
927 |
928 | #[pymethods]
929 | impl IpFormatError {
930 | fn __repr__(&self) -> String {
931 | self.full_message()
932 | }
933 |
934 | fn __str__(&self) -> String {
935 | self.full_message()
936 | }
937 | }
938 |
939 | impl QueryParserError for IpFormatError {
940 | fn full_message(&self) -> String {
941 | format!("The ip field is malformed: {0}", self.addr_parse_error)
942 | }
943 | }
944 |
945 | impl From<IpFormatError> for tv::query::QueryParserError {
946 | fn from(error: IpFormatError) -> Self {
947 | tv::query::QueryParserError::IpFormatError(error.addr_parse_error)
948 | }
949 | }
950 |
951 | impl TryFrom<tv::query::QueryParserError> for IpFormatError {
952 | type Error = String;
953 |
954 | fn try_from(
955 | error: tv::query::QueryParserError,
956 | ) -> Result<Self, Self::Error> {
957 | match error {
958 | tv::query::QueryParserError::IpFormatError(addr_parse_error) => {
959 | Ok(Self { addr_parse_error })
960 | }
961 | _ => Err(format!("{error} is not an IpFormatError")),
962 | }
963 | }
964 | }
965 |
-------------------------------------------------------------------------------- /src/query.rs: --------------------------------------------------------------------------------
1 | use crate::{
2 | get_field, make_term, make_term_for_type, schema::FieldType, to_pyerr,
3 | DocAddress, Schema,
4 | };
5 | use core::ops::Bound as OpsBound;
6 | use pyo3::{
7 | exceptions,
8 | prelude::*,
9 | types::{PyAny, PyFloat, PyString},
10 | };
11 | use tantivy as tv;
12 |
13 | /// Custom Tuple struct to represent a pair of Occur and Query
14 | /// for the BooleanQuery
15 | struct OccurQueryPair(Occur, Query);
16 |
17 | impl<'source> FromPyObject<'source> for OccurQueryPair {
18 | fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult<Self> {
19 | let (occur, query): (Occur, Query) = ob.extract()?;
20 |
21 | Ok(OccurQueryPair(occur, query))
22 | }
23 | }
24 |
25 | /// Tantivy's Occur
26 | #[pyclass(frozen, module = "tantivy.tantivy")]
27 | #[derive(Clone)]
28 | pub enum Occur {
29 | Must,
30 | Should,
31 | MustNot,
32 | }
33 |
34 | impl From<Occur> for tv::query::Occur {
35 | fn from(occur: Occur) -> tv::query::Occur {
36 | match occur {
37 | Occur::Must => tv::query::Occur::Must,
38 | Occur::Should => tv::query::Occur::Should,
39 | Occur::MustNot => tv::query::Occur::MustNot,
40 | }
41 | }
42 | }
43 |
44 | /// Tantivy's Query
45 | #[pyclass(frozen, module = "tantivy.tantivy")]
46 | pub(crate) struct Query {
47 | pub(crate) inner: Box<dyn tv::query::Query>,
48 | }
49 |
50 | impl Clone for Query {
51 | fn clone(&self) -> Self {
52 | Query {
53 | inner: self.inner.box_clone(),
54 | }
55 | }
56 | }
57 |
58 | impl Query {
59 | pub(crate) fn get(&self) -> &dyn tv::query::Query {
60 | &self.inner
61 | }
62 | }
63 |
64 | #[pymethods]
65 | impl Query {
66 | fn __repr__(&self) -> PyResult<String> {
67 | Ok(format!("Query({:?})", self.get()))
68 | }
69 |
70 | /// Construct a Tantivy TermQuery
71 | #[staticmethod]
72 | #[pyo3(signature = (schema, field_name, field_value, index_option = "position"))]
73 | pub(crate) fn term_query(
74 | schema: &Schema,
75 | field_name: &str,
76 | field_value: &Bound<PyAny>,
77 | index_option: &str,
78 | ) -> PyResult<Query> {
79 | let term = make_term(&schema.inner, field_name, field_value)?;
80 | let index_option = match index_option {
81 | "position" => tv::schema::IndexRecordOption::WithFreqsAndPositions,
82 | "freq" => tv::schema::IndexRecordOption::WithFreqs,
83 | "basic" => tv::schema::IndexRecordOption::Basic,
84 | _ => return
Err(exceptions::PyValueError::new_err(
85 | "Invalid index option, valid choices are: 'basic', 'freq' and 'position'"
86 | ))
87 | };
88 | let inner = tv::query::TermQuery::new(term, index_option);
89 | Ok(Query {
90 | inner: Box::new(inner),
91 | })
92 | }
93 |
94 | /// Construct a Tantivy TermSetQuery
95 | #[staticmethod]
96 | #[pyo3(signature = (schema, field_name, field_values))]
97 | pub(crate) fn term_set_query(
98 | schema: &Schema,
99 | field_name: &str,
100 | field_values: Vec<Bound<PyAny>>,
101 | ) -> PyResult<Query> {
102 | let terms = field_values
103 | .into_iter()
104 | .map(|field_value| {
105 | make_term(&schema.inner, field_name, &field_value)
106 | })
107 | .collect::<Result<Vec<_>, _>>()?;
108 | let inner = tv::query::TermSetQuery::new(terms);
109 | Ok(Query {
110 | inner: Box::new(inner),
111 | })
112 | }
113 |
114 | /// Construct a Tantivy AllQuery
115 | #[staticmethod]
116 | pub(crate) fn all_query() -> PyResult<Query> {
117 | let inner = tv::query::AllQuery {};
118 | Ok(Query {
119 | inner: Box::new(inner),
120 | })
121 | }
122 |
123 | /// Construct a Tantivy FuzzyTermQuery
124 | ///
125 | /// # Arguments
126 | ///
127 | /// * `schema` - Schema of the target index.
128 | /// * `field_name` - Field name to be searched.
129 | /// * `text` - String representation of the query term.
130 | /// * `distance` - (Optional) Edit distance you are going to allow. When not specified, the default is 1.
131 | /// * `transposition_cost_one` - (Optional) If true, a transposition (swapping) cost will be 1; otherwise it will be 2. When not specified, the default is true.
132 | /// * `prefix` - (Optional) If true, prefix Levenshtein distance is applied. When not specified, the default is false.
133 | #[staticmethod]
134 | #[pyo3(signature = (schema, field_name, text, distance = 1, transposition_cost_one = true, prefix = false))]
135 | pub(crate) fn fuzzy_term_query(
136 | schema: &Schema,
137 | field_name: &str,
138 | text: &Bound<PyAny>,
139 | distance: u8,
140 | transposition_cost_one: bool,
141 | prefix: bool,
142 | ) -> PyResult<Query> {
143 | let term = make_term(&schema.inner, field_name, text)?;
144 | let inner = if prefix {
145 | tv::query::FuzzyTermQuery::new_prefix(
146 | term,
147 | distance,
148 | transposition_cost_one,
149 | )
150 | } else {
151 | tv::query::FuzzyTermQuery::new(
152 | term,
153 | distance,
154 | transposition_cost_one,
155 | )
156 | };
157 | Ok(Query {
158 | inner: Box::new(inner),
159 | })
160 | }
161 |
162 | /// Construct a Tantivy PhraseQuery with custom offsets and slop
163 | ///
164 | /// # Arguments
165 | ///
166 | /// * `schema` - Schema of the target index.
167 | /// * `field_name` - Field name to be searched.
168 | /// * `words` - Word list that constructs the phrase. A word can be a term text or a pair of term text and its offset in the phrase.
169 | /// * `slop` - (Optional) The number of gaps permitted between the words in the query phrase. Default is 0.
170 | #[staticmethod]
171 | #[pyo3(signature = (schema, field_name, words, slop = 0))]
172 | pub(crate) fn phrase_query(
173 | schema: &Schema,
174 | field_name: &str,
175 | words: Vec<Bound<PyAny>>,
176 | slop: u32,
177 | ) -> PyResult<Query> {
178 | let mut terms_with_offset = Vec::with_capacity(words.len());
179 | for (idx, word) in words.into_iter().enumerate() {
180 | if let Ok((offset, value)) = word.extract() {
181 | // Custom offset is provided.
182 | let term = make_term(&schema.inner, field_name, &value)?;
183 | terms_with_offset.push((offset, term));
184 | } else {
185 | // Custom offset is not provided. Use the list index as the offset.
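// For example, with words = ["old", (2, "man")], "old" receives
// offset 0 (its list index) and "man" receives the explicit offset 2,
// so the phrase matches "old <one term> man" even with slop = 0.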
186 |                 let term = make_term(&schema.inner, field_name, &word)?;
187 |                 terms_with_offset.push((idx, term));
188 |             };
189 |         }
190 |         if terms_with_offset.is_empty() {
191 |             return Err(exceptions::PyValueError::new_err(
192 |                 "words must not be empty.",
193 |             ));
194 |         }
195 |         let inner = tv::query::PhraseQuery::new_with_offset_and_slop(
196 |             terms_with_offset,
197 |             slop,
198 |         );
199 |         Ok(Query {
200 |             inner: Box::new(inner),
201 |         })
202 |     }
203 |
204 |     /// Construct a Tantivy's BooleanQuery
205 |     #[staticmethod]
206 |     #[pyo3(signature = (subqueries))]
207 |     pub(crate) fn boolean_query(
208 |         subqueries: Vec<(Occur, Query)>,
209 |     ) -> PyResult<Query> {
210 |         let dyn_subqueries = subqueries
211 |             .into_iter()
212 |             .map(|(occur, query)| (occur.into(), query.inner.box_clone()))
213 |             .collect::<Vec<_>>();
214 |
215 |         let inner = tv::query::BooleanQuery::from(dyn_subqueries);
216 |
217 |         Ok(Query {
218 |             inner: Box::new(inner),
219 |         })
220 |     }
221 |
222 |     /// Construct a Tantivy's DisjunctionMaxQuery
223 |     #[staticmethod]
224 |     #[pyo3(signature = (subqueries, tie_breaker=None))]
225 |     pub(crate) fn disjunction_max_query(
226 |         subqueries: Vec<Query>,
227 |         tie_breaker: Option<Bound<PyFloat>>,
228 |     ) -> PyResult<Query> {
229 |         let inner_queries: Vec<Box<dyn tv::query::Query>> = subqueries
230 |             .iter()
231 |             .map(|query| query.inner.box_clone())
232 |             .collect();
233 |
234 |         let dismax_query = if let Some(tie_breaker) = tie_breaker {
235 |             tv::query::DisjunctionMaxQuery::with_tie_breaker(
236 |                 inner_queries,
237 |                 tie_breaker.extract::<f32>()?,
238 |             )
239 |         } else {
240 |             tv::query::DisjunctionMaxQuery::new(inner_queries)
241 |         };
242 |
243 |         Ok(Query {
244 |             inner: Box::new(dismax_query),
245 |         })
246 |     }
247 |
248 |     /// Construct a Tantivy's BoostQuery
249 |     #[staticmethod]
250 |     #[pyo3(signature = (query, boost))]
251 |     pub(crate) fn boost_query(query: Query, boost: f32) -> PyResult<Query> {
252 |         let inner = tv::query::BoostQuery::new(query.inner, boost);
253 |         Ok(Query {
254 |             inner: Box::new(inner),
255 |         })
256 |     }
257 |
258 |     /// Construct a Tantivy's RegexQuery
259 |     #[staticmethod]
260 |     #[pyo3(signature = (schema, field_name, regex_pattern))]
261 |     pub(crate) fn regex_query(
262 |         schema: &Schema,
263 |         field_name: &str,
264 |         regex_pattern: &str,
265 |     ) -> PyResult<Query> {
266 |         let field = get_field(&schema.inner, field_name)?;
267 |
268 |         let inner_result =
269 |             tv::query::RegexQuery::from_pattern(regex_pattern, field);
270 |         match inner_result {
271 |             Ok(inner) => Ok(Query {
272 |                 inner: Box::new(inner),
273 |             }),
274 |             Err(e) => Err(to_pyerr(e)),
275 |         }
276 |     }
277 |
278 |     #[staticmethod]
279 |     #[pyo3(signature = (doc_address, min_doc_frequency = Some(5), max_doc_frequency = None, min_term_frequency = Some(2), max_query_terms = Some(25), min_word_length = None, max_word_length = None, boost_factor = Some(1.0), stop_words = vec![]))]
280 |     #[allow(clippy::too_many_arguments)]
281 |     pub(crate) fn more_like_this_query(
282 |         doc_address: &DocAddress,
283 |         min_doc_frequency: Option<u64>,
284 |         max_doc_frequency: Option<u64>,
285 |         min_term_frequency: Option<usize>,
286 |         max_query_terms: Option<usize>,
287 |         min_word_length: Option<usize>,
288 |         max_word_length: Option<usize>,
289 |         boost_factor: Option<f32>,
290 |         stop_words: Vec<String>,
291 |     ) -> PyResult<Query> {
292 |         let mut builder = tv::query::MoreLikeThisQuery::builder();
293 |         if let Some(value) = min_doc_frequency {
294 |             builder = builder.with_min_doc_frequency(value);
295 |         }
296 |         if let Some(value) = max_doc_frequency {
297 |             builder = builder.with_max_doc_frequency(value);
298 |         }
299 |         if let Some(value) = min_term_frequency {
300 |             builder = builder.with_min_term_frequency(value);
301 |         }
302 |         if let Some(value) = max_query_terms {
303 |             builder = builder.with_max_query_terms(value);
304 |         }
305 |         if let Some(value) = min_word_length {
306 |             builder = builder.with_min_word_length(value);
307 |         }
308 |         if let Some(value) = max_word_length {
309 |             builder = builder.with_max_word_length(value);
310 |         }
311 |         if let Some(value) = boost_factor {
312 |             builder = builder.with_boost_factor(value);
313 |         }
314 |         builder = builder.with_stop_words(stop_words);
315 |
316 |         let inner = builder.with_document(tv::DocAddress::from(doc_address));
317 |         Ok(Query {
318 |             inner: Box::new(inner),
319 |         })
320 |     }
321 |
322 |     /// Construct a Tantivy's ConstScoreQuery
323 |     #[staticmethod]
324 |     #[pyo3(signature = (query, score))]
325 |     pub(crate) fn const_score_query(
326 |         query: Query,
327 |         score: f32,
328 |     ) -> PyResult<Query> {
329 |         let inner = tv::query::ConstScoreQuery::new(query.inner, score);
330 |         Ok(Query {
331 |             inner: Box::new(inner),
332 |         })
333 |     }
334 |
335 |     #[staticmethod]
336 |     #[pyo3(signature = (schema, field_name, field_type, lower_bound, upper_bound, include_lower = true, include_upper = true))]
337 |     pub(crate) fn range_query(
338 |         schema: &Schema,
339 |         field_name: &str,
340 |         field_type: FieldType,
341 |         lower_bound: &Bound<PyAny>,
342 |         upper_bound: &Bound<PyAny>,
343 |         include_lower: bool,
344 |         include_upper: bool,
345 |     ) -> PyResult<Query> {
346 |         match field_type {
347 |             FieldType::Text => {
348 |                 return Err(exceptions::PyValueError::new_err(
349 |                     "Text fields are not supported for range queries.",
350 |                 ))
351 |             }
352 |             FieldType::Boolean => {
353 |                 return Err(exceptions::PyValueError::new_err(
354 |                     "Boolean fields are not supported for range queries.",
355 |                 ))
356 |             }
357 |             FieldType::Facet => {
358 |                 return Err(exceptions::PyValueError::new_err(
359 |                     "Facet fields are not supported for range queries.",
360 |                 ))
361 |             }
362 |             FieldType::Bytes => {
363 |                 return Err(exceptions::PyValueError::new_err(
364 |                     "Bytes fields are not supported for range queries.",
365 |                 ))
366 |             }
367 |             FieldType::Json => {
368 |                 return Err(exceptions::PyValueError::new_err(
369 |                     "Json fields are not supported for range queries.",
370 |                 ))
371 |             }
372 |             _ => {}
373 |         }
374 |
375 |         // Look up the field in the schema. The given type must match the
376 |         // field type in the schema.
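        // A hedged Python-side illustration of this check (names assumed):
        //
        //     # Raises ValueError if the schema declares "rating" as, say, Float:
        //     Query.range_query(schema, "rating", FieldType.Integer, 1, 5)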
377 |         let field = get_field(&schema.inner, field_name)?;
378 |         let actual_field_entry = schema.inner.get_field_entry(field);
379 |         let actual_field_type = actual_field_entry.field_type().value_type(); // Reduce the field's tv::schema::FieldType to a tv::schema::Type
380 |         let given_field_type: tv::schema::Type = field_type.clone().into(); // Convert the local FieldType to a tv::schema::Type
381 |
382 |         if actual_field_type != given_field_type {
383 |             return Err(exceptions::PyValueError::new_err(format!(
384 |                 "Field type mismatch: field '{}' is type {:?}, but got {:?}",
385 |                 field_name, actual_field_type, given_field_type
386 |             )));
387 |         }
388 |
389 |         let lower_bound_term = make_term_for_type(
390 |             &schema.inner,
391 |             field_name,
392 |             field_type.clone(),
393 |             lower_bound,
394 |         )?;
395 |         let upper_bound_term = make_term_for_type(
396 |             &schema.inner,
397 |             field_name,
398 |             field_type.clone(),
399 |             upper_bound,
400 |         )?;
401 |
402 |         let lower_bound = if include_lower {
403 |             OpsBound::Included(lower_bound_term)
404 |         } else {
405 |             OpsBound::Excluded(lower_bound_term)
406 |         };
407 |
408 |         let upper_bound = if include_upper {
409 |             OpsBound::Included(upper_bound_term)
410 |         } else {
411 |             OpsBound::Excluded(upper_bound_term)
412 |         };
413 |
414 |         let inner = tv::query::RangeQuery::new(lower_bound, upper_bound);
415 |
416 |         Ok(Query {
417 |             inner: Box::new(inner),
418 |         })
419 |     }
420 | }
421 |
--------------------------------------------------------------------------------
/src/schema.rs:
--------------------------------------------------------------------------------
1 | use crate::to_pyerr;
2 | use pyo3::IntoPyObjectExt;
3 | use pyo3::{basic::CompareOp, prelude::*, types::PyTuple};
4 | use serde::{Deserialize, Serialize};
5 | use tantivy as tv;
6 |
7 | /// Tantivy's Type
8 | #[pyclass(frozen, module = "tantivy.tantivy")]
9 | #[derive(Clone, PartialEq)]
10 | pub(crate) enum FieldType {
11 |     Text,
12 |     Unsigned,
13 |     Integer,
14 |     Float,
15 |     Boolean,
16 |     Date,
17 |     Facet,
18 |     Bytes,
19 |     Json,
20 |     IpAddr,
21 | }
22 |
23 | impl From<FieldType> for tv::schema::Type {
24 |     fn from(field_type: FieldType) -> tv::schema::Type {
25 |         match field_type {
26 |             FieldType::Text => tv::schema::Type::Str,
27 |             FieldType::Unsigned => tv::schema::Type::U64,
28 |             FieldType::Integer => tv::schema::Type::I64,
29 |             FieldType::Float => tv::schema::Type::F64,
30 |             FieldType::Boolean => tv::schema::Type::Bool,
31 |             FieldType::Date => tv::schema::Type::Date,
32 |             FieldType::Facet => tv::schema::Type::Facet,
33 |             FieldType::Bytes => tv::schema::Type::Bytes,
34 |             FieldType::Json => tv::schema::Type::Json,
35 |             FieldType::IpAddr => tv::schema::Type::IpAddr,
36 |         }
37 |     }
38 | }
39 |
40 | /// Tantivy schema.
41 | ///
42 | /// The schema is very strict. To build the schema the `SchemaBuilder` class is
43 | /// provided.
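///
/// A minimal, hedged sketch of obtaining one (field names are illustrative):
///
/// >>> builder = tantivy.SchemaBuilder()
/// >>> builder.add_text_field("title", stored=True)
/// >>> schema = builder.build()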
44 | #[pyclass(frozen, module = "tantivy.tantivy")] 45 | #[derive(Deserialize, PartialEq, Serialize)] 46 | pub(crate) struct Schema { 47 | pub(crate) inner: tv::schema::Schema, 48 | } 49 | 50 | #[pymethods] 51 | impl Schema { 52 | fn __richcmp__( 53 | &self, 54 | other: &Self, 55 | op: CompareOp, 56 | py: Python<'_>, 57 | ) -> PyResult { 58 | match op { 59 | CompareOp::Eq => (self == other).into_py_any(py), 60 | CompareOp::Ne => (self != other).into_py_any(py), 61 | _ => Ok(py.NotImplemented()), 62 | } 63 | } 64 | 65 | #[staticmethod] 66 | fn _internal_from_pythonized(serialized: &Bound) -> PyResult { 67 | pythonize::depythonize(serialized).map_err(to_pyerr) 68 | } 69 | 70 | fn __reduce__<'a>( 71 | slf: PyRef<'a, Self>, 72 | py: Python<'a>, 73 | ) -> PyResult> { 74 | let serialized = pythonize::pythonize(py, &*slf).map_err(to_pyerr)?; 75 | let deserializer = slf 76 | .into_pyobject(py)? 77 | .getattr("_internal_from_pythonized")?; 78 | PyTuple::new( 79 | py, 80 | [deserializer, PyTuple::new(py, [serialized])?.into_any()], 81 | ) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/schemabuilder.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::new_ret_no_self)] 2 | 3 | use pyo3::{exceptions, prelude::*}; 4 | 5 | use crate::schema::Schema; 6 | use std::sync::{Arc, RwLock}; 7 | use tantivy::schema::{ 8 | self, BytesOptions, DateOptions, IpAddrOptions, INDEXED, 9 | }; 10 | 11 | /// Tantivy has a very strict schema. 12 | /// You need to specify in advance whether a field is indexed or not, 13 | /// stored or not. 14 | /// 15 | /// This is done by creating a schema object, and 16 | /// setting up the fields one by one. 17 | /// 18 | /// Examples: 19 | /// 20 | /// >>> builder = tantivy.SchemaBuilder() 21 | /// 22 | /// >>> title = builder.add_text_field("title", stored=True) 23 | /// >>> body = builder.add_text_field("body") 24 | /// 25 | /// >>> schema = builder.build() 26 | #[pyclass(module = "tantivy.tantivy")] 27 | #[derive(Clone)] 28 | pub(crate) struct SchemaBuilder { 29 | pub(crate) builder: Arc>>, 30 | } 31 | 32 | const NO_TOKENIZER_NAME: &str = "raw"; 33 | const TOKENIZER: &str = "default"; 34 | const RECORD: &str = "position"; 35 | 36 | #[pymethods] 37 | impl SchemaBuilder { 38 | #[new] 39 | fn new() -> Self { 40 | SchemaBuilder { 41 | builder: Arc::new(From::from(Some(schema::Schema::builder()))), 42 | } 43 | } 44 | 45 | #[staticmethod] 46 | fn is_valid_field_name(name: &str) -> bool { 47 | schema::is_valid_field_name(name) 48 | } 49 | 50 | /// Add a new text field to the schema. 51 | /// 52 | /// Args: 53 | /// name (str): The name of the field. 54 | /// stored (bool, optional): If true sets the field as stored, the 55 | /// content of the field can be later restored from a Searcher. 56 | /// Defaults to False. 57 | /// fast (bool, optional): Set the text options as a fast field. A 58 | /// fast field is a column-oriented fashion storage for tantivy. 59 | /// Text fast fields will have the term ids stored in the fast 60 | /// field. The fast field will be a multivalued fast field. 61 | /// It is recommended to use the "raw" tokenizer, since it will 62 | /// store the original text unchanged. The "default" tokenizer will 63 | /// store the terms as lower case and this will be reflected in the 64 | /// dictionary. 65 | /// tokenizer_name (str, optional): The name of the tokenizer that 66 | /// should be used to process the field. 
Defaults to 'default' 67 | /// index_option (str, optional): Sets which information should be 68 | /// indexed with the tokens. Can be one of 'position', 'freq' or 69 | /// 'basic'. Defaults to 'position'. The 'basic' index_option 70 | /// records only the document ID, the 'freq' option records the 71 | /// document id and the term frequency, while the 'position' option 72 | /// records the document id, term frequency and the positions of 73 | /// the term occurrences in the document. 74 | /// 75 | /// Returns the associated field handle. 76 | /// Raises a ValueError if there was an error with the field creation. 77 | #[pyo3(signature = ( 78 | name, 79 | stored = false, 80 | fast = false, 81 | tokenizer_name = TOKENIZER, 82 | index_option = RECORD 83 | ))] 84 | fn add_text_field( 85 | &mut self, 86 | name: &str, 87 | stored: bool, 88 | fast: bool, 89 | tokenizer_name: &str, 90 | index_option: &str, 91 | ) -> PyResult { 92 | let builder = &mut self.builder; 93 | let options = SchemaBuilder::build_text_option( 94 | stored, 95 | fast, 96 | tokenizer_name, 97 | index_option, 98 | )?; 99 | 100 | if let Some(builder) = builder.write().unwrap().as_mut() { 101 | builder.add_text_field(name, options); 102 | } else { 103 | return Err(exceptions::PyValueError::new_err( 104 | "Schema builder object isn't valid anymore.", 105 | )); 106 | } 107 | Ok(self.clone()) 108 | } 109 | 110 | /// Add a new signed integer field to the schema. 111 | /// 112 | /// Args: 113 | /// name (str): The name of the field. 114 | /// stored (bool, optional): If true sets the field as stored, the 115 | /// content of the field can be later restored from a Searcher. 116 | /// Defaults to False. 117 | /// indexed (bool, optional): If true sets the field to be indexed. 118 | /// fast (bool, optional): Set the numeric options as a fast field. A 119 | /// fast field is a column-oriented fashion storage for tantivy. 120 | /// It is designed for the fast random access of some document 121 | /// fields given a document id. 122 | /// 123 | /// Returns the associated field handle. 124 | /// Raises a ValueError if there was an error with the field creation. 125 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 126 | fn add_integer_field( 127 | &mut self, 128 | name: &str, 129 | stored: bool, 130 | indexed: bool, 131 | fast: bool, 132 | ) -> PyResult { 133 | let builder = &mut self.builder; 134 | 135 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 136 | 137 | if let Some(builder) = builder.write().unwrap().as_mut() { 138 | builder.add_i64_field(name, opts); 139 | } else { 140 | return Err(exceptions::PyValueError::new_err( 141 | "Schema builder object isn't valid anymore.", 142 | )); 143 | } 144 | Ok(self.clone()) 145 | } 146 | 147 | /// Add a new float field to the schema. 148 | /// 149 | /// Args: 150 | /// name (str): The name of the field. 151 | /// stored (bool, optional): If true sets the field as stored, the 152 | /// content of the field can be later restored from a Searcher. 153 | /// Defaults to False. 154 | /// indexed (bool, optional): If true sets the field to be indexed. 155 | /// fast (bool, optional): Set the numeric options as a fast field. A 156 | /// fast field is a column-oriented fashion storage for tantivy. 157 | /// It is designed for the fast random access of some document 158 | /// fields given a document id. 159 | /// 160 | /// Returns the associated field handle. 161 | /// Raises a ValueError if there was an error with the field creation. 
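    ///
    /// A hedged example (field name is illustrative):
    ///
    /// >>> builder.add_float_field("price", stored=True, indexed=True, fast=True)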
162 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 163 | fn add_float_field( 164 | &mut self, 165 | name: &str, 166 | stored: bool, 167 | indexed: bool, 168 | fast: bool, 169 | ) -> PyResult { 170 | let builder = &mut self.builder; 171 | 172 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 173 | 174 | if let Some(builder) = builder.write().unwrap().as_mut() { 175 | builder.add_f64_field(name, opts); 176 | } else { 177 | return Err(exceptions::PyValueError::new_err( 178 | "Schema builder object isn't valid anymore.", 179 | )); 180 | } 181 | Ok(self.clone()) 182 | } 183 | 184 | /// Add a new unsigned integer field to the schema. 185 | /// 186 | /// Args: 187 | /// name (str): The name of the field. 188 | /// stored (bool, optional): If true sets the field as stored, the 189 | /// content of the field can be later restored from a Searcher. 190 | /// Defaults to False. 191 | /// indexed (bool, optional): If true sets the field to be indexed. 192 | /// fast (bool, optional): Set the numeric options as a fast field. A 193 | /// fast field is a column-oriented fashion storage for tantivy. 194 | /// It is designed for the fast random access of some document 195 | /// fields given a document id. 196 | /// 197 | /// Returns the associated field handle. 198 | /// Raises a ValueError if there was an error with the field creation. 199 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 200 | fn add_unsigned_field( 201 | &mut self, 202 | name: &str, 203 | stored: bool, 204 | indexed: bool, 205 | fast: bool, 206 | ) -> PyResult { 207 | let builder = &mut self.builder; 208 | 209 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 210 | 211 | if let Some(builder) = builder.write().unwrap().as_mut() { 212 | builder.add_u64_field(name, opts); 213 | } else { 214 | return Err(exceptions::PyValueError::new_err( 215 | "Schema builder object isn't valid anymore.", 216 | )); 217 | } 218 | Ok(self.clone()) 219 | } 220 | 221 | /// Add a new boolean field to the schema. 222 | /// 223 | /// Args: 224 | /// name (str): The name of the field. 225 | /// stored (bool, optional): If true sets the field as stored, the 226 | /// content of the field can be later restored from a Searcher. 227 | /// Defaults to False. 228 | /// indexed (bool, optional): If true sets the field to be indexed. 229 | /// fast (bool, optional): Set the numeric options as a fast field. A 230 | /// fast field is a column-oriented fashion storage for tantivy. 231 | /// It is designed for the fast random access of some document 232 | /// fields given a document id. 233 | /// 234 | /// Returns the associated field handle. 235 | /// Raises a ValueError if there was an error with the field creation. 236 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 237 | fn add_boolean_field( 238 | &mut self, 239 | name: &str, 240 | stored: bool, 241 | indexed: bool, 242 | fast: bool, 243 | ) -> PyResult { 244 | let builder = &mut self.builder; 245 | 246 | let opts = SchemaBuilder::build_numeric_option(stored, indexed, fast)?; 247 | 248 | if let Some(builder) = builder.write().unwrap().as_mut() { 249 | builder.add_bool_field(name, opts); 250 | } else { 251 | return Err(exceptions::PyValueError::new_err( 252 | "Schema builder object isn't valid anymore.", 253 | )); 254 | } 255 | Ok(self.clone()) 256 | } 257 | 258 | /// Add a new date field to the schema. 259 | /// 260 | /// Args: 261 | /// name (str): The name of the field. 
262 | /// stored (bool, optional): If true sets the field as stored, the 263 | /// content of the field can be later restored from a Searcher. 264 | /// Defaults to False. 265 | /// indexed (bool, optional): If true sets the field to be indexed. 266 | /// fast (bool, optional): Set the date options as a fast field. A fast 267 | /// field is a column-oriented fashion storage for tantivy. It is 268 | /// designed for the fast random access of some document fields 269 | /// given a document id. 270 | /// 271 | /// Returns the associated field handle. 272 | /// Raises a ValueError if there was an error with the field creation. 273 | #[pyo3(signature = (name, stored = false, indexed = false, fast = false))] 274 | fn add_date_field( 275 | &mut self, 276 | name: &str, 277 | stored: bool, 278 | indexed: bool, 279 | fast: bool, 280 | ) -> PyResult { 281 | let builder = &mut self.builder; 282 | 283 | let mut opts = DateOptions::default(); 284 | if stored { 285 | opts = opts.set_stored(); 286 | } 287 | if indexed { 288 | opts = opts.set_indexed(); 289 | } 290 | if fast { 291 | opts = opts.set_fast(); 292 | } 293 | 294 | if let Some(builder) = builder.write().unwrap().as_mut() { 295 | builder.add_date_field(name, opts); 296 | } else { 297 | return Err(exceptions::PyValueError::new_err( 298 | "Schema builder object isn't valid anymore.", 299 | )); 300 | } 301 | Ok(self.clone()) 302 | } 303 | 304 | /// Add a new json field to the schema. 305 | /// 306 | /// Args: 307 | /// name (str): the name of the field. 308 | /// stored (bool, optional): If true sets the field as stored, the 309 | /// content of the field can be later restored from a Searcher. 310 | /// Defaults to False. 311 | /// fast (bool, optional): Set the text options as a fast field. A 312 | /// fast field is a column-oriented fashion storage for tantivy. 313 | /// Text fast fields will have the term ids stored in the fast 314 | /// field. The fast field will be a multivalued fast field. 315 | /// It is recommended to use the "raw" tokenizer, since it will 316 | /// store the original text unchanged. The "default" tokenizer will 317 | /// store the terms as lower case and this will be reflected in the 318 | /// dictionary. 319 | /// tokenizer_name (str, optional): The name of the tokenizer that 320 | /// should be used to process the field. Defaults to 'default' 321 | /// index_option (str, optional): Sets which information should be 322 | /// indexed with the tokens. Can be one of 'position', 'freq' or 323 | /// 'basic'. Defaults to 'position'. The 'basic' index_option 324 | /// records only the document ID, the 'freq' option records the 325 | /// document id and the term frequency, while the 'position' option 326 | /// records the document id, term frequency and the positions of 327 | /// the term occurrences in the document. 328 | /// 329 | /// Returns the associated field handle. 330 | /// Raises a ValueError if there was an error with the field creation. 
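    ///
    /// A hedged example (field name and JSON keys are illustrative):
    ///
    /// >>> builder.add_json_field("attributes", stored=True)
    /// >>> # later, when building a document:
    /// >>> # doc.add_json("attributes", {"color": "red", "size": 4})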
331 | #[pyo3(signature = ( 332 | name, 333 | stored = false, 334 | fast = false, 335 | tokenizer_name = TOKENIZER, 336 | index_option = RECORD 337 | ))] 338 | fn add_json_field( 339 | &mut self, 340 | name: &str, 341 | stored: bool, 342 | fast: bool, 343 | tokenizer_name: &str, 344 | index_option: &str, 345 | ) -> PyResult { 346 | let builder = &mut self.builder; 347 | let options = SchemaBuilder::build_text_option( 348 | stored, 349 | fast, 350 | tokenizer_name, 351 | index_option, 352 | )?; 353 | 354 | if let Some(builder) = builder.write().unwrap().as_mut() { 355 | builder.add_json_field(name, options); 356 | } else { 357 | return Err(exceptions::PyValueError::new_err( 358 | "Schema builder object isn't valid anymore.", 359 | )); 360 | } 361 | 362 | Ok(self.clone()) 363 | } 364 | 365 | /// Add a Facet field to the schema. 366 | /// Args: 367 | /// name (str): The name of the field. 368 | fn add_facet_field(&mut self, name: &str) -> PyResult { 369 | let builder = &mut self.builder; 370 | 371 | if let Some(builder) = builder.write().unwrap().as_mut() { 372 | builder.add_facet_field(name, INDEXED); 373 | } else { 374 | return Err(exceptions::PyValueError::new_err( 375 | "Schema builder object isn't valid anymore.", 376 | )); 377 | } 378 | Ok(self.clone()) 379 | } 380 | 381 | /// Add a fast bytes field to the schema. 382 | /// 383 | /// Args: 384 | /// name (str): The name of the field. 385 | /// stored (bool, optional): If true sets the field as stored, the 386 | /// content of the field can be later restored from a Searcher. 387 | /// Defaults to False. 388 | /// indexed (bool, optional): If true sets the field to be indexed. 389 | /// fast (bool, optional): Set the bytes options as a fast field. A fast 390 | /// field is a column-oriented fashion storage for tantivy. It is 391 | /// designed for the fast random access of some document fields 392 | /// given a document id. 393 | #[pyo3(signature = ( 394 | name, 395 | stored = false, 396 | indexed = false, 397 | fast = false 398 | ))] 399 | fn add_bytes_field( 400 | &mut self, 401 | name: &str, 402 | stored: bool, 403 | indexed: bool, 404 | fast: bool, 405 | ) -> PyResult { 406 | let builder = &mut self.builder; 407 | let mut opts = BytesOptions::default(); 408 | if stored { 409 | opts = opts.set_stored(); 410 | } 411 | if indexed { 412 | opts = opts.set_indexed(); 413 | } 414 | if fast { 415 | opts = opts.set_fast(); 416 | } 417 | 418 | if let Some(builder) = builder.write().unwrap().as_mut() { 419 | builder.add_bytes_field(name, opts); 420 | } else { 421 | return Err(exceptions::PyValueError::new_err( 422 | "Schema builder object isn't valid anymore.", 423 | )); 424 | } 425 | Ok(self.clone()) 426 | } 427 | 428 | /// Add an IP address field to the schema. 429 | /// 430 | /// Args: 431 | /// name (str): The name of the field. 432 | /// stored (bool, optional): If true sets the field as stored, the 433 | /// content of the field can be later restored from a Searcher. 434 | /// Defaults to False. 435 | /// indexed (bool, optional): If true sets the field to be indexed. 436 | /// fast (bool, optional): Set the IP address options as a fast field. A 437 | /// fast field is a column-oriented fashion storage for tantivy. It 438 | /// is designed for the fast random access of some document fields 439 | /// given a document id. 
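    ///
    /// A hedged example (field name and address are illustrative):
    ///
    /// >>> builder.add_ip_addr_field("client_ip", stored=True, indexed=True, fast=True)
    /// >>> # later, when building a document:
    /// >>> # doc.add_ip_addr("client_ip", "10.0.0.1")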
440 | #[pyo3(signature = ( 441 | name, 442 | stored = false, 443 | indexed = false, 444 | fast = false 445 | ))] 446 | fn add_ip_addr_field( 447 | &mut self, 448 | name: &str, 449 | stored: bool, 450 | indexed: bool, 451 | fast: bool, 452 | ) -> PyResult { 453 | let builder = &mut self.builder; 454 | let mut opts = IpAddrOptions::default(); 455 | if stored { 456 | opts = opts.set_stored(); 457 | } 458 | if indexed { 459 | opts = opts.set_indexed(); 460 | } 461 | if fast { 462 | opts = opts.set_fast(); 463 | } 464 | 465 | if let Some(builder) = builder.write().unwrap().as_mut() { 466 | builder.add_ip_addr_field(name, opts); 467 | } else { 468 | return Err(exceptions::PyValueError::new_err( 469 | "Schema builder object isn't valid anymore.", 470 | )); 471 | } 472 | 473 | Ok(self.clone()) 474 | } 475 | 476 | /// Finalize the creation of a Schema. 477 | /// 478 | /// Returns a Schema object. After this is called the SchemaBuilder cannot 479 | /// be used anymore. 480 | fn build(&mut self) -> PyResult { 481 | let builder = self.builder.write().unwrap().take(); 482 | if let Some(builder) = builder { 483 | let schema = builder.build(); 484 | Ok(Schema { inner: schema }) 485 | } else { 486 | Err(exceptions::PyValueError::new_err( 487 | "Schema builder object isn't valid anymore.", 488 | )) 489 | } 490 | } 491 | } 492 | 493 | impl SchemaBuilder { 494 | fn build_numeric_option( 495 | stored: bool, 496 | indexed: bool, 497 | fast: bool, 498 | ) -> PyResult { 499 | let opts = schema::NumericOptions::default(); 500 | let opts = if stored { opts.set_stored() } else { opts }; 501 | let opts = if indexed { opts.set_indexed() } else { opts }; 502 | let opts = if fast { opts.set_fast() } else { opts }; 503 | Ok(opts) 504 | } 505 | 506 | fn build_text_option( 507 | stored: bool, 508 | fast: bool, 509 | tokenizer_name: &str, 510 | index_option: &str, 511 | ) -> PyResult { 512 | let index_option = match index_option { 513 | "position" => schema::IndexRecordOption::WithFreqsAndPositions, 514 | "freq" => schema::IndexRecordOption::WithFreqs, 515 | "basic" => schema::IndexRecordOption::Basic, 516 | _ => return Err(exceptions::PyValueError::new_err( 517 | "Invalid index option, valid choices are: 'basic', 'freq' and 'position'" 518 | )) 519 | }; 520 | 521 | let indexing = schema::TextFieldIndexing::default() 522 | .set_tokenizer(tokenizer_name) 523 | .set_index_option(index_option); 524 | 525 | let options = 526 | schema::TextOptions::default().set_indexing_options(indexing); 527 | let options = if stored { 528 | options.set_stored() 529 | } else { 530 | options 531 | }; 532 | 533 | let options = if fast { 534 | let text_tokenizer = if tokenizer_name != NO_TOKENIZER_NAME { 535 | Some(tokenizer_name) 536 | } else { 537 | None 538 | }; 539 | options.set_fast(text_tokenizer) 540 | } else { 541 | options 542 | }; 543 | 544 | Ok(options) 545 | } 546 | } 547 | -------------------------------------------------------------------------------- /src/searcher.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::new_ret_no_self)] 2 | 3 | use crate::{document::Document, query::Query, to_pyerr}; 4 | use pyo3::types::PyDict; 5 | use pyo3::IntoPyObjectExt; 6 | use pyo3::{basic::CompareOp, exceptions::PyValueError, prelude::*}; 7 | use serde::{Deserialize, Serialize}; 8 | use tantivy as tv; 9 | use tantivy::aggregation::AggregationCollector; 10 | use tantivy::collector::{Count, MultiCollector, TopDocs}; 11 | use tantivy::TantivyDocument; 12 | // Bring the trait into scope. 
This is required for the `to_named_doc` method.
13 | // However, tantivy-py declares its own `Document` class, so we need to avoid
14 | // introducing the `Document` trait into the namespace.
15 | use tantivy::Document as _;
16 |
17 | /// Tantivy's Searcher class
18 | ///
19 | /// A Searcher is used to search the index given a prepared Query.
20 | #[pyclass(module = "tantivy.tantivy")]
21 | pub(crate) struct Searcher {
22 |     pub(crate) inner: tv::Searcher,
23 | }
24 |
25 | #[derive(
26 |     Clone, Deserialize, PartialEq, Serialize, FromPyObject, IntoPyObject,
27 | )]
28 | enum Fruit {
29 |     #[pyo3(transparent)]
30 |     Score(f32),
31 |     #[pyo3(transparent)]
32 |     Order(u64),
33 | }
34 |
35 | impl std::fmt::Debug for Fruit {
36 |     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37 |         match self {
38 |             Fruit::Score(s) => f.write_str(&format!("{s}")),
39 |             Fruit::Order(o) => f.write_str(&format!("{o}")),
40 |         }
41 |     }
42 | }
43 |
44 | #[pyclass(frozen, module = "tantivy.tantivy")]
45 | #[derive(Clone, Copy, Deserialize, PartialEq, Serialize)]
46 | /// Enum representing the direction in which something should be sorted.
47 | pub(crate) enum Order {
48 |     /// Ascending. Smaller values appear first.
49 |     Asc,
50 |
51 |     /// Descending. Larger values appear first.
52 |     Desc,
53 | }
54 |
55 | impl From<Order> for tv::Order {
56 |     fn from(order: Order) -> Self {
57 |         match order {
58 |             Order::Asc => tv::Order::Asc,
59 |             Order::Desc => tv::Order::Desc,
60 |         }
61 |     }
62 | }
63 |
64 | #[pyclass(frozen, module = "tantivy.tantivy")]
65 | #[derive(Clone, Default, Deserialize, PartialEq, Serialize)]
66 | /// Object holding the results of a successful search.
67 | pub(crate) struct SearchResult {
68 |     hits: Vec<(Fruit, DocAddress)>,
69 |     #[pyo3(get)]
70 |     /// How many documents matched the query. Only available if `count` was set
71 |     /// to true during the search.
72 |     count: Option<usize>,
73 | }
74 |
75 | #[pymethods]
76 | impl SearchResult {
77 |     #[new]
78 |     fn new(
79 |         py: Python,
80 |         hits: Vec<(PyObject, DocAddress)>,
81 |         count: Option<usize>,
82 |     ) -> PyResult<Self> {
83 |         let hits = hits
84 |             .iter()
85 |             .map(|(f, d)| Ok((f.extract(py)?, d.clone())))
86 |             .collect::<PyResult<Vec<_>>>()?;
87 |         Ok(Self { hits, count })
88 |     }
89 |
90 |     fn __repr__(&self) -> PyResult<String> {
91 |         if let Some(count) = self.count {
92 |             Ok(format!(
93 |                 "SearchResult(hits: {:?}, count: {})",
94 |                 self.hits, count
95 |             ))
96 |         } else {
97 |             Ok(format!("SearchResult(hits: {:?})", self.hits))
98 |         }
99 |     }
100 |
101 |     fn __richcmp__(
102 |         &self,
103 |         other: &Self,
104 |         op: CompareOp,
105 |         py: Python<'_>,
106 |     ) -> PyResult<PyObject> {
107 |         match op {
108 |             CompareOp::Eq => (self == other).into_py_any(py),
109 |             CompareOp::Ne => (self != other).into_py_any(py),
110 |             _ => Ok(py.NotImplemented()),
111 |         }
112 |     }
113 |
114 |     fn __getnewargs__(
115 |         &self,
116 |         py: Python,
117 |     ) -> PyResult<(Vec<(PyObject, DocAddress)>, Option<usize>)> {
118 |         Ok((self.hits(py)?, self.count))
119 |     }
120 |
121 |     #[getter]
122 |     /// The list of tuples that contains the scores and DocAddress of the
123 |     /// search results.
124 |     fn hits(&self, py: Python) -> PyResult<Vec<(PyObject, DocAddress)>> {
125 |         let ret = self
126 |             .hits
127 |             .iter()
128 |             .map(|(result, address)| -> PyResult<_> {
129 |                 Ok((result.clone().into_py_any(py)?, address.clone()))
130 |             })
131 |             .collect::<PyResult<Vec<_>>>()?;
132 |         Ok(ret)
133 |     }
134 | }
135 |
136 | #[pymethods]
137 | impl Searcher {
138 |     /// Search the index with the given query and collect results.
139 |     ///
140 |     /// Args:
141 |     ///     query (Query): The query that will be used for the search.
142 | /// limit (int, optional): The maximum number of search results to 143 | /// return. Defaults to 10. 144 | /// count (bool, optional): Should the number of documents that match 145 | /// the query be returned as well. Defaults to true. 146 | /// order_by_field (Field, optional): A schema field that the results 147 | /// should be ordered by. The field must be declared as a fast field 148 | /// when building the schema. Note, this only works for unsigned 149 | /// fields. 150 | /// offset (Field, optional): The offset from which the results have 151 | /// to be returned. 152 | /// order (Order, optional): The order in which the results 153 | /// should be sorted. If not specified, defaults to descending. 154 | /// 155 | /// Returns `SearchResult` object. 156 | /// 157 | /// Raises a ValueError if there was an error with the search. 158 | #[pyo3(signature = (query, limit = 10, count = true, order_by_field = None, offset = 0, order = Order::Desc))] 159 | #[allow(clippy::too_many_arguments)] 160 | fn search( 161 | &self, 162 | py: Python, 163 | query: &Query, 164 | limit: usize, 165 | count: bool, 166 | order_by_field: Option<&str>, 167 | offset: usize, 168 | order: Order, 169 | ) -> PyResult { 170 | py.allow_threads(move || { 171 | let mut multicollector = MultiCollector::new(); 172 | 173 | let count_handle = if count { 174 | Some(multicollector.add_collector(Count)) 175 | } else { 176 | None 177 | }; 178 | 179 | let (mut multifruit, hits) = { 180 | if let Some(order_by) = order_by_field { 181 | let collector = TopDocs::with_limit(limit) 182 | .and_offset(offset) 183 | .order_by_u64_field(order_by, order.into()); 184 | let top_docs_handle = 185 | multicollector.add_collector(collector); 186 | let ret = self.inner.search(query.get(), &multicollector); 187 | 188 | match ret { 189 | Ok(mut r) => { 190 | let top_docs = top_docs_handle.extract(&mut r); 191 | let result: Vec<(Fruit, DocAddress)> = top_docs 192 | .iter() 193 | .map(|(f, d)| { 194 | (Fruit::Order(*f), DocAddress::from(d)) 195 | }) 196 | .collect(); 197 | (r, result) 198 | } 199 | Err(e) => { 200 | return Err(PyValueError::new_err(e.to_string())) 201 | } 202 | } 203 | } else { 204 | let collector = 205 | TopDocs::with_limit(limit).and_offset(offset); 206 | let top_docs_handle = 207 | multicollector.add_collector(collector); 208 | let ret = self.inner.search(query.get(), &multicollector); 209 | 210 | match ret { 211 | Ok(mut r) => { 212 | let top_docs = top_docs_handle.extract(&mut r); 213 | let result: Vec<(Fruit, DocAddress)> = top_docs 214 | .iter() 215 | .map(|(f, d)| { 216 | (Fruit::Score(*f), DocAddress::from(d)) 217 | }) 218 | .collect(); 219 | (r, result) 220 | } 221 | Err(e) => { 222 | return Err(PyValueError::new_err(e.to_string())) 223 | } 224 | } 225 | } 226 | }; 227 | 228 | let count = count_handle.map(|h| h.extract(&mut multifruit)); 229 | 230 | Ok(SearchResult { hits, count }) 231 | }) 232 | } 233 | 234 | #[pyo3(signature = (query, agg))] 235 | fn aggregate( 236 | &self, 237 | py: Python, 238 | query: &Query, 239 | agg: Py, 240 | ) -> PyResult> { 241 | let py_json = py.import("json")?; 242 | let agg_query_str = py_json.call_method1("dumps", (agg,))?.to_string(); 243 | 244 | let agg_str = py.allow_threads(move || { 245 | let agg_collector = AggregationCollector::from_aggs( 246 | serde_json::from_str(&agg_query_str).map_err(to_pyerr)?, 247 | Default::default(), 248 | ); 249 | let agg_res = self 250 | .inner 251 | .search(query.get(), &agg_collector) 252 | .map_err(to_pyerr)?; 253 | 254 | 
serde_json::to_string(&agg_res).map_err(to_pyerr)
255 |         })?;
256 |
257 |         let agg_dict = py_json.call_method1("loads", (agg_str,))?;
258 |         let agg_dict = agg_dict.downcast::<PyDict>()?;
259 |
260 |         Ok(agg_dict.clone().unbind())
261 |     }
262 |
263 |     /// Returns the overall number of documents in the index.
264 |     #[getter]
265 |     fn num_docs(&self) -> u64 {
266 |         self.inner.num_docs()
267 |     }
268 |
269 |     /// Returns the number of segments in the index.
270 |     #[getter]
271 |     fn num_segments(&self) -> usize {
272 |         self.inner.segment_readers().len()
273 |     }
274 |
275 |     /// Return the overall number of documents containing
276 |     /// the given term.
277 |     #[pyo3(signature = (field_name, field_value))]
278 |     fn doc_freq(
279 |         &self,
280 |         field_name: &str,
281 |         field_value: &Bound<PyAny>,
282 |     ) -> PyResult<u64> {
283 |         // Wrap the tantivy Searcher `doc_freq` method to return a PyResult.
284 |         let schema = self.inner.schema();
285 |         let term = crate::make_term(schema, field_name, field_value)?;
286 |         self.inner.doc_freq(&term).map_err(to_pyerr)
287 |     }
288 |
289 |     /// Fetches a document from Tantivy's store given a DocAddress.
290 |     ///
291 |     /// Args:
292 |     ///     doc_address (DocAddress): The DocAddress that is associated with
293 |     ///         the document that we wish to fetch.
294 |     ///
295 |     /// Returns the Document, raises ValueError if the document can't be found.
296 |     fn doc(&self, doc_address: &DocAddress) -> PyResult<Document> {
297 |         let doc: TantivyDocument =
298 |             self.inner.doc(doc_address.into()).map_err(to_pyerr)?;
299 |         let named_doc = doc.to_named_doc(self.inner.schema());
300 |         Ok(crate::document::Document {
301 |             field_values: named_doc.0,
302 |         })
303 |     }
304 |
305 |     fn __repr__(&self) -> PyResult<String> {
306 |         Ok(format!(
307 |             "Searcher(num_docs={}, num_segments={})",
308 |             self.inner.num_docs(),
309 |             self.inner.segment_readers().len()
310 |         ))
311 |     }
312 | }
313 |
314 | /// DocAddress contains all the necessary information to identify a document
315 | /// given a Searcher object.
316 | ///
317 | /// It consists of an id identifying its segment, and its segment-local DocId.
318 | /// The id used for the segment is actually an ordinal in the list of segments
319 | /// held by a Searcher.
320 | #[pyclass(frozen, module = "tantivy.tantivy")]
321 | #[derive(
322 |     Clone, Debug, Deserialize, PartialEq, PartialOrd, Eq, Ord, Serialize,
323 | )]
324 | pub(crate) struct DocAddress {
325 |     pub(crate) segment_ord: tv::SegmentOrdinal,
326 |     pub(crate) doc: tv::DocId,
327 | }
328 |
329 | #[pymethods]
330 | impl DocAddress {
331 |     #[new]
332 |     fn new(segment_ord: tv::SegmentOrdinal, doc: tv::DocId) -> Self {
333 |         DocAddress { segment_ord, doc }
334 |     }
335 |
336 |     /// The segment ordinal is an id identifying the segment hosting the
337 |     /// document. It is only meaningful in the context of a searcher.
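    ///
    /// A hedged sketch of typical use (index and query setup omitted):
    ///
    /// >>> result = searcher.search(query, limit=10)
    /// >>> for _score, doc_address in result.hits:
    /// ...     doc = searcher.doc(doc_address)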
338 | #[getter] 339 | fn segment_ord(&self) -> u32 { 340 | self.segment_ord 341 | } 342 | 343 | /// The segment local DocId 344 | #[getter] 345 | fn doc(&self) -> u32 { 346 | self.doc 347 | } 348 | 349 | fn __richcmp__( 350 | &self, 351 | other: &Self, 352 | op: CompareOp, 353 | _py: Python<'_>, 354 | ) -> bool { 355 | op.matches(self.cmp(other)) 356 | } 357 | 358 | fn __getnewargs__(&self) -> PyResult<(tv::SegmentOrdinal, tv::DocId)> { 359 | Ok((self.segment_ord, self.doc)) 360 | } 361 | } 362 | 363 | impl From<&tv::DocAddress> for DocAddress { 364 | fn from(doc_address: &tv::DocAddress) -> Self { 365 | DocAddress { 366 | segment_ord: doc_address.segment_ord, 367 | doc: doc_address.doc_id, 368 | } 369 | } 370 | } 371 | 372 | impl From<&DocAddress> for tv::DocAddress { 373 | fn from(val: &DocAddress) -> Self { 374 | tv::DocAddress { 375 | segment_ord: val.segment_ord(), 376 | doc_id: val.doc(), 377 | } 378 | } 379 | } 380 | -------------------------------------------------------------------------------- /src/snippet.rs: -------------------------------------------------------------------------------- 1 | use crate::to_pyerr; 2 | use pyo3::prelude::*; 3 | use tantivy as tv; 4 | // Bring the trait into scope to use methods like `as_str()` on `OwnedValue`. 5 | use tantivy::schema::Value; 6 | 7 | /// Tantivy Snippet 8 | /// 9 | /// Snippet contains a fragment of a document, and some highlighted 10 | /// parts inside it. 11 | #[pyclass(module = "tantivy.tantivy")] 12 | pub(crate) struct Snippet { 13 | pub(crate) inner: tv::snippet::Snippet, 14 | } 15 | 16 | #[pyclass(module = "tantivy.tantivy")] 17 | pub(crate) struct Range { 18 | #[pyo3(get)] 19 | start: usize, 20 | #[pyo3(get)] 21 | end: usize, 22 | } 23 | 24 | #[pymethods] 25 | impl Snippet { 26 | pub fn to_html(&self) -> PyResult { 27 | Ok(self.inner.to_html()) 28 | } 29 | 30 | pub fn highlighted(&self) -> Vec { 31 | let highlighted = self.inner.highlighted(); 32 | let results = highlighted 33 | .iter() 34 | .map(|r| Range { 35 | start: r.start, 36 | end: r.end, 37 | }) 38 | .collect::>(); 39 | results 40 | } 41 | 42 | pub fn fragment(&self) -> PyResult { 43 | Ok(self.inner.fragment().to_string()) 44 | } 45 | } 46 | 47 | #[pyclass(module = "tantivy.tantivy")] 48 | pub(crate) struct SnippetGenerator { 49 | pub(crate) field_name: String, 50 | pub(crate) inner: tv::snippet::SnippetGenerator, 51 | } 52 | 53 | #[pymethods] 54 | impl SnippetGenerator { 55 | #[staticmethod] 56 | pub fn create( 57 | searcher: &crate::Searcher, 58 | query: &crate::Query, 59 | schema: &crate::Schema, 60 | field_name: &str, 61 | ) -> PyResult { 62 | let field = schema 63 | .inner 64 | .get_field(field_name) 65 | .or(Err("field not found")) 66 | .map_err(to_pyerr)?; 67 | let generator = tv::snippet::SnippetGenerator::create( 68 | &searcher.inner, 69 | query.get(), 70 | field, 71 | ) 72 | .map_err(to_pyerr)?; 73 | 74 | Ok(SnippetGenerator { 75 | field_name: field_name.to_string(), 76 | inner: generator, 77 | }) 78 | } 79 | 80 | pub fn snippet_from_doc(&self, doc: &crate::Document) -> crate::Snippet { 81 | let text: String = doc 82 | .iter_values_for_field(&self.field_name) 83 | .flat_map(|ov| ov.as_str()) 84 | .collect::>() 85 | .join(" "); 86 | 87 | let result = self.inner.snippet(&text); 88 | Snippet { inner: result } 89 | } 90 | 91 | pub fn set_max_num_chars(&mut self, max_num_chars: usize) { 92 | self.inner.set_max_num_chars(max_num_chars); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/tokenizer.rs: 
--------------------------------------------------------------------------------
1 | use pyo3::{exceptions::PyValueError, prelude::*};
2 | use tantivy::tokenizer as tvt;
3 |
4 | /// All Tantivy's built-in tokenizers in one place.
5 | /// Each static method, e.g. Tokenizer.simple(),
6 | /// creates a wrapper around a Tantivy tokenizer.
7 | ///
8 | /// ## Example:
9 | ///
10 | /// ```python
11 | /// tokenizer = Tokenizer.regex(r"\w+")
12 | /// ```
13 | ///
14 | /// ## Usage
15 | ///
16 | /// In general, tokenizer objects' only reason
17 | /// for existing is to be passed to
18 | /// TextAnalyzerBuilder(tokenizer=...)
19 | ///
20 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/index.html
21 | ///
22 | // ## Implementation details:
23 | //
24 | // This is a complex enum. Each variant is a struct
25 | // that defines the arguments accepted by the
26 | // corresponding tokenizer's constructor.
27 | // The enum members, e.g. _Raw, are not instantiated
28 | // directly because our version of pyo3 (0.21.0)
29 | // does not have the #[pyo3(constructor = ...)]
30 | // attribute yet, making it more sensible to
31 | // define constructor signatures using a separate method.
32 | #[pyclass(module = "tantivy.tokenizer")]
33 | #[derive(Debug)]
34 | pub enum Tokenizer {
35 |     _Raw {},
36 |     _Simple {},
37 |     _Whitespace {},
38 |     _Regex {
39 |         pattern: String,
40 |     },
41 |     _Ngram {
42 |         min_gram: usize,
43 |         max_gram: usize,
44 |         prefix_only: bool,
45 |     },
46 |     _Facet {},
47 | }
48 |
49 | #[pymethods]
50 | impl Tokenizer {
51 |     /// SimpleTokenizer
52 |     #[staticmethod]
53 |     fn simple() -> PyResult<Tokenizer> {
54 |         Ok(Tokenizer::_Simple {})
55 |     }
56 |
57 |     /// WhitespaceTokenizer
58 |     #[staticmethod]
59 |     fn whitespace() -> PyResult<Tokenizer> {
60 |         Ok(Tokenizer::_Whitespace {})
61 |     }
62 |
63 |     /// Raw Tokenizer
64 |     #[staticmethod]
65 |     fn raw() -> PyResult<Tokenizer> {
66 |         Ok(Tokenizer::_Raw {})
67 |     }
68 |
69 |     /// FacetTokenizer
70 |     #[staticmethod]
71 |     fn facet() -> PyResult<Tokenizer> {
72 |         Ok(Tokenizer::_Facet {})
73 |     }
74 |
75 |     /// RegexTokenizer
76 |     #[staticmethod]
77 |     fn regex(pattern: String) -> PyResult<Tokenizer> {
78 |         Ok(Tokenizer::_Regex { pattern })
79 |     }
80 |
81 |     /// NgramTokenizer
82 |     ///
83 |     /// Args:
84 |     ///   - min_gram (int): Minimum character length of each ngram.
85 |     ///   - max_gram (int): Maximum character length of each ngram.
86 |     ///   - prefix_only (bool, optional): If true, ngrams must count from the start of the word.
87 |     #[pyo3(signature=(min_gram=2,max_gram=3,prefix_only=false))]
88 |     #[staticmethod]
89 |     fn ngram(
90 |         min_gram: usize,
91 |         max_gram: usize,
92 |         prefix_only: bool,
93 |     ) -> PyResult<Tokenizer> {
94 |         Ok(Tokenizer::_Ngram {
95 |             min_gram,
96 |             max_gram,
97 |             prefix_only,
98 |         })
99 |     }
100 |
101 |     fn __repr__(&self) -> String {
102 |         format!("tantivy.Tokenizer({:?})", &self)
103 |     }
104 | }
105 |
106 | /// All Tantivy's builtin TokenFilters.
107 | ///
108 | /// ## Example
109 | ///
110 | /// ```python
111 | /// filter = Filter.alphanum_only()
112 | /// ```
113 | ///
114 | /// ## Usage
115 | ///
116 | /// In general, filter objects exist to
117 | /// be passed to the filter() method
118 | /// of a TextAnalyzerBuilder instance.
119 | ///
120 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/index.html
121 | ///
122 | // ## Implementation details:
123 | //
124 | // This is a complex enum. Each variant is a struct
125 | // that defines the arguments accepted by the
126 | // corresponding tokenizer's constructor.
127 | // The enum members, e.g.
_AlphaNum, are not instantiated 128 | // directly because our version of pyo3 (0.21.0) 129 | // does not have the #[pyo3(constructor = ...)], 130 | // attribute yet, making it more sensible to 131 | // define constructor signatures using a separate method. 132 | #[pyclass(module = "tantivy.tokenizer")] 133 | #[derive(Debug)] 134 | pub enum Filter { 135 | _AlphaNumOnly {}, 136 | _AsciiFolding {}, 137 | _LowerCaser {}, 138 | _RemoveLong { length_limit: usize }, 139 | _Stemmer { language: String }, 140 | _StopWord { language: String }, 141 | _CustomStopWord { stopwords: Vec }, 142 | _SplitCompound { constituent_words: Vec }, 143 | } 144 | 145 | #[pymethods] 146 | impl Filter { 147 | /// AlphaNumOnlyFilter 148 | #[staticmethod] 149 | fn alphanum_only() -> PyResult { 150 | Ok(Filter::_AlphaNumOnly {}) 151 | } 152 | 153 | /// AsciiFoldingFilter 154 | #[staticmethod] 155 | fn ascii_fold() -> PyResult { 156 | Ok(Filter::_AsciiFolding {}) 157 | } 158 | 159 | #[staticmethod] 160 | fn lowercase() -> PyResult { 161 | Ok(Filter::_LowerCaser {}) 162 | } 163 | 164 | /// RemoveLongFilter 165 | /// 166 | /// Args: 167 | /// - length_limit (int): max character length of token. 168 | #[staticmethod] 169 | fn remove_long(length_limit: usize) -> PyResult { 170 | Ok(Filter::_RemoveLong { length_limit }) 171 | } 172 | 173 | /// Stemmer 174 | #[staticmethod] 175 | fn stemmer(language: String) -> PyResult { 176 | Ok(Filter::_Stemmer { language }) 177 | } 178 | 179 | /// StopWordFilter (builtin stop word list) 180 | /// 181 | /// Args: 182 | /// - language (string): Stop words list language. 183 | /// Valid values: { 184 | /// "arabic", "danish", "dutch", "english", "finnish", "french", "german", "greek", 185 | /// "hungarian", "italian", "norwegian", "portuguese", "romanian", "russian", 186 | /// "spanish", "swedish", "tamil", "turkish" 187 | /// } 188 | // ## Implementation notes: 189 | // An enum would make more sense for `language`, but I'm not sure if it's worth it. 190 | #[staticmethod] 191 | fn stopword(language: String) -> PyResult { 192 | Ok(Filter::_StopWord { language }) 193 | } 194 | 195 | /// StopWordFilter (user-provided stop word list) 196 | /// 197 | /// This variant of Filter.stopword() lets you provide 198 | /// your own custom list of stopwords. 199 | /// 200 | /// Args: 201 | /// - stopwords (list(str)): a list of words to be removed. 202 | #[staticmethod] 203 | fn custom_stopword(stopwords: Vec) -> PyResult { 204 | Ok(Filter::_CustomStopWord { stopwords }) 205 | } 206 | 207 | /// SplitCompoundWords 208 | /// 209 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/struct.SplitCompoundWords.html 210 | /// 211 | /// Args: 212 | /// - constituent_words (list(string)): words that make up compound word (must be in order). 
213 |     ///
214 |     /// Example:
215 |     ///
216 |     /// ```python
217 |     /// # useless, contrived example:
218 |     /// compound_splitter = Filter.split_compound(['butter', 'fly'])
219 |     /// # Will split 'butterfly' -> ['butter', 'fly'],
220 |     /// # but won't split 'buttering' or 'buttercupfly'
221 |     /// ```
222 |     #[staticmethod]
223 |     fn split_compound(constituent_words: Vec<String>) -> PyResult<Filter> {
224 |         Ok(Filter::_SplitCompound { constituent_words })
225 |     }
226 |
227 |     fn __repr__(&self) -> String {
228 |         format!("tantivy.Filter(kind={:?})", &self)
229 |     }
230 | }
231 |
232 | fn parse_language(lang: &str) -> Result<tvt::Language, String> {
233 |     match lang.to_lowercase().as_str() {
234 |         "arabic" => Ok(tvt::Language::Arabic),
235 |         "danish" => Ok(tvt::Language::Danish),
236 |         "dutch" => Ok(tvt::Language::Dutch),
237 |         "english" => Ok(tvt::Language::English),
238 |         "finnish" => Ok(tvt::Language::Finnish),
239 |         "french" => Ok(tvt::Language::French),
240 |         "german" => Ok(tvt::Language::German),
241 |         "greek" => Ok(tvt::Language::Greek),
242 |         "hungarian" => Ok(tvt::Language::Hungarian),
243 |         "italian" => Ok(tvt::Language::Italian),
244 |         "norwegian" => Ok(tvt::Language::Norwegian),
245 |         "portuguese" => Ok(tvt::Language::Portuguese),
246 |         "romanian" => Ok(tvt::Language::Romanian),
247 |         "russian" => Ok(tvt::Language::Russian),
248 |         "spanish" => Ok(tvt::Language::Spanish),
249 |         "swedish" => Ok(tvt::Language::Swedish),
250 |         "tamil" => Ok(tvt::Language::Tamil),
251 |         "turkish" => Ok(tvt::Language::Turkish),
252 |         _ => Err(format!("Unsupported language: {}", lang)),
253 |     }
254 | }
255 |
256 | /// Tantivy's TextAnalyzer
257 | ///
258 | /// Do not instantiate this class directly.
259 | /// Use the `TextAnalyzerBuilder` class instead.
260 | #[derive(Clone)]
261 | #[pyclass(module = "tantivy.tantivy")]
262 | pub(crate) struct TextAnalyzer {
263 |     pub(crate) analyzer: tvt::TextAnalyzer,
264 | }
265 |
266 | #[pymethods]
267 | impl TextAnalyzer {
268 |     /// Tokenize a string
269 |     /// Args:
270 |     ///   - text (string): text to tokenize.
271 |     /// Returns:
272 |     ///   - list(string): a list of tokens/words.
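    ///
    /// A hedged example (token output assumed from the simple tokenizer
    /// plus a lowercase filter):
    ///
    /// ```python
    /// analyzer = (
    ///     TextAnalyzerBuilder(Tokenizer.simple())
    ///     .filter(Filter.lowercase())
    ///     .build()
    /// )
    /// analyzer.analyze("Hello, World!")  # ["hello", "world"]
    /// ```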
273 | fn analyze(&mut self, text: &str) -> Vec { 274 | let mut token_stream = self.analyzer.token_stream(text); 275 | let mut tokens = Vec::new(); 276 | 277 | while token_stream.advance() { 278 | tokens.push(token_stream.token().text.clone()); 279 | } 280 | tokens 281 | } 282 | } 283 | 284 | /// Tantivy's TextAnalyzerBuilder 285 | /// 286 | /// # Example 287 | /// 288 | /// ```python 289 | /// my_analyzer: TextAnalyzer = ( 290 | /// TextAnalyzerBuilder(Tokenizer.simple()) 291 | /// .filter(Filter.lowercase()) 292 | /// .filter(Filter.ngram()) 293 | /// .build() 294 | /// ) 295 | /// ``` 296 | /// 297 | /// https://docs.rs/tantivy/latest/tantivy/tokenizer/struct.TextAnalyzerBuilder.html 298 | #[pyclass] 299 | pub struct TextAnalyzerBuilder { 300 | builder: Option, 301 | } 302 | 303 | #[pymethods] 304 | impl TextAnalyzerBuilder { 305 | #[new] 306 | fn new(tokenizer: &Tokenizer) -> PyResult { 307 | let builder: tvt::TextAnalyzerBuilder = match tokenizer { 308 | Tokenizer::_Raw {} => { 309 | tvt::TextAnalyzer::builder(tvt::RawTokenizer::default()) 310 | .dynamic() 311 | } 312 | Tokenizer::_Simple {} => { 313 | tvt::TextAnalyzer::builder(tvt::SimpleTokenizer::default()) 314 | .dynamic() 315 | } 316 | Tokenizer::_Whitespace {} => { 317 | tvt::TextAnalyzer::builder(tvt::WhitespaceTokenizer::default()) 318 | .dynamic() 319 | } 320 | Tokenizer::_Regex { pattern } => tvt::TextAnalyzer::builder( 321 | tvt::RegexTokenizer::new(pattern).map_err(|e| { 322 | PyErr::new::(format!( 323 | "Invalid regex pattern: {}", 324 | e 325 | )) 326 | })?, // tvt::RegexTokenizer::new(pattern) .unwrap(), 327 | ) 328 | .dynamic(), 329 | Tokenizer::_Ngram { 330 | min_gram, 331 | max_gram, 332 | prefix_only, 333 | } => tvt::TextAnalyzer::builder( 334 | tvt::NgramTokenizer::new(*min_gram, *max_gram, *prefix_only) 335 | .unwrap(), 336 | ) 337 | .dynamic(), 338 | Tokenizer::_Facet {} => { 339 | tvt::TextAnalyzer::builder(tvt::FacetTokenizer::default()) 340 | .dynamic() 341 | } 342 | }; 343 | 344 | Ok(TextAnalyzerBuilder { 345 | builder: Some(builder.dynamic()), 346 | }) 347 | } 348 | 349 | /// Add filter to the builder. 350 | /// 351 | /// Args: 352 | /// - filter (Filter): a Filter object. 353 | /// Returns: 354 | /// - TextAnalyzerBuilder: A new instance of the builder 355 | /// 356 | /// Note: The builder is _not_ mutated in-place. 
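    ///
    /// A hedged sketch; note the rebinding, since each call returns a
    /// new builder rather than mutating the receiver:
    ///
    /// ```python
    /// builder = TextAnalyzerBuilder(Tokenizer.whitespace())
    /// builder = builder.filter(Filter.lowercase())
    /// builder = builder.filter(Filter.custom_stopword(["the", "a"]))
    /// ```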
357 | fn filter(&mut self, filter: &Filter) -> PyResult { 358 | if let Some(builder) = self.builder.take() { 359 | let new_builder: tvt::TextAnalyzerBuilder = match filter { 360 | Filter::_AlphaNumOnly {} => { 361 | builder.filter_dynamic(tvt::AlphaNumOnlyFilter {}) 362 | } 363 | Filter::_AsciiFolding {} => { 364 | builder.filter_dynamic(tvt::AsciiFoldingFilter) 365 | } 366 | Filter::_LowerCaser {} => { 367 | builder.filter_dynamic(tvt::LowerCaser) 368 | } 369 | Filter::_RemoveLong { length_limit } => builder.filter_dynamic( 370 | tvt::RemoveLongFilter::limit(*length_limit), 371 | ), 372 | Filter::_Stemmer { language } => { 373 | match parse_language(language) { 374 | Ok(lang) => { 375 | builder.filter_dynamic(tvt::Stemmer::new(lang)) 376 | } 377 | Err(e) => { 378 | return Err(PyErr::new::< 379 | pyo3::exceptions::PyValueError, 380 | _, 381 | >(e)) 382 | } 383 | } 384 | } 385 | Filter::_StopWord { language } => { 386 | match parse_language(language) { 387 | Ok(lang) => builder.filter_dynamic( 388 | tvt::StopWordFilter::new(lang).unwrap(), 389 | ), 390 | Err(e) => { 391 | return Err(PyErr::new::< 392 | pyo3::exceptions::PyValueError, 393 | _, 394 | >(e)) 395 | } 396 | } 397 | } 398 | Filter::_CustomStopWord { stopwords } => builder 399 | .filter_dynamic(tvt::StopWordFilter::remove( 400 | stopwords.clone(), 401 | )), 402 | Filter::_SplitCompound { constituent_words } => builder 403 | .filter_dynamic( 404 | tvt::SplitCompoundWords::from_dictionary( 405 | constituent_words, 406 | ) 407 | .unwrap(), 408 | ), 409 | }; 410 | Ok(TextAnalyzerBuilder { 411 | builder: Some(new_builder), 412 | }) 413 | } else { 414 | Err(PyErr::new::( 415 | "Builder has already been consumed", 416 | )) 417 | } 418 | } 419 | 420 | /// Build final TextAnalyzer object. 421 | /// 422 | /// Returns: 423 | /// - TextAnalyzer with tokenizer and filters baked in. 424 | /// 425 | /// Tip: TextAnalyzer's `analyze(text) -> tokens` method lets you 426 | /// easily check if your analyzer is working as expected. 
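    ///
    /// A hedged continuation of the builder sketch above (output assumed):
    ///
    /// ```python
    /// analyzer = builder.build()
    /// analyzer.analyze("The quick brown fox")  # ["quick", "brown", "fox"]
    /// ```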
427 | fn build(&mut self) -> PyResult { 428 | if let Some(builder) = self.builder.take() { 429 | Ok(TextAnalyzer { 430 | analyzer: builder.build(), 431 | }) 432 | } else { 433 | Err(PyErr::new::( 434 | "Builder has already been consumed", 435 | )) 436 | } 437 | } 438 | } 439 | -------------------------------------------------------------------------------- /tantivy/__init__.py: -------------------------------------------------------------------------------- 1 | from .tantivy import * -------------------------------------------------------------------------------- /tantivy/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quickwit-oss/tantivy-py/23871c1aa2752010b34df117405ccb5da37e94ba/tantivy/py.typed -------------------------------------------------------------------------------- /tantivy/tantivy.pyi: -------------------------------------------------------------------------------- 1 | import datetime 2 | from enum import Enum 3 | from typing import Any, Optional, Sequence, TypeVar, Union 4 | 5 | 6 | class Schema: 7 | pass 8 | 9 | 10 | class SchemaBuilder: 11 | @staticmethod 12 | def is_valid_field_name(name: str) -> bool: 13 | pass 14 | 15 | def add_text_field( 16 | self, 17 | name: str, 18 | stored: bool = False, 19 | fast: bool = False, 20 | tokenizer_name: str = "default", 21 | index_option: str = "position", 22 | ) -> SchemaBuilder: 23 | pass 24 | 25 | def add_integer_field( 26 | self, 27 | name: str, 28 | stored: bool = False, 29 | indexed: bool = False, 30 | fast: bool = False, 31 | ) -> SchemaBuilder: 32 | pass 33 | 34 | def add_float_field( 35 | self, 36 | name: str, 37 | stored: bool = False, 38 | indexed: bool = False, 39 | fast: bool = False, 40 | ) -> SchemaBuilder: 41 | pass 42 | 43 | def add_unsigned_field( 44 | self, 45 | name: str, 46 | stored: bool = False, 47 | indexed: bool = False, 48 | fast: bool = False, 49 | ) -> SchemaBuilder: 50 | pass 51 | 52 | def add_boolean_field( 53 | self, 54 | name: str, 55 | stored: bool = False, 56 | indexed: bool = False, 57 | fast: bool = False, 58 | ) -> SchemaBuilder: 59 | pass 60 | 61 | def add_date_field( 62 | self, 63 | name: str, 64 | stored: bool = False, 65 | indexed: bool = False, 66 | fast: bool = False, 67 | ) -> SchemaBuilder: 68 | pass 69 | 70 | def add_json_field( 71 | self, 72 | name: str, 73 | stored: bool = False, 74 | tokenizer_name: str = "default", 75 | index_option: str = "position", 76 | ) -> SchemaBuilder: 77 | pass 78 | 79 | def add_facet_field( 80 | self, 81 | name: str, 82 | ) -> SchemaBuilder: 83 | pass 84 | 85 | def add_bytes_field( 86 | self, 87 | name: str, 88 | stored: bool = False, 89 | indexed: bool = False, 90 | fast: bool = False, 91 | index_option: str = "position", 92 | ) -> SchemaBuilder: 93 | pass 94 | 95 | def add_ip_addr_field( 96 | self, 97 | name: str, 98 | stored: bool = False, 99 | indexed: bool = False, 100 | fast: bool = False, 101 | ) -> SchemaBuilder: 102 | pass 103 | 104 | def build(self) -> Schema: 105 | pass 106 | 107 | 108 | class Facet: 109 | @staticmethod 110 | def from_encoded(encoded_bytes: bytes) -> Facet: 111 | pass 112 | 113 | @classmethod 114 | def root(cls) -> Facet: 115 | pass 116 | 117 | @classmethod 118 | def from_string(cls, facet_string: str) -> Facet: 119 | pass 120 | 121 | @property 122 | def is_root(self) -> bool: 123 | pass 124 | 125 | def is_prefix_of(self, other: Facet) -> bool: 126 | pass 127 | 128 | def to_path(self) -> list[str]: 129 | pass 130 | 131 | def to_path_str(self) -> str: 132 | 
pass 133 | 134 | 135 | class Document: 136 | def __new__(cls, **kwargs) -> Document: 137 | pass 138 | 139 | def __getitem__(self, key: str) -> list[Any]: 140 | pass 141 | 142 | def extend(self, py_dict: dict, schema: Optional[Schema]) -> None: 143 | pass 144 | 145 | @staticmethod 146 | def from_dict(py_dict: dict, schema: Optional[Schema] = None) -> Document: 147 | pass 148 | 149 | def to_dict(self) -> dict[str, list[Any]]: 150 | pass 151 | 152 | def add_text(self, field_name: str, text: str) -> None: 153 | pass 154 | 155 | def add_unsigned(self, field_name: str, value: int) -> None: 156 | pass 157 | 158 | def add_integer(self, field_name: str, value: int) -> None: 159 | pass 160 | 161 | def add_float(self, field_name: str, value: float) -> None: 162 | pass 163 | 164 | def add_boolean(self, field_name: str, value: bool) -> None: 165 | pass 166 | 167 | def add_date(self, field_name: str, value: datetime.datetime) -> None: 168 | pass 169 | 170 | def add_facet(self, field_name: str, facet: Facet) -> None: 171 | pass 172 | 173 | def add_bytes(self, field_name: str, bytes: bytes) -> None: 174 | pass 175 | 176 | def add_json(self, field_name: str, value: Any) -> None: 177 | pass 178 | 179 | def add_ip_addr(self, field_name: str, ip_addr: str) -> None: 180 | pass 181 | 182 | @property 183 | def num_fields(self) -> int: 184 | pass 185 | 186 | @property 187 | def is_empty(self) -> bool: 188 | pass 189 | 190 | def get_first(self, field_name: str) -> Optional[Any]: 191 | pass 192 | 193 | def get_all(self, field_name: str) -> list[Any]: 194 | pass 195 | 196 | 197 | class Occur(Enum): 198 | Must = 1 199 | Should = 2 200 | MustNot = 3 201 | 202 | 203 | class FieldType(Enum): 204 | Text = 1 205 | Unsigned = 2 206 | Integer = 3 207 | Float = 4 208 | Boolean = 5 209 | Date = 6 210 | Facet = 7 211 | Bytes = 8 212 | Json = 9 213 | IpAddr = 10 214 | 215 | 216 | _RangeType = TypeVar( 217 | "_RangeType", bound=int | float | datetime.datetime | bool | str | bytes 218 | ) 219 | 220 | 221 | class Query: 222 | @staticmethod 223 | def term_query( 224 | schema: Schema, 225 | field_name: str, 226 | field_value: Any, 227 | index_option: str = "position", 228 | ) -> Query: 229 | pass 230 | 231 | @staticmethod 232 | def term_set_query( 233 | schema: Schema, field_name: str, field_values: Sequence[Any] 234 | ) -> Query: 235 | pass 236 | 237 | @staticmethod 238 | def all_query() -> Query: 239 | pass 240 | 241 | @staticmethod 242 | def fuzzy_term_query( 243 | schema: Schema, 244 | field_name: str, 245 | text: str, 246 | distance: int = 1, 247 | transposition_cost_one: bool = True, 248 | prefix=False, 249 | ) -> Query: 250 | pass 251 | 252 | @staticmethod 253 | def phrase_query( 254 | schema: Schema, 255 | field_name: str, 256 | words: list[Union[str, tuple[int, str]]], 257 | slop: int = 0, 258 | ) -> Query: 259 | pass 260 | 261 | @staticmethod 262 | def boolean_query(subqueries: Sequence[tuple[Occur, Query]]) -> Query: 263 | pass 264 | 265 | @staticmethod 266 | def disjunction_max_query( 267 | subqueries: Sequence[Query], tie_breaker: Optional[float] = None 268 | ) -> Query: 269 | pass 270 | 271 | @staticmethod 272 | def boost_query(query: Query, boost: float) -> Query: 273 | pass 274 | 275 | @staticmethod 276 | def regex_query(schema: Schema, field_name: str, regex_pattern: str) -> Query: 277 | pass 278 | 279 | @staticmethod 280 | def more_like_this_query( 281 | doc_address: DocAddress, 282 | min_doc_frequency: Optional[int] = 5, 283 | max_doc_frequency: Optional[int] = None, 284 | min_term_frequency: Optional[int] = 2, 
285 | max_query_terms: Optional[int] = 25, 286 | min_word_length: Optional[int] = None, 287 | max_word_length: Optional[int] = None, 288 | boost_factor: Optional[float] = 1.0, 289 | stop_words: list[str] = [], 290 | ) -> Query: 291 | pass 292 | 293 | @staticmethod 294 | def const_score_query(query: Query, score: float) -> Query: 295 | pass 296 | 297 | @staticmethod 298 | def range_query( 299 | schema: Schema, 300 | field_name: str, 301 | field_type: FieldType, 302 | lower_bound: _RangeType, 303 | upper_bound: _RangeType, 304 | include_lower: bool = True, 305 | include_upper: bool = True, 306 | ) -> Query: 307 | pass 308 | 309 | 310 | class Order(Enum): 311 | Asc = 1 312 | Desc = 2 313 | 314 | 315 | class DocAddress: 316 | def __new__(cls, segment_ord: int, doc: int) -> DocAddress: 317 | pass 318 | 319 | @property 320 | def segment_ord(self) -> int: 321 | pass 322 | 323 | @property 324 | def doc(self) -> int: 325 | pass 326 | 327 | 328 | class SearchResult: 329 | @property 330 | def hits(self) -> list[tuple[Any, DocAddress]]: 331 | pass 332 | 333 | 334 | class Searcher: 335 | def search( 336 | self, 337 | query: Query, 338 | limit: int = 10, 339 | count: bool = True, 340 | order_by_field: Optional[str] = None, 341 | offset: int = 0, 342 | order: Order = Order.Desc, 343 | ) -> SearchResult: 344 | pass 345 | 346 | def aggregate( 347 | self, 348 | search_query: Query, 349 | agg_query: dict, 350 | ) -> dict: 351 | pass 352 | 353 | @property 354 | def num_docs(self) -> int: 355 | pass 356 | 357 | @property 358 | def num_segments(self) -> int: 359 | pass 360 | 361 | def doc(self, doc_address: DocAddress) -> Document: 362 | pass 363 | 364 | def doc_freq(self, field_name: str, field_value: Any) -> int: 365 | pass 366 | 367 | 368 | class IndexWriter: 369 | def add_document(self, doc: Document) -> int: 370 | pass 371 | 372 | def add_json(self, json: str) -> int: 373 | pass 374 | 375 | def commit(self) -> int: 376 | pass 377 | 378 | def rollback(self) -> int: 379 | pass 380 | 381 | def garbage_collect_files(self) -> None: 382 | pass 383 | 384 | def delete_all_documents(self) -> None: 385 | pass 386 | 387 | @property 388 | def commit_opstamp(self) -> int: 389 | pass 390 | 391 | def delete_documents(self, field_name: str, field_value: Any) -> int: 392 | pass 393 | 394 | def delete_documents_by_term(self, field_name: str, field_value: Any) -> int: 395 | pass 396 | 397 | def delete_documents_by_query(self, query: Query) -> int: 398 | pass 399 | 400 | def wait_merging_threads(self) -> None: 401 | pass 402 | 403 | 404 | class Index: 405 | def __new__( 406 | cls, schema: Schema, path: Optional[str] = None, reuse: bool = True 407 | ) -> Index: 408 | pass 409 | 410 | @staticmethod 411 | def open(path: str) -> Index: 412 | pass 413 | 414 | def writer(self, heap_size: int = 128_000_000, num_threads: int = 0) -> IndexWriter: 415 | pass 416 | 417 | def config_reader( 418 | self, reload_policy: str = "commit", num_warmers: int = 0 419 | ) -> None: 420 | pass 421 | 422 | def searcher(self) -> Searcher: 423 | pass 424 | 425 | @staticmethod 426 | def exists(path: str) -> bool: 427 | pass 428 | 429 | @property 430 | def schema(self) -> Schema: 431 | pass 432 | 433 | def reload(self) -> None: 434 | pass 435 | 436 | def parse_query( 437 | self, 438 | query: str, 439 | default_field_names: Optional[list[str]] = None, 440 | field_boosts: Optional[dict[str, float]] = None, 441 | fuzzy_fields: Optional[dict[str, tuple[bool, int, bool]]] = None, 442 | ) -> Query: 443 | pass 444 | 445 | def parse_query_lenient( 446 | self, 447 
| query: str, 448 | default_field_names: Optional[list[str]] = None, 449 | field_boosts: Optional[dict[str, float]] = None, 450 | fuzzy_fields: Optional[dict[str, tuple[bool, int, bool]]] = None, 451 | ) -> tuple[Query, list[Any]]: 452 | pass 453 | 454 | def register_tokenizer( 455 | self, name: str, text_analyzer: TextAnalyzer 456 | ) -> None: ... 457 | 458 | 459 | class Range: 460 | @property 461 | def start(self) -> int: 462 | pass 463 | 464 | @property 465 | def end(self) -> int: 466 | pass 467 | 468 | 469 | class Snippet: 470 | def to_html(self) -> str: 471 | pass 472 | 473 | def highlighted(self) -> list[Range]: 474 | pass 475 | 476 | def fragment(self) -> str: 477 | pass 478 | 479 | class SnippetGenerator: 480 | @staticmethod 481 | def create( 482 | searcher: Searcher, query: Query, schema: Schema, field_name: str 483 | ) -> SnippetGenerator: 484 | pass 485 | 486 | def snippet_from_doc(self, doc: Document) -> Snippet: 487 | pass 488 | 489 | def set_max_num_chars(self, max_num_chars: int) -> None: 490 | pass 491 | 492 | 493 | class Tokenizer: 494 | @staticmethod 495 | def raw() -> Tokenizer: 496 | pass 497 | 498 | @staticmethod 499 | def simple() -> Tokenizer: 500 | pass 501 | 502 | @staticmethod 503 | def whitespace() -> Tokenizer: 504 | pass 505 | 506 | @staticmethod 507 | def regex(pattern: str) -> Tokenizer: 508 | pass 509 | 510 | @staticmethod 511 | def ngram( 512 | min_gram: int = 2, max_gram: int = 3, prefix_only: bool = False 513 | ) -> Tokenizer: 514 | pass 515 | 516 | @staticmethod 517 | def facet() -> Tokenizer: 518 | pass 519 | 520 | 521 | class Filter: 522 | 523 | @staticmethod 524 | def alphanum_only() -> Filter: 525 | pass 526 | 527 | @staticmethod 528 | def ascii_fold() -> Filter: 529 | pass 530 | 531 | @staticmethod 532 | def lowercase() -> Filter: 533 | pass 534 | 535 | @staticmethod 536 | def remove_long(length_limit: int) -> Filter: 537 | pass 538 | 539 | @staticmethod 540 | def stemmer(language: str) -> Filter: 541 | pass 542 | 543 | @staticmethod 544 | def stopword(language: str) -> Filter: 545 | pass 546 | 547 | @staticmethod 548 | def custom_stopword(stopwords: list[str]) -> Filter: 549 | pass 550 | 551 | @staticmethod 552 | def split_compound(constituent_words: list[str]) -> Filter: 553 | pass 554 | 555 | 556 | class TextAnalyzer: 557 | 558 | def analyze(self, text: str) -> list[str]: 559 | pass 560 | 561 | 562 | class TextAnalyzerBuilder: 563 | 564 | def __init__(self, tokenizer: Tokenizer): 565 | pass 566 | 567 | def filter(self, filter: Filter) -> TextAnalyzerBuilder: 568 | pass 569 | 570 | def build(self) -> TextAnalyzer: 571 | pass 572 | 573 | 574 | __version__: str 575 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import pytest 3 | 4 | from tantivy import SchemaBuilder, Index, Document 5 | 6 | 7 | def schema(): 8 | return ( 9 | SchemaBuilder() 10 | .add_text_field("title", stored=True) 11 | .add_text_field("body") 12 | .build() 13 | ) 14 | 15 | 16 | def schema_numeric_fields(): 17 | return ( 18 | SchemaBuilder() 19 | .add_integer_field("id", stored=True, indexed=True, fast=True) 20 | .add_float_field("rating", stored=True, indexed=True, fast=True) 21 | .add_boolean_field("is_good", stored=True, indexed=True) 22 | .add_text_field("body", stored=True, fast=True) 23 | .build() 24 | ) 25 | 26 | def schema_with_date_field(): 27 | return ( 28 | SchemaBuilder() 29 | 
.add_integer_field("id", stored=True, indexed=True) 30 | .add_float_field("rating", stored=True, indexed=True) 31 | .add_date_field("date", stored=True, indexed=True) 32 | .build() 33 | ) 34 | 35 | def schema_with_ip_addr_field(): 36 | return ( 37 | SchemaBuilder() 38 | .add_integer_field("id", stored=True, indexed=True) 39 | .add_float_field("rating", stored=True, indexed=True) 40 | .add_ip_addr_field("ip_addr", stored=True, indexed=True) 41 | .build() 42 | ) 43 | 44 | def create_index(dir=None): 45 | # assume all tests will use the same documents for now 46 | # other methods may set up function-local indexes 47 | index = Index(schema(), dir) 48 | writer = index.writer(15_000_000, 1) 49 | 50 | # 2 ways of adding documents 51 | # 1 52 | doc = Document() 53 | # create a document instance 54 | # add field-value pairs 55 | doc.add_text("title", "The Old Man and the Sea") 56 | doc.add_text( 57 | "body", 58 | ( 59 | "He was an old man who fished alone in a skiff in" 60 | "the Gulf Stream and he had gone eighty-four days " 61 | "now without taking a fish." 62 | ), 63 | ) 64 | writer.add_document(doc) 65 | # 2 use the built-in json support 66 | # keys need to coincide with field names 67 | doc = Document.from_dict( 68 | { 69 | "title": "Of Mice and Men", 70 | "body": ( 71 | "A few miles south of Soledad, the Salinas River drops " 72 | "in close to the hillside bank and runs deep and " 73 | "green. The water is warm too, for it has slipped " 74 | "twinkling over the yellow sands in the sunlight " 75 | "before reaching the narrow pool. On one side of the " 76 | "river the golden foothill slopes curve up to the " 77 | "strong and rocky Gabilan Mountains, but on the valley " 78 | "side the water is lined with trees—willows fresh and " 79 | "green with every spring, carrying in their lower leaf " 80 | "junctures the debris of the winter’s flooding; and " 81 | "sycamores with mottled, white, recumbent limbs and " 82 | "branches that arch over the pool" 83 | ), 84 | } 85 | ) 86 | writer.add_document(doc) 87 | writer.add_json( 88 | """{ 89 | "title": ["Frankenstein", "The Modern Prometheus"], 90 | "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." 91 | }""" 92 | ) 93 | writer.commit() 94 | writer.wait_merging_threads() 95 | index.reload() 96 | return index 97 | 98 | 99 | def create_index_with_numeric_fields(dir=None): 100 | index = Index(schema_numeric_fields(), dir) 101 | writer = index.writer(15_000_000, 1) 102 | 103 | doc = Document() 104 | doc.add_integer("id", 1) 105 | doc.add_float("rating", 3.5) 106 | doc.add_boolean("is_good", True) 107 | doc.add_text( 108 | "body", 109 | ( 110 | "He was an old man who fished alone in a skiff in" 111 | "the Gulf Stream and he had gone eighty-four days " 112 | "now without taking a fish." 113 | ), 114 | ) 115 | writer.add_document(doc) 116 | doc = Document.from_dict( 117 | { 118 | "id": 2, 119 | "rating": 4.5, 120 | "is_good": False, 121 | "body": ( 122 | "A few miles south of Soledad, the Salinas River drops " 123 | "in close to the hillside bank and runs deep and " 124 | "green. The water is warm too, for it has slipped " 125 | "twinkling over the yellow sands in the sunlight " 126 | "before reaching the narrow pool. 
On one side of the " 127 | "river the golden foothill slopes curve up to the " 128 | "strong and rocky Gabilan Mountains, but on the valley " 129 | "side the water is lined with trees—willows fresh and " 130 | "green with every spring, carrying in their lower leaf " 131 | "junctures the debris of the winter’s flooding; and " 132 | "sycamores with mottled, white, recumbent limbs and " 133 | "branches that arch over the pool" 134 | ), 135 | }, 136 | ) 137 | writer.add_document(doc) 138 | writer.commit() 139 | writer.wait_merging_threads() 140 | index.reload() 141 | return index 142 | 143 | def create_index_with_date_field(dir=None): 144 | index = Index(schema_with_date_field(), dir) 145 | writer = index.writer(15_000_000, 1) 146 | 147 | doc = Document() 148 | doc.add_integer("id", 1) 149 | doc.add_float("rating", 3.5) 150 | doc.add_date("date", datetime(2021, 1, 1)) 151 | 152 | writer.add_document(doc) 153 | doc = Document.from_dict( 154 | { 155 | "id": 2, 156 | "rating": 4.5, 157 | "date": datetime(2021, 1, 2), 158 | }, 159 | ) 160 | writer.add_document(doc) 161 | writer.commit() 162 | writer.wait_merging_threads() 163 | index.reload() 164 | return index 165 | 166 | def create_index_with_ip_addr_field(dir=None): 167 | schema = schema_with_ip_addr_field() 168 | index = Index(schema, dir) 169 | writer = index.writer(15_000_000, 1) 170 | 171 | doc = Document() 172 | doc.add_integer("id", 1) 173 | doc.add_float("rating", 3.5) 174 | doc.add_ip_addr("ip_addr", "10.0.0.1") 175 | writer.add_document(doc) 176 | 177 | doc = Document.from_dict( 178 | { 179 | "id": 2, 180 | "rating": 4.5, 181 | "ip_addr": "127.0.0.1", 182 | }, 183 | schema 184 | ) 185 | writer.add_document(doc) 186 | doc = Document.from_dict( 187 | { 188 | "id": 2, 189 | "rating": 4.5, 190 | "ip_addr": "::1", 191 | }, 192 | schema 193 | ) 194 | writer.add_document(doc) 195 | writer.commit() 196 | writer.wait_merging_threads() 197 | index.reload() 198 | return index 199 | 200 | def spanish_schema(): 201 | return ( 202 | SchemaBuilder() 203 | .add_text_field("title", stored=True, tokenizer_name="es_stem") 204 | .add_text_field("body", tokenizer_name="es_stem") 205 | .build() 206 | ) 207 | 208 | 209 | def create_spanish_index(): 210 | # assume all tests will use the same documents for now 211 | # other methods may set up function-local indexes 212 | index = Index(spanish_schema(), None) 213 | writer = index.writer() 214 | 215 | # 2 ways of adding documents 216 | # 1 217 | doc = Document() 218 | # create a document instance 219 | # add field-value pairs 220 | doc.add_text("title", "El viejo y el mar") 221 | doc.add_text( 222 | "body", 223 | ( 224 | "Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. " 225 | ), 226 | ) 227 | writer.add_document(doc) 228 | # 2 use the built-in json support 229 | # keys need to coincide with field names 230 | doc = Document.from_dict( 231 | { 232 | "title": "De ratones y hombres", 233 | "body": ( 234 | "Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. 
A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en las junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas que se arquean sobre el estanque"
235 | ),
236 | }
237 | )
238 | writer.add_document(doc)
239 | writer.add_json(
240 | """{
241 | "title": ["Frankenstein", "El moderno Prometeo"],
242 | "body": "Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa."
243 | }"""
244 | )
245 | writer.commit()
246 | writer.wait_merging_threads()
247 | index.reload()
248 | return index
249 |
250 |
251 | @pytest.fixture()
252 | def dir_index(tmpdir):
253 | return (tmpdir, create_index(str(tmpdir)))
254 |
255 |
256 | @pytest.fixture(scope="class")
257 | def ram_index():
258 | return create_index()
259 |
260 |
261 | @pytest.fixture(scope="class")
262 | def ram_index_numeric_fields():
263 | return create_index_with_numeric_fields()
264 |
265 | @pytest.fixture(scope="class")
266 | def ram_index_with_date_field():
267 | return create_index_with_date_field()
268 |
269 | @pytest.fixture(scope="class")
270 | def ram_index_with_ip_addr_field():
271 | return create_index_with_ip_addr_field()
272 |
273 | @pytest.fixture(scope="class")
274 | def spanish_index():
275 | return create_spanish_index()
276 |
--------------------------------------------------------------------------------
/tests/test_docs.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import pytest
3 |
4 | from mktestdocs import check_md_file
5 |
6 |
7 | @pytest.mark.parametrize("filepath", Path("docs").glob("**/*.md"), ids=str)
8 | def test_docs(filepath):
9 | check_md_file(filepath, memory=True)
10 |
--------------------------------------------------------------------------------
/tests/test_escapes.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from tantivy import Query
4 |
5 |
6 | def test_escape_quote_parse_query(ram_index):
7 | index = ram_index
8 | # We verify only that `parse_query` doesn't raise. This was a change
9 | # from tantivy versions prior to 0.24.0 in which the following would
10 | # raise a `ValueError`.
11 | q = index.parse_query(r'sea\"', ["title", "body"])
12 | print(q)
13 |
14 |
15 | def test_escape_quote_parse_query_with_quotes(ram_index):
16 | index = ram_index
17 | # We verify only that `parse_query` doesn't raise. We are not testing
18 | # whether tantivy's `parse_query` is correct.
19 | query = index.parse_query(r'"sea\""', ["title", "body"])
20 |
21 |
22 | def test_escape_quote_parse_query_quoted(ram_index):
23 | index = ram_index
24 | # We verify only that `parse_query` doesn't raise. We are not testing
25 | # whether tantivy's `parse_query` is correct.
26 | query = index.parse_query(r'title:"sea \"whale"')
27 |
28 |
29 | def test_escape_quote_term_query(ram_index):
30 | index = ram_index
31 | # We verify only that `term_query` doesn't raise. We are not testing
32 | # whether tantivy's `term_query` is correct.
33 | query = Query.term_query(index.schema, "title", "sea\" whale") 34 | -------------------------------------------------------------------------------- /tests/test_json_bug.py: -------------------------------------------------------------------------------- 1 | def test_json_bug(): 2 | import tantivy 3 | 4 | schema_builder = tantivy.SchemaBuilder() 5 | schema_builder.add_json_field("data", stored=True) 6 | schema = schema_builder.build() 7 | 8 | index = tantivy.Index(schema) 9 | 10 | index_writer = index.writer() 11 | 12 | data = { 13 | "name": "John Doe", 14 | "age": 30, 15 | "email": "john.doe@example.com", 16 | "interests": ["reading", "hiking", "coding"], 17 | } 18 | import json 19 | json_data = json.dumps(data) 20 | 21 | doc = tantivy.Document() 22 | doc.add_json("data", json_data) 23 | index_writer.add_document(doc) 24 | index_writer.commit() 25 | index_writer.wait_merging_threads() 26 | index.reload() 27 | 28 | searcher = index.searcher() 29 | 30 | query = "*" 31 | q = index.parse_query(query) 32 | top_docs = searcher.search(q, limit=10) 33 | 34 | print(f"Total hits: {top_docs}") 35 | for score, hit in top_docs.hits: 36 | doc = searcher.doc(hit) 37 | print(doc["data"]) 38 | assert doc["data"] == [{'age': 30, 39 | 'email': 'john.doe@example.com', 40 | 'interests': ['reading', 'hiking', 'coding'], 41 | 'name': 'John Doe' 42 | }] 43 | --------------------------------------------------------------------------------
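A note on the analyzer pipeline exposed above: per the build() method in src/tokenizer.rs, TextAnalyzerBuilder.build() consumes the underlying Rust builder, so a given builder instance can be built only once. The following is a minimal, hypothetical usage sketch of the Tokenizer / Filter / TextAnalyzerBuilder stubs from tantivy/tantivy.pyi. It is not a file in this repository; the analyzer name "custom_en" and the "english" stemmer argument are illustrative assumptions, and only methods declared in the stubs are used.

from tantivy import (
    Document,
    Filter,
    Index,
    SchemaBuilder,
    TextAnalyzerBuilder,
    Tokenizer,
)

# Chain a tokenizer and filters, then build the analyzer. Calling
# build() a second time on the same builder raises an error, because
# the Rust-side builder has already been consumed.
analyzer = (
    TextAnalyzerBuilder(Tokenizer.simple())
    .filter(Filter.lowercase())
    .filter(Filter.stemmer("english"))  # assumed stemmer language name
    .build()
)
print(analyzer.analyze("Running fishes"))  # lowercased, stemmed tokens

# Point a text field at the analyzer by name and register it with the
# index; "custom_en" is an arbitrary, hypothetical tokenizer name.
schema = (
    SchemaBuilder()
    .add_text_field("body", stored=True, tokenizer_name="custom_en")
    .build()
)
index = Index(schema)  # in-memory index (path=None)
index.register_tokenizer("custom_en", analyzer)

writer = index.writer()
doc = Document()
doc.add_text("body", "He was fishing alone in a skiff")
writer.add_document(doc)
writer.commit()
writer.wait_merging_threads()
index.reload()

# The query text goes through the same analysis pipeline, so the
# stemmed query term "fish" matches the stemmed form of "fishing".
searcher = index.searcher()
hits = searcher.search(index.parse_query("fish", ["body"])).hits
assert len(hits) == 1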