├── .github
    └── workflows
    │   ├── docs.yml
    │   ├── release.yml
    │   └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
    └── .gitkeep
├── docs
    ├── bbm25_haystack.html
    ├── bbm25_haystack
    │   ├── __about__.html
    │   ├── bbm25_retriever.html
    │   ├── bbm25_store.html
    │   └── filters.html
    ├── index.html
    └── search.js
├── pyproject.toml
├── scripts
    └── benchmark_beir.py
├── src
    └── bbm25_haystack
    │   ├── __about__.py
    │   ├── __init__.py
    │   ├── bbm25_retriever.py
    │   ├── bbm25_store.py
    │   ├── default.model
    │   └── filters.py
└── tests
    ├── __init__.py
    ├── test_document_store.py
    └── test_retriever.py


/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: website
 2 | 
 3 | # build the documentation whenever there are new commits on main
 4 | on:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 |     # Alternative: only build for tags.
 9 |     # tags:
10 |     #   - '*'
11 | 
12 | # security: restrict permissions for CI jobs.
13 | permissions:
14 |   contents: read
15 | 
16 | jobs:
17 |   # Build the documentation and upload the static HTML files as an artifact.
18 |   build:
19 |     runs-on: ubuntu-latest
20 |     steps:
21 |       - uses: actions/checkout@v4
22 |       - uses: actions/setup-python@v5
23 |         with:
24 |           python-version: '3.9'
25 | 
26 |       # ADJUST THIS: install all dependencies (including pdoc)
27 |       - run: pip install -e .
28 |       - run: pip install pdoc
29 | 
30 |       # ADJUST THIS: build your documentation into docs/.
31 |       # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here.
32 |       - run: pdoc src/bbm25_haystack -o docs --docformat restructuredtext
33 | 
34 |       - uses: actions/upload-pages-artifact@v3
35 |         with:
36 |           path: docs/
37 | 
38 |   # Deploy the artifact to GitHub pages.
39 |   # This is a separate job so that only actions/deploy-pages has the necessary permissions.
40 |   deploy:
41 |     needs: build
42 |     runs-on: ubuntu-latest
43 |     permissions:
44 |       pages: write
45 |       id-token: write
46 |     environment:
47 |       name: github-pages
48 |       url: ${{ steps.deployment.outputs.page_url }}
49 |     steps:
50 |       - id: deployment
51 |         uses: actions/deploy-pages@v4
52 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - "v[0-9].[0-9]+.[0-9]+*"
 7 | 
 8 | jobs:
 9 |   release-on-pypi:
10 |     runs-on: ubuntu-latest
11 | 
12 |     steps:
13 |       - name: Checkout
14 |         uses: actions/checkout@v3
15 | 
16 |       - name: Install Hatch
17 |         run: pip install hatch
18 | 
19 |       - name: Build
20 |         run: hatch build
21 | 
22 |       - name: Publish on PyPi
23 |         env:
24 |           HATCH_INDEX_USER: __token__
25 |           HATCH_INDEX_AUTH: ${{ secrets.PYPI_API_TOKEN }}
26 |         run: hatch publish -y


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow comes from https://github.com/ofek/hatch-mypyc
 2 | # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
 3 | name: test
 4 | 
 5 | on:
 6 |   push:
 7 |     branches:
 8 |     - main
 9 |   pull_request:
10 | 
11 | concurrency:
12 |   group: test-${{ github.head_ref }}
13 |   cancel-in-progress: true
14 | 
15 | env:
16 |   PYTHONUNBUFFERED: "1"
17 |   FORCE_COLOR: "1"
18 | 
19 | jobs:
20 |   run:
21 |     name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
22 |     runs-on: ${{ matrix.os }}
23 |     strategy:
24 |       fail-fast: false
25 |       matrix:
26 |         os: [ubuntu-latest, windows-latest, macos-latest]
27 |         python-version: ['3.9', '3.10', '3.11', '3.12']
28 | 
29 |     steps:
30 |     - name: Support longpaths
31 |       if: matrix.os == 'windows-latest'
32 |       run: git config --system core.longpaths true
33 | 
34 |     - uses: actions/checkout@v3
35 | 
36 |     - name: Set up Python ${{ matrix.python-version }}
37 |       uses: actions/setup-python@v4
38 |       with:
39 |         python-version: ${{ matrix.python-version }}
40 | 
41 |     - name: Install Hatch
42 |       run: pip install --upgrade hatch
43 | 
44 |     - name: Lint
45 |       if: matrix.python-version == '3.9' && runner.os == 'Linux'
46 |       run: hatch run lint:all
47 | 
48 |     - name: Run tests
49 |       run: hatch run cov
50 | 
51 |     - name: Upload coverage reports to Codecov
52 |       uses: codecov/codecov-action@v4.0.1
53 |       with:
54 |         token: ${{ secrets.CODECOV_TOKEN }}
55 |         slug: Guest400123064/bbm25-haystack
56 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # VS Code
163 | .vscode
164 | 
165 | # Benchmarking datasets
166 | benchmarks/beir/*
167 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![test](https://github.com/Guest400123064/bbm25-haystack/actions/workflows/test.yml/badge.svg)](https://github.com/Guest400123064/bbm25-haystack/actions/workflows/test.yml)
 2 | [![codecov](https://codecov.io/gh/Guest400123064/bbm25-haystack/graph/badge.svg?token=IGRIRBHZ3U)](https://codecov.io/gh/Guest400123064/bbm25-haystack)
 3 | [![code style - Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 4 | [![types - Mypy](https://img.shields.io/badge/types-Mypy-blue.svg)](https://github.com/python/mypy)
 5 | [![Python 3.9](https://img.shields.io/badge/python-3.9%20|%203.10%20|%203.11%20|%203.12-blue.svg)](https://www.python.org/downloads/release/python-390/)
 6 | 
 7 | # Better BM25 In-Memory Document Store
 8 | 
 9 | An in-memory document store is a great starting point for prototyping and debugging before migrating to production-grade stores like Elasticsearch. However, [the original implementation](https://github.com/deepset-ai/haystack/blob/0dbb98c0a017b499560521aa93186d0640aab659/haystack/document_stores/in_memory/document_store.py#L148) of BM25 retrieval recreates an inverse index for the entire document store __on every new search__. Furthermore, the tokenization method is primitive, only permitting splitters based on regular expressions, making localization and domain adaptation challenging. Therefore, this implementation is a slight upgrade to the default BM25 in-memory document store by implementing incremental index update and incorporation of [SentencePiece](https://github.com/google/sentencepiece) statistical sub-word tokenization.
10 | 
11 | ## Installation
12 | 
13 | ```bash
14 | $ pip install bbm25-haystack
15 | ```
16 | 
17 | Alternatively, you can clone the repository and build from source to be able to reflect changes to the source code:
18 | 
19 | ```bash
20 | $ git clone https://github.com/Guest400123064/bbm25-haystack.git
21 | $ cd bbm25-haystack
22 | $ pip install -e .
23 | ```
24 | 
25 | ## Usage
26 | 
27 | ### Quick Start
28 | 
29 | Below is an example of how you can build a minimal search engine with the `bbm25_haystack` components on their own. They are also compatible with [Haystack pipelines](https://docs.haystack.deepset.ai/docs/creating-pipelines).
30 | 
31 | ```python
32 | from haystack import Document
33 | from bbm25_haystack import BetterBM25DocumentStore, BetterBM25Retriever
34 | 
35 | 
36 | document_store = BetterBM25DocumentStore()
37 | document_store.write_documents([
38 |    Document(content="There are over 7,000 languages spoken around the world today."),
39 |    Document(content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors."),
40 |    Document(content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bio-luminescent waves.")
41 | ])
42 | 
43 | retriever = BetterBM25Retriever(document_store)
44 | retriever.run(query="How many languages are spoken around the world today?")
45 | ```
46 | 
47 | ### API References
48 | 
49 | You can find the full API references [here](https://guest400123064.github.io/bbm25-haystack/). In a hurry? Below are some most important document store parameters you might want explore:
50 | 
51 | - `k, b, delta` - the [three BM25+ hyperparameters](https://en.wikipedia.org/wiki/Okapi_BM25).
52 | - `sp_file` - a path to a trained SentencePiece tokenizer `.model` file. The default tokenizer is directly copied from [LLaMA-2-7B-32K tokenizer](https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/tokenizer.model) with a vocab size of 32,000.
53 | - `n_grams` - default to 1, which means text (both query and document) are tokenized into uni-grams. If set to 2, the tokenizer also augment the list of uni-grams with bi-grams, and so on. If specified as tuple, e.g., (2, 3), the tokenizer only produce bi-grams and tri-grams, without any uni-gram.
54 | - `haystack_filter_logic` - see [below](#filtering-logic).
55 | 
56 | The retriever parameters are largely the same as [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever).
57 | 
58 | ## Filtering Logic
59 | 
60 | The current document store uses [`document_matches_filter`](https://github.com/deepset-ai/haystack/blob/main/haystack/utils/filters.py) shipped with Haystack to perform filtering by default, which is the same as [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore).
61 | 
62 | However, there is also an alternative filtering logic shipped with this implementation (unstable at this point). To use this alternative logic, initialize the document store with `haystack_filter_logic=False`. Please find comments and implementation details in [`filters.py`](./src/bbm25_haystack/filters.py). TL;DR:
63 | 
64 | - Comparison with `None`, i.e., missing values, involved will always return `False`, no matter missing the document attribute value or missing the filter value.
65 | - Comparison with `pandas.DataFrame` is always prohibited to reduce surprises.
66 | - No implicit `datetime` conversion from string values.
67 | - `in` and `not in` allows any `Iterable` as filter value, without the `list` constraint.
68 | - Allowing custom comparison functions for more flexibility. Note that the custom comparison function inputs are NEVER checked, i.e., no missing value check, no ``DataFrame`` check, etc. User should ensure the input values are valid and return value is always a boolean. The inputs are always supplied in the order of document value and then filter value.
69 | 
70 | In this case, the negation logic needs to be considered again because `False` can now issue from both input nullity check and the actual comparisons. For instance, `in` and `not in` both yield non-matching upon missing values. But I think having input processing and comparisons separated makes the filtering behavior more transparent.
71 | 
72 | ## Search Quality Evaluation
73 | 
74 | This repo has [a simple script](./scripts/benchmark_beir.py) to help evaluate the search quality over [BEIR](https://github.com/beir-cellar/beir/tree/main) benchmark. You need to clone the repository (you can also manually download the script and place it under a folder named `scripts`) and you have to install additional dependencies to run the script.
75 | 
76 | ```bash
77 | $ pip install beir
78 | ```
79 | 
80 | To run the script, you may want to specify the dataset name and BM25 hyperparameters. For example:
81 | 
82 | ```bash
83 | $ python scripts/benchmark_beir.py --datasets scifact arguana --bm25-k1 1.2 --n-grams 2 --output eval.csv
84 | ```
85 | 
86 | It automatically downloads the benchmarking dataset to `benchmarks/beir`, where `benchmarks` is at the same level as `scripts`. You may also check the help page for more information.
87 | 
88 | ```bash
89 | $ python scripts/benchmark_beir.py --help
90 | ```
91 | 
92 | New benchmarking scripts are expected to be added in the future.
93 | 
94 | ## License
95 | 
96 | `bbm25-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
97 | 


--------------------------------------------------------------------------------
/benchmarks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Guest400123064/bbm25-haystack/9906fa27ffc54f4fd92dfb5d717c15a12a69df0a/benchmarks/.gitkeep


--------------------------------------------------------------------------------
/docs/bbm25_haystack/__about__.html:
--------------------------------------------------------------------------------
  1 | <!doctype html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="utf-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1">
  6 |     <meta name="generator" content="pdoc 14.4.0"/>
  7 |     <title>bbm25_haystack.__about__ API documentation</title>
  8 | 
  9 |     <style>/*! * Bootstrap Reboot v5.0.0 (https://getbootstrap.com/) * Copyright 2011-2021 The Bootstrap Authors * Copyright 2011-2021 Twitter, Inc. * Licensed under MIT (https://github.com/twbs/bootstrap/blob/main/LICENSE) * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md) */*,::after,::before{box-sizing:border-box}@media (prefers-reduced-motion:no-preference){:root{scroll-behavior:smooth}}body{margin:0;font-family:system-ui,-apple-system,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans","Liberation Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;background-color:#fff;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}hr{margin:1rem 0;color:inherit;background-color:currentColor;border:0;opacity:.25}hr:not([size]){height:1px}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem;font-weight:500;line-height:1.2}h1{font-size:calc(1.375rem + 1.5vw)}@media (min-width:1200px){h1{font-size:2.5rem}}h2{font-size:calc(1.325rem + .9vw)}@media (min-width:1200px){h2{font-size:2rem}}h3{font-size:calc(1.3rem + .6vw)}@media (min-width:1200px){h3{font-size:1.75rem}}h4{font-size:calc(1.275rem + .3vw)}@media (min-width:1200px){h4{font-size:1.5rem}}h5{font-size:1.25rem}h6{font-size:1rem}p{margin-top:0;margin-bottom:1rem}abbr[data-bs-original-title],abbr[title]{-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}ol,ul{padding-left:2rem}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:.875em}mark{padding:.2em;background-color:#fcf8e3}sub,sup{position:relative;font-size:.75em;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#0d6efd;text-decoration:underline}a:hover{color:#0a58ca}a:not([href]):not([class]),a:not([href]):not([class]):hover{color:inherit;text-decoration:none}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em;direction:ltr;unicode-bidi:bidi-override}pre{display:block;margin-top:0;margin-bottom:1rem;overflow:auto;font-size:.875em}pre code{font-size:inherit;color:inherit;word-break:normal}code{font-size:.875em;color:#d63384;word-wrap:break-word}a>code{color:inherit}kbd{padding:.2rem .4rem;font-size:.875em;color:#fff;background-color:#212529;border-radius:.2rem}kbd kbd{padding:0;font-size:1em;font-weight:700}figure{margin:0 0 1rem}img,svg{vertical-align:middle}table{caption-side:bottom;border-collapse:collapse}caption{padding-top:.5rem;padding-bottom:.5rem;color:#6c757d;text-align:left}th{text-align:inherit;text-align:-webkit-match-parent}tbody,td,tfoot,th,thead,tr{border-color:inherit;border-style:solid;border-width:0}label{display:inline-block}button{border-radius:0}button:focus:not(:focus-visible){outline:0}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,select{text-transform:none}[role=button]{cursor:pointer}select{word-wrap:normal}select:disabled{opacity:1}[list]::-webkit-calendar-picker-indicator{display:none}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}::-moz-focus-inner{padding:0;border-style:none}textarea{resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{float:left;width:100%;padding:0;margin-bottom:.5rem;font-size:calc(1.275rem + .3vw);line-height:inherit}@media (min-width:1200px){legend{font-size:1.5rem}}legend+*{clear:left}::-webkit-datetime-edit-day-field,::-webkit-datetime-edit-fields-wrapper,::-webkit-datetime-edit-hour-field,::-webkit-datetime-edit-minute,::-webkit-datetime-edit-month-field,::-webkit-datetime-edit-text,::-webkit-datetime-edit-year-field{padding:0}::-webkit-inner-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:textfield}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-color-swatch-wrapper{padding:0}::file-selector-button{font:inherit}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}iframe{border:0}summary{display:list-item;cursor:pointer}progress{vertical-align:baseline}[hidden]{display:none!important}</style>
 10 |     <style>/*! syntax-highlighting.css */pre{line-height:125%;}span.linenos{color:inherit; background-color:transparent; padding-left:5px; padding-right:20px;}.pdoc-code .hll{background-color:#ffffcc}.pdoc-code{background:#f8f8f8;}.pdoc-code .c{color:#3D7B7B; font-style:italic}.pdoc-code .err{border:1px solid #FF0000}.pdoc-code .k{color:#008000; font-weight:bold}.pdoc-code .o{color:#666666}.pdoc-code .ch{color:#3D7B7B; font-style:italic}.pdoc-code .cm{color:#3D7B7B; font-style:italic}.pdoc-code .cp{color:#9C6500}.pdoc-code .cpf{color:#3D7B7B; font-style:italic}.pdoc-code .c1{color:#3D7B7B; font-style:italic}.pdoc-code .cs{color:#3D7B7B; font-style:italic}.pdoc-code .gd{color:#A00000}.pdoc-code .ge{font-style:italic}.pdoc-code .gr{color:#E40000}.pdoc-code .gh{color:#000080; font-weight:bold}.pdoc-code .gi{color:#008400}.pdoc-code .go{color:#717171}.pdoc-code .gp{color:#000080; font-weight:bold}.pdoc-code .gs{font-weight:bold}.pdoc-code .gu{color:#800080; font-weight:bold}.pdoc-code .gt{color:#0044DD}.pdoc-code .kc{color:#008000; font-weight:bold}.pdoc-code .kd{color:#008000; font-weight:bold}.pdoc-code .kn{color:#008000; font-weight:bold}.pdoc-code .kp{color:#008000}.pdoc-code .kr{color:#008000; font-weight:bold}.pdoc-code .kt{color:#B00040}.pdoc-code .m{color:#666666}.pdoc-code .s{color:#BA2121}.pdoc-code .na{color:#687822}.pdoc-code .nb{color:#008000}.pdoc-code .nc{color:#0000FF; font-weight:bold}.pdoc-code .no{color:#880000}.pdoc-code .nd{color:#AA22FF}.pdoc-code .ni{color:#717171; font-weight:bold}.pdoc-code .ne{color:#CB3F38; font-weight:bold}.pdoc-code .nf{color:#0000FF}.pdoc-code .nl{color:#767600}.pdoc-code .nn{color:#0000FF; font-weight:bold}.pdoc-code .nt{color:#008000; font-weight:bold}.pdoc-code .nv{color:#19177C}.pdoc-code .ow{color:#AA22FF; font-weight:bold}.pdoc-code .w{color:#bbbbbb}.pdoc-code .mb{color:#666666}.pdoc-code .mf{color:#666666}.pdoc-code .mh{color:#666666}.pdoc-code .mi{color:#666666}.pdoc-code .mo{color:#666666}.pdoc-code .sa{color:#BA2121}.pdoc-code .sb{color:#BA2121}.pdoc-code .sc{color:#BA2121}.pdoc-code .dl{color:#BA2121}.pdoc-code .sd{color:#BA2121; font-style:italic}.pdoc-code .s2{color:#BA2121}.pdoc-code .se{color:#AA5D1F; font-weight:bold}.pdoc-code .sh{color:#BA2121}.pdoc-code .si{color:#A45A77; font-weight:bold}.pdoc-code .sx{color:#008000}.pdoc-code .sr{color:#A45A77}.pdoc-code .s1{color:#BA2121}.pdoc-code .ss{color:#19177C}.pdoc-code .bp{color:#008000}.pdoc-code .fm{color:#0000FF}.pdoc-code .vc{color:#19177C}.pdoc-code .vg{color:#19177C}.pdoc-code .vi{color:#19177C}.pdoc-code .vm{color:#19177C}.pdoc-code .il{color:#666666}</style>
 11 |     <style>/*! theme.css */:root{--pdoc-background:#fff;}.pdoc{--text:#212529;--muted:#6c757d;--link:#3660a5;--link-hover:#1659c5;--code:#f8f8f8;--active:#fff598;--accent:#eee;--accent2:#c1c1c1;--nav-hover:rgba(255, 255, 255, 0.5);--name:#0066BB;--def:#008800;--annotation:#007020;}</style>
 12 |     <style>/*! layout.css */html, body{width:100%;height:100%;}html, main{scroll-behavior:smooth;}body{background-color:var(--pdoc-background);}@media (max-width:769px){#navtoggle{cursor:pointer;position:absolute;width:50px;height:40px;top:1rem;right:1rem;border-color:var(--text);color:var(--text);display:flex;opacity:0.8;z-index:999;}#navtoggle:hover{opacity:1;}#togglestate + div{display:none;}#togglestate:checked + div{display:inherit;}main, header{padding:2rem 3vw;}header + main{margin-top:-3rem;}.git-button{display:none !important;}nav input[type="search"]{max-width:77%;}nav input[type="search"]:first-child{margin-top:-6px;}nav input[type="search"]:valid ~ *{display:none !important;}}@media (min-width:770px){:root{--sidebar-width:clamp(12.5rem, 28vw, 22rem);}nav{position:fixed;overflow:auto;height:100vh;width:var(--sidebar-width);}main, header{padding:3rem 2rem 3rem calc(var(--sidebar-width) + 3rem);width:calc(54rem + var(--sidebar-width));max-width:100%;}header + main{margin-top:-4rem;}#navtoggle{display:none;}}#togglestate{position:absolute;height:0;opacity:0;}nav.pdoc{--pad:clamp(0.5rem, 2vw, 1.75rem);--indent:1.5rem;background-color:var(--accent);border-right:1px solid var(--accent2);box-shadow:0 0 20px rgba(50, 50, 50, .2) inset;padding:0 0 0 var(--pad);overflow-wrap:anywhere;scrollbar-width:thin; scrollbar-color:var(--accent2) transparent; z-index:1}nav.pdoc::-webkit-scrollbar{width:.4rem; }nav.pdoc::-webkit-scrollbar-thumb{background-color:var(--accent2); }nav.pdoc > div{padding:var(--pad) 0;}nav.pdoc .module-list-button{display:inline-flex;align-items:center;color:var(--text);border-color:var(--muted);margin-bottom:1rem;}nav.pdoc .module-list-button:hover{border-color:var(--text);}nav.pdoc input[type=search]{display:block;outline-offset:0;width:calc(100% - var(--pad));}nav.pdoc .logo{max-width:calc(100% - var(--pad));max-height:35vh;display:block;margin:0 auto 1rem;transform:translate(calc(-.5 * var(--pad)), 0);}nav.pdoc ul{list-style:none;padding-left:0;}nav.pdoc > div > ul{margin-left:calc(0px - var(--pad));}nav.pdoc li a{padding:.2rem 0 .2rem calc(var(--pad) + var(--indent));}nav.pdoc > div > ul > li > a{padding-left:var(--pad);}nav.pdoc li{transition:all 100ms;}nav.pdoc li:hover{background-color:var(--nav-hover);}nav.pdoc a, nav.pdoc a:hover{color:var(--text);}nav.pdoc a{display:block;}nav.pdoc > h2:first-of-type{margin-top:1.5rem;}nav.pdoc .class:before{content:"class ";color:var(--muted);}nav.pdoc .function:after{content:"()";color:var(--muted);}nav.pdoc footer:before{content:"";display:block;width:calc(100% - var(--pad));border-top:solid var(--accent2) 1px;margin-top:1.5rem;padding-top:.5rem;}nav.pdoc footer{font-size:small;}</style>
 13 |     <style>/*! content.css */.pdoc{color:var(--text);box-sizing:border-box;line-height:1.5;background:none;}.pdoc .pdoc-button{cursor:pointer;display:inline-block;border:solid black 1px;border-radius:2px;font-size:.75rem;padding:calc(0.5em - 1px) 1em;transition:100ms all;}.pdoc .pdoc-alert{padding:1rem 1rem 1rem calc(1.5rem + 24px);border:1px solid transparent;border-radius:.25rem;background-repeat:no-repeat;background-position:1rem center;margin-bottom:1rem;}.pdoc .pdoc-alert > *:last-child{margin-bottom:0;}.pdoc .pdoc-alert-note {color:#084298;background-color:#cfe2ff;border-color:#b6d4fe;background-image:url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20width%3D%2224%22%20height%3D%2224%22%20fill%3D%22%23084298%22%20viewBox%3D%220%200%2016%2016%22%3E%3Cpath%20d%3D%22M8%2016A8%208%200%201%200%208%200a8%208%200%200%200%200%2016zm.93-9.412-1%204.705c-.07.34.029.533.304.533.194%200%20.487-.07.686-.246l-.088.416c-.287.346-.92.598-1.465.598-.703%200-1.002-.422-.808-1.319l.738-3.468c.064-.293.006-.399-.287-.47l-.451-.081.082-.381%202.29-.287zM8%205.5a1%201%200%201%201%200-2%201%201%200%200%201%200%202z%22/%3E%3C/svg%3E");}.pdoc .pdoc-alert-warning{color:#664d03;background-color:#fff3cd;border-color:#ffecb5;background-image:url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20width%3D%2224%22%20height%3D%2224%22%20fill%3D%22%23664d03%22%20viewBox%3D%220%200%2016%2016%22%3E%3Cpath%20d%3D%22M8.982%201.566a1.13%201.13%200%200%200-1.96%200L.165%2013.233c-.457.778.091%201.767.98%201.767h13.713c.889%200%201.438-.99.98-1.767L8.982%201.566zM8%205c.535%200%20.954.462.9.995l-.35%203.507a.552.552%200%200%201-1.1%200L7.1%205.995A.905.905%200%200%201%208%205zm.002%206a1%201%200%201%201%200%202%201%201%200%200%201%200-2z%22/%3E%3C/svg%3E");}.pdoc .pdoc-alert-danger{color:#842029;background-color:#f8d7da;border-color:#f5c2c7;background-image:url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20width%3D%2224%22%20height%3D%2224%22%20fill%3D%22%23842029%22%20viewBox%3D%220%200%2016%2016%22%3E%3Cpath%20d%3D%22M5.52.359A.5.5%200%200%201%206%200h4a.5.5%200%200%201%20.474.658L8.694%206H12.5a.5.5%200%200%201%20.395.807l-7%209a.5.5%200%200%201-.873-.454L6.823%209.5H3.5a.5.5%200%200%201-.48-.641l2.5-8.5z%22/%3E%3C/svg%3E");}.pdoc .visually-hidden{position:absolute !important;width:1px !important;height:1px !important;padding:0 !important;margin:-1px !important;overflow:hidden !important;clip:rect(0, 0, 0, 0) !important;white-space:nowrap !important;border:0 !important;}.pdoc h1, .pdoc h2, .pdoc h3{font-weight:300;margin:.3em 0;padding:.2em 0;}.pdoc > section:not(.module-info) h1{font-size:1.5rem;font-weight:500;}.pdoc > section:not(.module-info) h2{font-size:1.4rem;font-weight:500;}.pdoc > section:not(.module-info) h3{font-size:1.3rem;font-weight:500;}.pdoc > section:not(.module-info) h4{font-size:1.2rem;}.pdoc > section:not(.module-info) h5{font-size:1.1rem;}.pdoc a{text-decoration:none;color:var(--link);}.pdoc a:hover{color:var(--link-hover);}.pdoc blockquote{margin-left:2rem;}.pdoc pre{border-top:1px solid var(--accent2);border-bottom:1px solid var(--accent2);margin-top:0;margin-bottom:1em;padding:.5rem 0 .5rem .5rem;overflow-x:auto;background-color:var(--code);}.pdoc code{color:var(--text);padding:.2em .4em;margin:0;font-size:85%;background-color:var(--accent);border-radius:6px;}.pdoc a > code{color:inherit;}.pdoc pre > code{display:inline-block;font-size:inherit;background:none;border:none;padding:0;}.pdoc > section:not(.module-info){margin-bottom:1.5rem;}.pdoc .modulename{margin-top:0;font-weight:bold;}.pdoc .modulename a{color:var(--link);transition:100ms all;}.pdoc .git-button{float:right;border:solid var(--link) 1px;}.pdoc .git-button:hover{background-color:var(--link);color:var(--pdoc-background);}.view-source-toggle-state,.view-source-toggle-state ~ .pdoc-code{display:none;}.view-source-toggle-state:checked ~ .pdoc-code{display:block;}.view-source-button{display:inline-block;float:right;font-size:.75rem;line-height:1.5rem;color:var(--muted);padding:0 .4rem 0 1.3rem;cursor:pointer;text-indent:-2px;}.view-source-button > span{visibility:hidden;}.module-info .view-source-button{float:none;display:flex;justify-content:flex-end;margin:-1.2rem .4rem -.2rem 0;}.view-source-button::before{position:absolute;content:"View Source";display:list-item;list-style-type:disclosure-closed;}.view-source-toggle-state:checked ~ .attr .view-source-button::before,.view-source-toggle-state:checked ~ .view-source-button::before{list-style-type:disclosure-open;}.pdoc .docstring{margin-bottom:1.5rem;}.pdoc section:not(.module-info) .docstring{margin-left:clamp(0rem, 5vw - 2rem, 1rem);}.pdoc .docstring .pdoc-code{margin-left:1em;margin-right:1em;}.pdoc h1:target,.pdoc h2:target,.pdoc h3:target,.pdoc h4:target,.pdoc h5:target,.pdoc h6:target,.pdoc .pdoc-code > pre > span:target{background-color:var(--active);box-shadow:-1rem 0 0 0 var(--active);}.pdoc .pdoc-code > pre > span:target{display:block;}.pdoc div:target > .attr,.pdoc section:target > .attr,.pdoc dd:target > a{background-color:var(--active);}.pdoc *{scroll-margin:2rem;}.pdoc .pdoc-code .linenos{user-select:none;}.pdoc .attr:hover{filter:contrast(0.95);}.pdoc section, .pdoc .classattr{position:relative;}.pdoc .headerlink{--width:clamp(1rem, 3vw, 2rem);position:absolute;top:0;left:calc(0rem - var(--width));transition:all 100ms ease-in-out;opacity:0;}.pdoc .headerlink::before{content:"#";display:block;text-align:center;width:var(--width);height:2.3rem;line-height:2.3rem;font-size:1.5rem;}.pdoc .attr:hover ~ .headerlink,.pdoc *:target > .headerlink,.pdoc .headerlink:hover{opacity:1;}.pdoc .attr{display:block;margin:.5rem 0 .5rem;padding:.4rem .4rem .4rem 1rem;background-color:var(--accent);overflow-x:auto;}.pdoc .classattr{margin-left:2rem;}.pdoc .name{color:var(--name);font-weight:bold;}.pdoc .def{color:var(--def);font-weight:bold;}.pdoc .signature{background-color:transparent;}.pdoc .param, .pdoc .return-annotation{white-space:pre;}.pdoc .signature.multiline .param{display:block;}.pdoc .signature.condensed .param{display:inline-block;}.pdoc .annotation{color:var(--annotation);}.pdoc .view-value-toggle-state,.pdoc .view-value-toggle-state ~ .default_value{display:none;}.pdoc .view-value-toggle-state:checked ~ .default_value{display:inherit;}.pdoc .view-value-button{font-size:.5rem;vertical-align:middle;border-style:dashed;margin-top:-0.1rem;}.pdoc .view-value-button:hover{background:white;}.pdoc .view-value-button::before{content:"show";text-align:center;width:2.2em;display:inline-block;}.pdoc .view-value-toggle-state:checked ~ .view-value-button::before{content:"hide";}.pdoc .inherited{margin-left:2rem;}.pdoc .inherited dt{font-weight:700;}.pdoc .inherited dt, .pdoc .inherited dd{display:inline;margin-left:0;margin-bottom:.5rem;}.pdoc .inherited dd:not(:last-child):after{content:", ";}.pdoc .inherited .class:before{content:"class ";}.pdoc .inherited .function a:after{content:"()";}.pdoc .search-result .docstring{overflow:auto;max-height:25vh;}.pdoc .search-result.focused > .attr{background-color:var(--active);}.pdoc .attribution{margin-top:2rem;display:block;opacity:0.5;transition:all 200ms;filter:grayscale(100%);}.pdoc .attribution:hover{opacity:1;filter:grayscale(0%);}.pdoc .attribution img{margin-left:5px;height:35px;vertical-align:middle;width:70px;transition:all 200ms;}.pdoc table{display:block;width:max-content;max-width:100%;overflow:auto;margin-bottom:1rem;}.pdoc table th{font-weight:600;}.pdoc table th, .pdoc table td{padding:6px 13px;border:1px solid var(--accent2);}</style>
 14 |     <style>/*! custom.css */</style></head>
 15 | <body>
 16 |     <nav class="pdoc">
 17 |         <label id="navtoggle" for="togglestate" class="pdoc-button"><svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 30 30'><path stroke-linecap='round' stroke="currentColor" stroke-miterlimit='10' stroke-width='2' d='M4 7h22M4 15h22M4 23h22'/></svg></label>
 18 |         <input id="togglestate" type="checkbox" aria-hidden="true" tabindex="-1">
 19 |         <div>            <a class="pdoc-button module-list-button" href="../bbm25_haystack.html">
 20 | <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-box-arrow-in-left" viewBox="0 0 16 16">
 21 |   <path fill-rule="evenodd" d="M10 3.5a.5.5 0 0 0-.5-.5h-8a.5.5 0 0 0-.5.5v9a.5.5 0 0 0 .5.5h8a.5.5 0 0 0 .5-.5v-2a.5.5 0 0 1 1 0v2A1.5 1.5 0 0 1 9.5 14h-8A1.5 1.5 0 0 1 0 12.5v-9A1.5 1.5 0 0 1 1.5 2h8A1.5 1.5 0 0 1 11 3.5v2a.5.5 0 0 1-1 0v-2z"/>
 22 |   <path fill-rule="evenodd" d="M4.146 8.354a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L5.707 7.5H14.5a.5.5 0 0 1 0 1H5.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3z"/>
 23 | </svg>                &nbsp;bbm25_haystack</a>
 24 | 
 25 | 
 26 |             <input type="search" placeholder="Search..." role="searchbox" aria-label="search"
 27 |                    pattern=".+" required>
 28 | 
 29 | 
 30 | 
 31 |             <h2>API Documentation</h2>
 32 |                 <ul class="memberlist">
 33 |     </ul>
 34 | 
 35 | 
 36 | 
 37 |         <a class="attribution" title="pdoc: Python API documentation generator" href="https://pdoc.dev" target="_blank">
 38 |             built with <span class="visually-hidden">pdoc</span><img
 39 |                 alt="pdoc logo"
 40 |                 src="data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20role%3D%22img%22%20aria-label%3D%22pdoc%20logo%22%20width%3D%22300%22%20height%3D%22150%22%20viewBox%3D%22-1%200%2060%2030%22%3E%3Ctitle%3Epdoc%3C/title%3E%3Cpath%20d%3D%22M29.621%2021.293c-.011-.273-.214-.475-.511-.481a.5.5%200%200%200-.489.503l-.044%201.393c-.097.551-.695%201.215-1.566%201.704-.577.428-1.306.486-2.193.182-1.426-.617-2.467-1.654-3.304-2.487l-.173-.172a3.43%203.43%200%200%200-.365-.306.49.49%200%200%200-.286-.196c-1.718-1.06-4.931-1.47-7.353.191l-.219.15c-1.707%201.187-3.413%202.131-4.328%201.03-.02-.027-.49-.685-.141-1.763.233-.721.546-2.408.772-4.076.042-.09.067-.187.046-.288.166-1.347.277-2.625.241-3.351%201.378-1.008%202.271-2.586%202.271-4.362%200-.976-.272-1.935-.788-2.774-.057-.094-.122-.18-.184-.268.033-.167.052-.339.052-.516%200-1.477-1.202-2.679-2.679-2.679-.791%200-1.496.352-1.987.9a6.3%206.3%200%200%200-1.001.029c-.492-.564-1.207-.929-2.012-.929-1.477%200-2.679%201.202-2.679%202.679A2.65%202.65%200%200%200%20.97%206.554c-.383.747-.595%201.572-.595%202.41%200%202.311%201.507%204.29%203.635%205.107-.037.699-.147%202.27-.423%203.294l-.137.461c-.622%202.042-2.515%208.257%201.727%2010.643%201.614.908%203.06%201.248%204.317%201.248%202.665%200%204.492-1.524%205.322-2.401%201.476-1.559%202.886-1.854%206.491.82%201.877%201.393%203.514%201.753%204.861%201.068%202.223-1.713%202.811-3.867%203.399-6.374.077-.846.056-1.469.054-1.537zm-4.835%204.313c-.054.305-.156.586-.242.629-.034-.007-.131-.022-.307-.157-.145-.111-.314-.478-.456-.908.221.121.432.25.675.355.115.039.219.051.33.081zm-2.251-1.238c-.05.33-.158.648-.252.694-.022.001-.125-.018-.307-.157-.217-.166-.488-.906-.639-1.573.358.344.754.693%201.198%201.036zm-3.887-2.337c-.006-.116-.018-.231-.041-.342.635.145%201.189.368%201.599.625.097.231.166.481.174.642-.03.049-.055.101-.067.158-.046.013-.128.026-.298.004-.278-.037-.901-.57-1.367-1.087zm-1.127-.497c.116.306.176.625.12.71-.019.014-.117.045-.345.016-.206-.027-.604-.332-.986-.695.41-.051.816-.056%201.211-.031zm-4.535%201.535c.209.22.379.47.358.598-.006.041-.088.138-.351.234-.144.055-.539-.063-.979-.259a11.66%2011.66%200%200%200%20.972-.573zm.983-.664c.359-.237.738-.418%201.126-.554.25.237.479.548.457.694-.006.042-.087.138-.351.235-.174.064-.694-.105-1.232-.375zm-3.381%201.794c-.022.145-.061.29-.149.401-.133.166-.358.248-.69.251h-.002c-.133%200-.306-.26-.45-.621.417.091.854.07%201.291-.031zm-2.066-8.077a4.78%204.78%200%200%201-.775-.584c.172-.115.505-.254.88-.378l-.105.962zm-.331%202.302a10.32%2010.32%200%200%201-.828-.502c.202-.143.576-.328.984-.49l-.156.992zm-.45%202.157l-.701-.403c.214-.115.536-.249.891-.376a11.57%2011.57%200%200%201-.19.779zm-.181%201.716c.064.398.194.702.298.893-.194-.051-.435-.162-.736-.398.061-.119.224-.3.438-.495zM8.87%204.141c0%20.152-.123.276-.276.276s-.275-.124-.275-.276.123-.276.276-.276.275.124.275.276zm-.735-.389a1.15%201.15%200%200%200-.314.783%201.16%201.16%200%200%200%201.162%201.162c.457%200%20.842-.27%201.032-.653.026.117.042.238.042.362a1.68%201.68%200%200%201-1.679%201.679%201.68%201.68%200%200%201-1.679-1.679c0-.843.626-1.535%201.436-1.654zM5.059%205.406A1.68%201.68%200%200%201%203.38%207.085a1.68%201.68%200%200%201-1.679-1.679c0-.037.009-.072.011-.109.21.3.541.508.935.508a1.16%201.16%200%200%200%201.162-1.162%201.14%201.14%200%200%200-.474-.912c.015%200%20.03-.005.045-.005.926.001%201.679.754%201.679%201.68zM3.198%204.141c0%20.152-.123.276-.276.276s-.275-.124-.275-.276.123-.276.276-.276.275.124.275.276zM1.375%208.964c0-.52.103-1.035.288-1.52.466.394%201.06.64%201.717.64%201.144%200%202.116-.725%202.499-1.738.383%201.012%201.355%201.738%202.499%201.738.867%200%201.631-.421%202.121-1.062.307.605.478%201.267.478%201.942%200%202.486-2.153%204.51-4.801%204.51s-4.801-2.023-4.801-4.51zm24.342%2019.349c-.985.498-2.267.168-3.813-.979-3.073-2.281-5.453-3.199-7.813-.705-1.315%201.391-4.163%203.365-8.423.97-3.174-1.786-2.239-6.266-1.261-9.479l.146-.492c.276-1.02.395-2.457.444-3.268a6.11%206.11%200%200%200%201.18.115%206.01%206.01%200%200%200%202.536-.562l-.006.175c-.802.215-1.848.612-2.021%201.25-.079.295.021.601.274.837.219.203.415.364.598.501-.667.304-1.243.698-1.311%201.179-.02.144-.022.507.393.787.213.144.395.26.564.365-1.285.521-1.361.96-1.381%201.126-.018.142-.011.496.427.746l.854.489c-.473.389-.971.914-.999%201.429-.018.278.095.532.316.713.675.556%201.231.721%201.653.721.059%200%20.104-.014.158-.02.207.707.641%201.64%201.513%201.64h.013c.8-.008%201.236-.345%201.462-.626.173-.216.268-.457.325-.692.424.195.93.374%201.372.374.151%200%20.294-.021.423-.068.732-.27.944-.704.993-1.021.009-.061.003-.119.002-.179.266.086.538.147.789.147.15%200%20.294-.021.423-.069.542-.2.797-.489.914-.754.237.147.478.258.704.288.106.014.205.021.296.021.356%200%20.595-.101.767-.229.438.435%201.094.992%201.656%201.067.106.014.205.021.296.021a1.56%201.56%200%200%200%20.323-.035c.17.575.453%201.289.866%201.605.358.273.665.362.914.362a.99.99%200%200%200%20.421-.093%201.03%201.03%200%200%200%20.245-.164c.168.428.39.846.68%201.068.358.273.665.362.913.362a.99.99%200%200%200%20.421-.093c.317-.148.512-.448.639-.762.251.157.495.257.726.257.127%200%20.25-.024.37-.071.427-.17.706-.617.841-1.314.022-.015.047-.022.068-.038.067-.051.133-.104.196-.159-.443%201.486-1.107%202.761-2.086%203.257zM8.66%209.925a.5.5%200%201%200-1%200c0%20.653-.818%201.205-1.787%201.205s-1.787-.552-1.787-1.205a.5.5%200%201%200-1%200c0%201.216%201.25%202.205%202.787%202.205s2.787-.989%202.787-2.205zm4.4%2015.965l-.208.097c-2.661%201.258-4.708%201.436-6.086.527-1.542-1.017-1.88-3.19-1.844-4.198a.4.4%200%200%200-.385-.414c-.242-.029-.406.164-.414.385-.046%201.249.367%203.686%202.202%204.896.708.467%201.547.7%202.51.7%201.248%200%202.706-.392%204.362-1.174l.185-.086a.4.4%200%200%200%20.205-.527c-.089-.204-.326-.291-.527-.206zM9.547%202.292c.093.077.205.114.317.114a.5.5%200%200%200%20.318-.886L8.817.397a.5.5%200%200%200-.703.068.5.5%200%200%200%20.069.703l1.364%201.124zm-7.661-.065c.086%200%20.173-.022.253-.068l1.523-.893a.5.5%200%200%200-.506-.863l-1.523.892a.5.5%200%200%200-.179.685c.094.158.261.247.432.247z%22%20transform%3D%22matrix%28-1%200%200%201%2058%200%29%22%20fill%3D%22%233bb300%22/%3E%3Cpath%20d%3D%22M.3%2021.86V10.18q0-.46.02-.68.04-.22.18-.5.28-.54%201.34-.54%201.06%200%201.42.28.38.26.44.78.76-1.04%202.38-1.04%201.64%200%203.1%201.54%201.46%201.54%201.46%203.58%200%202.04-1.46%203.58-1.44%201.54-3.08%201.54-1.64%200-2.38-.92v4.04q0%20.46-.04.68-.02.22-.18.5-.14.3-.5.42-.36.12-.98.12-.62%200-1-.12-.36-.12-.52-.4-.14-.28-.18-.5-.02-.22-.02-.68zm3.96-9.42q-.46.54-.46%201.18%200%20.64.46%201.18.48.52%201.2.52.74%200%201.24-.52.52-.52.52-1.18%200-.66-.48-1.18-.48-.54-1.26-.54-.76%200-1.22.54zm14.741-8.36q.16-.3.54-.42.38-.12%201-.12.64%200%201.02.12.38.12.52.42.16.3.18.54.04.22.04.68v11.94q0%20.46-.04.7-.02.22-.18.5-.3.54-1.7.54-1.38%200-1.54-.98-.84.96-2.34.96-1.8%200-3.28-1.56-1.48-1.58-1.48-3.66%200-2.1%201.48-3.68%201.5-1.58%203.28-1.58%201.48%200%202.3%201v-4.2q0-.46.02-.68.04-.24.18-.52zm-3.24%2010.86q.52.54%201.26.54.74%200%201.22-.54.5-.54.5-1.18%200-.66-.48-1.22-.46-.56-1.26-.56-.8%200-1.28.56-.48.54-.48%201.2%200%20.66.52%201.2zm7.833-1.2q0-2.4%201.68-3.96%201.68-1.56%203.84-1.56%202.16%200%203.82%201.56%201.66%201.54%201.66%203.94%200%201.66-.86%202.96-.86%201.28-2.1%201.9-1.22.6-2.54.6-1.32%200-2.56-.64-1.24-.66-2.1-1.92-.84-1.28-.84-2.88zm4.18%201.44q.64.48%201.3.48.66%200%201.32-.5.66-.5.66-1.48%200-.98-.62-1.46-.62-.48-1.34-.48-.72%200-1.34.5-.62.5-.62%201.48%200%20.96.64%201.46zm11.412-1.44q0%20.84.56%201.32.56.46%201.18.46.64%200%201.18-.36.56-.38.9-.38.6%200%201.46%201.06.46.58.46%201.04%200%20.76-1.1%201.42-1.14.8-2.8.8-1.86%200-3.58-1.34-.82-.64-1.34-1.7-.52-1.08-.52-2.36%200-1.3.52-2.34.52-1.06%201.34-1.7%201.66-1.32%203.54-1.32.76%200%201.48.22.72.2%201.06.4l.32.2q.36.24.56.38.52.4.52.92%200%20.5-.42%201.14-.72%201.1-1.38%201.1-.38%200-1.08-.44-.36-.34-1.04-.34-.66%200-1.24.48-.58.48-.58%201.34z%22%20fill%3D%22green%22/%3E%3C/svg%3E"/>
 41 |         </a>
 42 | </div>
 43 |     </nav>
 44 |     <main class="pdoc">
 45 |             <section class="module-info">
 46 |                     <h1 class="modulename">
 47 | <a href="./../bbm25_haystack.html">bbm25_haystack</a><wbr>.__about__    </h1>
 48 | 
 49 |                 
 50 |                         <input id="mod-__about__-view-source" class="view-source-toggle-state" type="checkbox" aria-hidden="true" tabindex="-1">
 51 | 
 52 |                         <label class="view-source-button" for="mod-__about__-view-source"><span>View Source</span></label>
 53 | 
 54 |                         <div class="pdoc-code codehilite"><pre><span></span><span id="L-1"><a href="#L-1"><span class="linenos">1</span></a><span class="c1"># SPDX-FileCopyrightText: 2024-present Yuxuan Wang &lt;wangy49@seas.upenn.edu&gt;</span>
 55 | </span><span id="L-2"><a href="#L-2"><span class="linenos">2</span></a><span class="c1">#</span>
 56 | </span><span id="L-3"><a href="#L-3"><span class="linenos">3</span></a><span class="c1"># SPDX-License-Identifier: Apache-2.0</span>
 57 | </span><span id="L-4"><a href="#L-4"><span class="linenos">4</span></a>
 58 | </span><span id="L-5"><a href="#L-5"><span class="linenos">5</span></a><span class="n">__version__</span> <span class="o">=</span> <span class="s2">&quot;0.2.0&quot;</span>
 59 | </span></pre></div>
 60 | 
 61 | 
 62 |             </section>
 63 |     </main>
 64 | <script>
 65 |     function escapeHTML(html) {
 66 |         return document.createElement('div').appendChild(document.createTextNode(html)).parentNode.innerHTML;
 67 |     }
 68 | 
 69 |     const originalContent = document.querySelector("main.pdoc");
 70 |     let currentContent = originalContent;
 71 | 
 72 |     function setContent(innerHTML) {
 73 |         let elem;
 74 |         if (innerHTML) {
 75 |             elem = document.createElement("main");
 76 |             elem.classList.add("pdoc");
 77 |             elem.innerHTML = innerHTML;
 78 |         } else {
 79 |             elem = originalContent;
 80 |         }
 81 |         if (currentContent !== elem) {
 82 |             currentContent.replaceWith(elem);
 83 |             currentContent = elem;
 84 |         }
 85 |     }
 86 | 
 87 |     function getSearchTerm() {
 88 |         return (new URL(window.location)).searchParams.get("search");
 89 |     }
 90 | 
 91 |     const searchBox = document.querySelector(".pdoc input[type=search]");
 92 |     searchBox.addEventListener("input", function () {
 93 |         let url = new URL(window.location);
 94 |         if (searchBox.value.trim()) {
 95 |             url.hash = "";
 96 |             url.searchParams.set("search", searchBox.value);
 97 |         } else {
 98 |             url.searchParams.delete("search");
 99 |         }
100 |         history.replaceState("", "", url.toString());
101 |         onInput();
102 |     });
103 |     window.addEventListener("popstate", onInput);
104 | 
105 | 
106 |     let search, searchErr;
107 | 
108 |     async function initialize() {
109 |         try {
110 |             search = await new Promise((resolve, reject) => {
111 |                 const script = document.createElement("script");
112 |                 script.type = "text/javascript";
113 |                 script.async = true;
114 |                 script.onload = () => resolve(window.pdocSearch);
115 |                 script.onerror = (e) => reject(e);
116 |                 script.src = "../search.js";
117 |                 document.getElementsByTagName("head")[0].appendChild(script);
118 |             });
119 |         } catch (e) {
120 |             console.error("Cannot fetch pdoc search index");
121 |             searchErr = "Cannot fetch search index.";
122 |         }
123 |         onInput();
124 | 
125 |         document.querySelector("nav.pdoc").addEventListener("click", e => {
126 |             if (e.target.hash) {
127 |                 searchBox.value = "";
128 |                 searchBox.dispatchEvent(new Event("input"));
129 |             }
130 |         });
131 |     }
132 | 
133 |     function onInput() {
134 |         setContent((() => {
135 |             const term = getSearchTerm();
136 |             if (!term) {
137 |                 return null
138 |             }
139 |             if (searchErr) {
140 |                 return `<h3>Error: ${searchErr}</h3>`
141 |             }
142 |             if (!search) {
143 |                 return "<h3>Searching...</h3>"
144 |             }
145 | 
146 |             window.scrollTo({top: 0, left: 0, behavior: 'auto'});
147 | 
148 |             const results = search(term);
149 | 
150 |             let html;
151 |             if (results.length === 0) {
152 |                 html = `No search results for '${escapeHTML(term)}'.`
153 |             } else {
154 |                 html = `<h4>${results.length} search result${results.length > 1 ? "s" : ""} for '${escapeHTML(term)}'.</h4>`;
155 |             }
156 |             for (let result of results.slice(0, 10)) {
157 |                 let doc = result.doc;
158 |                 let url = `../${doc.modulename.replaceAll(".", "/")}.html`;
159 |                 if (doc.qualname) {
160 |                     url += `#${doc.qualname}`;
161 |                 }
162 | 
163 |                 let heading;
164 |                 switch (result.doc.kind) {
165 |                     case "function":
166 |                         if (doc.fullname.endsWith(".__init__")) {
167 |                             heading = `<span class="name">${doc.fullname.replace(/\.__init__$/, "")}</span>${doc.signature}`;
168 |                         } else {
169 |                             heading = `<span class="def">${doc.funcdef}</span> <span class="name">${doc.fullname}</span>${doc.signature}`;
170 |                         }
171 |                         break;
172 |                     case "class":
173 |                         heading = `<span class="def">class</span> <span class="name">${doc.fullname}</span>`;
174 |                         if (doc.bases)
175 |                             heading += `<wbr>(<span class="base">${doc.bases}</span>)`;
176 |                         heading += `:`;
177 |                         break;
178 |                     case "variable":
179 |                         heading = `<span class="name">${doc.fullname}</span>`;
180 |                         if (doc.annotation)
181 |                             heading += `<span class="annotation">${doc.annotation}</span>`;
182 |                         if (doc.default_value)
183 |                             heading += `<span class="default_value"> = ${doc.default_value}</span>`;
184 |                         break;
185 |                     default:
186 |                         heading = `<span class="name">${doc.fullname}</span>`;
187 |                         break;
188 |                 }
189 |                 html += `
190 |                         <section class="search-result">
191 |                         <a href="${url}" class="attr ${doc.kind}">${heading}</a>
192 |                         <div class="docstring">${doc.doc}</div>
193 |                         </section>
194 |                     `;
195 | 
196 |             }
197 |             return html;
198 |         })());
199 |     }
200 | 
201 |     if (getSearchTerm()) {
202 |         initialize();
203 |         searchBox.value = getSearchTerm();
204 |         onInput();
205 |     } else {
206 |         searchBox.addEventListener("focus", initialize, {once: true});
207 |     }
208 | 
209 |     searchBox.addEventListener("keydown", e => {
210 |         if (["ArrowDown", "ArrowUp", "Enter"].includes(e.key)) {
211 |             let focused = currentContent.querySelector(".search-result.focused");
212 |             if (!focused) {
213 |                 currentContent.querySelector(".search-result").classList.add("focused");
214 |             } else if (
215 |                 e.key === "ArrowDown"
216 |                 && focused.nextElementSibling
217 |                 && focused.nextElementSibling.classList.contains("search-result")
218 |             ) {
219 |                 focused.classList.remove("focused");
220 |                 focused.nextElementSibling.classList.add("focused");
221 |                 focused.nextElementSibling.scrollIntoView({
222 |                     behavior: "smooth",
223 |                     block: "nearest",
224 |                     inline: "nearest"
225 |                 });
226 |             } else if (
227 |                 e.key === "ArrowUp"
228 |                 && focused.previousElementSibling
229 |                 && focused.previousElementSibling.classList.contains("search-result")
230 |             ) {
231 |                 focused.classList.remove("focused");
232 |                 focused.previousElementSibling.classList.add("focused");
233 |                 focused.previousElementSibling.scrollIntoView({
234 |                     behavior: "smooth",
235 |                     block: "nearest",
236 |                     inline: "nearest"
237 |                 });
238 |             } else if (
239 |                 e.key === "Enter"
240 |             ) {
241 |                 focused.querySelector("a").click();
242 |             }
243 |         }
244 |     });
245 | </script></body>
246 | </html>


--------------------------------------------------------------------------------
/docs/bbm25_haystack/filters.html:
--------------------------------------------------------------------------------
  1 | <!doctype html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="utf-8">
  5 |     <meta name="viewport" content="width=device-width, initial-scale=1">
  6 |     <meta name="generator" content="pdoc 14.4.0"/>
  7 |     <title>bbm25_haystack.filters API documentation</title>
  8 | 
  9 |     <style>/*! * Bootstrap Reboot v5.0.0 (https://getbootstrap.com/) * Copyright 2011-2021 The Bootstrap Authors * Copyright 2011-2021 Twitter, Inc. * Licensed under MIT (https://github.com/twbs/bootstrap/blob/main/LICENSE) * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md) */*,::after,::before{box-sizing:border-box}@media (prefers-reduced-motion:no-preference){:root{scroll-behavior:smooth}}body{margin:0;font-family:system-ui,-apple-system,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans","Liberation Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;background-color:#fff;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}hr{margin:1rem 0;color:inherit;background-color:currentColor;border:0;opacity:.25}hr:not([size]){height:1px}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem;font-weight:500;line-height:1.2}h1{font-size:calc(1.375rem + 1.5vw)}@media (min-width:1200px){h1{font-size:2.5rem}}h2{font-size:calc(1.325rem + .9vw)}@media (min-width:1200px){h2{font-size:2rem}}h3{font-size:calc(1.3rem + .6vw)}@media (min-width:1200px){h3{font-size:1.75rem}}h4{font-size:calc(1.275rem + .3vw)}@media (min-width:1200px){h4{font-size:1.5rem}}h5{font-size:1.25rem}h6{font-size:1rem}p{margin-top:0;margin-bottom:1rem}abbr[data-bs-original-title],abbr[title]{-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}ol,ul{padding-left:2rem}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:.875em}mark{padding:.2em;background-color:#fcf8e3}sub,sup{position:relative;font-size:.75em;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#0d6efd;text-decoration:underline}a:hover{color:#0a58ca}a:not([href]):not([class]),a:not([href]):not([class]):hover{color:inherit;text-decoration:none}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em;direction:ltr;unicode-bidi:bidi-override}pre{display:block;margin-top:0;margin-bottom:1rem;overflow:auto;font-size:.875em}pre code{font-size:inherit;color:inherit;word-break:normal}code{font-size:.875em;color:#d63384;word-wrap:break-word}a>code{color:inherit}kbd{padding:.2rem .4rem;font-size:.875em;color:#fff;background-color:#212529;border-radius:.2rem}kbd kbd{padding:0;font-size:1em;font-weight:700}figure{margin:0 0 1rem}img,svg{vertical-align:middle}table{caption-side:bottom;border-collapse:collapse}caption{padding-top:.5rem;padding-bottom:.5rem;color:#6c757d;text-align:left}th{text-align:inherit;text-align:-webkit-match-parent}tbody,td,tfoot,th,thead,tr{border-color:inherit;border-style:solid;border-width:0}label{display:inline-block}button{border-radius:0}button:focus:not(:focus-visible){outline:0}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,select{text-transform:none}[role=button]{cursor:pointer}select{word-wrap:normal}select:disabled{opacity:1}[list]::-webkit-calendar-picker-indicator{display:none}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}::-moz-focus-inner{padding:0;border-style:none}textarea{resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{float:left;width:100%;padding:0;margin-bottom:.5rem;font-size:calc(1.275rem + .3vw);line-height:inherit}@media (min-width:1200px){legend{font-size:1.5rem}}legend+*{clear:left}::-webkit-datetime-edit-day-field,::-webkit-datetime-edit-fields-wrapper,::-webkit-datetime-edit-hour-field,::-webkit-datetime-edit-minute,::-webkit-datetime-edit-month-field,::-webkit-datetime-edit-text,::-webkit-datetime-edit-year-field{padding:0}::-webkit-inner-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:textfield}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-color-swatch-wrapper{padding:0}::file-selector-button{font:inherit}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}iframe{border:0}summary{display:list-item;cursor:pointer}progress{vertical-align:baseline}[hidden]{display:none!important}</style>
 10 |     <style>/*! syntax-highlighting.css */pre{line-height:125%;}span.linenos{color:inherit; background-color:transparent; padding-left:5px; padding-right:20px;}.pdoc-code .hll{background-color:#ffffcc}.pdoc-code{background:#f8f8f8;}.pdoc-code .c{color:#3D7B7B; font-style:italic}.pdoc-code .err{border:1px solid #FF0000}.pdoc-code .k{color:#008000; font-weight:bold}.pdoc-code .o{color:#666666}.pdoc-code .ch{color:#3D7B7B; font-style:italic}.pdoc-code .cm{color:#3D7B7B; font-style:italic}.pdoc-code .cp{color:#9C6500}.pdoc-code .cpf{color:#3D7B7B; font-style:italic}.pdoc-code .c1{color:#3D7B7B; font-style:italic}.pdoc-code .cs{color:#3D7B7B; font-style:italic}.pdoc-code .gd{color:#A00000}.pdoc-code .ge{font-style:italic}.pdoc-code .gr{color:#E40000}.pdoc-code .gh{color:#000080; font-weight:bold}.pdoc-code .gi{color:#008400}.pdoc-code .go{color:#717171}.pdoc-code .gp{color:#000080; font-weight:bold}.pdoc-code .gs{font-weight:bold}.pdoc-code .gu{color:#800080; font-weight:bold}.pdoc-code .gt{color:#0044DD}.pdoc-code .kc{color:#008000; font-weight:bold}.pdoc-code .kd{color:#008000; font-weight:bold}.pdoc-code .kn{color:#008000; font-weight:bold}.pdoc-code .kp{color:#008000}.pdoc-code .kr{color:#008000; font-weight:bold}.pdoc-code .kt{color:#B00040}.pdoc-code .m{color:#666666}.pdoc-code .s{color:#BA2121}.pdoc-code .na{color:#687822}.pdoc-code .nb{color:#008000}.pdoc-code .nc{color:#0000FF; font-weight:bold}.pdoc-code .no{color:#880000}.pdoc-code .nd{color:#AA22FF}.pdoc-code .ni{color:#717171; font-weight:bold}.pdoc-code .ne{color:#CB3F38; font-weight:bold}.pdoc-code .nf{color:#0000FF}.pdoc-code .nl{color:#767600}.pdoc-code .nn{color:#0000FF; font-weight:bold}.pdoc-code .nt{color:#008000; font-weight:bold}.pdoc-code .nv{color:#19177C}.pdoc-code .ow{color:#AA22FF; font-weight:bold}.pdoc-code .w{color:#bbbbbb}.pdoc-code .mb{color:#666666}.pdoc-code .mf{color:#666666}.pdoc-code .mh{color:#666666}.pdoc-code .mi{color:#666666}.pdoc-code .mo{color:#666666}.pdoc-code .sa{color:#BA2121}.pdoc-code .sb{color:#BA2121}.pdoc-code .sc{color:#BA2121}.pdoc-code .dl{color:#BA2121}.pdoc-code .sd{color:#BA2121; font-style:italic}.pdoc-code .s2{color:#BA2121}.pdoc-code .se{color:#AA5D1F; font-weight:bold}.pdoc-code .sh{color:#BA2121}.pdoc-code .si{color:#A45A77; font-weight:bold}.pdoc-code .sx{color:#008000}.pdoc-code .sr{color:#A45A77}.pdoc-code .s1{color:#BA2121}.pdoc-code .ss{color:#19177C}.pdoc-code .bp{color:#008000}.pdoc-code .fm{color:#0000FF}.pdoc-code .vc{color:#19177C}.pdoc-code .vg{color:#19177C}.pdoc-code .vi{color:#19177C}.pdoc-code .vm{color:#19177C}.pdoc-code .il{color:#666666}</style>
 11 |     <style>/*! theme.css */:root{--pdoc-background:#fff;}.pdoc{--text:#212529;--muted:#6c757d;--link:#3660a5;--link-hover:#1659c5;--code:#f8f8f8;--active:#fff598;--accent:#eee;--accent2:#c1c1c1;--nav-hover:rgba(255, 255, 255, 0.5);--name:#0066BB;--def:#008800;--annotation:#007020;}</style>
 12 |     <style>/*! layout.css */html, body{width:100%;height:100%;}html, main{scroll-behavior:smooth;}body{background-color:var(--pdoc-background);}@media (max-width:769px){#navtoggle{cursor:pointer;position:absolute;width:50px;height:40px;top:1rem;right:1rem;border-color:var(--text);color:var(--text);display:flex;opacity:0.8;z-index:999;}#navtoggle:hover{opacity:1;}#togglestate + div{display:none;}#togglestate:checked + div{display:inherit;}main, header{padding:2rem 3vw;}header + main{margin-top:-3rem;}.git-button{display:none !important;}nav input[type="search"]{max-width:77%;}nav input[type="search"]:first-child{margin-top:-6px;}nav input[type="search"]:valid ~ *{display:none !important;}}@media (min-width:770px){:root{--sidebar-width:clamp(12.5rem, 28vw, 22rem);}nav{position:fixed;overflow:auto;height:100vh;width:var(--sidebar-width);}main, header{padding:3rem 2rem 3rem calc(var(--sidebar-width) + 3rem);width:calc(54rem + var(--sidebar-width));max-width:100%;}header + main{margin-top:-4rem;}#navtoggle{display:none;}}#togglestate{position:absolute;height:0;opacity:0;}nav.pdoc{--pad:clamp(0.5rem, 2vw, 1.75rem);--indent:1.5rem;background-color:var(--accent);border-right:1px solid var(--accent2);box-shadow:0 0 20px rgba(50, 50, 50, .2) inset;padding:0 0 0 var(--pad);overflow-wrap:anywhere;scrollbar-width:thin; scrollbar-color:var(--accent2) transparent; z-index:1}nav.pdoc::-webkit-scrollbar{width:.4rem; }nav.pdoc::-webkit-scrollbar-thumb{background-color:var(--accent2); }nav.pdoc > div{padding:var(--pad) 0;}nav.pdoc .module-list-button{display:inline-flex;align-items:center;color:var(--text);border-color:var(--muted);margin-bottom:1rem;}nav.pdoc .module-list-button:hover{border-color:var(--text);}nav.pdoc input[type=search]{display:block;outline-offset:0;width:calc(100% - var(--pad));}nav.pdoc .logo{max-width:calc(100% - var(--pad));max-height:35vh;display:block;margin:0 auto 1rem;transform:translate(calc(-.5 * var(--pad)), 0);}nav.pdoc ul{list-style:none;padding-left:0;}nav.pdoc > div > ul{margin-left:calc(0px - var(--pad));}nav.pdoc li a{padding:.2rem 0 .2rem calc(var(--pad) + var(--indent));}nav.pdoc > div > ul > li > a{padding-left:var(--pad);}nav.pdoc li{transition:all 100ms;}nav.pdoc li:hover{background-color:var(--nav-hover);}nav.pdoc a, nav.pdoc a:hover{color:var(--text);}nav.pdoc a{display:block;}nav.pdoc > h2:first-of-type{margin-top:1.5rem;}nav.pdoc .class:before{content:"class ";color:var(--muted);}nav.pdoc .function:after{content:"()";color:var(--muted);}nav.pdoc footer:before{content:"";display:block;width:calc(100% - var(--pad));border-top:solid var(--accent2) 1px;margin-top:1.5rem;padding-top:.5rem;}nav.pdoc footer{font-size:small;}</style>
 13 |     <style>/*! content.css */.pdoc{color:var(--text);box-sizing:border-box;line-height:1.5;background:none;}.pdoc .pdoc-button{cursor:pointer;display:inline-block;border:solid black 1px;border-radius:2px;font-size:.75rem;padding:calc(0.5em - 1px) 1em;transition:100ms all;}.pdoc .pdoc-alert{padding:1rem 1rem 1rem calc(1.5rem + 24px);border:1px solid transparent;border-radius:.25rem;background-repeat:no-repeat;background-position:1rem center;margin-bottom:1rem;}.pdoc .pdoc-alert > *:last-child{margin-bottom:0;}.pdoc .pdoc-alert-note {color:#084298;background-color:#cfe2ff;border-color:#b6d4fe;background-image:url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20width%3D%2224%22%20height%3D%2224%22%20fill%3D%22%23084298%22%20viewBox%3D%220%200%2016%2016%22%3E%3Cpath%20d%3D%22M8%2016A8%208%200%201%200%208%200a8%208%200%200%200%200%2016zm.93-9.412-1%204.705c-.07.34.029.533.304.533.194%200%20.487-.07.686-.246l-.088.416c-.287.346-.92.598-1.465.598-.703%200-1.002-.422-.808-1.319l.738-3.468c.064-.293.006-.399-.287-.47l-.451-.081.082-.381%202.29-.287zM8%205.5a1%201%200%201%201%200-2%201%201%200%200%201%200%202z%22/%3E%3C/svg%3E");}.pdoc .pdoc-alert-warning{color:#664d03;background-color:#fff3cd;border-color:#ffecb5;background-image:url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20width%3D%2224%22%20height%3D%2224%22%20fill%3D%22%23664d03%22%20viewBox%3D%220%200%2016%2016%22%3E%3Cpath%20d%3D%22M8.982%201.566a1.13%201.13%200%200%200-1.96%200L.165%2013.233c-.457.778.091%201.767.98%201.767h13.713c.889%200%201.438-.99.98-1.767L8.982%201.566zM8%205c.535%200%20.954.462.9.995l-.35%203.507a.552.552%200%200%201-1.1%200L7.1%205.995A.905.905%200%200%201%208%205zm.002%206a1%201%200%201%201%200%202%201%201%200%200%201%200-2z%22/%3E%3C/svg%3E");}.pdoc .pdoc-alert-danger{color:#842029;background-color:#f8d7da;border-color:#f5c2c7;background-image:url("data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20width%3D%2224%22%20height%3D%2224%22%20fill%3D%22%23842029%22%20viewBox%3D%220%200%2016%2016%22%3E%3Cpath%20d%3D%22M5.52.359A.5.5%200%200%201%206%200h4a.5.5%200%200%201%20.474.658L8.694%206H12.5a.5.5%200%200%201%20.395.807l-7%209a.5.5%200%200%201-.873-.454L6.823%209.5H3.5a.5.5%200%200%201-.48-.641l2.5-8.5z%22/%3E%3C/svg%3E");}.pdoc .visually-hidden{position:absolute !important;width:1px !important;height:1px !important;padding:0 !important;margin:-1px !important;overflow:hidden !important;clip:rect(0, 0, 0, 0) !important;white-space:nowrap !important;border:0 !important;}.pdoc h1, .pdoc h2, .pdoc h3{font-weight:300;margin:.3em 0;padding:.2em 0;}.pdoc > section:not(.module-info) h1{font-size:1.5rem;font-weight:500;}.pdoc > section:not(.module-info) h2{font-size:1.4rem;font-weight:500;}.pdoc > section:not(.module-info) h3{font-size:1.3rem;font-weight:500;}.pdoc > section:not(.module-info) h4{font-size:1.2rem;}.pdoc > section:not(.module-info) h5{font-size:1.1rem;}.pdoc a{text-decoration:none;color:var(--link);}.pdoc a:hover{color:var(--link-hover);}.pdoc blockquote{margin-left:2rem;}.pdoc pre{border-top:1px solid var(--accent2);border-bottom:1px solid var(--accent2);margin-top:0;margin-bottom:1em;padding:.5rem 0 .5rem .5rem;overflow-x:auto;background-color:var(--code);}.pdoc code{color:var(--text);padding:.2em .4em;margin:0;font-size:85%;background-color:var(--accent);border-radius:6px;}.pdoc a > code{color:inherit;}.pdoc pre > code{display:inline-block;font-size:inherit;background:none;border:none;padding:0;}.pdoc > section:not(.module-info){margin-bottom:1.5rem;}.pdoc .modulename{margin-top:0;font-weight:bold;}.pdoc .modulename a{color:var(--link);transition:100ms all;}.pdoc .git-button{float:right;border:solid var(--link) 1px;}.pdoc .git-button:hover{background-color:var(--link);color:var(--pdoc-background);}.view-source-toggle-state,.view-source-toggle-state ~ .pdoc-code{display:none;}.view-source-toggle-state:checked ~ .pdoc-code{display:block;}.view-source-button{display:inline-block;float:right;font-size:.75rem;line-height:1.5rem;color:var(--muted);padding:0 .4rem 0 1.3rem;cursor:pointer;text-indent:-2px;}.view-source-button > span{visibility:hidden;}.module-info .view-source-button{float:none;display:flex;justify-content:flex-end;margin:-1.2rem .4rem -.2rem 0;}.view-source-button::before{position:absolute;content:"View Source";display:list-item;list-style-type:disclosure-closed;}.view-source-toggle-state:checked ~ .attr .view-source-button::before,.view-source-toggle-state:checked ~ .view-source-button::before{list-style-type:disclosure-open;}.pdoc .docstring{margin-bottom:1.5rem;}.pdoc section:not(.module-info) .docstring{margin-left:clamp(0rem, 5vw - 2rem, 1rem);}.pdoc .docstring .pdoc-code{margin-left:1em;margin-right:1em;}.pdoc h1:target,.pdoc h2:target,.pdoc h3:target,.pdoc h4:target,.pdoc h5:target,.pdoc h6:target,.pdoc .pdoc-code > pre > span:target{background-color:var(--active);box-shadow:-1rem 0 0 0 var(--active);}.pdoc .pdoc-code > pre > span:target{display:block;}.pdoc div:target > .attr,.pdoc section:target > .attr,.pdoc dd:target > a{background-color:var(--active);}.pdoc *{scroll-margin:2rem;}.pdoc .pdoc-code .linenos{user-select:none;}.pdoc .attr:hover{filter:contrast(0.95);}.pdoc section, .pdoc .classattr{position:relative;}.pdoc .headerlink{--width:clamp(1rem, 3vw, 2rem);position:absolute;top:0;left:calc(0rem - var(--width));transition:all 100ms ease-in-out;opacity:0;}.pdoc .headerlink::before{content:"#";display:block;text-align:center;width:var(--width);height:2.3rem;line-height:2.3rem;font-size:1.5rem;}.pdoc .attr:hover ~ .headerlink,.pdoc *:target > .headerlink,.pdoc .headerlink:hover{opacity:1;}.pdoc .attr{display:block;margin:.5rem 0 .5rem;padding:.4rem .4rem .4rem 1rem;background-color:var(--accent);overflow-x:auto;}.pdoc .classattr{margin-left:2rem;}.pdoc .name{color:var(--name);font-weight:bold;}.pdoc .def{color:var(--def);font-weight:bold;}.pdoc .signature{background-color:transparent;}.pdoc .param, .pdoc .return-annotation{white-space:pre;}.pdoc .signature.multiline .param{display:block;}.pdoc .signature.condensed .param{display:inline-block;}.pdoc .annotation{color:var(--annotation);}.pdoc .view-value-toggle-state,.pdoc .view-value-toggle-state ~ .default_value{display:none;}.pdoc .view-value-toggle-state:checked ~ .default_value{display:inherit;}.pdoc .view-value-button{font-size:.5rem;vertical-align:middle;border-style:dashed;margin-top:-0.1rem;}.pdoc .view-value-button:hover{background:white;}.pdoc .view-value-button::before{content:"show";text-align:center;width:2.2em;display:inline-block;}.pdoc .view-value-toggle-state:checked ~ .view-value-button::before{content:"hide";}.pdoc .inherited{margin-left:2rem;}.pdoc .inherited dt{font-weight:700;}.pdoc .inherited dt, .pdoc .inherited dd{display:inline;margin-left:0;margin-bottom:.5rem;}.pdoc .inherited dd:not(:last-child):after{content:", ";}.pdoc .inherited .class:before{content:"class ";}.pdoc .inherited .function a:after{content:"()";}.pdoc .search-result .docstring{overflow:auto;max-height:25vh;}.pdoc .search-result.focused > .attr{background-color:var(--active);}.pdoc .attribution{margin-top:2rem;display:block;opacity:0.5;transition:all 200ms;filter:grayscale(100%);}.pdoc .attribution:hover{opacity:1;filter:grayscale(0%);}.pdoc .attribution img{margin-left:5px;height:35px;vertical-align:middle;width:70px;transition:all 200ms;}.pdoc table{display:block;width:max-content;max-width:100%;overflow:auto;margin-bottom:1rem;}.pdoc table th{font-weight:600;}.pdoc table th, .pdoc table td{padding:6px 13px;border:1px solid var(--accent2);}</style>
 14 |     <style>/*! custom.css */</style></head>
 15 | <body>
 16 |     <nav class="pdoc">
 17 |         <label id="navtoggle" for="togglestate" class="pdoc-button"><svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 30 30'><path stroke-linecap='round' stroke="currentColor" stroke-miterlimit='10' stroke-width='2' d='M4 7h22M4 15h22M4 23h22'/></svg></label>
 18 |         <input id="togglestate" type="checkbox" aria-hidden="true" tabindex="-1">
 19 |         <div>            <a class="pdoc-button module-list-button" href="../bbm25_haystack.html">
 20 | <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-box-arrow-in-left" viewBox="0 0 16 16">
 21 |   <path fill-rule="evenodd" d="M10 3.5a.5.5 0 0 0-.5-.5h-8a.5.5 0 0 0-.5.5v9a.5.5 0 0 0 .5.5h8a.5.5 0 0 0 .5-.5v-2a.5.5 0 0 1 1 0v2A1.5 1.5 0 0 1 9.5 14h-8A1.5 1.5 0 0 1 0 12.5v-9A1.5 1.5 0 0 1 1.5 2h8A1.5 1.5 0 0 1 11 3.5v2a.5.5 0 0 1-1 0v-2z"/>
 22 |   <path fill-rule="evenodd" d="M4.146 8.354a.5.5 0 0 1 0-.708l3-3a.5.5 0 1 1 .708.708L5.707 7.5H14.5a.5.5 0 0 1 0 1H5.707l2.147 2.146a.5.5 0 0 1-.708.708l-3-3z"/>
 23 | </svg>                &nbsp;bbm25_haystack</a>
 24 | 
 25 | 
 26 |             <input type="search" placeholder="Search..." role="searchbox" aria-label="search"
 27 |                    pattern=".+" required>
 28 | 
 29 | 
 30 | 
 31 |             <h2>API Documentation</h2>
 32 |                 <ul class="memberlist">
 33 |             <li>
 34 |                     <a class="function" href="#apply_filters_to_document">apply_filters_to_document</a>
 35 |             </li>
 36 |             <li>
 37 |                     <a class="variable" href="#LOGICAL_OPERATORS">LOGICAL_OPERATORS</a>
 38 |             </li>
 39 |             <li>
 40 |                     <a class="variable" href="#COMPARISON_OPERATORS">COMPARISON_OPERATORS</a>
 41 |             </li>
 42 |     </ul>
 43 | 
 44 | 
 45 | 
 46 |         <a class="attribution" title="pdoc: Python API documentation generator" href="https://pdoc.dev" target="_blank">
 47 |             built with <span class="visually-hidden">pdoc</span><img
 48 |                 alt="pdoc logo"
 49 |                 src="data:image/svg+xml,%3Csvg%20xmlns%3D%22http%3A//www.w3.org/2000/svg%22%20role%3D%22img%22%20aria-label%3D%22pdoc%20logo%22%20width%3D%22300%22%20height%3D%22150%22%20viewBox%3D%22-1%200%2060%2030%22%3E%3Ctitle%3Epdoc%3C/title%3E%3Cpath%20d%3D%22M29.621%2021.293c-.011-.273-.214-.475-.511-.481a.5.5%200%200%200-.489.503l-.044%201.393c-.097.551-.695%201.215-1.566%201.704-.577.428-1.306.486-2.193.182-1.426-.617-2.467-1.654-3.304-2.487l-.173-.172a3.43%203.43%200%200%200-.365-.306.49.49%200%200%200-.286-.196c-1.718-1.06-4.931-1.47-7.353.191l-.219.15c-1.707%201.187-3.413%202.131-4.328%201.03-.02-.027-.49-.685-.141-1.763.233-.721.546-2.408.772-4.076.042-.09.067-.187.046-.288.166-1.347.277-2.625.241-3.351%201.378-1.008%202.271-2.586%202.271-4.362%200-.976-.272-1.935-.788-2.774-.057-.094-.122-.18-.184-.268.033-.167.052-.339.052-.516%200-1.477-1.202-2.679-2.679-2.679-.791%200-1.496.352-1.987.9a6.3%206.3%200%200%200-1.001.029c-.492-.564-1.207-.929-2.012-.929-1.477%200-2.679%201.202-2.679%202.679A2.65%202.65%200%200%200%20.97%206.554c-.383.747-.595%201.572-.595%202.41%200%202.311%201.507%204.29%203.635%205.107-.037.699-.147%202.27-.423%203.294l-.137.461c-.622%202.042-2.515%208.257%201.727%2010.643%201.614.908%203.06%201.248%204.317%201.248%202.665%200%204.492-1.524%205.322-2.401%201.476-1.559%202.886-1.854%206.491.82%201.877%201.393%203.514%201.753%204.861%201.068%202.223-1.713%202.811-3.867%203.399-6.374.077-.846.056-1.469.054-1.537zm-4.835%204.313c-.054.305-.156.586-.242.629-.034-.007-.131-.022-.307-.157-.145-.111-.314-.478-.456-.908.221.121.432.25.675.355.115.039.219.051.33.081zm-2.251-1.238c-.05.33-.158.648-.252.694-.022.001-.125-.018-.307-.157-.217-.166-.488-.906-.639-1.573.358.344.754.693%201.198%201.036zm-3.887-2.337c-.006-.116-.018-.231-.041-.342.635.145%201.189.368%201.599.625.097.231.166.481.174.642-.03.049-.055.101-.067.158-.046.013-.128.026-.298.004-.278-.037-.901-.57-1.367-1.087zm-1.127-.497c.116.306.176.625.12.71-.019.014-.117.045-.345.016-.206-.027-.604-.332-.986-.695.41-.051.816-.056%201.211-.031zm-4.535%201.535c.209.22.379.47.358.598-.006.041-.088.138-.351.234-.144.055-.539-.063-.979-.259a11.66%2011.66%200%200%200%20.972-.573zm.983-.664c.359-.237.738-.418%201.126-.554.25.237.479.548.457.694-.006.042-.087.138-.351.235-.174.064-.694-.105-1.232-.375zm-3.381%201.794c-.022.145-.061.29-.149.401-.133.166-.358.248-.69.251h-.002c-.133%200-.306-.26-.45-.621.417.091.854.07%201.291-.031zm-2.066-8.077a4.78%204.78%200%200%201-.775-.584c.172-.115.505-.254.88-.378l-.105.962zm-.331%202.302a10.32%2010.32%200%200%201-.828-.502c.202-.143.576-.328.984-.49l-.156.992zm-.45%202.157l-.701-.403c.214-.115.536-.249.891-.376a11.57%2011.57%200%200%201-.19.779zm-.181%201.716c.064.398.194.702.298.893-.194-.051-.435-.162-.736-.398.061-.119.224-.3.438-.495zM8.87%204.141c0%20.152-.123.276-.276.276s-.275-.124-.275-.276.123-.276.276-.276.275.124.275.276zm-.735-.389a1.15%201.15%200%200%200-.314.783%201.16%201.16%200%200%200%201.162%201.162c.457%200%20.842-.27%201.032-.653.026.117.042.238.042.362a1.68%201.68%200%200%201-1.679%201.679%201.68%201.68%200%200%201-1.679-1.679c0-.843.626-1.535%201.436-1.654zM5.059%205.406A1.68%201.68%200%200%201%203.38%207.085a1.68%201.68%200%200%201-1.679-1.679c0-.037.009-.072.011-.109.21.3.541.508.935.508a1.16%201.16%200%200%200%201.162-1.162%201.14%201.14%200%200%200-.474-.912c.015%200%20.03-.005.045-.005.926.001%201.679.754%201.679%201.68zM3.198%204.141c0%20.152-.123.276-.276.276s-.275-.124-.275-.276.123-.276.276-.276.275.124.275.276zM1.375%208.964c0-.52.103-1.035.288-1.52.466.394%201.06.64%201.717.64%201.144%200%202.116-.725%202.499-1.738.383%201.012%201.355%201.738%202.499%201.738.867%200%201.631-.421%202.121-1.062.307.605.478%201.267.478%201.942%200%202.486-2.153%204.51-4.801%204.51s-4.801-2.023-4.801-4.51zm24.342%2019.349c-.985.498-2.267.168-3.813-.979-3.073-2.281-5.453-3.199-7.813-.705-1.315%201.391-4.163%203.365-8.423.97-3.174-1.786-2.239-6.266-1.261-9.479l.146-.492c.276-1.02.395-2.457.444-3.268a6.11%206.11%200%200%200%201.18.115%206.01%206.01%200%200%200%202.536-.562l-.006.175c-.802.215-1.848.612-2.021%201.25-.079.295.021.601.274.837.219.203.415.364.598.501-.667.304-1.243.698-1.311%201.179-.02.144-.022.507.393.787.213.144.395.26.564.365-1.285.521-1.361.96-1.381%201.126-.018.142-.011.496.427.746l.854.489c-.473.389-.971.914-.999%201.429-.018.278.095.532.316.713.675.556%201.231.721%201.653.721.059%200%20.104-.014.158-.02.207.707.641%201.64%201.513%201.64h.013c.8-.008%201.236-.345%201.462-.626.173-.216.268-.457.325-.692.424.195.93.374%201.372.374.151%200%20.294-.021.423-.068.732-.27.944-.704.993-1.021.009-.061.003-.119.002-.179.266.086.538.147.789.147.15%200%20.294-.021.423-.069.542-.2.797-.489.914-.754.237.147.478.258.704.288.106.014.205.021.296.021.356%200%20.595-.101.767-.229.438.435%201.094.992%201.656%201.067.106.014.205.021.296.021a1.56%201.56%200%200%200%20.323-.035c.17.575.453%201.289.866%201.605.358.273.665.362.914.362a.99.99%200%200%200%20.421-.093%201.03%201.03%200%200%200%20.245-.164c.168.428.39.846.68%201.068.358.273.665.362.913.362a.99.99%200%200%200%20.421-.093c.317-.148.512-.448.639-.762.251.157.495.257.726.257.127%200%20.25-.024.37-.071.427-.17.706-.617.841-1.314.022-.015.047-.022.068-.038.067-.051.133-.104.196-.159-.443%201.486-1.107%202.761-2.086%203.257zM8.66%209.925a.5.5%200%201%200-1%200c0%20.653-.818%201.205-1.787%201.205s-1.787-.552-1.787-1.205a.5.5%200%201%200-1%200c0%201.216%201.25%202.205%202.787%202.205s2.787-.989%202.787-2.205zm4.4%2015.965l-.208.097c-2.661%201.258-4.708%201.436-6.086.527-1.542-1.017-1.88-3.19-1.844-4.198a.4.4%200%200%200-.385-.414c-.242-.029-.406.164-.414.385-.046%201.249.367%203.686%202.202%204.896.708.467%201.547.7%202.51.7%201.248%200%202.706-.392%204.362-1.174l.185-.086a.4.4%200%200%200%20.205-.527c-.089-.204-.326-.291-.527-.206zM9.547%202.292c.093.077.205.114.317.114a.5.5%200%200%200%20.318-.886L8.817.397a.5.5%200%200%200-.703.068.5.5%200%200%200%20.069.703l1.364%201.124zm-7.661-.065c.086%200%20.173-.022.253-.068l1.523-.893a.5.5%200%200%200-.506-.863l-1.523.892a.5.5%200%200%200-.179.685c.094.158.261.247.432.247z%22%20transform%3D%22matrix%28-1%200%200%201%2058%200%29%22%20fill%3D%22%233bb300%22/%3E%3Cpath%20d%3D%22M.3%2021.86V10.18q0-.46.02-.68.04-.22.18-.5.28-.54%201.34-.54%201.06%200%201.42.28.38.26.44.78.76-1.04%202.38-1.04%201.64%200%203.1%201.54%201.46%201.54%201.46%203.58%200%202.04-1.46%203.58-1.44%201.54-3.08%201.54-1.64%200-2.38-.92v4.04q0%20.46-.04.68-.02.22-.18.5-.14.3-.5.42-.36.12-.98.12-.62%200-1-.12-.36-.12-.52-.4-.14-.28-.18-.5-.02-.22-.02-.68zm3.96-9.42q-.46.54-.46%201.18%200%20.64.46%201.18.48.52%201.2.52.74%200%201.24-.52.52-.52.52-1.18%200-.66-.48-1.18-.48-.54-1.26-.54-.76%200-1.22.54zm14.741-8.36q.16-.3.54-.42.38-.12%201-.12.64%200%201.02.12.38.12.52.42.16.3.18.54.04.22.04.68v11.94q0%20.46-.04.7-.02.22-.18.5-.3.54-1.7.54-1.38%200-1.54-.98-.84.96-2.34.96-1.8%200-3.28-1.56-1.48-1.58-1.48-3.66%200-2.1%201.48-3.68%201.5-1.58%203.28-1.58%201.48%200%202.3%201v-4.2q0-.46.02-.68.04-.24.18-.52zm-3.24%2010.86q.52.54%201.26.54.74%200%201.22-.54.5-.54.5-1.18%200-.66-.48-1.22-.46-.56-1.26-.56-.8%200-1.28.56-.48.54-.48%201.2%200%20.66.52%201.2zm7.833-1.2q0-2.4%201.68-3.96%201.68-1.56%203.84-1.56%202.16%200%203.82%201.56%201.66%201.54%201.66%203.94%200%201.66-.86%202.96-.86%201.28-2.1%201.9-1.22.6-2.54.6-1.32%200-2.56-.64-1.24-.66-2.1-1.92-.84-1.28-.84-2.88zm4.18%201.44q.64.48%201.3.48.66%200%201.32-.5.66-.5.66-1.48%200-.98-.62-1.46-.62-.48-1.34-.48-.72%200-1.34.5-.62.5-.62%201.48%200%20.96.64%201.46zm11.412-1.44q0%20.84.56%201.32.56.46%201.18.46.64%200%201.18-.36.56-.38.9-.38.6%200%201.46%201.06.46.58.46%201.04%200%20.76-1.1%201.42-1.14.8-2.8.8-1.86%200-3.58-1.34-.82-.64-1.34-1.7-.52-1.08-.52-2.36%200-1.3.52-2.34.52-1.06%201.34-1.7%201.66-1.32%203.54-1.32.76%200%201.48.22.72.2%201.06.4l.32.2q.36.24.56.38.52.4.52.92%200%20.5-.42%201.14-.72%201.1-1.38%201.1-.38%200-1.08-.44-.36-.34-1.04-.34-.66%200-1.24.48-.58.48-.58%201.34z%22%20fill%3D%22green%22/%3E%3C/svg%3E"/>
 50 |         </a>
 51 | </div>
 52 |     </nav>
 53 |     <main class="pdoc">
 54 |             <section class="module-info">
 55 |                     <h1 class="modulename">
 56 | <a href="./../bbm25_haystack.html">bbm25_haystack</a><wbr>.filters    </h1>
 57 | 
 58 |                 
 59 |                         <input id="mod-filters-view-source" class="view-source-toggle-state" type="checkbox" aria-hidden="true" tabindex="-1">
 60 | 
 61 |                         <label class="view-source-button" for="mod-filters-view-source"><span>View Source</span></label>
 62 | 
 63 |                         <div class="pdoc-code codehilite"><pre><span></span><span id="L-1"><a href="#L-1"><span class="linenos">  1</span></a><span class="c1"># SPDX-FileCopyrightText: 2024-present Yuxuan Wang &lt;wangy49@seas.upenn.edu&gt;</span>
 64 | </span><span id="L-2"><a href="#L-2"><span class="linenos">  2</span></a><span class="c1">#</span>
 65 | </span><span id="L-3"><a href="#L-3"><span class="linenos">  3</span></a><span class="c1"># SPDX-License-Identifier: Apache-2.0</span>
 66 | </span><span id="L-4"><a href="#L-4"><span class="linenos">  4</span></a><span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Iterable</span>
 67 | </span><span id="L-5"><a href="#L-5"><span class="linenos">  5</span></a><span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">wraps</span>
 68 | </span><span id="L-6"><a href="#L-6"><span class="linenos">  6</span></a><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">Final</span><span class="p">,</span> <span class="n">Optional</span>
 69 | </span><span id="L-7"><a href="#L-7"><span class="linenos">  7</span></a>
 70 | </span><span id="L-8"><a href="#L-8"><span class="linenos">  8</span></a><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
 71 | </span><span id="L-9"><a href="#L-9"><span class="linenos">  9</span></a><span class="kn">from</span> <span class="nn">haystack.dataclasses</span> <span class="kn">import</span> <span class="n">Document</span>
 72 | </span><span id="L-10"><a href="#L-10"><span class="linenos"> 10</span></a><span class="kn">from</span> <span class="nn">haystack.errors</span> <span class="kn">import</span> <span class="n">FilterError</span>
 73 | </span><span id="L-11"><a href="#L-11"><span class="linenos"> 11</span></a>
 74 | </span><span id="L-12"><a href="#L-12"><span class="linenos"> 12</span></a>
 75 | </span><span id="L-13"><a href="#L-13"><span class="linenos"> 13</span></a><span class="k">def</span> <span class="nf">apply_filters_to_document</span><span class="p">(</span>
 76 | </span><span id="L-14"><a href="#L-14"><span class="linenos"> 14</span></a>    <span class="n">filters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="n">document</span><span class="p">:</span> <span class="n">Document</span>
 77 | </span><span id="L-15"><a href="#L-15"><span class="linenos"> 15</span></a><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
 78 | </span><span id="L-16"><a href="#L-16"><span class="linenos"> 16</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 79 | </span><span id="L-17"><a href="#L-17"><span class="linenos"> 17</span></a><span class="sd">    Apply filters to a document.</span>
 80 | </span><span id="L-18"><a href="#L-18"><span class="linenos"> 18</span></a>
 81 | </span><span id="L-19"><a href="#L-19"><span class="linenos"> 19</span></a><span class="sd">    :param filters: The filters to apply to the document.</span>
 82 | </span><span id="L-20"><a href="#L-20"><span class="linenos"> 20</span></a><span class="sd">    :type filters: dict[str, Any]</span>
 83 | </span><span id="L-21"><a href="#L-21"><span class="linenos"> 21</span></a><span class="sd">    :param document: The document to apply the filters to.</span>
 84 | </span><span id="L-22"><a href="#L-22"><span class="linenos"> 22</span></a><span class="sd">    :type document: Document</span>
 85 | </span><span id="L-23"><a href="#L-23"><span class="linenos"> 23</span></a>
 86 | </span><span id="L-24"><a href="#L-24"><span class="linenos"> 24</span></a><span class="sd">    :return: True if the document passes the filters.</span>
 87 | </span><span id="L-25"><a href="#L-25"><span class="linenos"> 25</span></a><span class="sd">    :rtype: bool</span>
 88 | </span><span id="L-26"><a href="#L-26"><span class="linenos"> 26</span></a><span class="sd">    &quot;&quot;&quot;</span>
 89 | </span><span id="L-27"><a href="#L-27"><span class="linenos"> 27</span></a>    <span class="k">if</span> <span class="n">filters</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">filters</span><span class="p">:</span>
 90 | </span><span id="L-28"><a href="#L-28"><span class="linenos"> 28</span></a>        <span class="k">return</span> <span class="kc">True</span>
 91 | </span><span id="L-29"><a href="#L-29"><span class="linenos"> 29</span></a>    <span class="k">return</span> <span class="n">_run_comparison_condition</span><span class="p">(</span><span class="n">filters</span><span class="p">,</span> <span class="n">document</span><span class="p">)</span>
 92 | </span><span id="L-30"><a href="#L-30"><span class="linenos"> 30</span></a>
 93 | </span><span id="L-31"><a href="#L-31"><span class="linenos"> 31</span></a>
 94 | </span><span id="L-32"><a href="#L-32"><span class="linenos"> 32</span></a><span class="k">def</span> <span class="nf">_get_document_field</span><span class="p">(</span><span class="n">document</span><span class="p">:</span> <span class="n">Document</span><span class="p">,</span> <span class="n">field</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]:</span>
 95 | </span><span id="L-33"><a href="#L-33"><span class="linenos"> 33</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 96 | </span><span id="L-34"><a href="#L-34"><span class="linenos"> 34</span></a><span class="sd">    Get the value of a field in a document.</span>
 97 | </span><span id="L-35"><a href="#L-35"><span class="linenos"> 35</span></a>
 98 | </span><span id="L-36"><a href="#L-36"><span class="linenos"> 36</span></a><span class="sd">    If the field is not found within the document then, instead of</span>
 99 | </span><span id="L-37"><a href="#L-37"><span class="linenos"> 37</span></a><span class="sd">    raising an error, `None` is returned. Note that here we do not</span>
100 | </span><span id="L-38"><a href="#L-38"><span class="linenos"> 38</span></a><span class="sd">    implicitly add &#39;meta&#39; prefix for fields that are not a direct</span>
101 | </span><span id="L-39"><a href="#L-39"><span class="linenos"> 39</span></a><span class="sd">    attribute of the document, not supporting legacy behavior anymore.</span>
102 | </span><span id="L-40"><a href="#L-40"><span class="linenos"> 40</span></a>
103 | </span><span id="L-41"><a href="#L-41"><span class="linenos"> 41</span></a><span class="sd">    :param document: The document to get the field value from.</span>
104 | </span><span id="L-42"><a href="#L-42"><span class="linenos"> 42</span></a><span class="sd">    :type document: Document</span>
105 | </span><span id="L-43"><a href="#L-43"><span class="linenos"> 43</span></a><span class="sd">    :param field: The field to get the value of.</span>
106 | </span><span id="L-44"><a href="#L-44"><span class="linenos"> 44</span></a><span class="sd">    :type field: str</span>
107 | </span><span id="L-45"><a href="#L-45"><span class="linenos"> 45</span></a>
108 | </span><span id="L-46"><a href="#L-46"><span class="linenos"> 46</span></a><span class="sd">    :return: The value of the field in the document.</span>
109 | </span><span id="L-47"><a href="#L-47"><span class="linenos"> 47</span></a><span class="sd">    :rtype: Optional[Any]</span>
110 | </span><span id="L-48"><a href="#L-48"><span class="linenos"> 48</span></a><span class="sd">    &quot;&quot;&quot;</span>
111 | </span><span id="L-49"><a href="#L-49"><span class="linenos"> 49</span></a>    <span class="k">if</span> <span class="s2">&quot;.&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">field</span><span class="p">:</span>
112 | </span><span id="L-50"><a href="#L-50"><span class="linenos"> 50</span></a>        <span class="k">return</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">document</span><span class="p">,</span> <span class="n">field</span><span class="p">)</span>
113 | </span><span id="L-51"><a href="#L-51"><span class="linenos"> 51</span></a>
114 | </span><span id="L-52"><a href="#L-52"><span class="linenos"> 52</span></a>    <span class="n">attr</span> <span class="o">=</span> <span class="n">document</span><span class="o">.</span><span class="n">meta</span>
115 | </span><span id="L-53"><a href="#L-53"><span class="linenos"> 53</span></a>    <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">field</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s2">&quot;.&quot;</span><span class="p">)[</span><span class="mi">1</span><span class="p">:]:</span>
116 | </span><span id="L-54"><a href="#L-54"><span class="linenos"> 54</span></a>        <span class="n">attr</span> <span class="o">=</span> <span class="n">attr</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
117 | </span><span id="L-55"><a href="#L-55"><span class="linenos"> 55</span></a>        <span class="k">if</span> <span class="n">attr</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
118 | </span><span id="L-56"><a href="#L-56"><span class="linenos"> 56</span></a>            <span class="k">return</span> <span class="kc">None</span>
119 | </span><span id="L-57"><a href="#L-57"><span class="linenos"> 57</span></a>    <span class="k">return</span> <span class="n">attr</span>
120 | </span><span id="L-58"><a href="#L-58"><span class="linenos"> 58</span></a>
121 | </span><span id="L-59"><a href="#L-59"><span class="linenos"> 59</span></a>
122 | </span><span id="L-60"><a href="#L-60"><span class="linenos"> 60</span></a><span class="k">def</span> <span class="nf">_run_logical_condition</span><span class="p">(</span><span class="n">condition</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">document</span><span class="p">:</span> <span class="n">Document</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
123 | </span><span id="L-61"><a href="#L-61"><span class="linenos"> 61</span></a>    <span class="k">if</span> <span class="s2">&quot;operator&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">condition</span><span class="p">:</span>
124 | </span><span id="L-62"><a href="#L-62"><span class="linenos"> 62</span></a>        <span class="n">msg</span> <span class="o">=</span> <span class="s2">&quot;Logical condition must have an &#39;operator&#39; key.&quot;</span>
125 | </span><span id="L-63"><a href="#L-63"><span class="linenos"> 63</span></a>        <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
126 | </span><span id="L-64"><a href="#L-64"><span class="linenos"> 64</span></a>    <span class="k">if</span> <span class="s2">&quot;conditions&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">condition</span><span class="p">:</span>
127 | </span><span id="L-65"><a href="#L-65"><span class="linenos"> 65</span></a>        <span class="n">msg</span> <span class="o">=</span> <span class="s2">&quot;Logical condition must have a &#39;conditions&#39; key.&quot;</span>
128 | </span><span id="L-66"><a href="#L-66"><span class="linenos"> 66</span></a>        <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
129 | </span><span id="L-67"><a href="#L-67"><span class="linenos"> 67</span></a>
130 | </span><span id="L-68"><a href="#L-68"><span class="linenos"> 68</span></a>    <span class="n">conditions</span> <span class="o">=</span> <span class="n">condition</span><span class="p">[</span><span class="s2">&quot;conditions&quot;</span><span class="p">]</span>
131 | </span><span id="L-69"><a href="#L-69"><span class="linenos"> 69</span></a>    <span class="n">reducer</span> <span class="o">=</span> <span class="n">LOGICAL_OPERATORS</span><span class="p">[</span><span class="n">condition</span><span class="p">[</span><span class="s2">&quot;operator&quot;</span><span class="p">]]</span>
132 | </span><span id="L-70"><a href="#L-70"><span class="linenos"> 70</span></a>
133 | </span><span id="L-71"><a href="#L-71"><span class="linenos"> 71</span></a>    <span class="k">return</span> <span class="n">reducer</span><span class="p">(</span><span class="n">document</span><span class="p">,</span> <span class="n">conditions</span><span class="p">)</span>
134 | </span><span id="L-72"><a href="#L-72"><span class="linenos"> 72</span></a>
135 | </span><span id="L-73"><a href="#L-73"><span class="linenos"> 73</span></a>
136 | </span><span id="L-74"><a href="#L-74"><span class="linenos"> 74</span></a><span class="k">def</span> <span class="nf">_run_comparison_condition</span><span class="p">(</span><span class="n">condition</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">document</span><span class="p">:</span> <span class="n">Document</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
137 | </span><span id="L-75"><a href="#L-75"><span class="linenos"> 75</span></a>    <span class="k">if</span> <span class="s2">&quot;field&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">condition</span><span class="p">:</span>
138 | </span><span id="L-76"><a href="#L-76"><span class="linenos"> 76</span></a>        <span class="k">return</span> <span class="n">_run_logical_condition</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">document</span><span class="p">)</span>
139 | </span><span id="L-77"><a href="#L-77"><span class="linenos"> 77</span></a>
140 | </span><span id="L-78"><a href="#L-78"><span class="linenos"> 78</span></a>    <span class="k">if</span> <span class="s2">&quot;operator&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">condition</span><span class="p">:</span>
141 | </span><span id="L-79"><a href="#L-79"><span class="linenos"> 79</span></a>        <span class="n">msg</span> <span class="o">=</span> <span class="s2">&quot;Comparison condition must have an &#39;operator&#39; key.&quot;</span>
142 | </span><span id="L-80"><a href="#L-80"><span class="linenos"> 80</span></a>        <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
143 | </span><span id="L-81"><a href="#L-81"><span class="linenos"> 81</span></a>    <span class="k">if</span> <span class="s2">&quot;value&quot;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">condition</span><span class="p">:</span>
144 | </span><span id="L-82"><a href="#L-82"><span class="linenos"> 82</span></a>        <span class="n">msg</span> <span class="o">=</span> <span class="s2">&quot;Comparison condition must have a &#39;value&#39; key.&quot;</span>
145 | </span><span id="L-83"><a href="#L-83"><span class="linenos"> 83</span></a>        <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
146 | </span><span id="L-84"><a href="#L-84"><span class="linenos"> 84</span></a>
147 | </span><span id="L-85"><a href="#L-85"><span class="linenos"> 85</span></a>    <span class="n">field</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">condition</span><span class="p">[</span><span class="s2">&quot;field&quot;</span><span class="p">]</span>
148 | </span><span id="L-86"><a href="#L-86"><span class="linenos"> 86</span></a>    <span class="n">value</span><span class="p">:</span> <span class="n">Any</span> <span class="o">=</span> <span class="n">condition</span><span class="p">[</span><span class="s2">&quot;value&quot;</span><span class="p">]</span>
149 | </span><span id="L-87"><a href="#L-87"><span class="linenos"> 87</span></a>    <span class="n">comparator</span> <span class="o">=</span> <span class="n">COMPARISON_OPERATORS</span><span class="p">[</span><span class="n">condition</span><span class="p">[</span><span class="s2">&quot;operator&quot;</span><span class="p">]]</span>
150 | </span><span id="L-88"><a href="#L-88"><span class="linenos"> 88</span></a>
151 | </span><span id="L-89"><a href="#L-89"><span class="linenos"> 89</span></a>    <span class="k">return</span> <span class="n">comparator</span><span class="p">(</span><span class="n">_get_document_field</span><span class="p">(</span><span class="n">document</span><span class="p">,</span> <span class="n">field</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span>
152 | </span><span id="L-90"><a href="#L-90"><span class="linenos"> 90</span></a>
153 | </span><span id="L-91"><a href="#L-91"><span class="linenos"> 91</span></a>
154 | </span><span id="L-92"><a href="#L-92"><span class="linenos"> 92</span></a><span class="k">def</span> <span class="nf">_and</span><span class="p">(</span><span class="n">document</span><span class="p">:</span> <span class="n">Document</span><span class="p">,</span> <span class="n">conditions</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
155 | </span><span id="L-93"><a href="#L-93"><span class="linenos"> 93</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
156 | </span><span id="L-94"><a href="#L-94"><span class="linenos"> 94</span></a><span class="sd">    Return True if all conditions are met.</span>
157 | </span><span id="L-95"><a href="#L-95"><span class="linenos"> 95</span></a>
158 | </span><span id="L-96"><a href="#L-96"><span class="linenos"> 96</span></a><span class="sd">    :param document: The document to check the conditions against.</span>
159 | </span><span id="L-97"><a href="#L-97"><span class="linenos"> 97</span></a><span class="sd">    :type document: Document</span>
160 | </span><span id="L-98"><a href="#L-98"><span class="linenos"> 98</span></a><span class="sd">    :param conditions: The conditions to check against the document.</span>
161 | </span><span id="L-99"><a href="#L-99"><span class="linenos"> 99</span></a><span class="sd">    :type conditions: list[dict[str, Any]]</span>
162 | </span><span id="L-100"><a href="#L-100"><span class="linenos">100</span></a>
163 | </span><span id="L-101"><a href="#L-101"><span class="linenos">101</span></a><span class="sd">    :return: True if not all conditions are met.</span>
164 | </span><span id="L-102"><a href="#L-102"><span class="linenos">102</span></a><span class="sd">    :rtype: bool</span>
165 | </span><span id="L-103"><a href="#L-103"><span class="linenos">103</span></a><span class="sd">    &quot;&quot;&quot;</span>
166 | </span><span id="L-104"><a href="#L-104"><span class="linenos">104</span></a>    <span class="k">return</span> <span class="nb">all</span><span class="p">(</span>
167 | </span><span id="L-105"><a href="#L-105"><span class="linenos">105</span></a>        <span class="n">_run_comparison_condition</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">document</span><span class="p">)</span> <span class="k">for</span> <span class="n">condition</span> <span class="ow">in</span> <span class="n">conditions</span>
168 | </span><span id="L-106"><a href="#L-106"><span class="linenos">106</span></a>    <span class="p">)</span>
169 | </span><span id="L-107"><a href="#L-107"><span class="linenos">107</span></a>
170 | </span><span id="L-108"><a href="#L-108"><span class="linenos">108</span></a>
171 | </span><span id="L-109"><a href="#L-109"><span class="linenos">109</span></a><span class="k">def</span> <span class="nf">_or</span><span class="p">(</span><span class="n">document</span><span class="p">:</span> <span class="n">Document</span><span class="p">,</span> <span class="n">conditions</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
172 | </span><span id="L-110"><a href="#L-110"><span class="linenos">110</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
173 | </span><span id="L-111"><a href="#L-111"><span class="linenos">111</span></a><span class="sd">    Return True if any condition is met.</span>
174 | </span><span id="L-112"><a href="#L-112"><span class="linenos">112</span></a>
175 | </span><span id="L-113"><a href="#L-113"><span class="linenos">113</span></a><span class="sd">    :param document: The document to check the conditions against.</span>
176 | </span><span id="L-114"><a href="#L-114"><span class="linenos">114</span></a><span class="sd">    :type document: Document</span>
177 | </span><span id="L-115"><a href="#L-115"><span class="linenos">115</span></a><span class="sd">    :param conditions: The conditions to check against the document.</span>
178 | </span><span id="L-116"><a href="#L-116"><span class="linenos">116</span></a><span class="sd">    :type conditions: list[dict[str, Any]]</span>
179 | </span><span id="L-117"><a href="#L-117"><span class="linenos">117</span></a>
180 | </span><span id="L-118"><a href="#L-118"><span class="linenos">118</span></a><span class="sd">    :return: True if not all conditions are met.</span>
181 | </span><span id="L-119"><a href="#L-119"><span class="linenos">119</span></a><span class="sd">    :rtype: bool</span>
182 | </span><span id="L-120"><a href="#L-120"><span class="linenos">120</span></a><span class="sd">    &quot;&quot;&quot;</span>
183 | </span><span id="L-121"><a href="#L-121"><span class="linenos">121</span></a>    <span class="k">return</span> <span class="nb">any</span><span class="p">(</span><span class="n">_run_comparison_condition</span><span class="p">(</span><span class="n">cond</span><span class="p">,</span> <span class="n">document</span><span class="p">)</span> <span class="k">for</span> <span class="n">cond</span> <span class="ow">in</span> <span class="n">conditions</span><span class="p">)</span>
184 | </span><span id="L-122"><a href="#L-122"><span class="linenos">122</span></a>
185 | </span><span id="L-123"><a href="#L-123"><span class="linenos">123</span></a>
186 | </span><span id="L-124"><a href="#L-124"><span class="linenos">124</span></a><span class="k">def</span> <span class="nf">_not</span><span class="p">(</span><span class="n">document</span><span class="p">:</span> <span class="n">Document</span><span class="p">,</span> <span class="n">conditions</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
187 | </span><span id="L-125"><a href="#L-125"><span class="linenos">125</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
188 | </span><span id="L-126"><a href="#L-126"><span class="linenos">126</span></a><span class="sd">    Return True if not all conditions are met.</span>
189 | </span><span id="L-127"><a href="#L-127"><span class="linenos">127</span></a>
190 | </span><span id="L-128"><a href="#L-128"><span class="linenos">128</span></a><span class="sd">    The &#39;NOT&#39; operator is under-specified when supplied with a</span>
191 | </span><span id="L-129"><a href="#L-129"><span class="linenos">129</span></a><span class="sd">    set of conditions instead of a single condition. Because we</span>
192 | </span><span id="L-130"><a href="#L-130"><span class="linenos">130</span></a><span class="sd">    can have the semantics of &#39;at least one False&#39; versus</span>
193 | </span><span id="L-131"><a href="#L-131"><span class="linenos">131</span></a><span class="sd">    &#39;all False&#39;. Here we choose to comply with the official</span>
194 | </span><span id="L-132"><a href="#L-132"><span class="linenos">132</span></a><span class="sd">    implementation of Haystack (the &#39;at least one False&#39; semantics).</span>
195 | </span><span id="L-133"><a href="#L-133"><span class="linenos">133</span></a>
196 | </span><span id="L-134"><a href="#L-134"><span class="linenos">134</span></a><span class="sd">    :param document: The document to check the conditions against.</span>
197 | </span><span id="L-135"><a href="#L-135"><span class="linenos">135</span></a><span class="sd">    :type document: Document</span>
198 | </span><span id="L-136"><a href="#L-136"><span class="linenos">136</span></a><span class="sd">    :param conditions: The conditions to check against the document.</span>
199 | </span><span id="L-137"><a href="#L-137"><span class="linenos">137</span></a><span class="sd">    :type conditions: list[dict[str, Any]]</span>
200 | </span><span id="L-138"><a href="#L-138"><span class="linenos">138</span></a>
201 | </span><span id="L-139"><a href="#L-139"><span class="linenos">139</span></a><span class="sd">    :return: True if not all conditions are met.</span>
202 | </span><span id="L-140"><a href="#L-140"><span class="linenos">140</span></a><span class="sd">    :rtype: bool</span>
203 | </span><span id="L-141"><a href="#L-141"><span class="linenos">141</span></a><span class="sd">    &quot;&quot;&quot;</span>
204 | </span><span id="L-142"><a href="#L-142"><span class="linenos">142</span></a>    <span class="k">return</span> <span class="ow">not</span> <span class="n">_and</span><span class="p">(</span><span class="n">document</span><span class="p">,</span> <span class="n">conditions</span><span class="p">)</span>
205 | </span><span id="L-143"><a href="#L-143"><span class="linenos">143</span></a>
206 | </span><span id="L-144"><a href="#L-144"><span class="linenos">144</span></a>
207 | </span><span id="L-145"><a href="#L-145"><span class="linenos">145</span></a><span class="k">def</span> <span class="nf">_check_comparator_inputs</span><span class="p">(</span>
208 | </span><span id="L-146"><a href="#L-146"><span class="linenos">146</span></a>    <span class="n">comparator</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]</span>
209 | </span><span id="L-147"><a href="#L-147"><span class="linenos">147</span></a><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="nb">bool</span><span class="p">]:</span>
210 | </span><span id="L-148"><a href="#L-148"><span class="linenos">148</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
211 | </span><span id="L-149"><a href="#L-149"><span class="linenos">149</span></a><span class="sd">    A decorator to check and preprocess input attribute values.</span>
212 | </span><span id="L-150"><a href="#L-150"><span class="linenos">150</span></a>
213 | </span><span id="L-151"><a href="#L-151"><span class="linenos">151</span></a><span class="sd">    ALL COMPARISON OPERATORS SHOULD BE WRAPPED WITH THIS DECORATOR.</span>
214 | </span><span id="L-152"><a href="#L-152"><span class="linenos">152</span></a><span class="sd">    because a `False` may be returned by both input validation and</span>
215 | </span><span id="L-153"><a href="#L-153"><span class="linenos">153</span></a><span class="sd">    the actual comparison. This decorator ensures that the comparison</span>
216 | </span><span id="L-154"><a href="#L-154"><span class="linenos">154</span></a><span class="sd">    function is only called if the input values are valid.</span>
217 | </span><span id="L-155"><a href="#L-155"><span class="linenos">155</span></a>
218 | </span><span id="L-156"><a href="#L-156"><span class="linenos">156</span></a><span class="sd">    :param comparator: The comparator function to wrap.</span>
219 | </span><span id="L-157"><a href="#L-157"><span class="linenos">157</span></a><span class="sd">    :type comparator: Callable[[Any, Any], bool]</span>
220 | </span><span id="L-158"><a href="#L-158"><span class="linenos">158</span></a>
221 | </span><span id="L-159"><a href="#L-159"><span class="linenos">159</span></a><span class="sd">    :return: The wrapped comparator function.</span>
222 | </span><span id="L-160"><a href="#L-160"><span class="linenos">160</span></a><span class="sd">    :rtype: Callable[[Any, Any], bool]</span>
223 | </span><span id="L-161"><a href="#L-161"><span class="linenos">161</span></a><span class="sd">    &quot;&quot;&quot;</span>
224 | </span><span id="L-162"><a href="#L-162"><span class="linenos">162</span></a>
225 | </span><span id="L-163"><a href="#L-163"><span class="linenos">163</span></a>    <span class="nd">@wraps</span><span class="p">(</span><span class="n">comparator</span><span class="p">)</span>
226 | </span><span id="L-164"><a href="#L-164"><span class="linenos">164</span></a>    <span class="k">def</span> <span class="nf">_wrapper</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
227 | </span><span id="L-165"><a href="#L-165"><span class="linenos">165</span></a>
228 | </span><span id="L-166"><a href="#L-166"><span class="linenos">166</span></a>        <span class="c1"># I think allowing comparison between DataFrames would</span>
229 | </span><span id="L-167"><a href="#L-167"><span class="linenos">167</span></a>        <span class="c1"># be a really bad idea because it would create unexpected</span>
230 | </span><span id="L-168"><a href="#L-168"><span class="linenos">168</span></a>        <span class="c1"># behavior, but I am open to discussion on this.</span>
231 | </span><span id="L-169"><a href="#L-169"><span class="linenos">169</span></a>        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">fv</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
232 | </span><span id="L-170"><a href="#L-170"><span class="linenos">170</span></a>            <span class="n">msg</span> <span class="o">=</span> <span class="p">(</span>
233 | </span><span id="L-171"><a href="#L-171"><span class="linenos">171</span></a>                <span class="s2">&quot;Cannot compare DataFrames. Please convert them to &quot;</span>
234 | </span><span id="L-172"><a href="#L-172"><span class="linenos">172</span></a>                <span class="s2">&quot;simpler data structures before comparing.&quot;</span>
235 | </span><span id="L-173"><a href="#L-173"><span class="linenos">173</span></a>            <span class="p">)</span>
236 | </span><span id="L-174"><a href="#L-174"><span class="linenos">174</span></a>            <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
237 | </span><span id="L-175"><a href="#L-175"><span class="linenos">175</span></a>
238 | </span><span id="L-176"><a href="#L-176"><span class="linenos">176</span></a>        <span class="c1"># I think comparison between missing values is ambiguous,</span>
239 | </span><span id="L-177"><a href="#L-177"><span class="linenos">177</span></a>        <span class="c1"># but again, I am open to discussion on this. Here I choose</span>
240 | </span><span id="L-178"><a href="#L-178"><span class="linenos">178</span></a>        <span class="c1"># to return False if either value is None because from a</span>
241 | </span><span id="L-179"><a href="#L-179"><span class="linenos">179</span></a>        <span class="c1"># logical perspective, we really cannot say anything about</span>
242 | </span><span id="L-180"><a href="#L-180"><span class="linenos">180</span></a>        <span class="c1"># the comparison between a missing value and a non-missing.</span>
243 | </span><span id="L-181"><a href="#L-181"><span class="linenos">181</span></a>        <span class="k">if</span> <span class="n">dv</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">fv</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
244 | </span><span id="L-182"><a href="#L-182"><span class="linenos">182</span></a>            <span class="k">return</span> <span class="kc">False</span>
245 | </span><span id="L-183"><a href="#L-183"><span class="linenos">183</span></a>
246 | </span><span id="L-184"><a href="#L-184"><span class="linenos">184</span></a>        <span class="k">try</span><span class="p">:</span>
247 | </span><span id="L-185"><a href="#L-185"><span class="linenos">185</span></a>            <span class="k">return</span> <span class="n">comparator</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span>
248 | </span><span id="L-186"><a href="#L-186"><span class="linenos">186</span></a>        <span class="k">except</span> <span class="ne">TypeError</span> <span class="k">as</span> <span class="n">exc</span><span class="p">:</span>
249 | </span><span id="L-187"><a href="#L-187"><span class="linenos">187</span></a>            <span class="n">msg</span> <span class="o">=</span> <span class="p">(</span>
250 | </span><span id="L-188"><a href="#L-188"><span class="linenos">188</span></a>                <span class="sa">f</span><span class="s2">&quot;Cannot compare document value of </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">dv</span><span class="p">)</span><span class="si">}</span><span class="s2"> type &quot;</span>
251 | </span><span id="L-189"><a href="#L-189"><span class="linenos">189</span></a>                <span class="sa">f</span><span class="s2">&quot;with filter value of </span><span class="si">{</span><span class="nb">type</span><span class="p">(</span><span class="n">fv</span><span class="p">)</span><span class="si">}</span><span class="s2"> type.&quot;</span>
252 | </span><span id="L-190"><a href="#L-190"><span class="linenos">190</span></a>            <span class="p">)</span>
253 | </span><span id="L-191"><a href="#L-191"><span class="linenos">191</span></a>            <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span> <span class="kn">from</span> <span class="nn">exc</span>
254 | </span><span id="L-192"><a href="#L-192"><span class="linenos">192</span></a>
255 | </span><span id="L-193"><a href="#L-193"><span class="linenos">193</span></a>    <span class="k">return</span> <span class="n">_wrapper</span>
256 | </span><span id="L-194"><a href="#L-194"><span class="linenos">194</span></a>
257 | </span><span id="L-195"><a href="#L-195"><span class="linenos">195</span></a>
258 | </span><span id="L-196"><a href="#L-196"><span class="linenos">196</span></a><span class="nd">@_check_comparator_inputs</span>
259 | </span><span id="L-197"><a href="#L-197"><span class="linenos">197</span></a><span class="k">def</span> <span class="nf">_eq</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
260 | </span><span id="L-198"><a href="#L-198"><span class="linenos">198</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
261 | </span><span id="L-199"><a href="#L-199"><span class="linenos">199</span></a><span class="sd">    Conservative implementation of equal comparison.</span>
262 | </span><span id="L-200"><a href="#L-200"><span class="linenos">200</span></a>
263 | </span><span id="L-201"><a href="#L-201"><span class="linenos">201</span></a><span class="sd">    There are two major differences between this implementation</span>
264 | </span><span id="L-202"><a href="#L-202"><span class="linenos">202</span></a><span class="sd">    and the default Haystack filter implementation:</span>
265 | </span><span id="L-203"><a href="#L-203"><span class="linenos">203</span></a><span class="sd">        - If both values are None, we return False, instead of True.</span>
266 | </span><span id="L-204"><a href="#L-204"><span class="linenos">204</span></a><span class="sd">        - If any value is a DataFrame, we raise an error, instead</span>
267 | </span><span id="L-205"><a href="#L-205"><span class="linenos">205</span></a><span class="sd">            of converting them to JSON.</span>
268 | </span><span id="L-206"><a href="#L-206"><span class="linenos">206</span></a><span class="sd">    &quot;&quot;&quot;</span>
269 | </span><span id="L-207"><a href="#L-207"><span class="linenos">207</span></a>    <span class="k">return</span> <span class="n">dv</span> <span class="o">==</span> <span class="n">fv</span>
270 | </span><span id="L-208"><a href="#L-208"><span class="linenos">208</span></a>
271 | </span><span id="L-209"><a href="#L-209"><span class="linenos">209</span></a>
272 | </span><span id="L-210"><a href="#L-210"><span class="linenos">210</span></a><span class="nd">@_check_comparator_inputs</span>
273 | </span><span id="L-211"><a href="#L-211"><span class="linenos">211</span></a><span class="k">def</span> <span class="nf">_ne</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
274 | </span><span id="L-212"><a href="#L-212"><span class="linenos">212</span></a>    <span class="k">return</span> <span class="ow">not</span> <span class="n">_eq</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span>
275 | </span><span id="L-213"><a href="#L-213"><span class="linenos">213</span></a>
276 | </span><span id="L-214"><a href="#L-214"><span class="linenos">214</span></a>
277 | </span><span id="L-215"><a href="#L-215"><span class="linenos">215</span></a><span class="nd">@_check_comparator_inputs</span>
278 | </span><span id="L-216"><a href="#L-216"><span class="linenos">216</span></a><span class="k">def</span> <span class="nf">_gt</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
279 | </span><span id="L-217"><a href="#L-217"><span class="linenos">217</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
280 | </span><span id="L-218"><a href="#L-218"><span class="linenos">218</span></a><span class="sd">    A more liberal implementation with less surprises.</span>
281 | </span><span id="L-219"><a href="#L-219"><span class="linenos">219</span></a>
282 | </span><span id="L-220"><a href="#L-220"><span class="linenos">220</span></a><span class="sd">    Simply compare the two values with default Python comparison.</span>
283 | </span><span id="L-221"><a href="#L-221"><span class="linenos">221</span></a><span class="sd">    We do not perform any conversion here to have the behavior</span>
284 | </span><span id="L-222"><a href="#L-222"><span class="linenos">222</span></a><span class="sd">    more predictable. If we want to compare the dates, we should</span>
285 | </span><span id="L-223"><a href="#L-223"><span class="linenos">223</span></a><span class="sd">    just convert the document value and filter value explicitly</span>
286 | </span><span id="L-224"><a href="#L-224"><span class="linenos">224</span></a><span class="sd">    to dates before comparing them.</span>
287 | </span><span id="L-225"><a href="#L-225"><span class="linenos">225</span></a><span class="sd">    &quot;&quot;&quot;</span>
288 | </span><span id="L-226"><a href="#L-226"><span class="linenos">226</span></a>    <span class="k">return</span> <span class="n">dv</span> <span class="o">&gt;</span> <span class="n">fv</span>
289 | </span><span id="L-227"><a href="#L-227"><span class="linenos">227</span></a>
290 | </span><span id="L-228"><a href="#L-228"><span class="linenos">228</span></a>
291 | </span><span id="L-229"><a href="#L-229"><span class="linenos">229</span></a><span class="nd">@_check_comparator_inputs</span>
292 | </span><span id="L-230"><a href="#L-230"><span class="linenos">230</span></a><span class="k">def</span> <span class="nf">_lt</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
293 | </span><span id="L-231"><a href="#L-231"><span class="linenos">231</span></a>    <span class="k">return</span> <span class="n">dv</span> <span class="o">&lt;</span> <span class="n">fv</span>
294 | </span><span id="L-232"><a href="#L-232"><span class="linenos">232</span></a>
295 | </span><span id="L-233"><a href="#L-233"><span class="linenos">233</span></a>
296 | </span><span id="L-234"><a href="#L-234"><span class="linenos">234</span></a><span class="nd">@_check_comparator_inputs</span>
297 | </span><span id="L-235"><a href="#L-235"><span class="linenos">235</span></a><span class="k">def</span> <span class="nf">_gte</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
298 | </span><span id="L-236"><a href="#L-236"><span class="linenos">236</span></a>    <span class="k">return</span> <span class="n">_gt</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span> <span class="ow">or</span> <span class="n">_eq</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span>
299 | </span><span id="L-237"><a href="#L-237"><span class="linenos">237</span></a>
300 | </span><span id="L-238"><a href="#L-238"><span class="linenos">238</span></a>
301 | </span><span id="L-239"><a href="#L-239"><span class="linenos">239</span></a><span class="nd">@_check_comparator_inputs</span>
302 | </span><span id="L-240"><a href="#L-240"><span class="linenos">240</span></a><span class="k">def</span> <span class="nf">_lte</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
303 | </span><span id="L-241"><a href="#L-241"><span class="linenos">241</span></a>    <span class="k">return</span> <span class="n">_lt</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span> <span class="ow">or</span> <span class="n">_eq</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span>
304 | </span><span id="L-242"><a href="#L-242"><span class="linenos">242</span></a>
305 | </span><span id="L-243"><a href="#L-243"><span class="linenos">243</span></a>
306 | </span><span id="L-244"><a href="#L-244"><span class="linenos">244</span></a><span class="nd">@_check_comparator_inputs</span>
307 | </span><span id="L-245"><a href="#L-245"><span class="linenos">245</span></a><span class="k">def</span> <span class="nf">_in</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
308 | </span><span id="L-246"><a href="#L-246"><span class="linenos">246</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
309 | </span><span id="L-247"><a href="#L-247"><span class="linenos">247</span></a><span class="sd">    Allowing iterable filter values not just lists.</span>
310 | </span><span id="L-248"><a href="#L-248"><span class="linenos">248</span></a>
311 | </span><span id="L-249"><a href="#L-249"><span class="linenos">249</span></a><span class="sd">    This implementation permits a larger set of filter values</span>
312 | </span><span id="L-250"><a href="#L-250"><span class="linenos">250</span></a><span class="sd">    such as tuples, sets, and other iterable objects.</span>
313 | </span><span id="L-251"><a href="#L-251"><span class="linenos">251</span></a><span class="sd">    &quot;&quot;&quot;</span>
314 | </span><span id="L-252"><a href="#L-252"><span class="linenos">252</span></a>    <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">fv</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">):</span>
315 | </span><span id="L-253"><a href="#L-253"><span class="linenos">253</span></a>        <span class="n">msg</span> <span class="o">=</span> <span class="s2">&quot;Filter value must be an iterable for &#39;in&#39; comparison.&quot;</span>
316 | </span><span id="L-254"><a href="#L-254"><span class="linenos">254</span></a>        <span class="k">raise</span> <span class="n">FilterError</span><span class="p">(</span><span class="n">msg</span><span class="p">)</span>
317 | </span><span id="L-255"><a href="#L-255"><span class="linenos">255</span></a>
318 | </span><span id="L-256"><a href="#L-256"><span class="linenos">256</span></a>    <span class="k">return</span> <span class="nb">any</span><span class="p">(</span><span class="n">_eq</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">fv</span><span class="p">)</span>
319 | </span><span id="L-257"><a href="#L-257"><span class="linenos">257</span></a>
320 | </span><span id="L-258"><a href="#L-258"><span class="linenos">258</span></a>
321 | </span><span id="L-259"><a href="#L-259"><span class="linenos">259</span></a><span class="nd">@_check_comparator_inputs</span>
322 | </span><span id="L-260"><a href="#L-260"><span class="linenos">260</span></a><span class="k">def</span> <span class="nf">_nin</span><span class="p">(</span><span class="n">dv</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">fv</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
323 | </span><span id="L-261"><a href="#L-261"><span class="linenos">261</span></a>    <span class="k">return</span> <span class="ow">not</span> <span class="n">_in</span><span class="p">(</span><span class="n">dv</span><span class="p">,</span> <span class="n">fv</span><span class="p">)</span>
324 | </span><span id="L-262"><a href="#L-262"><span class="linenos">262</span></a>
325 | </span><span id="L-263"><a href="#L-263"><span class="linenos">263</span></a>
326 | </span><span id="L-264"><a href="#L-264"><span class="linenos">264</span></a><span class="n">LOGICAL_OPERATORS</span><span class="p">:</span> <span class="n">Final</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;NOT&quot;</span><span class="p">:</span> <span class="n">_not</span><span class="p">,</span> <span class="s2">&quot;AND&quot;</span><span class="p">:</span> <span class="n">_and</span><span class="p">,</span> <span class="s2">&quot;OR&quot;</span><span class="p">:</span> <span class="n">_or</span><span class="p">}</span>
327 | </span><span id="L-265"><a href="#L-265"><span class="linenos">265</span></a>
328 | </span><span id="L-266"><a href="#L-266"><span class="linenos">266</span></a><span class="n">COMPARISON_OPERATORS</span><span class="p">:</span> <span class="n">Final</span> <span class="o">=</span> <span class="p">{</span>
329 | </span><span id="L-267"><a href="#L-267"><span class="linenos">267</span></a>    <span class="s2">&quot;==&quot;</span><span class="p">:</span> <span class="n">_eq</span><span class="p">,</span>
330 | </span><span id="L-268"><a href="#L-268"><span class="linenos">268</span></a>    <span class="s2">&quot;!=&quot;</span><span class="p">:</span> <span class="n">_ne</span><span class="p">,</span>
331 | </span><span id="L-269"><a href="#L-269"><span class="linenos">269</span></a>    <span class="s2">&quot;&gt;&quot;</span><span class="p">:</span> <span class="n">_gt</span><span class="p">,</span>
332 | </span><span id="L-270"><a href="#L-270"><span class="linenos">270</span></a>    <span class="s2">&quot;&lt;&quot;</span><span class="p">:</span> <span class="n">_lt</span><span class="p">,</span>
333 | </span><span id="L-271"><a href="#L-271"><span class="linenos">271</span></a>    <span class="s2">&quot;&gt;=&quot;</span><span class="p">:</span> <span class="n">_gte</span><span class="p">,</span>
334 | </span><span id="L-272"><a href="#L-272"><span class="linenos">272</span></a>    <span class="s2">&quot;&lt;=&quot;</span><span class="p">:</span> <span class="n">_lte</span><span class="p">,</span>
335 | </span><span id="L-273"><a href="#L-273"><span class="linenos">273</span></a>    <span class="s2">&quot;in&quot;</span><span class="p">:</span> <span class="n">_in</span><span class="p">,</span>
336 | </span><span id="L-274"><a href="#L-274"><span class="linenos">274</span></a>    <span class="s2">&quot;not in&quot;</span><span class="p">:</span> <span class="n">_nin</span><span class="p">,</span>
337 | </span><span id="L-275"><a href="#L-275"><span class="linenos">275</span></a><span class="p">}</span>
338 | </span></pre></div>
339 | 
340 | 
341 |             </section>
342 |                 <section id="apply_filters_to_document">
343 |                             <input id="apply_filters_to_document-view-source" class="view-source-toggle-state" type="checkbox" aria-hidden="true" tabindex="-1">
344 | <div class="attr function">
345 |             
346 |         <span class="def">def</span>
347 |         <span class="name">apply_filters_to_document</span><span class="signature pdoc-code multiline">(<span class="param">	<span class="n">filters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span>,</span><span class="param">	<span class="n">document</span><span class="p">:</span> <span class="n">haystack</span><span class="o">.</span><span class="n">dataclasses</span><span class="o">.</span><span class="n">document</span><span class="o">.</span><span class="n">Document</span></span><span class="return-annotation">) -> <span class="nb">bool</span>:</span></span>
348 | 
349 |                 <label class="view-source-button" for="apply_filters_to_document-view-source"><span>View Source</span></label>
350 | 
351 |     </div>
352 |     <a class="headerlink" href="#apply_filters_to_document"></a>
353 |             <div class="pdoc-code codehilite"><pre><span></span><span id="apply_filters_to_document-14"><a href="#apply_filters_to_document-14"><span class="linenos">14</span></a><span class="k">def</span> <span class="nf">apply_filters_to_document</span><span class="p">(</span>
354 | </span><span id="apply_filters_to_document-15"><a href="#apply_filters_to_document-15"><span class="linenos">15</span></a>    <span class="n">filters</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="n">document</span><span class="p">:</span> <span class="n">Document</span>
355 | </span><span id="apply_filters_to_document-16"><a href="#apply_filters_to_document-16"><span class="linenos">16</span></a><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
356 | </span><span id="apply_filters_to_document-17"><a href="#apply_filters_to_document-17"><span class="linenos">17</span></a><span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
357 | </span><span id="apply_filters_to_document-18"><a href="#apply_filters_to_document-18"><span class="linenos">18</span></a><span class="sd">    Apply filters to a document.</span>
358 | </span><span id="apply_filters_to_document-19"><a href="#apply_filters_to_document-19"><span class="linenos">19</span></a>
359 | </span><span id="apply_filters_to_document-20"><a href="#apply_filters_to_document-20"><span class="linenos">20</span></a><span class="sd">    :param filters: The filters to apply to the document.</span>
360 | </span><span id="apply_filters_to_document-21"><a href="#apply_filters_to_document-21"><span class="linenos">21</span></a><span class="sd">    :type filters: dict[str, Any]</span>
361 | </span><span id="apply_filters_to_document-22"><a href="#apply_filters_to_document-22"><span class="linenos">22</span></a><span class="sd">    :param document: The document to apply the filters to.</span>
362 | </span><span id="apply_filters_to_document-23"><a href="#apply_filters_to_document-23"><span class="linenos">23</span></a><span class="sd">    :type document: Document</span>
363 | </span><span id="apply_filters_to_document-24"><a href="#apply_filters_to_document-24"><span class="linenos">24</span></a>
364 | </span><span id="apply_filters_to_document-25"><a href="#apply_filters_to_document-25"><span class="linenos">25</span></a><span class="sd">    :return: True if the document passes the filters.</span>
365 | </span><span id="apply_filters_to_document-26"><a href="#apply_filters_to_document-26"><span class="linenos">26</span></a><span class="sd">    :rtype: bool</span>
366 | </span><span id="apply_filters_to_document-27"><a href="#apply_filters_to_document-27"><span class="linenos">27</span></a><span class="sd">    &quot;&quot;&quot;</span>
367 | </span><span id="apply_filters_to_document-28"><a href="#apply_filters_to_document-28"><span class="linenos">28</span></a>    <span class="k">if</span> <span class="n">filters</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">filters</span><span class="p">:</span>
368 | </span><span id="apply_filters_to_document-29"><a href="#apply_filters_to_document-29"><span class="linenos">29</span></a>        <span class="k">return</span> <span class="kc">True</span>
369 | </span><span id="apply_filters_to_document-30"><a href="#apply_filters_to_document-30"><span class="linenos">30</span></a>    <span class="k">return</span> <span class="n">_run_comparison_condition</span><span class="p">(</span><span class="n">filters</span><span class="p">,</span> <span class="n">document</span><span class="p">)</span>
370 | </span></pre></div>
371 | 
372 | 
373 |             <div class="docstring"><p>Apply filters to a document.</p>
374 | 
375 | <h6 id="parameters">Parameters</h6>
376 | 
377 | <ul>
378 | <li><strong>filters</strong>:  The filters to apply to the document.</li>
379 | <li><strong>document</strong>:  The document to apply the filters to.</li>
380 | </ul>
381 | 
382 | <h6 id="returns">Returns</h6>
383 | 
384 | <blockquote>
385 |   <p>True if the document passes the filters.</p>
386 | </blockquote>
387 | </div>
388 | 
389 | 
390 |                 </section>
391 |                 <section id="LOGICAL_OPERATORS">
392 |                     <div class="attr variable">
393 |             <span class="name">LOGICAL_OPERATORS</span><span class="annotation">: Final</span>        =
394 | <span class="default_value">{&#39;NOT&#39;: &lt;function _not&gt;, &#39;AND&#39;: &lt;function _and&gt;, &#39;OR&#39;: &lt;function _or&gt;}</span>
395 | 
396 |         
397 |     </div>
398 |     <a class="headerlink" href="#LOGICAL_OPERATORS"></a>
399 |     
400 |     
401 | 
402 |                 </section>
403 |                 <section id="COMPARISON_OPERATORS">
404 |                     <div class="attr variable">
405 |             <span class="name">COMPARISON_OPERATORS</span><span class="annotation">: Final</span>        =
406 | <input id="COMPARISON_OPERATORS-view-value" class="view-value-toggle-state" type="checkbox" aria-hidden="true" tabindex="-1">
407 |             <label class="view-value-button pdoc-button" for="COMPARISON_OPERATORS-view-value"></label><span class="default_value">{&#39;==&#39;: &lt;function _eq&gt;, &#39;!=&#39;: &lt;function _ne&gt;, &#39;&gt;&#39;: &lt;function _gt&gt;, &#39;&lt;&#39;: &lt;function _lt&gt;, &#39;&gt;=&#39;: &lt;function _gte&gt;, &#39;&lt;=&#39;: &lt;function _lte&gt;, &#39;in&#39;: &lt;function _in&gt;, &#39;not in&#39;: &lt;function _nin&gt;}</span>
408 | 
409 |         
410 |     </div>
411 |     <a class="headerlink" href="#COMPARISON_OPERATORS"></a>
412 |     
413 |     
414 | 
415 |                 </section>
416 |     </main>
417 | <script>
418 |     function escapeHTML(html) {
419 |         return document.createElement('div').appendChild(document.createTextNode(html)).parentNode.innerHTML;
420 |     }
421 | 
422 |     const originalContent = document.querySelector("main.pdoc");
423 |     let currentContent = originalContent;
424 | 
425 |     function setContent(innerHTML) {
426 |         let elem;
427 |         if (innerHTML) {
428 |             elem = document.createElement("main");
429 |             elem.classList.add("pdoc");
430 |             elem.innerHTML = innerHTML;
431 |         } else {
432 |             elem = originalContent;
433 |         }
434 |         if (currentContent !== elem) {
435 |             currentContent.replaceWith(elem);
436 |             currentContent = elem;
437 |         }
438 |     }
439 | 
440 |     function getSearchTerm() {
441 |         return (new URL(window.location)).searchParams.get("search");
442 |     }
443 | 
444 |     const searchBox = document.querySelector(".pdoc input[type=search]");
445 |     searchBox.addEventListener("input", function () {
446 |         let url = new URL(window.location);
447 |         if (searchBox.value.trim()) {
448 |             url.hash = "";
449 |             url.searchParams.set("search", searchBox.value);
450 |         } else {
451 |             url.searchParams.delete("search");
452 |         }
453 |         history.replaceState("", "", url.toString());
454 |         onInput();
455 |     });
456 |     window.addEventListener("popstate", onInput);
457 | 
458 | 
459 |     let search, searchErr;
460 | 
461 |     async function initialize() {
462 |         try {
463 |             search = await new Promise((resolve, reject) => {
464 |                 const script = document.createElement("script");
465 |                 script.type = "text/javascript";
466 |                 script.async = true;
467 |                 script.onload = () => resolve(window.pdocSearch);
468 |                 script.onerror = (e) => reject(e);
469 |                 script.src = "../search.js";
470 |                 document.getElementsByTagName("head")[0].appendChild(script);
471 |             });
472 |         } catch (e) {
473 |             console.error("Cannot fetch pdoc search index");
474 |             searchErr = "Cannot fetch search index.";
475 |         }
476 |         onInput();
477 | 
478 |         document.querySelector("nav.pdoc").addEventListener("click", e => {
479 |             if (e.target.hash) {
480 |                 searchBox.value = "";
481 |                 searchBox.dispatchEvent(new Event("input"));
482 |             }
483 |         });
484 |     }
485 | 
486 |     function onInput() {
487 |         setContent((() => {
488 |             const term = getSearchTerm();
489 |             if (!term) {
490 |                 return null
491 |             }
492 |             if (searchErr) {
493 |                 return `<h3>Error: ${searchErr}</h3>`
494 |             }
495 |             if (!search) {
496 |                 return "<h3>Searching...</h3>"
497 |             }
498 | 
499 |             window.scrollTo({top: 0, left: 0, behavior: 'auto'});
500 | 
501 |             const results = search(term);
502 | 
503 |             let html;
504 |             if (results.length === 0) {
505 |                 html = `No search results for '${escapeHTML(term)}'.`
506 |             } else {
507 |                 html = `<h4>${results.length} search result${results.length > 1 ? "s" : ""} for '${escapeHTML(term)}'.</h4>`;
508 |             }
509 |             for (let result of results.slice(0, 10)) {
510 |                 let doc = result.doc;
511 |                 let url = `../${doc.modulename.replaceAll(".", "/")}.html`;
512 |                 if (doc.qualname) {
513 |                     url += `#${doc.qualname}`;
514 |                 }
515 | 
516 |                 let heading;
517 |                 switch (result.doc.kind) {
518 |                     case "function":
519 |                         if (doc.fullname.endsWith(".__init__")) {
520 |                             heading = `<span class="name">${doc.fullname.replace(/\.__init__$/, "")}</span>${doc.signature}`;
521 |                         } else {
522 |                             heading = `<span class="def">${doc.funcdef}</span> <span class="name">${doc.fullname}</span>${doc.signature}`;
523 |                         }
524 |                         break;
525 |                     case "class":
526 |                         heading = `<span class="def">class</span> <span class="name">${doc.fullname}</span>`;
527 |                         if (doc.bases)
528 |                             heading += `<wbr>(<span class="base">${doc.bases}</span>)`;
529 |                         heading += `:`;
530 |                         break;
531 |                     case "variable":
532 |                         heading = `<span class="name">${doc.fullname}</span>`;
533 |                         if (doc.annotation)
534 |                             heading += `<span class="annotation">${doc.annotation}</span>`;
535 |                         if (doc.default_value)
536 |                             heading += `<span class="default_value"> = ${doc.default_value}</span>`;
537 |                         break;
538 |                     default:
539 |                         heading = `<span class="name">${doc.fullname}</span>`;
540 |                         break;
541 |                 }
542 |                 html += `
543 |                         <section class="search-result">
544 |                         <a href="${url}" class="attr ${doc.kind}">${heading}</a>
545 |                         <div class="docstring">${doc.doc}</div>
546 |                         </section>
547 |                     `;
548 | 
549 |             }
550 |             return html;
551 |         })());
552 |     }
553 | 
554 |     if (getSearchTerm()) {
555 |         initialize();
556 |         searchBox.value = getSearchTerm();
557 |         onInput();
558 |     } else {
559 |         searchBox.addEventListener("focus", initialize, {once: true});
560 |     }
561 | 
562 |     searchBox.addEventListener("keydown", e => {
563 |         if (["ArrowDown", "ArrowUp", "Enter"].includes(e.key)) {
564 |             let focused = currentContent.querySelector(".search-result.focused");
565 |             if (!focused) {
566 |                 currentContent.querySelector(".search-result").classList.add("focused");
567 |             } else if (
568 |                 e.key === "ArrowDown"
569 |                 && focused.nextElementSibling
570 |                 && focused.nextElementSibling.classList.contains("search-result")
571 |             ) {
572 |                 focused.classList.remove("focused");
573 |                 focused.nextElementSibling.classList.add("focused");
574 |                 focused.nextElementSibling.scrollIntoView({
575 |                     behavior: "smooth",
576 |                     block: "nearest",
577 |                     inline: "nearest"
578 |                 });
579 |             } else if (
580 |                 e.key === "ArrowUp"
581 |                 && focused.previousElementSibling
582 |                 && focused.previousElementSibling.classList.contains("search-result")
583 |             ) {
584 |                 focused.classList.remove("focused");
585 |                 focused.previousElementSibling.classList.add("focused");
586 |                 focused.previousElementSibling.scrollIntoView({
587 |                     behavior: "smooth",
588 |                     block: "nearest",
589 |                     inline: "nearest"
590 |                 });
591 |             } else if (
592 |                 e.key === "Enter"
593 |             ) {
594 |                 focused.querySelector("a").click();
595 |             }
596 |         }
597 |     });
598 | </script></body>
599 | </html>


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 | <!doctype html>
2 | <html>
3 | <head>
4 |     <meta charset="utf-8">
5 |     <meta http-equiv="refresh" content="0; url=./bbm25_haystack.html"/>
6 | </head>
7 | </html>
8 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["hatchling"]
  3 | build-backend = "hatchling.build"
  4 | 
  5 | [project]
  6 | name = "bbm25-haystack"
  7 | dynamic = ["version"]
  8 | description = 'Haystack 2.x In-memory BM25 Document Store with Enhanced Efficiency'
  9 | readme = "README.md"
 10 | requires-python = ">=3.9"
 11 | license = "Apache-2.0"
 12 | keywords = ["Document Search", "BM25", "LLM Agent", "RAG", "Haystack"]
 13 | authors = [
 14 |   { name = "Guest400123064", email = "wangy49@seas.upenn.edu" },
 15 | ]
 16 | classifiers = [
 17 |   "Development Status :: 5 - Production/Stable",
 18 |   "Programming Language :: Python",
 19 |   "Programming Language :: Python :: 3.9",
 20 |   "Programming Language :: Python :: 3.10",
 21 |   "Programming Language :: Python :: 3.11",
 22 |   "Programming Language :: Python :: 3.12",
 23 |   "Programming Language :: Python :: Implementation :: CPython",
 24 |   "Programming Language :: Python :: Implementation :: PyPy",
 25 | ]
 26 | dependencies = [
 27 |   "haystack-ai",
 28 |   "sentencepiece",
 29 | ]
 30 | 
 31 | [project.urls]
 32 | Documentation = "https://github.com/Guest400123064/bbm25-haystack#readme"
 33 | Issues = "https://github.com/Guest400123064/bbm25-haystack/issues"
 34 | Source = "https://github.com/Guest400123064/bbm25-haystack"
 35 | 
 36 | [tool.hatch.version]
 37 | path = "src/bbm25_haystack/__about__.py"
 38 | 
 39 | [tool.hatch.envs.default]
 40 | dependencies = [
 41 |   "coverage[toml]>=6.5",
 42 |   "pytest",
 43 |   "pytest-cov",
 44 |   "hypothesis",
 45 | ]
 46 | [tool.hatch.envs.default.scripts]
 47 | test = "pytest {args:tests}"
 48 | test-cov = "coverage run -m pytest {args:tests}"
 49 | cov-report = [
 50 |   "- coverage combine",
 51 |   "coverage xml",
 52 | ]
 53 | cov = [
 54 |   "test-cov",
 55 |   "cov-report",
 56 | ]
 57 | 
 58 | [[tool.hatch.envs.all.matrix]]
 59 | python = ["3.9", "3.10", "3.11", "3.12"]
 60 | 
 61 | [tool.hatch.envs.lint]
 62 | detached = true
 63 | dependencies = [
 64 |   "black>=23.1.0",
 65 |   "mypy>=1.0.0",
 66 |   "ruff>=0.0.243",
 67 | ]
 68 | [tool.hatch.envs.lint.scripts]
 69 | typing = "mypy --install-types --non-interactive {args:src/bbm25_haystack tests}"
 70 | style = [
 71 |   "ruff {args:check .}",
 72 |   "black --check --diff {args:.}",
 73 | ]
 74 | fmt = [
 75 |   "black {args:.}",
 76 |   "ruff {args:check .} --fix",
 77 |   "style",
 78 | ]
 79 | all = [
 80 |   "style",
 81 |   "typing",
 82 | ]
 83 | 
 84 | [tool.hatch.metadata]
 85 | allow-direct-references = true
 86 | 
 87 | [tool.black]
 88 | target-version = ["py39"]
 89 | line-length = 85
 90 | skip-string-normalization = true
 91 | 
 92 | [tool.ruff]
 93 | target-version = "py39"
 94 | line-length = 85
 95 | select = [
 96 |   "A",
 97 |   "ARG",
 98 |   "B",
 99 |   "C",
100 |   "DTZ",
101 |   "E",
102 |   "EM",
103 |   "F",
104 |   "FBT",
105 |   "I",
106 |   "ICN",
107 |   "ISC",
108 |   "N",
109 |   "PLC",
110 |   "PLE",
111 |   "PLR",
112 |   "PLW",
113 |   "Q",
114 |   "RUF",
115 |   "S",
116 |   "T",
117 |   "TID",
118 |   "UP",
119 |   "W",
120 |   "YTT",
121 | ]
122 | ignore = [
123 |   # Allow non-abstract empty methods in abstract base classes
124 |   "B027",
125 |   # Allow boolean positional values in function calls, like `dict.get(... True)`
126 |   "FBT003",
127 |   # Ignore checks for possible passwords
128 |   "S105", "S106", "S107",
129 |   # Ignore complexity
130 |   "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
131 |   # Ignore usage of `lambda` expression
132 |   "E731",
133 | ]
134 | unfixable = [
135 |   # Don't touch unused imports
136 |   "F401",
137 | ]
138 | 
139 | [tool.ruff.isort]
140 | known-first-party = ["bbm25_haystack"]
141 | 
142 | [tool.ruff.flake8-tidy-imports]
143 | ban-relative-imports = "all"
144 | 
145 | [tool.ruff.per-file-ignores]
146 | # Tests can use magic values, assertions, and relative imports
147 | "tests/**/*" = ["PLR2004", "S101", "TID252"]
148 | 
149 | [tool.coverage.run]
150 | source_pkgs = ["bbm25_haystack", "tests"]
151 | branch = true
152 | parallel = true
153 | omit = [
154 |   "src/bbm25_haystack/__about__.py",
155 | ]
156 | 
157 | [tool.coverage.paths]
158 | bbm25_haystack = ["src/bbm25_haystack", "*/bbm25-haystack/src/bbm25_haystack"]
159 | tests = ["tests", "*/bbm25-haystack/tests"]
160 | 
161 | [tool.coverage.report]
162 | exclude_lines = [
163 |   "no cov",
164 |   "if __name__ == .__main__.:",
165 |   "if TYPE_CHECKING:",
166 | ]
167 | 
168 | [tool.pytest.ini_options]
169 | minversion = "6.0"
170 | markers = [
171 |   "unit: unit tests",
172 |   "integration: integration tests"
173 | ]
174 | 
175 | [[tool.mypy.overrides]]
176 | module = [
177 |   "haystack.*",
178 |   "pytest.*"
179 | ]
180 | ignore_missing_imports = true
181 | 


--------------------------------------------------------------------------------
/scripts/benchmark_beir.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2024-present Yuxuan Wang <wangy49@seas.upenn.edu>
  2 | #
  3 | # SPDX-License-Identifier: Apache-2.0
  4 | import argparse
  5 | import logging
  6 | import os
  7 | import pathlib
  8 | from collections import deque
  9 | 
 10 | import pandas as pd
 11 | import tqdm
 12 | from beir import LoggingHandler, util
 13 | from beir.datasets.data_loader import GenericDataLoader
 14 | from beir.retrieval.evaluation import EvaluateRetrieval
 15 | from beir.retrieval.search import BaseSearch
 16 | from haystack import Document
 17 | 
 18 | from bbm25_haystack import BetterBM25DocumentStore
 19 | 
 20 | DIR_PROJ = pathlib.Path(__file__).resolve().parent.parent
 21 | DIR_BEIR_RAW = DIR_PROJ / "benchmarks" / "beir"
 22 | 
 23 | URL_BEIR = (
 24 |     "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{name}.zip"
 25 | )
 26 | 
 27 | DATASETS = [
 28 |     # General IR (in-domain)
 29 |     "msmarco",
 30 |     # Bio-medical IR
 31 |     "trec-covid",
 32 |     "nfcorpus",
 33 |     # Question answering
 34 |     "nq",
 35 |     "hotpotqa",
 36 |     "fiqa",
 37 |     # Citation prediction
 38 |     "scidocs",
 39 |     # Argument retrieval
 40 |     "arguana",
 41 |     "webis-touche2020",
 42 |     # Duplicate question retrieval
 43 |     "quora",
 44 |     "cqadupstack",
 45 |     # Fact checking
 46 |     "scifact",
 47 |     "fever",
 48 |     "climate-fever",
 49 |     # Entity retrieval
 50 |     "dbpedia-entity",
 51 | ]
 52 | 
 53 | logging.basicConfig(
 54 |     format="%(asctime)s - %(message)s",
 55 |     datefmt="%Y-%m-%d %H:%M:%S",
 56 |     level=logging.INFO,
 57 |     handlers=[LoggingHandler()],
 58 | )
 59 | 
 60 | 
 61 | class BEIRWrapper(BaseSearch):
 62 |     """Wrapper for the BetterBM25DocumentStore to be compatible with BEIR."""
 63 | 
 64 |     def __init__(self, store: BetterBM25DocumentStore) -> None:
 65 |         self._store = store
 66 |         self._indexed = False
 67 | 
 68 |     def index(self, corpus: dict[str, dict[str, str]]) -> None:
 69 |         """Index the corpus for retrieval."""
 70 | 
 71 |         for idx, raw in tqdm.tqdm(corpus.items(), desc="Indexing corpus"):
 72 |             raw_title = raw.get("title", "")
 73 |             raw_text = raw.get("text", "")
 74 | 
 75 |             content = f"title: {raw_title}; text: {raw_text}"
 76 |             document = Document(idx, content=content)
 77 |             self._store.write_documents([document])
 78 |         self._indexed = True
 79 | 
 80 |     def search(
 81 |         self,
 82 |         corpus: dict[str, dict[str]],
 83 |         queries: dict[str, str],
 84 |         top_k: int = 10,
 85 |         *args,
 86 |         **kwargs,
 87 |     ) -> dict[str, dict[str, float]]:
 88 |         """Search the corpus for relevant documents."""
 89 | 
 90 |         _ = args
 91 |         _ = kwargs
 92 | 
 93 |         if not self._indexed:
 94 |             self.index(corpus)
 95 | 
 96 |         results = {}
 97 |         for idx, qry in tqdm.tqdm(queries.items(), desc="Searching queries"):
 98 |             result = self._store._retrieval(qry, top_k=top_k)
 99 |             results[idx] = {doc.id: scr for doc, scr in result if doc.id != idx}
100 |         return results
101 | 
102 | 
103 | def download_dataset_from_beir(name: str) -> bool:
104 |     """Download a dataset maintained by the UKP Lab."""
105 | 
106 |     if os.path.isdir(DIR_BEIR_RAW / name):
107 |         logging.info(f"Dataset {name} already downloaded. Skipping...")
108 |         return True
109 | 
110 |     try:
111 |         logging.info(f"Downloading dataset {name} from BEIR to {DIR_BEIR_RAW}...")
112 |         util.download_and_unzip(URL_BEIR.format(name=name), DIR_BEIR_RAW)
113 |     except Exception as exc:
114 |         logging.warn(f"Failed to download dataset {name} from BEIR: {exc}")
115 |         return False
116 | 
117 |     logging.info(f"Dataset {name} downloaded successfully.")
118 |     return True
119 | 
120 | 
121 | def evaluate_retriever(args: argparse.Namespace) -> None:
122 |     """Evaluate the retrieval performance of a query encoder over
123 |     the BEIR benchmark."""
124 | 
125 |     queue = deque()  # [ local_save_dir_name... ]
126 |     for name in args.datasets or DATASETS:
127 |         download_dataset_from_beir(name)
128 | 
129 |         if name != "cqadupstack":
130 |             queue.append(name)
131 |             continue
132 | 
133 |         # Special handling for the CQADupStack dataset because the dataset has
134 |         # subdirectories for each topic; so we need to flatten the directory.
135 |         for sub_name in os.listdir(DIR_BEIR_RAW / "cqadupstack"):
136 |             sub_name_alt = str(os.path.join("cqadupstack", sub_name))
137 |             queue.append(sub_name_alt)
138 | 
139 |     records = []
140 |     while queue:
141 |         ds_name = queue.popleft()
142 |         dir_raw = DIR_BEIR_RAW / ds_name
143 | 
144 |         store = BetterBM25DocumentStore(
145 |             k=args.bm25_k1,
146 |             b=args.bm25_b,
147 |             delta=args.bm25_delta,
148 |             sp_file=args.sp_file,
149 |             n_grams=args.n_grams,
150 |         )
151 |         model = BEIRWrapper(store)
152 |         retriever = EvaluateRetrieval(model)
153 | 
154 |         corpus, queries, qrels = GenericDataLoader(dir_raw).load(split=args.split)
155 |         results = retriever.retrieve(corpus, queries)
156 | 
157 |         logging.info(f"Evaluating retriever over {ds_name}...")
158 | 
159 |         record = {}
160 |         for metric in retriever.evaluate(qrels, results, k_values=args.k_values):
161 |             record.update(metric)
162 | 
163 |         record.update(
164 |             {
165 |                 "datetime": pd.Timestamp.now(),
166 |                 "dataset": ds_name.replace("/", "-"),
167 |             }
168 |         )
169 |         record.update(store.to_dict().get("init_parameters"))
170 |         records.append(record)
171 | 
172 |     records = pd.DataFrame(records)
173 |     records.to_csv(args.output, index=False)
174 | 
175 | 
176 | def get_args() -> argparse.Namespace:
177 |     """Get command line arguments for evaluating retrieval performance."""
178 | 
179 |     parser = argparse.ArgumentParser(
180 |         prog="benchmark_beir.py",
181 |         description="Evaluate retrieval performance over the BEIR benchmark.",
182 |         epilog="Email wangy49@seas.upenn.edu for questions.",
183 |     )
184 | 
185 |     parser.add_argument(
186 |         "--datasets",
187 |         type=str,
188 |         nargs="+",
189 |         required=False,
190 |         default=None,
191 |         choices=DATASETS,
192 |         help=(
193 |             "Dataset names to evaluate on. All datasets will be used "
194 |             "if not specified (default: None)"
195 |         ),
196 |     )
197 |     parser.add_argument(
198 |         "--bm25-k1",
199 |         type=float,
200 |         required=False,
201 |         default=1.5,
202 |         help="The BM25+ k1 parameter; default to 1.5",
203 |     )
204 |     parser.add_argument(
205 |         "--bm25-b",
206 |         type=float,
207 |         default=0.75,
208 |         required=False,
209 |         help="The BM25+ b parameter; default to 0.75",
210 |     )
211 |     parser.add_argument(
212 |         "--bm25-delta",
213 |         type=float,
214 |         default=1.0,
215 |         required=False,
216 |         help="The BM25+ delta parameter; default to 1.0",
217 |     )
218 |     parser.add_argument(
219 |         "--sp-file",
220 |         type=str,
221 |         default=None,
222 |         required=False,
223 |         help="Path to the SentencePiece model file; default to None (LLaMA2)",
224 |     )
225 |     parser.add_argument(
226 |         "--n-grams",
227 |         type=int,
228 |         default=1,
229 |         required=False,
230 |         help="The n-gram size up to n for tokenizations; default to 1",
231 |     )
232 |     parser.add_argument(
233 |         "--split",
234 |         type=str,
235 |         default="test",
236 |         required=False,
237 |         choices=["train", "dev", "test"],
238 |         help="Dataset split to evaluate on (default: 'test')",
239 |     )
240 |     parser.add_argument(
241 |         "--output",
242 |         type=str,
243 |         default="beir_evaluation_results.csv",
244 |         help="Path to the evaluation result",
245 |     )
246 |     parser.add_argument(
247 |         "--k-values",
248 |         type=int,
249 |         nargs="+",
250 |         required=False,
251 |         default=[10, 100],
252 |         help="Top-k values for evaluation (default: [10, 100])",
253 |     )
254 | 
255 |     args = parser.parse_args()
256 |     return args
257 | 
258 | 
259 | if __name__ == "__main__":
260 |     evaluate_retriever(get_args())
261 | 


--------------------------------------------------------------------------------
/src/bbm25_haystack/__about__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
2 | #
3 | # SPDX-License-Identifier: Apache-2.0
4 | 
5 | __version__ = "0.2.2"
6 | 


--------------------------------------------------------------------------------
/src/bbm25_haystack/__init__.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
 2 | #
 3 | # SPDX-License-Identifier: Apache-2.0
 4 | from bbm25_haystack.bbm25_retriever import BetterBM25Retriever
 5 | from bbm25_haystack.bbm25_store import BetterBM25DocumentStore
 6 | from bbm25_haystack.filters import apply_filters_to_document
 7 | 
 8 | __all__ = [
 9 |     "BetterBM25DocumentStore",
10 |     "BetterBM25Retriever",
11 |     "apply_filters_to_document",
12 | ]
13 | 


--------------------------------------------------------------------------------
/src/bbm25_haystack/bbm25_retriever.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
  2 | #
  3 | # SPDX-License-Identifier: Apache-2.0
  4 | from typing import Any, Optional
  5 | 
  6 | from haystack import (
  7 |     DeserializationError,
  8 |     Document,
  9 |     component,
 10 |     default_from_dict,
 11 |     default_to_dict,
 12 | )
 13 | 
 14 | from bbm25_haystack.bbm25_store import BetterBM25DocumentStore
 15 | 
 16 | 
 17 | def _validate_search_params(filters: Optional[dict[str, Any]], top_k: int) -> None:
 18 |     """
 19 |     Validate the search parameters.
 20 | 
 21 |     :param filters: Haystack filters, a dictionary with filters to
 22 |         narrow down the search space. The filters are applied
 23 |         **before** similarity retrieval.
 24 |     :type filters: ``Optional[dict[str, Any]]``
 25 |     :param top_k: The maximum number of documents to return.
 26 |     :type top_k: ``int``
 27 | 
 28 |     :raises ValueError: If the specified top_k is not > 0.
 29 |     :raises TypeError: If filters is not a dictionary.
 30 |     """
 31 |     if not isinstance(top_k, int):
 32 |         msg = f"'top_k' must be an integer; got '{type(top_k)}' instead."
 33 |         raise TypeError(msg)
 34 | 
 35 |     if top_k <= 0:
 36 |         msg = f"'top_k' must be > 0; got '{top_k}' instead."
 37 |         raise ValueError(msg)
 38 | 
 39 |     if filters is not None and (not isinstance(filters, dict)):
 40 |         msg = f"'filters' must be a dictionary; got '{type(filters)}' instead."
 41 |         raise TypeError(msg)
 42 | 
 43 | 
 44 | @component
 45 | class BetterBM25Retriever:
 46 |     """
 47 |     A component for retrieving documents from a ``BetterBM25DocumentStore``.
 48 |     """
 49 | 
 50 |     def __init__(
 51 |         self,
 52 |         document_store: BetterBM25DocumentStore,
 53 |         *,
 54 |         filters: Optional[dict[str, Any]] = None,
 55 |         top_k: int = 10,
 56 |         set_score: bool = True,
 57 |     ) -> None:
 58 |         """
 59 |         Create a ``BetterBM25Retriever`` component.
 60 | 
 61 |         :param document_store: A ``BetterBM25DocumentStore`` instance.
 62 |         :type document_store: ``BetterBM25DocumentStore``
 63 |         :param filters: Haystack filters, a dictionary with filters to
 64 |             narrow down the search space. The filters are applied
 65 |             **before** similarity retrieval.
 66 |         :type filters: ``Optional[dict[str, Any]]``
 67 |         :param top_k: The maximum number of documents to return.
 68 |         :type top_k: ``int``
 69 |         :param set_score: Whether to set the similarity scores to returned
 70 |             documents under ``Document.score`` attribute. This is useful in
 71 |             hybrid retrieval setting where you may want to merge results.
 72 |             Note that returned documents are **copies** so that the original
 73 |             instances in the document store are not modified.
 74 |         :type set_score: ``bool``
 75 | 
 76 |         :raises ValueError: If the ``filters`` or ``top_k`` is invalid.
 77 |         :raises TypeError: If the ``document_store`` is not an instance of
 78 |             ``BetterBM25DocumentStore``.
 79 |         """
 80 |         _validate_search_params(filters, top_k)
 81 | 
 82 |         self.filters = filters
 83 |         self.top_k = top_k
 84 |         self.set_score = set_score
 85 | 
 86 |         if not isinstance(document_store, BetterBM25DocumentStore):
 87 |             msg = "'document_store' must be of type 'BetterBM25DocumentStore'"
 88 |             raise TypeError(msg)
 89 | 
 90 |         self.document_store = document_store
 91 | 
 92 |     @component.output_types(documents=list[Document])
 93 |     def run(
 94 |         self,
 95 |         query: str,
 96 |         *,
 97 |         filters: Optional[dict[str, Any]] = None,
 98 |         top_k: Optional[int] = None,
 99 |     ) -> dict[str, list[Document]]:
100 |         """
101 |         Run the Retriever on the given query. This method always return
102 |         copies of the documents retrieved from the document store.
103 | 
104 |         :param query: The text search term.
105 |         :type query: ``str``
106 |         :param filters: Haystack filters, a dictionary with filters to
107 |             narrow down the search space. The filters are applied
108 |             **before** similarity retrieval. Use the value provided during
109 |             initialization if not specified.
110 |         :type filters: ``Optional[dict[str, Any]]``
111 |         :param top_k: The maximum number of documents to return. Use the
112 |             value provided during initialization if not specified.
113 |         :type top_k: ``Optional[int]``
114 | 
115 |         :return: The retrieved documents in a dictionary with key "documents".
116 |         """
117 |         filters = filters or self.filters
118 |         top_k = top_k or self.top_k
119 | 
120 |         _validate_search_params(filters, top_k)
121 | 
122 |         sim = self.document_store._retrieval(query, filters=filters, top_k=top_k)
123 | 
124 |         ret = []
125 |         for doc, scr in sim:
126 |             data = doc.to_dict()
127 |             if self.set_score:
128 |                 data["score"] = scr
129 |             ret.append(Document.from_dict(data))
130 | 
131 |         return {"documents": ret}
132 | 
133 |     def to_dict(self) -> dict[str, Any]:
134 |         """Serializes the component to a dictionary."""
135 |         return default_to_dict(
136 |             self,
137 |             filters=self.filters,
138 |             top_k=self.top_k,
139 |             document_store=self.document_store.to_dict(),
140 |             set_score=self.set_score,
141 |         )
142 | 
143 |     @classmethod
144 |     def from_dict(cls, data: dict[str, Any]) -> "BetterBM25Retriever":
145 |         """Deserializes the retriever from a dictionary."""
146 |         doc_store_params = data["init_parameters"].get("document_store")
147 |         if doc_store_params is None:
148 |             msg = "Missing 'document_store' in serialization data"
149 |             raise DeserializationError(msg)
150 | 
151 |         if doc_store_params.get("type") is None:
152 |             msg = "Missing 'type' in document store's serialization data"
153 |             raise DeserializationError(msg)
154 | 
155 |         data["init_parameters"]["document_store"] = (
156 |             BetterBM25DocumentStore.from_dict(doc_store_params)
157 |         )
158 |         return default_from_dict(cls, data)
159 | 


--------------------------------------------------------------------------------
/src/bbm25_haystack/bbm25_store.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
  2 | #
  3 | # SPDX-License-Identifier: Apache-2.0
  4 | import heapq
  5 | import math
  6 | import os
  7 | from collections import Counter, deque
  8 | from collections.abc import Iterable
  9 | from itertools import chain
 10 | from typing import Any, Final, Optional, Union
 11 | 
 12 | import pandas as pd
 13 | from haystack import Document, default_from_dict, default_to_dict, logging
 14 | from haystack.document_stores.errors import (
 15 |     DuplicateDocumentError,
 16 |     MissingDocumentError,
 17 | )
 18 | from haystack.document_stores.types import DuplicatePolicy
 19 | from haystack.utils.filters import document_matches_filter
 20 | from sentencepiece import SentencePieceProcessor  # type: ignore
 21 | 
 22 | from bbm25_haystack.filters import apply_filters_to_document
 23 | 
 24 | logger = logging.getLogger(__name__)
 25 | 
 26 | 
 27 | def _n_grams(seq: Iterable[str], n: int):
 28 |     """
 29 |     Returns a sliding window (of width n) over data from the
 30 |     iterable. This solution is adapted from the StackOverflow
 31 |     answer [here](https://stackoverflow.com/a/6822773/13403958).
 32 | 
 33 |     :param seq: Input token sequence.
 34 |     :type seq: ``Iterable[str]``
 35 |     :param n: Window size.
 36 |     :type n: ``int``
 37 | 
 38 |     :return: The n-gram window generator.
 39 |     :rtype: ``Generator[tuple[str], None, None]``
 40 |     """
 41 |     it = iter(seq)
 42 |     wd = deque((next(it, None) for _ in range(n)), maxlen=n)
 43 | 
 44 |     yield tuple(wd)
 45 |     for el in it:
 46 |         wd.append(el)
 47 |         yield tuple(wd)
 48 | 
 49 | 
 50 | class BetterBM25DocumentStore:
 51 |     """
 52 |     An in-memory BM25 document store intended to improve the default
 53 |     ``InMemoryDocumentStore`` shipped with Haystack.
 54 |     """
 55 | 
 56 |     _default_sp_file: Final = os.path.join(
 57 |         os.path.dirname(os.path.abspath(__file__)), "default.model"
 58 |     )
 59 | 
 60 |     def __init__(
 61 |         self,
 62 |         *,
 63 |         k: float = 1.5,
 64 |         b: float = 0.75,
 65 |         delta: float = 1.0,
 66 |         sp_file: Optional[str] = None,
 67 |         n_grams: Union[int, tuple[int, int]] = 1,
 68 |         haystack_filter_logic: bool = True,
 69 |     ) -> None:
 70 |         """
 71 |         Creates a new ``BetterBM25DocumentStore`` instance.
 72 | 
 73 |         :param k: k1 parameter in BM25+ formula.
 74 |         :type k: ``Optional[float]``
 75 |         :param b: b parameter in BM25+ formula.
 76 |         :type b: ``Optional[float]``
 77 |         :param delta: delta parameter in BM25+ formula.
 78 |         :type delta: ``Optional[float]``
 79 |         :param sp_file: ``SentencePiece`` tokenizer ``.model`` file to
 80 |             use. A default from LLaMA-2-32K is used if not provided.
 81 |         :type sp_file: ``Optional[str]``
 82 |         :param n_grams: The n-gram window size. Can be a range of n-grams
 83 |             to include in text representation. If a single integer is
 84 |             provided, it will be treated as the maximum n-gram window size,
 85 |             which is equivalent to ``(1, n_grams)``.
 86 |         :type n_grams: ``Optional[Union[int, tuple[int, int]]]``
 87 |         :param haystack_filter_logic: Whether to use the Haystack filter
 88 |             logic or the one implemented in this store.
 89 |         :type haystack_filter_logic: ``Optional[bool]``
 90 |         """
 91 |         self._k = k
 92 |         self._b = b
 93 | 
 94 |         # Adjust the delta value so that we can bring the ``(k1 + 1)``
 95 |         # term out of the 'term frequency' term in BM25+ formula and
 96 |         # delete it; this will not affect the ranking.
 97 |         self._delta = delta / (self._k + 1.0)
 98 | 
 99 |         self._parse_sp_file(sp_file=sp_file)
100 |         self._parse_n_grams(n_grams=n_grams)
101 | 
102 |         self._haystack_filter_logic = haystack_filter_logic
103 |         self._filter_func = (
104 |             document_matches_filter
105 |             if self._haystack_filter_logic
106 |             else apply_filters_to_document
107 |         )
108 | 
109 |         self._avg_doc_len: float = 0.0
110 |         self._freq_doc: Counter = Counter()
111 |         self._index: dict[str, tuple[Document, dict[tuple[str], int], int]] = {}
112 | 
113 |     def _parse_sp_file(self, sp_file: Optional[str]) -> None:
114 |         self._sp_file = sp_file
115 | 
116 |         if sp_file is None:
117 |             self._sp_inst = SentencePieceProcessor(model_file=self._default_sp_file)
118 |             return
119 | 
120 |         if not os.path.exists(sp_file) or not os.path.isfile(sp_file):
121 |             msg = (
122 |                 f"Tokenizer model file '{sp_file}' not accessible; "
123 |                 f"fallback to default {self._default_sp_file}."
124 |             )
125 |             logger.warn(msg)
126 |             self._sp_inst = SentencePieceProcessor(model_file=self._default_sp_file)
127 |             return
128 | 
129 |         try:
130 |             self._sp_inst = SentencePieceProcessor(model_file=sp_file)
131 |         except Exception as exc:
132 |             msg = (
133 |                 f"Failed to load tokenizer model file '{sp_file}': {exc}; "
134 |                 f"fallback to default {self._default_sp_file}."
135 |             )
136 |             logger.error(msg)
137 |             self._sp_inst = SentencePieceProcessor(model_file=self._default_sp_file)
138 | 
139 |     def _parse_n_grams(self, n_grams: Optional[Union[int, tuple[int, int]]]) -> None:
140 |         self._n_grams = n_grams
141 | 
142 |         if isinstance(n_grams, int):
143 |             self._n_grams_min = 1
144 |             self._n_grams_max = n_grams
145 |             return
146 | 
147 |         if isinstance(n_grams, tuple):
148 |             self._n_grams_min, self._n_grams_max = n_grams
149 |             if not all(isinstance(n, int) for n in n_grams):
150 |                 msg = f"Invalid n-gram window size: {n_grams}."
151 |                 raise ValueError(msg)
152 |             return
153 | 
154 |         msg = f"Invalid n-gram window size: {n_grams}; expected int or tuple."
155 |         raise ValueError(msg)
156 | 
157 |     def _tokenize(self, texts: Union[str, list[str]]) -> list[list[tuple[str]]]:
158 |         """
159 |         Tokenize input text using SentencePiece model.
160 | 
161 |         The input text can either be a single string or a list of strings,
162 |         such as a single user query or a group of raw document. The tokenized
163 |         text will be augmented into set of n-grams based.
164 | 
165 |         :param texts: Input text to tokenize, queries or documents.
166 |         :type texts: ``Union[str, list[str]]``
167 | 
168 |         :return: Tokenized and n-gram augmented texts.
169 |         :rtype: ``list[list[tuple[str]]]``
170 |         """
171 | 
172 |         def _augment_to_n_grams(tokens: list[str]) -> list[tuple[str]]:
173 |             it = (
174 |                 _n_grams(tokens, n)
175 |                 for n in range(self._n_grams_min, self._n_grams_max + 1)
176 |             )
177 |             return list(chain(*it))
178 | 
179 |         if isinstance(texts, str):
180 |             texts = [texts]
181 |         return [
182 |             _augment_to_n_grams(tokens)
183 |             for tokens in self._sp_inst.encode(texts, out_type=str)
184 |         ]
185 | 
186 |     def _compute_bm25plus(
187 |         self,
188 |         query: str,
189 |         documents: list[Document],
190 |     ) -> list[tuple[Document, float]]:
191 |         """
192 |         Calculate the BM25+ score for all documents in this index.
193 | 
194 |         :param query: Query to calculate the BM25+ score for.
195 |         :type query: ``str``
196 |         :param documents: Filtered pool of documents retrieve from.
197 |         :type documents: ``list[Document]``
198 | 
199 |         :return: Documents and corresponding BM25+ scores.
200 |         :rtype: ``list[tuple[Document, float]]``
201 |         """
202 |         cnt = lambda ng: self._freq_doc.get(ng, 0)
203 |         idf = {
204 |             ng: math.log(
205 |                 1 + (self.count_documents() - cnt(ng) + 0.5) / (cnt(ng) + 0.5)
206 |             )
207 |             for ng in self._tokenize(query)[0]
208 |         }
209 | 
210 |         sim = []
211 |         for doc in documents:
212 |             _, freq, doc_len = self._index[doc.id]
213 |             doc_len_scaled = doc_len / self._avg_doc_len
214 | 
215 |             scr = 0.0
216 |             for token, idf_val in idf.items():
217 |                 freq_term = freq.get(token, 0.0)
218 |                 freq_damp = self._k * (1 + self._b * (doc_len_scaled - 1))
219 | 
220 |                 tf_val = freq_term / (freq_term + freq_damp) + self._delta
221 |                 scr += idf_val * tf_val
222 | 
223 |             sim.append((doc, scr))
224 | 
225 |         return sim
226 | 
227 |     def _retrieval(
228 |         self,
229 |         query: str,
230 |         *,
231 |         filters: Optional[dict[str, Any]] = None,
232 |         top_k: Optional[int] = None,
233 |     ) -> list[tuple[Document, float]]:
234 |         """
235 |         Retrieve documents from the store using the given query.
236 | 
237 |         :param query: Query to search for.
238 |         :type query: ``str``
239 |         :param filters: Filters to apply to the document list.
240 |         :type filters: ``Optional[dict[str, Any]]``
241 |         :param top_k: Number of documents to return.
242 |         :type top_k: ``int``
243 | 
244 |         :return: Top ``k`` documents and corresponding BM25+ scores.
245 |         :rtype: ``list[tuple[Document, float]]``
246 |         """
247 |         documents = self.filter_documents(filters)
248 |         if not documents:
249 |             return []
250 | 
251 |         sim = self._compute_bm25plus(query, documents)
252 |         if top_k is None:
253 |             return sorted(sim, key=lambda x: x[1], reverse=True)
254 |         return heapq.nlargest(top_k, sim, key=lambda x: x[1])
255 | 
256 |     def count_documents(self) -> int:
257 |         """
258 |         Returns how many documents are present in this store.
259 | 
260 |         :return: Number of documents in the store.
261 |         :rtype: ``int``
262 |         """
263 |         return len(self._index.keys())
264 | 
265 |     def filter_documents(
266 |         self, filters: Optional[dict[str, Any]] = None
267 |     ) -> list[Document]:
268 |         """
269 |         Filter documents in the store using the given filters.
270 | 
271 |         :param filters: Filters to apply to the document list.
272 |         :type filters: ``Optional[dict[str, Any]]``
273 | 
274 |         :return: List of documents that match the given filters.
275 |         :rtype: ``list[Document]``
276 |         """
277 |         if filters is None or not filters:
278 |             return [doc for doc, _, _ in self._index.values()]
279 |         return [
280 |             doc
281 |             for doc, _, _ in self._index.values()
282 |             if self._filter_func(filters, doc)
283 |         ]
284 | 
285 |     def write_documents(
286 |         self,
287 |         documents: list[Document],
288 |         policy: DuplicatePolicy = DuplicatePolicy.NONE,
289 |     ) -> int:
290 |         """
291 |         Writes (or overwrites) documents into the store.
292 | 
293 |         :param documents: List of documents to write.
294 |         :type documents: ``list[Document]``
295 |         :param policy: Documents with the same ``Document.id`` count as
296 |             duplicates. When duplicates are met, the store can:
297 |              - ``SKIP``: keep the existing document and ignore the new one.
298 |              - ``OVERWRITE``: remove the old document and write the new one.
299 |              - ``FAIL``: an error is raised (default behavior if not specified)
300 |         :type policy: ``Optional[DuplicatePolicy]``
301 | 
302 |         :raises ValueError: Exception trigger on invalid duplicate policy.
303 |         :raises DuplicateDocumentError: Exception trigger on duplicate
304 |             document if ``policy=DuplicatePolicy.FAIL``
305 | 
306 |         :return: Number of documents written.
307 |         :rtype: ``int``
308 |         """
309 |         if policy not in DuplicatePolicy:
310 |             msg = f"Invalid duplicate policy: {policy}."
311 |             raise ValueError(msg)
312 | 
313 |         if policy == DuplicatePolicy.NONE:
314 |             policy = DuplicatePolicy.FAIL
315 | 
316 |         n_written = 0
317 |         for doc in documents:
318 |             if not isinstance(doc, Document):
319 |                 msg = f"Expected document type, got '{doc}' of type '{type(doc)}'."
320 |                 raise ValueError(msg)
321 | 
322 |             if doc.id in self._index.keys():
323 |                 if policy == DuplicatePolicy.SKIP:
324 |                     continue
325 |                 elif policy == DuplicatePolicy.FAIL:
326 |                     msg = f"Document with ID '{doc.id}' already exists in the store."
327 |                     raise DuplicateDocumentError(msg)
328 | 
329 |                 # Overwrite if exists; delete first to keep the statistics consistent
330 |                 logger.debug(
331 |                     f"Document '{doc.id}' already exists in the store, overwriting."
332 |                 )
333 |                 self.delete_documents([doc.id])
334 | 
335 |             content = doc.content or ""
336 |             if content == "" and isinstance(doc.dataframe, pd.DataFrame):
337 |                 content = doc.dataframe.astype(str).to_csv(index=False)
338 | 
339 |             tokens = self._tokenize(content)[0]
340 | 
341 |             self._index[doc.id] = (doc, Counter(tokens), len(tokens))
342 |             self._freq_doc.update(set(tokens))
343 |             self._avg_doc_len = (
344 |                 len(tokens) + self._avg_doc_len * self.count_documents()
345 |             ) / (self.count_documents() + 1)
346 | 
347 |             logger.debug(f"Document '{doc.id}' written to store.")
348 |             n_written += 1
349 | 
350 |         return n_written
351 | 
352 |     def delete_documents(self, document_ids: list[str]) -> int:
353 |         """
354 |         Deletes all documents with a matching ID.
355 | 
356 |         :param document_ids: List of ``object_id`` to delete
357 |         :type document_ids: ``list[str]``
358 | 
359 |         :raises MissingDocumentError: Triggered on document not found.
360 | 
361 |         :return: Number of documents deleted.
362 |         :rtype: ``int``
363 |         """
364 |         n_removal = 0
365 |         for doc_id in document_ids:
366 |             try:
367 |                 _, freq, doc_len = self._index.pop(doc_id)
368 |                 self._freq_doc.subtract(Counter(freq.keys()))
369 |                 try:
370 |                     self._avg_doc_len = (
371 |                         self._avg_doc_len * (self.count_documents() + 1) - doc_len
372 |                     ) / self.count_documents()
373 |                 except ZeroDivisionError:
374 |                     self._avg_doc_len = 0
375 | 
376 |                 logger.debug(f"Document '{doc_id}' deleted from store.")
377 |                 n_removal += 1
378 |             except KeyError as exc:
379 |                 msg = f"Document with ID '{doc_id}' not found, cannot delete it."
380 |                 raise MissingDocumentError(msg) from exc
381 | 
382 |         return n_removal
383 | 
384 |     def to_dict(self) -> dict[str, Any]:
385 |         """Serializes this store to a dictionary."""
386 |         return default_to_dict(
387 |             self,
388 |             k=self._k,
389 |             b=self._b,
390 |             delta=self._delta * (self._k + 1.0),  # Because we scaled it on init
391 |             sp_file=self._sp_file,
392 |             n_grams=self._n_grams,
393 |             haystack_filter_logic=self._haystack_filter_logic,
394 |         )
395 | 
396 |     @classmethod
397 |     def from_dict(cls, data: dict[str, Any]) -> "BetterBM25DocumentStore":
398 |         """Deserializes the store from a dictionary."""
399 |         return default_from_dict(cls, data)
400 | 


--------------------------------------------------------------------------------
/src/bbm25_haystack/default.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Guest400123064/bbm25-haystack/9906fa27ffc54f4fd92dfb5d717c15a12a69df0a/src/bbm25_haystack/default.model


--------------------------------------------------------------------------------
/src/bbm25_haystack/filters.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
  2 | #
  3 | # SPDX-License-Identifier: Apache-2.0
  4 | from collections.abc import Iterable
  5 | from functools import wraps
  6 | from typing import Any, Callable, Final, Optional
  7 | 
  8 | import pandas as pd
  9 | from haystack.dataclasses import Document
 10 | from haystack.errors import FilterError
 11 | 
 12 | 
 13 | def apply_filters_to_document(
 14 |     filters: Optional[dict[str, Any]], document: Document
 15 | ) -> bool:
 16 |     """
 17 |     Apply filters to a document. Differences with the official
 18 |     Haystack implementation:
 19 | 
 20 |     - Comparison with ``None``, i.e., missing values, involved will
 21 |         always return ``False``, no matter missing the document
 22 |         attribute value or missing the filter value.
 23 |     - Comparison with ``pandas.DataFrame`` is always prohibited to
 24 |         reduce surprises.
 25 |     - No implicit ``datetime`` conversion from string values.
 26 |     - ``in`` and ``not in`` allows any ``Iterable`` as filter value,
 27 |         without the ``list`` constraint.
 28 |     - Allowing custom comparison functions for more flexibility. Note
 29 |         that the custom comparison function inputs are NEVER checked,
 30 |         i.e., no missing value check, no ``DataFrame`` check, etc. User
 31 |         should ensure the input values are valid and return value is
 32 |         always a boolean. The inputs are always supplied in the order
 33 |         of document value and then filter value.
 34 | 
 35 |     :param filters: The filters to apply to the document.
 36 |     :type filters: ``dict[str, Any]``
 37 |     :param document: The document to apply the filters to.
 38 |     :type document: ``Document``
 39 | 
 40 |     :return: ``True`` if the document passes the filters.
 41 |     :rtype: ``bool``
 42 |     """
 43 |     if filters is None or not filters:
 44 |         return True
 45 |     return _run_comparison_condition(filters, document)
 46 | 
 47 | 
 48 | def _get_document_field(document: Document, field: str) -> Optional[Any]:
 49 |     """
 50 |     Get the value of a field in a document.
 51 | 
 52 |     If the field is not found within the document then, instead of
 53 |     raising an error, ``None`` is returned. Note that here we do not
 54 |     implicitly add ``'meta'`` prefix for fields that are not a direct
 55 |     attribute of the document, not supporting legacy behavior anymore.
 56 | 
 57 |     :param document: The document to get the field value from.
 58 |     :type document: ``Document``
 59 |     :param field: The field to get the value of.
 60 |     :type field: ``str``
 61 | 
 62 |     :return: The value of the field in the document.
 63 |     :rtype: ``Optional[Any]``
 64 |     """
 65 |     if "." not in field:
 66 |         return getattr(document, field)
 67 | 
 68 |     attr = document.meta
 69 |     for f in field.split(".")[1:]:
 70 |         attr = attr.get(f)
 71 |         if attr is None:
 72 |             return None
 73 |     return attr
 74 | 
 75 | 
 76 | def _run_logical_condition(condition: dict[str, Any], document: Document) -> bool:
 77 |     if "operator" not in condition:
 78 |         msg = "Logical condition must have an 'operator' key."
 79 |         raise FilterError(msg)
 80 |     if "conditions" not in condition:
 81 |         msg = "Logical condition must have a 'conditions' key."
 82 |         raise FilterError(msg)
 83 | 
 84 |     conditions = condition["conditions"]
 85 |     reducer = LOGICAL_OPERATORS[condition["operator"]]
 86 | 
 87 |     return reducer(document, conditions)
 88 | 
 89 | 
 90 | def _run_comparison_condition(condition: dict[str, Any], document: Document) -> bool:
 91 |     if "field" not in condition:
 92 |         return _run_logical_condition(condition, document)
 93 | 
 94 |     if "operator" not in condition:
 95 |         msg = "Comparison condition must have an 'operator' key."
 96 |         raise FilterError(msg)
 97 |     if "value" not in condition:
 98 |         msg = "Comparison condition must have a 'value' key."
 99 |         raise FilterError(msg)
100 | 
101 |     field: str = condition["field"]
102 |     value: Any = condition["value"]
103 | 
104 |     # TODO: We may want to check if the supplied comparator is valid
105 |     if callable(condition["operator"]):
106 |         comparator = condition["operator"]
107 |     else:
108 |         comparator = COMPARISON_OPERATORS[condition["operator"]]
109 | 
110 |     return comparator(_get_document_field(document, field), value)
111 | 
112 | 
113 | def _and(document: Document, conditions: list[dict[str, Any]]) -> bool:
114 |     """
115 |     Return True if all conditions are met.
116 | 
117 |     :param document: The document to check the conditions against.
118 |     :type document: Document
119 |     :param conditions: The conditions to check against the document.
120 |     :type conditions: ``list[dict[str, Any]]``
121 | 
122 |     :return: True if not all conditions are met.
123 |     :rtype: ``bool``
124 |     """
125 |     return all(
126 |         _run_comparison_condition(condition, document) for condition in conditions
127 |     )
128 | 
129 | 
130 | def _or(document: Document, conditions: list[dict[str, Any]]) -> bool:
131 |     """
132 |     Return True if any condition is met.
133 | 
134 |     :param document: The document to check the conditions against.
135 |     :type document: Document
136 |     :param conditions: The conditions to check against the document.
137 |     :type conditions: ``list[dict[str, Any]]``
138 | 
139 |     :return: True if not all conditions are met.
140 |     :rtype: ``bool``
141 |     """
142 |     return any(_run_comparison_condition(cond, document) for cond in conditions)
143 | 
144 | 
145 | def _not(document: Document, conditions: list[dict[str, Any]]) -> bool:
146 |     """
147 |     Return True if not all conditions are met.
148 | 
149 |     The 'NOT' operator is under-specified when supplied with a
150 |     set of conditions instead of a single condition. Because we
151 |     can have the semantics of 'at least one False' versus
152 |     'all False'. Here we choose to comply with the official
153 |     implementation of Haystack (the 'at least one False' semantics).
154 | 
155 |     :param document: The document to check the conditions against.
156 |     :type document: ``Document``
157 |     :param conditions: The conditions to check against the document.
158 |     :type conditions: ``list[dict[str, Any]]``
159 | 
160 |     :return: True if not all conditions are met.
161 |     :rtype: ``bool``
162 |     """
163 |     return not _and(document, conditions)
164 | 
165 | 
166 | def _check_comparator_inputs(
167 |     comparator: Callable[[Any, Any], bool]
168 | ) -> Callable[[Any, Any], bool]:
169 |     """
170 |     A decorator to check and preprocess input attribute values.
171 | 
172 |     ALL COMPARISON OPERATORS SHOULD BE WRAPPED WITH THIS DECORATOR.
173 |     because a `False` may be returned by both input validation and
174 |     the actual comparison. This decorator ensures that the comparison
175 |     function is only called if the input values are valid.
176 | 
177 |     :param comparator: The comparator function to wrap.
178 |     :type comparator: ``Callable[[Any, Any], bool]``
179 | 
180 |     :return: The wrapped comparator function.
181 |     :rtype: ``Callable[[Any, Any], bool]``
182 |     """
183 | 
184 |     @wraps(comparator)
185 |     def _wrapper(dv: Any, fv: Any) -> bool:
186 | 
187 |         # I think allowing comparison between DataFrames would
188 |         # be a really bad idea because it would create unexpected
189 |         # behavior, but I am open to discussion on this.
190 |         if isinstance(dv, pd.DataFrame) or isinstance(fv, pd.DataFrame):
191 |             msg = (
192 |                 "Cannot compare DataFrames. Please convert them to "
193 |                 "simpler data structures before comparing."
194 |             )
195 |             raise FilterError(msg)
196 | 
197 |         # I think comparison between missing values is ambiguous,
198 |         # but again, I am open to discussion on this. Here I choose
199 |         # to return False if either value is None because from a
200 |         # logical perspective, we really cannot say anything about
201 |         # the comparison between a missing value and a non-missing.
202 |         if dv is None or fv is None:
203 |             return False
204 | 
205 |         try:
206 |             return comparator(dv, fv)
207 |         except TypeError as exc:
208 |             msg = (
209 |                 f"Cannot compare document value of {type(dv)} type "
210 |                 f"with filter value of {type(fv)} type."
211 |             )
212 |             raise FilterError(msg) from exc
213 | 
214 |     return _wrapper
215 | 
216 | 
217 | @_check_comparator_inputs
218 | def _eq(dv: Any, fv: Any) -> bool:
219 |     """
220 |     Conservative implementation of equal comparison.
221 | 
222 |     There are two major differences between this implementation
223 |     and the default Haystack filter implementation:
224 |         - If both values are None, we return False, instead of True.
225 |         - If any value is a DataFrame, we raise an error, instead
226 |             of converting them to JSON.
227 |     """
228 |     return dv == fv
229 | 
230 | 
231 | @_check_comparator_inputs
232 | def _ne(dv: Any, fv: Any) -> bool:
233 |     return not _eq(dv, fv)
234 | 
235 | 
236 | @_check_comparator_inputs
237 | def _gt(dv: Any, fv: Any) -> bool:
238 |     """
239 |     A more liberal implementation with less surprises.
240 | 
241 |     Simply compare the two values with default Python comparison.
242 |     We do not perform any conversion here to have the behavior
243 |     more predictable. If we want to compare the dates, we should
244 |     just convert the document value and filter value explicitly
245 |     to dates before comparing them.
246 |     """
247 |     return dv > fv
248 | 
249 | 
250 | @_check_comparator_inputs
251 | def _lt(dv: Any, fv: Any) -> bool:
252 |     return dv < fv
253 | 
254 | 
255 | @_check_comparator_inputs
256 | def _gte(dv: Any, fv: Any) -> bool:
257 |     return _gt(dv, fv) or _eq(dv, fv)
258 | 
259 | 
260 | @_check_comparator_inputs
261 | def _lte(dv: Any, fv: Any) -> bool:
262 |     return _lt(dv, fv) or _eq(dv, fv)
263 | 
264 | 
265 | @_check_comparator_inputs
266 | def _in(dv: Any, fv: Any) -> bool:
267 |     """
268 |     Allowing iterable filter values not just lists.
269 | 
270 |     This implementation permits a larger set of filter values
271 |     such as tuples, sets, and other iterable objects.
272 |     """
273 |     if not isinstance(fv, Iterable):
274 |         msg = "Filter value must be an iterable for 'in' comparison."
275 |         raise FilterError(msg)
276 | 
277 |     return any(_eq(dv, v) for v in fv)
278 | 
279 | 
280 | @_check_comparator_inputs
281 | def _nin(dv: Any, fv: Any) -> bool:
282 |     return not _in(dv, fv)
283 | 
284 | 
285 | LOGICAL_OPERATORS: Final = {"NOT": _not, "AND": _and, "OR": _or}
286 | 
287 | COMPARISON_OPERATORS: Final = {
288 |     "==": _eq,
289 |     "!=": _ne,
290 |     ">": _gt,
291 |     "<": _lt,
292 |     ">=": _gte,
293 |     "<=": _lte,
294 |     "in": _in,
295 |     "not in": _nin,
296 | }
297 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # SPDX-FileCopyrightText: 2023-present John Doe <jd@example.com>
2 | #
3 | # SPDX-License-Identifier: Apache-2.0
4 | 


--------------------------------------------------------------------------------
/tests/test_document_store.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
  2 | #
  3 | # SPDX-License-Identifier: Apache-2.0
  4 | import pandas as pd
  5 | import pytest
  6 | from haystack import Document
  7 | from haystack.document_stores.errors import (
  8 |     DuplicateDocumentError,
  9 |     MissingDocumentError,
 10 | )
 11 | from haystack.document_stores.types import (
 12 |     DocumentStore,
 13 |     DuplicatePolicy,
 14 | )
 15 | from haystack.errors import FilterError
 16 | from haystack.testing.document_store import (
 17 |     DocumentStoreBaseTests,
 18 | )
 19 | 
 20 | from bbm25_haystack.bbm25_store import BetterBM25DocumentStore
 21 | 
 22 | 
 23 | @pytest.mark.integration
 24 | class TestDocumentStore(DocumentStoreBaseTests):
 25 |     """Common test cases will be provided by `DocumentStoreBaseTests`."""
 26 | 
 27 |     @pytest.fixture
 28 |     def document_store(self) -> BetterBM25DocumentStore:
 29 |         return BetterBM25DocumentStore()
 30 | 
 31 |     @pytest.fixture
 32 |     def document_store_bbm25_filter(self) -> BetterBM25DocumentStore:
 33 |         return BetterBM25DocumentStore(haystack_filter_logic=False)
 34 | 
 35 |     def test_write_documents(self, document_store: DocumentStore):
 36 |         docs = [Document(id="1")]
 37 |         assert document_store.write_documents(docs) == 1
 38 |         with pytest.raises(DuplicateDocumentError):
 39 |             document_store.write_documents(docs, DuplicatePolicy.FAIL)
 40 | 
 41 |         document_store.write_documents(
 42 |             [Document(id="1"), Document(id="2")], DuplicatePolicy.OVERWRITE
 43 |         )
 44 |         assert document_store.count_documents() == 2
 45 | 
 46 |     def test_delete_documents_empty_document_store(self, document_store):
 47 |         """
 48 |         This is different from the original implementation.
 49 | 
 50 |         One expects a MissingDocumentError to be raised when deleting a
 51 |         non-existing document, which is more intuitive.
 52 |         """
 53 |         with pytest.raises(MissingDocumentError):
 54 |             document_store.delete_documents(["non_existing_id"])
 55 | 
 56 |     def test_delete_documents_non_existing_document(self, document_store):
 57 |         """
 58 |         This is different from the original implementation.
 59 | 
 60 |         One expects a MissingDocumentError to be raised when deleting a
 61 |         non-existing document, which is more intuitive.
 62 |         """
 63 |         document_store.write_documents([Document(id="42")])
 64 |         with pytest.raises(MissingDocumentError):
 65 |             document_store.delete_documents(["non_existing_id"])
 66 | 
 67 |         assert document_store.count_documents() == 1
 68 | 
 69 |     def test_bm25_retrieval(self, document_store):
 70 |         docs = [
 71 |             Document(content="Hello world"),
 72 |             Document(content="Haystack supports multiple languages"),
 73 |         ]
 74 |         document_store.write_documents(docs)
 75 | 
 76 |         results = document_store._retrieval(query="What languages?", top_k=1)
 77 | 
 78 |         assert len(results) == 1
 79 |         assert results[0][0].content == "Haystack supports multiple languages"
 80 | 
 81 |     # Override a few filter test cases to account for new comparison logic
 82 |     # Specifically, we alter the expected behavior when comparison involves
 83 |     # None, DataFrame, and Iterables.
 84 |     def test_comparison_equal_with_none_bbm25_filter(
 85 |         self, document_store_bbm25_filter, filterable_docs
 86 |     ):
 87 |         document_store_bbm25_filter.write_documents(filterable_docs)
 88 |         result = document_store_bbm25_filter.filter_documents(
 89 |             filters={"field": "meta.number", "operator": "==", "value": None}
 90 |         )
 91 |         self.assert_documents_are_equal(result, [])
 92 | 
 93 |     def test_comparison_not_equal_with_none_bbm25_filter(
 94 |         self, document_store_bbm25_filter, filterable_docs
 95 |     ):
 96 |         document_store_bbm25_filter.write_documents(filterable_docs)
 97 |         result = document_store_bbm25_filter.filter_documents(
 98 |             filters={"field": "meta.number", "operator": "!=", "value": None}
 99 |         )
100 |         self.assert_documents_are_equal(result, [])
101 | 
102 |     def test_comparison_not_equal_bbm25_filter(
103 |         self, document_store_bbm25_filter, filterable_docs
104 |     ):
105 |         """Comparison with missing values will always return False.
106 |         So the ground truth is that we should only return documents
107 |         with a non-missing value."""
108 |         document_store_bbm25_filter.write_documents(filterable_docs)
109 |         result = document_store_bbm25_filter.filter_documents(
110 |             {"field": "meta.number", "operator": "!=", "value": 100}
111 |         )
112 |         self.assert_documents_are_equal(
113 |             result,
114 |             [
115 |                 d
116 |                 for d in filterable_docs
117 |                 if d.meta.get("number") != 100 and "number" in d.meta
118 |             ],
119 |         )
120 | 
121 |     def test_comparison_not_in_bbm25_filter(
122 |         self, document_store_bbm25_filter, filterable_docs
123 |     ):
124 |         """Similar to the test above."""
125 |         document_store_bbm25_filter.write_documents(filterable_docs)
126 |         result = document_store_bbm25_filter.filter_documents(
127 |             {"field": "meta.number", "operator": "not in", "value": [9, 10]}
128 |         )
129 |         self.assert_documents_are_equal(
130 |             result,
131 |             [
132 |                 d
133 |                 for d in filterable_docs
134 |                 if d.meta.get("number") not in [9, 10] and "number" in d.meta
135 |             ],
136 |         )
137 | 
138 |     def test_comparison_equal_with_dataframe_bbm25_filter(
139 |         self, document_store_bbm25_filter, filterable_docs
140 |     ):
141 |         document_store_bbm25_filter.write_documents(filterable_docs)
142 |         with pytest.raises(FilterError):
143 |             _ = document_store_bbm25_filter.filter_documents(
144 |                 filters={
145 |                     "field": "dataframe",
146 |                     "operator": "==",
147 |                     "value": pd.DataFrame([1]),
148 |                 }
149 |             )
150 | 
151 |     def test_comparison_not_equal_with_dataframe_bbm25_filter(
152 |         self, document_store_bbm25_filter, filterable_docs
153 |     ):
154 |         document_store_bbm25_filter.write_documents(filterable_docs)
155 |         with pytest.raises(FilterError):
156 |             _ = document_store_bbm25_filter.filter_documents(
157 |                 filters={
158 |                     "field": "dataframe",
159 |                     "operator": "==",
160 |                     "value": pd.DataFrame([1]),
161 |                 }
162 |             )
163 | 


--------------------------------------------------------------------------------
/tests/test_retriever.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: 2024-present Guest400123064 <wangy49@seas.upenn.edu>
  2 | #
  3 | # SPDX-License-Identifier: Apache-2.0
  4 | from typing import Any
  5 | 
  6 | import pytest
  7 | from haystack import DeserializationError, Pipeline
  8 | from haystack.dataclasses import Document
  9 | from haystack.testing.factory import document_store_class
 10 | 
 11 | from bbm25_haystack.bbm25_retriever import BetterBM25Retriever
 12 | from bbm25_haystack.bbm25_store import BetterBM25DocumentStore
 13 | 
 14 | 
 15 | @pytest.fixture()
 16 | def mock_docs():
 17 |     return [
 18 |         Document(content="Javascript is a popular programming language"),
 19 |         Document(content="Java is a popular programming language"),
 20 |         Document(content="Python is a popular programming language"),
 21 |         Document(content="Ruby is a popular programming language"),
 22 |         Document(content="PHP is a popular programming language"),
 23 |     ]
 24 | 
 25 | 
 26 | class TestRetriever:
 27 |     def test_init_default(self):
 28 |         retriever = BetterBM25Retriever(BetterBM25DocumentStore())
 29 |         assert retriever.filters is None
 30 |         assert retriever.top_k == 10
 31 | 
 32 |     def test_init_with_parameters(self):
 33 |         retriever = BetterBM25Retriever(
 34 |             BetterBM25DocumentStore(), filters={"name": "test.txt"}, top_k=5
 35 |         )
 36 |         assert retriever.filters == {"name": "test.txt"}
 37 |         assert retriever.top_k == 5
 38 | 
 39 |     def test_init_with_invalid_top_k_parameter(self):
 40 |         with pytest.raises(ValueError):
 41 |             BetterBM25Retriever(BetterBM25DocumentStore(), top_k=-2)
 42 | 
 43 |         with pytest.raises(TypeError):
 44 |             BetterBM25Retriever(BetterBM25DocumentStore(), top_k="2")
 45 | 
 46 |     def test_init_with_invalid_filters_parameter(self):
 47 |         with pytest.raises(TypeError):
 48 |             BetterBM25Retriever(BetterBM25DocumentStore(), filters="invalid")
 49 | 
 50 |     def test_to_dict(self):
 51 |         store_class = document_store_class(
 52 |             "MyFakeStore", bases=(BetterBM25DocumentStore,)
 53 |         )
 54 |         document_store = store_class()
 55 |         document_store.to_dict = lambda: {
 56 |             "type": "MyFakeStore",
 57 |             "init_parameters": {},
 58 |         }
 59 |         component = BetterBM25Retriever(document_store=document_store)
 60 | 
 61 |         data = component.to_dict()
 62 |         assert data == {
 63 |             "type": "bbm25_haystack.bbm25_retriever.BetterBM25Retriever",
 64 |             "init_parameters": {
 65 |                 "document_store": {
 66 |                     "type": "MyFakeStore",
 67 |                     "init_parameters": {},
 68 |                 },
 69 |                 "filters": None,
 70 |                 "top_k": 10,
 71 |                 "set_score": True,
 72 |             },
 73 |         }
 74 | 
 75 |     def test_to_dict_with_custom_init_parameters(self):
 76 |         ds = BetterBM25DocumentStore()
 77 |         serialized_ds = ds.to_dict()
 78 | 
 79 |         component = BetterBM25Retriever(
 80 |             document_store=BetterBM25DocumentStore(),
 81 |             filters={"name": "test.txt"},
 82 |             top_k=5,
 83 |             set_score=False,
 84 |         )
 85 |         data = component.to_dict()
 86 |         assert data == {
 87 |             "type": "bbm25_haystack.bbm25_retriever.BetterBM25Retriever",
 88 |             "init_parameters": {
 89 |                 "document_store": serialized_ds,
 90 |                 "filters": {"name": "test.txt"},
 91 |                 "top_k": 5,
 92 |                 "set_score": False,
 93 |             },
 94 |         }
 95 | 
 96 |     def test_from_dict(self):
 97 |         data = {
 98 |             "type": "bbm25_haystack.bbm25_retriever.BetterBM25Retriever",
 99 |             "init_parameters": {
100 |                 "document_store": {
101 |                     "type": "bbm25_haystack.bbm25_store.BetterBM25DocumentStore",
102 |                     "init_parameters": {},
103 |                 },
104 |                 "filters": {"name": "test.txt"},
105 |                 "top_k": 5,
106 |             },
107 |         }
108 |         component = BetterBM25Retriever.from_dict(data)
109 |         assert isinstance(component.document_store, BetterBM25DocumentStore)
110 |         assert component.filters == {"name": "test.txt"}
111 |         assert component.top_k == 5
112 | 
113 |     def test_from_dict_without_docstore(self):
114 |         data = {"type": "BetterBM25Retriever", "init_parameters": {}}
115 |         with pytest.raises(
116 |             DeserializationError,
117 |             match="Missing 'document_store' in serialization data",
118 |         ):
119 |             BetterBM25Retriever.from_dict(data)
120 | 
121 |     def test_from_dict_without_docstore_type(self):
122 |         data = {
123 |             "type": "BetterBM25Retriever",
124 |             "init_parameters": {"document_store": {"init_parameters": {}}},
125 |         }
126 |         with pytest.raises(
127 |             DeserializationError,
128 |             match="Missing 'type' in document store's serialization data",
129 |         ):
130 |             BetterBM25Retriever.from_dict(data)
131 | 
132 |     def test_from_dict_nonexisting_docstore(self):
133 |         data = {
134 |             "type": "bbm25_haystack.BetterBM25Retriever",
135 |             "init_parameters": {
136 |                 "document_store": {
137 |                     "type": "Nonexisting.Docstore",
138 |                     "init_parameters": {},
139 |                 }
140 |             },
141 |         }
142 |         with pytest.raises(DeserializationError):
143 |             BetterBM25Retriever.from_dict(data)
144 | 
145 |     def test_retriever_valid_run(self, mock_docs):
146 |         ds = BetterBM25DocumentStore()
147 |         ds.write_documents(mock_docs)
148 | 
149 |         retriever = BetterBM25Retriever(ds, top_k=5)
150 |         result = retriever.run(query="PHP")
151 | 
152 |         assert "documents" in result
153 |         assert len(result["documents"]) == 5
154 |         assert (
155 |             result["documents"][0].content == "PHP is a popular programming language"
156 |         )
157 | 
158 |     def test_invalid_run_wrong_store_type(self):
159 |         store_class = document_store_class("SomeOtherDocumentStore")
160 |         with pytest.raises(
161 |             TypeError,
162 |             match="'document_store' must be of type 'BetterBM25DocumentStore'",
163 |         ):
164 |             BetterBM25Retriever(store_class())
165 | 
166 |     @pytest.mark.integration
167 |     @pytest.mark.parametrize(
168 |         "query, query_result",
169 |         [
170 |             ("Javascript", "Javascript is a popular programming language"),
171 |             ("Java", "Java is a popular programming language"),
172 |         ],
173 |     )
174 |     def test_run_with_pipeline(self, mock_docs, query: str, query_result: str):
175 |         ds = BetterBM25DocumentStore()
176 |         ds.write_documents(mock_docs)
177 |         retriever = BetterBM25Retriever(ds)
178 | 
179 |         pipeline = Pipeline()
180 |         pipeline.add_component("retriever", retriever)
181 |         result: dict[str, Any] = pipeline.run(data={"retriever": {"query": query}})
182 | 
183 |         assert result
184 |         assert "retriever" in result
185 |         results_docs = result["retriever"]["documents"]
186 |         assert results_docs
187 |         assert results_docs[0].content == query_result
188 | 
189 |     @pytest.mark.integration
190 |     @pytest.mark.parametrize(
191 |         "query, query_result, top_k",
192 |         [
193 |             ("Javascript", "Javascript is a popular programming language", 1),
194 |             ("Java", "Java is a popular programming language", 2),
195 |             ("Ruby", "Ruby is a popular programming language", 3),
196 |         ],
197 |     )
198 |     def test_run_with_pipeline_and_top_k(
199 |         self, mock_docs, query: str, query_result: str, top_k: int
200 |     ):
201 |         ds = BetterBM25DocumentStore()
202 |         ds.write_documents(mock_docs)
203 |         retriever = BetterBM25Retriever(ds)
204 | 
205 |         pipeline = Pipeline()
206 |         pipeline.add_component("retriever", retriever)
207 |         result: dict[str, Any] = pipeline.run(
208 |             data={"retriever": {"query": query, "top_k": top_k}}
209 |         )
210 | 
211 |         assert result
212 |         assert "retriever" in result
213 |         results_docs = result["retriever"]["documents"]
214 |         assert results_docs
215 |         assert len(results_docs) == top_k
216 |         assert results_docs[0].content == query_result
217 | 


--------------------------------------------------------------------------------