├── .github
└── workflows
│ ├── docs.yml
│ ├── release.yml
│ └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
└── .gitkeep
├── docs
├── bbm25_haystack.html
├── bbm25_haystack
│ ├── __about__.html
│ ├── bbm25_retriever.html
│ ├── bbm25_store.html
│ └── filters.html
├── index.html
└── search.js
├── pyproject.toml
├── scripts
└── benchmark_beir.py
├── src
└── bbm25_haystack
│ ├── __about__.py
│ ├── __init__.py
│ ├── bbm25_retriever.py
│ ├── bbm25_store.py
│ ├── default.model
│ └── filters.py
└── tests
├── __init__.py
├── test_document_store.py
└── test_retriever.py
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
1 | name: website
2 |
3 | # Build and publish the documentation whenever there are new commits on main.
4 | on:
5 | push:
6 | branches:
7 | - main
8 | # Alternative: only build for tags.
9 | # tags:
10 | # - '*'
11 |
12 | # Security: restrict the default token permissions for all jobs to read-only.
13 | permissions:
14 | contents: read
15 |
16 | jobs:
17 | # Build the documentation and upload the static HTML files as an artifact.
18 | build:
19 | runs-on: ubuntu-latest
20 | steps:
21 | - uses: actions/checkout@v4
22 | - uses: actions/setup-python@v5
23 | with:
24 | python-version: '3.9'
25 |
26 | # Install the package itself (editable) and pdoc, the documentation generator.
27 | - run: pip install -e .
28 | - run: pip install pdoc
29 |
30 | # Render the API documentation of src/bbm25_haystack into docs/.
31 | # Docstrings are parsed as reStructuredText (see --docformat).
32 | - run: pdoc src/bbm25_haystack -o docs --docformat restructuredtext
33 |
34 | - uses: actions/upload-pages-artifact@v3
35 | with:
36 | path: docs/
37 |
38 | # Deploy the artifact to GitHub pages.
39 | # This is a separate job so that only actions/deploy-pages has the necessary permissions.
40 | deploy:
41 | needs: build
42 | runs-on: ubuntu-latest
43 | permissions:
44 | pages: write
45 | id-token: write
46 | environment:
47 | name: github-pages
48 | url: ${{ steps.deployment.outputs.page_url }}
49 | steps:
50 | - id: deployment
51 | uses: actions/deploy-pages@v4
52 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
name: Release

# Build and publish the package whenever a version tag (e.g. v1.2.3) is pushed.
on:
  push:
    tags:
      # `+` after every numeric group so multi-digit components
      # (including the major version, e.g. v10.0.0) also match.
      - "v[0-9]+.[0-9]+.[0-9]+*"

# Security: the job only reads the repository; publishing authenticates
# with the PyPI token passed to the publish step below.
permissions:
  contents: read

jobs:
  release-on-pypi:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Provision an explicit interpreter instead of relying on whatever
      # Python happens to be preinstalled on the runner image.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install Hatch
        run: pip install hatch

      - name: Build
        run: hatch build

      - name: Publish on PyPi
        env:
          HATCH_INDEX_USER: __token__
          HATCH_INDEX_AUTH: ${{ secrets.PYPI_API_TOKEN }}
        run: hatch publish -y
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
# This workflow comes from https://github.com/ofek/hatch-mypyc
# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml
name: test

# Run on every pull request and on every push to main.
on:
  push:
    branches:
      - main
  pull_request:

# Runs that share a group cancel the one still in progress. The group is
# keyed on the head ref of the triggering event.
concurrency:
  group: test-${{ github.head_ref }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  run:
    name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let every OS/Python combination finish even if one of them fails.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        # Quoted so that '3.10' is not parsed as the float 3.1.
        python-version: ['3.9', '3.10', '3.11', '3.12']

    steps:
      - name: Support longpaths
        if: matrix.os == 'windows-latest'
        run: git config --system core.longpaths true

      # Same action major versions as the docs workflow.
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install Hatch
        run: pip install --upgrade hatch

      # Linting is run once, on a single matrix entry.
      - name: Lint
        if: matrix.python-version == '3.9' && runner.os == 'Linux'
        run: hatch run lint:all

      - name: Run tests
        run: hatch run cov

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v4.0.1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          slug: Guest400123064/bbm25-haystack
56 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
162 | # VS Code
163 | .vscode
164 |
165 | # Benchmarking datasets
166 | benchmarks/beir/*
167 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://github.com/Guest400123064/bbm25-haystack/actions/workflows/test.yml)
2 | [](https://codecov.io/gh/Guest400123064/bbm25-haystack)
3 | [](https://github.com/psf/black)
4 | [](https://github.com/python/mypy)
5 | [](https://www.python.org/downloads/release/python-390/)
6 |
7 | # Better BM25 In-Memory Document Store
8 |
9 | An in-memory document store is a great starting point for prototyping and debugging before migrating to production-grade stores like Elasticsearch. However, [the original implementation](https://github.com/deepset-ai/haystack/blob/0dbb98c0a017b499560521aa93186d0640aab659/haystack/document_stores/in_memory/document_store.py#L148) of BM25 retrieval recreates an inverted index for the entire document store __on every new search__. Furthermore, the tokenization method is primitive, only permitting splitters based on regular expressions, making localization and domain adaptation challenging. Therefore, this implementation is a slight upgrade to the default BM25 in-memory document store: it implements incremental index updates and incorporates [SentencePiece](https://github.com/google/sentencepiece) statistical sub-word tokenization.
10 |
11 | ## Installation
12 |
13 | ```bash
14 | $ pip install bbm25-haystack
15 | ```
16 |
17 | Alternatively, you can clone the repository and build from source to be able to reflect changes to the source code:
18 |
19 | ```bash
20 | $ git clone https://github.com/Guest400123064/bbm25-haystack.git
21 | $ cd bbm25-haystack
22 | $ pip install -e .
23 | ```
24 |
25 | ## Usage
26 |
27 | ### Quick Start
28 |
29 | Below is an example of how you can build a minimal search engine with the `bbm25_haystack` components on their own. They are also compatible with [Haystack pipelines](https://docs.haystack.deepset.ai/docs/creating-pipelines).
30 |
31 | ```python
32 | from haystack import Document
33 | from bbm25_haystack import BetterBM25DocumentStore, BetterBM25Retriever
34 |
35 |
36 | document_store = BetterBM25DocumentStore()
37 | document_store.write_documents([
38 | Document(content="There are over 7,000 languages spoken around the world today."),
39 | Document(content="Elephants have been observed to behave in a way that indicates a high level of self-awareness, such as recognizing themselves in mirrors."),
40 | Document(content="In certain parts of the world, like the Maldives, Puerto Rico, and San Diego, you can witness the phenomenon of bio-luminescent waves.")
41 | ])
42 |
43 | retriever = BetterBM25Retriever(document_store)
44 | retriever.run(query="How many languages are spoken around the world today?")
45 | ```
46 |
47 | ### API References
48 |
49 | You can find the full API references [here](https://guest400123064.github.io/bbm25-haystack/). In a hurry? Below are some of the most important document store parameters you might want to explore:
50 |
51 | - `k, b, delta` - the [three BM25+ hyperparameters](https://en.wikipedia.org/wiki/Okapi_BM25).
52 | - `sp_file` - a path to a trained SentencePiece tokenizer `.model` file. The default tokenizer is directly copied from [LLaMA-2-7B-32K tokenizer](https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/tokenizer.model) with a vocab size of 32,000.
53 | - `n_grams` - defaults to 1, which means text (both query and document) is tokenized into uni-grams. If set to 2, the tokenizer also augments the list of uni-grams with bi-grams, and so on. If specified as a tuple, e.g., (2, 3), the tokenizer only produces bi-grams and tri-grams, without any uni-grams.
54 | - `haystack_filter_logic` - see [below](#filtering-logic).
55 |
56 | The retriever parameters are largely the same as [`InMemoryBM25Retriever`](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever).
57 |
58 | ## Filtering Logic
59 |
60 | The current document store uses [`document_matches_filter`](https://github.com/deepset-ai/haystack/blob/main/haystack/utils/filters.py) shipped with Haystack to perform filtering by default, which is the same as [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore).
61 |
62 | However, there is also an alternative filtering logic shipped with this implementation (unstable at this point). To use this alternative logic, initialize the document store with `haystack_filter_logic=False`. Please find comments and implementation details in [`filters.py`](./src/bbm25_haystack/filters.py). TL;DR:
63 |
64 | - Any comparison involving `None`, i.e., a missing value, always returns `False`, regardless of whether it is the document attribute value or the filter value that is missing.
65 | - Comparison with `pandas.DataFrame` is always prohibited to reduce surprises.
66 | - No implicit `datetime` conversion from string values.
67 | - `in` and `not in` allow any `Iterable` as the filter value, without the `list` constraint.
68 | - Custom comparison functions are allowed for more flexibility. Note that the inputs to a custom comparison function are NEVER checked, i.e., no missing value check, no `DataFrame` check, etc. Users should ensure that the input values are valid and that the return value is always a boolean. The inputs are always supplied in the order of document value and then filter value.
69 |
70 | In this case, the negation logic needs to be considered again because `False` can now issue from both input nullity check and the actual comparisons. For instance, `in` and `not in` both yield non-matching upon missing values. But I think having input processing and comparisons separated makes the filtering behavior more transparent.
71 |
72 | ## Search Quality Evaluation
73 |
74 | This repo has [a simple script](./scripts/benchmark_beir.py) to help evaluate the search quality over [BEIR](https://github.com/beir-cellar/beir/tree/main) benchmark. You need to clone the repository (you can also manually download the script and place it under a folder named `scripts`) and you have to install additional dependencies to run the script.
75 |
76 | ```bash
77 | $ pip install beir
78 | ```
79 |
80 | To run the script, you may want to specify the dataset name and BM25 hyperparameters. For example:
81 |
82 | ```bash
83 | $ python scripts/benchmark_beir.py --datasets scifact arguana --bm25-k1 1.2 --n-grams 2 --output eval.csv
84 | ```
85 |
86 | It automatically downloads the benchmarking dataset to `benchmarks/beir`, where `benchmarks` is at the same level as `scripts`. You may also check the help page for more information.
87 |
88 | ```bash
89 | $ python scripts/benchmark_beir.py --help
90 | ```
91 |
92 | New benchmarking scripts are expected to be added in the future.
93 |
94 | ## License
95 |
96 | `bbm25-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license.
97 |
--------------------------------------------------------------------------------
/benchmarks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Guest400123064/bbm25-haystack/9906fa27ffc54f4fd92dfb5d717c15a12a69df0a/benchmarks/.gitkeep
--------------------------------------------------------------------------------
/docs/bbm25_haystack/__about__.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
1# SPDX-FileCopyrightText: 2024-present Yuxuan Wang <wangy49@seas.upenn.edu>
64 | 2#
65 | 3# SPDX-License-Identifier: Apache-2.0
66 | 4fromcollections.abcimportIterable
67 | 5fromfunctoolsimportwraps
68 | 6fromtypingimportAny,Callable,Final,Optional
69 | 7
70 | 8importpandasaspd
71 | 9fromhaystack.dataclassesimportDocument
72 | 10fromhaystack.errorsimportFilterError
73 | 11
74 | 12
def apply_filters_to_document(
    filters: Optional[dict[str, Any]], document: Document
) -> bool:
    """
    Decide whether a document satisfies a filter specification.

    A missing (``None``) or empty filter places no constraint on the
    document, so every document passes it.

    :param filters: The filter condition to evaluate; may be ``None`` or empty.
    :type filters: Optional[dict[str, Any]]
    :param document: The document to evaluate the filter against.
    :type document: Document

    :return: True if the document passes the filters.
    :rtype: bool
    """
    # ``None`` and an empty mapping are both falsy, so one truthiness
    # test covers the "no filter supplied" cases.
    return True if not filters else _run_comparison_condition(filters, document)
92 | 30
93 | 31
94 | 32def_get_document_field(document:Document,field:str)->Optional[Any]:
95 | 33"""
96 | 34 Get the value of a field in a document.
97 | 35
98 | 36 If the field is not found within the document then, instead of
99 | 37 raising an error, `None` is returned. Note that here we do not
100 | 38 implicitly add 'meta' prefix for fields that are not a direct
101 | 39 attribute of the document, not supporting legacy behavior anymore.
102 | 40
103 | 41 :param document: The document to get the field value from.
104 | 42 :type document: Document
105 | 43 :param field: The field to get the value of.
106 | 44 :type field: str
107 | 45
108 | 46 :return: The value of the field in the document.
109 | 47 :rtype: Optional[Any]
110 | 48 """
111 | 49if"."notinfield:
112 | 50returngetattr(document,field)
113 | 51
114 | 52attr=document.meta
115 | 53forfinfield.split(".")[1:]:
116 | 54attr=attr.get(f)
117 | 55ifattrisNone:
118 | 56returnNone
119 | 57returnattr
120 | 58
121 | 59
def _run_logical_condition(condition: dict[str, Any], document: Document) -> bool:
    """
    Evaluate a logical (compound) condition against a document.

    The reducer registered in ``LOGICAL_OPERATORS`` under the
    condition's 'operator' is called with the document and the list
    found under 'conditions'.

    :param condition: The logical condition to evaluate.
    :type condition: dict[str, Any]
    :param document: The document to check the condition against.
    :type document: Document

    :raises FilterError: If the 'operator' or 'conditions' key is missing.

    :return: The result produced by the selected logical operator.
    :rtype: bool
    """
    # Keys are validated in this order so the first missing one is reported.
    required_keys = (
        ("operator", "Logical condition must have an 'operator' key."),
        ("conditions", "Logical condition must have a 'conditions' key."),
    )
    for key, msg in required_keys:
        if key not in condition:
            raise FilterError(msg)

    combine = LOGICAL_OPERATORS[condition["operator"]]
    return combine(document, condition["conditions"])
134 | 72
135 | 73
def _run_comparison_condition(condition: dict[str, Any], document: Document) -> bool:
    """
    Evaluate a single filter condition against a document.

    A condition without a 'field' key is handed over to
    ``_run_logical_condition``. Otherwise the document value found
    under 'field' is compared with the condition's 'value' by the
    comparator registered in ``COMPARISON_OPERATORS`` under 'operator'.

    :param condition: The condition to evaluate.
    :type condition: dict[str, Any]
    :param document: The document to check the condition against.
    :type document: Document

    :raises FilterError: If a comparison condition lacks the 'operator'
        or 'value' key.

    :return: The result of the comparison (or of the logical condition).
    :rtype: bool
    """
    if "field" not in condition:
        return _run_logical_condition(condition, document)

    # Keys are validated in this order so the first missing one is reported.
    required_keys = (
        ("operator", "Comparison condition must have an 'operator' key."),
        ("value", "Comparison condition must have a 'value' key."),
    )
    for key, msg in required_keys:
        if key not in condition:
            raise FilterError(msg)

    compare = COMPARISON_OPERATORS[condition["operator"]]
    document_value = _get_document_field(document, condition["field"])
    # Comparators always receive the document value first, then the filter value.
    return compare(document_value, condition["value"])
152 | 90
153 | 91
154 | 92def_and(document:Document,conditions:list[dict[str,Any]])->bool:
155 | 93"""
156 | 94 Return True if all conditions are met.
157 | 95
158 | 96 :param document: The document to check the conditions against.
159 | 97 :type document: Document
160 | 98 :param conditions: The conditions to check against the document.
161 | 99 :type conditions: list[dict[str, Any]]
162 | 100
163 | 101 :return: True if not all conditions are met.
164 | 102 :rtype: bool
165 | 103 """
166 | 104returnall(
167 | 105_run_comparison_condition(condition,document)forconditioninconditions
168 | 106)
169 | 107
170 | 108
171 | 109def_or(document:Document,conditions:list[dict[str,Any]])->bool:
172 | 110"""
173 | 111 Return True if any condition is met.
174 | 112
175 | 113 :param document: The document to check the conditions against.
176 | 114 :type document: Document
177 | 115 :param conditions: The conditions to check against the document.
178 | 116 :type conditions: list[dict[str, Any]]
179 | 117
180 | 118 :return: True if not all conditions are met.
181 | 119 :rtype: bool
182 | 120 """
183 | 121returnany(_run_comparison_condition(cond,document)forcondinconditions)
184 | 122
185 | 123
def _not(document: Document, conditions: list[dict[str, Any]]) -> bool:
    """
    Logical 'NOT': succeed when the conjunction of the conditions fails.

    'NOT' over a *set* of conditions is under-specified: it could mean
    'at least one condition is False' or 'every condition is False'.
    To stay compatible with the official Haystack implementation, the
    'at least one False' reading is used, i.e. this is the negation
    of ``_and``.

    :param document: The document to check the conditions against.
    :type document: Document
    :param conditions: The conditions to check against the document.
    :type conditions: list[dict[str, Any]]

    :return: True if not all conditions are met.
    :rtype: bool
    """
    all_conditions_hold = _and(document, conditions)
    return not all_conditions_hold
205 | 143
206 | 144
207 | 145def_check_comparator_inputs(
208 | 146comparator:Callable[[Any,Any],bool]
209 | 147)->Callable[[Any,Any],bool]:
210 | 148"""
211 | 149 A decorator to check and preprocess input attribute values.
212 | 150
213 | 151 ALL COMPARISON OPERATORS SHOULD BE WRAPPED WITH THIS DECORATOR.
214 | 152 because a `False` may be returned by both input validation and
215 | 153 the actual comparison. This decorator ensures that the comparison
216 | 154 function is only called if the input values are valid.
217 | 155
218 | 156 :param comparator: The comparator function to wrap.
219 | 157 :type comparator: Callable[[Any, Any], bool]
220 | 158
221 | 159 :return: The wrapped comparator function.
222 | 160 :rtype: Callable[[Any, Any], bool]
223 | 161 """
224 | 162
225 | 163@wraps(comparator)
226 | 164def_wrapper(dv:Any,fv:Any)->bool:
227 | 165
228 | 166# I think allowing comparison between DataFrames would
229 | 167# be a really bad idea because it would create unexpected
230 | 168# behavior, but I am open to discussion on this.
231 | 169ifisinstance(dv,pd.DataFrame)orisinstance(fv,pd.DataFrame):
232 | 170msg=(
233 | 171"Cannot compare DataFrames. Please convert them to "
234 | 172"simpler data structures before comparing."
235 | 173)
236 | 174raiseFilterError(msg)
237 | 175
238 | 176# I think comparison between missing values is ambiguous,
239 | 177# but again, I am open to discussion on this. Here I choose
240 | 178# to return False if either value is None because from a
241 | 179# logical perspective, we really cannot say anything about
242 | 180# the comparison between a missing value and a non-missing.
243 | 181ifdvisNoneorfvisNone:
244 | 182returnFalse
245 | 183
246 | 184try:
247 | 185returncomparator(dv,fv)
248 | 186exceptTypeErrorasexc:
249 | 187msg=(
250 | 188f"Cannot compare document value of {type(dv)} type "
251 | 189f"with filter value of {type(fv)} type."
252 | 190)
253 | 191raiseFilterError(msg)fromexc
254 | 192
255 | 193return_wrapper
256 | 194
257 | 195
@_check_comparator_inputs
def _eq(dv: Any, fv: Any) -> bool:
    """
    Conservative equality comparison.

    Compared with the default Haystack filter implementation there are
    two major differences:
    - Two None values compare as False, not True.
    - A DataFrame on either side raises an error instead of being
      converted to JSON.

    :param dv: The document value.
    :type dv: Any
    :param fv: The filter value.
    :type fv: Any

    :return: The result of ``dv == fv``.
    :rtype: bool
    """
    are_equal = dv == fv
    return are_equal
270 | 208
271 | 209
@_check_comparator_inputs
def _ne(dv: Any, fv: Any) -> bool:
    """
    Inequality comparison, defined as the negation of ``_eq``.

    :param dv: The document value.
    :type dv: Any
    :param fv: The filter value.
    :type fv: Any

    :return: True if ``_eq`` reports the values as not equal.
    :rtype: bool
    """
    are_equal = _eq(dv, fv)
    return not are_equal
275 | 213
276 | 214
@_check_comparator_inputs
def _gt(dv: Any, fv: Any) -> bool:
    """
    Liberal 'greater than' comparison with fewer surprises.

    The two values are compared with plain Python ``>`` and no implicit
    conversion is performed, which keeps the behavior predictable. To
    compare dates, convert both the document value and the filter value
    to dates explicitly before filtering.

    :param dv: The document value.
    :type dv: Any
    :param fv: The filter value.
    :type fv: Any

    :return: The result of ``dv > fv``.
    :rtype: bool
    """
    is_greater = dv > fv
    return is_greater
289 | 227
290 | 228
@_check_comparator_inputs
def _lt(dv: Any, fv: Any) -> bool:
    """
    'Less than' comparison using plain Python ``<`` without conversion.

    :param dv: The document value.
    :type dv: Any
    :param fv: The filter value.
    :type fv: Any

    :return: The result of ``dv < fv``.
    :rtype: bool
    """
    is_smaller = dv < fv
    return is_smaller
294 | 232
295 | 233
@_check_comparator_inputs
def _gte(dv: Any, fv: Any) -> bool:
    """
    'Greater than or equal' comparison built from ``_gt`` and ``_eq``.

    ``_gt`` is tried first; ``_eq`` is only consulted when ``_gt`` is
    not satisfied.

    :param dv: The document value.
    :type dv: Any
    :param fv: The filter value.
    :type fv: Any

    :return: True if the document value is greater than or equal to
        the filter value.
    :rtype: bool
    """
    strictly_greater = _gt(dv, fv)
    return strictly_greater if strictly_greater else _eq(dv, fv)
299 | 237
300 | 238
@_check_comparator_inputs
def _lte(dv: Any, fv: Any) -> bool:
    """
    'Less than or equal' comparison built from ``_lt`` and ``_eq``.

    ``_lt`` is tried first; ``_eq`` is only consulted when ``_lt`` is
    not satisfied.

    :param dv: The document value.
    :type dv: Any
    :param fv: The filter value.
    :type fv: Any

    :return: True if the document value is less than or equal to the
        filter value.
    :rtype: bool
    """
    strictly_smaller = _lt(dv, fv)
    return strictly_smaller if strictly_smaller else _eq(dv, fv)
304 | 242
305 | 243
@_check_comparator_inputs
def _in(dv: Any, fv: Any) -> bool:
    """
    Membership test that accepts any iterable filter value.

    The filter value is not restricted to lists: tuples, sets and other
    iterable objects are permitted as well. Each element is compared
    with the document value through ``_eq``.

    :param dv: The document value.
    :type dv: Any
    :param fv: The iterable of candidate values.
    :type fv: Any

    :return: True if the document value equals at least one element.
    :rtype: bool

    :raises FilterError: If the filter value is not an iterable.
    """
    if not isinstance(fv, Iterable):
        msg = "Filter value must be an iterable for 'in' comparison."
        raise FilterError(msg)

    for candidate in fv:
        if _eq(dv, candidate):
            return True
    return False
319 | 257
320 | 258
@_check_comparator_inputs
def _nin(dv: Any, fv: Any) -> bool:
    """
    'Not in' comparison, defined as the negation of ``_in``.

    :param dv: The document value.
    :type dv: Any
    :param fv: The iterable of candidate values.
    :type fv: Any

    :return: True if ``_in`` reports no matching element.
    :rtype: bool
    """
    is_member = _in(dv, fv)
    return not is_member
324 | 262
325 | 263
# Logical operator name -> implementation. Each implementation is called
# with (document, conditions).
LOGICAL_OPERATORS: Final = {
    "NOT": _not,
    "AND": _and,
    "OR": _or,
}

# Comparison operator name -> implementation. Each implementation is called
# with (document value, filter value), in that order.
COMPARISON_OPERATORS: Final = {
    "==": _eq,
    "!=": _ne,
    ">": _gt,
    "<": _lt,
    ">=": _gte,
    "<=": _lte,
    "in": _in,
    "not in": _nin,
}
338 |