├── .github
│   └── workflows
│       ├── ruff.yml
│       └── tests.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── benchmark.py
├── docs
│   ├── .pages
│   ├── CNAME
│   ├── api
│   │   ├── .pages
│   │   ├── decorators
│   │   │   ├── .pages
│   │   │   ├── connect-to-duckdb.md
│   │   │   └── execute-with-duckdb.md
│   │   ├── evaluation
│   │   │   ├── .pages
│   │   │   ├── evaluate.md
│   │   │   └── load-beir.md
│   │   ├── hf
│   │   │   ├── .pages
│   │   │   └── insert-documents.md
│   │   ├── overview.md
│   │   ├── search
│   │   │   ├── .pages
│   │   │   ├── documents.md
│   │   │   ├── graphs.md
│   │   │   ├── queries.md
│   │   │   ├── search.md
│   │   │   ├── update-index-documents.md
│   │   │   └── update-index-queries.md
│   │   ├── tables
│   │   │   ├── .pages
│   │   │   ├── add-columns-documents.md
│   │   │   ├── create-documents-queries.md
│   │   │   ├── create-documents.md
│   │   │   ├── create-queries.md
│   │   │   ├── create-schema.md
│   │   │   ├── insert-documents-queries.md
│   │   │   ├── insert-documents.md
│   │   │   ├── insert-queries.md
│   │   │   ├── select-documents-columns.md
│   │   │   ├── select-documents.md
│   │   │   └── select-queries.md
│   │   ├── upload
│   │   │   ├── .pages
│   │   │   ├── documents.md
│   │   │   └── queries.md
│   │   └── utils
│   │       ├── .pages
│   │       ├── ParallelTqdm.md
│   │       ├── batchify.md
│   │       ├── generate-random-hash.md
│   │       ├── get-list-columns-df.md
│   │       └── plot.md
│   ├── benchmarks
│   │   ├── .pages
│   │   └── benchmarks.md
│   ├── css
│   │   └── version-select.css
│   ├── documentation
│   │   ├── .pages
│   │   ├── delete.md
│   │   ├── graph.md
│   │   ├── search.md
│   │   ├── update.md
│   │   └── upload.md
│   ├── img
│   │   └── logo.png
│   ├── index.md
│   ├── javascripts
│   │   ├── config.js
│   │   └── tablesort.js
│   ├── js
│   │   └── version-select.js
│   ├── parse
│   │   └── __main__.py
│   └── stylesheets
│       └── extra.css
├── ducksearch
│   ├── __init__.py
│   ├── __version__.py
│   ├── decorators
│   │   ├── __init__.py
│   │   └── execute_with_duckdb.py
│   ├── delete
│   │   ├── __init__.py
│   │   ├── delete
│   │   │   ├── documents.sql
│   │   │   ├── documents_queries.sql
│   │   │   └── scores.sql
│   │   ├── documents.py
│   │   └── update
│   │       ├── df.sql
│   │       ├── docs.sql
│   │       ├── scores.sql
│   │       ├── stats.sql
│   │       └── terms.sql
│   ├── evaluation
│   │   ├── __init__.py
│   │   └── evaluation.py
│   ├── hf
│   │   ├── __init__.py
│   │   ├── drop
│   │   │   └── tmp.sql
│   │   ├── insert.py
│   │   ├── insert
│   │   │   ├── documents.sql
│   │   │   └── tmp.sql
│   │   └── select
│   │       ├── columns.sql
│   │       ├── count.sql
│   │       └── exists.sql
│   ├── search
│   │   ├── __init__.py
│   │   ├── create.py
│   │   ├── create
│   │   │   ├── index.sql
│   │   │   ├── queries_index.sql
│   │   │   ├── settings.sql
│   │   │   ├── stopwords.sql
│   │   │   └── tables.sql
│   │   ├── drop
│   │   │   ├── _documents.sql
│   │   │   ├── queries.sql
│   │   │   ├── schema.sql
│   │   │   └── scores.sql
│   │   ├── graphs.py
│   │   ├── insert
│   │   │   ├── dict.sql
│   │   │   ├── docs.sql
│   │   │   ├── queries.sql
│   │   │   ├── settings.sql
│   │   │   └── terms.sql
│   │   ├── select.py
│   │   ├── select
│   │   │   ├── search.sql
│   │   │   ├── search_filters.sql
│   │   │   ├── search_graph.sql
│   │   │   ├── search_graph_filters.sql
│   │   │   ├── search_order_by.sql
│   │   │   ├── settings.sql
│   │   │   ├── settings_exists.sql
│   │   │   ├── stats.sql
│   │   │   └── termids_to_score.sql
│   │   └── update
│   │       ├── bm25id.sql
│   │       ├── dict.sql
│   │       ├── scores.sql
│   │       └── stats.sql
│   ├── tables
│   │   ├── __init__.py
│   │   ├── create.py
│   │   ├── create
│   │   │   ├── documents.sql
│   │   │   ├── documents_queries.sql
│   │   │   ├── queries.sql
│   │   │   └── schema.sql
│   │   ├── insert.py
│   │   ├── insert
│   │   │   ├── documents.sql
│   │   │   ├── documents_queries.sql
│   │   │   ├── fast_documents.sql
│   │   │   └── queries.sql
│   │   ├── select.py
│   │   ├── select
│   │   │   ├── columns.sql
│   │   │   ├── documents.sql
│   │   │   └── queries.sql
│   │   ├── update.py
│   │   └── update
│   │       └── documents.sql
│   ├── upload
│   │   ├── __init__.py
│   │   └── upload.py
│   └── utils
│       ├── __init__.py
│       ├── batch.py
│       ├── columns.py
│       ├── hash.py
│       ├── parralel_tqdm.py
│       ├── plot.py
│       └── plot
│           └── plot.sql
├── mkdocs.yml
├── pytest.ini
├── ruff.toml
├── setup.cfg
└── setup.py
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
1 | name: Ruff
2 | on:
3 | push:
4 | branches: [ main ]
5 | pull_request:
6 | branches: [ main ]
7 | jobs:
8 | ruff:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v2
12 | - uses: actions/setup-python@v2
13 | with:
14 | python-version: 3.9
15 | - run: pip install ruff
16 | - run: ruff check .
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Python Tests
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - '**'
7 | jobs:
8 | test:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout code
12 | uses: actions/checkout@v2
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: '3.10'
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install ".[dev]"
21 |
22 | - name: Run tests library
23 | run: |
24 | make tests
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | *.parquet
7 |
8 | *.ipynb
9 | *.duckdb
10 | duckdb_tmp/
11 | *.block
12 |
13 | evaluation_datasets/
14 |
15 | # C extensions
16 | *.so
17 |
18 | # Distribution / packaging
19 | .Python
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | wheels/
32 | share/python-wheels/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 | MANIFEST
37 |
38 | # PyInstaller
39 | # Usually these files are written by a python script from a template
40 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
41 | *.manifest
42 | *.spec
43 |
44 | # Installer logs
45 | pip-log.txt
46 | pip-delete-this-directory.txt
47 |
48 | # Unit test / coverage reports
49 | htmlcov/
50 | .tox/
51 | .nox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | *.py,cover
59 | .hypothesis/
60 | .pytest_cache/
61 | cover/
62 |
63 | # Translations
64 | *.mo
65 | *.pot
66 |
67 | # Django stuff:
68 | *.log
69 | local_settings.py
70 | db.sqlite3
71 | db.sqlite3-journal
72 |
73 | # Flask stuff:
74 | instance/
75 | .webassets-cache
76 |
77 | # Scrapy stuff:
78 | .scrapy
79 |
80 | # Sphinx documentation
81 | docs/_build/
82 |
83 | # PyBuilder
84 | .pybuilder/
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | # For a library or package, you might want to ignore these files since the code is
96 | # intended to run in multiple environments; otherwise, check them in:
97 | # .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # poetry
107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more
109 | # commonly ignored for libraries.
110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 |
113 | # pdm
114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | # in version control.
118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119 | .pdm.toml
120 | .pdm-python
121 | .pdm-build/
122 |
123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124 | __pypackages__/
125 |
126 | # Celery stuff
127 | celerybeat-schedule
128 | celerybeat.pid
129 |
130 | # SageMath parsed files
131 | *.sage.py
132 |
133 | # Environments
134 | .env
135 | .venv
136 | env/
137 | venv/
138 | ENV/
139 | env.bak/
140 | venv.bak/
141 |
142 | # Spyder project settings
143 | .spyderproject
144 | .spyproject
145 |
146 | # Rope project settings
147 | .ropeproject
148 |
149 | # mkdocs documentation
150 | /site
151 |
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 |
157 | # Pyre type checker
158 | .pyre/
159 |
160 | # pytype static type analyzer
161 | .pytype/
162 |
163 | # Cython debug symbols
164 | cython_debug/
165 |
166 | # PyCharm
167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169 | # and can be added to the global gitignore or merged into this file. For a more nuclear
170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171 | #.idea/
172 | arguana
173 | benchmark_bm25s.py
174 | benchmark_crud.py
175 | climate-fever
176 | fever.tmp/duckdb_temp_block-4611686018432402649.block
177 | fever.tmp/duckdb_temp_block-4611686018432404521.block
178 | fever.tmp/duckdb_temp_block-4611686018432404963.block
179 | fever.tmp/duckdb_temp_storage-4.tmp
180 | metrics.json
181 | metrics_20K.json
182 | metrics_bm25s.json
183 | mmetrics_30K.json
184 | msmarco
185 | nfcorpus
186 | nq
187 | quora
188 | scidocs
189 | scifact
190 | trec-covid
191 | webis-touche2020
192 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 LightOn
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | DIALECT := duckdb
2 |
3 | fix:
4 | sqlfluff fix --dialect $(DIALECT)
5 |
6 | lint:
7 | sqlfluff lint --dialect $(DIALECT)
8 |
9 | tests:
10 | @echo "Removing test.duckdb if it exists..."
11 | rm -rf test.duckdb
12 | rm -rf test.duckdb.wal
13 | pytest ducksearch/tables/create.py --disable-warnings
14 | pytest ducksearch/tables/insert.py --disable-warnings
15 | pytest ducksearch/tables/select.py --disable-warnings
16 | rm -rf test.duckdb
17 | rm -rf test.duckdb.wal
18 | pytest ducksearch/hf/insert.py --disable-warnings
19 | rm -rf test.duckdb
20 | rm -rf test.duckdb.wal
21 | pytest ducksearch/evaluation/evaluation.py --disable-warnings
22 | rm -rf test.duckdb
23 | rm -rf test.duckdb.wal
24 | pytest ducksearch/search/create.py --disable-warnings
25 | pytest ducksearch/search/select.py --disable-warnings
26 | rm -rf test.duckdb
27 | rm -rf test.duckdb.wal
28 | pytest ducksearch/search/graphs.py --disable-warnings
29 | rm -rf test.duckdb
30 | rm -rf test.duckdb.wal
31 |
32 | view:
33 | harlequin test.duckdb
34 |
35 | livedoc:
36 | python docs/parse
37 | mkdocs build --clean
38 | mkdocs serve --dirtyreload
39 |
40 | deploydoc:
41 | mkdocs gh-deploy --force
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # DuckSearch
3 |
4 | Efficient BM25 with DuckDB 🦆
5 |
6 | [logo and badges]
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | DuckSearch is a lightweight, easy-to-use library for searching documents. It is built on top of DuckDB, a high-performance analytical database designed to execute analytical SQL queries fast, and DuckSearch leverages this to provide efficient search and filtering features. The DuckSearch index can be updated with new documents, and documents can be deleted as well. DuckSearch also supports HuggingFace datasets, allowing you to index datasets directly from the HuggingFace Hub.
17 |
18 |
19 | ## Installation
20 |
21 | Install DuckSearch using pip:
22 |
23 | ```bash
24 | pip install ducksearch
25 | ```
26 |
27 | ## Documentation
28 |
29 | The complete documentation is available [here](https://lightonai.github.io/ducksearch/), which includes in-depth guides, examples, and API references.
30 |
31 | ### Upload
32 |
33 | We can upload documents to DuckDB using the `upload.documents` function. The documents are stored in a DuckDB database, and the `fields` are indexed with BM25.
34 |
35 | ```python
36 | from ducksearch import upload
37 |
38 | documents = [
39 | {
40 | "id": 0,
41 | "title": "Hotel California",
42 | "style": "rock",
43 | "date": "1977-02-22",
44 | "popularity": 9,
45 | },
46 | {
47 | "id": 1,
48 | "title": "Here Comes the Sun",
49 | "style": "rock",
50 | "date": "1969-06-10",
51 | "popularity": 10,
52 | },
53 | {
54 | "id": 2,
55 | "title": "Alive",
56 | "style": "electro, punk",
57 | "date": "2007-11-19",
58 | "popularity": 9,
59 | },
60 | ]
61 |
62 | upload.documents(
63 | database="ducksearch.duckdb",
64 | key="id", # Unique document identifier
65 | fields=["title", "style"], # List of fields to use for search.
66 | documents=documents,
67 | dtypes={
68 | "date": "DATE",
69 | "popularity": "INT",
70 | },
71 | )
72 | ```
73 |
74 | ## Search
75 |
76 | `search.documents` returns a list of lists of documents, one list per query, ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents with a date after 1970 and a popularity score greater than 8. We order the results by a weighted sum of the BM25 score and the popularity score provided in the document.
77 |
78 | ```python
79 | from ducksearch import search
80 |
81 | search.documents(
82 | database="ducksearch.duckdb",
83 | queries=["punk", "california"],
84 | top_k=10,
85 | filters="YEAR(date) >= 1970 AND popularity > 8",
86 | order_by="0.8 * score + 0.2 * popularity DESC",
87 | )
88 | ```
89 |
90 | ```python
91 | [
92 | [
93 | {
94 | "id": "2",
95 | "title": "Alive",
96 | "style": "electro, punk",
97 | "date": Timestamp("2007-11-19 00:00:00"),
98 | "popularity": 9,
99 | "score": 0.17841622233390808,
100 | }
101 | ],
102 | [
103 | {
104 | "id": "0",
105 | "title": "Hotel California",
106 | "style": "rock, pop",
107 | "date": Timestamp("1977-02-22 00:00:00"),
108 | "popularity": 9,
109 | "score": 0.156318798661232,
110 | }
111 | ],
112 | ]
113 | ```
114 |
115 | Filters are SQL expressions that are applied to the search results. We can use any filtering function DuckDB provides, such as the [date functions](https://duckdb.org/docs/sql/functions/date).
116 |
117 | Both `filters` and `order_by` parameters are optional. If not provided, the results are ordered by BM25 relevance and no filters are applied.
118 |
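Since both parameters are optional, the simplest call ranks purely by BM25; a minimal sketch reusing the database created above:

```python
from ducksearch import search

# No filters, no order_by: results are ranked by BM25 score only.
search.documents(
    database="ducksearch.duckdb",
    queries="punk",
    top_k=10,
)
```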
119 | ## Delete and update index
120 |
121 | We can delete documents and update the BM25 weights accordingly using the `delete.documents` function.
122 |
123 | ```python
124 | from ducksearch import delete
125 |
126 | delete.documents(
127 | database="ducksearch.duckdb",
128 | ids=[0, 1],
129 | )
130 | ```
131 |
132 | To update documents in the index, first delete them, then upload the updated versions.
133 |
134 | ## Extra features
135 |
136 | ### HuggingFace
137 |
138 | The `upload.documents` function can also index HuggingFace datasets directly from their URL. The following example demonstrates how to index the FineWeb dataset from HuggingFace, using the fields "text" and "url" for search. We also specify the data types of the "date", "token_count", and "language_score" fields so we can filter the results.
139 |
140 | ```python
141 | from ducksearch import upload
142 |
143 | upload.documents(
144 | database="fineweb.duckdb",
145 | key="id",
146 | fields=["text", "url"],
147 | documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
148 | dtypes={
149 | "date": "DATE",
150 | "token_count": "INT",
151 | "language_score": "FLOAT",
152 | },
153 | limit=3000, # demonstrate with a small dataset
154 | )
155 | ```
156 |
157 | We can then search the FineWeb dataset with the `search.documents` function. We order the results by BM25 score and then date.
158 |
159 | ```python
160 | from ducksearch import search
161 |
162 | search.documents(
163 | database="fineweb.duckdb",
164 | queries=["earth science"],
165 | top_k=2,
166 | order_by="score DESC, date DESC",
167 | )
168 | ```
169 |
170 | ```python
171 | [
172 | [
173 | {
174 | "id": "",
175 | "text": "Earth Science Tutors in Rowland...",
176 | "id_1": "",
177 | "dump": "CC-MAIN-2017-34",
178 | "url": "http://rowland.universitytutor.com/rowland_earth-science-tutoring",
179 | "date": Timestamp("2017-08-19 00:00:00"),
180 | "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2017-34/segments/1502886105304.35/warc/CC-MAIN-20170819051034-20170819071034-00240.warc.gz",
181 | "language": "en",
182 | "language_score": 0.8718525171279907,
183 | "token_count": 313,
184 | "bm25id": 523,
185 | "score": 2.3761106729507446,
186 | },
187 | {
188 | "id": "",
189 | "text": "- Geomagnetic field....",
190 | "id_1": "",
191 | "dump": "CC-MAIN-2022-21",
192 | "url": "https://www.imperial.ac.uk/people/adrian.muxworthy/?respub-action=citation.html&id=1149861&noscript=noscript",
193 | "date": Timestamp("2022-05-20 00:00:00"),
194 | "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2022-21/segments/1652662530553.34/warc/CC-MAIN-20220519235259-20220520025259-00601.warc.gz",
195 | "language": "en",
196 | "language_score": 0.8225595951080322,
197 | "token_count": 517,
198 | "bm25id": 4783,
199 | "score": 2.3569871187210083,
200 | },
201 | ]
202 | ]
203 |
204 | ```
205 |
206 | Note: by default, results are ordered by BM25 relevance.
207 |
208 | ## Tables
209 |
210 | DuckSearch creates two distinct schemas: `bm25_tables` and `bm25_documents`.
211 |
212 | - We can find the uploaded documents in the `bm25_tables.documents` table.
213 |
214 | - We can find the inverted index in the `bm25_documents.scores` table. You can update the scores as you wish; just note that the scores of every token mentioned in a set of uploaded documents are recomputed each time you upload documents.
215 |
216 | - We can update the set of stopwords in the `bm25_documents.stopwords` table.
217 |
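Since these are plain DuckDB tables, they can be inspected with any DuckDB client; a minimal sketch, assuming the database created above:

```python
import duckdb

# Open the index database and peek at the stored documents.
with duckdb.connect("ducksearch.duckdb") as connection:
    connection.sql("SELECT * FROM bm25_tables.documents LIMIT 5").show()
```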
218 | ## Benchmark
219 |
220 |
221 | | Dataset | ndcg@10 | hits@1 | hits@10 | mrr@10 | map@10 | r-precision | qps | Indexation Time (s) | Number of Documents and Queries |
222 | |-------------------|-----------|---------|----------|----------|---------|-------------|----------------|---------------------|--------------------------------|
223 | | arguana | 0.3779 | 0.0 | 0.8267 | 0.2491 | 0.2528 | 0.0108 | 117.80 | 1.42 | 1,406 queries, 8.67K documents |
224 | | climate-fever | 0.1184 | 0.1068 | 0.3648 | 0.1644 | 0.0803 | 0.0758 | 5.88 | 302.39 | 1,535 queries, 5.42M documents |
225 | | dbpedia-entity | 0.6046 | 0.7669 | 5.6241 | 0.8311 | 0.0649 | 0.0741 | 113.20 | 181.42 | 400 queries, 4.63M documents |
226 | | fever | 0.3861 | 0.2583 | 0.5826 | 0.3525 | 0.3329 | 0.2497 | 74.40 | 329.70 | 6,666 queries, 5.42M documents |
227 | | fiqa | 0.2445 | 0.2207 | 0.6790 | 0.3002 | 0.1848 | 0.1594 | 545.77 | 6.04 | 648 queries, 57K documents |
228 | | hotpotqa | 0.4487 | 0.5059 | 0.9699 | 0.5846 | 0.3642 | 0.3388 | 48.15 | 163.14 | 7,405 queries, 5.23M documents |
229 | | msmarco | 0.8951 | 1.0 | 8.6279 | 1.0 | 0.0459 | 0.0473 | 35.11 | 202.37 | 6,980 queries, 8.84M documents |
230 | | nfcorpus | 0.3301 | 0.4396 | 2.4087 | 0.5292 | 0.1233 | 0.1383 | 3464.66 | 0.99 | 323 queries, 3.6K documents |
231 | | nq | 0.2451 | 0.1272 | 0.4574 | 0.2099 | 0.1934 | 0.1240 | 150.23 | 71.43 | 3,452 queries, 2.68M documents |
232 | | quora | 0.7705 | 0.6783 | 1.1749 | 0.7606 | 0.7206 | 0.6502 | 741.13 | 3.78 | 10,000 queries, 523K documents |
233 | | scidocs | 0.1025 | 0.1790 | 0.8240 | 0.2754 | 0.0154 | 0.0275 | 879.11 | 4.46 | 1,000 queries, 25K documents |
234 | | scifact | 0.6908 | 0.5533 | 0.9133 | 0.6527 | 0.6416 | 0.5468 | 2153.64 | 1.22 | 300 queries, 5K documents |
235 | | trec-covid | 0.9533 | 1.0 | 9.4800 | 1.0 | 0.0074 | 0.0077 | 112.38 | 22.15 | 50 queries, 171K documents |
236 | | webis-touche2020 | 0.4130 | 0.5510 | 3.7347 | 0.7114 | 0.0564 | 0.0827 | 104.65 | 44.14 | 49 queries, 382K documents |
237 |
238 | ## References
239 |
240 | - [DuckDB](https://duckdb.org/)
241 |
242 | - [DuckDB Full Text Search](https://duckdb.org/docs/extensions/full_text_search.html): Note that DuckSearch relies partially on the DuckDB Full Text Search extension but accelerates the search process via the `top_k_token` approximation, pre-computation of scores, and multi-threading.
243 |
244 | ## License
245 |
246 | DuckSearch is released under the MIT license.
247 |
248 | ## Citation
249 |
250 | ```
251 | @misc{DuckSearch,
252 | title={DuckSearch, efficient search with DuckDB},
253 | author={Sourty, Raphael},
254 | url={https://github.com/lightonai/ducksearch},
255 | year={2024}
256 | }
257 | ```
258 |
--------------------------------------------------------------------------------
/benchmark.py:
--------------------------------------------------------------------------------
1 | from nltk import download
2 | from nltk.corpus import stopwords
3 |
4 | from ducksearch import evaluation, search, upload
5 |
6 | download("stopwords")
7 |
8 | stopword = list(stopwords.words("english"))
9 |
10 | dataset_name = "quora"
11 |
12 | documents, queries, qrels = evaluation.load_beir(
13 | dataset_name=dataset_name,
14 | split="test",
15 | )
16 |
17 | upload.documents(
18 | database=dataset_name,
19 | documents=documents,
20 | key="id",
21 | fields=["title", "text"],
22 | stopwords=stopword,
23 | )
24 |
25 | scores = search.documents(
26 | database=dataset_name,
27 | queries=queries,
28 | top_k=10,
29 | top_k_token=30_000,
30 | batch_size=32,
31 | )
32 |
33 | evaluation_scores = evaluation.evaluate(
34 | scores=scores,
35 | qrels=qrels,
36 | queries=queries,
37 | metrics=["ndcg@10", "hits@1", "hits@10", "mrr@10", "map@10", "r-precision"],
38 | )
39 |
40 | print(evaluation_scores)
41 |
--------------------------------------------------------------------------------
/docs/.pages:
--------------------------------------------------------------------------------
1 | nav:
2 | - documentation
3 | - benchmarks
4 | - api
--------------------------------------------------------------------------------
/docs/CNAME:
--------------------------------------------------------------------------------
1 | lightonai.github.io/ducksearch/
--------------------------------------------------------------------------------
/docs/api/.pages:
--------------------------------------------------------------------------------
1 | title: API reference
2 | arrange:
3 | - overview.md
4 | - ...
5 |
--------------------------------------------------------------------------------
/docs/api/decorators/.pages:
--------------------------------------------------------------------------------
1 | title: decorators
--------------------------------------------------------------------------------
/docs/api/decorators/connect-to-duckdb.md:
--------------------------------------------------------------------------------
1 | # connect_to_duckdb
2 |
3 | Establish a connection to the DuckDB database. Retry connecting if an error occurs.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name or path of the DuckDB database to connect to.
12 |
13 | - **read_only** (*bool*) – defaults to `False`
14 |
15 | Whether to open the database in read-only mode. Default is False.
16 |
17 | - **config** (*dict | None*) – defaults to `None`
18 |
19 | Optional configuration settings for the DuckDB connection.
20 |
21 | - **max_retry** (*int*) – defaults to `20`
22 |
23 | The maximum number of times to retry connecting to DuckDB.
24 |
25 | - **sleep_time** (*float*) – defaults to `0.1`
26 |
27 | The time to sleep between retries.
28 |
29 | - **kwargs**
30 |
31 |
32 |
33 |
34 |
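## Examples

A minimal sketch; the import path is inferred from this module's location in the package and is an assumption, not taken from the source:

```python
>>> from ducksearch.decorators import connect_to_duckdb

>>> connection = connect_to_duckdb(
...     database="test.duckdb",
...     read_only=True,
... )

>>> connection.close()
```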
--------------------------------------------------------------------------------
/docs/api/decorators/execute-with-duckdb.md:
--------------------------------------------------------------------------------
1 | # execute_with_duckdb
2 |
3 | Decorator to execute a SQL query using DuckDB.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **relative_path** (*str | list[str]*)
10 |
11 | A string or list of strings specifying the path(s) to the SQL file(s).
12 |
13 | - **read_only** (*bool*) – defaults to `False`
14 |
15 | Whether the DuckDB connection should be read-only. Default is False.
16 |
17 | - **fields** (*list[str] | None*) – defaults to `None`
18 |
19 | A list of fields to use as keys for the result rows if returning records.
20 |
21 | - **fetch_df** (*bool*) – defaults to `False`
22 |
23 | If True, fetch the result as a pandas DataFrame and return it as a list of dictionaries.
24 |
25 | - **kwargs**
26 |
27 | Additional keyword arguments to be passed to the SQL query, useful for string formatting.
28 |
29 |
30 |
31 |
32 |
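## Examples

A hypothetical sketch of how such a decorator is typically applied; the SQL path and the way `database` and `schema` are forwarded at call time are assumptions, not taken from the source:

```python
>>> from ducksearch.decorators import execute_with_duckdb

>>> @execute_with_duckdb(
...     relative_path="tables/select/documents.sql",  # assumed path to a SQL file
...     fetch_df=True,
... )
... def select_documents():
...     """Run the SQL file and return the rows as a list of dictionaries."""

>>> rows = select_documents(database="test.duckdb", schema="bm25_tables")
```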
--------------------------------------------------------------------------------
/docs/api/evaluation/.pages:
--------------------------------------------------------------------------------
1 | title: evaluation
--------------------------------------------------------------------------------
/docs/api/evaluation/evaluate.md:
--------------------------------------------------------------------------------
1 | # evaluate
2 |
3 | Evaluate the performance of document retrieval using relevance judgments.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **scores** (*list[list[dict]]*)
10 |
11 | A list of lists, where each sublist contains dictionaries representing the retrieved documents for a query.
12 |
13 | - **qrels** (*dict*)
14 |
15 | A dictionary mapping queries to relevant documents and their relevance scores.
16 |
17 | - **queries** (*list[str]*)
18 |
19 | A list of queries.
20 |
21 | - **metrics** (*list*) – defaults to `[]`
22 |
23 | A list of metrics to compute. Default includes "ndcg@10" and hits at various levels (e.g., hits@1, hits@10).
24 |
25 |
26 |
27 | ## Examples
28 |
29 | ```python
30 | >>> from ducksearch import evaluation, upload, search
31 |
32 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test")
33 |
34 | >>> upload.documents(
35 | ... database="test.duckdb",
36 | ... key="id",
37 | ... fields=["title", "text"],
38 | ... documents=documents,
39 | ... )
40 | | Table | Size |
41 | |----------------|------|
42 | | documents | 5183 |
43 | | bm25_documents | 5183 |
44 |
45 | >>> scores = search.documents(
46 | ... database="test.duckdb",
47 | ... queries=queries,
48 | ... top_k=10,
49 | ... )
50 | ```
51 |
52 |
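The example stops before the call this page documents; a natural completion, mirroring the usage in `benchmark.py` at the repository root:

```python
>>> evaluation_scores = evaluation.evaluate(
...     scores=scores,
...     qrels=qrels,
...     queries=queries,
...     metrics=["ndcg@10", "hits@1", "hits@10", "mrr@10", "map@10", "r-precision"],
... )
```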
--------------------------------------------------------------------------------
/docs/api/evaluation/load-beir.md:
--------------------------------------------------------------------------------
1 | # load_beir
2 |
3 | Load BEIR dataset for document and query retrieval tasks.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **dataset_name** (*str*)
10 |
11 | The name of the dataset to load (e.g., 'scifact').
12 |
13 | - **split** (*str*) – defaults to `test`
14 |
15 | The dataset split to load (e.g., 'test').
16 |
17 |
18 |
19 | ## Examples
20 |
21 | ```python
22 | >>> documents, queries, qrels = load_beir("scifact", split="test")
23 |
24 | >>> len(documents)
25 | 5183
26 |
27 | >>> len(queries)
28 | 300
29 | ```
30 |
31 |
--------------------------------------------------------------------------------
/docs/api/hf/.pages:
--------------------------------------------------------------------------------
1 | title: hf
--------------------------------------------------------------------------------
/docs/api/hf/insert-documents.md:
--------------------------------------------------------------------------------
1 | # insert_documents
2 |
3 | Insert documents from a Hugging Face dataset into DuckDB.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The schema in which the documents table is located.
16 |
17 | - **key** (*str*)
18 |
19 | The key field that uniquely identifies each document (e.g., 'query_id').
20 |
21 | - **url** (*str*)
22 |
23 | The URL of the Hugging Face dataset in Parquet format.
24 |
25 | - **config** (*dict | None*) – defaults to `None`
26 |
27 | Optional configuration options for the DuckDB connection.
28 |
29 | - **limit** (*int | None*) – defaults to `None`
30 |
31 | - **dtypes** (*dict | None*) – defaults to `None`
32 |
33 |
34 |
35 | ## Examples
36 |
37 | ```python
38 | >>> from ducksearch import upload
39 |
40 | >>> upload.documents(
41 | ... database="test.duckdb",
42 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/queries.parquet",
43 | ... key="query_id",
44 | ... fields=["query_id", "text"],
45 | ... )
46 | | Table | Size |
47 | |----------------|------|
48 | | documents | 19 |
49 | | bm25_documents | 19 |
50 |
51 | >>> upload.documents(
52 | ... database="test.duckdb",
53 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/documents.parquet",
54 | ... key="document_id",
55 | ... fields=["document_id", "text"],
56 | ... )
57 | | Table | Size |
58 | |----------------|------|
59 | | documents | 51 |
60 | | bm25_documents | 51 |
61 | ```
62 |
63 |
--------------------------------------------------------------------------------
/docs/api/overview.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | ## decorators
4 |
5 | - [connect_to_duckdb](../decorators/connect-to-duckdb)
6 | - [execute_with_duckdb](../decorators/execute-with-duckdb)
7 |
8 | ## evaluation
9 |
10 | - [evaluate](../evaluation/evaluate)
11 | - [load_beir](../evaluation/load-beir)
12 |
13 | ## hf
14 |
15 | - [insert_documents](../hf/insert-documents)
16 |
17 | ## search
18 |
19 | - [documents](../search/documents)
20 | - [graphs](../search/graphs)
21 | - [queries](../search/queries)
22 | - [search](../search/search)
23 | - [update_index_documents](../search/update-index-documents)
24 | - [update_index_queries](../search/update-index-queries)
25 |
26 | ## tables
27 |
28 | - [add_columns_documents](../tables/add-columns-documents)
29 | - [create_documents](../tables/create-documents)
30 | - [create_documents_queries](../tables/create-documents-queries)
31 | - [create_queries](../tables/create-queries)
32 | - [create_schema](../tables/create-schema)
33 | - [insert_documents](../tables/insert-documents)
34 | - [insert_documents_queries](../tables/insert-documents-queries)
35 | - [insert_queries](../tables/insert-queries)
36 | - [select_documents](../tables/select-documents)
37 | - [select_documents_columns](../tables/select-documents-columns)
38 | - [select_queries](../tables/select-queries)
39 |
40 | ## upload
41 |
42 | - [documents](../upload/documents)
43 | - [queries](../upload/queries)
44 |
45 | ## utils
46 |
47 |
48 | **Classes**
49 |
50 | - [ParallelTqdm](../utils/ParallelTqdm)
51 |
52 | **Functions**
53 |
54 | - [batchify](../utils/batchify)
55 | - [generate_random_hash](../utils/generate-random-hash)
56 | - [get_list_columns_df](../utils/get-list-columns-df)
57 | - [plot](../utils/plot)
58 |
59 |
--------------------------------------------------------------------------------
/docs/api/search/.pages:
--------------------------------------------------------------------------------
1 | title: search
--------------------------------------------------------------------------------
/docs/api/search/documents.md:
--------------------------------------------------------------------------------
1 | # documents
2 |
3 | Search for documents in the documents table using specified queries.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **queries** (*str | list[str]*)
14 |
15 | A string or list of query strings to search for.
16 |
17 | - **batch_size** (*int*) – defaults to `32`
18 |
19 | The batch size for query processing.
20 |
21 | - **top_k** (*int*) – defaults to `10`
22 |
23 | The number of top documents to retrieve for each query.
24 |
25 | - **top_k_token** (*int*) – defaults to `30000`
26 |
27 | The number of documents to score per token.
28 |
29 | - **n_jobs** (*int*) – defaults to `-1`
30 |
31 | The number of parallel jobs to use. Defaults to all available processors.
32 |
33 | - **config** (*dict | None*) – defaults to `None`
34 |
35 | Optional configuration for DuckDB connection settings.
36 |
37 | - **filters** (*str | None*) – defaults to `None`
38 |
39 | Optional SQL filters to apply during the search.
40 |
41 | - **order_by** (*str | None*) – defaults to `None`
42 |
43 | - **tqdm_bar** (*bool*) – defaults to `True`
44 |
45 | Whether to display a progress bar when searching.
46 |
47 |
48 |
49 | ## Examples
50 |
51 | ```python
52 | >>> from ducksearch import evaluation, upload, search
53 |
54 | >>> documents, queries, qrels = evaluation.load_beir(
55 | ... "scifact",
56 | ... split="test",
57 | ... )
58 |
59 | >>> scores = search.documents(
60 | ... database="test.duckdb",
61 | ... queries=queries,
62 | ... top_k_token=1000,
63 | ... )
64 | ```
65 |
66 |
--------------------------------------------------------------------------------
/docs/api/search/graphs.md:
--------------------------------------------------------------------------------
1 | # graphs
2 |
3 | Search for graphs in DuckDB using the provided queries.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **queries** (*str | list[str]*)
14 |
15 | A string or list of query strings to search for.
16 |
17 | - **batch_size** (*int*) – defaults to `30`
18 |
19 | The batch size for processing queries.
20 |
21 | - **top_k** (*int*) – defaults to `1000`
22 |
23 | The number of top documents to retrieve for each query.
24 |
25 | - **top_k_token** (*int*) – defaults to `30000`
26 |
27 | The number of top tokens to retrieve.
28 |
29 | - **n_jobs** (*int*) – defaults to `-1`
30 |
31 | The number of parallel jobs to use. Defaults to all available processors.
32 |
33 | - **config** (*dict | None*) – defaults to `None`
34 |
35 | Optional configuration settings for the DuckDB connection.
36 |
37 | - **filters** (*str | None*) – defaults to `None`
38 |
39 | Optional SQL filters to apply during the search.
40 |
41 | - **tqdm_bar** (*bool*) – defaults to `True`
42 |
43 |
44 |
45 | ## Examples
46 |
47 | ```python
48 | >>> from ducksearch import evaluation, upload, search
49 |
50 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="train")
51 |
52 | >>> upload.documents(
53 | ... database="test.duckdb",
54 | ... key="id",
55 | ... fields=["title", "text"],
56 | ... documents=documents,
57 | ... )
58 | | Table | Size |
59 | |----------------|------|
60 | | documents | 5183 |
61 | | bm25_documents | 5183 |
62 |
63 | >>> upload.queries(
64 | ... database="test.duckdb",
65 | ... queries=queries,
66 | ... documents_queries=qrels,
67 | ... )
68 | | Table | Size |
69 | |-------------------|------|
70 | | documents | 5183 |
71 | | queries | 807 |
72 | | bm25_documents | 5183 |
73 | | bm25_queries | 807 |
74 | | documents_queries | 916 |
75 | ```
76 |
77 |
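The example uploads documents and queries but never calls the function this page documents; a sketched completion, using only the parameters listed above:

```python
>>> scores = search.graphs(
...     database="test.duckdb",
...     queries=queries,
...     top_k=10,
... )
```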
--------------------------------------------------------------------------------
/docs/api/search/queries.md:
--------------------------------------------------------------------------------
1 | # queries
2 |
3 | Search for matching queries in the queries table using the specified query strings.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **queries** (*str | list[str]*)
14 |
15 | A string or list of query strings to search for.
16 |
17 | - **batch_size** (*int*) – defaults to `32`
18 |
19 | The batch size for query processing.
20 |
21 | - **top_k** (*int*) – defaults to `10`
22 |
23 | The number of top matching queries to retrieve.
24 |
25 | - **top_k_token** (*int*) – defaults to `30000`
26 |
27 | The number of documents to score per token.
28 |
29 | - **n_jobs** (*int*) – defaults to `-1`
30 |
31 | The number of parallel jobs to use. Defaults to all available processors.
32 |
33 | - **config** (*dict | None*) – defaults to `None`
34 |
35 | Optional configuration for DuckDB connection settings.
36 |
37 | - **filters** (*str | None*) – defaults to `None`
38 |
39 | Optional SQL filters to apply during the search.
40 |
41 | - **tqdm_bar** (*bool*) – defaults to `True`
42 |
43 |
44 |
45 | ## Examples
46 |
47 | ```python
48 | >>> from ducksearch import evaluation, upload, search
49 |
50 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test")
51 |
52 | >>> scores = search.queries(database="test.duckdb", queries=queries)
53 |
54 | >>> n = sum(1 for sample, query in zip(scores, queries) if sample[0]["query"] == query)
55 | >>> assert n >= 290
56 | ```
57 |
58 |
--------------------------------------------------------------------------------
/docs/api/search/search.md:
--------------------------------------------------------------------------------
1 | # search
2 |
3 | Run the search for documents or queries in parallel.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The name of the schema containing the indexed documents or queries.
16 |
17 | - **source_schema** (*str*)
18 |
19 | The name of the schema containing the original documents or queries.
20 |
21 | - **source** (*str*)
22 |
23 | The table to search (either 'documents' or 'queries').
24 |
25 | - **queries** (*str | list[str]*)
26 |
27 | A string or list of query strings to search for.
28 |
29 | - **batch_size** (*int*) – defaults to `64`
30 |
31 | The batch size for query processing.
32 |
33 | - **top_k** (*int*) – defaults to `10`
34 |
35 | The number of top results to retrieve for each query.
36 |
37 | - **top_k_token** (*int*) – defaults to `30000`
38 |
39 | The number of documents to score per token.
40 |
41 | - **n_jobs** (*int*) – defaults to `-1`
42 |
43 | The number of parallel jobs to use. Defaults to all available processors.
44 |
45 | - **config** (*dict | None*) – defaults to `None`
46 |
47 | Optional configuration for DuckDB connection settings.
48 |
49 | - **filters** (*str | None*) – defaults to `None`
50 |
51 | Optional SQL filters to apply during the search.
52 |
53 | - **order_by** (*str | None*) – defaults to `None`
54 |
55 | - **tqdm_bar** (*bool*) – defaults to `True`
56 |
57 | Whether to display a progress bar when searching.
58 |
59 |
60 |
61 | ## Examples
62 |
63 | ```python
64 | >>> from ducksearch import search
65 |
66 | >>> documents = search.search(
67 | ... database="test.duckdb",
68 | ... source_schema="bm25_tables",
69 | ... schema="bm25_documents",
70 | ... source="documents",
71 | ... queries="random query",
72 | ... top_k_token=10_000,
73 | ... top_k=10,
74 | ... )
75 |
76 | >>> assert len(documents) == 10
77 | ```
78 |
79 |
--------------------------------------------------------------------------------
/docs/api/search/update-index-documents.md:
--------------------------------------------------------------------------------
1 | # update_index_documents
2 |
3 | Update the BM25 search index for documents.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **fields** (*list[str]*)
14 |
15 | The fields to index for each document.
16 |
17 | - **k1** (*float*) – defaults to `1.5`
18 |
19 | The BM25 k1 parameter, controls term saturation.
20 |
21 | - **b** (*float*) – defaults to `0.75`
22 |
23 | The BM25 b parameter, controls document length normalization.
24 |
25 | - **stemmer** (*str*) – defaults to `porter`
26 |
27 | The stemming algorithm to use (e.g., 'porter').
28 |
29 | - **stopwords** (*str | list[str]*) – defaults to `None`
30 |
31 | The list of stopwords to exclude from indexing. Can be a list or a string specifying the language (e.g., "english").
32 |
33 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+`
34 |
35 | A regex pattern to ignore characters during tokenization. Default ignores punctuation and non-alphabetic characters.
36 |
37 | - **strip_accents** (*bool*) – defaults to `True`
38 |
39 | Whether to remove accents from characters during indexing.
40 |
41 | - **lower** (*bool*) – defaults to `True`
42 |
43 | - **batch_size** (*int*) – defaults to `10000`
44 |
45 | The number of documents to process per batch.
46 |
47 | - **config** (*dict | None*) – defaults to `None`
48 |
49 | Optional configuration settings for the DuckDB connection.
50 |
51 |
52 |
53 | ## Examples
54 |
55 | ```python
56 | >>> from ducksearch import evaluation, upload, search
57 |
58 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test")
59 |
60 | >>> upload.documents(
61 | ... database="test.duckdb",
62 | ... key="id",
63 | ... fields=["title", "text"],
64 | ... documents=documents,
65 | ... stopwords=["larva"],
66 | ... )
67 | | Table | Size |
68 | |----------------|------|
69 | | documents | 5183 |
70 | | bm25_documents | 5183 |
71 | ```
72 |
73 |
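The example builds the index through `upload.documents`; a direct call would look like the following sketch, using only the parameters documented above:

```python
>>> search.update_index_documents(
...     database="test.duckdb",
...     fields=["title", "text"],
...     stopwords=["larva"],
... )
```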
--------------------------------------------------------------------------------
/docs/api/search/update-index-queries.md:
--------------------------------------------------------------------------------
1 | # update_index_queries
2 |
3 | Update the BM25 search index for queries.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **k1** (*float*) – defaults to `1.5`
14 |
15 | The BM25 k1 parameter, controls term saturation.
16 |
17 | - **b** (*float*) – defaults to `0.75`
18 |
19 | The BM25 b parameter, controls document length normalization.
20 |
21 | - **stemmer** (*str*) – defaults to `porter`
22 |
23 | The stemming algorithm to use (e.g., 'porter').
24 |
25 | - **stopwords** (*str | list[str]*) – defaults to `None`
26 |
27 | The list of stopwords to exclude from indexing. Can be a list or a string specifying the language (e.g., "english").
28 |
29 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+`
30 |
31 | A regex pattern to ignore characters during tokenization. Default ignores punctuation and non-alphabetic characters.
32 |
33 | - **strip_accents** (*bool*) – defaults to `True`
34 |
35 | Whether to remove accents from characters during indexing.
36 |
37 | - **lower** (*bool*) – defaults to `True`
38 |
39 | - **batch_size** (*int*) – defaults to `10000`
40 |
41 | The number of queries to process per batch.
42 |
43 | - **config** (*dict | None*) – defaults to `None`
44 |
45 | Optional configuration settings for the DuckDB connection.
46 |
47 |
48 |
49 | ## Examples
50 |
51 | ```python
52 | >>> from ducksearch import evaluation, upload, search
53 |
54 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test")
55 |
56 | >>> upload.queries(
57 | ... database="test.duckdb",
58 | ... queries=queries,
59 | ... documents_queries=qrels,
60 | ... )
61 | | Table | Size |
62 | |-------------------|------|
63 | | documents | 5183 |
64 | | queries | 300 |
65 | | bm25_documents | 5183 |
66 | | bm25_queries | 300 |
67 | | documents_queries | 339 |
68 | ```
69 |
70 |
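As above, the example goes through `upload.queries`; a direct call, sketched with only the documented parameters (the remaining ones keep their defaults):

```python
>>> search.update_index_queries(
...     database="test.duckdb",
...     stopwords="english",
... )
```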
--------------------------------------------------------------------------------
/docs/api/tables/.pages:
--------------------------------------------------------------------------------
1 | title: tables
--------------------------------------------------------------------------------
/docs/api/tables/add-columns-documents.md:
--------------------------------------------------------------------------------
1 | # add_columns_documents
2 |
3 | Add columns to the documents table in the DuckDB database.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | - **schema** (*str*)
12 |
13 | - **columns** (*list[str] | str*)
14 |
15 | - **dtypes** (*dict*) – defaults to `None`
16 |
17 | - **config** (*dict*) – defaults to `None`
18 |
19 |
20 |
21 |
22 |
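## Examples

A minimal sketch using only the parameters listed above; the `metadata` column name is illustrative, not taken from the source:

```python
>>> from ducksearch import tables

>>> tables.add_columns_documents(
...     database="test.duckdb",
...     schema="bm25_tables",
...     columns=["metadata"],
...     dtypes={"metadata": "VARCHAR"},
... )
```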
--------------------------------------------------------------------------------
/docs/api/tables/create-documents-queries.md:
--------------------------------------------------------------------------------
1 | # create_documents_queries
2 |
3 | Create the documents_queries table in the DuckDB database.
4 |
5 |
6 |
7 |
8 |
9 | ## Examples
10 |
11 | ```python
12 | >>> from ducksearch import tables
13 |
14 | >>> tables.create_schema(
15 | ... database="test.duckdb",
16 | ... schema="bm25_tables"
17 | ... )
18 |
19 | >>> tables.create_documents_queries(
20 | ... database="test.duckdb",
21 | ... schema="bm25_tables",
22 | ... )
23 | ```
24 |
25 |
--------------------------------------------------------------------------------
/docs/api/tables/create-documents.md:
--------------------------------------------------------------------------------
1 | # create_documents
2 |
3 | Create the documents table in the DuckDB database.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | - **schema** (*str*)
12 |
13 | - **columns** (*str | list[str]*)
14 |
15 | - **dtypes** (*dict[str, str] | None*) – defaults to `None`
16 |
17 | - **config** (*dict | None*) – defaults to `None`
18 |
19 |
20 |
21 | ## Examples
22 |
23 | ```python
24 | >>> from ducksearch import tables
25 |
26 | >>> tables.create_schema(
27 | ... database="test.duckdb",
28 | ... schema="bm25_tables"
29 | ... )
30 |
31 | >>> tables.create_documents(
32 | ... database="test.duckdb",
33 | ... schema="bm25_tables",
34 | ... columns=["title", "text"],
35 | ... dtypes={"text": "VARCHAR", "title": "VARCHAR"},
36 | ... )
37 |
38 | >>> df = [
39 | ... {"id": 1, "title": "title document 1", "text": "text document 1"},
40 | ... {"id": 2, "title": "title document 2", "text": "text document 2"},
41 | ... {"id": 3, "title": "title document 3", "text": "text document 3"},
42 | ... ]
43 |
44 | >>> tables.insert_documents(
45 | ... database="test.duckdb",
46 | ... schema="bm25_tables",
47 | ... key="id",
48 | ... df=df,
49 | ... columns=["title", "text"],
50 | ... )
51 | ```
52 |
53 |
--------------------------------------------------------------------------------
/docs/api/tables/create-queries.md:
--------------------------------------------------------------------------------
1 | # create_queries
2 |
3 | Create the queries table in the DuckDB database.
4 |
5 |
6 |
7 |
8 |
9 | ## Examples
10 |
11 | ```python
12 | >>> from ducksearch import tables
13 |
14 | >>> tables.create_schema(
15 | ... database="test.duckdb",
16 | ... schema="bm25_tables"
17 | ... )
18 |
19 | >>> tables.create_queries(
20 | ... database="test.duckdb",
21 | ... schema="bm25_tables",
22 | ... )
23 | ```
24 |
25 |
--------------------------------------------------------------------------------
/docs/api/tables/create-schema.md:
--------------------------------------------------------------------------------
1 | # create_schema
2 |
3 | Create the specified schema in the DuckDB database.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | - **schema** (*str*)
12 |
13 | - **config** (*dict | None*) – defaults to `None`
14 |
15 |
16 |
17 | ## Examples
18 |
19 | ```python
20 | >>> from ducksearch import tables
21 |
22 | >>> tables.create_schema(
23 | ... database="test.duckdb",
24 | ... schema="bm25_tables",
25 | ... )
26 | ```
27 |
28 |
--------------------------------------------------------------------------------
/docs/api/tables/insert-documents-queries.md:
--------------------------------------------------------------------------------
1 | # insert_documents_queries
2 |
3 | Insert interactions between documents and queries into the documents_queries table.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The schema in which the documents_queries table is located.
16 |
17 | - **documents_queries** (*dict[dict[str, float]]*)
18 |
19 | A dictionary mapping document IDs to queries and their corresponding scores.
20 |
21 | - **config** (*dict | None*) – defaults to `None`
22 |
23 | Optional configuration options for the DuckDB connection.
24 |
25 |
26 |
27 | ## Examples
28 |
29 | ```python
30 | >>> from ducksearch import tables
31 |
32 | >>> documents_queries = {
33 | ... "1": {"query 1": 0.9, "query 2": 0.8},
34 | ... "2": {"query 2": 0.9, "query 3": 3},
35 | ... "3": {"query 1": 0.9, "query 3": 0.5},
36 | ... }
37 |
38 | >>> tables.insert_documents_queries(
39 | ... database="test.duckdb",
40 | ... schema="bm25_tables",
41 | ... documents_queries=documents_queries
42 | ... )
43 | ```
44 |
45 |
--------------------------------------------------------------------------------
/docs/api/tables/insert-documents.md:
--------------------------------------------------------------------------------
1 | # insert_documents
2 |
3 | Insert documents into the documents table with optional multi-threading.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The schema in which the documents table is located.
16 |
17 | - **df** (*list[dict] | str*)
18 |
19 | The list of document dictionaries or a string (URL) for a Hugging Face dataset to insert.
20 |
21 | - **key** (*str*)
22 |
23 | The field that uniquely identifies each document (e.g., 'id').
24 |
25 | - **columns** (*list[str] | str*)
26 |
27 | The list of document fields to insert. Can be a string if inserting a single field.
28 |
29 | - **dtypes** (*dict[str, str] | None*) – defaults to `None`
30 |
31 | Optional dictionary specifying the DuckDB type for each field. Defaults to 'VARCHAR' for all unspecified fields.
32 |
33 | - **batch_size** (*int*) – defaults to `30000`
34 |
35 | The number of documents to insert in each batch.
36 |
37 | - **n_jobs** (*int*) – defaults to `-1`
38 |
39 | Number of parallel jobs to use for inserting documents. Defaults to all available processors.
40 |
41 | - **config** (*dict | None*) – defaults to `None`
42 |
43 | Optional configuration options for the DuckDB connection.
44 |
45 | - **limit** (*int | None*) – defaults to `None`
46 |
47 |
48 |
49 | ## Examples
50 |
51 | ```python
52 | >>> from ducksearch import tables
53 |
54 | >>> df = [
55 | ... {"id": 1, "title": "title document 1", "text": "text document 1"},
56 | ... {"id": 2, "title": "title document 2", "text": "text document 2"},
57 | ... {"id": 3, "title": "title document 3", "text": "text document 3"},
58 | ... ]
59 |
60 | >>> _ = tables.insert_documents(
61 | ... database="test.duckdb",
62 | ... schema="bm25_tables",
63 | ... key="id",
64 | ... columns=["title", "text"],
65 | ... df=df
66 | ... )
67 | ```
68 |
69 |
--------------------------------------------------------------------------------
/docs/api/tables/insert-queries.md:
--------------------------------------------------------------------------------
1 | # insert_queries
2 |
3 | Insert a list of queries into the queries table.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The schema in which the queries table is located.
16 |
17 | - **queries** (*list[str]*)
18 |
19 | A list of query strings to insert into the table.
20 |
21 | - **config** (*dict | None*) – defaults to `None`
22 |
23 | Optional configuration options for the DuckDB connection.
24 |
25 |
26 |
27 | ## Examples
28 |
29 | ```python
30 | >>> from ducksearch import tables
31 |
32 | >>> _ = tables.insert_queries(
33 | ... database="test.duckdb",
34 | ... schema="bm25_tables",
35 | ... queries=["query 1", "query 2", "query 3"],
36 | ... )
37 | ```
38 |
39 |
--------------------------------------------------------------------------------
/docs/api/tables/select-documents-columns.md:
--------------------------------------------------------------------------------
1 | # select_documents_columns
2 |
3 | Select the column names from the documents table, excluding the 'bm25id' column.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The schema where the documents table is located.
16 |
17 | - **config** (*dict | None*) – defaults to `None`
18 |
19 | Optional configuration options for the DuckDB connection.
20 |
21 |
22 |
23 | ## Examples
24 |
25 | ```python
26 | >>> from ducksearch import tables
27 |
28 | >>> tables.select_documents_columns(
29 | ... database="test.duckdb",
30 | ... schema="bm25_tables",
31 | ... )
32 | ['id', 'title', 'text']
33 | ```
34 |
35 |
--------------------------------------------------------------------------------
/docs/api/tables/select-documents.md:
--------------------------------------------------------------------------------
1 | # select_documents
2 |
3 | Select all documents from the documents table.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **schema** (*str*)
14 |
15 | The schema where the documents table is located.
16 |
17 | - **limit** (*int | None*) – defaults to `None`
18 |
19 | - **config** (*dict | None*) – defaults to `None`
20 |
21 | Optional configuration options for the DuckDB connection.
22 |
23 |
24 |
25 | ## Examples
26 |
27 | ```python
28 | >>> from ducksearch import tables
29 |
30 | >>> documents = tables.select_documents(
31 | ... database="test.duckdb",
32 | ... schema="bm25_tables",
33 | ... )
34 |
35 | >>> assert len(documents) == 3
36 | ```
37 |
38 |
--------------------------------------------------------------------------------
/docs/api/tables/select-queries.md:
--------------------------------------------------------------------------------
1 | # select_queries
2 |
3 | Select all queries from the queries table.
4 |
5 |
6 |
7 |
8 |
9 | ## Examples
10 |
11 | ```python
12 | >>> from ducksearch import tables
13 |
14 | >>> queries = tables.select_queries(
15 | ... database="test.duckdb",
16 | ... schema="bm25_tables",
17 | ... )
18 |
19 | >>> assert len(queries) == 3
20 | ```
21 |
22 |
--------------------------------------------------------------------------------
/docs/api/upload/.pages:
--------------------------------------------------------------------------------
1 | title: upload
--------------------------------------------------------------------------------
/docs/api/upload/documents.md:
--------------------------------------------------------------------------------
1 | # documents
2 |
3 | Upload documents to DuckDB, create necessary schema, and index using BM25.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | Name of the DuckDB database.
12 |
13 | - **key** (*str*)
14 |
15 | Key identifier for the documents. The key will be renamed to `id` in the database.
16 |
17 | - **fields** (*str | list[str]*)
18 |
19 | List of fields to upload from each document. If a single field is provided as a string, it will be converted to a list.
20 |
21 | - **documents** (*list[dict] | str*)
22 |
23 | Documents to upload. Can be a list of dictionaries or a Hugging Face (HF) URL string pointing to a dataset.
24 |
25 | - **k1** (*float*) – defaults to `1.5`
26 |
27 | BM25 k1 parameter, controls term saturation.
28 |
29 | - **b** (*float*) – defaults to `0.75`
30 |
31 | BM25 b parameter, controls document length normalization.
32 |
33 | - **stemmer** (*str*) – defaults to `porter`
34 |
35 | The stemming algorithm to use. One of 'arabic', 'basque', 'catalan', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'lithuanian', 'nepali', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'serbian', 'spanish', 'swedish', 'tamil', 'turkish', or 'none' if no stemming is to be used.
36 |
37 | - **stopwords** (*str | list[str]*) – defaults to `None`
38 |
39 | List of stopwords to exclude from indexing. Can be a custom list or a language string.
40 |
41 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+`
42 |
43 | Regular expression pattern of characters to ignore when indexing. The default ignores punctuation and non-alphabetic characters.
44 |
45 | - **strip_accents** (*bool*) – defaults to `True`
46 |
47 | Whether to remove accents from characters during indexing.
48 |
49 | - **lower** (*bool*) – defaults to `True`
50 |
51 | - **batch_size** (*int*) – defaults to `30000`
52 |
53 | Number of documents to process per batch.
54 |
55 | - **n_jobs** (*int*) – defaults to `-1`
56 |
57 | Number of parallel jobs to use for uploading documents. The default uses all available processors.
58 |
59 | - **dtypes** (*dict[str, str] | None*) – defaults to `None`
60 |
61 | - **config** (*dict | None*) – defaults to `None`
62 |
63 | Optional configuration dictionary for the DuckDB connection and other settings.
64 |
65 | - **limit** (*int | None*) – defaults to `None`
66 |
67 | - **tqdm_bar** (*bool*) – defaults to `True`
68 |
69 | Whether to display a progress bar when uploading documents.
70 |
71 |
72 |
73 |
74 |
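75 | ## Examples
76 |
77 | A minimal sketch mirroring the documentation examples; the documents and database name are illustrative. Once indexing completes, the function prints a summary table of the affected tables.
78 |
79 | ```python
80 | from ducksearch import upload
81 |
82 | documents = [
83 |     {"id": 0, "title": "Hotel California", "style": "rock"},
84 |     {"id": 1, "title": "Here Comes the Sun", "style": "rock"},
85 | ]
86 |
87 | upload.documents(
88 |     database="test.duckdb",
89 |     key="id",  # unique document identifier
90 |     fields=["title", "style"],  # fields to index with BM25
91 |     documents=documents,
92 | )
93 | ```
94 |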
--------------------------------------------------------------------------------
/docs/api/upload/queries.md:
--------------------------------------------------------------------------------
1 | # queries
2 |
3 | Upload queries to DuckDB, map documents to queries, and index using BM25.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | Name of the DuckDB database.
12 |
13 | - **queries** (*list[str] | None*) – defaults to `None`
14 |
15 | List of queries to upload. Each query is a string.
16 |
17 | - **documents_queries** (*dict[list]*) – defaults to `None`
18 |
19 | Dictionary mapping document IDs to a list of queries.
20 |
21 | - **k1** (*float*) – defaults to `1.5`
22 |
23 | BM25 k1 parameter, controls term saturation.
24 |
25 | - **b** (*float*) – defaults to `0.75`
26 |
27 | BM25 b parameter, controls document length normalization.
28 |
29 | - **stemmer** (*str*) – defaults to `porter`
30 |
31 | Stemming algorithm to use. One of 'arabic', 'basque', 'catalan', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'lithuanian', 'nepali', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'serbian', 'spanish', 'swedish', 'tamil', 'turkish', or 'none' if no stemming is to be used. Defaults to 'porter'.
32 |
33 | - **stopwords** (*str | list[str]*) – defaults to `None`
34 |
35 | List of stopwords to exclude from indexing. Can be a custom list or a language string.
36 |
37 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+`
38 |
39 | Regular expression pattern of characters to ignore when indexing. The default ignores punctuation and non-alphabetic characters.
40 |
41 | - **strip_accents** (*bool*) – defaults to `True`
42 |
43 | Whether to remove accents from characters during indexing.
44 |
45 | - **lower** (*bool*) – defaults to `True`
46 |
47 | - **batch_size** (*int*) – defaults to `30000`
48 |
49 | Number of queries to process per batch.
50 |
51 | - **config** (*dict | None*) – defaults to `None`
52 |
53 | Optional configuration dictionary for the DuckDB connection and other settings.
54 |
55 |
56 |
57 |
58 |
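59 | ## Examples
60 |
61 | A minimal sketch mirroring the documentation; the ids and queries are illustrative and assume the corresponding documents were uploaded beforehand with `upload.documents`.
62 |
63 | ```python
64 | from ducksearch import upload
65 |
66 | # Map existing document ids to the queries they are relevant to.
67 | upload.queries(
68 |     database="test.duckdb",
69 |     documents_queries={
70 |         0: ["rock band", "california"],
71 |         1: ["the beatles"],
72 |     },
73 | )
74 | ```
75 |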
--------------------------------------------------------------------------------
/docs/api/utils/.pages:
--------------------------------------------------------------------------------
1 | title: utils
--------------------------------------------------------------------------------
/docs/api/utils/ParallelTqdm.md:
--------------------------------------------------------------------------------
1 | # ParallelTqdm
2 |
3 | joblib.Parallel, but with a tqdm progress bar.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **total** (*int*)
10 |
11 | The total number of tasks to complete.
12 |
13 | - **desc** (*str*)
14 |
15 | A description of the task.
16 |
17 | - **tqdm_bar** (*bool*) – defaults to `True`
18 |
19 | Whether to display a tqdm progress bar. Default is True.
20 |
21 | - **show_joblib_header** (*bool*) – defaults to `False`
22 |
23 | Whether to display the joblib header. Default is False.
24 |
25 | - **kwargs**
26 |
27 |
28 |
29 |
30 | ## Methods
31 |
32 | ???- note "__call__"
33 |
34 | Main function to dispatch parallel tasks.
35 |
36 | **Parameters**
37 |
38 | - **iterable**
39 |
40 | ???- note "debug"
41 |
42 | ???- note "dispatch_next"
43 |
44 | Dispatch more data for parallel processing
45 |
46 | This method is meant to be called concurrently by the multiprocessing callback. We rely on the thread-safety of dispatch_one_batch to protect against concurrent consumption of the unprotected iterator.
47 |
48 |
49 | ???- note "dispatch_one_batch"
50 |
51 | Prefetch the tasks for the next batch and dispatch them.
52 |
53 | The effective size of the batch is computed here. If there are no more jobs to dispatch, return False, else return True. The iterator consumption and dispatching is protected by the same lock so calling this function should be thread safe.
54 |
55 | **Parameters**
56 |
57 | - **iterator**
58 |
59 | ???- note "format"
60 |
61 | Return the formatted representation of the object.
62 |
63 | **Parameters**
64 |
65 | - **obj**
66 | - **indent** – defaults to `0`
67 |
68 | ???- note "info"
69 |
70 | ???- note "print_progress"
71 |
72 | Display the progress of the parallel execution using tqdm.
73 |
74 |
75 | ???- note "warn"
76 |
77 | ## References
78 |
79 | https://github.com/joblib/joblib/issues/972
80 |
81 |
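82 | ## Examples
83 |
84 | A minimal sketch, assuming `ParallelTqdm` is importable from `ducksearch.utils` and that extra keyword arguments (e.g. `n_jobs`) are forwarded to `joblib.Parallel`; the task is illustrative.
85 |
86 | ```python
87 | >>> from joblib import delayed
88 |
89 | >>> from ducksearch.utils import ParallelTqdm
90 |
91 | >>> results = ParallelTqdm(total=4, desc="abs", n_jobs=2)(
92 | ...     delayed(abs)(i) for i in [-1, -2, -3, -4]
93 | ... )
94 |
95 | >>> results
96 | [1, 2, 3, 4]
97 | ```
98 |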
--------------------------------------------------------------------------------
/docs/api/utils/batchify.md:
--------------------------------------------------------------------------------
1 | # batchify
2 |
3 | Split a list into batches and optionally display a progress bar.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **X** (*list[str]*)
10 |
11 | A list of items to be batched.
12 |
13 | - **batch_size** (*int*)
14 |
15 | The number of items in each batch.
16 |
17 | - **desc** (*str*) – defaults to ``
18 |
19 | A description to display in the progress bar.
20 |
21 | - **tqdm_bar** (*bool*) – defaults to `True`
22 |
23 | Whether to display a progress bar using `tqdm`.
24 |
25 |
26 |
27 | ## Examples
28 |
29 | ```python
30 | >>> items = ["a", "b", "c", "d", "e", "f"]
31 | >>> batches = list(batchify(items, batch_size=2))
32 | >>> for batch in batches:
33 | ... print(batch)
34 | ['a', 'b']
35 | ['c', 'd']
36 | ['e', 'f']
37 | ```
38 |
39 |
--------------------------------------------------------------------------------
/docs/api/utils/generate-random-hash.md:
--------------------------------------------------------------------------------
1 | # generate_random_hash
2 |
3 | Generate a random SHA-256 hash.
4 |
5 |
6 |
7 |
8 |
9 |
10 |
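11 | ## Examples
12 |
13 | A minimal sketch, assuming the hash is returned as a hexadecimal digest string (a SHA-256 hex digest is 64 characters long):
14 |
15 | ```python
16 | >>> from ducksearch import utils
17 |
18 | >>> token = utils.generate_random_hash()
19 |
20 | >>> len(token)
21 | 64
22 | ```
23 |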
--------------------------------------------------------------------------------
/docs/api/utils/get-list-columns-df.md:
--------------------------------------------------------------------------------
1 | # get_list_columns_df
2 |
3 | Get a list of columns from a list of dictionaries or a DataFrame.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **documents** (*list[dict] | pandas.core.frame.DataFrame*)
10 |
11 |
12 |
13 |
14 |
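15 | ## Examples
16 |
17 | A minimal sketch, assuming the function returns the union of the column names found in the input (sorted here because the order is not guaranteed):
18 |
19 | ```python
20 | >>> from ducksearch import utils
21 |
22 | >>> columns = utils.get_list_columns_df(
23 | ...     documents=[{"id": 0, "title": "Hotel California", "text": "..."}]
24 | ... )
25 |
26 | >>> sorted(columns)
27 | ['id', 'text', 'title']
28 | ```
29 |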
--------------------------------------------------------------------------------
/docs/api/utils/plot.md:
--------------------------------------------------------------------------------
1 | # plot
2 |
3 | Generate and display a markdown table with statistics of the specified dataset tables.
4 |
5 |
6 |
7 | ## Parameters
8 |
9 | - **database** (*str*)
10 |
11 | The name of the DuckDB database.
12 |
13 | - **config** (*None | dict*) – defaults to `None`
14 |
15 | Optional configuration options for the DuckDB connection.
16 |
17 | - **tables** – defaults to `['bm25_tables.documents', 'bm25_tables.queries', 'bm25_documents.lengths', 'bm25_queries.lengths', 'bm25_tables.documents_queries']`
18 |
19 | A list of table names to plot statistics for. Defaults to common BM25 tables.
20 |
21 |
22 |
23 | ## Examples
24 |
25 | ```python
26 | >>> from ducksearch import utils
27 |
28 | >>> utils.plot(database="test.duckdb")
29 | | Table | Size |
30 | |-----------|------|
31 | | documents | 5183 |
32 | | queries | 300 |
33 | ```
34 |
35 |
--------------------------------------------------------------------------------
/docs/benchmarks/.pages:
--------------------------------------------------------------------------------
1 | title: Benchmarks
2 | nav:
3 | - Benchmarks: benchmarks.md
4 |
--------------------------------------------------------------------------------
/docs/benchmarks/benchmarks.md:
--------------------------------------------------------------------------------
1 | ## Benchmarks
2 |
3 | ### DuckSearch and BM25s
4 |
5 | While DuckSearch provides advanced filtering features and the ability to update the index, it only scores the `top_k_token` most relevant documents per query token. These benchmarks might evolve with DuckDB improvements and DuckSearch updates.
6 |
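7 | To illustrate the trade-off, `top_k_token` can be lowered to speed up search at the cost of some recall; a hypothetical sketch (the database name is illustrative):
8 |
9 | ```python
10 | from ducksearch import search
11 |
12 | search.documents(
13 |     database="bench.duckdb",
14 |     queries=["sample query"],
15 |     top_k=10,
16 |     top_k_token=1_000,  # score fewer documents per query token: faster, approximate
17 | )
18 | ```
19 |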
7 | === "Table"
8 |
9 | | Dataset | Metric | Ducksearch | BM25s | Difference (Ducksearch - BM25s) |
10 | |-------------------|---------------|-------------|-----------|---------------------------------|
11 | | **arguana** | ndcg@10 | 0.3779 | 0.3663 | +0.0116 |
12 | | | hits@1 | 0.0 | 0.0 | 0.0 |
13 | | | mrr@10 | 0.2491 | 0.2443 | +0.0048 |
14 | | | map@10 | 0.2528 | 0.2430 | +0.0098 |
15 | | | qps | 117.80 | 2113.50 | -1995.70 |
16 | | | Index Time(s) | 1.42 | 0.48 | +0.94 |
17 | | **climate-fever** | ndcg@10 | 0.1184 | 0.1313 | -0.0129 |
18 | | | hits@1 | 0.1068 | 0.1186 | -0.0118 |
19 | | | mrr@10 | 0.1644 | 0.1809 | -0.0165 |
20 | | | map@10 | 0.0803 | 0.0907 | -0.0104 |
21 | | | qps | 5.88 | 99.49 | -93.61 |
22 | | | Index Time(s) | 302.39 | 209.97 | +92.42 |
23 | | **dbpedia-entity**| ndcg@10 | 0.6046 | 0.6172 | -0.0126 |
24 | | | hits@1 | 0.7669 | 0.7744 | -0.0075 |
25 | | | mrr@10 | 0.8311 | 0.8382 | -0.0071 |
26 | | | map@10 | 0.0649 | 0.0672 | -0.0023 |
27 | | | qps | 113.20 | 182.79 | -69.59 |
28 | | | Index Time(s) | 181.42 | 119.18 | +62.24 |
29 | | **fever** | ndcg@10 | 0.3861 | 0.4825 | -0.0964 |
30 | | | hits@1 | 0.2583 | 0.3312 | -0.0729 |
31 | | | mrr@10 | 0.3525 | 0.4423 | -0.0898 |
32 | | | map@10 | 0.3329 | 0.4212 | -0.0883 |
33 | | | qps | 74.40 | 104.97 | -30.57 |
34 | | | Index Time(s) | 329.70 | 207.52 | +122.18 |
35 | | **fiqa** | ndcg@10 | 0.2445 | 0.2326 | +0.0119 |
36 | | | hits@1 | 0.2207 | 0.2160 | +0.0047 |
37 | | | mrr@10 | 0.3002 | 0.2875 | +0.0127 |
38 | | | map@10 | 0.1848 | 0.1726 | +0.0122 |
39 | | | qps | 545.77 | 2157.35 | -1611.58 |
40 | | | Index Time(s) | 6.04 | 4.27 | +1.77 |
41 | | **hotpotqa** | ndcg@10 | 0.4487 | 0.5630 | -0.1143 |
42 | | | hits@1 | 0.5059 | 0.6523 | -0.1464 |
43 | | | mrr@10 | 0.5846 | 0.7249 | -0.1403 |
44 | | | map@10 | 0.3642 | 0.4697 | -0.1055 |
45 | | | qps | 48.15 | 104.43 | -56.28 |
46 | | | Index Time(s) | 163.14 | 123.39 | +39.75 |
47 | | **msmarco** | ndcg@10 | 0.8951 | 0.9705 | -0.0754 |
48 | | | hits@1 | 1.0 | 1.0 | 0.0 |
49 | | | mrr@10 | 1.0 | 1.0 | 0.0 |
50 | | | map@10 | 0.0459 | 0.0532 | -0.0073 |
51 | | | qps | 35.11 | 71.26 | -36.15 |
52 | | | Index Time(s) | 202.37 | 229.22 | -26.85 |
53 | | **nfcorpus** | ndcg@10 | 0.3301 | 0.3059 | +0.0242 |
54 | | | hits@1 | 0.4396 | 0.4458 | -0.0062 |
55 | | | mrr@10 | 0.5292 | 0.5205 | +0.0087 |
56 | | | map@10 | 0.1233 | 0.1168 | +0.0065 |
57 | | | qps | 3464.66 | 3933.12 | -468.46 |
58 | | | Index Time(s) | 0.99 | 1.67 | -0.68 |
59 | | **nq** | ndcg@10 | 0.2451 | 0.2735 | -0.0284 |
60 | | | hits@1 | 0.1272 | 0.1460 | -0.0188 |
61 | | | mrr@10 | 0.2099 | 0.2366 | -0.0267 |
62 | | | map@10 | 0.1934 | 0.2177 | -0.0243 |
63 | | | qps | 150.23 | 272.62 | -122.39 |
64 | | | Index Time(s) | 71.43 | 87.98 | -16.55 |
65 | | **quora** | ndcg@10 | 0.7705 | 0.7491 | +0.0214 |
66 | | | hits@1 | 0.6783 | 0.6622 | +0.0161 |
67 | | | mrr@10 | 0.7606 | 0.7433 | +0.0173 |
68 | | | map@10 | 0.7206 | 0.6988 | +0.0218 |
69 | | | qps | 741.13 | 1004.44 | -263.31 |
70 | | | Index Time(s) | 3.78 | 6.57 | -2.79 |
71 | | **scidocs** | ndcg@10 | 0.1025 | 0.0993 | +0.0032 |
72 | | | hits@1 | 0.1790 | 0.1910 | -0.0120 |
73 | | | mrr@10 | 0.2754 | 0.2765 | -0.0011 |
74 | | | map@10 | 0.0154 | 0.0147 | +0.0007 |
75 | | | qps | 879.11 | 3570.06 | -2690.95 |
76 | | | Index Time(s) | 4.46 | 1.64 | +2.82 |
77 | | **scifact** | ndcg@10 | 0.6908 | 0.6617 | +0.0291 |
78 | | | hits@1 | 0.5533 | 0.5433 | +0.0100 |
79 | | | mrr@10 | 0.6527 | 0.6312 | +0.0215 |
81 | | | map@10 | 0.6416 | 0.6199 | +0.0217 |
82 | | | qps | 2153.64 | 3708.28 | -1554.64 |
83 | | | Index Time(s) | 1.22 | 0.41 | +0.81 |
84 | | **trec-covid** | ndcg@10 | 0.9533 | 0.8983 | +0.0550 |
85 | | | hits@1 | 1.0 | 0.92 | +0.08 |
86 | | | mrr@10 | 1.0 | 0.96 | +0.04 |
87 | | | map@10 | 0.0074 | 0.0069 | +0.0005 |
88 | | | qps | 112.38 | 1275.41 | -1163.03 |
89 | | | Index Time(s) | 22.15 | 10.15 | +12.00 |
90 | | **webis-touche2020** | ndcg@10 | 0.4130 | 0.4671 | -0.0541 |
91 | | | hits@1 | 0.5510 | 0.6122 | -0.0612 |
92 | | | mrr@10 | 0.7114 | 0.7541 | -0.0427 |
93 | | | map@10 | 0.0564 | 0.0659 | -0.0095 |
94 | | | qps | 104.65 | 961.73 | -857.08 |
95 | | | Index Time(s) | 44.14 | 34.89 | +9.25 |
96 |
97 |
--------------------------------------------------------------------------------
/docs/css/version-select.css:
--------------------------------------------------------------------------------
1 | @media only screen and (max-width:76.1875em) {
2 | #version-selector {
3 | padding: .6rem .8rem;
4 | }
5 | }
--------------------------------------------------------------------------------
/docs/documentation/.pages:
--------------------------------------------------------------------------------
1 | title: Documentation
2 | nav:
3 | - Upload: upload.md
4 | - Search: search.md
5 | - Delete: delete.md
6 | - Update: update.md
7 |
8 |
--------------------------------------------------------------------------------
/docs/documentation/delete.md:
--------------------------------------------------------------------------------
1 | ## Delete
2 |
3 | To delete a document, you need to provide the document's ID. The delete operation will remove the document from the database and update the index.
4 |
5 | ```python
6 | from ducksearch import delete
7 |
8 | delete.documents(
9 | database="ducksearch.duckdb",
10 | ids=[0, 1],
11 | )
12 | ```
--------------------------------------------------------------------------------
/docs/documentation/graph.md:
--------------------------------------------------------------------------------
1 | ## Graph
2 |
3 | The `search.graphs` function can be used to search documents with a graph-based query. This function is useful if we have paired documents and queries. The search will retrieve the set of documents and queries that match the input query. Then it will build a graph and compute the weight of each document using a graph-based scoring function.
4 |
5 | The `search.graphs` function is much slower than the `search.documents` function, but it might provide better results given a decent amount of paired documents and queries.
6 |
7 | ### Document-query interactions
8 |
9 | We can upload document-query interactions in order to call the `search.graphs` function. The following example demonstrates how to upload them:
10 |
11 | ```python
12 | from ducksearch import search, upload
13 |
14 | documents = [
15 | {
16 | "id": 0,
17 | "title": "Hotel California",
18 | "style": "rock",
19 | "date": "1977-02-22",
20 | "popularity": 9,
21 | },
22 | {
23 | "id": 1,
24 | "title": "Here Comes the Sun",
25 | "style": "rock",
26 | "date": "1969-06-10",
27 | "popularity": 10,
28 | },
29 | {
30 | "id": 2,
31 | "title": "Alive",
32 | "style": "electro, punk",
33 | "date": "2007-11-19",
34 | "popularity": 9,
35 | },
36 | ]
37 |
38 | upload.documents(
39 | database="ducksearch.duckdb",
40 | key="id",
41 | fields=["title", "style", "date", "popularity"],
42 | documents=documents,
43 | dtypes={
44 | "date": "DATE",
45 | "popularity": "INT",
46 | },
47 | )
48 |
49 | # Mapping between document ids and queries
50 | documents_queries = {
51 | 0: ["the beatles", "rock band"],
52 | 1: ["rock band", "california"],
53 | 2: ["daft"],
54 | }
55 |
56 | upload.queries(
57 | database="ducksearch.duckdb",
58 | documents_queries=documents_queries,
59 | )
60 | ```
61 |
62 | ???+ tip
63 | We can write the document-query mapping as a dictionary of dictionaries, where each inner value is the weight between the document and the query. The weight is used to compute the score in the `search.graphs` function:
64 |
65 | ```python
66 | documents_queries = {
67 | 0: {"the beatles": 30, "rock band": 10},
68 | 1: {"rock band": 10, "california": 1},
69 | 2: {"daft": 60},
70 | }
71 | ```
72 |
73 | When the weight is not specified, the default value is 1.
74 |
75 | ### Search Graphs
76 |
77 | The following example demonstrates how to search documents with a graph-based query:
78 |
79 | ```python
80 | from ducksearch import search
81 |
82 | search.graphs(
83 | database="ducksearch.duckdb",
84 | queries="daft punk",
85 | top_k=10,
86 | )
87 | ```
88 |
89 | ```python
90 | [
91 | {
92 | "id": "2",
93 | "title": "Alive",
94 | "style": "electro, punk",
95 | "date": Timestamp("2007-11-19 00:00:00"),
96 | "popularity": 9,
97 | "score": 2.877532958984375,
98 | }
99 | ]
100 | ```
--------------------------------------------------------------------------------
/docs/documentation/search.md:
--------------------------------------------------------------------------------
1 | ???+ note
2 | Before we can search for documents, we need to upload them to DuckDB. We can use the `upload.documents` function to upload a list of dictionaries to DuckDB.
3 |
4 | ## Search
5 |
6 | All the search functions require a DuckDB database name as their first argument: the name of the DuckDB database where the documents are stored, i.e., the same one used in the `upload.documents` function. Each search function can take additional parameters to control the search behavior, such as the number of documents to return, the number of documents to score for each query token, the number of parallel jobs to use, and optional SQL filters.
7 |
8 | ### Documents
9 |
10 | Once the documents are uploaded, we can search for them using the `search.documents` function.
11 | The search function returns a list of lists of documents ordered by their BM25 score.
12 |
13 | ```python
14 | from ducksearch import search
15 |
16 | search.documents(
15 | database="ducksearch.duckdb",
16 | queries=["daft punk", "rock"],
17 | top_k=10,
18 | top_k_token=10_000,
19 | batch_size=32,
20 | n_jobs=-1,
21 | )
22 | ```
23 |
24 | ```python
25 | [
26 | [
27 | {
28 | "id": "2",
29 | "title": "Alive",
30 | "style": "electro, punk",
31 | "date": Timestamp("2007-11-19 00:00:00"),
32 | "popularity": 9,
33 | "score": 0.16131360828876495,
34 | }
35 | ],
36 | [
37 | {
38 | "id": "1",
39 | "title": "Here Comes the Sun",
40 | "style": "rock",
41 | "date": Timestamp("1969-06-10 00:00:00"),
42 | "popularity": 10,
43 | "score": 0.09199773520231247,
44 | },
45 | {
46 | "id": "0",
47 | "title": "Hotel California",
48 | "style": "rock",
49 | "date": Timestamp("1977-02-22 00:00:00"),
50 | "popularity": 9,
51 | "score": 0.07729987800121307,
52 | },
53 | ],
54 | ]
55 | ```
56 |
57 | ???+ info
58 | The search function is executed in parallel using the `n_jobs` parameter. We can control the number of documents to return using the `top_k` parameter and the number of documents to score for each query token using the `top_k_token` parameter. Reducing `top_k_token` can further speed up the search but may result in lower quality results.
59 |
60 | ### Filters
61 |
62 | We can apply filters to the search using the `filters` parameter. The filters are SQL expressions that are applied to the search results.
63 |
64 | ```python
65 | from ducksearch import search
66 |
67 | search.documents(
68 | database="ducksearch.duckdb",
69 | queries=["rock", "california"],
70 | top_k=10,
71 | top_k_token=10_000,
72 | batch_size=32,
73 | filters="YEAR(date) <= 1990 AND YEAR(date) >= 1970",
74 | n_jobs=-1,
75 | )
76 | ```
77 |
78 | ```python
79 | [
80 | [
81 | {
82 | "score": 0.07729987800121307,
83 | "id": "0",
84 | "title": "Hotel California",
85 | "style": "rock",
86 | "date": Timestamp("1977-02-22 00:00:00"),
87 | "popularity": 9,
88 | }
89 | ],
90 | [
91 | {
92 | "score": 0.16131360828876495,
93 | "id": "0",
94 | "title": "Hotel California",
95 | "style": "rock",
96 | "date": Timestamp("1977-02-22 00:00:00"),
97 | "popularity": 9,
98 | }
99 | ],
100 | ]
101 | ```
102 |
103 | ???+ info
104 | The filters are evaluated by DuckDB, so all DuckDB functions are available for use in the filters. You can find more information about DuckDB functions in the [DuckDB documentation](https://duckdb.org/docs/sql/functions/overview).
105 |
106 |
--------------------------------------------------------------------------------
/docs/documentation/update.md:
--------------------------------------------------------------------------------
1 | ## Update
2 |
3 | To update a document, you need to first delete it and then upload the updated version. The delete operation removes the document from the database and updates the index; the upload operation then adds the updated document and updates the index again.
4 |
5 | ```python
6 | from ducksearch import delete, upload
7 |
8 | delete.documents(
9 | database="ducksearch.duckdb",
10 | ids=[0, 1],
11 | )
12 |
13 | documents_updated = [
14 | {
15 | "id": 0,
16 | "title": "Hotel California",
17 | "style": "rock",
18 | "date": "1977-02-22",
19 | "popularity": 9,
20 | },
21 | {
22 | "id": 1,
23 | "title": "Here Comes the Sun",
24 | "style": "rock",
25 | "date": "1969-06-10",
26 | "popularity": 10,
27 | },
28 | ]
29 |
30 | upload.documents(
31 | database="ducksearch.duckdb",
32 | key="id",
33 | fields=["title", "style", "date", "popularity"],
34 | documents=documents_updated,
35 | dtypes={
36 | "date": "DATE",
37 | "popularity": "INT",
38 | },
39 | )
40 | ```
41 |
--------------------------------------------------------------------------------
/docs/documentation/upload.md:
--------------------------------------------------------------------------------
1 | ## Upload
2 |
3 | When working with DuckSearch, the first step is to upload documents to DuckDB using the `upload.documents` function. The documents are stored in a DuckDB database, and the fields are indexed with BM25. DuckSearch won't re-index a document if it already exists in the database; the index is updated along with the new documents.
4 |
5 | ### Upload documents
6 |
7 | The following example demonstrates how to upload a list of documents:
8 |
9 | ```python
10 | from ducksearch import upload
11 |
12 | documents = [
13 | {
14 | "id": 0,
15 | "title": "Hotel California",
16 | "style": "rock",
17 | "date": "1977-02-22",
18 | "popularity": 9,
19 | },
20 | {
21 | "id": 1,
22 | "title": "Here Comes the Sun",
23 | "style": "rock",
24 | "date": "1969-06-10",
25 | "popularity": 10,
26 | },
27 | {
28 | "id": 2,
29 | "title": "Alive",
30 | "style": "electro, punk",
31 | "date": "2007-11-19",
32 | "popularity": 9,
33 | },
34 | ]
35 |
36 | upload.documents(
37 | database="ducksearch.duckdb",
38 | key="id", # unique document identifier
39 | fields=["title", "style", "date", "popularity"], # list of fields to index
40 | documents=documents,
41 | stopwords="english",
42 | stemmer="porter",
43 | lower=True,
44 | strip_accents=True,
45 | dtypes={
46 | "date": "DATE",
47 | "popularity": "INT",
48 | },
49 | )
50 | ```
51 |
52 | ???+ info
53 | stopwords: List of stop words to filter out. Defaults to 'english', a pre-defined list of 571 English stopwords.
54 |
55 | stemmer: Stemmer to use. Defaults to 'porter' for the Porter stemmer. Possible values are: 'arabic', 'basque', 'catalan', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'lithuanian', 'nepali', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'serbian', 'spanish', 'swedish', 'tamil', 'turkish', or `None` if no stemming is to be used.
56 |
57 | lower: Whether to convert the text to lowercase. Defaults to `True`.
58 |
59 | strip_accents: Whether to strip accents from the text. Defaults to `True`.
60 |
61 | ### HuggingFace
62 |
63 | The `upload.documents` function can also index HuggingFace datasets directly from a URL.
64 | The following example demonstrates how to index the FineWeb dataset from HuggingFace:
65 |
66 | ```python
67 | from ducksearch import upload
68 |
69 | upload.documents(
70 | database="fineweb.duckdb",
71 | key="id",
72 | fields=["text", "url", "date", "language", "token_count", "language_score"],
73 | documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
74 | dtypes={
75 | "date": "DATE",
76 | "token_count": "INT",
77 | "language_score": "FLOAT",
78 | },
79 | limit=1000, # demonstrate with a small dataset
80 | )
81 | ```
82 |
83 | ???+ info
84 | More information about DuckDB and HuggingFace compatibility can be found [here](https://huggingface.co/docs/hub/en/datasets-duckdb) and [here](https://duckdb.org/2024/05/29/access-150k-plus-datasets-from-hugging-face-with-duckdb.html).
85 |
--------------------------------------------------------------------------------
/docs/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lightonai/ducksearch/91422599772f909f490f441ef38415e38224c6d5/docs/img/logo.png
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 |
2 | # DuckSearch
3 |
4 | Efficient BM25 with DuckDB 🦆
5 |
6 | ![DuckSearch](img/logo.png)
7 |
16 | DuckSearch is a lightweight and easy-to-use library to search documents. DuckSearch is built on top of DuckDB, a high-performance analytical database designed to execute analytical SQL queries quickly, and DuckSearch leverages this to provide efficient search and filtering features. The DuckSearch index can be updated with new documents, and documents can be deleted as well. DuckSearch also supports HuggingFace datasets, allowing you to index datasets directly from the HuggingFace Hub.
17 |
18 |
19 | ## Installation
20 |
21 | Install DuckSearch using pip:
22 |
23 | ```bash
24 | pip install ducksearch
25 | ```
26 |
27 | ## Documentation
28 |
29 | The complete documentation is available [here](https://lightonai.github.io/ducksearch/), which includes in-depth guides, examples, and API references.
30 |
31 | ### Upload
32 |
33 | We can upload documents to DuckDB using the `upload.documents` function. The documents are stored in a DuckDB database, and the `fields` are indexed with BM25.
34 |
35 | ```python
36 | from ducksearch import upload
37 |
38 | documents = [
39 | {
40 | "id": 0,
41 | "title": "Hotel California",
42 | "style": "rock",
43 | "date": "1977-02-22",
44 | "popularity": 9,
45 | },
46 | {
47 | "id": 1,
48 | "title": "Here Comes the Sun",
49 | "style": "rock",
50 | "date": "1969-06-10",
51 | "popularity": 10,
52 | },
53 | {
54 | "id": 2,
55 | "title": "Alive",
56 | "style": "electro, punk",
57 | "date": "2007-11-19",
58 | "popularity": 9,
59 | },
60 | ]
61 |
62 | upload.documents(
63 | database="ducksearch.duckdb",
64 | key="id", # Unique document identifier
65 | fields=["title", "style"], # List of fields to use for search.
66 | documents=documents,
67 | dtypes={
68 | "date": "DATE",
69 | "popularity": "INT",
70 | },
71 | )
72 | ```
73 |
74 | ## Search
75 |
76 | `search.documents` returns a list of lists of documents ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents with a date after 1970 and a popularity score greater than 8. We order the results by a weighted sum of the BM25 score and the popularity score provided in the document.
77 |
78 | ```python
79 | from ducksearch import search
80 |
81 | search.documents(
82 | database="ducksearch.duckdb",
83 | queries=["punk", "california"],
84 | top_k=10,
85 | filters="YEAR(date) >= 1970 AND popularity > 8",
86 | order_by="0.8 * score + 0.2 * popularity DESC",
87 | )
88 | ```
89 |
90 | ```python
91 | [
92 | [
93 | {
94 | "id": "2",
95 | "title": "Alive",
96 | "style": "electro, punk",
97 | "date": Timestamp("2007-11-19 00:00:00"),
98 | "popularity": 9,
99 | "score": 0.17841622233390808,
100 | }
101 | ],
102 | [
103 | {
104 | "id": "0",
105 | "title": "Hotel California",
106 | "style": "rock, pop",
107 | "date": Timestamp("1977-02-22 00:00:00"),
108 | "popularity": 9,
109 | "score": 0.156318798661232,
110 | }
111 | ],
112 | ]
113 | ```
114 |
115 | Filters are SQL expressions that are applied to the search results. We can use every filtering function DuckDB provides such as [date functions](https://duckdb.org/docs/sql/functions/date).
116 |
117 | Both `filters` and `order_by` parameters are optional. If not provided, the results are ordered by BM25 relevance and no filters are applied.
118 |
119 | ## Delete and update index
120 |
121 | We can delete documents and update the BM25 weights accordingly using the `delete.documents` function.
122 |
123 | ```python
124 | from ducksearch import delete
125 |
126 | delete.documents(
127 | database="ducksearch.duckdb",
128 | ids=[0, 1],
129 | )
130 | ```
131 |
132 | To update the index, we should first delete the documents and then upload the updated documents.
133 |
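134 | A minimal sketch of this delete-then-upload cycle (the document values are illustrative):
135 |
136 | ```python
137 | from ducksearch import delete, upload
138 |
139 | # Remove the stale version of the document and update the BM25 weights.
140 | delete.documents(database="ducksearch.duckdb", ids=[2])
141 |
142 | # Upload the new version; the index is updated along with it.
143 | upload.documents(
144 |     database="ducksearch.duckdb",
145 |     key="id",
146 |     fields=["title", "style"],
147 |     documents=[{"id": 2, "title": "Alive (live version)", "style": "electro"}],
148 | )
149 | ```
150 |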
134 | ## Extra features
135 |
136 | ### HuggingFace
137 |
138 | The `upload.documents` function can also index HuggingFace datasets directly from a URL. The following example demonstrates how to index the FineWeb dataset from HuggingFace. We use the fields "text" and "url" for search, and specify the data types for the "date", "token_count", and "language_score" fields so we can filter the results.
139 |
140 | ```python
141 | from ducksearch import upload
142 |
143 | upload.documents(
144 | database="fineweb.duckdb",
145 | key="id",
146 | fields=["text", "url"],
147 | documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
148 | dtypes={
149 | "date": "DATE",
150 | "token_count": "INT",
151 | "language_score": "FLOAT",
152 | },
153 | limit=3000, # demonstrate with a small dataset
154 | )
155 | ```
156 |
157 | We can then search the FineWeb dataset with the `search.documents` function. We order the results by BM25 score and then by date.
158 |
159 | ```python
160 | from ducksearch import search
161 |
162 | search.documents(
163 | database="fineweb.duckdb",
164 | queries=["earth science"],
165 | top_k=2,
166 | order_by="score DESC, date DESC",
167 | )
168 | ```
169 |
170 | ```python
171 | [
172 | [
173 | {
174 | "id": "",
175 | "text": "Earth Science Tutors in Rowland...",
176 | "id_1": "",
177 | "dump": "CC-MAIN-2017-34",
178 | "url": "http://rowland.universitytutor.com/rowland_earth-science-tutoring",
179 | "date": Timestamp("2017-08-19 00:00:00"),
180 | "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2017-34/segments/1502886105304.35/warc/CC-MAIN-20170819051034-20170819071034-00240.warc.gz",
181 | "language": "en",
182 | "language_score": 0.8718525171279907,
183 | "token_count": 313,
184 | "bm25id": 523,
185 | "score": 2.3761106729507446,
186 | },
187 | {
188 | "id": "",
189 | "text": "- Geomagnetic field....",
190 | "id_1": "",
191 | "dump": "CC-MAIN-2022-21",
192 | "url": "https://www.imperial.ac.uk/people/adrian.muxworthy/?respub-action=citation.html&id=1149861&noscript=noscript",
193 | "date": Timestamp("2022-05-20 00:00:00"),
194 | "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2022-21/segments/1652662530553.34/warc/CC-MAIN-20220519235259-20220520025259-00601.warc.gz",
195 | "language": "en",
196 | "language_score": 0.8225595951080322,
197 | "token_count": 517,
198 | "bm25id": 4783,
199 | "score": 2.3569871187210083,
200 | },
201 | ]
202 | ]
203 |
204 | ```
205 |
206 | Note: by default, results are ordered by BM25 relevance.
207 |
208 | ## Tables
209 |
210 | DuckSearch creates two distinct schemas: `bm25_tables` and `bm25_documents`.
211 |
212 | - We can find the uploaded documents in the `bm25_tables.documents` table.
213 |
214 | - We can find the inverted index in the `bm25_documents.scores` table. You can update the scores as you wish (see the inspection snippet below); just note that token scores will be recomputed each time you upload documents (every token score mentioned in the set of uploaded documents).
215 |
216 | - We can update the set of stopwords in the `bm25_documents.stopwords` table.
217 |
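218 | Since everything lives in a regular DuckDB file, these tables can be inspected directly; a minimal sketch:
219 |
220 | ```python
221 | import duckdb
222 |
223 | conn = duckdb.connect("ducksearch.duckdb", read_only=True)
224 |
225 | # Uploaded documents.
226 | print(conn.execute("SELECT * FROM bm25_tables.documents LIMIT 3").fetchdf())
227 |
228 | # Inverted index: one row per term, with document ids and pre-computed scores.
229 | print(
230 |     conn.execute(
231 |         "SELECT term, list_docids, list_scores FROM bm25_documents.scores LIMIT 3"
232 |     ).fetchdf()
233 | )
234 |
235 | conn.close()
236 | ```
237 |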
218 | ## Benchmark
219 |
220 |
221 | | Dataset | ndcg@10 | hits@1 | hits@10 | mrr@10 | map@10 | r-precision | qps | Indexation Time (s) | Number of Documents and Queries |
222 | |-------------------|-----------|---------|----------|----------|---------|-------------|----------------|---------------------|--------------------------------|
223 | | arguana | 0.3779 | 0.0 | 0.8267 | 0.2491 | 0.2528 | 0.0108 | 117.80 | 1.42 | 1,406 queries, 8.67K documents |
224 | | climate-fever | 0.1184 | 0.1068 | 0.3648 | 0.1644 | 0.0803 | 0.0758 | 5.88 | 302.39 | 1,535 queries, 5.42M documents |
225 | | dbpedia-entity | 0.6046 | 0.7669 | 5.6241 | 0.8311 | 0.0649 | 0.0741 | 113.20 | 181.42 | 400 queries, 4.63M documents |
226 | | fever | 0.3861 | 0.2583 | 0.5826 | 0.3525 | 0.3329 | 0.2497 | 74.40 | 329.70 | 6,666 queries, 5.42M documents |
227 | | fiqa | 0.2445 | 0.2207 | 0.6790 | 0.3002 | 0.1848 | 0.1594 | 545.77 | 6.04 | 648 queries, 57K documents |
228 | | hotpotqa | 0.4487 | 0.5059 | 0.9699 | 0.5846 | 0.3642 | 0.3388 | 48.15 | 163.14 | 7,405 queries, 5.23M documents |
229 | | msmarco | 0.8951 | 1.0 | 8.6279 | 1.0 | 0.0459 | 0.0473 | 35.11 | 202.37 | 6,980 queries, 8.84M documents |
230 | | nfcorpus | 0.3301 | 0.4396 | 2.4087 | 0.5292 | 0.1233 | 0.1383 | 3464.66 | 0.99 | 323 queries, 3.6K documents |
231 | | nq | 0.2451 | 0.1272 | 0.4574 | 0.2099 | 0.1934 | 0.1240 | 150.23 | 71.43 | 3,452 queries, 2.68M documents |
232 | | quora | 0.7705 | 0.6783 | 1.1749 | 0.7606 | 0.7206 | 0.6502 | 741.13 | 3.78 | 10,000 queries, 523K documents |
233 | | scidocs | 0.1025 | 0.1790 | 0.8240 | 0.2754 | 0.0154 | 0.0275 | 879.11 | 4.46 | 1,000 queries, 25K documents |
234 | | scifact | 0.6908 | 0.5533 | 0.9133 | 0.6527 | 0.6416 | 0.5468 | 2153.64 | 1.22 | 300 queries, 5K documents |
235 | | trec-covid | 0.9533 | 1.0 | 9.4800 | 1.0 | 0.0074 | 0.0077 | 112.38 | 22.15 | 50 queries, 171K documents |
236 | | webis-touche2020 | 0.4130 | 0.5510 | 3.7347 | 0.7114 | 0.0564 | 0.0827 | 104.65 | 44.14 | 49 queries, 382K documents |
237 |
238 | ## References
239 |
240 | - [DuckDB](https://duckdb.org/)
241 |
242 | - [DuckDB Full Text Search](https://duckdb.org/docs/extensions/full_text_search.html): Note that DuckSearch relies partially on the DuckDB Full Text Search extension but accelerates the search process via the `top_k_token` approximation, pre-computation of scores, and multi-threading.
243 |
244 | ## License
245 |
246 | DuckSearch is released under the MIT license.
247 |
248 | ## Citation
249 |
250 | ```
251 | @misc{DuckSearch,
252 | title={DuckSearch, efficient search with DuckDB},
253 | author={Sourty, Raphael},
254 | url={https://github.com/lightonai/ducksearch},
255 | year={2024}
256 | }
257 | ```
--------------------------------------------------------------------------------
/docs/javascripts/config.js:
--------------------------------------------------------------------------------
1 | window.MathJax = {
2 | tex: {
3 | inlineMath: [["\\(", "\\)"]],
4 | displayMath: [["\\[", "\\]"]],
5 | processEscapes: true,
6 | processEnvironments: true
7 | },
8 | options: {
9 | ignoreHtmlClass: ".*|",
10 | processHtmlClass: "arithmatex"
11 | }
12 | };
13 |
14 | document$.subscribe(() => {
15 | MathJax.typesetPromise()
16 | })
--------------------------------------------------------------------------------
/docs/javascripts/tablesort.js:
--------------------------------------------------------------------------------
1 | document$.subscribe(function () {
2 | var tables = document.querySelectorAll("article table:not([class])")
3 | tables.forEach(function (table) {
4 | new Tablesort(table)
5 | })
6 | })
--------------------------------------------------------------------------------
/docs/js/version-select.js:
--------------------------------------------------------------------------------
1 | window.addEventListener("DOMContentLoaded", function () {
2 | // This is a bit hacky. Figure out the base URL from a known CSS file the
3 | // template refers to...
4 | var ex = new RegExp("/?css/version-select.css$");
5 | var sheet = document.querySelector('link[href$="version-select.css"]');
6 |
7 | var ABS_BASE_URL = sheet.href.replace(ex, "");
8 | var CURRENT_VERSION = ABS_BASE_URL.split("/").pop();
9 |
10 | function makeSelect(options, selected) {
11 | var select = document.createElement("select");
12 | select.classList.add("form-control");
13 |
14 | options.forEach(function (i) {
15 | var option = new Option(i.text, i.value, undefined,
16 | i.value === selected);
17 | select.add(option);
18 | });
19 |
20 | return select;
21 | }
22 |
23 | var xhr = new XMLHttpRequest();
24 | xhr.open("GET", ABS_BASE_URL + "/../versions.json");
25 | xhr.onload = function () {
26 | var versions = JSON.parse(this.responseText);
27 |
28 | var realVersion = versions.find(function (i) {
29 | return i.version === CURRENT_VERSION ||
30 | i.aliases.includes(CURRENT_VERSION);
31 | }).version;
32 |
33 | var select = makeSelect(versions.map(function (i) {
34 | return { text: i.title, value: i.version };
35 | }), realVersion);
36 | select.addEventListener("change", function (event) {
37 | window.location.href = ABS_BASE_URL + "/../" + this.value;
38 | });
39 |
40 | var container = document.createElement("div");
41 | container.id = "version-selector";
42 | container.className = "md-nav__item";
43 | container.appendChild(select);
44 |
45 | var sidebar = document.querySelector(".md-nav--primary > .md-nav__list");
46 | sidebar.parentNode.insertBefore(container, sidebar);
47 | };
48 | xhr.send();
49 | });
--------------------------------------------------------------------------------
/docs/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | .md-typeset h2 {
2 | margin: 1.5em 0;
3 | padding-bottom: .4rem;
4 | border-bottom: .04rem solid var(--md-default-fg-color--lighter);
5 | }
6 |
7 | .md-footer {
8 | margin-top: 2em;
9 | }
10 |
11 | .md-typeset pre>code {
12 | border-radius: 0.5em;
13 | }
--------------------------------------------------------------------------------
/ducksearch/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["decorators", "evaluation", "hf", "search", "tables", "upload", "utils"]
2 |
--------------------------------------------------------------------------------
/ducksearch/__version__.py:
--------------------------------------------------------------------------------
1 | VERSION = (1, 0, 3)
2 |
3 | __version__ = ".".join(map(str, VERSION))
4 |
--------------------------------------------------------------------------------
/ducksearch/decorators/__init__.py:
--------------------------------------------------------------------------------
1 | from .execute_with_duckdb import connect_to_duckdb, execute_with_duckdb
2 |
3 | __all__ = ["execute_with_duckdb", "connect_to_duckdb"]
4 |
--------------------------------------------------------------------------------
/ducksearch/decorators/execute_with_duckdb.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import time
3 | from functools import wraps
4 |
5 | import duckdb
6 |
7 |
8 | def connect_to_duckdb(
9 | database: str,
10 | read_only: bool = False,
11 | config: dict | None = None,
12 | max_retry: int = 30,
13 | sleep_time: float = 0.1,
14 | **kwargs,
15 | ):
16 | """Establish a connection to the DuckDB database. Retry connecting if an error occurs.
17 |
18 | Parameters
19 | ----------
20 | database
21 | The name or path of the DuckDB database to connect to.
22 | read_only
23 | Whether to open the database in read-only mode. Default is False.
24 | config
25 | Optional configuration settings for the DuckDB connection.
26 | max_retry
27 | The maximum number of times to retry connecting to DuckDB.
28 | sleep_time
29 | The time to sleep between retries.
30 |
31 | Returns
32 | -------
33 | duckdb.DuckDBPyConnection
34 | A DuckDB connection object.
35 |
36 | """
37 | current_retry = 0
38 | while True:
39 | try:
40 | conn = (
41 | duckdb.connect(database=database, read_only=read_only, config=config)
42 | if config
43 | else duckdb.connect(database=database, read_only=read_only)
44 | )
45 | break
46 | except Exception as error:
47 | if current_retry >= max_retry:
48 | raise error
49 | time.sleep(sleep_time)
50 | current_retry += 1
51 |
52 | return conn
53 |
54 |
55 | def execute_with_duckdb(
56 | relative_path: str | list[str],
57 | read_only: bool = False,
58 | fields: list[str] | None = None,
59 | fetch_df: bool = False,
60 | **kwargs,
61 | ):
62 | """Decorator to execute a SQL query using DuckDB.
63 |
64 | Parameters
65 | ----------
66 | relative_path
67 | A string or list of strings specifying the path(s) to the SQL file(s).
68 | read_only
69 | Whether the DuckDB connection should be read-only. Default is False.
70 | fields
71 | A list of fields to use as keys for the result rows if returning records.
72 | fetch_df
73 | If True, fetch the result as a pandas DataFrame and return it as a list of dictionaries.
74 | kwargs
75 | Additional keyword arguments to be passed to the SQL query, useful for string formatting.
76 |
77 | Returns
78 | -------
79 | A decorator function that executes the SQL query and returns the result.
80 |
81 | """
82 |
83 | def decorator(func):
84 | @wraps(func)
85 | def wrapper(
86 | *args,
87 | database: str,
88 | config: dict | None = None,
89 | df: list[dict] = None,
90 | relative_path: str | list[str] = relative_path,
91 | **kwargs,
92 | ):
93 | """Connect to DuckDB and execute the query from the provided SQL file path(s)."""
94 | conn = connect_to_duckdb(
95 | database=database,
96 | read_only=read_only,
97 | config=config,
98 | **kwargs,
99 | )
100 |
101 | # Ensure relative_path is treated as a list
102 | if isinstance(relative_path, str):
103 | relative_path = [relative_path]
104 |
105 | try:
106 | # Loop through and execute all SQL files in relative_path
107 | for path in relative_path:
108 | # Build the full path to the SQL file
109 | path = pathlib.Path(__file__).parent.parent.joinpath(path)
110 |
111 | # Read the SQL query from the file
112 | with open(file=path, mode="r") as sql_file:
113 | query = sql_file.read()
114 |
115 | # Format the query with any additional kwargs
116 | if kwargs:
117 | query = query.format(**kwargs)
118 |
119 | # Fetch the result as a DataFrame or a list of rows
120 | if fetch_df:
121 | data = conn.execute(query).fetchdf()
122 | data.columns = data.columns.str.lower()
123 | data = data.to_dict(orient="records")
124 | else:
125 | data = conn.execute(query).fetchall()
126 |
127 | # If fields are provided, map the result rows to dictionaries with the specified field names
128 | if fields is not None:
129 | data = [dict(zip(fields, row)) for row in data]
130 |
131 | # Handle DuckDB-specific exceptions (e.g., too many open files)
132 | except duckdb.duckdb.IOException:
133 | message = "\n--------\nDuckDB exception, too many files open.\nGet current ulimit: ulimit -n\nIncrease ulimit with `ulimit -n 4096` or more.\n--------\n"
134 | raise duckdb.duckdb.IOException(message)
135 |
136 | # Handle other exceptions and provide more detailed error information
137 | except Exception as error:
138 | raise ValueError(
139 | "\n{}:\n{}\n{}:\n{}".format(
140 | type(error).__name__, path, error, query
141 | )
142 | )
143 |
144 | # Close the DuckDB connection in the end
145 | finally:
146 | conn.close()
147 |
148 | # Return the fetched data, if applicable
149 | if fetch_df:
150 | return data
151 |
152 | if data:
153 | return data
154 |
155 | return wrapper
156 |
157 | return decorator
158 |
--------------------------------------------------------------------------------
/ducksearch/delete/__init__.py:
--------------------------------------------------------------------------------
1 | from .documents import documents
2 |
3 | __all__ = ["documents"]
4 |
--------------------------------------------------------------------------------
/ducksearch/delete/delete/documents.sql:
--------------------------------------------------------------------------------
1 | DELETE FROM {schema}.documents
2 | USING parquet_scan('{parquet_file}') AS _df_documents
3 | WHERE {schema}.documents.id = _df_documents.id;
4 |
--------------------------------------------------------------------------------
/ducksearch/delete/delete/documents_queries.sql:
--------------------------------------------------------------------------------
1 | DELETE FROM {schema}.documents_queries
2 | USING parquet_scan('{parquet_file}') AS _df_documents
3 | WHERE {schema}.documents_queries.document_id = _df_documents.id;
--------------------------------------------------------------------------------
/ducksearch/delete/delete/scores.sql:
--------------------------------------------------------------------------------
1 | -- This query finds the set of token scores for which there won't be any docid / score left to keep.
2 | WITH _docs_to_delete AS (
3 | SELECT DISTINCT bm25.docid
4 | FROM parquet_scan('{parquet_file}') AS p
5 | INNER JOIN bm25_documents.docs AS bm25
6 | ON p.id = bm25.name
7 | ),
8 |
9 | _terms_to_recompute AS (
10 | SELECT DISTINCT term
11 | FROM bm25_documents.terms
12 | INNER JOIN _docs_to_delete
13 | ON bm25_documents.terms.docid = _docs_to_delete.docid
14 | INNER JOIN bm25_documents.dict
15 | ON bm25_documents.terms.termid = bm25_documents.dict.termid
16 | ),
17 |
18 | _scores_to_update AS (
19 | SELECT
20 | _bm25.term,
21 | _bm25.list_scores,
22 | _bm25.list_docids
23 | FROM bm25_documents.scores AS _bm25
24 | INNER JOIN _terms_to_recompute AS _terms
25 | ON _bm25.term = _terms.term
26 | ),
27 |
28 | _unested_scores AS (
29 | SELECT
30 | term,
31 | unnest(list_scores) AS score,
32 | unnest(list_docids) AS docid
33 | FROM _scores_to_update
34 | ),
35 |
36 | _unested_unfiltered_scores AS (
37 | SELECT
38 | _scores.term,
39 | _scores.docid,
40 | _scores.score,
41 | _docs.docid AS to_delete
42 | FROM _unested_scores AS _scores
43 | LEFT JOIN _docs_to_delete AS _docs
44 | ON _scores.docid = _docs.docid
45 | ),
46 |
47 | _unested_filtered_scores AS (
48 | SELECT
49 | term,
50 | docid,
51 | score
52 | FROM _unested_unfiltered_scores
53 | WHERE to_delete IS NULL
54 | ),
55 |
56 | _terms_to_delete AS (
57 | SELECT DISTINCT
58 | ttr.term,
59 | ufs.term AS missing
60 | FROM _terms_to_recompute AS ttr
61 | LEFT JOIN _unested_filtered_scores AS ufs
62 | ON ttr.term = ufs.term
63 | ),
64 |
65 | _scores_to_delete_completely AS (
66 | SELECT DISTINCT term
67 | FROM _terms_to_delete
68 | WHERE missing IS NULL
69 | )
70 |
71 | DELETE FROM bm25_documents.scores AS _scores
72 | USING _scores_to_delete_completely AS _scores_to_delete
73 | WHERE _scores.term = _scores_to_delete.term;
74 |
--------------------------------------------------------------------------------
/ducksearch/delete/documents.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pyarrow as pa
4 | import pyarrow.parquet as pq
5 |
6 | from ..decorators import execute_with_duckdb
7 | from ..utils import plot
8 |
9 |
10 | @execute_with_duckdb(
11 | relative_path="delete/delete/documents.sql",
12 | )
13 | def _drop_documents() -> None:
14 | """Delete documents from the documents table in DuckDB."""
15 |
16 |
17 | @execute_with_duckdb(
18 | relative_path="delete/update/scores.sql",
19 | )
20 | def _update_score() -> None:
21 | """Update the score after deleting documents."""
22 |
23 |
24 | @execute_with_duckdb(
25 | relative_path="delete/update/df.sql",
26 | )
27 | def _update_df() -> None:
28 | """Update the token frequency deleting documents."""
29 |
30 |
31 | @execute_with_duckdb(
32 | relative_path="delete/delete/scores.sql",
33 | )
34 | def _delete_score() -> None:
35 | """Delete the scores for which we don't keep any document."""
36 |
37 |
38 | @execute_with_duckdb(
39 | relative_path="delete/update/docs.sql",
40 | )
41 | def _update_docs() -> None:
42 | """Update the docs table."""
43 |
44 |
45 | @execute_with_duckdb(
46 | relative_path="delete/update/terms.sql",
47 | )
48 | def _update_terms() -> None:
49 | """Update the term table."""
50 |
51 |
52 | @execute_with_duckdb(
53 | relative_path="delete/update/stats.sql",
54 | )
55 | def _update_stats() -> None:
56 | """Update the term table."""
57 |
58 |
59 | def documents(
60 | database: str,
61 | ids: list[str],
62 | schema: str = "bm25_tables",
63 | config: dict | None = None,
64 | ) -> None:
65 | """Delete specified documents from the documents table.
66 |
67 | Parameters
68 | ----------
69 | database
70 | The name of the DuckDB database.
71 | ids
72 | A list of document IDs to delete.
73 | schema
74 | The schema where the documents table is located.
75 | config
76 | Optional configuration options for the DuckDB connection.
77 |
78 | Returns
79 | -------
80 | None
81 | The function deletes the specified documents and updates the plots.
82 |
83 | Examples
84 | --------
85 | >>> from ducksearch import upload, delete
86 |
87 | >>> documents = [
88 | ... {"id": 1, "title": "Document 1", "text": "This is the text of document 1."},
89 | ... {"id": 2, "title": "Document 2", "text": "This is the text of document 2."},
90 | ... {"id": 3, "title": "Document 3", "text": "This is the text of document 3."},
91 | ... ]
92 |
93 | >>> upload.documents(
94 | ... database="test.duckdb",
95 | ... key="id",
96 | ... fields=["title", "text"],
97 | ... documents=documents,
98 | ... )
99 | | Table | Size |
100 | |----------------|------|
101 | | documents | 3 |
102 | | bm25_documents | 3 |
103 |
104 | >>> delete.documents(
105 | ... database="test.duckdb",
106 | ... ids=[1, 2],
107 | ... )
108 | | Table | Size |
109 | |----------------|------|
110 | | documents | 1 |
111 | | bm25_documents | 1 |
112 |
113 | >>> delete.documents(
114 | ... database="test.duckdb",
115 | ... ids=[1, 2, 3],
116 | ... )
117 |
118 | """
119 | # Convert the list of document keys into a pyarrow Table for deletion
120 | documents_ids = pa.Table.from_pydict({"id": ids})
121 |
122 | # Write the document IDs to a parquet file for deletion
123 | pq.write_table(
124 | documents_ids,
125 | "_documents_ids.parquet",
126 | compression="snappy",
127 | )
128 |
129 | _delete_score(
130 | database=database,
131 | parquet_file="_documents_ids.parquet",
132 | config=config,
133 | )
134 |
135 | _update_score(
136 | database=database,
137 | parquet_file="_documents_ids.parquet",
138 | config=config,
139 | )
140 |
141 | _update_df(
142 | database=database,
143 | parquet_file="_documents_ids.parquet",
144 | config=config,
145 | )
146 |
147 | _update_terms(
148 | database=database,
149 | parquet_file="_documents_ids.parquet",
150 | config=config,
151 | )
152 |
153 | _update_docs(
154 | database=database,
155 | parquet_file="_documents_ids.parquet",
156 | config=config,
157 | )
158 |
159 | _update_stats(
160 | database=database,
161 | parquet_file="_documents_ids.parquet",
162 | config=config,
163 | )
164 |
165 | _drop_documents(
166 | database=database,
167 | schema=schema,
168 | parquet_file="_documents_ids.parquet",
169 | config=config,
170 | )
171 |
172 | if os.path.exists("_documents_ids.parquet"):
173 | os.remove("_documents_ids.parquet")
174 |
175 | # Plot the current state of the tables after deletion
176 | return plot(
177 | database=database,
178 | config=config,
179 | tables=[
180 | f"{schema}.documents",
181 | f"{schema}.queries",
182 | "bm25_documents.docs",
183 | "bm25_queries.docs",
184 | "bm25_tables.documents_queries",
185 | ],
186 | )
187 |
--------------------------------------------------------------------------------
/ducksearch/delete/update/df.sql:
--------------------------------------------------------------------------------
1 | WITH _docs_to_delete AS (
2 | SELECT DISTINCT bm25.docid
3 | FROM parquet_scan('{parquet_file}') AS p
4 | INNER JOIN bm25_documents.docs AS bm25
5 | ON p.id = bm25.name
6 | ),
7 |
8 | _tf AS (
9 | SELECT
10 | termid,
11 | sum(tf) AS df
12 | FROM bm25_documents.terms
13 | INNER JOIN _docs_to_delete
14 | ON bm25_documents.terms.docid = _docs_to_delete.docid
15 | GROUP BY 1
16 | )
17 |
18 | UPDATE bm25_documents.dict _dict
19 | SET df = greatest(_dict.df - _tf.df, 0)
20 | FROM _tf
21 | WHERE _dict.termid = _tf.termid;
22 |
--------------------------------------------------------------------------------
/ducksearch/delete/update/docs.sql:
--------------------------------------------------------------------------------
1 | DELETE FROM bm25_documents.docs AS _docs
2 | USING parquet_scan('{parquet_file}') AS _df_documents
3 | WHERE _docs.name = _df_documents.id;
4 |
--------------------------------------------------------------------------------
/ducksearch/delete/update/scores.sql:
--------------------------------------------------------------------------------
1 | -- This query finds the set of token scores for which there won't be any docid / score left to keep.
2 | WITH _docs_to_delete AS (
3 | SELECT DISTINCT bm25.docid
4 | FROM parquet_scan('{parquet_file}') AS p
5 | INNER JOIN bm25_documents.docs AS bm25
6 | ON p.id = bm25.name
7 | ),
8 |
9 | _terms_to_recompute AS (
10 | SELECT DISTINCT term
11 | FROM bm25_documents.terms
12 | INNER JOIN _docs_to_delete
13 | ON bm25_documents.terms.docid = _docs_to_delete.docid
14 | INNER JOIN bm25_documents.dict
15 | ON bm25_documents.terms.termid = bm25_documents.dict.termid
16 | ),
17 |
18 | _scores_to_update AS (
19 | SELECT
20 | _bm25.term,
21 | _bm25.list_scores,
22 | _bm25.list_docids
23 | FROM bm25_documents.scores AS _bm25
24 | INNER JOIN _terms_to_recompute AS _terms
25 | ON _bm25.term = _terms.term
26 | ),
27 |
28 | _unested_scores AS (
29 | SELECT
30 | term,
31 | unnest(list_scores) AS score,
32 | unnest(list_docids) AS docid
33 | FROM _scores_to_update
34 | ),
35 |
36 | _unested_unfiltered_scores AS (
37 | SELECT
38 | _scores.term,
39 | _scores.docid,
40 | _scores.score,
41 | _docs.docid AS to_delete
42 | FROM _unested_scores AS _scores
43 | LEFT JOIN _docs_to_delete AS _docs
44 | ON _scores.docid = _docs.docid
45 | ),
46 |
47 | _unested_filtered_scores AS (
48 | SELECT
49 | term,
50 | docid,
51 | score
52 | FROM _unested_unfiltered_scores
53 | WHERE to_delete IS NULL
54 | ),
55 |
56 | _list_scores AS (
57 | SELECT
58 | term,
59 | list(docid ORDER BY score DESC, docid ASC) AS list_docids,
60 | list(score ORDER BY score DESC, docid ASC) AS list_scores
61 | FROM _unested_filtered_scores
62 | GROUP BY 1
63 | )
64 |
65 | UPDATE bm25_documents.scores s
66 | SET
67 | list_docids = u.list_docids,
68 | list_scores = u.list_scores
69 | FROM _list_scores AS u
70 | WHERE s.term = u.term;
71 |
--------------------------------------------------------------------------------
/ducksearch/delete/update/stats.sql:
--------------------------------------------------------------------------------
1 | WITH _stats AS (
2 | SELECT
3 | COUNT(*) AS num_docs,
4 | AVG(len) AS avgdl
5 | FROM bm25_documents.docs
6 | )
7 |
8 | UPDATE bm25_documents.stats
9 | SET
10 | num_docs = _stats.num_docs,
11 | avgdl = _stats.avgdl
12 | FROM _stats;
13 |
--------------------------------------------------------------------------------
/ducksearch/delete/update/terms.sql:
--------------------------------------------------------------------------------
1 | WITH _docs_to_delete AS (
2 | SELECT bm25.docid
3 | FROM parquet_scan('{parquet_file}') AS p
4 | INNER JOIN bm25_documents.docs AS bm25
5 | ON p.id = bm25.name
6 | )
7 |
8 | DELETE FROM bm25_documents.terms AS _terms
9 | USING _docs_to_delete AS _docs
10 | WHERE _terms.docid = _docs.docid;
11 |
--------------------------------------------------------------------------------
/ducksearch/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluation import evaluate, load_beir
2 |
3 | __all__ = ["evaluate", "load_beir"]
4 |
--------------------------------------------------------------------------------
/ducksearch/evaluation/evaluation.py:
--------------------------------------------------------------------------------
1 | import collections
2 | from typing import Dict
3 |
4 | __all__ = ["evaluate", "load_beir"]
5 |
6 |
7 | def load_beir(dataset_name: str, split: str = "test") -> tuple[list, list, dict]:
8 | """Load BEIR dataset for document and query retrieval tasks.
9 |
10 | Parameters
11 | ----------
12 | dataset_name
13 | The name of the dataset to load (e.g., 'scifact').
14 | split
15 | The dataset split to load (e.g., 'test').
16 |
17 | Returns
18 | -------
19 | tuple
20 | A tuple containing three elements:
21 | - A list of document dictionaries, each containing 'id', 'title', and 'text' fields.
22 | - A list of queries.
23 | - A dictionary of qrels (query relevance judgments).
24 |
25 | Examples
26 | --------
27 | >>> documents, queries, qrels = load_beir("scifact", split="test")
28 |
29 | >>> len(documents)
30 | 5183
31 |
32 | >>> len(queries)
33 | 300
34 |
35 | """
36 | from beir import util
37 | from beir.datasets.data_loader import GenericDataLoader
38 |
39 | data_path = util.download_and_unzip(
40 | url=f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip",
41 | out_dir="./evaluation_datasets/",
42 | )
43 |
44 | documents, queries, qrels = GenericDataLoader(data_folder=data_path).load(
45 | split=split
46 | )
47 |
48 | # Format documents
49 | documents = [
50 | {
51 | "id": document_id,
52 | "title": document["title"],
53 | "text": document["text"],
54 | }
55 | for document_id, document in documents.items()
56 | ]
57 |
58 |     _queries = [queries[query_id] for query_id in qrels if query_id in queries]
59 |
60 | # Format qrels (relevance judgments)
61 | _qrels = collections.defaultdict(dict)
62 | for query_id, query_documents in qrels.items():
63 | for document in list(query_documents.keys()):
64 | if query_id in queries:
65 | _qrels[document][queries[query_id]] = 1
66 |
67 | return (
68 | documents,
69 | _queries,
70 | _qrels,
71 | )
72 |
73 |
74 | def evaluate(
75 | scores: list[list[dict]],
76 | qrels: dict,
77 | queries: list[str],
78 |     metrics: list | None = None,
79 | ) -> Dict[str, float]:
80 | """Evaluate the performance of document retrieval using relevance judgments.
81 |
82 | Parameters
83 | ----------
84 | scores
85 | A list of lists, where each sublist contains dictionaries representing the retrieved documents for a query.
86 | qrels
87 | A dictionary mapping queries to relevant documents and their relevance scores.
88 | queries
89 | A list of queries.
90 |     metrics
91 |         A list of metrics to compute. Defaults to "ndcg@10" and hits@k for k in (1, 2, 3, 4, 5, 10).
92 |
93 | Returns
94 | -------
95 | dict
96 | A dictionary mapping each metric to its computed value.
97 |
98 | Examples
99 | --------
100 | >>> from ducksearch import evaluation, upload, search
101 |
102 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test")
103 |
104 | >>> upload.documents(
105 | ... database="test.duckdb",
106 | ... key="id",
107 | ... fields=["title", "text"],
108 | ... documents=documents,
109 | ... )
110 | | Table | Size |
111 | |----------------|------|
112 | | documents | 5183 |
113 | | bm25_documents | 5183 |
114 |
115 | >>> scores = search.documents(
116 | ... database="test.duckdb",
117 | ... queries=queries,
118 | ... top_k=10,
119 | ... )
120 |
121 | """
122 | from ranx import Qrels, Run, evaluate
123 |
124 | # Format qrels for evaluation
125 | _qrels = collections.defaultdict(dict)
126 | for document_id, document_queries in qrels.items():
127 | for query, score in document_queries.items():
128 | _qrels[query][document_id] = score
129 |
130 | qrels = Qrels(qrels=_qrels)
131 |
132 | # Create a run dict to map queries to their respective retrieved documents and scores
133 | run_dict = {
134 | query: {
135 | match["id"]: 1 - (rank / len(query_matchs))
136 | for rank, match in enumerate(iterable=query_matchs)
137 | }
138 | for query, query_matchs in zip(queries, scores)
139 | }
140 |
141 | run = Run(run=run_dict)
142 |
143 | # Default metrics if none are provided
144 | if not metrics:
145 | metrics = ["ndcg@10"] + [f"hits@{k}" for k in [1, 2, 3, 4, 5, 10]]
146 |
147 | # Evaluate using ranx and return results
148 | return evaluate(
149 | qrels=qrels,
150 | run=run,
151 | metrics=metrics,
152 | make_comparable=True,
153 | )
154 |
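155 | # Minimal end-to-end sketch (assumes the "test.duckdb" index built in the
156 | # docstring above; metric values depend on the index contents):
157 | #
158 | #     from ducksearch import evaluation, search
159 | #
160 | #     documents, queries, qrels = evaluation.load_beir("scifact", split="test")
161 | #     scores = search.documents(database="test.duckdb", queries=queries, top_k=10)
162 | #     results = evaluation.evaluate(scores=scores, qrels=qrels, queries=queries)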
--------------------------------------------------------------------------------
/ducksearch/hf/__init__.py:
--------------------------------------------------------------------------------
1 | from .insert import count_rows, insert_documents
2 |
3 | __all__ = ["count_rows", "insert_documents"]
4 |
--------------------------------------------------------------------------------
/ducksearch/hf/drop/tmp.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE {schema}._hf_tmp;
2 |
--------------------------------------------------------------------------------
/ducksearch/hf/insert.py:
--------------------------------------------------------------------------------
1 | from ..decorators import execute_with_duckdb
2 | from ..tables import add_columns_documents, create_documents
3 |
4 |
5 | @execute_with_duckdb(
6 | relative_path="hf/insert/documents.sql",
7 | fetch_df=False,
8 | )
9 | def _insert_documents() -> None:
10 | """Insert the documents from Hugging Face datasets into DuckDB."""
11 |
12 |
13 | @execute_with_duckdb(
14 | relative_path="hf/select/count.sql",
15 | fetch_df=True,
16 | )
17 | def count_rows() -> None:
18 |     """Count the rows of the Hugging Face dataset stored at the given URL."""
19 |
20 |
21 | @execute_with_duckdb(
22 | relative_path="hf/select/columns.sql",
23 | fetch_df=True,
24 | read_only=True,
25 | )
26 | def _select_columns() -> None:
27 | """Select all columns from the HuggingFace documents table."""
28 |
29 |
30 | @execute_with_duckdb(
31 | relative_path="hf/select/exists.sql",
32 | fetch_df=True,
33 | read_only=True,
34 | )
35 | def _table_exists() -> None:
36 | """Check if the table exists in the DuckDB database."""
37 |
38 |
39 | @execute_with_duckdb(
40 | relative_path="hf/insert/tmp.sql",
41 | fetch_df=False,
42 | )
43 | def _insert_tmp_documents() -> None:
44 |     """Load the Hugging Face dataset into a temporary DuckDB table."""
45 |
46 |
47 | @execute_with_duckdb(
48 | relative_path="hf/drop/tmp.sql",
49 | fetch_df=True,
50 | )
51 | def _drop_tmp_table() -> None:
52 | """Drop the temporary HF table."""
53 |
54 |
55 | def insert_documents(
56 | database: str,
57 | schema: str,
58 | key: str,
59 | url: list[str] | str,
60 | config: dict | None = None,
61 | limit: int | None = None,
62 | offset: int | None = None,
63 | dtypes: dict | None = None,
64 | fast: bool = False,
65 | ) -> None:
66 | """Insert documents from a Hugging Face dataset into DuckDB.
67 |
68 | Parameters
69 | ----------
70 | database
71 | The name of the DuckDB database.
72 | schema
73 | The schema in which the documents table is located.
74 | key
75 | The key field that uniquely identifies each document (e.g., 'query_id').
76 |     limit
77 |         Optional maximum number of rows to insert from the dataset; use `offset` to skip rows before inserting.
78 | url
79 | The URL of the Hugging Face dataset in Parquet format.
80 | config
81 | Optional configuration options for the DuckDB connection.
82 |
83 | Examples
84 | --------
85 | >>> from ducksearch import upload
86 |
87 | >>> upload.documents(
88 | ... database="test.duckdb",
89 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/queries.parquet",
90 | ... key="query_id",
91 | ... fields=["query_id", "text"],
92 | ... )
93 | | Table | Size |
94 | |----------------|------|
95 | | documents | 19 |
96 | | bm25_documents | 19 |
97 |
98 | >>> upload.documents(
99 | ... database="test.duckdb",
100 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/documents.parquet",
101 | ... key="document_id",
102 | ... fields=["document_id", "text"],
103 | ... )
104 | | Table | Size |
105 | |----------------|------|
106 | | documents | 51 |
107 | | bm25_documents | 51 |
108 |
109 | """
110 | offset_hf = f"OFFSET {offset}" if offset is not None else ""
111 | limit_hf = f"LIMIT {limit}" if limit is not None else ""
112 |
113 | _insert_tmp_documents(
114 | database=database,
115 | schema=schema,
116 | url=url,
117 | key_field=key,
118 | config=config,
119 | offset_hf=offset_hf,
120 | limit_hf=limit_hf,
121 | )
122 |
123 | exists = _table_exists(
124 | database=database,
125 | schema=schema,
126 | table_name="documents",
127 | )[0]["table_exists"]
128 |
129 | _hf_tmp_columns = _select_columns(
130 | database=database,
131 | schema=schema,
132 | table_name="_hf_tmp",
133 | )
134 |
135 | _hf_tmp_columns = [
136 | column["column"] for column in _hf_tmp_columns if column["column"] != "id"
137 | ]
138 |
139 | if exists:
140 | documents_columns = _select_columns(
141 | database=database,
142 | schema=schema,
143 | table_name="documents",
144 | )
145 |
146 |         documents_columns = set(
147 |             [column["column"] for column in documents_columns if column["column"] != "id"]
148 |         )
149 |
150 | columns_to_add = list(set(_hf_tmp_columns) - documents_columns)
151 |
152 | if columns_to_add:
153 | add_columns_documents(
154 | database=database,
155 | schema=schema,
156 | columns=columns_to_add,
157 | dtypes=dtypes,
158 | config=config,
159 | )
160 | else:
161 | create_documents(
162 | database=database,
163 | schema=schema,
164 | columns=_hf_tmp_columns,
165 | dtypes=dtypes,
166 | config=config,
167 | )
168 |
169 | _insert_documents(
170 | database=database,
171 | schema=schema,
172 | url=url,
173 | key_field=key,
174 | _hf_tmp_columns=", ".join(_hf_tmp_columns),
175 | limit_hf=limit_hf,
176 | config=config,
177 | )
178 |
179 | _drop_tmp_table(
180 | database=database,
181 | schema=schema,
182 | config=config,
183 | )
184 |
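185 | # Minimal usage sketch (assumes the public parquet URL from the docstring
186 | # above is reachable):
187 | #
188 | #     from ducksearch import hf
189 | #
190 | #     hf.insert_documents(
191 | #         database="test.duckdb",
192 | #         schema="bm25_tables",
193 | #         key="document_id",
194 | #         url="hf://datasets/lightonai/lighton-ms-marco-mini/documents.parquet",
195 | #         limit=100,
196 | #     )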
--------------------------------------------------------------------------------
/ducksearch/hf/insert/documents.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.documents (id, {_hf_tmp_columns}) (
2 | WITH _hf_dataset AS (
3 | SELECT
4 | id,
5 | * EXCLUDE (id)
6 | FROM {schema}._hf_tmp
7 | ),
8 |
9 | _new_hf_dataset AS (
10 | SELECT
11 | _hf_dataset.*,
12 | d.id AS existing_id
13 | FROM _hf_dataset
14 | LEFT JOIN {schema}.documents AS d
15 | ON _hf_dataset.id = d.id
16 |     )
17 | 
18 |     SELECT id, {_hf_tmp_columns}
19 |     FROM _new_hf_dataset
20 |     WHERE existing_id IS NULL
21 | );
22 | 
--------------------------------------------------------------------------------
/ducksearch/hf/insert/tmp.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE TABLE {schema}._hf_tmp AS (
2 | WITH _hf_dataset AS (
3 | SELECT
4 | {key_field} AS id,
5 | *
6 | FROM '{url}'
7 | {limit_hf}
8 | {offset_hf}
9 | ),
10 |
11 | _hf_row_number AS (
12 | SELECT
13 | *,
14 | ROW_NUMBER() OVER (PARTITION BY id ORDER BY id, RANDOM()) AS _row_number
15 | FROM _hf_dataset
16 | )
17 |
18 | SELECT * EXCLUDE (_row_number)
19 | FROM _hf_row_number
20 | WHERE _row_number = 1
21 | );
22 |
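23 | -- Note: duplicated ids in the source dataset are deduplicated by keeping a
24 | -- single random row per id; {limit_hf} and {offset_hf} expand to e.g.
25 | -- 'LIMIT 100' and 'OFFSET 50', or to empty strings when unset.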
--------------------------------------------------------------------------------
/ducksearch/hf/select/columns.sql:
--------------------------------------------------------------------------------
1 | SELECT column_name AS column
2 | FROM information_schema.columns
3 | WHERE
4 | lower(table_name) = '{table_name}'
5 | AND table_schema = '{schema}';
6 |
--------------------------------------------------------------------------------
/ducksearch/hf/select/count.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 |     COUNT(*) AS count
3 | FROM '{url}';
4 |
--------------------------------------------------------------------------------
/ducksearch/hf/select/exists.sql:
--------------------------------------------------------------------------------
1 | SELECT EXISTS(
2 | SELECT 1
3 | FROM information_schema.tables
4 | WHERE
5 | LOWER(table_name) = LOWER('{table_name}')
6 | AND table_schema = '{schema}'
7 | ) AS table_exists;
8 |
--------------------------------------------------------------------------------
/ducksearch/search/__init__.py:
--------------------------------------------------------------------------------
1 | from .create import update_index_documents, update_index_queries
2 | from .graphs import graphs
3 | from .select import documents, queries, search
4 |
5 | __all__ = [
6 | "update_index_documents",
7 | "update_index_queries",
8 | "documents",
9 | "queries",
10 | "graphs",
11 | "search",
12 | ]
13 |
--------------------------------------------------------------------------------
/ducksearch/search/create/index.sql:
--------------------------------------------------------------------------------
1 | PRAGMA CREATE_FTS_INDEX(
2 | '{schema}._documents',
3 | 'id',
4 | '_search',
5 | STEMMER='{stemmer}',
6 | STOPWORDS='{stopwords}',
7 | IGNORE='{ignore}',
8 | STRIP_ACCENTS={strip_accents},
9 | LOWER={lower},
10 | OVERWRITE=1
11 | );
12 |
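13 | -- Rendered example (hypothetical settings): with schema='bm25_documents',
14 | -- stemmer='porter' and stopwords='english', this expands to
15 | -- PRAGMA CREATE_FTS_INDEX('bm25_documents._documents', 'id', '_search', ...)
16 | -- and materializes the fts_bm25_documents__documents schema.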
--------------------------------------------------------------------------------
/ducksearch/search/create/queries_index.sql:
--------------------------------------------------------------------------------
1 | PRAGMA CREATE_FTS_INDEX(
2 | '{schema}._queries_{random_hash}',
3 | 'query',
4 | 'query',
5 | STEMMER='{stemmer}',
6 | STOPWORDS='{stopwords}',
7 | IGNORE='{ignore}',
8 | STRIP_ACCENTS={strip_accents},
9 | LOWER={lower},
10 | OVERWRITE=1
11 | );
--------------------------------------------------------------------------------
/ducksearch/search/create/settings.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS {schema}.settings (
2 | k1 FLOAT,
3 | b FLOAT,
4 | stemmer VARCHAR,
5 | stopwords VARCHAR,
6 | ignore VARCHAR,
7 | strip_accents INT,
8 | lower INT
9 | );
10 |
--------------------------------------------------------------------------------
/ducksearch/search/create/stopwords.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE TABLE {schema}.stopwords AS (
2 | SELECT sw
3 | FROM parquet_scan('{parquet_file}')
4 | );
5 |
--------------------------------------------------------------------------------
/ducksearch/search/create/tables.sql:
--------------------------------------------------------------------------------
1 | CREATE SCHEMA IF NOT EXISTS {schema};
2 |
3 | CREATE SEQUENCE IF NOT EXISTS SEQ_{schema}_dict START 1;
4 |
5 | CREATE TABLE IF NOT EXISTS {schema}.dict (
6 | termid INT PRIMARY KEY DEFAULT NEXTVAL('SEQ_{schema}_dict'),
7 | term VARCHAR,
8 | df INT
9 | );
10 |
11 | CREATE TABLE IF NOT EXISTS {schema}.scores (
12 | term VARCHAR,
13 | list_docids INT[],
14 | list_scores FLOAT4[]
15 | );
16 |
17 | CREATE SEQUENCE IF NOT EXISTS SEQ_{schema}_docs START 1;
18 |
19 | CREATE TABLE IF NOT EXISTS {schema}.docs (
20 | docid INT PRIMARY KEY DEFAULT NEXTVAL('SEQ_{schema}_docs'),
21 | len INT,
22 | name VARCHAR
23 | );
24 |
25 | CREATE TABLE IF NOT EXISTS {schema}.stats (
26 | num_docs INT,
27 | avgdl FLOAT
28 | );
29 |
30 | CREATE TABLE IF NOT EXISTS {schema}.terms (
31 | docid INT,
32 | termid INT,
33 | tf INT
34 | );
35 |
36 | CREATE TABLE IF NOT EXISTS {schema}.stopwords (
37 | sw VARCHAR
38 | );
39 |
40 | CREATE OR REPLACE TABLE {schema}._documents AS (
41 | WITH _indexed_documents AS (
42 | SELECT
43 | s.*,
44 | d.name AS existing_id
45 | FROM {source_schema}.{source} s
46 | LEFT JOIN {schema}.docs d
47 | ON s.id = d.name
48 | )
49 |
50 | SELECT
51 | {key_field} AS id,
52 | CONCAT_WS(' ',
53 | {fields}
54 | ) AS _search
55 | FROM _indexed_documents
56 | WHERE existing_id IS NULL
57 | );
58 |
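59 | -- Note: {schema}._documents only holds documents whose id is not yet in
60 | -- {schema}.docs; the selected {fields} are concatenated into the single
61 | -- _search column that the FTS index is built on.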
--------------------------------------------------------------------------------
/ducksearch/search/drop/_documents.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE {schema}._documents;
--------------------------------------------------------------------------------
/ducksearch/search/drop/queries.sql:
--------------------------------------------------------------------------------
1 | DROP SCHEMA fts_{schema}__queries_{random_hash} CASCADE;
2 | DROP TABLE {schema}._queries_{random_hash};
3 |
--------------------------------------------------------------------------------
/ducksearch/search/drop/schema.sql:
--------------------------------------------------------------------------------
1 | DROP SCHEMA fts_{schema}__documents CASCADE;
--------------------------------------------------------------------------------
/ducksearch/search/drop/scores.sql:
--------------------------------------------------------------------------------
1 | WITH _terms_scores_to_drop AS (
2 | SELECT DISTINCT
3 | d.term
4 | FROM fts_{schema}__documents.dict fts
5 | INNER JOIN {schema}.dict d
6 | ON fts.term = d.term
7 | )
8 |
9 | DELETE FROM {schema}.scores s
10 | USING _terms_scores_to_drop t
11 | WHERE s.term = t.term;
--------------------------------------------------------------------------------
/ducksearch/search/graphs.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import logging
3 | import os
4 | import resource
5 |
6 | import pyarrow as pa
7 | import pyarrow.parquet as pq
8 | import tqdm
9 | from joblib import delayed
10 |
11 | from ..decorators import execute_with_duckdb
12 | from ..utils import ParallelTqdm, batchify, generate_random_hash
13 | from .create import _select_settings
14 | from .select import _create_queries_index, _insert_queries
15 |
16 |
17 | @execute_with_duckdb(
18 | relative_path="search/select/search_graph.sql",
19 | read_only=True,
20 | fetch_df=True,
21 | )
22 | def _search_graph_query():
23 | """Execute a graph-based search query in DuckDB."""
24 |
25 |
26 | @execute_with_duckdb(
27 | relative_path="search/select/search_graph_filters.sql",
28 | read_only=True,
29 | fetch_df=True,
30 | )
31 | def _search_graph_filters_query():
32 | """Execute a graph-based search query in DuckDB with filters."""
33 |
34 |
35 | def _search_graph(
36 | database: str,
37 | queries: list[str],
38 | top_k: int,
39 | top_k_token: int,
40 | group_id: int,
41 | random_hash: str,
42 | config: dict | None = None,
43 | filters: str | None = None,
44 | ) -> list:
45 | """Perform a graph-based search in DuckDB.
46 |
47 | Parameters
48 | ----------
49 | database
50 | The name of the DuckDB database.
51 | queries
52 | The list of queries to search.
53 | top_k
54 | The number of top results to retrieve for each query.
55 | top_k_token
56 | The number of top tokens to retrieve. Used to select top documents per token.
57 | group_id
58 | The index of the current batch of queries.
59 | config
60 | Optional configuration settings for the DuckDB connection.
61 | filters
62 | Optional SQL filters to apply during the search.
63 |
64 | Returns
65 | -------
66 | list
67 | A list of search results for each query in the batch.
68 | """
69 | search_function = (
70 | _search_graph_filters_query if filters is not None else _search_graph_query
71 | )
72 |
73 | matchs = search_function(
74 | database=database,
75 | queries_schema="bm25_queries",
76 | documents_schema="bm25_documents",
77 | source_schema="bm25_tables",
78 | top_k=top_k,
79 | group_id=group_id,
80 | random_hash=random_hash,
81 | top_k_token=top_k_token,
82 | filters=filters,
83 | config=config,
84 | )
85 |
86 | candidates = collections.defaultdict(list)
87 | for match in matchs:
88 | query = match.pop("_query")
89 | candidates[query].append(match)
90 | return [candidates[query] for query in queries]
91 |
92 |
93 | def graphs(
94 | database: str,
95 | queries: str | list[str],
96 | batch_size: int = 30,
97 | top_k: int = 1000,
98 | top_k_token: int = 30_000,
99 | n_jobs: int = -1,
100 | config: dict | None = None,
101 | filters: str | None = None,
102 | tqdm_bar: bool = True,
103 | ) -> list[dict]:
104 |     """Search for documents in DuckDB using graph-based scoring over the documents / queries bipartite graph.
105 |
106 | Parameters
107 | ----------
108 | database
109 | The name of the DuckDB database.
110 | queries
111 | A string or list of query strings to search for.
112 | batch_size
113 | The batch size for processing queries.
114 | top_k
115 | The number of top documents to retrieve for each query.
116 | top_k_token
117 | The number of top tokens to retrieve.
118 | n_jobs
119 |         The number of parallel jobs to use. Defaults to using all available processors.
120 | config
121 | Optional configuration settings for the DuckDB connection.
122 | filters
123 | Optional SQL filters to apply during the search.
124 |
125 | Returns
126 | -------
127 | list[dict]
128 | A list of search results, where each result corresponds to a query.
129 |
130 | Examples
131 | --------
132 | >>> from ducksearch import evaluation, upload, search
133 |
134 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="train")
135 |
136 | >>> upload.documents(
137 | ... database="test.duckdb",
138 | ... key="id",
139 | ... fields=["title", "text"],
140 | ... documents=documents,
141 | ... )
142 | | Table | Size |
143 | |----------------|------|
144 | | documents | 5183 |
145 | | bm25_documents | 5183 |
146 |
147 | >>> upload.queries(
148 | ... database="test.duckdb",
149 | ... queries=queries,
150 | ... documents_queries=qrels,
151 | ... )
152 | | Table | Size |
153 | |-------------------|------|
154 | | documents | 5183 |
155 | | queries | 807 |
156 | | bm25_documents | 5183 |
157 | | bm25_queries | 807 |
158 | | documents_queries | 916 |
159 |
160 |
161 |
162 | """
163 | resource.setrlimit(
164 | resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY)
165 | )
166 |
167 | if isinstance(queries, str):
168 | queries = [queries]
169 |
170 | logging.info("Indexing queries.")
171 | random_hash = generate_random_hash()
172 |
173 | batchs = {
174 | group_id: batch
175 | for group_id, batch in enumerate(
176 | iterable=batchify(
177 | X=queries, batch_size=batch_size, desc="Searching", tqdm_bar=False
178 | )
179 | )
180 | }
181 |
182 | parquet_file = f"_queries_{random_hash}.parquet"
183 | pa_queries, pa_group_ids = [], []
184 | for group_id, batch_queries in batchs.items():
185 | pa_queries.extend(batch_queries)
186 | pa_group_ids.extend([group_id] * len(batch_queries))
187 |
188 |     logging.info("Writing queries to a temporary parquet file.")
189 | index_table = pa.Table.from_pydict({"query": pa_queries, "group_id": pa_group_ids})
190 |
191 | pq.write_table(index_table, parquet_file, compression="snappy")
192 |
193 | _insert_queries(
194 | database=database,
195 | schema="bm25_documents",
196 | parquet_file=parquet_file,
197 | random_hash=random_hash,
198 | config=config,
199 | )
200 |
201 | if os.path.exists(parquet_file):
202 | os.remove(parquet_file)
203 |
204 | settings = _select_settings(
205 | database=database, schema="bm25_documents", config=config
206 | )[0]
207 |
208 | _create_queries_index(
209 | database=database,
210 | schema="bm25_documents",
211 | random_hash=random_hash,
212 | **settings,
213 | config=config,
214 | )
215 |
216 | matchs = []
217 | if n_jobs == 1 or len(batchs) == 1:
218 | if tqdm_bar:
219 | bar = tqdm.tqdm(
220 | total=len(batchs),
221 | position=0,
222 | desc="Searching",
223 | )
224 |
225 | for group_id, batch_queries in batchs.items():
226 | matchs.extend(
227 | _search_graph(
228 | database=database,
229 | queries=batch_queries,
230 | top_k=top_k,
231 | top_k_token=top_k_token,
232 | group_id=group_id,
233 | random_hash=random_hash,
234 | config=config,
235 | filters=filters,
236 | )
237 | )
238 | if tqdm_bar:
239 | bar.update(1)
240 | else:
241 | for match in ParallelTqdm(
242 | n_jobs=n_jobs,
243 | backend="threading",
244 | total=len(batchs),
245 | desc="Searching",
246 | tqdm_bar=tqdm_bar,
247 | )(
248 | delayed(_search_graph)(
249 | database,
250 | batch_queries,
251 | top_k,
252 | top_k_token,
253 | group_id,
254 | random_hash,
255 | config,
256 | filters,
257 | )
258 | for group_id, batch_queries in batchs.items()
259 | ):
260 | matchs.extend(match)
261 |
262 | return matchs
263 |
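264 | # Minimal usage sketch (assumes "test.duckdb" was populated with documents
265 | # and queries as in the docstring above):
266 | #
267 | #     from ducksearch import search
268 | #
269 | #     matchs = search.graphs(database="test.duckdb", queries="vaccine", top_k=10)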
--------------------------------------------------------------------------------
/ducksearch/search/insert/dict.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.dict (term, df)
2 |
3 | WITH _new_terms AS (
4 | SELECT
5 | fts.df,
6 | fts.term,
7 | d.termid AS existing_id
8 | FROM fts_{schema}__documents.dict fts
9 | LEFT JOIN {schema}.dict d
10 | ON fts.term = d.term
11 | )
12 |
13 | SELECT
14 | term,
15 | df
16 | FROM _new_terms
17 | WHERE existing_id IS NULL;
18 |
--------------------------------------------------------------------------------
/ducksearch/search/insert/docs.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.docs (len, name)
2 |
3 | SELECT
4 | len,
5 | name
6 | FROM fts_{schema}__documents.docs;
7 |
--------------------------------------------------------------------------------
/ducksearch/search/insert/queries.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE TABLE {schema}._queries_{random_hash} AS (
2 | SELECT
3 | query,
4 | group_id
5 | FROM parquet_scan('{parquet_file}')
6 | );
7 |
--------------------------------------------------------------------------------
/ducksearch/search/insert/settings.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.settings (k1, b, stemmer, stopwords, ignore, strip_accents, lower)
2 | VALUES ({k1}, {b}, '{stemmer}', '{stopwords}', '{ignore}', {strip_accents}, {lower});
--------------------------------------------------------------------------------
/ducksearch/search/insert/terms.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.terms (docid, termid, tf)
2 |
3 | WITH _raw_terms AS (
4 | SELECT DISTINCT termid FROM parquet_scan('{parquet_file}')
5 | ),
6 |
7 | _unfiltered_raw_terms AS (
8 | SELECT DISTINCT
9 | _dict.term,
10 | sw.sw IS NOT NULL AS is_stopword
11 | FROM _raw_terms _rt
12 | INNER JOIN {schema}.dict _dict
13 | ON _rt.termid = _dict.termid
14 | LEFT JOIN {schema}.stopwords sw
15 | ON _dict.term = sw.sw
16 | ),
17 |
18 | _filtered_raw_terms AS (
19 | SELECT
20 | term
21 | FROM _unfiltered_raw_terms
22 | WHERE is_stopword = FALSE
23 | ),
24 |
25 | _filtered_raw_terms_bm25id AS (
26 | SELECT DISTINCT
27 | ftsdi.termid
28 | FROM _filtered_raw_terms _raw
29 | JOIN fts_{schema}__documents.dict ftsdi
30 | ON _raw.term = ftsdi.term
31 | ),
32 | 
33 | _documents_terms_filter AS (
34 | SELECT
35 | docid,
36 | _terms.termid,
37 | COUNT(*) AS tf
38 | FROM fts_{schema}__documents.terms _terms
39 | INNER JOIN _filtered_raw_terms_bm25id _raw
40 | ON _terms.termid = _raw.termid
41 | GROUP BY 1, 2
42 | )
43 |
44 | SELECT
45 | docs.docid,
46 | dict.termid,
47 | dt.tf
48 | FROM _documents_terms_filter dt
49 | JOIN fts_{schema}__documents.dict ftsdi
50 | ON dt.termid = ftsdi.termid
51 | JOIN fts_{schema}__documents.docs ftsdo
52 | ON dt.docid = ftsdo.docid
53 | JOIN {schema}.dict dict
54 | ON ftsdi.term = dict.term
55 | JOIN {schema}.docs docs
56 | ON ftsdo.name = docs.name;
57 |
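58 | -- Note: termids and docids from the ephemeral fts_{schema}__documents index
59 | -- are remapped to the persistent {schema}.dict / {schema}.docs identifiers,
60 | -- and stopwords are filtered out before insertion.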
--------------------------------------------------------------------------------
/ducksearch/search/select/search.sql:
--------------------------------------------------------------------------------
1 | WITH group_queries AS (
2 | SELECT
3 | query
4 | FROM {schema}._queries_{random_hash}
5 | WHERE group_id = {group_id}
6 | ),
7 |
8 | _input_queries AS (
9 | SELECT
10 | pf.query,
11 | ftsdict.term
12 | FROM group_queries pf
13 | JOIN fts_{schema}__queries_{random_hash}.docs docs
14 | ON pf.query = docs.name
15 | JOIN fts_{schema}__queries_{random_hash}.terms terms
16 | ON docs.docid = terms.docid
17 | JOIN fts_{schema}__queries_{random_hash}.dict ftsdict
18 | ON terms.termid = ftsdict.termid
19 | ),
20 |
21 | _nested_matchs AS (
22 | SELECT
23 | iq.query,
24 |         s.list_docids[0:{top_k_token}] AS list_docids,
25 |         s.list_scores[0:{top_k_token}] AS list_scores
26 | FROM {schema}.scores s
27 | INNER JOIN _input_queries iq
28 | ON s.term = iq.term
29 | ),
30 |
31 | _matchs AS (
32 | SELECT
33 | query,
34 | UNNEST(
35 | s.list_docids
36 | ) AS bm25id,
37 | UNNEST(
38 | s.list_scores
39 | ) AS score
40 | FROM _nested_matchs s
41 | ),
42 |
43 | _matchs_scores AS (
44 | SELECT
45 | query,
46 | bm25id,
47 | SUM(score) AS score
48 | FROM _matchs
49 | GROUP BY 1, 2
50 | ),
51 |
52 | _partition_scores AS (
53 | SELECT
54 | query,
55 | bm25id,
56 | score,
57 | RANK() OVER (PARTITION BY query ORDER BY score DESC, RANDOM() ASC) AS rank
58 | FROM _matchs_scores
59 | QUALIFY rank <= {top_k}
60 | )
61 |
62 | SELECT
63 | s.* EXCLUDE (bm25id),
64 | ps.score,
65 | ps.query AS _query
66 | FROM _partition_scores ps
67 | INNER JOIN {source_schema}.{source} s
68 | ON ps.bm25id = s.bm25id
69 | ORDER BY score DESC;
70 |
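71 | -- Scoring note: each query term contributes its precomputed per-document
72 | -- partial BM25 scores; summing per (query, bm25id) pair gives the final
73 | -- score, e.g. a document matching 'duck' (0.8) and 'search' (0.5) scores
74 | -- 1.3 (hypothetical values).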
--------------------------------------------------------------------------------
/ducksearch/search/select/search_filters.sql:
--------------------------------------------------------------------------------
1 | WITH group_queries AS (
2 | SELECT
3 | query
4 | FROM {schema}._queries_{random_hash}
5 | WHERE group_id = {group_id}
6 | ),
7 |
8 | _input_queries AS (
9 | SELECT
10 | pf.query,
11 | ftsdict.term
12 | FROM group_queries pf
13 | JOIN fts_{schema}__queries_{random_hash}.docs docs
14 | ON pf.query = docs.name
15 | JOIN fts_{schema}__queries_{random_hash}.terms terms
16 | ON docs.docid = terms.docid
17 | JOIN fts_{schema}__queries_{random_hash}.dict ftsdict
18 | ON terms.termid = ftsdict.termid
19 | ),
20 |
21 | _matchs AS (
22 | SELECT
23 | query,
24 | UNNEST(
25 | s.list_docids[:{top_k_token}]
26 | ) AS bm25id,
27 | UNNEST(
28 | s.list_scores[:{top_k_token}]
29 | ) AS score
30 | FROM _input_queries iq
31 | INNER JOIN {schema}.scores s
32 | ON iq.term = s.term
33 | ),
34 |
35 | _matchs_scores AS (
36 | SELECT
37 | query AS _query,
38 | bm25id,
39 | SUM(score) AS _score
40 | FROM _matchs
41 | GROUP BY 1, 2
42 | ),
43 |
44 | _documents_filter AS (
45 | SELECT
46 | *
47 | FROM {source_schema}.{source}
48 | WHERE {filters}
49 | ),
50 |
51 | _filtered_scores AS (
52 | SELECT
53 | _query,
54 | _score,
55 | s.* EXCLUDE (bm25id)
56 | FROM _matchs_scores ms
57 | INNER JOIN _documents_filter s
58 | ON ms.bm25id = s.bm25id
59 | ),
60 |
61 | _partition_scores AS (
62 | SELECT
63 | _query,
64 | _score AS score,
65 | * EXCLUDE (_score, _query),
66 | RANK() OVER (PARTITION BY _query {order_by}, RANDOM() ASC) AS _row_number
67 | FROM _filtered_scores
68 | QUALIFY _row_number <= {top_k}
69 | )
70 |
71 | SELECT
72 | * EXCLUDE (_row_number)
73 | FROM _partition_scores
74 | {order_by};
75 |
--------------------------------------------------------------------------------
/ducksearch/search/select/search_graph.sql:
--------------------------------------------------------------------------------
1 | WITH group_queries AS (
2 | SELECT
3 | query
4 | FROM {documents_schema}._queries_{random_hash}
5 | WHERE group_id = {group_id}
6 | ),
7 |
8 | _input_queries AS (
9 | SELECT
10 | pf.query,
11 | ftsdict.term
12 | FROM group_queries pf
13 | JOIN fts_{documents_schema}__queries_{random_hash}.docs docs
14 | ON pf.query = docs.name
15 | JOIN fts_{documents_schema}__queries_{random_hash}.terms terms
16 | ON docs.docid = terms.docid
17 | JOIN fts_{documents_schema}__queries_{random_hash}.dict ftsdict
18 | ON terms.termid = ftsdict.termid
19 | ),
20 |
21 | _documents_matchs AS (
22 | SELECT
23 | iq.query,
24 | UNNEST(
25 | s.list_docids[:{top_k_token}]
26 | ) AS id,
27 | UNNEST(
28 | s.list_scores[:{top_k_token}]
29 | ) AS score
30 | FROM _input_queries iq
31 | INNER JOIN {documents_schema}.scores s
32 | ON iq.term = s.term
33 | ),
34 |
35 | _queries_matchs AS (
36 | SELECT
37 | iq.query,
38 | UNNEST(
39 | s.list_docids[:{top_k_token}]
40 | ) AS id,
41 | UNNEST(
42 | s.list_scores[:{top_k_token}]
43 | ) AS score
44 | FROM _input_queries iq
45 | INNER JOIN {queries_schema}.scores s
46 | ON iq.term = s.term
47 | ),
48 |
49 | _documents_scores AS (
50 | SELECT
51 | query,
52 | id,
53 | SUM(score) AS score
54 | FROM _documents_matchs
55 | GROUP BY 1, 2
56 | ),
57 |
58 | _queries_scores AS (
59 | SELECT
60 | query,
61 | id,
62 | SUM(score) AS score
63 | FROM _queries_matchs
64 | GROUP BY 1, 2
65 | ),
66 |
67 | _documents_ranks AS (
68 | SELECT
69 | query,
70 | id,
71 | score,
72 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number
73 | FROM _documents_scores
74 | ),
75 |
76 | _queries_ranks AS (
77 | SELECT
78 | query,
79 | id,
80 | score,
81 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number
82 | FROM _queries_scores
83 | ),
84 |
85 | _bm25_documents AS (
86 | SELECT
87 | ps.query AS _query,
88 | ddocs.name AS id,
89 | ps.score
90 | FROM _documents_ranks ps
91 | INNER JOIN {documents_schema}.docs AS ddocs
92 | ON ps.id = ddocs.docid
93 | WHERE ps._row_number <= {top_k}
94 | ),
95 |
96 | _bm25_queries AS (
97 | SELECT
98 | ps.query AS _query,
99 | ddocs.name AS id,
100 | ps.score
101 | FROM _queries_ranks ps
102 | INNER JOIN {queries_schema}.docs AS ddocs
103 | ON ps.id = ddocs.docid
104 | WHERE ps._row_number <= {top_k}
105 | ),
106 |
107 | _graph AS (
108 | SELECT
109 | bm25.id AS src_id,
110 | dqg.query_id AS dst_id,
111 | dqg.score AS edge,
112 | 'document' AS src_type,
113 | 'query' AS dst_type,
114 | bm25._query
115 | FROM _bm25_documents AS bm25
116 | INNER JOIN {source_schema}.documents_queries AS dqg
117 | ON bm25.id = dqg.document_id
118 | INNER JOIN _bm25_queries AS bm25q
119 | ON dqg.query_id = bm25q.id
120 | AND bm25._query = bm25q._query
121 | ),
122 |
123 | _graph_scores AS (
124 | SELECT
125 | g.*,
126 | COALESCE(bm25.score, 0) AS src_score,
127 | 0 AS dst_score
128 | FROM _graph AS g
129 | LEFT JOIN _bm25_documents AS bm25
130 | ON g.src_id = bm25.id
131 | AND g._query = bm25._query
132 | WHERE src_type = 'document'
133 | UNION
134 | SELECT
135 | g.*,
136 | 0 AS src_score,
137 | COALESCE(bm25.score, 0) AS dst_score
138 | FROM _graph AS g
139 | LEFT JOIN _bm25_documents AS bm25
140 | ON g.dst_id = bm25.id
141 | AND g._query = bm25._query
142 | WHERE dst_type = 'document'
143 | UNION
144 | SELECT
145 | g.*,
146 | COALESCE(bm25.score, 0) AS src_score,
147 | 0 AS dst_score
148 | FROM _graph AS g
149 | LEFT JOIN _bm25_queries AS bm25
150 | ON g.src_id = bm25.id
151 | AND g._query = bm25._query
152 | WHERE src_type = 'query'
153 | UNION
154 | SELECT
155 | g.*,
156 | 0 AS src_score,
157 | COALESCE(bm25.score, 0) AS dst_score
158 | FROM _graph AS g
159 | LEFT JOIN _bm25_queries AS bm25
160 | ON g.dst_id = bm25.id
161 | AND g._query = bm25._query
162 | WHERE dst_type = 'query'
163 | ),
164 |
165 | graph_scores AS (
166 | SELECT
167 | src_id,
168 | dst_id,
169 | _query,
170 | src_type,
171 | dst_type,
172 | MAX(src_score) AS src_score,
173 | MAX(dst_score) AS dst_score,
174 | MAX(edge) AS edge
175 | FROM _graph_scores
176 | GROUP BY 1, 2, 3, 4, 5
177 | ),
178 |
179 | _rank AS (
180 | SELECT
181 | src_id AS id,
182 | _query,
183 | SUM(src_score + dst_score + edge) AS score
184 | FROM graph_scores
185 | WHERE src_type = 'document'
186 | GROUP BY 1, 2
187 | UNION ALL
188 | SELECT
189 | dst_id AS id,
190 | _query,
191 | SUM(dst_score + src_score + edge) AS score
192 | FROM graph_scores
193 | WHERE dst_type = 'document'
194 | GROUP BY 1, 2
195 | UNION ALL
196 | SELECT
197 | id,
198 | _query,
199 | score
200 | FROM _bm25_documents
201 | ),
202 |
203 | scores AS (
204 | SELECT
205 | id,
206 | _query,
207 | MAX(score) AS score
208 | FROM _rank
209 | GROUP BY 1, 2
210 | )
211 |
212 | SELECT
213 | docs.* EXCLUDE (bm25id),
214 | s.score,
215 | s._query
216 | FROM scores s
217 | JOIN {source_schema}.documents docs
218 | ON s.id = docs.id
219 | ORDER BY s.score DESC;
220 |
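221 | -- Ranking note: a document's final score is the maximum of its direct BM25
222 | -- score and its graph score, i.e. the sum over its connected queries of
223 | -- (document score + query score + documents_queries edge weight).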
--------------------------------------------------------------------------------
/ducksearch/search/select/search_graph_filters.sql:
--------------------------------------------------------------------------------
1 | WITH group_queries AS (
2 | SELECT
3 | query
4 | FROM {documents_schema}._queries_{random_hash}
5 | WHERE group_id = {group_id}
6 | ),
7 |
8 | _input_queries AS (
9 | SELECT
10 | pf.query,
11 | ftsdict.term
12 | FROM group_queries pf
13 | JOIN fts_{documents_schema}__queries_{random_hash}.docs docs
14 | ON pf.query = docs.name
15 | JOIN fts_{documents_schema}__queries_{random_hash}.terms terms
16 | ON docs.docid = terms.docid
17 | JOIN fts_{documents_schema}__queries_{random_hash}.dict ftsdict
18 | ON terms.termid = ftsdict.termid
19 | ),
20 |
21 | _documents_matchs AS (
22 | SELECT
23 | iq.query,
24 | UNNEST(
25 | s.list_docids[:{top_k_token}]
26 | ) AS id,
27 | UNNEST(
28 | s.list_scores[:{top_k_token}]
29 | ) AS score
30 | FROM _input_queries iq
31 | INNER JOIN {documents_schema}.scores s
32 | ON iq.term = s.term
33 | ),
34 |
35 | _queries_matchs AS (
36 | SELECT
37 | iq.query,
38 | UNNEST(
39 | s.list_docids[:{top_k_token}]
40 | ) AS id,
41 | UNNEST(
42 | s.list_scores[:{top_k_token}]
43 | ) AS score
44 | FROM _input_queries iq
45 | INNER JOIN {queries_schema}.scores s
46 | ON iq.term = s.term
47 | ),
48 |
49 | _documents_scores AS (
50 | SELECT
51 | query AS _query,
52 | id AS _id,
53 | SUM(score) AS _score
54 | FROM _documents_matchs
55 | GROUP BY 1, 2
56 | ),
57 |
58 | _documents_scores_filter AS (
59 | SELECT
60 | ds._query AS query,
61 | ds._id AS id,
62 | ds._score AS score
63 | FROM _documents_scores ds
64 | INNER JOIN {source_schema}.documents d
65 | ON ds._id = d.bm25id
66 | WHERE {filters}
67 | ),
68 |
69 | _queries_scores AS (
70 | SELECT
71 | query,
72 | id,
73 | SUM(score) AS score
74 | FROM _queries_matchs
75 | GROUP BY 1, 2
76 | ),
77 |
78 | _documents_ranks AS (
79 | SELECT
80 | query,
81 | id,
82 | score,
83 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number
84 | FROM _documents_scores_filter
85 | ),
86 |
87 | _queries_ranks AS (
88 | SELECT
89 | query,
90 | id,
91 | score,
92 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number
93 | FROM _queries_scores
94 | ),
95 |
96 | _bm25_documents AS (
97 | SELECT
98 | ps.query AS _query,
99 | ddocs.name AS id,
100 | ps.score
101 | FROM _documents_ranks ps
102 | INNER JOIN {documents_schema}.docs AS ddocs
103 | ON ps.id = ddocs.docid
104 | WHERE ps._row_number <= {top_k}
105 | ),
106 |
107 | _bm25_queries AS (
108 | SELECT
109 | ps.query AS _query,
110 | ddocs.name AS id,
111 | ps.score
112 | FROM _queries_ranks ps
113 | INNER JOIN {queries_schema}.docs AS ddocs
114 | ON ps.id = ddocs.docid
115 | WHERE ps._row_number <= {top_k}
116 | ),
117 |
118 | _graph AS (
119 | SELECT
120 | bm25.id AS src_id,
121 | dqg.query_id AS dst_id,
122 | dqg.score AS edge,
123 | 'document' AS src_type,
124 | 'query' AS dst_type,
125 | bm25._query
126 | FROM _bm25_documents AS bm25
127 | INNER JOIN {source_schema}.documents_queries AS dqg
128 | ON bm25.id = dqg.document_id
129 | INNER JOIN _bm25_queries AS bm25q
130 | ON dqg.query_id = bm25q.id
131 | AND bm25._query = bm25q._query
132 | ),
133 |
134 | _graph_scores AS (
135 | SELECT
136 | g.*,
137 | COALESCE(bm25.score, 0) AS src_score,
138 | 0 AS dst_score
139 | FROM _graph AS g
140 | LEFT JOIN _bm25_documents AS bm25
141 | ON g.src_id = bm25.id
142 | AND g._query = bm25._query
143 | WHERE src_type = 'document'
144 | UNION
145 | SELECT
146 | g.*,
147 | 0 AS src_score,
148 | COALESCE(bm25.score, 0) AS dst_score
149 | FROM _graph AS g
150 | LEFT JOIN _bm25_documents AS bm25
151 | ON g.dst_id = bm25.id
152 | AND g._query = bm25._query
153 | WHERE dst_type = 'document'
154 | UNION
155 | SELECT
156 | g.*,
157 | COALESCE(bm25.score, 0) AS src_score,
158 | 0 AS dst_score
159 | FROM _graph AS g
160 | LEFT JOIN _bm25_queries AS bm25
161 | ON g.src_id = bm25.id
162 | AND g._query = bm25._query
163 | WHERE src_type = 'query'
164 | UNION
165 | SELECT
166 | g.*,
167 | 0 AS src_score,
168 | COALESCE(bm25.score, 0) AS dst_score
169 | FROM _graph AS g
170 | LEFT JOIN _bm25_queries AS bm25
171 | ON g.dst_id = bm25.id
172 | AND g._query = bm25._query
173 | WHERE dst_type = 'query'
174 | ),
175 |
176 | graph_scores AS (
177 | SELECT
178 | src_id,
179 | dst_id,
180 | _query,
181 | src_type,
182 | dst_type,
183 | MAX(src_score) AS src_score,
184 | MAX(dst_score) AS dst_score,
185 | MAX(edge) AS edge
186 | FROM _graph_scores
187 | GROUP BY 1, 2, 3, 4, 5
188 | ),
189 |
190 | _rank AS (
191 | SELECT
192 | src_id AS id,
193 | _query,
194 | SUM(src_score + dst_score + edge) AS score
195 | FROM graph_scores
196 | WHERE src_type = 'document'
197 | GROUP BY 1, 2
198 | UNION ALL
199 | SELECT
200 | dst_id AS id,
201 | _query,
202 | SUM(dst_score + src_score + edge) AS score
203 | FROM graph_scores
204 | WHERE dst_type = 'document'
205 | GROUP BY 1, 2
206 | UNION ALL
207 | SELECT
208 | id,
209 | _query,
210 | score
211 | FROM _bm25_documents
212 | ),
213 |
214 | scores AS (
215 | SELECT
216 | id,
217 | _query,
218 | MAX(score) AS score
219 | FROM _rank
220 | GROUP BY 1, 2
221 | )
222 |
223 | SELECT
224 |     docs.* EXCLUDE (bm25id),
225 | s.score,
226 | s._query
227 | FROM scores s
228 | JOIN {source_schema}.documents docs
229 | ON s.id = docs.id
230 | ORDER BY s.score DESC;
231 |
--------------------------------------------------------------------------------
/ducksearch/search/select/search_order_by.sql:
--------------------------------------------------------------------------------
1 | WITH group_queries AS (
2 | SELECT
3 | query
4 | FROM {schema}._queries_{random_hash}
5 | WHERE group_id = {group_id}
6 | ),
7 |
8 | _input_queries AS (
9 | SELECT
10 | pf.query,
11 | ftsdict.term
12 | FROM group_queries pf
13 | JOIN fts_{schema}__queries_{random_hash}.docs docs
14 | ON pf.query = docs.name
15 | JOIN fts_{schema}__queries_{random_hash}.terms terms
16 | ON docs.docid = terms.docid
17 | JOIN fts_{schema}__queries_{random_hash}.dict ftsdict
18 | ON terms.termid = ftsdict.termid
19 | ),
20 |
21 | _nested_matchs AS (
22 | SELECT
23 | iq.query,
24 |         s.list_docids[0:{top_k_token}] AS list_docids,
25 |         s.list_scores[0:{top_k_token}] AS list_scores
26 | FROM {schema}.scores s
27 | INNER JOIN _input_queries iq
28 | ON s.term = iq.term
29 | ),
30 |
31 | _matchs AS (
32 | SELECT
33 | query,
34 | UNNEST(
35 | s.list_docids
36 | ) AS bm25id,
37 | UNNEST(
38 | s.list_scores
39 | ) AS score
40 | FROM _nested_matchs s
41 | ),
42 |
43 | _matchs_scores AS (
44 | SELECT
45 | query,
46 | bm25id,
47 | SUM(score) AS score
48 | FROM _matchs
49 | GROUP BY 1, 2
50 | ),
51 |
52 | _match_scores_documents AS (
53 | SELECT
54 | ms.query AS _query,
55 | ms.bm25id,
56 | ms.score,
57 | s.*
58 | FROM _matchs_scores ms
59 | INNER JOIN {source_schema}.{source} s
60 | ON ms.bm25id = s.bm25id
61 | ),
62 |
63 | _partition_scores AS (
64 | SELECT
65 | *,
66 | RANK() OVER (PARTITION BY _query {order_by}, RANDOM() ASC) AS rank
67 | FROM _match_scores_documents
68 | QUALIFY rank <= {top_k}
69 | )
70 |
71 | SELECT
72 | *
73 | FROM _partition_scores
74 | {order_by};
75 |
--------------------------------------------------------------------------------
/ducksearch/search/select/settings.sql:
--------------------------------------------------------------------------------
1 | SELECT * FROM {schema}.settings;
--------------------------------------------------------------------------------
/ducksearch/search/select/settings_exists.sql:
--------------------------------------------------------------------------------
1 | SELECT COALESCE(EXISTS (
2 | SELECT 1
3 | FROM information_schema.tables
4 | WHERE
5 | table_name = 'settings'
6 | AND table_schema = '{schema}'
7 | ), FALSE) AS table_exists;
8 |
--------------------------------------------------------------------------------
/ducksearch/search/select/stats.sql:
--------------------------------------------------------------------------------
1 | SELECT num_docs, avgdl FROM {schema}.stats;
--------------------------------------------------------------------------------
/ducksearch/search/select/termids_to_score.sql:
--------------------------------------------------------------------------------
1 | WITH _terms_to_score AS (
2 | SELECT
3 | term
4 | FROM fts_{schema}__documents.dict
5 | )
6 | 
7 | SELECT DISTINCT
8 |     d.termid
9 | FROM _terms_to_score t
10 | JOIN {schema}.dict d
11 |     ON t.term = d.term;
--------------------------------------------------------------------------------
/ducksearch/search/update/bm25id.sql:
--------------------------------------------------------------------------------
1 | UPDATE {source_schema}.{source} source
2 | SET bm25id = {schema}.docs.docid
3 | FROM {schema}.docs
4 | WHERE source.id = {schema}.docs.name;
5 |
--------------------------------------------------------------------------------
/ducksearch/search/update/dict.sql:
--------------------------------------------------------------------------------
1 | WITH new_terms AS (
2 | SELECT
3 | fts.df,
4 | fts.term,
5 | d.termid AS existing_id
6 | FROM fts_{schema}__documents.dict fts
7 | LEFT JOIN {schema}.dict d
8 | ON fts.term = d.term
9 | )
10 |
11 | UPDATE {schema}.dict d
12 | SET df = d.df + nt.df
13 | FROM new_terms nt
14 | WHERE d.termid = nt.existing_id;
15 |
--------------------------------------------------------------------------------
/ducksearch/search/update/scores.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.scores (term, list_docids, list_scores)
2 |
3 | WITH _terms AS (
4 | SELECT termid FROM parquet_scan('{parquet_file}')
5 | ),
6 |
7 | _unfiltered_terms_df AS (
8 | SELECT
9 | d.termid,
10 | d.term,
11 | d.df,
12 | sw.sw IS NOT NULL AS is_stopword
13 | FROM {schema}.dict d
14 | INNER JOIN _terms t
15 | ON d.termid = t.termid
16 | LEFT JOIN {schema}.stopwords sw
17 | ON d.term = sw.sw
18 | ),
19 |
20 | _terms_df AS (
21 | SELECT
22 | termid,
23 | term,
24 | df
25 | FROM _unfiltered_terms_df
26 | WHERE is_stopword = FALSE
27 | ),
28 |
29 | _documents_lengths AS (
30 | SELECT
31 | docid,
32 | len
33 | FROM {schema}.docs
34 | ),
35 |
36 | _documents_terms_df AS (
37 | SELECT
38 | s.docid,
39 | s.termid,
40 | s.tf
41 | FROM {schema}.terms s
42 | INNER JOIN _terms t
43 | ON s.termid = t.termid
44 | ),
45 |
46 | _scores AS (
47 | SELECT
48 | tf.docid,
49 | tf.termid,
50 | tf.tf * LOG(
51 | (
52 | ({num_docs} - tdf.df + 0.5) /
53 | (tdf.df + 0.5)
54 | ) + 1
55 | ) *
56 | (1.0 / (tf.tf + {k1} * (1 - {b} + {b} * (dl.len / {avgdl})))) AS score
57 | FROM
58 | _documents_terms_df tf
59 | JOIN
60 | _documents_lengths dl ON dl.docid = tf.docid
61 | JOIN
62 | _terms_df tdf ON tdf.termid = tf.termid
63 | ),
64 |
65 | _list_scores AS (
66 | SELECT
67 | s.termid,
68 | LIST(d.docid ORDER BY s.score DESC, RANDOM() ASC) AS list_docids,
69 | LIST(s.score ORDER BY s.score DESC, RANDOM() ASC) AS list_scores
70 | FROM _scores s
71 | INNER JOIN
72 | {schema}.docs d
73 | ON s.docid = d.docid
74 | GROUP BY
75 | s.termid
76 | )
77 |
78 | SELECT
79 | d.term,
80 | ls.list_docids,
81 | ls.list_scores
82 | FROM _list_scores ls
83 | JOIN _terms_df d
84 | ON ls.termid = d.termid;
85 |
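86 | -- The partial score stored per (term, docid) pair is BM25 without the
87 | -- constant (k1 + 1) numerator factor:
88 | --     score = tf * LOG((num_docs - df + 0.5) / (df + 0.5) + 1)
89 | --                / (tf + k1 * (1 - b + b * len / avgdl))
90 | -- Dropping (k1 + 1) rescales every score by the same constant, so document
91 | -- rankings are unchanged.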
--------------------------------------------------------------------------------
/ducksearch/search/update/stats.sql:
--------------------------------------------------------------------------------
1 | CREATE OR REPLACE TABLE {schema}.stats AS (
2 | SELECT
3 | COUNT(*) AS num_docs,
4 | AVG(len) AS avgdl
5 | FROM {schema}.docs
6 | );
--------------------------------------------------------------------------------
/ducksearch/tables/__init__.py:
--------------------------------------------------------------------------------
1 | from .create import (
2 | create_documents,
3 | create_documents_queries,
4 | create_queries,
5 | create_schema,
6 | )
7 | from .insert import (
8 | insert_documents,
9 | insert_documents_queries,
10 | insert_queries,
11 | )
12 | from .select import (
13 | select_documents,
14 | select_documents_columns,
15 | select_queries,
16 | )
17 | from .update import add_columns_documents
18 |
19 | __all__ = [
20 | "create_documents",
21 | "create_queries",
22 | "create_documents_queries",
23 | "create_schema",
24 | "insert_documents",
25 | "insert_queries",
26 | "insert_documents_queries",
27 | "select_documents",
28 | "select_documents_columns",
29 | "select_queries",
30 | "add_columns_documents",
31 | ]
32 |
--------------------------------------------------------------------------------
/ducksearch/tables/create.py:
--------------------------------------------------------------------------------
1 | from ..decorators import execute_with_duckdb
2 |
3 |
4 | @execute_with_duckdb(
5 | relative_path="tables/create/documents.sql",
6 | )
7 | def _create_documents() -> None:
8 | """Create the documents table in the DuckDB database.
9 |
10 | Parameters
11 | ----------
12 | database: str
13 | The name of the DuckDB database.
14 | config: dict, optional
15 | The configuration options for the DuckDB connection.
16 | """
17 |
18 |
19 | @execute_with_duckdb(
20 | relative_path="tables/create/schema.sql",
21 | )
22 | def _create_schema() -> None:
23 | """Create a schema in the DuckDB database.
24 |
25 | Parameters
26 | ----------
27 | database: str
28 | The name of the DuckDB database.
29 | schema: str
30 | The schema to be created in the database.
31 | config: dict, optional
32 | The configuration options for the DuckDB connection.
33 | """
34 |
35 |
36 | def create_schema(
37 | database: str,
38 | schema: str,
39 | config: dict | None = None,
40 | ) -> None:
41 | """Create the specified schema in the DuckDB database.
42 |
43 | Parameters
44 | ----------
45 | database: str
46 | The name of the DuckDB database.
47 | schema: str
48 | The schema to create within the DuckDB database.
49 | config: dict, optional
50 | The configuration options for the DuckDB connection.
51 |
52 | Examples
53 | --------
54 | >>> from ducksearch import tables
55 |
56 | >>> tables.create_schema(
57 | ... database="test.duckdb",
58 | ... schema="bm25_tables",
59 | ... )
60 | """
61 | return _create_schema(database=database, schema=schema, config=config)
62 |
63 |
64 | def create_documents(
65 | database: str,
66 | schema: str,
67 | columns: str | list[str],
68 | dtypes: dict[str, str] | None = None,
69 | config: dict | None = None,
70 | ) -> None:
71 | """Create the documents table in the DuckDB database.
72 |
73 | Parameters
74 | ----------
75 | database: str
76 | The name of the DuckDB database.
77 | schema: str
78 | The schema in which to create the documents table.
79 | columns: str or list[str]
80 | The list of columns for the documents table. If a string is provided, it will be converted into a list.
81 | dtypes: dict[str, str], optional
82 | A dictionary specifying field names as keys and their DuckDB types as values. Defaults to 'VARCHAR' if not provided.
83 | config: dict, optional
84 | The configuration options for the DuckDB connection.
85 |
86 | Examples
87 | --------
88 | >>> from ducksearch import tables
89 |
90 | >>> tables.create_schema(
91 | ... database="test.duckdb",
92 | ... schema="bm25_tables"
93 | ... )
94 |
95 | >>> tables.create_documents(
96 | ... database="test.duckdb",
97 | ... schema="bm25_tables",
98 | ... columns=["title", "text"],
99 | ... dtypes={"text": "VARCHAR", "title": "VARCHAR"},
100 | ... )
101 |
102 | >>> df = [
103 | ... {"id": 1, "title": "title document 1", "text": "text document 1"},
104 | ... {"id": 2, "title": "title document 2", "text": "text document 2"},
105 | ... {"id": 3, "title": "title document 3", "text": "text document 3"},
106 | ... ]
107 |
108 | >>> tables.insert_documents(
109 | ... database="test.duckdb",
110 | ... schema="bm25_tables",
111 | ... key="id",
112 | ... df=df,
113 | ... columns=["title", "text"],
114 | ... )
115 | """
116 | if not dtypes:
117 | dtypes = {}
118 |
119 | return _create_documents(
120 | database=database,
121 | schema=schema,
122 | fields=", ".join(
123 | [f"{field} {dtypes.get(field, 'VARCHAR')}" for field in columns]
124 | ),
125 | config=config,
126 | )
127 |
128 |
129 | @execute_with_duckdb(
130 | relative_path="tables/create/queries.sql",
131 | )
132 | def create_queries() -> None:
133 | """Create the queries table in the DuckDB database.
134 |
135 | Parameters
136 | ----------
137 | database: str
138 | The name of the DuckDB database.
139 | config: dict, optional
140 | The configuration options for the DuckDB connection.
141 |
142 | Examples
143 | --------
144 | >>> from ducksearch import tables
145 |
146 | >>> tables.create_schema(
147 | ... database="test.duckdb",
148 | ... schema="bm25_tables"
149 | ... )
150 |
151 | >>> tables.create_queries(
152 | ... database="test.duckdb",
153 | ... schema="bm25_tables",
154 | ... )
155 | """
156 |
157 |
158 | @execute_with_duckdb(
159 | relative_path=[
160 | "tables/create/queries.sql",
161 | "tables/create/documents_queries.sql",
162 | ]
163 | )
164 | def create_documents_queries() -> None:
165 | """Create the documents_queries table in the DuckDB database.
166 |
167 | Parameters
168 | ----------
169 | database: str
170 | The name of the DuckDB database.
171 | config: dict, optional
172 | The configuration options for the DuckDB connection.
173 |
174 | Examples
175 | --------
176 | >>> from ducksearch import tables
177 |
178 | >>> tables.create_schema(
179 | ... database="test.duckdb",
180 | ... schema="bm25_tables"
181 | ... )
182 |
183 | >>> tables.create_documents_queries(
184 | ... database="test.duckdb",
185 | ... schema="bm25_tables",
186 | ... )
187 | """
188 |
--------------------------------------------------------------------------------
/ducksearch/tables/create/documents.sql:
--------------------------------------------------------------------------------
1 | CREATE SEQUENCE IF NOT EXISTS _seq_documents_id START 1;
2 |
3 | CREATE TABLE IF NOT EXISTS {schema}.documents (
4 | id VARCHAR PRIMARY KEY DEFAULT (nextval('_seq_documents_id')),
5 | {fields},
6 | bm25id INT DEFAULT NULL
7 | );
8 |
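9 | -- Rendered example (hypothetical fields): with schema='bm25_tables' and
10 | -- fields='title VARCHAR, text VARCHAR', this creates
11 | -- bm25_tables.documents (id, title, text, bm25id).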
--------------------------------------------------------------------------------
/ducksearch/tables/create/documents_queries.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS {schema}.documents_queries (
2 | document_id VARCHAR,
3 | query_id VARCHAR,
4 | score FLOAT DEFAULT NULL,
5 | FOREIGN KEY (document_id) REFERENCES {schema}.documents (id),
6 | FOREIGN KEY (query_id) REFERENCES {schema}.queries (id)
7 | );
8 |
--------------------------------------------------------------------------------
/ducksearch/tables/create/queries.sql:
--------------------------------------------------------------------------------
1 | CREATE SEQUENCE IF NOT EXISTS {schema}_SEQ_QUERIES_ID START 1;
2 |
3 | CREATE TABLE IF NOT EXISTS {schema}.queries (
4 | id VARCHAR PRIMARY KEY DEFAULT NEXTVAL('{schema}_SEQ_QUERIES_ID'),
5 | query TEXT NOT NULL,
6 | bm25id INT DEFAULT NULL
7 | );
8 |
--------------------------------------------------------------------------------
/ducksearch/tables/create/schema.sql:
--------------------------------------------------------------------------------
1 | CREATE SCHEMA IF NOT EXISTS {schema};
--------------------------------------------------------------------------------
/ducksearch/tables/insert.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import os
3 | import shutil
4 |
5 | import pyarrow as pa
6 | import pyarrow.parquet as pq
7 | from joblib import Parallel, delayed
8 |
9 | from ..decorators import execute_with_duckdb
10 | from ..utils import batchify
11 | from .create import (
12 | create_documents,
13 | create_documents_queries,
14 | create_queries,
15 | )
16 |
17 |
18 | @execute_with_duckdb(
19 | relative_path="tables/insert/documents.sql",
20 | )
21 | def _insert_documents() -> None:
22 | """Insert documents into the documents table.
23 |
24 | Parameters
25 | ----------
26 | database: str
27 | The name of the DuckDB database.
28 | config: dict, optional
29 | The configuration options for the DuckDB connection.
30 | """
31 |
32 |
33 | @execute_with_duckdb(
34 | relative_path="tables/insert/fast_documents.sql",
35 | )
36 | def _insert_documents_fast() -> None:
37 | """Insert documents into the documents table without any duplicate checks.
38 |
39 | Parameters
40 | ----------
41 | database: str
42 | The name of the DuckDB database.
43 | config: dict, optional
44 | The configuration options for the DuckDB connection.
45 | """
46 |
47 |
48 | def write_parquet(
49 | database: str,
50 | documents: list[dict],
51 | index: int,
52 | fields: list[str],
53 | key: str,
54 | ) -> None:
55 | """Write a parquet file with document data for upload.
56 |
57 | Parameters
58 | ----------
59 | documents
60 | A list of dictionaries representing the documents to be written to the parquet file.
61 | index
62 | The index of the current batch being processed.
63 |     fields
64 |         The candidate document fields; the actual fields are re-derived from the documents themselves inside the function.
65 | key
66 | The key field to uniquely identify each document.
67 |
68 | Notes
69 | -----
70 | This function writes documents to a temporary parquet file in preparation for bulk uploading into the database.
71 | """
72 | documents_table = collections.defaultdict(list)
73 |
74 |     # Union of the caller-provided fields and the fields present in this batch,
75 |     # so every parquet shard is written with a consistent schema.
76 |     fields = set(fields)
77 |     for document in documents:
78 |         fields.update(field for field in document.keys() if field != "id")
79 |
80 | for document in documents:
81 | documents_table["id"].append(document[key])
82 | for field in fields:
83 | documents_table[field].append(document.get(field, None))
84 |
85 | documents_path = os.path.join(
86 | ".", f"{database}_tmp", "documents", f"{index}.parquet"
87 | )
88 | documents_table = pa.Table.from_pydict(documents_table)
89 |
90 | pq.write_table(
91 | documents_table,
92 | documents_path,
93 | compression="snappy",
94 | )
95 |
96 |
97 | def insert_documents(
98 | database: str,
99 | schema: str,
100 | df: list[dict] | str,
101 | key: str,
102 | columns: list[str] | str,
103 | dtypes: dict[str, str] | None = None,
104 | batch_size: int = 30_000,
105 | n_jobs: int = -1,
106 | config: dict | None = None,
107 | limit: int | None = None,
108 | fast: bool = False,
109 | ) -> None:
110 | """Insert documents into the documents table with optional multi-threading.
111 |
112 | Parameters
113 | ----------
114 | database
115 | The name of the DuckDB database.
116 | schema
117 | The schema in which the documents table is located.
118 | df
119 | The list of document dictionaries or a string (URL) for a Hugging Face dataset to insert.
120 | key
121 | The field that uniquely identifies each document (e.g., 'id').
122 | columns
123 | The list of document fields to insert. Can be a string if inserting a single field.
124 | dtypes
125 | Optional dictionary specifying the DuckDB type for each field. Defaults to 'VARCHAR' for all unspecified fields.
126 | batch_size
127 | The number of documents to insert in each batch.
128 | n_jobs
129 |         Number of parallel jobs used to write parquet shards; -1 (the default) uses all available processors.
130 | config
131 | Optional configuration options for the DuckDB connection.
132 |
133 | Examples
134 | --------
135 | >>> from ducksearch import tables
136 |
137 | >>> df = [
138 | ... {"id": 1, "title": "title document 1", "text": "text document 1"},
139 | ... {"id": 2, "title": "title document 2", "text": "text document 2"},
140 | ... {"id": 3, "title": "title document 3", "text": "text document 3"},
141 | ... ]
142 |
143 | >>> _ = tables.insert_documents(
144 | ... database="test.duckdb",
145 | ... schema="bm25_tables",
146 | ... key="id",
147 | ... columns=["title", "text"],
148 | ... df=df
149 | ... )
150 |
151 | """
152 |     columns = [c for c in ([columns] if isinstance(columns, str) else columns) if c != "id"]
153 |
154 | create_documents(
155 | database=database,
156 | schema=schema,
157 | columns=columns,
158 | config=config,
159 | dtypes=dtypes,
160 | )
161 |
162 | documents_path = os.path.join(".", f"{database}_tmp", "documents")
163 |
164 | if os.path.exists(path=documents_path):
165 | shutil.rmtree(documents_path)
166 |
167 | os.makedirs(name=os.path.join(".", f"{database}_tmp"), exist_ok=True)
168 | os.makedirs(name=documents_path, exist_ok=True)
169 |
170 | Parallel(n_jobs=n_jobs, backend="threading")(
171 | delayed(function=write_parquet)(
172 | database,
173 | batch,
174 | index,
175 | columns,
176 | key,
177 | )
178 | for index, batch in enumerate(
179 | iterable=batchify(X=df, batch_size=batch_size, tqdm_bar=False)
180 | )
181 | )
182 |
183 | if fast:
184 | _insert_documents_fast(
185 | database=database,
186 | schema=schema,
187 | parquet_files=os.path.join(documents_path, "*.parquet"),
188 | config=config,
189 | key_field=f"df.{key}",
190 | fields=", ".join(columns),
191 | df_fields=", ".join([f"df.{field}" for field in columns]),
192 | src_fields=", ".join([f"src.{field}" for field in columns]),
193 | )
194 | else:
195 | _insert_documents(
196 | database=database,
197 | schema=schema,
198 | parquet_files=os.path.join(documents_path, "*.parquet"),
199 | config=config,
200 | key_field=f"df.{key}",
201 | fields=", ".join(columns),
202 | df_fields=", ".join([f"df.{field}" for field in columns]),
203 | src_fields=", ".join([f"src.{field}" for field in columns]),
204 | )
205 |
206 | if os.path.exists(path=documents_path):
207 | shutil.rmtree(documents_path)
208 |
209 | if os.path.exists(path=os.path.join(".", f"{database}_tmp")):
210 | shutil.rmtree(os.path.join(".", f"{database}_tmp"))
211 |
212 |
213 | @execute_with_duckdb(
214 | relative_path="tables/insert/queries.sql",
215 | )
216 | def _insert_queries() -> None:
217 | """Insert queries into the queries table.
218 |
219 | Parameters
220 | ----------
221 | database: str
222 | The name of the DuckDB database.
223 | config: dict, optional
224 | The configuration options for the DuckDB connection.
225 | """
226 |
227 |
228 | def insert_queries(
229 | database: str,
230 | schema: str,
231 | queries: list[str],
232 | config: dict | None = None,
233 | ) -> None:
234 | """Insert a list of queries into the queries table.
235 |
236 | Parameters
237 | ----------
238 | database
239 | The name of the DuckDB database.
240 | schema
241 | The schema in which the queries table is located.
242 | queries
243 | A list of query strings to insert into the table.
244 | config
245 | Optional configuration options for the DuckDB connection.
246 |
247 | Examples
248 | --------
249 | >>> from ducksearch import tables
250 |
251 | >>> _ = tables.insert_queries(
252 | ... database="test.duckdb",
253 | ... schema="bm25_tables",
254 | ... queries=["query 1", "query 2", "query 3"],
255 | ... )
256 | """
257 | create_queries(database=database, schema=schema, config=config)
258 |
259 | table = pa.Table.from_pydict({"query": queries})
260 |
261 | pq.write_table(
262 | table,
263 | "_queries.parquet",
264 | compression="snappy",
265 | )
266 |
267 | _insert_queries(
268 | database=database,
269 | schema=schema,
270 | parquet_file="_queries.parquet",
271 | config=config,
272 | )
273 |
274 | if os.path.exists("_queries.parquet"):
275 | os.remove("_queries.parquet")
276 |
277 |
278 | @execute_with_duckdb(
279 | relative_path="tables/insert/documents_queries.sql",
280 | )
281 | def _insert_documents_queries() -> None:
282 | """Insert query-document interactions into the documents_queries table.
283 |
284 | Parameters
285 | ----------
286 | database: str
287 | The name of the DuckDB database.
288 | config: dict, optional
289 | The configuration options for the DuckDB connection.
290 | """
291 |
292 |
293 | def insert_documents_queries(
294 | database: str,
295 | schema: str,
296 |     documents_queries: dict[str, dict[str, float] | list[str]],
297 | config: dict | None = None,
298 | ) -> None:
299 | """Insert interactions between documents and queries into the documents_queries table.
300 |
301 | Parameters
302 | ----------
303 | database
304 | The name of the DuckDB database.
305 | schema
306 | The schema in which the documents_queries table is located.
307 | documents_queries
308 |         A dictionary mapping each document ID to a mapping of queries to scores (or to a plain list of queries, which are scored as 1.0).
309 | config
310 | Optional configuration options for the DuckDB connection.
311 |
312 | Examples
313 | --------
314 | >>> from ducksearch import tables
315 |
316 | >>> documents_queries = {
317 | ... "1": {"query 1": 0.9, "query 2": 0.8},
318 | ... "2": {"query 2": 0.9, "query 3": 3},
319 | ... "3": {"query 1": 0.9, "query 3": 0.5},
320 | ... }
321 |
322 | >>> tables.insert_documents_queries(
323 | ... database="test.duckdb",
324 | ... schema="bm25_tables",
325 | ... documents_queries=documents_queries
326 | ... )
327 |
328 | """
329 | create_queries(database=database, schema=schema, config=config)
330 |
331 | queries = set()
332 | for _, document_queries in documents_queries.items():
333 | for query in document_queries:
334 | queries.add(query)
335 |
336 | insert_queries(
337 | database=database, schema=schema, queries=list(queries), config=config
338 | )
339 | create_documents_queries(database=database, schema=schema, config=config)
340 |
341 | document_ids, queries, scores = [], [], []
342 | for document_id, document_queries in documents_queries.items():
343 | if isinstance(document_queries, list):
344 | document_queries = {query: 1.0 for query in document_queries}
345 |
346 | for query, score in document_queries.items():
347 | document_ids.append(str(document_id))
348 | queries.append(query)
349 | scores.append(score)
350 |
351 | table = pa.Table.from_pydict(
352 | {
353 | "document_id": document_ids,
354 | "query": queries,
355 | "score": scores,
356 | }
357 | )
358 |
359 | pq.write_table(
360 | table,
361 | "_documents_queries.parquet",
362 | compression="snappy",
363 | )
364 |
365 | _insert_documents_queries(
366 | database=database,
367 | schema=schema,
368 | parquet_file="_documents_queries.parquet",
369 | config=config,
370 | )
371 |
372 | if os.path.exists("_documents_queries.parquet"):
373 | os.remove("_documents_queries.parquet")
374 |
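A hedged end-to-end sketch of the two insert paths above: the default path stages parquet shards and deduplicates via `tables/insert/documents.sql`, while `fast=True` routes through `fast_documents.sql` and assumes the keys are unique and not already present in the table (the database name below is illustrative):

    from ducksearch import tables

    df = [{"id": i, "title": f"title {i}", "text": f"text {i}"} for i in range(10)]

    # Default path: in-batch deduplication plus an anti-join against existing rows.
    tables.insert_documents(
        database="demo.duckdb", schema="bm25_tables",
        key="id", columns=["title", "text"], df=df,
    )

    # Fast path: skips the duplicate checks; only safe when ids are new and unique.
    more = [{"id": i, "title": f"title {i}", "text": f"text {i}"} for i in range(10, 20)]
    tables.insert_documents(
        database="demo.duckdb", schema="bm25_tables",
        key="id", columns=["title", "text"], df=more, fast=True,
    )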
--------------------------------------------------------------------------------
/ducksearch/tables/insert/documents.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.documents (id, {fields})
2 |
3 | WITH _distinct_documents AS (
4 | SELECT DISTINCT
5 | {key_field} AS id,
6 | {df_fields},
7 | ROW_NUMBER() OVER (PARTITION BY id ORDER BY id, RANDOM() ASC) AS _row_number
8 | FROM read_parquet('{parquet_files}') df
9 | ),
10 |
11 | _new_distinct_documents AS (
12 | SELECT DISTINCT
13 | dd.*,
14 | d.id AS existing_id
15 | FROM _distinct_documents dd
16 | LEFT JOIN {schema}.documents AS d
17 | ON dd.id = d.id
18 | WHERE _row_number = 1
19 | )
20 |
21 | SELECT
22 | id,
23 | {fields}
24 | FROM _new_distinct_documents
25 | WHERE existing_id IS NULL;
26 |
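The statement above combines two guards: `ROW_NUMBER` keeps one arbitrary row per id within the batch, and the `LEFT JOIN ... WHERE existing_id IS NULL` anti-join drops ids already stored. The same pattern in isolation, against a throwaway in-memory table with hypothetical names:

    import duckdb

    con = duckdb.connect()
    con.execute("CREATE TABLE docs (id VARCHAR PRIMARY KEY, title VARCHAR)")
    con.execute("INSERT INTO docs VALUES ('1', 'existing')")
    con.execute("""
        INSERT INTO docs (id, title)
        WITH batch AS (
            SELECT
                t.id,
                t.title,
                ROW_NUMBER() OVER (PARTITION BY t.id ORDER BY RANDOM()) AS _row_number
            FROM (VALUES ('1', 'duplicate'), ('2', 'new'), ('2', 'also new')) AS t(id, title)
        )
        SELECT b.id, b.title
        FROM batch AS b
        LEFT JOIN docs AS d ON b.id = d.id
        WHERE b._row_number = 1 AND d.id IS NULL
    """)
    # One random winner per duplicated id; the existing '1' is never overwritten.
    print(con.execute("SELECT * FROM docs ORDER BY id").fetchall())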
--------------------------------------------------------------------------------
/ducksearch/tables/insert/documents_queries.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.documents_queries (document_id, query_id, score)
2 |
3 | WITH _documents_queries_scores AS (
4 | SELECT
5 | document_id,
6 | query,
7 | MAX(score) AS score
8 | FROM parquet_scan('{parquet_file}')
9 | GROUP BY 1, 2
10 | ),
11 |
12 | _distinct_documents_queries AS (
13 | SELECT
14 | dqw.document_id,
15 | q.id AS query_id,
16 | dqw.score,
17 | dq.document_id AS existing_id
18 | FROM _documents_queries_scores AS dqw
19 | INNER JOIN {schema}.queries AS q
20 | ON dqw.query = q.query
21 | INNER JOIN {schema}.documents AS d
22 | ON dqw.document_id = d.id
23 | LEFT JOIN {schema}.documents_queries AS dq
24 | ON q.id = dq.query_id
25 | AND dqw.document_id = dq.document_id
26 | )
27 |
28 | SELECT DISTINCT
29 | document_id,
30 | query_id,
31 | score
32 | FROM _distinct_documents_queries
33 | WHERE existing_id IS NULL;
34 |
--------------------------------------------------------------------------------
/ducksearch/tables/insert/fast_documents.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.documents ({fields})
2 |
3 | WITH _distinct_documents AS (
4 | SELECT DISTINCT
5 | {df_fields}
6 | FROM read_parquet('{parquet_files}') df
7 | )
8 |
9 | SELECT
10 | *
11 | FROM _distinct_documents;
12 |
--------------------------------------------------------------------------------
/ducksearch/tables/insert/queries.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO {schema}.queries (query)
2 |
3 | WITH _distinct_queries AS (
4 | SELECT DISTINCT
5 | df.query,
6 | q.id AS existing_id
7 | FROM parquet_scan('{parquet_file}') AS df
8 | LEFT JOIN {schema}.queries AS q
9 | ON df.query = q.query
10 | )
11 |
12 | SELECT DISTINCT query
13 | FROM _distinct_queries
14 | WHERE existing_id IS NULL;
15 |
--------------------------------------------------------------------------------
/ducksearch/tables/select.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from ..decorators import execute_with_duckdb
4 |
5 |
6 | @execute_with_duckdb(
7 | relative_path="tables/select/documents.sql",
8 | read_only=True,
9 | fetch_df=True,
10 | )
11 | def _select_documents() -> list[dict]:
12 | """Select all documents from the documents table.
13 |
14 | Returns
15 | -------
16 | list[dict]
17 | A list of dictionaries representing the documents.
18 |
19 | Examples
20 | --------
21 | >>> from ducksearch import tables
22 |
23 | >>> documents = tables.select_documents(
24 | ... database="test.duckdb",
25 | ... schema="bm25_tables",
26 | ... )
27 |
28 | >>> assert len(documents) == 3
29 | """
30 |
31 |
32 | def select_documents(
33 | database: str,
34 | schema: str,
35 | limit: int | None = None,
36 | config: dict | None = None,
37 | ) -> pd.DataFrame:
38 | """Select all documents from the documents table.
39 |
40 | Parameters
41 | ----------
42 | database
43 | The name of the DuckDB database.
44 | schema
45 | The schema where the documents table is located.
46 | config
47 | Optional configuration options for the DuckDB connection.
48 |
49 | Returns
50 | -------
51 |     pd.DataFrame
52 |         A DataFrame containing the selected documents.
53 |
54 | Examples
55 | --------
56 | >>> from ducksearch import tables
57 |
58 | >>> documents = tables.select_documents(
59 | ... database="test.duckdb",
60 | ... schema="bm25_tables",
61 | ... )
62 |
63 | >>> assert len(documents) == 3
64 | """
65 | return pd.DataFrame(
66 | _select_documents(
67 | database=database,
68 | schema=schema,
69 | limit="" if limit is None else f"LIMIT {limit}",
70 | config=config,
71 | )
72 | )
73 |
74 |
75 | @execute_with_duckdb(
76 | relative_path="tables/select/queries.sql",
77 | read_only=True,
78 | fetch_df=True,
79 | )
80 | def select_queries() -> list[dict]:
81 | """Select all queries from the queries table.
82 |
83 | Returns
84 | -------
85 | list[dict]
86 | A list of dictionaries representing the queries.
87 |
88 | Examples
89 | --------
90 | >>> from ducksearch import tables
91 |
92 | >>> queries = tables.select_queries(
93 | ... database="test.duckdb",
94 | ... schema="bm25_tables",
95 | ... )
96 |
97 | >>> assert len(queries) == 3
98 | """
99 |
100 |
101 | @execute_with_duckdb(
102 | relative_path="tables/select/columns.sql",
103 | read_only=True,
104 | fields=["column"],
105 | )
106 | def select_columns() -> list[dict]:
107 | """Retrieve the list of columns from a specified table.
108 |
109 | Returns
110 | -------
111 | list[dict]
112 | A list of dictionaries containing the column names of the table.
113 | """
114 |
115 |
116 | def select_documents_columns(
117 | database: str,
118 | schema: str,
119 | config: dict | None = None,
120 | ) -> list[str]:
121 | """Select the column names from the documents table, excluding the 'bm25id' column.
122 |
123 | Parameters
124 | ----------
125 | database
126 | The name of the DuckDB database.
127 | schema
128 | The schema where the documents table is located.
129 | config
130 | Optional configuration options for the DuckDB connection.
131 |
132 | Returns
133 | -------
134 | list[str]
135 | A list of column names from the documents table.
136 |
137 | Examples
138 | --------
139 | >>> from ducksearch import tables
140 |
141 | >>> tables.select_documents_columns(
142 | ... database="test.duckdb",
143 | ... schema="bm25_tables",
144 | ... )
145 | ['id', 'title', 'text']
146 | """
147 | return [
148 | column["column"]
149 | for column in select_columns(
150 | database=database, schema=schema, table_name="documents", config=config
151 | )
152 | if column["column"] != "bm25id"
153 | ]
154 |
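The `limit` argument of `select_documents` is interpolated as a literal `LIMIT n` clause into `tables/select/documents.sql`; a small usage sketch, with database and schema names as in the doctests:

    from ducksearch import tables

    head = tables.select_documents(
        database="test.duckdb", schema="bm25_tables", limit=2,
    )
    print(len(head))  # at most 2 rows, returned as a pandas DataFrame ordered by id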
--------------------------------------------------------------------------------
/ducksearch/tables/select/columns.sql:
--------------------------------------------------------------------------------
1 | SELECT column_name
2 | FROM information_schema.columns
3 | WHERE
4 | lower(table_name) = '{table_name}'
5 | AND table_schema = '{schema}';
6 |
--------------------------------------------------------------------------------
/ducksearch/tables/select/documents.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {schema}.documents
3 | ORDER BY id ASC
4 | {limit};
5 |
--------------------------------------------------------------------------------
/ducksearch/tables/select/queries.sql:
--------------------------------------------------------------------------------
1 | SELECT *
2 | FROM {schema}.queries
3 | ORDER BY id ASC;
4 |
--------------------------------------------------------------------------------
/ducksearch/tables/update.py:
--------------------------------------------------------------------------------
1 | from ..decorators import execute_with_duckdb
2 |
3 |
4 | @execute_with_duckdb(
5 | relative_path="tables/update/documents.sql",
6 | )
7 | def _add_columns_documents() -> None:
8 | """Add columns to the documents table in the DuckDB database.
9 |
10 | Parameters
11 | ----------
12 | database: str
13 | The name of the DuckDB database.
14 | config: dict, optional
15 | The configuration options for the DuckDB connection.
16 | """
17 |
18 |
19 | def add_columns_documents(
20 | database: str,
21 | schema: str,
22 | columns: list[str] | str,
23 |     dtypes: dict | None = None,
24 |     config: dict | None = None,
25 | ) -> None:
26 | """Add columns to the documents table in the DuckDB database.
27 |
28 | Parameters
29 | ----------
30 | database:
31 | The name of the DuckDB database.
32 | schema:
33 | The schema in which the documents table is located.
34 | columns:
35 | The columns to add to the documents table.
36 | dtypes:
37 | The data types for the columns to add.
38 | config:
39 | The configuration options for the DuckDB connection.
40 |
41 | """
42 | if isinstance(columns, str):
43 | columns = [columns]
44 |
45 | if dtypes is None:
46 | dtypes = {}
47 |
48 | _add_columns_documents(
49 | database=database,
50 | schema=schema,
51 | fields=", ".join(
52 | [f"ADD COLUMN {field} {dtypes.get(field, 'VARCHAR')}" for field in columns]
53 | ),
54 | config=config,
55 | )
56 |
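To make the `{fields}` substitution above concrete, here is the clause it produces for a couple of hypothetical columns; only `year` has an explicit dtype, the rest fall back to VARCHAR:

    dtypes = {"year": "INT"}
    columns = ["year", "author"]
    fields = ", ".join(
        f"ADD COLUMN {field} {dtypes.get(field, 'VARCHAR')}" for field in columns
    )
    print(fields)  # ADD COLUMN year INT, ADD COLUMN author VARCHAR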
--------------------------------------------------------------------------------
/ducksearch/tables/update/documents.sql:
--------------------------------------------------------------------------------
1 | ALTER TABLE {schema}.documents
2 | {fields}
3 | ;
--------------------------------------------------------------------------------
/ducksearch/upload/__init__.py:
--------------------------------------------------------------------------------
1 | from .upload import documents, queries
2 |
3 | __all__ = ["documents", "queries"]
4 |
--------------------------------------------------------------------------------
/ducksearch/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .batch import batchify
2 | from .columns import get_list_columns_df
3 | from .hash import generate_random_hash
4 | from .parralel_tqdm import ParallelTqdm
5 | from .plot import plot, plot_shards
6 |
7 | __all__ = [
8 | "batchify",
9 | "get_list_columns_df",
10 | "generate_random_hash",
11 | "plot",
12 | "plot_shards",
13 | "ParallelTqdm",
14 | ]
15 |
--------------------------------------------------------------------------------
/ducksearch/utils/batch.py:
--------------------------------------------------------------------------------
1 | import tqdm
2 |
3 |
4 | def batchify(
5 |     X: list, batch_size: int, desc: str = "", tqdm_bar: bool = True
6 | ) -> list:
7 | """Split a list into batches and optionally display a progress bar.
8 |
9 | Parameters
10 | ----------
11 | X
12 | A list of items to be batched.
13 | batch_size
14 | The number of items in each batch.
15 | desc
16 | A description to display in the progress bar.
17 | tqdm_bar
18 | Whether to display a progress bar using `tqdm`.
19 |
20 | Yields
21 | ------
22 | list
23 | A list representing a batch of items from `X`.
24 |
25 | Examples
26 | --------
27 | >>> items = ["a", "b", "c", "d", "e", "f"]
28 | >>> batches = list(batchify(items, batch_size=2))
29 | >>> for batch in batches:
30 | ... print(batch)
31 | ['a', 'b']
32 | ['c', 'd']
33 | ['e', 'f']
34 |
35 | """
36 | # Split the input list `X` into batches
37 | batches = [X[pos : pos + batch_size] for pos in range(0, len(X), batch_size)]
38 |
39 | # Use tqdm to show a progress bar if `tqdm_bar` is set to True
40 | if tqdm_bar:
41 | for batch in tqdm.tqdm(
42 | batches,
43 | position=0,
44 | total=len(batches),
45 | desc=desc,
46 | ):
47 | yield batch
48 | else:
49 | # If no progress bar is needed, simply yield the batches
50 | yield from batches
51 |
--------------------------------------------------------------------------------
/ducksearch/utils/columns.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | def get_list_columns_df(
5 | documents: list[dict] | pd.DataFrame,
6 | ) -> list[str]:
7 |     """Get column names from a list of dicts (excluding 'id') or from a DataFrame (all columns)."""
8 | columns = None
9 | if isinstance(documents, pd.DataFrame):
10 | return list(documents.columns)
11 |
12 | if isinstance(documents, list):
13 | columns = set()
14 | for document in documents:
15 | for column in document.keys():
16 | if column != "id":
17 | columns.add(column)
18 | return list(columns)
19 |
20 | return None
21 |
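Note the asymmetry: the list branch filters out 'id' while the DataFrame branch returns every column. A small sketch of both paths:

    import pandas as pd

    from ducksearch.utils import get_list_columns_df

    docs = [{"id": 1, "title": "a"}, {"id": 2, "title": "b", "text": "c"}]
    print(sorted(get_list_columns_df(documents=docs)))        # ['text', 'title']
    print(get_list_columns_df(documents=pd.DataFrame(docs)))  # ['id', 'title', 'text']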
--------------------------------------------------------------------------------
/ducksearch/utils/hash.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import secrets
3 |
4 |
5 | def generate_random_hash() -> str:
6 | """Generate a random SHA-256 hash."""
7 | random_data = secrets.token_bytes(32)
8 | hash_obj = hashlib.sha256()
9 | hash_obj.update(random_data)
10 | random_hash = hash_obj.hexdigest()
11 | return random_hash
12 |
--------------------------------------------------------------------------------
/ducksearch/utils/parralel_tqdm.py:
--------------------------------------------------------------------------------
1 | import tqdm
2 | from joblib import Parallel
3 |
4 |
5 | class ParallelTqdm(Parallel):
6 | """joblib.Parallel, but with a tqdm progressbar.
7 |
8 | Parameters
9 | ----------
10 | total : int
11 | The total number of tasks to complete.
12 | desc : str
13 | A description of the task.
14 |     tqdm_bar : bool, optional
15 |         Whether to display a tqdm progress bar. Default is True.
16 |     show_joblib_header : bool, optional
17 |         Whether to display the joblib header. Default is False.
18 |
19 | References
20 | ----------
21 | https://github.com/joblib/joblib/issues/972
22 | """
23 |
24 | def __init__(
25 | self,
26 | *,
27 | total: int,
28 | desc: str,
29 | tqdm_bar: bool = True,
30 | show_joblib_header: bool = False,
31 | **kwargs,
32 | ) -> None:
33 | super().__init__(verbose=(1 if show_joblib_header else 0), **kwargs)
34 | self.total = total
35 | self.desc = desc
36 | self.tqdm_bar = tqdm_bar
37 | self.progress_bar: tqdm.tqdm | None = None
38 |
39 | def __call__(self, iterable):
40 | try:
41 | return super().__call__(iterable)
42 | finally:
43 | if self.progress_bar is not None:
44 | self.progress_bar.close()
45 |
46 | __call__.__doc__ = Parallel.__call__.__doc__
47 |
48 | def dispatch_one_batch(self, iterator):
49 | """Dispatch a batch of tasks, and update the progress bar"""
50 | if self.progress_bar is None and self.tqdm_bar:
51 | self.progress_bar = tqdm.tqdm(
52 | desc=self.desc,
53 | total=self.total,
54 | position=0,
55 |                 disable=not self.tqdm_bar,
56 | unit="tasks",
57 | )
58 | return super().dispatch_one_batch(iterator=iterator)
59 |
60 | dispatch_one_batch.__doc__ = Parallel.dispatch_one_batch.__doc__
61 |
62 | def print_progress(self):
63 |         """Display the progress of the parallel execution using tqdm."""
64 | if self.total is None and self._original_iterator is None:
65 | self.total = self.n_dispatched_tasks
66 | self.progress_bar.total = self.total
67 | self.progress_bar.refresh()
68 |
69 | if self.tqdm_bar:
70 | self.progress_bar.update(self.n_completed_tasks - self.progress_bar.n)
71 |
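A minimal usage sketch: `ParallelTqdm` is a drop-in `joblib.Parallel` whose progress bar is sized by `total` (the `work` function below is illustrative):

    import time

    from joblib import delayed

    from ducksearch.utils import ParallelTqdm

    def work(i: int) -> int:
        time.sleep(0.01)
        return i * i

    # Extra keyword arguments (n_jobs, backend, ...) pass through to joblib.Parallel.
    results = ParallelTqdm(n_jobs=2, total=100, desc="squares", backend="threading")(
        delayed(work)(i) for i in range(100)
    )
    print(sum(results))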
--------------------------------------------------------------------------------
/ducksearch/utils/plot.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from ..decorators import execute_with_duckdb
4 |
5 |
6 | def create_aligned_markdown_table(data: dict) -> str:
7 | """Create an aligned markdown table from a dictionary of data.
8 |
9 | Parameters
10 | ----------
11 | data
12 | A dictionary where keys are the table names and values are their sizes.
13 |
14 | Returns
15 | -------
16 | str
17 | A formatted markdown table showing table names and sizes.
18 | """
19 | # Define the headers
20 | headers = ["Table", "Size"]
21 |
22 | # Find the maximum width for each column
23 | max_key_len = max(len(key) for key in data.keys())
24 | max_val_len = max(len(str(value)) for value in data.values())
25 |
26 | # Ensure the headers fit as well
27 | max_key_len = max(max_key_len, len(headers[0]))
28 | max_val_len = max(max_val_len, len(headers[1]))
29 |
30 | # Format the header
31 | header_row = (
32 | f"| {headers[0].ljust(max_key_len)} | {headers[1].ljust(max_val_len)} |\n"
33 | )
34 | separator_row = f"|{'-' * (max_key_len + 2)}|{'-' * (max_val_len + 2)}|\n"
35 |
36 | # Format the rows with aligned columns
37 | table_rows = ""
38 | for key, value in data.items():
39 | table_rows += (
40 | f"| {key.ljust(max_key_len)} | {str(value).ljust(max_val_len)} |\n"
41 | )
42 |
43 | # Combine the header, separator, and rows into the final markdown table
44 | table = f"{header_row}{separator_row}{table_rows}".strip()
45 | return f"\n{table}\n"
46 |
47 |
48 | @execute_with_duckdb(
49 | relative_path="utils/plot/plot.sql",
50 | read_only=True,
51 | fetch_df=True,
52 | )
53 | def _plot_queries_documents() -> list[dict]:
54 | """Fetch the table statistics from the DuckDB database.
55 |
56 | Returns
57 | -------
58 | list[dict]
59 | A list of dictionaries where each dictionary contains table statistics.
60 | """
61 |
62 |
63 | def plot(
64 | database: str,
65 | config: None | dict = None,
66 | tables=[
67 | "bm25_tables.documents",
68 | "bm25_tables.queries",
69 | "bm25_documents.lengths",
70 | "bm25_queries.lengths",
71 | "bm25_tables.documents_queries",
72 | ],
73 | ) -> None:
74 | """Generate and display a markdown table with statistics of the specified dataset tables.
75 |
76 | Parameters
77 | ----------
78 | database
79 | The name of the DuckDB database.
80 | config
81 | Optional configuration options for the DuckDB connection.
82 | tables
83 | A list of table names to plot statistics for. Defaults to common BM25 tables.
84 |
85 | Returns
86 | -------
87 |     None
88 |         The markdown table of table sizes is printed to stdout.
89 |
90 | Examples
91 | --------
92 | >>> from ducksearch import utils
93 |
94 | >>> utils.plot(database="test.duckdb")
95 | | Table | Size |
96 | |-----------|------|
97 | | documents | 5183 |
98 | | queries | 300 |
99 | """
100 | data = {}
101 | for table in tables:
102 | try:
103 | # Fetch the table statistics for each specified table
104 | data.update(
105 | _plot_queries_documents(database=database, table=table, config=config)[
106 | 0
107 | ]
108 | )
109 | except Exception:
110 | continue
111 |
112 | # Clean up table names and filter out empty tables
113 | data = {
114 | table.replace(".docs", "").replace("bm25_tables.", ""): size
115 | for table, size in data.items()
116 | if size > 0
117 | }
118 |
119 |     if data:
120 |         print(create_aligned_markdown_table(data=data))
121 |
122 |
123 | def plot_shards(
124 | databases: list[str],
125 | config: None | dict = None,
126 | tables=[
127 | "bm25_tables.documents",
128 | "bm25_tables.queries",
129 | "bm25_documents.lengths",
130 | "bm25_queries.lengths",
131 | "bm25_tables.documents_queries",
132 | ],
133 | ) -> None:
134 | """Generate and display a markdown table with statistics of the specified dataset tables.
135 |
136 | Parameters
137 | ----------
138 |     databases
139 |         The list of DuckDB database (shard) names.
140 | config
141 | Optional configuration options for the DuckDB connection.
142 | tables
143 | A list of table names to plot statistics for. Defaults to common BM25 tables.
144 |
145 | Returns
146 | -------
147 |     None
148 |         The combined markdown table (with a Total row) is printed to stdout.
149 |
150 | Examples
151 | --------
152 | >>> from ducksearch import utils
153 |
154 | >>> utils.plot(database="test.duckdb")
155 | | Table | Size |
156 | |-----------|------|
157 | | documents | 5183 |
158 | | queries | 300 |
159 | """
160 | statistics = []
161 | for database in databases:
162 | data = {}
163 | for table in tables:
164 | try:
165 | # Fetch the table statistics for each specified table
166 | data.update(
167 | _plot_queries_documents(
168 | database=database, table=table, config=config
169 | )[0]
170 | )
171 | except Exception:
172 | continue
173 |
174 | # Clean up table names and filter out empty tables
175 | data = {
176 | table.replace(".docs", "").replace("bm25_tables.", ""): size
177 | for table, size in data.items()
178 | if size > 0
179 | }
180 |
181 | data = {
182 | "Database": database,
183 | **data,
184 | }
185 |
186 |         if data:
187 | statistics.append(data)
188 |
189 | try:
190 | statistics = pd.DataFrame(statistics)
191 | total = statistics.sum(numeric_only=True)
192 | total["Database"] = "Total"
193 | statistics = pd.concat([statistics, total.to_frame().T], ignore_index=True)
194 | statistics = "\n" + statistics.to_markdown(index=False) + "\n"
195 | print(statistics)
196 | except Exception:
197 | pass
198 |
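`create_aligned_markdown_table` is the formatting core of both functions above; given a mapping of table names to sizes, it pads every cell to the widest entry:

    from ducksearch.utils.plot import create_aligned_markdown_table

    print(create_aligned_markdown_table({"documents": 5183, "queries": 300}))
    # | Table     | Size |
    # |-----------|------|
    # | documents | 5183 |
    # | queries   | 300  |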
--------------------------------------------------------------------------------
/ducksearch/utils/plot/plot.sql:
--------------------------------------------------------------------------------
1 | SELECT COUNT(*) AS '{table}'
2 | FROM {table};
3 |
4 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | # Project information
2 | site_name: ducksearch
3 | site_description: A search engine for ducks
4 | site_author: Raphael Sourty
5 | site_url: https://lightonai.github.io/ducksearch
6 |
7 | # Repository
8 | repo_name: lightonai/ducksearch
9 | repo_url: https://github.com/lightonai/ducksearch
10 | edit_uri: ""
11 |
12 | # Copyright
13 | copyright: Copyright © 2023
14 |
15 | # Configuration
16 | theme:
17 | name: material
18 | custom_dir: docs
19 | language: en
20 |
21 | palette:
22 | - scheme: default
23 | primary: green
24 | accent: green
25 | toggle:
26 | icon: material/brightness-7
27 | name: Switch to dark mode
28 | - scheme: slate
29 | primary: green
30 | accent: green
31 | toggle:
32 | icon: material/brightness-4
33 | name: Switch to light mode
34 |
35 | font:
36 | text: Fira Sans
37 | code: Fira Code
38 | logo: img/logo.png
39 | favicon: img/logo.ico
40 | features:
41 | - content.code.copy
42 | - navigation.tabs
43 | - navigation.instant
44 | - navigation.indexes
45 | - navigation.prune
46 |
47 | # Extras
48 | extra:
49 | social:
50 | - icon: fontawesome/brands/github-alt
51 | link: https://github.com/lightonai/ducksearch
52 |
53 | # Extensions
54 | markdown_extensions:
55 | - admonition
56 | - footnotes
57 | - tables
58 | - toc:
59 | permalink: true
60 | toc_depth: "1-3"
61 | - pymdownx.details
62 | - pymdownx.arithmatex:
63 | generic: true
64 | - pymdownx.highlight:
65 | pygments_lang_class: true
66 | - pymdownx.inlinehilite
67 | - pymdownx.tabbed:
68 | alternate_style: true
69 | - pymdownx.superfences:
70 | custom_fences:
71 | - name: vegalite
72 | class: vegalite
73 | format: !!python/name:mkdocs_charts_plugin.fences.fence_vegalite
74 |
75 |
76 | plugins:
77 | - search
78 | - awesome-pages
79 | - mkdocs-jupyter
80 |
81 | extra_javascript:
82 | - javascripts/config.js
83 | - https://cdn.jsdelivr.net/npm/mathjax@3.2/es5/tex-mml-chtml.js
84 | - https://cdn.jsdelivr.net/npm/vega@5
85 | - https://cdn.jsdelivr.net/npm/vega-lite@5
86 | - https://cdn.jsdelivr.net/npm/vega-embed@6
87 | - https://unpkg.com/tablesort@5.3.0/dist/tablesort.min.js
88 | - javascripts/tablesort.js
89 |
90 | extra_css:
91 | - stylesheets/extra.css
92 | - css/version-select.css
93 |
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | filterwarnings =
3 | ignore::DeprecationWarning
4 | ignore::RuntimeWarning
5 | ignore::UserWarning
6 | addopts =
7 | --doctest-modules
8 | --verbose
9 | -ra
10 | --cov-config=.coveragerc
11 | -m "not web and not slow"
12 | doctest_optionflags = NORMALIZE_WHITESPACE NUMBER
13 | norecursedirs =
14 | build
15 | docs
16 | node_modules
17 | markers =
18 | web: tests that require using the Internet
19 | slow: tests that take a long time to run
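The `web` and `slow` markers combine with the `-m "not web and not slow"` addopt above, so marked tests are excluded from the default run and can be opted back in with, for example, `pytest -m web`. A sketch of how a test would declare them (test names are hypothetical):

    import pytest

    @pytest.mark.web
    def test_requires_internet():
        ...  # excluded by the default -m expression

    @pytest.mark.slow
    def test_long_benchmark():
        ...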
--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
1 | exclude = [
2 | ".bzr",
3 | ".direnv",
4 | ".eggs",
5 | ".git",
6 | ".git-rewrite",
7 | ".hg",
8 | ".ipynb_checkpoints",
9 | ".mypy_cache",
10 | ".nox",
11 | ".pants.d",
12 | ".pyenv",
13 | ".pytest_cache",
14 | ".pytype",
15 | ".ruff_cache",
16 | ".svn",
17 | ".tox",
18 | ".venv",
19 | ".vscode",
20 | "__pypackages__",
21 | "_build",
22 | "buck-out",
23 | "build",
24 | "dist",
25 | "node_modules",
26 | "site-packages",
27 | "venv",
28 | ]
29 |
30 | # Same as Black.
31 | line-length = 88
32 | indent-width = 4
33 |
34 | target-version = "py310"
35 |
36 | [lint]
37 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default.
38 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
39 | # McCabe complexity (`C901`) by default.
40 | select = ["E4", "E7", "E9", "F"]
41 | ignore = []
42 |
43 | # Allow fixes for all enabled rules (when `--fix` is provided).
44 | fixable = ["ALL"]
45 | unfixable = []
46 |
47 | # Allow unused variables when underscore-prefixed.
48 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
49 |
50 | [format]
51 | quote-style = "double"
52 | indent-style = "space"
53 | skip-magic-trailing-comma = false
54 | line-ending = "auto"
55 | docstring-code-format = false
56 | docstring-code-line-length = "dynamic"
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description_file = README.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | from ducksearch.__version__ import __version__
4 |
5 | with open(file="README.md", mode="r", encoding="utf-8") as fh:
6 | long_description = fh.read()
7 |
8 | base_packages = [
9 | "pandas >= 2.2.1",
10 | "duckdb >= 1.0.0",
11 | "pyarrow >= 16.1.0",
12 | "tqdm >= 4.66.4",
13 | "joblib >= 1.4.2",
14 | ]
15 |
16 | eval = ["ranx >= 0.3.16", "beir >= 2.0.0"]
17 |
18 | dev = [
19 | "sqlfluff >= 3.1.0",
20 | "ruff >= 0.4.9",
21 | "pytest-cov >= 5.0.0",
22 | "pytest >= 8.2.1",
23 | "harlequin >= 1.24.0",
24 | "mkdocs-material == 9.5.32",
25 | "mkdocs-awesome-pages-plugin == 2.9.3",
26 | "mkdocs-jupyter == 0.24.8",
27 | "mkdocs_charts_plugin == 0.0.10",
28 | "numpydoc == 1.8.0",
29 | ]
30 |
31 | setuptools.setup(
32 | name="ducksearch",
33 | version=f"{__version__}",
34 | license="MIT",
35 | author="LightOn",
36 | description="DuckSearch: A Python library for efficient search in large collections of text data.",
37 | long_description=long_description,
38 | long_description_content_type="text/markdown",
39 | url="https://github.com/lightonai/ducksearch",
40 | keywords=[],
41 | packages=setuptools.find_packages(),
42 | install_requires=base_packages,
43 | extras_require={
44 | "eval": base_packages + eval,
45 | "dev": base_packages + dev + eval,
46 | },
47 | classifiers=[
48 | "Programming Language :: Python :: 3",
49 | "Operating System :: OS Independent",
50 | ],
51 |     python_requires=">=3.10",
52 | include_package_data=True, # Ensure package data is included
53 | package_data={
54 | # Include all .sql files inside the 'ducksearch' package
55 | "ducksearch": ["**/*.sql"],
56 | },
57 | )
58 |
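Given the extras declared above, the base package installs with `pip install ducksearch`, evaluation support (ranx and BEIR) with `pip install "ducksearch[eval]"`, and the full development toolchain with `pip install "ducksearch[dev]"`.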
--------------------------------------------------------------------------------