├── .github └── workflows │ ├── ruff.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── benchmark.py ├── docs ├── .pages ├── CNAME ├── api │ ├── .pages │ ├── decorators │ │ ├── .pages │ │ ├── connect-to-duckdb.md │ │ └── execute-with-duckdb.md │ ├── evaluation │ │ ├── .pages │ │ ├── evaluate.md │ │ └── load-beir.md │ ├── hf │ │ ├── .pages │ │ └── insert-documents.md │ ├── overview.md │ ├── search │ │ ├── .pages │ │ ├── documents.md │ │ ├── graphs.md │ │ ├── queries.md │ │ ├── search.md │ │ ├── update-index-documents.md │ │ └── update-index-queries.md │ ├── tables │ │ ├── .pages │ │ ├── add-columns-documents.md │ │ ├── create-documents-queries.md │ │ ├── create-documents.md │ │ ├── create-queries.md │ │ ├── create-schema.md │ │ ├── insert-documents-queries.md │ │ ├── insert-documents.md │ │ ├── insert-queries.md │ │ ├── select-documents-columns.md │ │ ├── select-documents.md │ │ └── select-queries.md │ ├── upload │ │ ├── .pages │ │ ├── documents.md │ │ └── queries.md │ └── utils │ │ ├── .pages │ │ ├── ParallelTqdm.md │ │ ├── batchify.md │ │ ├── generate-random-hash.md │ │ ├── get-list-columns-df.md │ │ └── plot.md ├── benchmarks │ ├── .pages │ └── benchmarks.md ├── css │ └── version-select.css ├── documentation │ ├── .pages │ ├── delete.md │ ├── graph.md │ ├── search.md │ ├── update.md │ └── upload.md ├── img │ └── logo.png ├── index.md ├── javascripts │ ├── config.js │ └── tablesort.js ├── js │ └── version-select.js ├── parse │ └── __main__.py └── stylesheets │ └── extra.css ├── ducksearch ├── __init__.py ├── __version__.py ├── decorators │ ├── __init__.py │ └── execute_with_duckdb.py ├── delete │ ├── __init__.py │ ├── delete │ │ ├── documents.sql │ │ ├── documents_queries.sql │ │ └── scores.sql │ ├── documents.py │ └── update │ │ ├── df.sql │ │ ├── docs.sql │ │ ├── scores.sql │ │ ├── stats.sql │ │ └── terms.sql ├── evaluation │ ├── __init__.py │ └── evaluation.py ├── hf │ ├── __init__.py │ ├── drop │ │ └── tmp.sql │ ├── insert.py │ ├── insert │ │ ├── documents.sql │ │ └── tmp.sql │ └── select │ │ ├── columns.sql │ │ ├── count.sql │ │ └── exists.sql ├── search │ ├── __init__.py │ ├── create.py │ ├── create │ │ ├── index.sql │ │ ├── queries_index.sql │ │ ├── settings.sql │ │ ├── stopwords.sql │ │ └── tables.sql │ ├── drop │ │ ├── _documents.sql │ │ ├── queries.sql │ │ ├── schema.sql │ │ └── scores.sql │ ├── graphs.py │ ├── insert │ │ ├── dict.sql │ │ ├── docs.sql │ │ ├── queries.sql │ │ ├── settings.sql │ │ └── terms.sql │ ├── select.py │ ├── select │ │ ├── search.sql │ │ ├── search_filters.sql │ │ ├── search_graph.sql │ │ ├── search_graph_filters.sql │ │ ├── search_order_by.sql │ │ ├── settings.sql │ │ ├── settings_exists.sql │ │ ├── stats.sql │ │ └── termids_to_score.sql │ └── update │ │ ├── bm25id.sql │ │ ├── dict.sql │ │ ├── scores.sql │ │ └── stats.sql ├── tables │ ├── __init__.py │ ├── create.py │ ├── create │ │ ├── documents.sql │ │ ├── documents_queries.sql │ │ ├── queries.sql │ │ └── schema.sql │ ├── insert.py │ ├── insert │ │ ├── documents.sql │ │ ├── documents_queries.sql │ │ ├── fast_documents.sql │ │ └── queries.sql │ ├── select.py │ ├── select │ │ ├── columns.sql │ │ ├── documents.sql │ │ └── queries.sql │ ├── update.py │ └── update │ │ └── documents.sql ├── upload │ ├── __init__.py │ └── upload.py └── utils │ ├── __init__.py │ ├── batch.py │ ├── columns.py │ ├── hash.py │ ├── parralel_tqdm.py │ ├── plot.py │ └── plot │ └── plot.sql ├── mkdocs.yml ├── pytest.ini ├── ruff.toml ├── setup.cfg └── setup.py /.github/workflows/ruff.yml: 
-------------------------------------------------------------------------------- 1 | name: Ruff 2 | on: 3 | push: 4 | branches: [ main ] 5 | pull_request: 6 | branches: [ main ] 7 | jobs: 8 | ruff: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - uses: actions/setup-python@v2 13 | with: 14 | python-version: 3.9 15 | - run: pip install ruff 16 | - run: ruff check . -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Python Tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - '**' 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.10' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install ".[dev]" 21 | 22 | - name: Run tests library 23 | run: | 24 | make tests -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | *.parquet 7 | 8 | *.ipynb 9 | *.duckdb 10 | duckdb_tmp/ 11 | *.block 12 | 13 | evaluation_datasets/ 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 119 | .pdm.toml 120 | .pdm-python 121 | .pdm-build/ 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 171 | #.idea/ 172 | arguana 173 | benchmark_bm25s.py 174 | benchmark_crud.py 175 | climate-fever 176 | fever.tmp/duckdb_temp_block-4611686018432402649.block 177 | fever.tmp/duckdb_temp_block-4611686018432404521.block 178 | fever.tmp/duckdb_temp_block-4611686018432404963.block 179 | fever.tmp/duckdb_temp_storage-4.tmp 180 | metrics.json 181 | metrics_20K.json 182 | metrics_bm25s.json 183 | mmetrics_30K.json 184 | msmarco 185 | nfcorpus 186 | nq 187 | quora 188 | scidocs 189 | scifact 190 | trec-covid 191 | webis-touche2020 192 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 LightOn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | DIALECT := duckdb 2 | 3 | fix: 4 | sqlfluff fix --dialect $(DIALECT) 5 | 6 | lint: 7 | sqlfluff lint --dialect $(DIALECT) 8 | 9 | tests: 10 | @echo "Removing test.duckdb if it exists..." 11 | rm -rf test.duckdb 12 | rm -rf test.duckdb.wal 13 | pytest ducksearch/tables/create.py --disable-warnings 14 | pytest ducksearch/tables/insert.py --disable-warnings 15 | pytest ducksearch/tables/select.py --disable-warnings 16 | rm -rf test.duckdb 17 | rm -rf test.duckdb.wal 18 | pytest ducksearch/hf/insert.py --disable-warnings 19 | rm -rf test.duckdb 20 | rm -rf test.duckdb.wal 21 | pytest ducksearch/evaluation/evaluation.py --disable-warnings 22 | rm -rf test.duckdb 23 | rm -rf test.duckdb.wal 24 | pytest ducksearch/search/create.py --disable-warnings 25 | pytest ducksearch/search/select.py --disable-warnings 26 | rm -rf test.duckdb 27 | rm -rf test.duckdb.wal 28 | pytest ducksearch/search/graphs.py --disable-warnings 29 | rm -rf test.duckdb 30 | rm -rf test.duckdb.wal 31 | 32 | view: 33 | harlequin test.duckdb 34 | 35 | livedoc: 36 | python docs/parse 37 | mkdocs build --clean 38 | mkdocs serve --dirtyreload 39 | 40 | deploydoc: 41 | mkdocs gh-deploy --force -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
# DuckSearch

Efficient BM25 with DuckDB 🦆
DuckSearch is a lightweight and easy-to-use library to search documents. DuckSearch is built on top of DuckDB, a high-performance analytical database. DuckDB is designed to execute analytical SQL queries fast, and DuckSearch leverages this to provide efficient search and filtering features. The DuckSearch index can be updated with new documents, and documents can be deleted as well. DuckSearch also supports HuggingFace datasets, allowing you to index datasets directly from the HuggingFace Hub.

## Installation

Install DuckSearch using pip:

```bash
pip install ducksearch
```

## Documentation

The complete documentation is available [here](https://lightonai.github.io/ducksearch/), which includes in-depth guides, examples, and API references.

### Upload

We can upload documents to DuckDB using the `upload.documents` function. The documents are stored in a DuckDB database, and the `fields` are indexed with BM25.

```python
from ducksearch import upload

documents = [
    {
        "id": 0,
        "title": "Hotel California",
        "style": "rock",
        "date": "1977-02-22",
        "popularity": 9,
    },
    {
        "id": 1,
        "title": "Here Comes the Sun",
        "style": "rock",
        "date": "1969-06-10",
        "popularity": 10,
    },
    {
        "id": 2,
        "title": "Alive",
        "style": "electro, punk",
        "date": "2007-11-19",
        "popularity": 9,
    },
]

upload.documents(
    database="ducksearch.duckdb",
    key="id",  # Unique document identifier
    fields=["title", "style"],  # List of fields to use for search.
    documents=documents,
    dtypes={
        "date": "DATE",
        "popularity": "INT",
    },
)
```

## Search

`search.documents` returns a list of lists of documents, one list per query, ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example searches for the queries "punk" and "california" while filtering the results to documents dated 1970 or later with a popularity score greater than 8, and orders the results by a weighted sum of the BM25 score and the popularity score provided in the document.

```python
from ducksearch import search

search.documents(
    database="ducksearch.duckdb",
    queries=["punk", "california"],
    top_k=10,
    filters="YEAR(date) >= 1970 AND popularity > 8",
    order_by="0.8 * score + 0.2 * popularity DESC",
)
```

```python
[
    [
        {
            "id": "2",
            "title": "Alive",
            "style": "electro, punk",
            "date": Timestamp("2007-11-19 00:00:00"),
            "popularity": 9,
            "score": 0.17841622233390808,
        }
    ],
    [
        {
            "id": "0",
            "title": "Hotel California",
            "style": "rock",
            "date": Timestamp("1977-02-22 00:00:00"),
            "popularity": 9,
            "score": 0.156318798661232,
        }
    ],
]
```

Filters are SQL expressions applied to the search results. We can use every filtering function DuckDB provides, such as its [date functions](https://duckdb.org/docs/sql/functions/date).

Both the `filters` and `order_by` parameters are optional. If they are not provided, the results are ordered by BM25 relevance and no filters are applied.

## Delete and update index

We can delete documents and update the BM25 weights accordingly using the `delete.documents` function.

```python
from ducksearch import delete

delete.documents(
    database="ducksearch.duckdb",
    ids=[0, 1],
)
```

To update the index, we should first delete the documents and then upload the updated versions.
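As a minimal sketch of that update flow, re-using the `ducksearch.duckdb` database from the upload example above:

```python
from ducksearch import delete, upload

# Remove the outdated version of the document.
delete.documents(
    database="ducksearch.duckdb",
    ids=[0],
)

# Re-upload the corrected document; the BM25 index is refreshed on upload.
upload.documents(
    database="ducksearch.duckdb",
    key="id",
    fields=["title", "style"],
    documents=[
        {
            "id": 0,
            "title": "Hotel California",
            "style": "rock",
            "date": "1977-02-22",
            "popularity": 9,
        }
    ],
    dtypes={"date": "DATE", "popularity": "INT"},
)
```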
## Extra features

### HuggingFace

The `upload.documents` function can also index HuggingFace datasets directly from a URL. The following example demonstrates how to index the FineWeb dataset from HuggingFace. We use the fields "text" and "url" for search, and we specify the data types of the "date", "token_count", and "language_score" fields so we can filter the results on them.

```python
from ducksearch import upload

upload.documents(
    database="fineweb.duckdb",
    key="id",
    fields=["text", "url"],
    documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
    dtypes={
        "date": "DATE",
        "token_count": "INT",
        "language_score": "FLOAT",
    },
    limit=3000,  # demonstrate with a small dataset
)
```

We can then search the FineWeb dataset with the `search.documents` function. We order the results by BM25 score and then by date.

```python
from ducksearch import search

search.documents(
    database="fineweb.duckdb",
    queries=["earth science"],
    top_k=2,
    order_by="score DESC, date DESC",
)
```

```python
[
    [
        {
            "id": "",
            "text": "Earth Science Tutors in Rowland...",
            "id_1": "",
            "dump": "CC-MAIN-2017-34",
            "url": "http://rowland.universitytutor.com/rowland_earth-science-tutoring",
            "date": Timestamp("2017-08-19 00:00:00"),
            "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2017-34/segments/1502886105304.35/warc/CC-MAIN-20170819051034-20170819071034-00240.warc.gz",
            "language": "en",
            "language_score": 0.8718525171279907,
            "token_count": 313,
            "bm25id": 523,
            "score": 2.3761106729507446,
        },
        {
            "id": "",
            "text": "- Geomagnetic field....",
            "id_1": "",
            "dump": "CC-MAIN-2022-21",
            "url": "https://www.imperial.ac.uk/people/adrian.muxworthy/?respub-action=citation.html&id=1149861&noscript=noscript",
            "date": Timestamp("2022-05-20 00:00:00"),
            "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2022-21/segments/1652662530553.34/warc/CC-MAIN-20220519235259-20220520025259-00601.warc.gz",
            "language": "en",
            "language_score": 0.8225595951080322,
            "token_count": 517,
            "bm25id": 4783,
            "score": 2.3569871187210083,
        },
    ]
]
```

Note: by default, results are ordered by BM25 relevance.

## Tables

DuckSearch creates two distinct schemas: `bm25_tables` and `bm25_documents`.

- We can find the uploaded documents in the `bm25_tables.documents` table.

- We can find the inverted index in the `bm25_documents.scores` table. You can update the scores as you wish; just note that token scores are recomputed each time you upload documents (for every token mentioned in the set of uploaded documents).

- We can update the set of stopwords in the `bm25_documents.stopwords` table, as sketched below.
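As a sketch, these tables can be inspected and edited with plain SQL through the `duckdb` Python client. This assumes the `ducksearch.duckdb` database created above, and that the stopwords table holds a single string column (its exact layout is not documented here):

```python
import duckdb

with duckdb.connect("ducksearch.duckdb") as connection:
    # Peek at a few of the uploaded documents.
    print(connection.sql("SELECT * FROM bm25_tables.documents LIMIT 3"))

    # Assumed single-column layout: register a custom stopword.
    connection.sql("INSERT INTO bm25_documents.stopwords VALUES ('the')")
```

Any client that speaks DuckDB works just as well, for instance `harlequin`, which the repository's `make view` target uses.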
## Benchmark

| Dataset | ndcg@10 | hits@1 | hits@10 | mrr@10 | map@10 | r-precision | qps | Indexation Time (s) | Number of Documents and Queries |
|-------------------|---------|--------|---------|--------|--------|-------------|---------|---------------------|--------------------------------|
| arguana | 0.3779 | 0.0 | 0.8267 | 0.2491 | 0.2528 | 0.0108 | 117.80 | 1.42 | 1,406 queries, 8.67K documents |
| climate-fever | 0.1184 | 0.1068 | 0.3648 | 0.1644 | 0.0803 | 0.0758 | 5.88 | 302.39 | 1,535 queries, 5.42M documents |
| dbpedia-entity | 0.6046 | 0.7669 | 5.6241 | 0.8311 | 0.0649 | 0.0741 | 113.20 | 181.42 | 400 queries, 4.63M documents |
| fever | 0.3861 | 0.2583 | 0.5826 | 0.3525 | 0.3329 | 0.2497 | 74.40 | 329.70 | 6,666 queries, 5.42M documents |
| fiqa | 0.2445 | 0.2207 | 0.6790 | 0.3002 | 0.1848 | 0.1594 | 545.77 | 6.04 | 648 queries, 57K documents |
| hotpotqa | 0.4487 | 0.5059 | 0.9699 | 0.5846 | 0.3642 | 0.3388 | 48.15 | 163.14 | 7,405 queries, 5.23M documents |
| msmarco | 0.8951 | 1.0 | 8.6279 | 1.0 | 0.0459 | 0.0473 | 35.11 | 202.37 | 6,980 queries, 8.84M documents |
| nfcorpus | 0.3301 | 0.4396 | 2.4087 | 0.5292 | 0.1233 | 0.1383 | 3464.66 | 0.99 | 323 queries, 3.6K documents |
| nq | 0.2451 | 0.1272 | 0.4574 | 0.2099 | 0.1934 | 0.1240 | 150.23 | 71.43 | 3,452 queries, 2.68M documents |
| quora | 0.7705 | 0.6783 | 1.1749 | 0.7606 | 0.7206 | 0.6502 | 741.13 | 3.78 | 10,000 queries, 523K documents |
| scidocs | 0.1025 | 0.1790 | 0.8240 | 0.2754 | 0.0154 | 0.0275 | 879.11 | 4.46 | 1,000 queries, 25K documents |
| scifact | 0.6908 | 0.5533 | 0.9133 | 0.6527 | 0.6416 | 0.5468 | 2153.64 | 1.22 | 300 queries, 5K documents |
| trec-covid | 0.9533 | 1.0 | 9.4800 | 1.0 | 0.0074 | 0.0077 | 112.38 | 22.15 | 50 queries, 171K documents |
| webis-touche2020 | 0.4130 | 0.5510 | 3.7347 | 0.7114 | 0.0564 | 0.0827 | 104.65 | 44.14 | 49 queries, 382K documents |

## References

- [DuckDB](https://duckdb.org/)

- [DuckDB Full Text Search](https://duckdb.org/docs/extensions/full_text_search.html): note that DuckSearch relies partially on the DuckDB Full Text Search extension but accelerates the search process via `top_k_token` approximation, pre-computation of scores, and multi-threading.

## License

DuckSearch is released under the MIT license.
247 | 248 | ## Citation 249 | 250 | ``` 251 | @misc{DuckSearch, 252 | title={DuckSearch, efficient search with DuckDB}, 253 | author={Sourty, Raphael}, 254 | url={https://github.com/lightonai/ducksearch}, 255 | year={2024} 256 | } 257 | ``` 258 | -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | from nltk import download 2 | from nltk.corpus import stopwords 3 | 4 | from ducksearch import evaluation, search, upload 5 | 6 | download("stopwords") 7 | 8 | stopword = list(stopwords.words("english")) 9 | 10 | dataset_name = "quora" 11 | 12 | documents, queries, qrels = evaluation.load_beir( 13 | dataset_name=dataset_name, 14 | split="test", 15 | ) 16 | 17 | upload.documents( 18 | database=dataset_name, 19 | documents=documents, 20 | key="id", 21 | fields=["title", "text"], 22 | stopwords=stopword, 23 | ) 24 | 25 | scores = search.documents( 26 | database=dataset_name, 27 | queries=queries, 28 | top_k=10, 29 | top_k_token=30_000, 30 | batch_size=32, 31 | ) 32 | 33 | evaluation_scores = evaluation.evaluate( 34 | scores=scores, 35 | qrels=qrels, 36 | queries=queries, 37 | metrics=["ndcg@10", "hits@1", "hits@10", "mrr@10", "map@10", "r-precision"], 38 | ) 39 | 40 | print(evaluation_scores) 41 | -------------------------------------------------------------------------------- /docs/.pages: -------------------------------------------------------------------------------- 1 | nav: 2 | - documentation 3 | - benchmarks 4 | - api -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | lightonai.github.io/ducksearch/ -------------------------------------------------------------------------------- /docs/api/.pages: -------------------------------------------------------------------------------- 1 | title: API reference 2 | arrange: 3 | - overview.md 4 | - ... 5 | -------------------------------------------------------------------------------- /docs/api/decorators/.pages: -------------------------------------------------------------------------------- 1 | title: decorators -------------------------------------------------------------------------------- /docs/api/decorators/connect-to-duckdb.md: -------------------------------------------------------------------------------- 1 | # connect_to_duckdb 2 | 3 | Establish a connection to the DuckDB database. Retry connecting if an error occurs. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name or path of the DuckDB database to connect to. 12 | 13 | - **read_only** (*bool*) – defaults to `False` 14 | 15 | Whether to open the database in read-only mode. Default is False. 16 | 17 | - **config** (*dict | None*) – defaults to `None` 18 | 19 | Optional configuration settings for the DuckDB connection. 20 | 21 | - **max_retry** (*int*) – defaults to `20` 22 | 23 | The maximum number of times to retry connecting to DuckDB. 24 | 25 | - **sleep_time** (*float*) – defaults to `0.1` 26 | 27 | The time to sleep between retries. 28 | 29 | - **kwargs** 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /docs/api/decorators/execute-with-duckdb.md: -------------------------------------------------------------------------------- 1 | # execute_with_duckdb 2 | 3 | Decorator to execute a SQL query using DuckDB. 
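A minimal sketch of how the decorator can be applied; the SQL path and the calling convention below are illustrative assumptions rather than documented API:

```python
from ducksearch.decorators import execute_with_duckdb

@execute_with_duckdb(
    relative_path="tables/select/documents.sql",  # assumed to be resolved relative to the package
    read_only=True,
    fetch_df=True,
)
def _select_documents():
    """Stub body: the decorator loads and executes the SQL file."""

# Keyword arguments are forwarded to the query for string formatting
# (the exact call signature is an assumption).
rows = _select_documents(database="test.duckdb", schema="bm25_tables")
```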
4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **relative_path** (*str | list[str]*) 10 | 11 | A string or list of strings specifying the path(s) to the SQL file(s). 12 | 13 | - **read_only** (*bool*) – defaults to `False` 14 | 15 | Whether the DuckDB connection should be read-only. Default is False. 16 | 17 | - **fields** (*list[str] | None*) – defaults to `None` 18 | 19 | A list of fields to use as keys for the result rows if returning records. 20 | 21 | - **fetch_df** (*bool*) – defaults to `False` 22 | 23 | If True, fetch the result as a pandas DataFrame and return it as a list of dictionaries. 24 | 25 | - **kwargs** 26 | 27 | Additional keyword arguments to be passed to the SQL query, useful for string formatting. 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/api/evaluation/.pages: -------------------------------------------------------------------------------- 1 | title: evaluation -------------------------------------------------------------------------------- /docs/api/evaluation/evaluate.md: -------------------------------------------------------------------------------- 1 | # evaluate 2 | 3 | Evaluate the performance of document retrieval using relevance judgments. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **scores** (*list[list[dict]]*) 10 | 11 | A list of lists, where each sublist contains dictionaries representing the retrieved documents for a query. 12 | 13 | - **qrels** (*dict*) 14 | 15 | A dictionary mapping queries to relevant documents and their relevance scores. 16 | 17 | - **queries** (*list[str]*) 18 | 19 | A list of queries. 20 | 21 | - **metrics** (*list*) – defaults to `[]` 22 | 23 | A list of metrics to compute. Default includes "ndcg@10" and hits at various levels (e.g., hits@1, hits@10). 24 | 25 | 26 | 27 | ## Examples 28 | 29 | ```python 30 | >>> from ducksearch import evaluation, upload, search 31 | 32 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test") 33 | 34 | >>> upload.documents( 35 | ... database="test.duckdb", 36 | ... key="id", 37 | ... fields=["title", "text"], 38 | ... documents=documents, 39 | ... ) 40 | | Table | Size | 41 | |----------------|------| 42 | | documents | 5183 | 43 | | bm25_documents | 5183 | 44 | 45 | >>> scores = search.documents( 46 | ... database="test.duckdb", 47 | ... queries=queries, 48 | ... top_k=10, 49 | ... ) 50 | ``` 51 | 52 | -------------------------------------------------------------------------------- /docs/api/evaluation/load-beir.md: -------------------------------------------------------------------------------- 1 | # load_beir 2 | 3 | Load BEIR dataset for document and query retrieval tasks. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **dataset_name** (*str*) 10 | 11 | The name of the dataset to load (e.g., 'scifact'). 12 | 13 | - **split** (*str*) – defaults to `test` 14 | 15 | The dataset split to load (e.g., 'test'). 
16 | 17 | 18 | 19 | ## Examples 20 | 21 | ```python 22 | >>> documents, queries, qrels = load_beir("scifact", split="test") 23 | 24 | >>> len(documents) 25 | 5183 26 | 27 | >>> len(queries) 28 | 300 29 | ``` 30 | 31 | -------------------------------------------------------------------------------- /docs/api/hf/.pages: -------------------------------------------------------------------------------- 1 | title: hf -------------------------------------------------------------------------------- /docs/api/hf/insert-documents.md: -------------------------------------------------------------------------------- 1 | # insert_documents 2 | 3 | Insert documents from a Hugging Face dataset into DuckDB. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The schema in which the documents table is located. 16 | 17 | - **key** (*str*) 18 | 19 | The key field that uniquely identifies each document (e.g., 'query_id'). 20 | 21 | - **url** (*str*) 22 | 23 | The URL of the Hugging Face dataset in Parquet format. 24 | 25 | - **config** (*dict | None*) – defaults to `None` 26 | 27 | Optional configuration options for the DuckDB connection. 28 | 29 | - **limit** (*int | None*) – defaults to `None` 30 | 31 | - **dtypes** (*dict | None*) – defaults to `None` 32 | 33 | 34 | 35 | ## Examples 36 | 37 | ```python 38 | >>> from ducksearch import upload 39 | 40 | >>> upload.documents( 41 | ... database="test.duckdb", 42 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/queries.parquet", 43 | ... key="query_id", 44 | ... fields=["query_id", "text"], 45 | ... ) 46 | | Table | Size | 47 | |----------------|------| 48 | | documents | 19 | 49 | | bm25_documents | 19 | 50 | 51 | >>> upload.documents( 52 | ... database="test.duckdb", 53 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/documents.parquet", 54 | ... key="document_id", 55 | ... fields=["document_id", "text"], 56 | ... 
) 57 | | Table | Size | 58 | |----------------|------| 59 | | documents | 51 | 60 | | bm25_documents | 51 | 61 | ``` 62 | 63 | -------------------------------------------------------------------------------- /docs/api/overview.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | ## decorators 4 | 5 | - [connect_to_duckdb](../decorators/connect-to-duckdb) 6 | - [execute_with_duckdb](../decorators/execute-with-duckdb) 7 | 8 | ## evaluation 9 | 10 | - [evaluate](../evaluation/evaluate) 11 | - [load_beir](../evaluation/load-beir) 12 | 13 | ## hf 14 | 15 | - [insert_documents](../hf/insert-documents) 16 | 17 | ## search 18 | 19 | - [documents](../search/documents) 20 | - [graphs](../search/graphs) 21 | - [queries](../search/queries) 22 | - [search](../search/search) 23 | - [update_index_documents](../search/update-index-documents) 24 | - [update_index_queries](../search/update-index-queries) 25 | 26 | ## tables 27 | 28 | - [add_columns_documents](../tables/add-columns-documents) 29 | - [create_documents](../tables/create-documents) 30 | - [create_documents_queries](../tables/create-documents-queries) 31 | - [create_queries](../tables/create-queries) 32 | - [create_schema](../tables/create-schema) 33 | - [insert_documents](../tables/insert-documents) 34 | - [insert_documents_queries](../tables/insert-documents-queries) 35 | - [insert_queries](../tables/insert-queries) 36 | - [select_documents](../tables/select-documents) 37 | - [select_documents_columns](../tables/select-documents-columns) 38 | - [select_queries](../tables/select-queries) 39 | 40 | ## upload 41 | 42 | - [documents](../upload/documents) 43 | - [queries](../upload/queries) 44 | 45 | ## utils 46 | 47 | 48 | **Classes** 49 | 50 | - [ParallelTqdm](../utils/ParallelTqdm) 51 | 52 | **Functions** 53 | 54 | - [batchify](../utils/batchify) 55 | - [generate_random_hash](../utils/generate-random-hash) 56 | - [get_list_columns_df](../utils/get-list-columns-df) 57 | - [plot](../utils/plot) 58 | 59 | -------------------------------------------------------------------------------- /docs/api/search/.pages: -------------------------------------------------------------------------------- 1 | title: search -------------------------------------------------------------------------------- /docs/api/search/documents.md: -------------------------------------------------------------------------------- 1 | # documents 2 | 3 | Search for documents in the documents table using specified queries. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **queries** (*str | list[str]*) 14 | 15 | A string or list of query strings to search for. 16 | 17 | - **batch_size** (*int*) – defaults to `32` 18 | 19 | The batch size for query processing. 20 | 21 | - **top_k** (*int*) – defaults to `10` 22 | 23 | The number of top documents to retrieve for each query. 24 | 25 | - **top_k_token** (*int*) – defaults to `30000` 26 | 27 | The number of documents to score per token. 28 | 29 | - **n_jobs** (*int*) – defaults to `-1` 30 | 31 | The number of parallel jobs to use. Default use all available processors. 32 | 33 | - **config** (*dict | None*) – defaults to `None` 34 | 35 | Optional configuration for DuckDB connection settings. 36 | 37 | - **filters** (*str | None*) – defaults to `None` 38 | 39 | Optional SQL filters to apply during the search. 
40 | 41 | - **order_by** (*str | None*) – defaults to `None` 42 | 43 | - **tqdm_bar** (*bool*) – defaults to `True` 44 | 45 | Whether to display a progress bar when searching. 46 | 47 | 48 | 49 | ## Examples 50 | 51 | ```python 52 | >>> from ducksearch import evaluation, upload, search 53 | 54 | >>> documents, queries, qrels = evaluation.load_beir( 55 | ... "scifact", 56 | ... split="test", 57 | ... ) 58 | 59 | >>> scores = search.documents( 60 | ... database="test.duckdb", 61 | ... queries=queries, 62 | ... top_k_token=1000, 63 | ... ) 64 | ``` 65 | 66 | -------------------------------------------------------------------------------- /docs/api/search/graphs.md: -------------------------------------------------------------------------------- 1 | # graphs 2 | 3 | Search for graphs in DuckDB using the provided queries. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **queries** (*str | list[str]*) 14 | 15 | A string or list of query strings to search for. 16 | 17 | - **batch_size** (*int*) – defaults to `30` 18 | 19 | The batch size for processing queries. 20 | 21 | - **top_k** (*int*) – defaults to `1000` 22 | 23 | The number of top documents to retrieve for each query. 24 | 25 | - **top_k_token** (*int*) – defaults to `30000` 26 | 27 | The number of top tokens to retrieve. 28 | 29 | - **n_jobs** (*int*) – defaults to `-1` 30 | 31 | The number of parallel jobs to use. Default use all available processors. 32 | 33 | - **config** (*dict | None*) – defaults to `None` 34 | 35 | Optional configuration settings for the DuckDB connection. 36 | 37 | - **filters** (*str | None*) – defaults to `None` 38 | 39 | Optional SQL filters to apply during the search. 40 | 41 | - **tqdm_bar** (*bool*) – defaults to `True` 42 | 43 | 44 | 45 | ## Examples 46 | 47 | ```python 48 | >>> from ducksearch import evaluation, upload, search 49 | 50 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="train") 51 | 52 | >>> upload.documents( 53 | ... database="test.duckdb", 54 | ... key="id", 55 | ... fields=["title", "text"], 56 | ... documents=documents, 57 | ... ) 58 | | Table | Size | 59 | |----------------|------| 60 | | documents | 5183 | 61 | | bm25_documents | 5183 | 62 | 63 | >>> upload.queries( 64 | ... database="test.duckdb", 65 | ... queries=queries, 66 | ... documents_queries=qrels, 67 | ... ) 68 | | Table | Size | 69 | |-------------------|------| 70 | | documents | 5183 | 71 | | queries | 807 | 72 | | bm25_documents | 5183 | 73 | | bm25_queries | 807 | 74 | | documents_queries | 916 | 75 | ``` 76 | 77 | -------------------------------------------------------------------------------- /docs/api/search/queries.md: -------------------------------------------------------------------------------- 1 | # queries 2 | 3 | Search for queries in the queries table using specified queries. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **queries** (*str | list[str]*) 14 | 15 | A string or list of query strings to search for. 16 | 17 | - **batch_size** (*int*) – defaults to `32` 18 | 19 | The batch size for query processing. 20 | 21 | - **top_k** (*int*) – defaults to `10` 22 | 23 | The number of top matching queries to retrieve. 24 | 25 | - **top_k_token** (*int*) – defaults to `30000` 26 | 27 | The number of documents to score per token. 28 | 29 | - **n_jobs** (*int*) – defaults to `-1` 30 | 31 | The number of parallel jobs to use. 
Default use all available processors. 32 | 33 | - **config** (*dict | None*) – defaults to `None` 34 | 35 | Optional configuration for DuckDB connection settings. 36 | 37 | - **filters** (*str | None*) – defaults to `None` 38 | 39 | Optional SQL filters to apply during the search. 40 | 41 | - **tqdm_bar** (*bool*) – defaults to `True` 42 | 43 | 44 | 45 | ## Examples 46 | 47 | ```python 48 | >>> from ducksearch import evaluation, upload, search 49 | 50 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test") 51 | 52 | >>> scores = search.queries(database="test.duckdb", queries=queries) 53 | 54 | >>> n = sum(1 for sample, query in zip(scores, queries) if sample[0]["query"] == query) 55 | >>> assert n >= 290 56 | ``` 57 | 58 | -------------------------------------------------------------------------------- /docs/api/search/search.md: -------------------------------------------------------------------------------- 1 | # search 2 | 3 | Run the search for documents or queries in parallel. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The name of the schema containing the indexed documents or queries. 16 | 17 | - **source_schema** (*str*) 18 | 19 | The name of the schema containing the original documents or queries. 20 | 21 | - **source** (*str*) 22 | 23 | The table to search (either 'documents' or 'queries'). 24 | 25 | - **queries** (*str | list[str]*) 26 | 27 | A string or list of query strings to search for. 28 | 29 | - **batch_size** (*int*) – defaults to `64` 30 | 31 | The batch size for query processing. 32 | 33 | - **top_k** (*int*) – defaults to `10` 34 | 35 | The number of top results to retrieve for each query. 36 | 37 | - **top_k_token** (*int*) – defaults to `30000` 38 | 39 | The number of documents to score per token. 40 | 41 | - **n_jobs** (*int*) – defaults to `-1` 42 | 43 | The number of parallel jobs to use. Default use available processors. 44 | 45 | - **config** (*dict | None*) – defaults to `None` 46 | 47 | Optional configuration for DuckDB connection settings. 48 | 49 | - **filters** (*str | None*) – defaults to `None` 50 | 51 | Optional SQL filters to apply during the search. 52 | 53 | - **order_by** (*str | None*) – defaults to `None` 54 | 55 | - **tqdm_bar** (*bool*) – defaults to `True` 56 | 57 | Whether to display a progress bar when searching. 58 | 59 | 60 | 61 | ## Examples 62 | 63 | ```python 64 | >>> from ducksearch import search 65 | 66 | >>> documents = search.search( 67 | ... database="test.duckdb", 68 | ... source_schema="bm25_tables", 69 | ... schema="bm25_documents", 70 | ... source="documents", 71 | ... queries="random query", 72 | ... top_k_token=10_000, 73 | ... top_k=10, 74 | ... ) 75 | 76 | >>> assert len(documents) == 10 77 | ``` 78 | 79 | -------------------------------------------------------------------------------- /docs/api/search/update-index-documents.md: -------------------------------------------------------------------------------- 1 | # update_index_documents 2 | 3 | Update the BM25 search index for documents. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **fields** (*list[str]*) 14 | 15 | The fields to index for each document. 16 | 17 | - **k1** (*float*) – defaults to `1.5` 18 | 19 | The BM25 k1 parameter, controls term saturation. 20 | 21 | - **b** (*float*) – defaults to `0.75` 22 | 23 | The BM25 b parameter, controls document length normalization. 
24 | 25 | - **stemmer** (*str*) – defaults to `porter` 26 | 27 | The stemming algorithm to use (e.g., 'porter'). 28 | 29 | - **stopwords** (*str | list[str]*) – defaults to `None` 30 | 31 | The list of stopwords to exclude from indexing. Can be a list or a string specifying the language (e.g., "english"). 32 | 33 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+` 34 | 35 | A regex pattern to ignore characters during tokenization. Default ignores punctuation and non-alphabetic characters. 36 | 37 | - **strip_accents** (*bool*) – defaults to `True` 38 | 39 | Whether to remove accents from characters during indexing. 40 | 41 | - **lower** (*bool*) – defaults to `True` 42 | 43 | - **batch_size** (*int*) – defaults to `10000` 44 | 45 | The number of documents to process per batch. 46 | 47 | - **config** (*dict | None*) – defaults to `None` 48 | 49 | Optional configuration settings for the DuckDB connection. 50 | 51 | 52 | 53 | ## Examples 54 | 55 | ```python 56 | >>> from ducksearch import evaluation, upload, search 57 | 58 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test") 59 | 60 | >>> upload.documents( 61 | ... database="test.duckdb", 62 | ... key="id", 63 | ... fields=["title", "text"], 64 | ... documents=documents, 65 | ... stopwords=["larva"], 66 | ... ) 67 | | Table | Size | 68 | |----------------|------| 69 | | documents | 5183 | 70 | | bm25_documents | 5183 | 71 | ``` 72 | 73 | -------------------------------------------------------------------------------- /docs/api/search/update-index-queries.md: -------------------------------------------------------------------------------- 1 | # update_index_queries 2 | 3 | Update the BM25 search index for queries. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **k1** (*float*) – defaults to `1.5` 14 | 15 | The BM25 k1 parameter, controls term saturation. 16 | 17 | - **b** (*float*) – defaults to `0.75` 18 | 19 | The BM25 b parameter, controls document length normalization. 20 | 21 | - **stemmer** (*str*) – defaults to `porter` 22 | 23 | The stemming algorithm to use (e.g., 'porter'). 24 | 25 | - **stopwords** (*str | list[str]*) – defaults to `None` 26 | 27 | The list of stopwords to exclude from indexing. Can be a list or a string specifying the language (e.g., "english"). 28 | 29 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+` 30 | 31 | A regex pattern to ignore characters during tokenization. Default ignores punctuation and non-alphabetic characters. 32 | 33 | - **strip_accents** (*bool*) – defaults to `True` 34 | 35 | Whether to remove accents from characters during indexing. 36 | 37 | - **lower** (*bool*) – defaults to `True` 38 | 39 | - **batch_size** (*int*) – defaults to `10000` 40 | 41 | The number of queries to process per batch. 42 | 43 | - **config** (*dict | None*) – defaults to `None` 44 | 45 | Optional configuration settings for the DuckDB connection. 46 | 47 | 48 | 49 | ## Examples 50 | 51 | ```python 52 | >>> from ducksearch import evaluation, upload, search 53 | 54 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test") 55 | 56 | >>> upload.queries( 57 | ... database="test.duckdb", 58 | ... queries=queries, 59 | ... documents_queries=qrels, 60 | ... 
) 61 | | Table | Size | 62 | |-------------------|------| 63 | | documents | 5183 | 64 | | queries | 300 | 65 | | bm25_documents | 5183 | 66 | | bm25_queries | 300 | 67 | | documents_queries | 339 | 68 | ``` 69 | 70 | -------------------------------------------------------------------------------- /docs/api/tables/.pages: -------------------------------------------------------------------------------- 1 | title: tables -------------------------------------------------------------------------------- /docs/api/tables/add-columns-documents.md: -------------------------------------------------------------------------------- 1 | # add_columns_documents 2 | 3 | Add columns to the documents table in the DuckDB database. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | - **schema** (*str*) 12 | 13 | - **columns** (*list[str] | str*) 14 | 15 | - **dtypes** (*dict*) – defaults to `None` 16 | 17 | - **config** (*dict*) – defaults to `None` 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /docs/api/tables/create-documents-queries.md: -------------------------------------------------------------------------------- 1 | # create_documents_queries 2 | 3 | Create the documents_queries table in the DuckDB database. 4 | 5 | 6 | 7 | 8 | 9 | ## Examples 10 | 11 | ```python 12 | >>> from ducksearch import tables 13 | 14 | >>> tables.create_schema( 15 | ... database="test.duckdb", 16 | ... schema="bm25_tables" 17 | ... ) 18 | 19 | >>> tables.create_documents_queries( 20 | ... database="test.duckdb", 21 | ... schema="bm25_tables", 22 | ... ) 23 | ``` 24 | 25 | -------------------------------------------------------------------------------- /docs/api/tables/create-documents.md: -------------------------------------------------------------------------------- 1 | # create_documents 2 | 3 | Create the documents table in the DuckDB database. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | - **schema** (*str*) 12 | 13 | - **columns** (*str | list[str]*) 14 | 15 | - **dtypes** (*dict[str, str] | None*) – defaults to `None` 16 | 17 | - **config** (*dict | None*) – defaults to `None` 18 | 19 | 20 | 21 | ## Examples 22 | 23 | ```python 24 | >>> from ducksearch import tables 25 | 26 | >>> tables.create_schema( 27 | ... database="test.duckdb", 28 | ... schema="bm25_tables" 29 | ... ) 30 | 31 | >>> tables.create_documents( 32 | ... database="test.duckdb", 33 | ... schema="bm25_tables", 34 | ... columns=["title", "text"], 35 | ... dtypes={"text": "VARCHAR", "title": "VARCHAR"}, 36 | ... ) 37 | 38 | >>> df = [ 39 | ... {"id": 1, "title": "title document 1", "text": "text document 1"}, 40 | ... {"id": 2, "title": "title document 2", "text": "text document 2"}, 41 | ... {"id": 3, "title": "title document 3", "text": "text document 3"}, 42 | ... ] 43 | 44 | >>> tables.insert_documents( 45 | ... database="test.duckdb", 46 | ... schema="bm25_tables", 47 | ... key="id", 48 | ... df=df, 49 | ... columns=["title", "text"], 50 | ... ) 51 | ``` 52 | 53 | -------------------------------------------------------------------------------- /docs/api/tables/create-queries.md: -------------------------------------------------------------------------------- 1 | # create_queries 2 | 3 | Create the queries table in the DuckDB database. 4 | 5 | 6 | 7 | 8 | 9 | ## Examples 10 | 11 | ```python 12 | >>> from ducksearch import tables 13 | 14 | >>> tables.create_schema( 15 | ... database="test.duckdb", 16 | ... schema="bm25_tables" 17 | ... 
) 18 | 19 | >>> tables.create_queries( 20 | ... database="test.duckdb", 21 | ... schema="bm25_tables", 22 | ... ) 23 | ``` 24 | 25 | -------------------------------------------------------------------------------- /docs/api/tables/create-schema.md: -------------------------------------------------------------------------------- 1 | # create_schema 2 | 3 | Create the specified schema in the DuckDB database. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | - **schema** (*str*) 12 | 13 | - **config** (*dict | None*) – defaults to `None` 14 | 15 | 16 | 17 | ## Examples 18 | 19 | ```python 20 | >>> from ducksearch import tables 21 | 22 | >>> tables.create_schema( 23 | ... database="test.duckdb", 24 | ... schema="bm25_tables", 25 | ... ) 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /docs/api/tables/insert-documents-queries.md: -------------------------------------------------------------------------------- 1 | # insert_documents_queries 2 | 3 | Insert interactions between documents and queries into the documents_queries table. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The schema in which the documents_queries table is located. 16 | 17 | - **documents_queries** (*dict[dict[str, float]]*) 18 | 19 | A dictionary mapping document IDs to queries and their corresponding scores. 20 | 21 | - **config** (*dict | None*) – defaults to `None` 22 | 23 | Optional configuration options for the DuckDB connection. 24 | 25 | 26 | 27 | ## Examples 28 | 29 | ```python 30 | >>> from ducksearch import tables 31 | 32 | >>> documents_queries = { 33 | ... "1": {"query 1": 0.9, "query 2": 0.8}, 34 | ... "2": {"query 2": 0.9, "query 3": 3}, 35 | ... "3": {"query 1": 0.9, "query 3": 0.5}, 36 | ... } 37 | 38 | >>> tables.insert_documents_queries( 39 | ... database="test.duckdb", 40 | ... schema="bm25_tables", 41 | ... documents_queries=documents_queries 42 | ... ) 43 | ``` 44 | 45 | -------------------------------------------------------------------------------- /docs/api/tables/insert-documents.md: -------------------------------------------------------------------------------- 1 | # insert_documents 2 | 3 | Insert documents into the documents table with optional multi-threading. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The schema in which the documents table is located. 16 | 17 | - **df** (*list[dict] | str*) 18 | 19 | The list of document dictionaries or a string (URL) for a Hugging Face dataset to insert. 20 | 21 | - **key** (*str*) 22 | 23 | The field that uniquely identifies each document (e.g., 'id'). 24 | 25 | - **columns** (*list[str] | str*) 26 | 27 | The list of document fields to insert. Can be a string if inserting a single field. 28 | 29 | - **dtypes** (*dict[str, str] | None*) – defaults to `None` 30 | 31 | Optional dictionary specifying the DuckDB type for each field. Defaults to 'VARCHAR' for all unspecified fields. 32 | 33 | - **batch_size** (*int*) – defaults to `30000` 34 | 35 | The number of documents to insert in each batch. 36 | 37 | - **n_jobs** (*int*) – defaults to `-1` 38 | 39 | Number of parallel jobs to use for inserting documents. Default use all available processors. 40 | 41 | - **config** (*dict | None*) – defaults to `None` 42 | 43 | Optional configuration options for the DuckDB connection. 
44 | 45 | - **limit** (*int | None*) – defaults to `None` 46 | 47 | 48 | 49 | ## Examples 50 | 51 | ```python 52 | >>> from ducksearch import tables 53 | 54 | >>> df = [ 55 | ... {"id": 1, "title": "title document 1", "text": "text document 1"}, 56 | ... {"id": 2, "title": "title document 2", "text": "text document 2"}, 57 | ... {"id": 3, "title": "title document 3", "text": "text document 3"}, 58 | ... ] 59 | 60 | >>> _ = tables.insert_documents( 61 | ... database="test.duckdb", 62 | ... schema="bm25_tables", 63 | ... key="id", 64 | ... columns=["title", "text"], 65 | ... df=df 66 | ... ) 67 | ``` 68 | 69 | -------------------------------------------------------------------------------- /docs/api/tables/insert-queries.md: -------------------------------------------------------------------------------- 1 | # insert_queries 2 | 3 | Insert a list of queries into the queries table. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The schema in which the queries table is located. 16 | 17 | - **queries** (*list[str]*) 18 | 19 | A list of query strings to insert into the table. 20 | 21 | - **config** (*dict | None*) – defaults to `None` 22 | 23 | Optional configuration options for the DuckDB connection. 24 | 25 | 26 | 27 | ## Examples 28 | 29 | ```python 30 | >>> from ducksearch import tables 31 | 32 | >>> _ = tables.insert_queries( 33 | ... database="test.duckdb", 34 | ... schema="bm25_tables", 35 | ... queries=["query 1", "query 2", "query 3"], 36 | ... ) 37 | ``` 38 | 39 | -------------------------------------------------------------------------------- /docs/api/tables/select-documents-columns.md: -------------------------------------------------------------------------------- 1 | # select_documents_columns 2 | 3 | Select the column names from the documents table, excluding the 'bm25id' column. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The schema where the documents table is located. 16 | 17 | - **config** (*dict | None*) – defaults to `None` 18 | 19 | Optional configuration options for the DuckDB connection. 20 | 21 | 22 | 23 | ## Examples 24 | 25 | ```python 26 | >>> from ducksearch import tables 27 | 28 | >>> tables.select_documents_columns( 29 | ... database="test.duckdb", 30 | ... schema="bm25_tables", 31 | ... ) 32 | ['id', 'title', 'text'] 33 | ``` 34 | 35 | -------------------------------------------------------------------------------- /docs/api/tables/select-documents.md: -------------------------------------------------------------------------------- 1 | # select_documents 2 | 3 | Select all documents from the documents table. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **schema** (*str*) 14 | 15 | The schema where the documents table is located. 16 | 17 | - **limit** (*int | None*) – defaults to `None` 18 | 19 | - **config** (*dict | None*) – defaults to `None` 20 | 21 | Optional configuration options for the DuckDB connection. 22 | 23 | 24 | 25 | ## Examples 26 | 27 | ```python 28 | >>> from ducksearch import tables 29 | 30 | >>> documents = tables.select_documents( 31 | ... database="test.duckdb", 32 | ... schema="bm25_tables", 33 | ... 
) 34 | 35 | >>> assert len(documents) == 3 36 | ``` 37 | 38 | -------------------------------------------------------------------------------- /docs/api/tables/select-queries.md: -------------------------------------------------------------------------------- 1 | # select_queries 2 | 3 | Select all queries from the queries table. 4 | 5 | 6 | 7 | 8 | 9 | ## Examples 10 | 11 | ```python 12 | >>> from ducksearch import tables 13 | 14 | >>> queries = tables.select_queries( 15 | ... database="test.duckdb", 16 | ... schema="bm25_tables", 17 | ... ) 18 | 19 | >>> assert len(queries) == 3 20 | ``` 21 | 22 | -------------------------------------------------------------------------------- /docs/api/upload/.pages: -------------------------------------------------------------------------------- 1 | title: upload -------------------------------------------------------------------------------- /docs/api/upload/documents.md: -------------------------------------------------------------------------------- 1 | # documents 2 | 3 | Upload documents to DuckDB, create necessary schema, and index using BM25. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | Name of the DuckDB database. 12 | 13 | - **key** (*str*) 14 | 15 | Key identifier for the documents. The key will be renamed to `id` in the database. 16 | 17 | - **fields** (*str | list[str]*) 18 | 19 | List of fields to upload from each document. If a single field is provided as a string, it will be converted to a list. 20 | 21 | - **documents** (*list[dict] | str*) 22 | 23 | Documents to upload. Can be a list of dictionaries or a Hugging Face (HF) URL string pointing to a dataset. 24 | 25 | - **k1** (*float*) – defaults to `1.5` 26 | 27 | BM25 k1 parameter, controls term saturation. 28 | 29 | - **b** (*float*) – defaults to `0.75` 30 | 31 | BM25 b parameter, controls document length normalization. 32 | 33 | - **stemmer** (*str*) – defaults to `porter` 34 | 35 | Stemming algorithm to use (e.g., 'porter'). The type of stemmer to be used. One of 'arabic', 'basque', 'catalan', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'lithuanian', 'nepali', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'serbian', 'spanish', 'swedish', 'tamil', 'turkish', or 'none' if no stemming is to be used. 36 | 37 | - **stopwords** (*str | list[str]*) – defaults to `None` 38 | 39 | List of stopwords to exclude from indexing. Can be a custom list or a language string. 40 | 41 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+` 42 | 43 | Regular expression pattern to ignore characters when indexing. Default ignore punctuation and non-alphabetic characters. 44 | 45 | - **strip_accents** (*bool*) – defaults to `True` 46 | 47 | Whether to remove accents from characters during indexing. 48 | 49 | - **lower** (*bool*) – defaults to `True` 50 | 51 | - **batch_size** (*int*) – defaults to `30000` 52 | 53 | Number of documents to process per batch. 54 | 55 | - **n_jobs** (*int*) – defaults to `-1` 56 | 57 | Number of parallel jobs to use for uploading documents. Default use all available processors. 58 | 59 | - **dtypes** (*dict[str, str] | None*) – defaults to `None` 60 | 61 | - **config** (*dict | None*) – defaults to `None` 62 | 63 | Optional configuration dictionary for the DuckDB connection and other settings. 
64 | 
65 | - **limit** (*int | None*) – defaults to `None`
66 | 
67 | - **tqdm_bar** (*bool*) – defaults to `True`
68 | 
69 |     Whether to display a progress bar when uploading documents.
70 | 
71 | 
72 | 
73 | 
74 | 
-------------------------------------------------------------------------------- /docs/api/upload/queries.md: --------------------------------------------------------------------------------
1 | # queries
2 | 
3 | Upload queries to DuckDB, map documents to queries, and index using BM25.
4 | 
5 | 
6 | 
7 | ## Parameters
8 | 
9 | - **database** (*str*)
10 | 
11 |     Name of the DuckDB database.
12 | 
13 | - **queries** (*list[str] | None*) – defaults to `None`
14 | 
15 |     List of queries to upload. Each query is a string.
16 | 
17 | - **documents_queries** (*dict[list]*) – defaults to `None`
18 | 
19 |     Dictionary mapping document IDs to a list of queries.
20 | 
21 | - **k1** (*float*) – defaults to `1.5`
22 | 
23 |     BM25 k1 parameter, controls term saturation.
24 | 
25 | - **b** (*float*) – defaults to `0.75`
26 | 
27 |     BM25 b parameter, controls document length normalization.
28 | 
29 | - **stemmer** (*str*) – defaults to `porter`
30 | 
31 |     Stemming algorithm to use. One of 'arabic', 'basque', 'catalan', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'lithuanian', 'nepali', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'serbian', 'spanish', 'swedish', 'tamil', 'turkish', or 'none' if no stemming is to be used.
32 | 
33 | - **stopwords** (*str | list[str]*) – defaults to `None`
34 | 
35 |     List of stopwords to exclude from indexing. Can be a custom list or a language string.
36 | 
37 | - **ignore** (*str*) – defaults to `(\.|[^a-z])+`
38 | 
39 |     Regular expression pattern of characters to ignore when indexing. By default, punctuation and non-alphabetic characters are ignored.
40 | 
41 | - **strip_accents** (*bool*) – defaults to `True`
42 | 
43 |     Whether to remove accents from characters during indexing.
44 | 
45 | - **lower** (*bool*) – defaults to `True`
46 | 
47 | - **batch_size** (*int*) – defaults to `30000`
48 | 
49 |     Number of queries to process per batch.
50 | 
51 | - **config** (*dict | None*) – defaults to `None`
52 | 
53 |     Optional configuration dictionary for the DuckDB connection and other settings.
54 | 
55 | 
56 | 
57 | 
58 | 
-------------------------------------------------------------------------------- /docs/api/utils/.pages: --------------------------------------------------------------------------------
1 | title: utils
-------------------------------------------------------------------------------- /docs/api/utils/ParallelTqdm.md: --------------------------------------------------------------------------------
1 | # ParallelTqdm
2 | 
3 | joblib.Parallel, but with a tqdm progressbar.
4 | 
5 | 
6 | 
7 | ## Parameters
8 | 
9 | - **total** (*int*)
10 | 
11 |     The total number of tasks to complete.
12 | 
13 | - **desc** (*str*)
14 | 
15 |     A description of the task.
16 | 
17 | - **tqdm_bar** (*bool*) – defaults to `True`
18 | 
19 |     Whether to display a tqdm progress bar. Defaults to `True`.
20 | 
21 | - **show_joblib_header** (*bool*) – defaults to `False`
22 | 
23 |     Whether to display the joblib header. Defaults to `False`.
24 | 
25 | - **kwargs**
26 | 
27 | 
28 | 
29 | 
30 | ## Methods
31 | 
32 | ???- note "__call__"
33 | 
34 |     Main function to dispatch parallel tasks.
35 | 36 | **Parameters** 37 | 38 | - **iterable** 39 | 40 | ???- note "debug" 41 | 42 | ???- note "dispatch_next" 43 | 44 | Dispatch more data for parallel processing 45 | 46 | This method is meant to be called concurrently by the multiprocessing callback. We rely on the thread-safety of dispatch_one_batch to protect against concurrent consumption of the unprotected iterator. 47 | 48 | 49 | ???- note "dispatch_one_batch" 50 | 51 | Prefetch the tasks for the next batch and dispatch them. 52 | 53 | The effective size of the batch is computed here. If there are no more jobs to dispatch, return False, else return True. The iterator consumption and dispatching is protected by the same lock so calling this function should be thread safe. 54 | 55 | **Parameters** 56 | 57 | - **iterator** 58 | 59 | ???- note "format" 60 | 61 | Return the formatted representation of the object. 62 | 63 | **Parameters** 64 | 65 | - **obj** 66 | - **indent** – defaults to `0` 67 | 68 | ???- note "info" 69 | 70 | ???- note "print_progress" 71 | 72 | Display the process of the parallel execution using tqdm 73 | 74 | 75 | ???- note "warn" 76 | 77 | ## References 78 | 79 | https://github.com/joblib/joblib/issues/972 80 | 81 | -------------------------------------------------------------------------------- /docs/api/utils/batchify.md: -------------------------------------------------------------------------------- 1 | # batchify 2 | 3 | Split a list into batches and optionally display a progress bar. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **X** (*list[str]*) 10 | 11 | A list of items to be batched. 12 | 13 | - **batch_size** (*int*) 14 | 15 | The number of items in each batch. 16 | 17 | - **desc** (*str*) – defaults to `` 18 | 19 | A description to display in the progress bar. 20 | 21 | - **tqdm_bar** (*bool*) – defaults to `True` 22 | 23 | Whether to display a progress bar using `tqdm`. 24 | 25 | 26 | 27 | ## Examples 28 | 29 | ```python 30 | >>> items = ["a", "b", "c", "d", "e", "f"] 31 | >>> batches = list(batchify(items, batch_size=2)) 32 | >>> for batch in batches: 33 | ... print(batch) 34 | ['a', 'b'] 35 | ['c', 'd'] 36 | ['e', 'f'] 37 | ``` 38 | 39 | -------------------------------------------------------------------------------- /docs/api/utils/generate-random-hash.md: -------------------------------------------------------------------------------- 1 | # generate_random_hash 2 | 3 | Generate a random SHA-256 hash. 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /docs/api/utils/get-list-columns-df.md: -------------------------------------------------------------------------------- 1 | # get_list_columns_df 2 | 3 | Get a list of columns from a list of dictionaries or a DataFrame. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **documents** (*list[dict] | pandas.core.frame.DataFrame*) 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/api/utils/plot.md: -------------------------------------------------------------------------------- 1 | # plot 2 | 3 | Generate and display a markdown table with statistics of the specified dataset tables. 4 | 5 | 6 | 7 | ## Parameters 8 | 9 | - **database** (*str*) 10 | 11 | The name of the DuckDB database. 12 | 13 | - **config** (*None | dict*) – defaults to `None` 14 | 15 | Optional configuration options for the DuckDB connection. 
16 | 
17 | - **tables** – defaults to `['bm25_tables.documents', 'bm25_tables.queries', 'bm25_documents.lengths', 'bm25_queries.lengths', 'bm25_tables.documents_queries']`
18 | 
19 |     A list of table names to plot statistics for. Defaults to common BM25 tables.
20 | 
21 | 
22 | 
23 | ## Examples
24 | 
25 | ```python
26 | >>> from ducksearch import utils
27 | 
28 | >>> utils.plot(database="test.duckdb")
29 | | Table     | Size |
30 | |-----------|------|
31 | | documents | 5183 |
32 | | queries   | 300  |
33 | ```
34 | 
35 | 
-------------------------------------------------------------------------------- /docs/benchmarks/.pages: --------------------------------------------------------------------------------
1 | title: Benchmarks
2 | nav:
3 |     - Benchmarks: benchmarks.md
4 | 
-------------------------------------------------------------------------------- /docs/benchmarks/benchmarks.md: --------------------------------------------------------------------------------
1 | ## Benchmarks
2 | 
3 | ### DuckSearch and BM25s
4 | 
5 | While DuckSearch provides advanced filtering features and updates on the index, it only scores the top `top_k_token` documents per query token. Benchmarks might evolve with DuckDB improvements and DuckSearch updates.
6 | 
7 | === "Table"
8 | 
9 |     | Dataset | Metric | DuckSearch | BM25s | Difference (DuckSearch - BM25s) |
10 |     |-------------------|---------------|-------------|-----------|---------------------------------|
11 |     | **arguana** | ndcg@10 | 0.3779 | 0.3663 | +0.0116 |
12 |     | | hits@1 | 0.0 | 0.0 | 0.0 |
13 |     | | mrr@10 | 0.2491 | 0.2443 | +0.0048 |
14 |     | | map@10 | 0.2528 | 0.2430 | +0.0098 |
15 |     | | qps | 117.80 | 2113.50 | -1995.70 |
16 |     | | Index Time(s) | 1.42 | 0.48 | +0.94 |
17 |     | **climate-fever** | ndcg@10 | 0.1184 | 0.1313 | -0.0129 |
18 |     | | hits@1 | 0.1068 | 0.1186 | -0.0118 |
19 |     | | mrr@10 | 0.1644 | 0.1809 | -0.0165 |
20 |     | | map@10 | 0.0803 | 0.0907 | -0.0104 |
21 |     | | qps | 5.88 | 99.49 | -93.61 |
22 |     | | Index Time(s) | 302.39 | 209.97 | +92.42 |
23 |     | **dbpedia-entity**| ndcg@10 | 0.6046 | 0.6172 | -0.0126 |
24 |     | | hits@1 | 0.7669 | 0.7744 | -0.0075 |
25 |     | | mrr@10 | 0.8311 | 0.8382 | -0.0071 |
26 |     | | map@10 | 0.0649 | 0.0672 | -0.0023 |
27 |     | | qps | 113.20 | 182.79 | -69.59 |
28 |     | | Index Time(s) | 181.42 | 119.18 | +62.24 |
29 |     | **fever** | ndcg@10 | 0.3861 | 0.4825 | -0.0964 |
30 |     | | hits@1 | 0.2583 | 0.3312 | -0.0729 |
31 |     | | mrr@10 | 0.3525 | 0.4423 | -0.0898 |
32 |     | | map@10 | 0.3329 | 0.4212 | -0.0883 |
33 |     | | qps | 74.40 | 104.97 | -30.57 |
34 |     | | Index Time(s) | 329.70 | 207.52 | +122.18 |
35 |     | **fiqa** | ndcg@10 | 0.2445 | 0.2326 | +0.0119 |
36 |     | | hits@1 | 0.2207 | 0.2160 | +0.0047 |
37 |     | | mrr@10 | 0.3002 | 0.2875 | +0.0127 |
38 |     | | map@10 | 0.1848 | 0.1726 | +0.0122 |
39 |     | | qps | 545.77 | 2157.35 | -1611.58 |
40 |     | | Index Time(s) | 6.04 | 4.27 | +1.77 |
41 |     | **hotpotqa** | ndcg@10 | 0.4487 | 0.5630 | -0.1143 |
42 |     | | hits@1 | 0.5059 | 0.6523 | -0.1464 |
43 |     | | mrr@10 | 0.5846 | 0.7249 | -0.1403 |
44 |     | | map@10 | 0.3642 | 0.4697 | -0.1055 |
45 |     | | qps | 48.15 | 104.43 | -56.28 |
46 |     | | Index Time(s) | 163.14 | 123.39 | +39.75 |
47 |     | **msmarco** | ndcg@10 | 0.8951 | 0.9705 | -0.0754 |
48 |     | | hits@1 | 1.0 | 1.0 | 0.0 |
49 |     | | mrr@10 | 1.0 | 1.0 | 0.0 |
50 |     | | map@10 | 0.0459 | 0.0532 | -0.0073 |
51 |     | | qps | 35.11 | 71.26 | -36.15 |
52 |     | | Index Time(s) | 202.37 | 229.22 | -26.85 |
53 |     | **nfcorpus** | ndcg@10 | 0.3301 | 0.3059 | +0.0242 |
54 |     | | hits@1 | 0.4396 | 0.4458 | -0.0062 |
55 |     | | mrr@10 | 0.5292 | 0.5205 | +0.0087 |
56 |     | | map@10 | 0.1233 | 0.1168 | +0.0065 |
57 |     | | qps | 3464.66 | 3933.12 | -468.46 |
58 |     | | Index Time(s) | 0.99 | 1.67 | -0.68 |
59 |     | **nq** | ndcg@10 | 0.2451 | 0.2735 | -0.0284 |
60 |     | | hits@1 | 0.1272 | 0.1460 | -0.0188 |
61 |     | | mrr@10 | 0.2099 | 0.2366 | -0.0267 |
62 |     | | map@10 | 0.1934 | 0.2177 | -0.0243 |
63 |     | | qps | 150.23 | 272.62 | -122.39 |
64 |     | | Index Time(s) | 71.43 | 87.98 | -16.55 |
65 |     | **quora** | ndcg@10 | 0.7705 | 0.7491 | +0.0214 |
66 |     | | hits@1 | 0.6783 | 0.6622 | +0.0161 |
67 |     | | mrr@10 | 0.7606 | 0.7433 | +0.0173 |
68 |     | | map@10 | 0.7206 | 0.6988 | +0.0218 |
69 |     | | qps | 741.13 | 1004.44 | -263.31 |
70 |     | | Index Time(s) | 3.78 | 6.57 | -2.79 |
71 |     | **scidocs** | ndcg@10 | 0.1025 | 0.0993 | +0.0032 |
72 |     | | hits@1 | 0.1790 | 0.1910 | -0.0120 |
73 |     | | mrr@10 | 0.2754 | 0.2765 | -0.0011 |
74 |     | | map@10 | 0.0154 | 0.0147 | +0.0007 |
75 |     | | qps | 879.11 | 3570.06 | -2690.95 |
76 |     | | Index Time(s) | 4.46 | 1.64 | +2.82 |
77 |     | **scifact** | ndcg@10 | 0.6908 | 0.6617 | +0.0291 |
78 |     | | hits@1 | 0.5533 | 0.5433 | +0.0100 |
79 |     | | mrr@10 | 0.6527 | 0.6312 | +0.0215 |
81 |     | | map@10 | 0.6416 | 0.6199 | +0.0217 |
82 |     | | qps | 2153.64 | 3708.28 | -1554.64 |
83 |     | | Index Time(s) | 1.22 | 0.41 | +0.81 |
84 |     | **trec-covid** | ndcg@10 | 0.9533 | 0.8983 | +0.0550 |
85 |     | | hits@1 | 1.0 | 0.92 | +0.08 |
86 |     | | mrr@10 | 1.0 | 0.96 | +0.04 |
87 |     | | map@10 | 0.0074 | 0.0069 | +0.0005 |
88 |     | | qps | 112.38 | 1275.41 | -1163.03 |
89 |     | | Index Time(s) | 22.15 | 10.15 | +12.00 |
90 |     | **webis-touche2020** | ndcg@10 | 0.4130 | 0.4671 | -0.0541 |
91 |     | | hits@1 | 0.5510 | 0.6122 | -0.0612 |
92 |     | | mrr@10 | 0.7114 | 0.7541 | -0.0427 |
93 |     | | map@10 | 0.0564 | 0.0659 | -0.0095 |
94 |     | | qps | 104.65 | 961.73 | -857.08 |
95 |     | | Index Time(s) | 44.14 | 34.89 | +9.25 |
96 | 
97 | 
-------------------------------------------------------------------------------- /docs/css/version-select.css: --------------------------------------------------------------------------------
1 | @media only screen and (max-width:76.1875em) {
2 |     #version-selector {
3 |         padding: .6rem .8rem;
4 |     }
5 | }
-------------------------------------------------------------------------------- /docs/documentation/.pages: --------------------------------------------------------------------------------
1 | title: Documentation
2 | nav:
3 |     - Upload: upload.md
4 |     - Search: search.md
5 |     - Delete: delete.md
6 |     - Update: update.md
7 | 
8 | 
-------------------------------------------------------------------------------- /docs/documentation/delete.md: --------------------------------------------------------------------------------
1 | ## Delete
2 | 
3 | To delete a document, you need to provide the document's ID. The delete operation will remove the document from the database and update the index.
4 | 
5 | ```python
6 | from ducksearch import delete
7 | 
8 | delete.documents(
9 |     database="ducksearch.duckdb",
10 |     ids=[0, 1],
11 | )
12 | ```
-------------------------------------------------------------------------------- /docs/documentation/graph.md: --------------------------------------------------------------------------------
1 | ## Graph
2 | 
3 | The `search.graphs` function can be used to search documents with a graph-based query. This function is useful if we have paired documents and queries. The search will retrieve the set of documents and queries that match the input query.
Then it will build a graph and compute the weight of each document using a graph-based scoring function.
4 | 
5 | The `search.graphs` function is much slower than the `search.documents` function, but it might provide better results with a decent amount of paired documents and queries.
6 | 
7 | ### Document-query interactions
8 | 
9 | We can upload document-query interactions in order to call the `search.graphs` function. The following example demonstrates how to upload document-query interactions:
10 | 
11 | ```python
12 | from ducksearch import upload
13 | 
14 | documents = [
15 |     {
16 |         "id": 0,
17 |         "title": "Hotel California",
18 |         "style": "rock",
19 |         "date": "1977-02-22",
20 |         "popularity": 9,
21 |     },
22 |     {
23 |         "id": 1,
24 |         "title": "Here Comes the Sun",
25 |         "style": "rock",
26 |         "date": "1969-06-10",
27 |         "popularity": 10,
28 |     },
29 |     {
30 |         "id": 2,
31 |         "title": "Alive",
32 |         "style": "electro, punk",
33 |         "date": "2007-11-19",
34 |         "popularity": 9,
35 |     },
36 | ]
37 | 
38 | upload.documents(
39 |     database="ducksearch.duckdb",
40 |     key="id",
41 |     fields=["title", "style", "date", "popularity"],
42 |     documents=documents,
43 |     dtypes={
44 |         "date": "DATE",
45 |         "popularity": "INT",
46 |     },
47 | )
48 | 
49 | # Mapping between document ids and queries
50 | documents_queries = {
51 |     0: ["the beatles", "rock band"],
52 |     1: ["rock band", "california"],
53 |     2: ["daft"],
54 | }
55 | 
56 | upload.queries(
57 |     database="ducksearch.duckdb",
58 |     documents_queries=documents_queries,
59 | )
60 | ```
61 | 
62 | ???+ tip
63 |     We can write the document-query mapping as a dictionary of dictionaries, with a weight between each document and query. The weight is used to compute the score in the `search.graphs` function:
64 | 
65 |     ```python
66 |     documents_queries = {
67 |         0: {"the beatles": 30, "rock band": 10},
68 |         1: {"rock band": 10, "california": 1},
69 |         2: {"daft": 60},
70 |     }
71 |     ```
72 | 
73 |     When the weight is not specified, the default value is 1.
74 | 
75 | ### Search Graphs
76 | 
77 | The following example demonstrates how to search documents with a graph-based query:
78 | 
79 | ```python
80 | from ducksearch import search
81 | 
82 | search.graphs(
83 |     database="ducksearch.duckdb",
84 |     queries="daft punk",
85 |     top_k=10,
86 | )
87 | ```
88 | 
89 | ```python
90 | [
91 |     {
92 |         "id": "2",
93 |         "title": "Alive",
94 |         "style": "electro, punk",
95 |         "date": Timestamp("2007-11-19 00:00:00"),
96 |         "popularity": 9,
97 |         "score": 2.877532958984375,
98 |     }
99 | ]
100 | ```
-------------------------------------------------------------------------------- /docs/documentation/search.md: --------------------------------------------------------------------------------
1 | ???+ note
2 |     Before we can search for documents, we need to upload them to DuckDB. We can use the `upload.documents` function to upload a list of dictionaries to DuckDB.
3 | 
4 | ## Search
5 | 
6 | All the search functions take the name of a DuckDB database as their first argument: the database where the documents are stored, the same one used in the `upload.documents` function. Each search function can also take additional parameters to control the search behavior, such as the number of documents to return, the number of documents to score for each query token, the number of parallel jobs to use, and optional SQL filters.
7 | 
8 | ### Documents
9 | 
10 | Once the documents are uploaded, we can search for them using the `search.documents` function.
11 | The search function returns a list of lists of documents, ordered by their BM25 score.
12 | 
13 | ```python
14 | from ducksearch import search
15 | 
16 | search.documents(
17 |     database="ducksearch.duckdb",
18 |     queries=["daft punk", "rock"],
19 |     top_k=10,
20 |     top_k_token=10_000,
21 |     batch_size=32,
22 |     n_jobs=-1,
23 | )
24 | ```
25 | 
26 | ```python
27 | [
28 |     [
29 |         {
30 |             "id": "2",
31 |             "title": "Alive",
32 |             "style": "electro, punk",
33 |             "date": Timestamp("2007-11-19 00:00:00"),
34 |             "popularity": 9,
35 |             "score": 0.16131360828876495,
36 |         }
37 |     ],
38 |     [
39 |         {
40 |             "id": "1",
41 |             "title": "Here Comes the Sun",
42 |             "style": "rock",
43 |             "date": Timestamp("1969-06-10 00:00:00"),
44 |             "popularity": 10,
45 |             "score": 0.09199773520231247,
46 |         },
47 |         {
48 |             "id": "0",
49 |             "title": "Hotel California",
50 |             "style": "rock",
51 |             "date": Timestamp("1977-02-22 00:00:00"),
52 |             "popularity": 9,
53 |             "score": 0.07729987800121307,
54 |         },
55 |     ],
56 | ]
57 | ```
58 | 
59 | ???+ info
60 |     The search function is executed in parallel using the `n_jobs` parameter. We can control the number of documents to return using the `top_k` parameter and the number of documents to score for each query token using the `top_k_token` parameter. Reducing `top_k_token` can further speed up the search but may result in lower quality results.
61 | 
62 | ### Filters
63 | 
64 | We can apply filters to the search using the `filters` parameter. The filters are SQL expressions that are applied to the search results.
65 | 
66 | ```python
67 | from ducksearch import search
68 | 
69 | search.documents(
70 |     database="ducksearch.duckdb",
71 |     queries=["rock", "california"],
72 |     top_k=10,
73 |     top_k_token=10_000,
74 |     batch_size=32,
75 |     filters="YEAR(date) <= 1990 AND YEAR(date) >= 1970",
76 |     n_jobs=-1,
77 | )
78 | ```
79 | 
80 | ```python
81 | [
82 |     [
83 |         {
84 |             "score": 0.07729987800121307,
85 |             "id": "0",
86 |             "title": "Hotel California",
87 |             "style": "rock",
88 |             "date": Timestamp("1977-02-22 00:00:00"),
89 |             "popularity": 9,
90 |         }
91 |     ],
92 |     [
93 |         {
94 |             "score": 0.16131360828876495,
95 |             "id": "0",
96 |             "title": "Hotel California",
97 |             "style": "rock",
98 |             "date": Timestamp("1977-02-22 00:00:00"),
99 |             "popularity": 9,
100 |         }
101 |     ],
102 | ]
103 | ```
104 | 
105 | ???+ info
106 |     The filters are evaluated by DuckDB, so all DuckDB functions are available for use in the filters. You can find more information about DuckDB functions in the [DuckDB documentation](https://duckdb.org/docs/sql/functions/overview).
107 | 
108 | 
-------------------------------------------------------------------------------- /docs/documentation/update.md: --------------------------------------------------------------------------------
1 | ## Update
2 | 
3 | To update a document, you need to first delete the document and then upload the updated version. The delete operation will remove the document from the database and update the index. Finally, the upload operation will add the updated document to the database and update the index.
4 | 
5 | ```python
6 | from ducksearch import delete, upload
7 | 
8 | delete.documents(
9 |     database="ducksearch.duckdb",
10 |     ids=[0, 1],
11 | )
12 | 
13 | documents_updated = [
14 |     {
15 |         "id": 0,
16 |         "title": "Hotel California",
17 |         "style": "rock",
18 |         "date": "1977-02-22",
19 |         "popularity": 9,
20 |     },
21 |     {
22 |         "id": 1,
23 |         "title": "Here Comes the Sun",
24 |         "style": "rock",
25 |         "date": "1969-06-10",
26 |         "popularity": 10,
27 |     },
28 | ]
29 | 
30 | upload.documents(
31 |     database="ducksearch.duckdb",
32 |     key="id",
33 |     fields=["title", "style", "date", "popularity"],
34 |     documents=documents_updated,
35 |     dtypes={
36 |         "date": "DATE",
37 |         "popularity": "INT",
38 |     },
39 | )
40 | ```
41 | 
-------------------------------------------------------------------------------- /docs/documentation/upload.md: --------------------------------------------------------------------------------
1 | ## Upload
2 | 
3 | When working with DuckSearch, the first step is to upload documents to DuckDB using the `upload.documents` function. The documents are stored in a DuckDB database, and the fields are indexed with BM25. DuckSearch won't re-index a document if it already exists in the database. The index will be updated along with the new documents.
4 | 
5 | ### Upload documents
6 | 
7 | The following example demonstrates how to upload a list of documents:
8 | 
9 | ```python
10 | from ducksearch import upload
11 | 
12 | documents = [
13 |     {
14 |         "id": 0,
15 |         "title": "Hotel California",
16 |         "style": "rock",
17 |         "date": "1977-02-22",
18 |         "popularity": 9,
19 |     },
20 |     {
21 |         "id": 1,
22 |         "title": "Here Comes the Sun",
23 |         "style": "rock",
24 |         "date": "1969-06-10",
25 |         "popularity": 10,
26 |     },
27 |     {
28 |         "id": 2,
29 |         "title": "Alive",
30 |         "style": "electro, punk",
31 |         "date": "2007-11-19",
32 |         "popularity": 9,
33 |     },
34 | ]
35 | 
36 | upload.documents(
37 |     database="ducksearch.duckdb",
38 |     key="id",  # unique document identifier
39 |     fields=["title", "style", "date", "popularity"],  # list of fields to index
40 |     documents=documents,
41 |     stopwords="english",
42 |     stemmer="porter",
43 |     lower=True,
44 |     strip_accents=True,
45 |     dtypes={
46 |         "date": "DATE",
47 |         "popularity": "INT",
48 |     },
49 | )
50 | ```
51 | 
52 | ???+ info
53 |     stopwords: List of stop words to filter out. Defaults to 'english', a pre-defined list of 571 English stopwords.
54 | 
55 |     stemmer: Stemmer to use. Defaults to 'porter' for the Porter stemmer. Possible values are: 'arabic', 'basque', 'catalan', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hindi', 'hungarian', 'indonesian', 'irish', 'italian', 'lithuanian', 'nepali', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'serbian', 'spanish', 'swedish', 'tamil', 'turkish', or `None` if no stemming is to be used.
56 | 
57 |     lower: Whether to convert the text to lowercase. Defaults to `True`.
58 | 
59 |     strip_accents: Whether to strip accents from the text. Defaults to `True`.
60 | 
61 | ### HuggingFace
62 | 
63 | The `upload.documents` function can also index HuggingFace datasets directly from a URL.
64 | The following example demonstrates how to index the FineWeb dataset from HuggingFace:
65 | 
66 | ```python
67 | from ducksearch import upload
68 | 
69 | upload.documents(
70 |     database="fineweb.duckdb",
71 |     key="id",
72 |     fields=["text", "url", "date", "language", "token_count", "language_score"],
73 |     documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
74 |     dtypes={
75 |         "date": "DATE",
76 |         "token_count": "INT",
77 |         "language_score": "FLOAT",
78 |     },
79 |     limit=1000,  # demonstrate with a small dataset
80 | )
81 | ```
82 | 
83 | ???+ info
84 |     More information about DuckDB and HuggingFace compatibility can be found [here](https://huggingface.co/docs/hub/en/datasets-duckdb) and [here](https://duckdb.org/2024/05/29/access-150k-plus-datasets-from-hugging-face-with-duckdb.html).
85 | 
-------------------------------------------------------------------------------- /docs/img/logo.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/lightonai/ducksearch/91422599772f909f490f441ef38415e38224c6d5/docs/img/logo.png
-------------------------------------------------------------------------------- /docs/index.md: --------------------------------------------------------------------------------
1 | 
2 | DuckSearch: Efficient BM25 with DuckDB 🦆
3 | 
4 | [logo, documentation and license badges]
5 | 
16 | DuckSearch is a lightweight and easy-to-use library to search documents. DuckSearch is built on top of DuckDB, a high-performance analytical database. DuckDB is designed to execute analytical SQL queries fast, and DuckSearch leverages this to provide efficient search and filtering features. The DuckSearch index can be updated with new documents, and documents can be deleted as well. DuckSearch also supports HuggingFace datasets, allowing you to index datasets directly from the HuggingFace Hub.
17 | 
18 | 
19 | ## Installation
20 | 
21 | Install DuckSearch using pip:
22 | 
23 | ```bash
24 | pip install ducksearch
25 | ```
26 | 
27 | ## Documentation
28 | 
29 | The complete documentation is available [here](https://lightonai.github.io/ducksearch/), which includes in-depth guides, examples, and API references.
30 | 
31 | ### Upload
32 | 
33 | We can upload documents to DuckDB using the `upload.documents` function. The documents are stored in a DuckDB database, and the `fields` are indexed with BM25.
34 | 
35 | ```python
36 | from ducksearch import upload
37 | 
38 | documents = [
39 |     {
40 |         "id": 0,
41 |         "title": "Hotel California",
42 |         "style": "rock",
43 |         "date": "1977-02-22",
44 |         "popularity": 9,
45 |     },
46 |     {
47 |         "id": 1,
48 |         "title": "Here Comes the Sun",
49 |         "style": "rock",
50 |         "date": "1969-06-10",
51 |         "popularity": 10,
52 |     },
53 |     {
54 |         "id": 2,
55 |         "title": "Alive",
56 |         "style": "electro, punk",
57 |         "date": "2007-11-19",
58 |         "popularity": 9,
59 |     },
60 | ]
61 | 
62 | upload.documents(
63 |     database="ducksearch.duckdb",
64 |     key="id",  # Unique document identifier
65 |     fields=["title", "style"],  # List of fields to use for search.
66 |     documents=documents,
67 |     dtypes={
68 |         "date": "DATE",
69 |         "popularity": "INT",
70 |     },
71 | )
72 | ```
73 | 
74 | ## Search
75 | 
76 | `search.documents` returns a list of lists of documents ordered by relevance. We can control the number of documents to return using the `top_k` parameter. The following example demonstrates how to search for documents with the queries "punk" and "california" while filtering the results to include only documents with a date after 1970 and a popularity score greater than 8. We will order the results by a weighted sum of the BM25 score and the popularity score provided in the document.
77 | 
78 | ```python
79 | from ducksearch import search
80 | 
81 | search.documents(
82 |     database="ducksearch.duckdb",
83 |     queries=["punk", "california"],
84 |     top_k=10,
85 |     filters="YEAR(date) >= 1970 AND popularity > 8",
86 |     order_by="0.8 * score + 0.2 * popularity DESC",
87 | )
88 | ```
89 | 
90 | ```python
91 | [
92 |     [
93 |         {
94 |             "id": "2",
95 |             "title": "Alive",
96 |             "style": "electro, punk",
97 |             "date": Timestamp("2007-11-19 00:00:00"),
98 |             "popularity": 9,
99 |             "score": 0.17841622233390808,
100 |         }
101 |     ],
102 |     [
103 |         {
104 |             "id": "0",
105 |             "title": "Hotel California",
106 |             "style": "rock, pop",
107 |             "date": Timestamp("1977-02-22 00:00:00"),
108 |             "popularity": 9,
109 |             "score": 0.156318798661232,
110 |         }
111 |     ],
112 | ]
113 | ```
114 | 
115 | Filters are SQL expressions that are applied to the search results. We can use every filtering function DuckDB provides, such as [date functions](https://duckdb.org/docs/sql/functions/date).
116 | 
117 | Both the `filters` and `order_by` parameters are optional. If not provided, the results are ordered by BM25 relevance and no filters are applied.
118 | 
119 | ## Delete and update index
120 | 
121 | We can delete documents and update the BM25 weights accordingly using the `delete.documents` function.
122 | 
123 | ```python
124 | from ducksearch import delete
125 | 
126 | delete.documents(
127 |     database="ducksearch.duckdb",
128 |     ids=[0, 1],
129 | )
130 | ```
131 | 
132 | To update the index, we should first delete the documents and then upload the updated documents.
133 | 
134 | ## Extra features
135 | 
136 | ### HuggingFace
137 | 
138 | The `upload.documents` function can also index HuggingFace datasets directly from a URL.
The following example demonstrates how to index the FineWeb dataset from HuggingFace. We will use the fields "text" and "url" for search. We will also specify the data types for the "date", "token_count", and "language_score" fields to be able to filter the results.
139 | 
140 | ```python
141 | from ducksearch import upload
142 | 
143 | upload.documents(
144 |     database="fineweb.duckdb",
145 |     key="id",
146 |     fields=["text", "url"],
147 |     documents="https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/sample/10BT/000_00000.parquet",
148 |     dtypes={
149 |         "date": "DATE",
150 |         "token_count": "INT",
151 |         "language_score": "FLOAT",
152 |     },
153 |     limit=3000,  # demonstrate with a small dataset
154 | )
155 | ```
156 | 
157 | We can then search the FineWeb dataset with the `search.documents` function. We order the results by BM25 score and then by date.
158 | 
159 | ```python
160 | from ducksearch import search
161 | 
162 | search.documents(
163 |     database="fineweb.duckdb",
164 |     queries=["earth science"],
165 |     top_k=2,
166 |     order_by="score DESC, date DESC",
167 | )
168 | ```
169 | 
170 | ```python
171 | [
172 |     [
173 |         {
174 |             "id": "",
175 |             "text": "Earth Science Tutors in Rowland...",
176 |             "id_1": "",
177 |             "dump": "CC-MAIN-2017-34",
178 |             "url": "http://rowland.universitytutor.com/rowland_earth-science-tutoring",
179 |             "date": Timestamp("2017-08-19 00:00:00"),
180 |             "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2017-34/segments/1502886105304.35/warc/CC-MAIN-20170819051034-20170819071034-00240.warc.gz",
181 |             "language": "en",
182 |             "language_score": 0.8718525171279907,
183 |             "token_count": 313,
184 |             "bm25id": 523,
185 |             "score": 2.3761106729507446,
186 |         },
187 |         {
188 |             "id": "",
189 |             "text": "- Geomagnetic field....",
190 |             "id_1": "",
191 |             "dump": "CC-MAIN-2022-21",
192 |             "url": "https://www.imperial.ac.uk/people/adrian.muxworthy/?respub-action=citation.html&id=1149861&noscript=noscript",
193 |             "date": Timestamp("2022-05-20 00:00:00"),
194 |             "file_path": "s3://commoncrawl/crawl-data/CC-MAIN-2022-21/segments/1652662530553.34/warc/CC-MAIN-20220519235259-20220520025259-00601.warc.gz",
195 |             "language": "en",
196 |             "language_score": 0.8225595951080322,
197 |             "token_count": 517,
198 |             "bm25id": 4783,
199 |             "score": 2.3569871187210083,
200 |         },
201 |     ]
202 | ]
203 | 
204 | ```
205 | 
206 | Note: by default, results are ordered by BM25 relevance.
207 | 
208 | ## Tables
209 | 
210 | DuckSearch creates two distinct schemas: `bm25_tables` and `bm25_documents`.
211 | 
212 | - We can find the uploaded documents in the `bm25_tables.documents` table.
213 | 
214 | - We can find the inverted index in the `bm25_documents.scores` table. You can update the scores as you wish; just note that token scores will be updated each time you upload documents (the scores of every token mentioned in the uploaded documents).
215 | 
216 | - We can update the set of stopwords in the `bm25_documents.stopwords` table, as shown in the sketch below.
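Since DuckSearch stores everything in plain DuckDB tables, we can inspect them with any DuckDB client. The snippet below is a minimal sketch assuming the `ducksearch.duckdb` database created in the examples above; the `term`, `list_docids`, and `list_scores` columns belong to the inverted index, while the columns of the documents table depend on the fields we uploaded.

```python
import duckdb

# Open the database created by upload.documents in read-only mode so that
# other processes can keep searching while we inspect the tables.
conn = duckdb.connect("ducksearch.duckdb", read_only=True)

# Uploaded documents.
print(conn.execute("SELECT * FROM bm25_tables.documents LIMIT 5").fetchdf())

# Inverted index: one row per term, with the matching document ids and
# pre-computed scores.
print(
    conn.execute(
        "SELECT term, list_docids, list_scores FROM bm25_documents.scores LIMIT 5"
    ).fetchdf()
)

# Stopwords used at indexing time.
print(conn.execute("SELECT * FROM bm25_documents.stopwords LIMIT 5").fetchdf())

conn.close()
```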
217 | 
218 | ## Benchmark
219 | 
220 | 
221 | | Dataset           | ndcg@10   | hits@1  | hits@10  | mrr@10   | map@10  | r-precision | qps            | Indexation Time (s) | Number of Documents and Queries |
222 | |-------------------|-----------|---------|----------|----------|---------|-------------|----------------|---------------------|--------------------------------|
223 | | arguana           | 0.3779    | 0.0     | 0.8267   | 0.2491   | 0.2528  | 0.0108      | 117.80         | 1.42                | 1,406 queries, 8.67K documents |
224 | | climate-fever     | 0.1184    | 0.1068  | 0.3648   | 0.1644   | 0.0803  | 0.0758      | 5.88           | 302.39              | 1,535 queries, 5.42M documents |
225 | | dbpedia-entity    | 0.6046    | 0.7669  | 5.6241   | 0.8311   | 0.0649  | 0.0741      | 113.20         | 181.42              | 400 queries, 4.63M documents   |
226 | | fever             | 0.3861    | 0.2583  | 0.5826   | 0.3525   | 0.3329  | 0.2497      | 74.40          | 329.70              | 6,666 queries, 5.42M documents |
227 | | fiqa              | 0.2445    | 0.2207  | 0.6790   | 0.3002   | 0.1848  | 0.1594      | 545.77         | 6.04                | 648 queries, 57K documents     |
228 | | hotpotqa          | 0.4487    | 0.5059  | 0.9699   | 0.5846   | 0.3642  | 0.3388      | 48.15          | 163.14              | 7,405 queries, 5.23M documents |
229 | | msmarco           | 0.8951    | 1.0     | 8.6279   | 1.0      | 0.0459  | 0.0473      | 35.11          | 202.37              | 6,980 queries, 8.84M documents |
230 | | nfcorpus          | 0.3301    | 0.4396  | 2.4087   | 0.5292   | 0.1233  | 0.1383      | 3464.66        | 0.99                | 323 queries, 3.6K documents    |
231 | | nq                | 0.2451    | 0.1272  | 0.4574   | 0.2099   | 0.1934  | 0.1240      | 150.23         | 71.43               | 3,452 queries, 2.68M documents |
232 | | quora             | 0.7705    | 0.6783  | 1.1749   | 0.7606   | 0.7206  | 0.6502      | 741.13         | 3.78                | 10,000 queries, 523K documents |
233 | | scidocs           | 0.1025    | 0.1790  | 0.8240   | 0.2754   | 0.0154  | 0.0275      | 879.11         | 4.46                | 1,000 queries, 25K documents   |
234 | | scifact           | 0.6908    | 0.5533  | 0.9133   | 0.6527   | 0.6416  | 0.5468      | 2153.64        | 1.22                | 300 queries, 5K documents      |
235 | | trec-covid        | 0.9533    | 1.0     | 9.4800   | 1.0      | 0.0074  | 0.0077      | 112.38         | 22.15               | 50 queries, 171K documents     |
236 | | webis-touche2020  | 0.4130    | 0.5510  | 3.7347   | 0.7114   | 0.0564  | 0.0827      | 104.65         | 44.14               | 49 queries, 382K documents     |
237 | 
238 | ## References
239 | 
240 | - [DuckDB](https://duckdb.org/)
241 | 
242 | - [DuckDB Full Text Search](https://duckdb.org/docs/extensions/full_text_search.html): Note that DuckSearch relies partially on the DuckDB Full Text Search extension but accelerates the search process via `top_k_token` approximation, pre-computation of scores, and multi-threading.
243 | 
244 | ## License
245 | 
246 | DuckSearch is released under the MIT license.
247 | 248 | ## Citation 249 | 250 | ``` 251 | @misc{PyLate, 252 | title={DuckSearch, efficient search with DuckDB}, 253 | author={Sourty, Raphael}, 254 | url={https://github.com/lightonai/ducksearch}, 255 | year={2024} 256 | } 257 | ``` -------------------------------------------------------------------------------- /docs/javascripts/config.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.typesetPromise() 16 | }) -------------------------------------------------------------------------------- /docs/javascripts/tablesort.js: -------------------------------------------------------------------------------- 1 | document$.subscribe(function () { 2 | var tables = document.querySelectorAll("article table:not([class])") 3 | tables.forEach(function (table) { 4 | new Tablesort(table) 5 | }) 6 | }) -------------------------------------------------------------------------------- /docs/js/version-select.js: -------------------------------------------------------------------------------- 1 | window.addEventListener("DOMContentLoaded", function () { 2 | // This is a bit hacky. Figure out the base URL from a known CSS file the 3 | // template refers to... 4 | var ex = new RegExp("/?css/version-select.css$"); 5 | var sheet = document.querySelector('link[href$="version-select.css"]'); 6 | 7 | var ABS_BASE_URL = sheet.href.replace(ex, ""); 8 | var CURRENT_VERSION = ABS_BASE_URL.split("/").pop(); 9 | 10 | function makeSelect(options, selected) { 11 | var select = document.createElement("select"); 12 | select.classList.add("form-control"); 13 | 14 | options.forEach(function (i) { 15 | var option = new Option(i.text, i.value, undefined, 16 | i.value === selected); 17 | select.add(option); 18 | }); 19 | 20 | return select; 21 | } 22 | 23 | var xhr = new XMLHttpRequest(); 24 | xhr.open("GET", ABS_BASE_URL + "/../versions.json"); 25 | xhr.onload = function () { 26 | var versions = JSON.parse(this.responseText); 27 | 28 | var realVersion = versions.find(function (i) { 29 | return i.version === CURRENT_VERSION || 30 | i.aliases.includes(CURRENT_VERSION); 31 | }).version; 32 | 33 | var select = makeSelect(versions.map(function (i) { 34 | return { text: i.title, value: i.version }; 35 | }), realVersion); 36 | select.addEventListener("change", function (event) { 37 | window.location.href = ABS_BASE_URL + "/../" + this.value; 38 | }); 39 | 40 | var container = document.createElement("div"); 41 | container.id = "version-selector"; 42 | container.className = "md-nav__item"; 43 | container.appendChild(select); 44 | 45 | var sidebar = document.querySelector(".md-nav--primary > .md-nav__list"); 46 | sidebar.parentNode.insertBefore(container, sidebar); 47 | }; 48 | xhr.send(); 49 | }); -------------------------------------------------------------------------------- /docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | .md-typeset h2 { 2 | margin: 1.5em 0; 3 | padding-bottom: .4rem; 4 | border-bottom: .04rem solid var(--md-default-fg-color--lighter); 5 | } 6 | 7 | .md-footer { 8 | margin-top: 2em; 9 | } 10 | 11 | .md-typeset pre>code { 12 | border-radius: 0.5em; 13 | } 
-------------------------------------------------------------------------------- /ducksearch/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["decorators", "evaluation", "hf", "search", "tables", "upload", "utils"] 2 | -------------------------------------------------------------------------------- /ducksearch/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (1, 0, 3) 2 | 3 | __version__ = ".".join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /ducksearch/decorators/__init__.py: -------------------------------------------------------------------------------- 1 | from .execute_with_duckdb import connect_to_duckdb, execute_with_duckdb 2 | 3 | __all__ = ["execute_with_duckdb", "connect_to_duckdb"] 4 | -------------------------------------------------------------------------------- /ducksearch/decorators/execute_with_duckdb.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import time 3 | from functools import wraps 4 | 5 | import duckdb 6 | 7 | 8 | def connect_to_duckdb( 9 | database: str, 10 | read_only: bool = False, 11 | config: dict | None = None, 12 | max_retry: int = 30, 13 | sleep_time: float = 0.1, 14 | **kwargs, 15 | ): 16 | """Establish a connection to the DuckDB database. Retry connecting if an error occurs. 17 | 18 | Parameters 19 | ---------- 20 | database 21 | The name or path of the DuckDB database to connect to. 22 | read_only 23 | Whether to open the database in read-only mode. Default is False. 24 | config 25 | Optional configuration settings for the DuckDB connection. 26 | max_retry 27 | The maximum number of times to retry connecting to DuckDB. 28 | sleep_time 29 | The time to sleep between retries. 30 | 31 | Returns 32 | ------- 33 | duckdb.DuckDBPyConnection 34 | A DuckDB connection object. 35 | 36 | """ 37 | current_retry = 0 38 | while True: 39 | try: 40 | conn = ( 41 | duckdb.connect(database=database, read_only=read_only, config=config) 42 | if config 43 | else duckdb.connect(database=database, read_only=read_only) 44 | ) 45 | break 46 | except Exception as error: 47 | if current_retry >= max_retry: 48 | raise error 49 | time.sleep(sleep_time) 50 | current_retry += 1 51 | 52 | return conn 53 | 54 | 55 | def execute_with_duckdb( 56 | relative_path: str | list[str], 57 | read_only: bool = False, 58 | fields: list[str] | None = None, 59 | fetch_df: bool = False, 60 | **kwargs, 61 | ): 62 | """Decorator to execute a SQL query using DuckDB. 63 | 64 | Parameters 65 | ---------- 66 | relative_path 67 | A string or list of strings specifying the path(s) to the SQL file(s). 68 | read_only 69 | Whether the DuckDB connection should be read-only. Default is False. 70 | fields 71 | A list of fields to use as keys for the result rows if returning records. 72 | fetch_df 73 | If True, fetch the result as a pandas DataFrame and return it as a list of dictionaries. 74 | kwargs 75 | Additional keyword arguments to be passed to the SQL query, useful for string formatting. 76 | 77 | Returns 78 | ------- 79 | A decorator function that executes the SQL query and returns the result. 
80 | 81 | """ 82 | 83 | def decorator(func): 84 | @wraps(func) 85 | def wrapper( 86 | *args, 87 | database: str, 88 | config: dict | None = None, 89 | df: list[dict] = None, 90 | relative_path: str | list[str] = relative_path, 91 | **kwargs, 92 | ): 93 | """Connect to DuckDB and execute the query from the provided SQL file path(s).""" 94 | conn = connect_to_duckdb( 95 | database=database, 96 | read_only=read_only, 97 | config=config, 98 | **kwargs, 99 | ) 100 | 101 | # Ensure relative_path is treated as a list 102 | if isinstance(relative_path, str): 103 | relative_path = [relative_path] 104 | 105 | try: 106 | # Loop through and execute all SQL files in relative_path 107 | for path in relative_path: 108 | # Build the full path to the SQL file 109 | path = pathlib.Path(__file__).parent.parent.joinpath(path) 110 | 111 | # Read the SQL query from the file 112 | with open(file=path, mode="r") as sql_file: 113 | query = sql_file.read() 114 | 115 | # Format the query with any additional kwargs 116 | if kwargs: 117 | query = query.format(**kwargs) 118 | 119 | # Fetch the result as a DataFrame or a list of rows 120 | if fetch_df: 121 | data = conn.execute(query).fetchdf() 122 | data.columns = data.columns.str.lower() 123 | data = data.to_dict(orient="records") 124 | else: 125 | data = conn.execute(query).fetchall() 126 | 127 | # If fields are provided, map the result rows to dictionaries with the specified field names 128 | if fields is not None: 129 | data = [dict(zip(fields, row)) for row in data] 130 | 131 | # Handle DuckDB-specific exceptions (e.g., too many open files) 132 | except duckdb.duckdb.IOException: 133 | message = "\n--------\nDuckDB exception, too many files open.\nGet current ulimit: ulimit -n\nIncrease ulimit with `ulimit -n 4096` or more.\n--------\n" 134 | raise duckdb.duckdb.IOException(message) 135 | 136 | # Handle other exceptions and provide more detailed error information 137 | except Exception as error: 138 | raise ValueError( 139 | "\n{}:\n{}\n{}:\n{}".format( 140 | type(error).__name__, path, error, query 141 | ) 142 | ) 143 | 144 | # Close the DuckDB connection in the end 145 | finally: 146 | conn.close() 147 | 148 | # Return the fetched data, if applicable 149 | if fetch_df: 150 | return data 151 | 152 | if data: 153 | return data 154 | 155 | return wrapper 156 | 157 | return decorator 158 | -------------------------------------------------------------------------------- /ducksearch/delete/__init__.py: -------------------------------------------------------------------------------- 1 | from .documents import documents 2 | 3 | __all__ = ["documents"] 4 | -------------------------------------------------------------------------------- /ducksearch/delete/delete/documents.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM {schema}.documents 2 | USING parquet_scan('{parquet_file}') AS _df_documents 3 | WHERE {schema}.documents.id = _df_documents.id; 4 | -------------------------------------------------------------------------------- /ducksearch/delete/delete/documents_queries.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM {schema}.documents_queries 2 | USING parquet_scan('{parquet_file}') AS _df_documents 3 | WHERE {schema}.documents_queries.document_id = _df_documents.id; -------------------------------------------------------------------------------- /ducksearch/delete/delete/scores.sql: -------------------------------------------------------------------------------- 
1 | -- This query finds the set of tokens scores for which there won't be any docid / score to keep. 2 | WITH _docs_to_delete AS ( 3 | SELECT DISTINCT bm25.docid 4 | FROM parquet_scan('{parquet_file}') AS p 5 | INNER JOIN bm25_documents.docs AS bm25 6 | ON p.id = bm25.name 7 | ), 8 | 9 | _terms_to_recompute AS ( 10 | SELECT DISTINCT term 11 | FROM bm25_documents.terms 12 | INNER JOIN _docs_to_delete 13 | ON bm25_documents.terms.docid = _docs_to_delete.docid 14 | INNER JOIN bm25_documents.dict 15 | ON bm25_documents.terms.termid = bm25_documents.dict.termid 16 | ), 17 | 18 | _scores_to_update AS ( 19 | SELECT 20 | _bm25.term, 21 | _bm25.list_scores, 22 | _bm25.list_docids 23 | FROM bm25_documents.scores AS _bm25 24 | INNER JOIN _terms_to_recompute AS _terms 25 | ON _bm25.term = _terms.term 26 | ), 27 | 28 | _unested_scores AS ( 29 | SELECT 30 | term, 31 | unnest(list_scores) AS score, 32 | unnest(list_docids) AS docid 33 | FROM _scores_to_update 34 | ), 35 | 36 | _unested_unfiltered_scores AS ( 37 | SELECT 38 | _scores.term, 39 | _scores.docid, 40 | _scores.score, 41 | _docs.docid AS to_delete 42 | FROM _unested_scores AS _scores 43 | LEFT JOIN _docs_to_delete AS _docs 44 | ON _scores.docid = _docs.docid 45 | ), 46 | 47 | _unested_filtered_scores AS ( 48 | SELECT 49 | term, 50 | docid, 51 | score 52 | FROM _unested_unfiltered_scores 53 | WHERE to_delete IS NULL 54 | ), 55 | 56 | _terms_to_delete AS ( 57 | SELECT DISTINCT 58 | ttr.term, 59 | ufs.term AS missing 60 | FROM _terms_to_recompute AS ttr 61 | LEFT JOIN _unested_filtered_scores AS ufs 62 | ON ttr.term = ufs.term 63 | ), 64 | 65 | _scores_to_delete_completely AS ( 66 | SELECT DISTINCT term 67 | FROM _terms_to_delete 68 | WHERE missing IS NULL 69 | ) 70 | 71 | DELETE FROM bm25_documents.scores AS _scores 72 | USING _scores_to_delete_completely AS _scores_to_delete 73 | WHERE _scores.term = _scores_to_delete.term; 74 | -------------------------------------------------------------------------------- /ducksearch/delete/documents.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyarrow as pa 4 | import pyarrow.parquet as pq 5 | 6 | from ..decorators import execute_with_duckdb 7 | from ..utils import plot 8 | 9 | 10 | @execute_with_duckdb( 11 | relative_path="delete/delete/documents.sql", 12 | ) 13 | def _drop_documents() -> None: 14 | """Delete documents from the documents table in DuckDB.""" 15 | 16 | 17 | @execute_with_duckdb( 18 | relative_path="delete/update/scores.sql", 19 | ) 20 | def _update_score() -> None: 21 | """Update the score after deleting documents.""" 22 | 23 | 24 | @execute_with_duckdb( 25 | relative_path="delete/update/df.sql", 26 | ) 27 | def _update_df() -> None: 28 | """Update the token frequency deleting documents.""" 29 | 30 | 31 | @execute_with_duckdb( 32 | relative_path="delete/delete/scores.sql", 33 | ) 34 | def _delete_score() -> None: 35 | """Delete the scores for which we don't keep any document.""" 36 | 37 | 38 | @execute_with_duckdb( 39 | relative_path="delete/update/docs.sql", 40 | ) 41 | def _update_docs() -> None: 42 | """Update the docs table.""" 43 | 44 | 45 | @execute_with_duckdb( 46 | relative_path="delete/update/terms.sql", 47 | ) 48 | def _update_terms() -> None: 49 | """Update the term table.""" 50 | 51 | 52 | @execute_with_duckdb( 53 | relative_path="delete/update/stats.sql", 54 | ) 55 | def _update_stats() -> None: 56 | """Update the term table.""" 57 | 58 | 59 | def documents( 60 | database: str, 61 | ids: list[str], 62 | schema: 
str = "bm25_tables", 63 | config: dict | None = None, 64 | ) -> None: 65 | """Delete specified documents from the documents table. 66 | 67 | Parameters 68 | ---------- 69 | database 70 | The name of the DuckDB database. 71 | keys 72 | A list of document IDs to delete. 73 | schema 74 | The schema where the documents table is located. 75 | config 76 | Optional configuration options for the DuckDB connection. 77 | 78 | Returns 79 | ------- 80 | None 81 | The function deletes the specified documents and updates the plots. 82 | 83 | Examples 84 | -------- 85 | >>> from ducksearch import upload, delete 86 | 87 | >>> documents = [ 88 | ... {"id": 1, "title": "Document 1", "text": "This is the text of document 1."}, 89 | ... {"id": 2, "title": "Document 2", "text": "This is the text of document 2."}, 90 | ... {"id": 3, "title": "Document 3", "text": "This is the text of document 3."}, 91 | ... ] 92 | 93 | >>> upload.documents( 94 | ... database="test.duckdb", 95 | ... key="id", 96 | ... fields=["title", "text"], 97 | ... documents=documents, 98 | ... ) 99 | | Table | Size | 100 | |----------------|------| 101 | | documents | 3 | 102 | | bm25_documents | 3 | 103 | 104 | >>> delete.documents( 105 | ... database="test.duckdb", 106 | ... ids=[1, 2], 107 | ... ) 108 | | Table | Size | 109 | |----------------|------| 110 | | documents | 1 | 111 | | bm25_documents | 1 | 112 | 113 | >>> delete.documents( 114 | ... database="test.duckdb", 115 | ... ids=[1, 2, 3], 116 | ... ) 117 | 118 | """ 119 | # Convert the list of document keys into a pyarrow Table for deletion 120 | documents_ids = pa.Table.from_pydict({"id": ids}) 121 | 122 | # Write the document IDs to a parquet file for deletion 123 | pq.write_table( 124 | documents_ids, 125 | "_documents_ids.parquet", 126 | compression="snappy", 127 | ) 128 | 129 | _delete_score( 130 | database=database, 131 | parquet_file="_documents_ids.parquet", 132 | config=config, 133 | ) 134 | 135 | _update_score( 136 | database=database, 137 | parquet_file="_documents_ids.parquet", 138 | config=config, 139 | ) 140 | 141 | _update_df( 142 | database=database, 143 | parquet_file="_documents_ids.parquet", 144 | config=config, 145 | ) 146 | 147 | _update_terms( 148 | database=database, 149 | parquet_file="_documents_ids.parquet", 150 | config=config, 151 | ) 152 | 153 | _update_docs( 154 | database=database, 155 | parquet_file="_documents_ids.parquet", 156 | config=config, 157 | ) 158 | 159 | _update_stats( 160 | database=database, 161 | parquet_file="_documents_ids.parquet", 162 | config=config, 163 | ) 164 | 165 | _drop_documents( 166 | database=database, 167 | schema=schema, 168 | parquet_file="_documents_ids.parquet", 169 | config=config, 170 | ) 171 | 172 | if os.path.exists("_documents_ids.parquet"): 173 | os.remove("_documents_ids.parquet") 174 | 175 | # Plot the current state of the tables after deletion 176 | return plot( 177 | database=database, 178 | config=config, 179 | tables=[ 180 | f"{schema}.documents", 181 | f"{schema}.queries", 182 | "bm25_documents.docs", 183 | "bm25_queries.docs", 184 | "bm25_tables.documents_queries", 185 | ], 186 | ) 187 | -------------------------------------------------------------------------------- /ducksearch/delete/update/df.sql: -------------------------------------------------------------------------------- 1 | WITH _docs_to_delete AS ( 2 | SELECT DISTINCT bm25.docid 3 | FROM parquet_scan('{parquet_file}') AS p 4 | INNER JOIN bm25_documents.docs AS bm25 5 | ON p.id = bm25.name 6 | ), 7 | 8 | _tf AS ( 9 | SELECT 10 | termid, 11 | 
sum(tf) AS df 12 | FROM bm25_documents.terms 13 | INNER JOIN _docs_to_delete 14 | ON bm25_documents.terms.docid = _docs_to_delete.docid 15 | GROUP BY 1 16 | ) 17 | 18 | UPDATE bm25_documents.dict _dict 19 | SET df = greatest(_dict.df - _tf.df, 0) 20 | FROM _tf 21 | WHERE _dict.termid = _tf.termid; 22 | -------------------------------------------------------------------------------- /ducksearch/delete/update/docs.sql: -------------------------------------------------------------------------------- 1 | DELETE FROM bm25_documents.docs AS _docs 2 | USING parquet_scan('{parquet_file}') AS _df_documents 3 | WHERE _docs.name = _df_documents.id; 4 | -------------------------------------------------------------------------------- /ducksearch/delete/update/scores.sql: -------------------------------------------------------------------------------- 1 | -- This query finds the set of tokens scores for which there won't be any docid / score to keep. 2 | WITH _docs_to_delete AS ( 3 | SELECT DISTINCT bm25.docid 4 | FROM parquet_scan('{parquet_file}') AS p 5 | INNER JOIN bm25_documents.docs AS bm25 6 | ON p.id = bm25.name 7 | ), 8 | 9 | _terms_to_recompute AS ( 10 | SELECT DISTINCT term 11 | FROM bm25_documents.terms 12 | INNER JOIN _docs_to_delete 13 | ON bm25_documents.terms.docid = _docs_to_delete.docid 14 | INNER JOIN bm25_documents.dict 15 | ON bm25_documents.terms.termid = bm25_documents.dict.termid 16 | ), 17 | 18 | _scores_to_update AS ( 19 | SELECT 20 | _bm25.term, 21 | _bm25.list_scores, 22 | _bm25.list_docids 23 | FROM bm25_documents.scores AS _bm25 24 | INNER JOIN _terms_to_recompute AS _terms 25 | ON _bm25.term = _terms.term 26 | ), 27 | 28 | _unested_scores AS ( 29 | SELECT 30 | term, 31 | unnest(list_scores) AS score, 32 | unnest(list_docids) AS docid 33 | FROM _scores_to_update 34 | ), 35 | 36 | _unested_unfiltered_scores AS ( 37 | SELECT 38 | _scores.term, 39 | _scores.docid, 40 | _scores.score, 41 | _docs.docid AS to_delete 42 | FROM _unested_scores AS _scores 43 | LEFT JOIN _docs_to_delete AS _docs 44 | ON _scores.docid = _docs.docid 45 | ), 46 | 47 | _unested_filtered_scores AS ( 48 | SELECT 49 | term, 50 | docid, 51 | score 52 | FROM _unested_unfiltered_scores 53 | WHERE to_delete IS NULL 54 | ), 55 | 56 | _list_scores AS ( 57 | SELECT 58 | term, 59 | list(docid ORDER BY score DESC, docid ASC) AS list_docids, 60 | list(score ORDER BY score DESC, docid ASC) AS list_scores 61 | FROM _unested_filtered_scores 62 | GROUP BY 1 63 | ) 64 | 65 | UPDATE bm25_documents.scores s 66 | SET 67 | list_docids = u.list_docids, 68 | list_scores = u.list_scores 69 | FROM _list_scores AS u 70 | WHERE s.term = u.term; 71 | -------------------------------------------------------------------------------- /ducksearch/delete/update/stats.sql: -------------------------------------------------------------------------------- 1 | WITH _stats AS ( 2 | SELECT 3 | COUNT(*) AS num_docs, 4 | AVG(len) AS avgdl 5 | FROM bm25_documents.docs 6 | ) 7 | 8 | UPDATE bm25_documents.stats 9 | SET 10 | num_docs = _stats.num_docs, 11 | avgdl = _stats.avgdl 12 | FROM _stats; 13 | -------------------------------------------------------------------------------- /ducksearch/delete/update/terms.sql: -------------------------------------------------------------------------------- 1 | WITH _docs_to_delete AS ( 2 | SELECT bm25.docid 3 | FROM parquet_scan('{parquet_file}') AS p 4 | INNER JOIN bm25_documents.docs AS bm25 5 | ON p.id = bm25.name 6 | ) 7 | 8 | DELETE FROM bm25_documents.terms AS _terms 9 | USING _docs_to_delete AS _docs 10 | 
WHERE _terms.docid = _docs.docid; 11 | -------------------------------------------------------------------------------- /ducksearch/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import evaluate, load_beir 2 | 3 | __all__ = ["evaluate", "load_beir"] 4 | -------------------------------------------------------------------------------- /ducksearch/evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from typing import Dict 3 | 4 | __all__ = ["evaluate", "load_beir"] 5 | 6 | 7 | def load_beir(dataset_name: str, split: str = "test") -> tuple[list, list, dict]: 8 | """Load BEIR dataset for document and query retrieval tasks. 9 | 10 | Parameters 11 | ---------- 12 | dataset_name 13 | The name of the dataset to load (e.g., 'scifact'). 14 | split 15 | The dataset split to load (e.g., 'test'). 16 | 17 | Returns 18 | ------- 19 | tuple 20 | A tuple containing three elements: 21 | - A list of document dictionaries, each containing 'id', 'title', and 'text' fields. 22 | - A list of queries. 23 | - A dictionary of qrels (query relevance judgments). 24 | 25 | Examples 26 | -------- 27 | >>> documents, queries, qrels = load_beir("scifact", split="test") 28 | 29 | >>> len(documents) 30 | 5183 31 | 32 | >>> len(queries) 33 | 300 34 | 35 | """ 36 | from beir import util 37 | from beir.datasets.data_loader import GenericDataLoader 38 | 39 | data_path = util.download_and_unzip( 40 | url=f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip", 41 | out_dir="./evaluation_datasets/", 42 | ) 43 | 44 | documents, queries, qrels = GenericDataLoader(data_folder=data_path).load( 45 | split=split 46 | ) 47 | 48 | # Format documents 49 | documents = [ 50 | { 51 | "id": document_id, 52 | "title": document["title"], 53 | "text": document["text"], 54 | } 55 | for document_id, document in documents.items() 56 | ] 57 | 58 | _queries = [queries[query_id] for query_id, _ in qrels.items()] 59 | 60 | # Format qrels (relevance judgments) 61 | _qrels = collections.defaultdict(dict) 62 | for query_id, query_documents in qrels.items(): 63 | for document in list(query_documents.keys()): 64 | if query_id in queries: 65 | _qrels[document][queries[query_id]] = 1 66 | 67 | return ( 68 | documents, 69 | _queries, 70 | _qrels, 71 | ) 72 | 73 | 74 | def evaluate( 75 | scores: list[list[dict]], 76 | qrels: dict, 77 | queries: list[str], 78 | metrics: list = [], 79 | ) -> Dict[str, float]: 80 | """Evaluate the performance of document retrieval using relevance judgments. 81 | 82 | Parameters 83 | ---------- 84 | scores 85 | A list of lists, where each sublist contains dictionaries representing the retrieved documents for a query. 86 | qrels 87 | A dictionary mapping queries to relevant documents and their relevance scores. 88 | queries 89 | A list of queries. 90 | metrics 91 | A list of metrics to compute. Default includes "ndcg@10" and hits at various levels (e.g., hits@1, hits@10). 92 | 93 | Returns 94 | ------- 95 | dict 96 | A dictionary mapping each metric to its computed value. 97 | 98 | Examples 99 | -------- 100 | >>> from ducksearch import evaluation, upload, search 101 | 102 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="test") 103 | 104 | >>> upload.documents( 105 | ... database="test.duckdb", 106 | ... key="id", 107 | ... fields=["title", "text"], 108 | ... documents=documents, 109 | ... 
) 110 | | Table | Size | 111 | |----------------|------| 112 | | documents | 5183 | 113 | | bm25_documents | 5183 | 114 | 115 | >>> scores = search.documents( 116 | ... database="test.duckdb", 117 | ... queries=queries, 118 | ... top_k=10, 119 | ... ) 120 | 121 | """ 122 | from ranx import Qrels, Run, evaluate 123 | 124 | # Format qrels for evaluation 125 | _qrels = collections.defaultdict(dict) 126 | for document_id, document_queries in qrels.items(): 127 | for query, score in document_queries.items(): 128 | _qrels[query][document_id] = score 129 | 130 | qrels = Qrels(qrels=_qrels) 131 | 132 | # Create a run dict to map queries to their respective retrieved documents and scores 133 | run_dict = { 134 | query: { 135 | match["id"]: 1 - (rank / len(query_matches)) 136 | for rank, match in enumerate(iterable=query_matches) 137 | } 138 | for query, query_matches in zip(queries, scores) 139 | } 140 | 141 | run = Run(run=run_dict) 142 | 143 | # Default metrics if none are provided 144 | if not metrics: 145 | metrics = ["ndcg@10"] + [f"hits@{k}" for k in [1, 2, 3, 4, 5, 10]] 146 | 147 | # Evaluate using ranx and return results 148 | return evaluate( 149 | qrels=qrels, 150 | run=run, 151 | metrics=metrics, 152 | make_comparable=True, 153 | ) 154 | -------------------------------------------------------------------------------- /ducksearch/hf/__init__.py: -------------------------------------------------------------------------------- 1 | from .insert import count_rows, insert_documents 2 | 3 | __all__ = ["count_rows", "insert_documents"] 4 | -------------------------------------------------------------------------------- /ducksearch/hf/drop/tmp.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE {schema}._hf_tmp; 2 | -------------------------------------------------------------------------------- /ducksearch/hf/insert.py: -------------------------------------------------------------------------------- 1 | from ..decorators import execute_with_duckdb 2 | from ..tables import add_columns_documents, create_documents 3 | 4 | 5 | @execute_with_duckdb( 6 | relative_path="hf/insert/documents.sql", 7 | fetch_df=False, 8 | ) 9 | def _insert_documents() -> None: 10 | """Insert the documents from Hugging Face datasets into DuckDB.""" 11 | 12 | 13 | @execute_with_duckdb( 14 | relative_path="hf/select/count.sql", 15 | fetch_df=True, 16 | ) 17 | def count_rows() -> None: 18 | """Count the number of rows in a Hugging Face dataset exposed as a Parquet URL.""" 19 | 20 | 21 | @execute_with_duckdb( 22 | relative_path="hf/select/columns.sql", 23 | fetch_df=True, 24 | read_only=True, 25 | ) 26 | def _select_columns() -> None: 27 | """Select all columns from the HuggingFace documents table.""" 28 | 29 | 30 | @execute_with_duckdb( 31 | relative_path="hf/select/exists.sql", 32 | fetch_df=True, 33 | read_only=True, 34 | ) 35 | def _table_exists() -> None: 36 | """Check if the table exists in the DuckDB database.""" 37 | 38 | 39 | @execute_with_duckdb( 40 | relative_path="hf/insert/tmp.sql", 41 | fetch_df=False, 42 | ) 43 | def _insert_tmp_documents() -> None: 44 | """Load the Hugging Face dataset into a temporary, deduplicated DuckDB table.""" 45 | 46 | 47 | @execute_with_duckdb( 48 | relative_path="hf/drop/tmp.sql", 49 | fetch_df=True, 50 | ) 51 | def _drop_tmp_table() -> None: 52 | """Drop the temporary HF table.""" 53 | 54 | 55 | def insert_documents( 56 | database: str, 57 | schema: str, 58 | key: str, 59 | url: list[str] | str, 60 | config: dict | None = None, 61 | limit: int | None = None, 62 | 
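    # Editor's note: limit / offset below are rendered into LIMIT / OFFSET
    # clauses applied to the remote parquet scan (see offset_hf / limit_hf
    # in the function body).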
offset: int | None = None, 63 | dtypes: dict | None = None, 64 | fast: bool = False, 65 | ) -> None: 66 | """Insert documents from a Hugging Face dataset into DuckDB. 67 | 68 | Parameters 69 | ---------- 70 | database 71 | The name of the DuckDB database. 72 | schema 73 | The schema in which the documents table is located. 74 | key 75 | The key field that uniquely identifies each document (e.g., 'query_id'). 76 | dtypes 77 | Optional dictionary mapping field names to DuckDB types. Defaults to 'VARCHAR' for unspecified fields. 78 | url 79 | The URL of the Hugging Face dataset in Parquet format. 80 | config 81 | Optional configuration options for the DuckDB connection. 82 | 83 | Examples 84 | -------- 85 | >>> from ducksearch import upload 86 | 87 | >>> upload.documents( 88 | ... database="test.duckdb", 89 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/queries.parquet", 90 | ... key="query_id", 91 | ... fields=["query_id", "text"], 92 | ... ) 93 | | Table | Size | 94 | |----------------|------| 95 | | documents | 19 | 96 | | bm25_documents | 19 | 97 | 98 | >>> upload.documents( 99 | ... database="test.duckdb", 100 | ... documents="hf://datasets/lightonai/lighton-ms-marco-mini/documents.parquet", 101 | ... key="document_id", 102 | ... fields=["document_id", "text"], 103 | ... ) 104 | | Table | Size | 105 | |----------------|------| 106 | | documents | 51 | 107 | | bm25_documents | 51 | 108 | 109 | """ 110 | offset_hf = f"OFFSET {offset}" if offset is not None else "" 111 | limit_hf = f"LIMIT {limit}" if limit is not None else "" 112 | 113 | _insert_tmp_documents( 114 | database=database, 115 | schema=schema, 116 | url=url, 117 | key_field=key, 118 | config=config, 119 | offset_hf=offset_hf, 120 | limit_hf=limit_hf, 121 | ) 122 | 123 | exists = _table_exists( 124 | database=database, 125 | schema=schema, 126 | table_name="documents", 127 | )[0]["table_exists"] 128 | 129 | _hf_tmp_columns = _select_columns( 130 | database=database, 131 | schema=schema, 132 | table_name="_hf_tmp", 133 | ) 134 | 135 | _hf_tmp_columns = [ 136 | column["column"] for column in _hf_tmp_columns if column["column"] != "id" 137 | ] 138 | 139 | if exists: 140 | documents_columns = _select_columns( 141 | database=database, 142 | schema=schema, 143 | table_name="documents", 144 | ) 145 | 146 | documents_columns = set( 147 | [column["column"] for column in documents_columns if column["column"] != "id"] 148 | ) 149 | 150 | columns_to_add = list(set(_hf_tmp_columns) - documents_columns) 151 | 152 | if columns_to_add: 153 | add_columns_documents( 154 | database=database, 155 | schema=schema, 156 | columns=columns_to_add, 157 | dtypes=dtypes, 158 | config=config, 159 | ) 160 | else: 161 | create_documents( 162 | database=database, 163 | schema=schema, 164 | columns=_hf_tmp_columns, 165 | dtypes=dtypes, 166 | config=config, 167 | ) 168 | 169 | _insert_documents( 170 | database=database, 171 | schema=schema, 172 | url=url, 173 | key_field=key, 174 | _hf_tmp_columns=", ".join(_hf_tmp_columns), 175 | limit_hf=limit_hf, 176 | config=config, 177 | ) 178 | 179 | _drop_tmp_table( 180 | database=database, 181 | schema=schema, 182 | config=config, 183 | ) 184 | -------------------------------------------------------------------------------- /ducksearch/hf/insert/documents.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.documents (id, {_hf_tmp_columns}) ( 2 | WITH _hf_dataset AS ( 3 | SELECT 4 | id, 5 | * EXCLUDE (id) 6 | FROM {schema}._hf_tmp 7 | 
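-- Editor's note: this statement is an anti-join insert. The temporary
-- _hf_tmp table is left-joined against the existing documents below and only
-- rows whose id is not already present (existing_id IS NULL) are inserted,
-- so re-running the same upload does not create duplicate documents.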
), 8 | 9 | _new_hf_dataset AS ( 10 | SELECT 11 | _hf_dataset.*, 12 | d.id AS existing_id 13 | FROM _hf_dataset 14 | LEFT JOIN {schema}.documents AS d 15 | ON _hf_dataset.id = d.id 16 | 17 | ) 18 | 19 | SELECT id, {_hf_tmp_columns} 20 | FROM _new_hf_dataset 21 | WHERE existing_id IS NULL 22 | ); 23 | -------------------------------------------------------------------------------- /ducksearch/hf/insert/tmp.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE {schema}._hf_tmp AS ( 2 | WITH _hf_dataset AS ( 3 | SELECT 4 | {key_field} AS id, 5 | * 6 | FROM '{url}' 7 | {limit_hf} 8 | {offset_hf} 9 | ), 10 | 11 | _hf_row_number AS ( 12 | SELECT 13 | *, 14 | ROW_NUMBER() OVER (PARTITION BY id ORDER BY id, RANDOM()) AS _row_number 15 | FROM _hf_dataset 16 | ) 17 | 18 | SELECT * EXCLUDE (_row_number) 19 | FROM _hf_row_number 20 | WHERE _row_number = 1 21 | ); 22 | -------------------------------------------------------------------------------- /ducksearch/hf/select/columns.sql: -------------------------------------------------------------------------------- 1 | SELECT column_name as column 2 | FROM information_schema.columns 3 | WHERE 4 | lower(table_name) = '{table_name}' 5 | AND table_schema = '{schema}'; 6 | -------------------------------------------------------------------------------- /ducksearch/hf/select/count.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | count(*) as count 3 | FROM '{url}'; 4 | -------------------------------------------------------------------------------- /ducksearch/hf/select/exists.sql: -------------------------------------------------------------------------------- 1 | SELECT EXISTS( 2 | SELECT 1 3 | FROM information_schema.tables 4 | WHERE 5 | LOWER(table_name) = LOWER('{table_name}') 6 | AND table_schema = '{schema}' 7 | ) AS table_exists; 8 | -------------------------------------------------------------------------------- /ducksearch/search/__init__.py: -------------------------------------------------------------------------------- 1 | from .create import update_index_documents, update_index_queries 2 | from .graphs import graphs 3 | from .select import documents, queries, search 4 | 5 | __all__ = [ 6 | "update_index_documents", 7 | "update_index_queries", 8 | "documents", 9 | "queries", 10 | "graphs", 11 | "search", 12 | ] 13 | -------------------------------------------------------------------------------- /ducksearch/search/create/index.sql: -------------------------------------------------------------------------------- 1 | PRAGMA CREATE_FTS_INDEX( 2 | '{schema}._documents', 3 | 'id', 4 | '_search', 5 | STEMMER='{stemmer}', 6 | STOPWORDS='{stopwords}', 7 | IGNORE='{ignore}', 8 | STRIP_ACCENTS={strip_accents}, 9 | LOWER={lower}, 10 | OVERWRITE=1 11 | ); 12 | -------------------------------------------------------------------------------- /ducksearch/search/create/queries_index.sql: -------------------------------------------------------------------------------- 1 | PRAGMA CREATE_FTS_INDEX( 2 | '{schema}._queries_{random_hash}', 3 | 'query', 4 | 'query', 5 | STEMMER='{stemmer}', 6 | STOPWORDS='{stopwords}', 7 | IGNORE='{ignore}', 8 | STRIP_ACCENTS={strip_accents}, 9 | LOWER={lower}, 10 | OVERWRITE=1 11 | ); -------------------------------------------------------------------------------- /ducksearch/search/create/settings.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS 
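-- Editor's note (inferred from search/create.py, which writes and re-reads
-- this table): settings persists the BM25 parameters k1 and b plus the FTS
-- preprocessing options, so that later index updates and query-time
-- tokenization reuse exactly the configuration the index was built with.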
{schema}.settings ( 2 | k1 FLOAT, 3 | b FLOAT, 4 | stemmer VARCHAR, 5 | stopwords VARCHAR, 6 | ignore VARCHAR, 7 | strip_accents INT, 8 | lower INT 9 | ); 10 | -------------------------------------------------------------------------------- /ducksearch/search/create/stopwords.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE {schema}.stopwords AS ( 2 | SELECT sw 3 | FROM parquet_scan('{parquet_file}') 4 | ); 5 | -------------------------------------------------------------------------------- /ducksearch/search/create/tables.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA IF NOT EXISTS {schema}; 2 | 3 | CREATE SEQUENCE IF NOT EXISTS SEQ_{schema}_dict START 1; 4 | 5 | CREATE TABLE IF NOT EXISTS {schema}.dict ( 6 | termid INT PRIMARY KEY DEFAULT NEXTVAL('SEQ_{schema}_dict'), 7 | term VARCHAR, 8 | df INT 9 | ); 10 | 11 | CREATE TABLE IF NOT EXISTS {schema}.scores ( 12 | term VARCHAR, 13 | list_docids INT[], 14 | list_scores FLOAT4[] 15 | ); 16 | 17 | CREATE SEQUENCE IF NOT EXISTS SEQ_{schema}_docs START 1; 18 | 19 | CREATE TABLE IF NOT EXISTS {schema}.docs ( 20 | docid INT PRIMARY KEY DEFAULT NEXTVAL('SEQ_{schema}_docs'), 21 | len INT, 22 | name VARCHAR 23 | ); 24 | 25 | CREATE TABLE IF NOT EXISTS {schema}.stats ( 26 | num_docs INT, 27 | avgdl FLOAT 28 | ); 29 | 30 | CREATE TABLE IF NOT EXISTS {schema}.terms ( 31 | docid INT, 32 | termid INT, 33 | tf INT 34 | ); 35 | 36 | CREATE TABLE IF NOT EXISTS {schema}.stopwords ( 37 | sw VARCHAR 38 | ); 39 | 40 | CREATE OR REPLACE TABLE {schema}._documents AS ( 41 | WITH _indexed_documents AS ( 42 | SELECT 43 | s.*, 44 | d.name AS existing_id 45 | FROM {source_schema}.{source} s 46 | LEFT JOIN {schema}.docs d 47 | ON s.id = d.name 48 | ) 49 | 50 | SELECT 51 | {key_field} AS id, 52 | CONCAT_WS(' ', 53 | {fields} 54 | ) AS _search 55 | FROM _indexed_documents 56 | WHERE existing_id IS NULL 57 | ); 58 | -------------------------------------------------------------------------------- /ducksearch/search/drop/_documents.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE {schema}._documents; -------------------------------------------------------------------------------- /ducksearch/search/drop/queries.sql: -------------------------------------------------------------------------------- 1 | DROP SCHEMA fts_{schema}__queries_{random_hash} CASCADE; 2 | DROP TABLE {schema}._queries_{random_hash}; 3 | -------------------------------------------------------------------------------- /ducksearch/search/drop/schema.sql: -------------------------------------------------------------------------------- 1 | DROP SCHEMA fts_{schema}__documents CASCADE; -------------------------------------------------------------------------------- /ducksearch/search/drop/scores.sql: -------------------------------------------------------------------------------- 1 | WITH _terms_scores_to_drop AS ( 2 | SELECT DISTINCT 3 | d.term 4 | FROM fts_{schema}__documents.dict fts 5 | INNER JOIN {schema}.dict d 6 | ON fts.term = d.term 7 | ) 8 | 9 | DELETE FROM {schema}.scores s 10 | USING _terms_scores_to_drop t 11 | WHERE s.term = t.term; -------------------------------------------------------------------------------- /ducksearch/search/graphs.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import logging 3 | import os 4 | import resource 5 | 6 | import pyarrow as pa 7 
| import pyarrow.parquet as pq 8 | import tqdm 9 | from joblib import delayed 10 | 11 | from ..decorators import execute_with_duckdb 12 | from ..utils import ParallelTqdm, batchify, generate_random_hash 13 | from .create import _select_settings 14 | from .select import _create_queries_index, _insert_queries 15 | 16 | 17 | @execute_with_duckdb( 18 | relative_path="search/select/search_graph.sql", 19 | read_only=True, 20 | fetch_df=True, 21 | ) 22 | def _search_graph_query(): 23 | """Execute a graph-based search query in DuckDB.""" 24 | 25 | 26 | @execute_with_duckdb( 27 | relative_path="search/select/search_graph_filters.sql", 28 | read_only=True, 29 | fetch_df=True, 30 | ) 31 | def _search_graph_filters_query(): 32 | """Execute a graph-based search query in DuckDB with filters.""" 33 | 34 | 35 | def _search_graph( 36 | database: str, 37 | queries: list[str], 38 | top_k: int, 39 | top_k_token: int, 40 | group_id: int, 41 | random_hash: str, 42 | config: dict | None = None, 43 | filters: str | None = None, 44 | ) -> list: 45 | """Perform a graph-based search in DuckDB. 46 | 47 | Parameters 48 | ---------- 49 | database 50 | The name of the DuckDB database. 51 | queries 52 | The list of queries to search. 53 | top_k 54 | The number of top results to retrieve for each query. 55 | top_k_token 56 | The number of top tokens to retrieve. Used to select top documents per token. 57 | group_id 58 | The index of the current batch of queries. 59 | config 60 | Optional configuration settings for the DuckDB connection. 61 | filters 62 | Optional SQL filters to apply during the search. 63 | 64 | Returns 65 | ------- 66 | list 67 | A list of search results for each query in the batch. 68 | """ 69 | search_function = ( 70 | _search_graph_filters_query if filters is not None else _search_graph_query 71 | ) 72 | 73 | matchs = search_function( 74 | database=database, 75 | queries_schema="bm25_queries", 76 | documents_schema="bm25_documents", 77 | source_schema="bm25_tables", 78 | top_k=top_k, 79 | group_id=group_id, 80 | random_hash=random_hash, 81 | top_k_token=top_k_token, 82 | filters=filters, 83 | config=config, 84 | ) 85 | 86 | candidates = collections.defaultdict(list) 87 | for match in matchs: 88 | query = match.pop("_query") 89 | candidates[query].append(match) 90 | return [candidates[query] for query in queries] 91 | 92 | 93 | def graphs( 94 | database: str, 95 | queries: str | list[str], 96 | batch_size: int = 30, 97 | top_k: int = 1000, 98 | top_k_token: int = 30_000, 99 | n_jobs: int = -1, 100 | config: dict | None = None, 101 | filters: str | None = None, 102 | tqdm_bar: bool = True, 103 | ) -> list[dict]: 104 | """Search for documents in DuckDB, enriching BM25 scores with the documents-queries graph. 105 | 106 | Parameters 107 | ---------- 108 | database 109 | The name of the DuckDB database. 110 | queries 111 | A string or list of query strings to search for. 112 | batch_size 113 | The batch size for processing queries. 114 | top_k 115 | The number of top documents to retrieve for each query. 116 | top_k_token 117 | The number of top tokens to retrieve. 118 | n_jobs 119 | The number of parallel jobs to use. Defaults to using all available processors. 120 | config 121 | Optional configuration settings for the DuckDB connection. 122 | filters 123 | Optional SQL filters to apply during the search. 124 | 125 | Returns 126 | ------- 127 | list[dict] 128 | A list of search results, where each result corresponds to a query.
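
    Notes
    -----
    Editor's sketch of the flow, summarising the body of this function rather
    than any extra API: the queries are written to a temporary parquet file
    and indexed with the same FTS settings as the documents; each batch is
    then scored against both the document and the query BM25 indexes, and the
    two result sets are merged through the documents_queries edges before the
    top_k documents are returned per query.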
129 | 130 | Examples 131 | -------- 132 | >>> from ducksearch import evaluation, upload, search 133 | 134 | >>> documents, queries, qrels = evaluation.load_beir("scifact", split="train") 135 | 136 | >>> upload.documents( 137 | ... database="test.duckdb", 138 | ... key="id", 139 | ... fields=["title", "text"], 140 | ... documents=documents, 141 | ... ) 142 | | Table | Size | 143 | |----------------|------| 144 | | documents | 5183 | 145 | | bm25_documents | 5183 | 146 | 147 | >>> upload.queries( 148 | ... database="test.duckdb", 149 | ... queries=queries, 150 | ... documents_queries=qrels, 151 | ... ) 152 | | Table | Size | 153 | |-------------------|------| 154 | | documents | 5183 | 155 | | queries | 807 | 156 | | bm25_documents | 5183 | 157 | | bm25_queries | 807 | 158 | | documents_queries | 916 | 159 | 160 | 161 | 162 | """ 163 | resource.setrlimit( 164 | resource.RLIMIT_CORE, (resource.RLIM_INFINITY, resource.RLIM_INFINITY) 165 | ) 166 | 167 | if isinstance(queries, str): 168 | queries = [queries] 169 | 170 | logging.info("Indexing queries.") 171 | random_hash = generate_random_hash() 172 | 173 | batchs = { 174 | group_id: batch 175 | for group_id, batch in enumerate( 176 | iterable=batchify( 177 | X=queries, batch_size=batch_size, desc="Searching", tqdm_bar=False 178 | ) 179 | ) 180 | } 181 | 182 | parquet_file = f"_queries_{random_hash}.parquet" 183 | pa_queries, pa_group_ids = [], [] 184 | for group_id, batch_queries in batchs.items(): 185 | pa_queries.extend(batch_queries) 186 | pa_group_ids.extend([group_id] * len(batch_queries)) 187 | 188 | logging.info("Indexing queries.") 189 | index_table = pa.Table.from_pydict({"query": pa_queries, "group_id": pa_group_ids}) 190 | 191 | pq.write_table(index_table, parquet_file, compression="snappy") 192 | 193 | _insert_queries( 194 | database=database, 195 | schema="bm25_documents", 196 | parquet_file=parquet_file, 197 | random_hash=random_hash, 198 | config=config, 199 | ) 200 | 201 | if os.path.exists(parquet_file): 202 | os.remove(parquet_file) 203 | 204 | settings = _select_settings( 205 | database=database, schema="bm25_documents", config=config 206 | )[0] 207 | 208 | _create_queries_index( 209 | database=database, 210 | schema="bm25_documents", 211 | random_hash=random_hash, 212 | **settings, 213 | config=config, 214 | ) 215 | 216 | matchs = [] 217 | if n_jobs == 1 or len(batchs) == 1: 218 | if tqdm_bar: 219 | bar = tqdm.tqdm( 220 | total=len(batchs), 221 | position=0, 222 | desc="Searching", 223 | ) 224 | 225 | for group_id, batch_queries in batchs.items(): 226 | matchs.extend( 227 | _search_graph( 228 | database=database, 229 | queries=batch_queries, 230 | top_k=top_k, 231 | top_k_token=top_k_token, 232 | group_id=group_id, 233 | random_hash=random_hash, 234 | config=config, 235 | filters=filters, 236 | ) 237 | ) 238 | if tqdm_bar: 239 | bar.update(1) 240 | else: 241 | for match in ParallelTqdm( 242 | n_jobs=n_jobs, 243 | backend="threading", 244 | total=len(batchs), 245 | desc="Searching", 246 | tqdm_bar=tqdm_bar, 247 | )( 248 | delayed(_search_graph)( 249 | database, 250 | batch_queries, 251 | top_k, 252 | top_k_token, 253 | group_id, 254 | random_hash, 255 | config, 256 | filters, 257 | ) 258 | for group_id, batch_queries in batchs.items() 259 | ): 260 | matchs.extend(match) 261 | 262 | return matchs 263 | -------------------------------------------------------------------------------- /ducksearch/search/insert/dict.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.dict 
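-- Editor's note: this insert only adds terms that are new to the index
-- (existing_id IS NULL below); the document frequency of terms that already
-- exist is incremented separately by search/update/dict.sql.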
(term, df) 2 | 3 | WITH _new_terms AS ( 4 | SELECT 5 | fts.df, 6 | fts.term, 7 | d.termid AS existing_id 8 | FROM fts_{schema}__documents.dict fts 9 | LEFT JOIN {schema}.dict d 10 | ON fts.term = d.term 11 | ) 12 | 13 | SELECT 14 | term, 15 | df 16 | FROM _new_terms 17 | WHERE existing_id IS NULL; 18 | -------------------------------------------------------------------------------- /ducksearch/search/insert/docs.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.docs (len, name) 2 | 3 | SELECT 4 | len, 5 | name 6 | FROM fts_{schema}__documents.docs; 7 | -------------------------------------------------------------------------------- /ducksearch/search/insert/queries.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE {schema}._queries_{random_hash} AS ( 2 | SELECT 3 | query, 4 | group_id 5 | FROM parquet_scan('{parquet_file}') 6 | ); 7 | -------------------------------------------------------------------------------- /ducksearch/search/insert/settings.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.settings (k1, b, stemmer, stopwords, ignore, strip_accents, lower) 2 | VALUES ({k1}, {b}, '{stemmer}', '{stopwords}', '{ignore}', {strip_accents}, {lower}); -------------------------------------------------------------------------------- /ducksearch/search/insert/terms.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.terms (docid, termid, tf) 2 | 3 | WITH _raw_terms AS ( 4 | SELECT DISTINCT termid FROM parquet_scan('{parquet_file}') 5 | ), 6 | 7 | _unfiltered_raw_terms AS ( 8 | SELECT DISTINCT 9 | _dict.term, 10 | sw.sw IS NOT NULL AS is_stopword 11 | FROM _raw_terms _rt 12 | INNER JOIN {schema}.dict _dict 13 | ON _rt.termid = _dict.termid 14 | LEFT JOIN {schema}.stopwords sw 15 | ON _dict.term = sw.sw 16 | ), 17 | 18 | _filtered_raw_terms AS ( 19 | SELECT 20 | term 21 | FROM _unfiltered_raw_terms 22 | WHERE is_stopword = FALSE 23 | ), 24 | 25 | _filtered_raw_terms_bm25id AS ( 26 | SELECT DISTINCT 27 | ftsdi.termid 28 | FROM _filtered_raw_terms _raw 29 | JOIN fts_{schema}__documents.dict ftsdi 30 | ON _raw.term = ftsdi.term 31 | ) 32 | 33 | , _documents_terms_filter AS ( 34 | SELECT 35 | docid, 36 | _terms.termid, 37 | COUNT(*) AS tf 38 | FROM fts_{schema}__documents.terms _terms 39 | INNER JOIN _filtered_raw_terms_bm25id _raw 40 | ON _terms.termid = _raw.termid 41 | GROUP BY 1, 2 42 | ) 43 | 44 | SELECT 45 | docs.docid, 46 | dict.termid, 47 | dt.tf 48 | FROM _documents_terms_filter dt 49 | JOIN fts_{schema}__documents.dict ftsdi 50 | ON dt.termid = ftsdi.termid 51 | JOIN fts_{schema}__documents.docs ftsdo 52 | ON dt.docid = ftsdo.docid 53 | JOIN {schema}.dict dict 54 | ON ftsdi.term = dict.term 55 | JOIN {schema}.docs docs 56 | ON ftsdo.name = docs.name; 57 | -------------------------------------------------------------------------------- /ducksearch/search/select/search.sql: -------------------------------------------------------------------------------- 1 | WITH group_queries AS ( 2 | SELECT 3 | query 4 | FROM {schema}._queries_{random_hash} 5 | WHERE group_id = {group_id} 6 | ), 7 | 8 | _input_queries AS ( 9 | SELECT 10 | pf.query, 11 | ftsdict.term 12 | FROM group_queries pf 13 | JOIN fts_{schema}__queries_{random_hash}.docs docs 14 | ON pf.query = docs.name 15 | JOIN fts_{schema}__queries_{random_hash}.terms terms 16 | ON docs.docid = terms.docid 
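    -- Editor's note: queries are tokenized through their own ephemeral FTS
    -- index (the fts_..._queries_... schema), built with the same stemmer and
    -- stopword settings as the documents, so the terms resolved below line up
    -- with the precomputed per-term score lists joined in _nested_matchs.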
17 | JOIN fts_{schema}__queries_{random_hash}.dict ftsdict 18 | ON terms.termid = ftsdict.termid 19 | ), 20 | 21 | _nested_matchs AS ( 22 | SELECT 23 | iq.query, 24 | s.list_docids[0:{top_k_token}] as list_docids, 25 | s.list_scores[0:{top_k_token}] as list_scores 26 | FROM {schema}.scores s 27 | INNER JOIN _input_queries iq 28 | ON s.term = iq.term 29 | ), 30 | 31 | _matchs AS ( 32 | SELECT 33 | query, 34 | UNNEST( 35 | s.list_docids 36 | ) AS bm25id, 37 | UNNEST( 38 | s.list_scores 39 | ) AS score 40 | FROM _nested_matchs s 41 | ), 42 | 43 | _matchs_scores AS ( 44 | SELECT 45 | query, 46 | bm25id, 47 | SUM(score) AS score 48 | FROM _matchs 49 | GROUP BY 1, 2 50 | ), 51 | 52 | _partition_scores AS ( 53 | SELECT 54 | query, 55 | bm25id, 56 | score, 57 | RANK() OVER (PARTITION BY query ORDER BY score DESC, RANDOM() ASC) AS rank 58 | FROM _matchs_scores 59 | QUALIFY rank <= {top_k} 60 | ) 61 | 62 | SELECT 63 | s.* EXCLUDE (bm25id), 64 | ps.score, 65 | ps.query AS _query 66 | FROM _partition_scores ps 67 | INNER JOIN {source_schema}.{source} s 68 | ON ps.bm25id = s.bm25id 69 | ORDER BY score DESC; 70 | -------------------------------------------------------------------------------- /ducksearch/search/select/search_filters.sql: -------------------------------------------------------------------------------- 1 | WITH group_queries AS ( 2 | SELECT 3 | query 4 | FROM {schema}._queries_{random_hash} 5 | WHERE group_id = {group_id} 6 | ), 7 | 8 | _input_queries AS ( 9 | SELECT 10 | pf.query, 11 | ftsdict.term 12 | FROM group_queries pf 13 | JOIN fts_{schema}__queries_{random_hash}.docs docs 14 | ON pf.query = docs.name 15 | JOIN fts_{schema}__queries_{random_hash}.terms terms 16 | ON docs.docid = terms.docid 17 | JOIN fts_{schema}__queries_{random_hash}.dict ftsdict 18 | ON terms.termid = ftsdict.termid 19 | ), 20 | 21 | _matchs AS ( 22 | SELECT 23 | query, 24 | UNNEST( 25 | s.list_docids[:{top_k_token}] 26 | ) AS bm25id, 27 | UNNEST( 28 | s.list_scores[:{top_k_token}] 29 | ) AS score 30 | FROM _input_queries iq 31 | INNER JOIN {schema}.scores s 32 | ON iq.term = s.term 33 | ), 34 | 35 | _matchs_scores AS ( 36 | SELECT 37 | query AS _query, 38 | bm25id, 39 | SUM(score) AS _score 40 | FROM _matchs 41 | GROUP BY 1, 2 42 | ), 43 | 44 | _documents_filter AS ( 45 | SELECT 46 | * 47 | FROM {source_schema}.{source} 48 | WHERE {filters} 49 | ), 50 | 51 | _filtered_scores AS ( 52 | SELECT 53 | _query, 54 | _score, 55 | s.* EXCLUDE (bm25id) 56 | FROM _matchs_scores ms 57 | INNER JOIN _documents_filter s 58 | ON ms.bm25id = s.bm25id 59 | ), 60 | 61 | _partition_scores AS ( 62 | SELECT 63 | _query, 64 | _score AS score, 65 | * EXCLUDE (_score, _query), 66 | RANK() OVER (PARTITION BY _query {order_by}, RANDOM() ASC) AS _row_number 67 | FROM _filtered_scores 68 | QUALIFY _row_number <= {top_k} 69 | ) 70 | 71 | SELECT 72 | * EXCLUDE (_row_number) 73 | FROM _partition_scores 74 | {order_by}; 75 | -------------------------------------------------------------------------------- /ducksearch/search/select/search_graph.sql: -------------------------------------------------------------------------------- 1 | WITH group_queries AS ( 2 | SELECT 3 | query 4 | FROM {documents_schema}._queries_{random_hash} 5 | WHERE group_id = {group_id} 6 | ), 7 | 8 | _input_queries AS ( 9 | SELECT 10 | pf.query, 11 | ftsdict.term 12 | FROM group_queries pf 13 | JOIN fts_{documents_schema}__queries_{random_hash}.docs docs 14 | ON pf.query = docs.name 15 | JOIN fts_{documents_schema}__queries_{random_hash}.terms terms 16 | ON docs.docid = 
terms.docid 17 | JOIN fts_{documents_schema}__queries_{random_hash}.dict ftsdict 18 | ON terms.termid = ftsdict.termid 19 | ), 20 | 21 | _documents_matchs AS ( 22 | SELECT 23 | iq.query, 24 | UNNEST( 25 | s.list_docids[:{top_k_token}] 26 | ) AS id, 27 | UNNEST( 28 | s.list_scores[:{top_k_token}] 29 | ) AS score 30 | FROM _input_queries iq 31 | INNER JOIN {documents_schema}.scores s 32 | ON iq.term = s.term 33 | ), 34 | 35 | _queries_matchs AS ( 36 | SELECT 37 | iq.query, 38 | UNNEST( 39 | s.list_docids[:{top_k_token}] 40 | ) AS id, 41 | UNNEST( 42 | s.list_scores[:{top_k_token}] 43 | ) AS score 44 | FROM _input_queries iq 45 | INNER JOIN {queries_schema}.scores s 46 | ON iq.term = s.term 47 | ), 48 | 49 | _documents_scores AS ( 50 | SELECT 51 | query, 52 | id, 53 | SUM(score) AS score 54 | FROM _documents_matchs 55 | GROUP BY 1, 2 56 | ), 57 | 58 | _queries_scores AS ( 59 | SELECT 60 | query, 61 | id, 62 | SUM(score) AS score 63 | FROM _queries_matchs 64 | GROUP BY 1, 2 65 | ), 66 | 67 | _documents_ranks AS ( 68 | SELECT 69 | query, 70 | id, 71 | score, 72 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number 73 | FROM _documents_scores 74 | ), 75 | 76 | _queries_ranks AS ( 77 | SELECT 78 | query, 79 | id, 80 | score, 81 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number 82 | FROM _queries_scores 83 | ), 84 | 85 | _bm25_documents AS ( 86 | SELECT 87 | ps.query AS _query, 88 | ddocs.name AS id, 89 | ps.score 90 | FROM _documents_ranks ps 91 | INNER JOIN {documents_schema}.docs AS ddocs 92 | ON ps.id = ddocs.docid 93 | WHERE ps._row_number <= {top_k} 94 | ), 95 | 96 | _bm25_queries AS ( 97 | SELECT 98 | ps.query AS _query, 99 | ddocs.name AS id, 100 | ps.score 101 | FROM _queries_ranks ps 102 | INNER JOIN {queries_schema}.docs AS ddocs 103 | ON ps.id = ddocs.docid 104 | WHERE ps._row_number <= {top_k} 105 | ), 106 | 107 | _graph AS ( 108 | SELECT 109 | bm25.id AS src_id, 110 | dqg.query_id AS dst_id, 111 | dqg.score AS edge, 112 | 'document' AS src_type, 113 | 'query' AS dst_type, 114 | bm25._query 115 | FROM _bm25_documents AS bm25 116 | INNER JOIN {source_schema}.documents_queries AS dqg 117 | ON bm25.id = dqg.document_id 118 | INNER JOIN _bm25_queries AS bm25q 119 | ON dqg.query_id = bm25q.id 120 | AND bm25._query = bm25q._query 121 | ), 122 | 123 | _graph_scores AS ( 124 | SELECT 125 | g.*, 126 | COALESCE(bm25.score, 0) AS src_score, 127 | 0 AS dst_score 128 | FROM _graph AS g 129 | LEFT JOIN _bm25_documents AS bm25 130 | ON g.src_id = bm25.id 131 | AND g._query = bm25._query 132 | WHERE src_type = 'document' 133 | UNION 134 | SELECT 135 | g.*, 136 | 0 AS src_score, 137 | COALESCE(bm25.score, 0) AS dst_score 138 | FROM _graph AS g 139 | LEFT JOIN _bm25_documents AS bm25 140 | ON g.dst_id = bm25.id 141 | AND g._query = bm25._query 142 | WHERE dst_type = 'document' 143 | UNION 144 | SELECT 145 | g.*, 146 | COALESCE(bm25.score, 0) AS src_score, 147 | 0 AS dst_score 148 | FROM _graph AS g 149 | LEFT JOIN _bm25_queries AS bm25 150 | ON g.src_id = bm25.id 151 | AND g._query = bm25._query 152 | WHERE src_type = 'query' 153 | UNION 154 | SELECT 155 | g.*, 156 | 0 AS src_score, 157 | COALESCE(bm25.score, 0) AS dst_score 158 | FROM _graph AS g 159 | LEFT JOIN _bm25_queries AS bm25 160 | ON g.dst_id = bm25.id 161 | AND g._query = bm25._query 162 | WHERE dst_type = 'query' 163 | ), 164 | 165 | graph_scores AS ( 166 | SELECT 167 | src_id, 168 | dst_id, 169 | _query, 170 | src_type, 171 | dst_type, 172 | MAX(src_score) AS src_score, 173 | MAX(dst_score) 
AS dst_score, 174 | MAX(edge) AS edge 175 | FROM _graph_scores 176 | GROUP BY 1, 2, 3, 4, 5 177 | ), 178 | 179 | _rank AS ( 180 | SELECT 181 | src_id AS id, 182 | _query, 183 | SUM(src_score + dst_score + edge) AS score 184 | FROM graph_scores 185 | WHERE src_type = 'document' 186 | GROUP BY 1, 2 187 | UNION ALL 188 | SELECT 189 | dst_id AS id, 190 | _query, 191 | SUM(dst_score + src_score + edge) AS score 192 | FROM graph_scores 193 | WHERE dst_type = 'document' 194 | GROUP BY 1, 2 195 | UNION ALL 196 | SELECT 197 | id, 198 | _query, 199 | score 200 | FROM _bm25_documents 201 | ), 202 | 203 | scores AS ( 204 | SELECT 205 | id, 206 | _query, 207 | MAX(score) AS score 208 | FROM _rank 209 | GROUP BY 1, 2 210 | ) 211 | 212 | SELECT 213 | docs.* EXCLUDE (bm25id), 214 | s.score, 215 | s._query 216 | FROM scores s 217 | JOIN {source_schema}.documents docs 218 | ON s.id = docs.id 219 | ORDER BY s.score DESC; 220 | -------------------------------------------------------------------------------- /ducksearch/search/select/search_graph_filters.sql: -------------------------------------------------------------------------------- 1 | WITH group_queries AS ( 2 | SELECT 3 | query 4 | FROM {documents_schema}._queries_{random_hash} 5 | WHERE group_id = {group_id} 6 | ), 7 | 8 | _input_queries AS ( 9 | SELECT 10 | pf.query, 11 | ftsdict.term 12 | FROM group_queries pf 13 | JOIN fts_{documents_schema}__queries_{random_hash}.docs docs 14 | ON pf.query = docs.name 15 | JOIN fts_{documents_schema}__queries_{random_hash}.terms terms 16 | ON docs.docid = terms.docid 17 | JOIN fts_{documents_schema}__queries_{random_hash}.dict ftsdict 18 | ON terms.termid = ftsdict.termid 19 | ), 20 | 21 | _documents_matchs AS ( 22 | SELECT 23 | iq.query, 24 | UNNEST( 25 | s.list_docids[:{top_k_token}] 26 | ) AS id, 27 | UNNEST( 28 | s.list_scores[:{top_k_token}] 29 | ) AS score 30 | FROM _input_queries iq 31 | INNER JOIN {documents_schema}.scores s 32 | ON iq.term = s.term 33 | ), 34 | 35 | _queries_matchs AS ( 36 | SELECT 37 | iq.query, 38 | UNNEST( 39 | s.list_docids[:{top_k_token}] 40 | ) AS id, 41 | UNNEST( 42 | s.list_scores[:{top_k_token}] 43 | ) AS score 44 | FROM _input_queries iq 45 | INNER JOIN {queries_schema}.scores s 46 | ON iq.term = s.term 47 | ), 48 | 49 | _documents_scores AS ( 50 | SELECT 51 | query AS _query, 52 | id AS _id, 53 | SUM(score) AS _score 54 | FROM _documents_matchs 55 | GROUP BY 1, 2 56 | ), 57 | 58 | _documents_scores_filter AS ( 59 | SELECT 60 | ds._query AS query, 61 | ds._id AS id, 62 | ds._score AS score 63 | FROM _documents_scores ds 64 | INNER JOIN {source_schema}.documents d 65 | ON ds._id = d.bm25id 66 | WHERE {filters} 67 | ), 68 | 69 | _queries_scores AS ( 70 | SELECT 71 | query, 72 | id, 73 | SUM(score) AS score 74 | FROM _queries_matchs 75 | GROUP BY 1, 2 76 | ), 77 | 78 | _documents_ranks AS ( 79 | SELECT 80 | query, 81 | id, 82 | score, 83 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number 84 | FROM _documents_scores_filter 85 | ), 86 | 87 | _queries_ranks AS ( 88 | SELECT 89 | query, 90 | id, 91 | score, 92 | ROW_NUMBER() OVER (PARTITION BY query ORDER BY score DESC) AS _row_number 93 | FROM _queries_scores 94 | ), 95 | 96 | _bm25_documents AS ( 97 | SELECT 98 | ps.query AS _query, 99 | ddocs.name AS id, 100 | ps.score 101 | FROM _documents_ranks ps 102 | INNER JOIN {documents_schema}.docs AS ddocs 103 | ON ps.id = ddocs.docid 104 | WHERE ps._row_number <= {top_k} 105 | ), 106 | 107 | _bm25_queries AS ( 108 | SELECT 109 | ps.query AS _query, 110 | 
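        -- Editor's note: like _bm25_documents above, this CTE keeps only the
        -- top_k query-index matches per input query before the graph join,
        -- which bounds the size of the _graph CTE built further down.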
ddocs.name AS id, 111 | ps.score 112 | FROM _queries_ranks ps 113 | INNER JOIN {queries_schema}.docs AS ddocs 114 | ON ps.id = ddocs.docid 115 | WHERE ps._row_number <= {top_k} 116 | ), 117 | 118 | _graph AS ( 119 | SELECT 120 | bm25.id AS src_id, 121 | dqg.query_id AS dst_id, 122 | dqg.score AS edge, 123 | 'document' AS src_type, 124 | 'query' AS dst_type, 125 | bm25._query 126 | FROM _bm25_documents AS bm25 127 | INNER JOIN {source_schema}.documents_queries AS dqg 128 | ON bm25.id = dqg.document_id 129 | INNER JOIN _bm25_queries AS bm25q 130 | ON dqg.query_id = bm25q.id 131 | AND bm25._query = bm25q._query 132 | ), 133 | 134 | _graph_scores AS ( 135 | SELECT 136 | g.*, 137 | COALESCE(bm25.score, 0) AS src_score, 138 | 0 AS dst_score 139 | FROM _graph AS g 140 | LEFT JOIN _bm25_documents AS bm25 141 | ON g.src_id = bm25.id 142 | AND g._query = bm25._query 143 | WHERE src_type = 'document' 144 | UNION 145 | SELECT 146 | g.*, 147 | 0 AS src_score, 148 | COALESCE(bm25.score, 0) AS dst_score 149 | FROM _graph AS g 150 | LEFT JOIN _bm25_documents AS bm25 151 | ON g.dst_id = bm25.id 152 | AND g._query = bm25._query 153 | WHERE dst_type = 'document' 154 | UNION 155 | SELECT 156 | g.*, 157 | COALESCE(bm25.score, 0) AS src_score, 158 | 0 AS dst_score 159 | FROM _graph AS g 160 | LEFT JOIN _bm25_queries AS bm25 161 | ON g.src_id = bm25.id 162 | AND g._query = bm25._query 163 | WHERE src_type = 'query' 164 | UNION 165 | SELECT 166 | g.*, 167 | 0 AS src_score, 168 | COALESCE(bm25.score, 0) AS dst_score 169 | FROM _graph AS g 170 | LEFT JOIN _bm25_queries AS bm25 171 | ON g.dst_id = bm25.id 172 | AND g._query = bm25._query 173 | WHERE dst_type = 'query' 174 | ), 175 | 176 | graph_scores AS ( 177 | SELECT 178 | src_id, 179 | dst_id, 180 | _query, 181 | src_type, 182 | dst_type, 183 | MAX(src_score) AS src_score, 184 | MAX(dst_score) AS dst_score, 185 | MAX(edge) AS edge 186 | FROM _graph_scores 187 | GROUP BY 1, 2, 3, 4, 5 188 | ), 189 | 190 | _rank AS ( 191 | SELECT 192 | src_id AS id, 193 | _query, 194 | SUM(src_score + dst_score + edge) AS score 195 | FROM graph_scores 196 | WHERE src_type = 'document' 197 | GROUP BY 1, 2 198 | UNION ALL 199 | SELECT 200 | dst_id AS id, 201 | _query, 202 | SUM(dst_score + src_score + edge) AS score 203 | FROM graph_scores 204 | WHERE dst_type = 'document' 205 | GROUP BY 1, 2 206 | UNION ALL 207 | SELECT 208 | id, 209 | _query, 210 | score 211 | FROM _bm25_documents 212 | ), 213 | 214 | scores AS ( 215 | SELECT 216 | id, 217 | _query, 218 | MAX(score) AS score 219 | FROM _rank 220 | GROUP BY 1, 2 221 | ) 222 | 223 | SELECT 224 | docs.*, 225 | s.score, 226 | s._query 227 | FROM scores s 228 | JOIN {source_schema}.documents docs 229 | ON s.id = docs.id 230 | ORDER BY s.score DESC; 231 | -------------------------------------------------------------------------------- /ducksearch/search/select/search_order_by.sql: -------------------------------------------------------------------------------- 1 | WITH group_queries AS ( 2 | SELECT 3 | query 4 | FROM {schema}._queries_{random_hash} 5 | WHERE group_id = {group_id} 6 | ), 7 | 8 | _input_queries AS ( 9 | SELECT 10 | pf.query, 11 | ftsdict.term 12 | FROM group_queries pf 13 | JOIN fts_{schema}__queries_{random_hash}.docs docs 14 | ON pf.query = docs.name 15 | JOIN fts_{schema}__queries_{random_hash}.terms terms 16 | ON docs.docid = terms.docid 17 | JOIN fts_{schema}__queries_{random_hash}.dict ftsdict 18 | ON terms.termid = ftsdict.termid 19 | ), 20 | 21 | _nested_matchs AS ( 22 | SELECT 23 | iq.query, 24 | 
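        -- Editor's note: only the first top_k_token (docid, score) pairs of
        -- each term's posting list are kept. The lists are stored sorted by
        -- score descending (see search/update/scores.sql), so this truncation
        -- trades a little recall for far less unnesting work.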
s.list_docids[0:{top_k_token}] as list_docids, 25 | s.list_scores[0:{top_k_token}] as list_scores 26 | FROM {schema}.scores s 27 | INNER JOIN _input_queries iq 28 | ON s.term = iq.term 29 | ), 30 | 31 | _matchs AS ( 32 | SELECT 33 | query, 34 | UNNEST( 35 | s.list_docids 36 | ) AS bm25id, 37 | UNNEST( 38 | s.list_scores 39 | ) AS score 40 | FROM _nested_matchs s 41 | ), 42 | 43 | _matchs_scores AS ( 44 | SELECT 45 | query, 46 | bm25id, 47 | SUM(score) AS score 48 | FROM _matchs 49 | GROUP BY 1, 2 50 | ), 51 | 52 | _match_scores_documents AS ( 53 | SELECT 54 | ms.query AS _query, 55 | ms.bm25id, 56 | ms.score, 57 | s.* 58 | FROM _matchs_scores ms 59 | INNER JOIN {source_schema}.{source} s 60 | ON ms.bm25id = s.bm25id 61 | ), 62 | 63 | _partition_scores AS ( 64 | SELECT 65 | *, 66 | RANK() OVER (PARTITION BY _query {order_by}, RANDOM() ASC) AS rank 67 | FROM _match_scores_documents 68 | QUALIFY rank <= {top_k} 69 | ) 70 | 71 | SELECT 72 | * 73 | FROM _partition_scores 74 | {order_by}; 75 | -------------------------------------------------------------------------------- /ducksearch/search/select/settings.sql: -------------------------------------------------------------------------------- 1 | SELECT * FROM {schema}.settings; -------------------------------------------------------------------------------- /ducksearch/search/select/settings_exists.sql: -------------------------------------------------------------------------------- 1 | SELECT coalesce(EXISTS ( 2 | SELECT 1 3 | FROM information_schema.tables 4 | WHERE 5 | table_name = 'settings' 6 | AND table_schema = '{schema}' 7 | ), FALSE) AS table_exists; 8 | -------------------------------------------------------------------------------- /ducksearch/search/select/stats.sql: -------------------------------------------------------------------------------- 1 | SELECT num_docs, avgdl FROM {schema}.stats; -------------------------------------------------------------------------------- /ducksearch/search/select/termids_to_score.sql: -------------------------------------------------------------------------------- 1 | WITH _terms_to_score AS ( 2 | SELECT 3 | term 4 | FROM fts_{schema}__documents.dict 5 | 6 | ) 7 | 8 | SELECT DISTINCT 9 | d.termid 10 | FROM _terms_to_score t 11 | JOIN {schema}.dict d 12 | ON t.term = d.term; -------------------------------------------------------------------------------- /ducksearch/search/update/bm25id.sql: -------------------------------------------------------------------------------- 1 | UPDATE {source_schema}.{source} source 2 | SET bm25id = {schema}.docs.docid 3 | FROM {schema}.docs 4 | WHERE source.id = {schema}.docs.name; 5 | -------------------------------------------------------------------------------- /ducksearch/search/update/dict.sql: -------------------------------------------------------------------------------- 1 | WITH new_terms AS ( 2 | SELECT 3 | fts.df, 4 | fts.term, 5 | d.termid AS existing_id 6 | FROM fts_{schema}__documents.dict fts 7 | LEFT JOIN {schema}.dict d 8 | ON fts.term = d.term 9 | ) 10 | 11 | UPDATE {schema}.dict d 12 | SET df = d.df + nt.df 13 | FROM new_terms nt 14 | WHERE d.termid = nt.existing_id; 15 | -------------------------------------------------------------------------------- /ducksearch/search/update/scores.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.scores (term, list_docids, list_scores) 2 | 3 | WITH _terms AS ( 4 | SELECT termid FROM parquet_scan('{parquet_file}') 5 | ), 6 | 7 | _unfiltered_terms_df AS 
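-- Editor's note: the _scores CTE below appears to implement a BM25 variant;
-- for a term with document frequency df in a document of length dl it
-- computes tf * ln((num_docs - df + 0.5) / (df + 0.5) + 1)
-- * 1 / (tf + k1 * (1 - b + b * dl / avgdl)), i.e. classic BM25 with the
-- constant (k1 + 1) numerator factor dropped, a rank-preserving scaling.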
( 8 | SELECT 9 | d.termid, 10 | d.term, 11 | d.df, 12 | sw.sw IS NOT NULL AS is_stopword 13 | FROM {schema}.dict d 14 | INNER JOIN _terms t 15 | ON d.termid = t.termid 16 | LEFT JOIN {schema}.stopwords sw 17 | ON d.term = sw.sw 18 | ), 19 | 20 | _terms_df AS ( 21 | SELECT 22 | termid, 23 | term, 24 | df 25 | FROM _unfiltered_terms_df 26 | WHERE is_stopword = FALSE 27 | ), 28 | 29 | _documents_lengths AS ( 30 | SELECT 31 | docid, 32 | len 33 | FROM {schema}.docs 34 | ), 35 | 36 | _documents_terms_df AS ( 37 | SELECT 38 | s.docid, 39 | s.termid, 40 | s.tf 41 | FROM {schema}.terms s 42 | INNER JOIN _terms t 43 | ON s.termid = t.termid 44 | ), 45 | 46 | _scores AS ( 47 | SELECT 48 | tf.docid, 49 | tf.termid, 50 | tf.tf * LOG( 51 | ( 52 | ({num_docs} - tdf.df + 0.5) / 53 | (tdf.df + 0.5) 54 | ) + 1 55 | ) * 56 | (1.0 / (tf.tf + {k1} * (1 - {b} + {b} * (dl.len / {avgdl})))) AS score 57 | FROM 58 | _documents_terms_df tf 59 | JOIN 60 | _documents_lengths dl ON dl.docid = tf.docid 61 | JOIN 62 | _terms_df tdf ON tdf.termid = tf.termid 63 | ), 64 | 65 | _list_scores AS ( 66 | SELECT 67 | s.termid, 68 | LIST(d.docid ORDER BY s.score DESC, RANDOM() ASC) AS list_docids, 69 | LIST(s.score ORDER BY s.score DESC, RANDOM() ASC) AS list_scores 70 | FROM _scores s 71 | INNER JOIN 72 | {schema}.docs d 73 | ON s.docid = d.docid 74 | GROUP BY 75 | s.termid 76 | ) 77 | 78 | SELECT 79 | d.term, 80 | ls.list_docids, 81 | ls.list_scores 82 | FROM _list_scores ls 83 | JOIN _terms_df d 84 | ON ls.termid = d.termid; 85 | -------------------------------------------------------------------------------- /ducksearch/search/update/stats.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TABLE {schema}.stats AS ( 2 | SELECT 3 | COUNT(*) AS num_docs, 4 | AVG(len) AS avgdl 5 | FROM {schema}.docs 6 | ); -------------------------------------------------------------------------------- /ducksearch/tables/__init__.py: -------------------------------------------------------------------------------- 1 | from .create import ( 2 | create_documents, 3 | create_documents_queries, 4 | create_queries, 5 | create_schema, 6 | ) 7 | from .insert import ( 8 | insert_documents, 9 | insert_documents_queries, 10 | insert_queries, 11 | ) 12 | from .select import ( 13 | select_documents, 14 | select_documents_columns, 15 | select_queries, 16 | ) 17 | from .update import add_columns_documents 18 | 19 | __all__ = [ 20 | "create_documents", 21 | "create_queries", 22 | "create_documents_queries", 23 | "create_schema", 24 | "insert_documents", 25 | "insert_queries", 26 | "insert_documents_queries", 27 | "select_documents", 28 | "select_documents_columns", 29 | "select_queries", 30 | "add_columns_documents", 31 | ] 32 | -------------------------------------------------------------------------------- /ducksearch/tables/create.py: -------------------------------------------------------------------------------- 1 | from ..decorators import execute_with_duckdb 2 | 3 | 4 | @execute_with_duckdb( 5 | relative_path="tables/create/documents.sql", 6 | ) 7 | def _create_documents() -> None: 8 | """Create the documents table in the DuckDB database. 9 | 10 | Parameters 11 | ---------- 12 | database: str 13 | The name of the DuckDB database. 14 | config: dict, optional 15 | The configuration options for the DuckDB connection. 16 | """ 17 | 18 | 19 | @execute_with_duckdb( 20 | relative_path="tables/create/schema.sql", 21 | ) 22 | def _create_schema() -> None: 23 | """Create a schema in the DuckDB database. 
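
    A DuckDB schema is a plain namespace; ducksearch uses separate schemas
    (for instance bm25_tables for raw tables and bm25_documents /
    bm25_queries for index structures) to keep user data apart from the
    derived BM25 tables.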
24 | 25 | Parameters 26 | ---------- 27 | database: str 28 | The name of the DuckDB database. 29 | schema: str 30 | The schema to be created in the database. 31 | config: dict, optional 32 | The configuration options for the DuckDB connection. 33 | """ 34 | 35 | 36 | def create_schema( 37 | database: str, 38 | schema: str, 39 | config: dict | None = None, 40 | ) -> None: 41 | """Create the specified schema in the DuckDB database. 42 | 43 | Parameters 44 | ---------- 45 | database: str 46 | The name of the DuckDB database. 47 | schema: str 48 | The schema to create within the DuckDB database. 49 | config: dict, optional 50 | The configuration options for the DuckDB connection. 51 | 52 | Examples 53 | -------- 54 | >>> from ducksearch import tables 55 | 56 | >>> tables.create_schema( 57 | ... database="test.duckdb", 58 | ... schema="bm25_tables", 59 | ... ) 60 | """ 61 | return _create_schema(database=database, schema=schema, config=config) 62 | 63 | 64 | def create_documents( 65 | database: str, 66 | schema: str, 67 | columns: str | list[str], 68 | dtypes: dict[str, str] | None = None, 69 | config: dict | None = None, 70 | ) -> None: 71 | """Create the documents table in the DuckDB database. 72 | 73 | Parameters 74 | ---------- 75 | database: str 76 | The name of the DuckDB database. 77 | schema: str 78 | The schema in which to create the documents table. 79 | columns: str or list[str] 80 | The list of columns for the documents table. If a string is provided, it will be converted into a list. 81 | dtypes: dict[str, str], optional 82 | A dictionary specifying field names as keys and their DuckDB types as values. Defaults to 'VARCHAR' if not provided. 83 | config: dict, optional 84 | The configuration options for the DuckDB connection. 85 | 86 | Examples 87 | -------- 88 | >>> from ducksearch import tables 89 | 90 | >>> tables.create_schema( 91 | ... database="test.duckdb", 92 | ... schema="bm25_tables" 93 | ... ) 94 | 95 | >>> tables.create_documents( 96 | ... database="test.duckdb", 97 | ... schema="bm25_tables", 98 | ... columns=["title", "text"], 99 | ... dtypes={"text": "VARCHAR", "title": "VARCHAR"}, 100 | ... ) 101 | 102 | >>> df = [ 103 | ... {"id": 1, "title": "title document 1", "text": "text document 1"}, 104 | ... {"id": 2, "title": "title document 2", "text": "text document 2"}, 105 | ... {"id": 3, "title": "title document 3", "text": "text document 3"}, 106 | ... ] 107 | 108 | >>> tables.insert_documents( 109 | ... database="test.duckdb", 110 | ... schema="bm25_tables", 111 | ... key="id", 112 | ... df=df, 113 | ... columns=["title", "text"], 114 | ... ) 115 | """ 116 | if not dtypes: 117 | dtypes = {} 118 | 119 | return _create_documents( 120 | database=database, 121 | schema=schema, 122 | fields=", ".join( 123 | [f"{field} {dtypes.get(field, 'VARCHAR')}" for field in columns] 124 | ), 125 | config=config, 126 | ) 127 | 128 | 129 | @execute_with_duckdb( 130 | relative_path="tables/create/queries.sql", 131 | ) 132 | def create_queries() -> None: 133 | """Create the queries table in the DuckDB database. 134 | 135 | Parameters 136 | ---------- 137 | database: str 138 | The name of the DuckDB database. 139 | config: dict, optional 140 | The configuration options for the DuckDB connection. 141 | 142 | Examples 143 | -------- 144 | >>> from ducksearch import tables 145 | 146 | >>> tables.create_schema( 147 | ... database="test.duckdb", 148 | ... schema="bm25_tables" 149 | ... ) 150 | 151 | >>> tables.create_queries( 152 | ... database="test.duckdb", 153 | ... 
schema="bm25_tables", 154 | ... ) 155 | """ 156 | 157 | 158 | @execute_with_duckdb( 159 | relative_path=[ 160 | "tables/create/queries.sql", 161 | "tables/create/documents_queries.sql", 162 | ] 163 | ) 164 | def create_documents_queries() -> None: 165 | """Create the documents_queries table in the DuckDB database. 166 | 167 | Parameters 168 | ---------- 169 | database: str 170 | The name of the DuckDB database. 171 | config: dict, optional 172 | The configuration options for the DuckDB connection. 173 | 174 | Examples 175 | -------- 176 | >>> from ducksearch import tables 177 | 178 | >>> tables.create_schema( 179 | ... database="test.duckdb", 180 | ... schema="bm25_tables" 181 | ... ) 182 | 183 | >>> tables.create_documents_queries( 184 | ... database="test.duckdb", 185 | ... schema="bm25_tables", 186 | ... ) 187 | """ 188 | -------------------------------------------------------------------------------- /ducksearch/tables/create/documents.sql: -------------------------------------------------------------------------------- 1 | CREATE SEQUENCE IF NOT EXISTS _seq_documents_id START 1; 2 | 3 | CREATE TABLE IF NOT EXISTS {schema}.documents ( 4 | id VARCHAR PRIMARY KEY DEFAULT (nextval('_seq_documents_id')), 5 | {fields}, 6 | bm25id INT DEFAULT NULL 7 | ); 8 | -------------------------------------------------------------------------------- /ducksearch/tables/create/documents_queries.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {schema}.documents_queries ( 2 | document_id VARCHAR, 3 | query_id VARCHAR, 4 | score FLOAT DEFAULT NULL, 5 | FOREIGN KEY (document_id) REFERENCES {schema}.documents (id), 6 | FOREIGN KEY (query_id) REFERENCES {schema}.queries (id) 7 | ); 8 | -------------------------------------------------------------------------------- /ducksearch/tables/create/queries.sql: -------------------------------------------------------------------------------- 1 | CREATE SEQUENCE IF NOT EXISTS {schema}_SEQ_QUERIES_ID START 1; 2 | 3 | CREATE TABLE IF NOT EXISTS {schema}.queries ( 4 | id VARCHAR PRIMARY KEY DEFAULT NEXTVAL('{schema}_SEQ_QUERIES_ID'), 5 | query TEXT NOT NULL, 6 | bm25id INT DEFAULT NULL 7 | ); 8 | -------------------------------------------------------------------------------- /ducksearch/tables/create/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE SCHEMA IF NOT EXISTS {schema}; -------------------------------------------------------------------------------- /ducksearch/tables/insert.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | import shutil 4 | 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | from joblib import Parallel, delayed 8 | 9 | from ..decorators import execute_with_duckdb 10 | from ..utils import batchify 11 | from .create import ( 12 | create_documents, 13 | create_documents_queries, 14 | create_queries, 15 | ) 16 | 17 | 18 | @execute_with_duckdb( 19 | relative_path="tables/insert/documents.sql", 20 | ) 21 | def _insert_documents() -> None: 22 | """Insert documents into the documents table. 23 | 24 | Parameters 25 | ---------- 26 | database: str 27 | The name of the DuckDB database. 28 | config: dict, optional 29 | The configuration options for the DuckDB connection. 
30 | """ 31 | 32 | 33 | @execute_with_duckdb( 34 | relative_path="tables/insert/fast_documents.sql", 35 | ) 36 | def _insert_documents_fast() -> None: 37 | """Insert documents into the documents table without any duplicate checks. 38 | 39 | Parameters 40 | ---------- 41 | database: str 42 | The name of the DuckDB database. 43 | config: dict, optional 44 | The configuration options for the DuckDB connection. 45 | """ 46 | 47 | 48 | def write_parquet( 49 | database: str, 50 | documents: list[dict], 51 | index: int, 52 | fields: list[str], 53 | key: str, 54 | ) -> None: 55 | """Write a parquet file with document data for upload. 56 | 57 | Parameters 58 | ---------- 59 | documents 60 | A list of dictionaries representing the documents to be written to the parquet file. 61 | index 62 | The index of the current batch being processed. 63 | fields 64 | The list of document fields to be written to the parquet file. 65 | key 66 | The key field to uniquely identify each document. 67 | 68 | Notes 69 | ----- 70 | This function writes documents to a temporary parquet file in preparation for bulk uploading into the database. 71 | """ 72 | documents_table = collections.defaultdict(list) 73 | 74 | fields = set() 75 | for document in documents: 76 | for field in document.keys(): 77 | if field != "id": 78 | fields.add(field) 79 | 80 | for document in documents: 81 | documents_table["id"].append(document[key]) 82 | for field in fields: 83 | documents_table[field].append(document.get(field, None)) 84 | 85 | documents_path = os.path.join( 86 | ".", f"{database}_tmp", "documents", f"{index}.parquet" 87 | ) 88 | documents_table = pa.Table.from_pydict(documents_table) 89 | 90 | pq.write_table( 91 | documents_table, 92 | documents_path, 93 | compression="snappy", 94 | ) 95 | 96 | 97 | def insert_documents( 98 | database: str, 99 | schema: str, 100 | df: list[dict] | str, 101 | key: str, 102 | columns: list[str] | str, 103 | dtypes: dict[str, str] | None = None, 104 | batch_size: int = 30_000, 105 | n_jobs: int = -1, 106 | config: dict | None = None, 107 | limit: int | None = None, 108 | fast: bool = False, 109 | ) -> None: 110 | """Insert documents into the documents table with optional multi-threading. 111 | 112 | Parameters 113 | ---------- 114 | database 115 | The name of the DuckDB database. 116 | schema 117 | The schema in which the documents table is located. 118 | df 119 | The list of document dictionaries or a string (URL) for a Hugging Face dataset to insert. 120 | key 121 | The field that uniquely identifies each document (e.g., 'id'). 122 | columns 123 | The list of document fields to insert. Can be a string if inserting a single field. 124 | dtypes 125 | Optional dictionary specifying the DuckDB type for each field. Defaults to 'VARCHAR' for all unspecified fields. 126 | batch_size 127 | The number of documents to insert in each batch. 128 | n_jobs 129 | Number of parallel jobs to use for inserting documents. Default use all available processors. 130 | config 131 | Optional configuration options for the DuckDB connection. 132 | 133 | Examples 134 | -------- 135 | >>> from ducksearch import tables 136 | 137 | >>> df = [ 138 | ... {"id": 1, "title": "title document 1", "text": "text document 1"}, 139 | ... {"id": 2, "title": "title document 2", "text": "text document 2"}, 140 | ... {"id": 3, "title": "title document 3", "text": "text document 3"}, 141 | ... ] 142 | 143 | >>> _ = tables.insert_documents( 144 | ... database="test.duckdb", 145 | ... schema="bm25_tables", 146 | ... key="id", 147 | ... 
columns=["title", "text"], 148 | ... df=df 149 | ... ) 150 | 151 | """ 152 | columns = [column for column in columns if column != "id"] 153 | 154 | create_documents( 155 | database=database, 156 | schema=schema, 157 | columns=columns, 158 | config=config, 159 | dtypes=dtypes, 160 | ) 161 | 162 | documents_path = os.path.join(".", f"{database}_tmp", "documents") 163 | 164 | if os.path.exists(path=documents_path): 165 | shutil.rmtree(documents_path) 166 | 167 | os.makedirs(name=os.path.join(".", f"{database}_tmp"), exist_ok=True) 168 | os.makedirs(name=documents_path, exist_ok=True) 169 | 170 | Parallel(n_jobs=n_jobs, backend="threading")( 171 | delayed(function=write_parquet)( 172 | database, 173 | batch, 174 | index, 175 | columns, 176 | key, 177 | ) 178 | for index, batch in enumerate( 179 | iterable=batchify(X=df, batch_size=batch_size, tqdm_bar=False) 180 | ) 181 | ) 182 | 183 | if fast: 184 | _insert_documents_fast( 185 | database=database, 186 | schema=schema, 187 | parquet_files=os.path.join(documents_path, "*.parquet"), 188 | config=config, 189 | key_field=f"df.{key}", 190 | fields=", ".join(columns), 191 | df_fields=", ".join([f"df.{field}" for field in columns]), 192 | src_fields=", ".join([f"src.{field}" for field in columns]), 193 | ) 194 | else: 195 | _insert_documents( 196 | database=database, 197 | schema=schema, 198 | parquet_files=os.path.join(documents_path, "*.parquet"), 199 | config=config, 200 | key_field=f"df.{key}", 201 | fields=", ".join(columns), 202 | df_fields=", ".join([f"df.{field}" for field in columns]), 203 | src_fields=", ".join([f"src.{field}" for field in columns]), 204 | ) 205 | 206 | if os.path.exists(path=documents_path): 207 | shutil.rmtree(documents_path) 208 | 209 | if os.path.exists(path=os.path.join(".", f"{database}_tmp")): 210 | shutil.rmtree(os.path.join(".", f"{database}_tmp")) 211 | 212 | 213 | @execute_with_duckdb( 214 | relative_path="tables/insert/queries.sql", 215 | ) 216 | def _insert_queries() -> None: 217 | """Insert queries into the queries table. 218 | 219 | Parameters 220 | ---------- 221 | database: str 222 | The name of the DuckDB database. 223 | config: dict, optional 224 | The configuration options for the DuckDB connection. 225 | """ 226 | 227 | 228 | def insert_queries( 229 | database: str, 230 | schema: str, 231 | queries: list[str], 232 | config: dict | None = None, 233 | ) -> None: 234 | """Insert a list of queries into the queries table. 235 | 236 | Parameters 237 | ---------- 238 | database 239 | The name of the DuckDB database. 240 | schema 241 | The schema in which the queries table is located. 242 | queries 243 | A list of query strings to insert into the table. 244 | config 245 | Optional configuration options for the DuckDB connection. 246 | 247 | Examples 248 | -------- 249 | >>> from ducksearch import tables 250 | 251 | >>> _ = tables.insert_queries( 252 | ... database="test.duckdb", 253 | ... schema="bm25_tables", 254 | ... queries=["query 1", "query 2", "query 3"], 255 | ... 
) 256 | """ 257 | create_queries(database=database, schema=schema, config=config) 258 | 259 | table = pa.Table.from_pydict({"query": queries}) 260 | 261 | pq.write_table( 262 | table, 263 | "_queries.parquet", 264 | compression="snappy", 265 | ) 266 | 267 | _insert_queries( 268 | database=database, 269 | schema=schema, 270 | parquet_file="_queries.parquet", 271 | config=config, 272 | ) 273 | 274 | if os.path.exists("_queries.parquet"): 275 | os.remove("_queries.parquet") 276 | 277 | 278 | @execute_with_duckdb( 279 | relative_path="tables/insert/documents_queries.sql", 280 | ) 281 | def _insert_documents_queries() -> None: 282 | """Insert query-document interactions into the documents_queries table. 283 | 284 | Parameters 285 | ---------- 286 | database: str 287 | The name of the DuckDB database. 288 | config: dict, optional 289 | The configuration options for the DuckDB connection. 290 | """ 291 | 292 | 293 | def insert_documents_queries( 294 | database: str, 295 | schema: str, 296 | documents_queries: dict[dict[str, float]], 297 | config: dict | None = None, 298 | ) -> None: 299 | """Insert interactions between documents and queries into the documents_queries table. 300 | 301 | Parameters 302 | ---------- 303 | database 304 | The name of the DuckDB database. 305 | schema 306 | The schema in which the documents_queries table is located. 307 | documents_queries 308 | A dictionary mapping document IDs to queries and their corresponding scores. 309 | config 310 | Optional configuration options for the DuckDB connection. 311 | 312 | Examples 313 | -------- 314 | >>> from ducksearch import tables 315 | 316 | >>> documents_queries = { 317 | ... "1": {"query 1": 0.9, "query 2": 0.8}, 318 | ... "2": {"query 2": 0.9, "query 3": 3}, 319 | ... "3": {"query 1": 0.9, "query 3": 0.5}, 320 | ... } 321 | 322 | >>> tables.insert_documents_queries( 323 | ... database="test.duckdb", 324 | ... schema="bm25_tables", 325 | ... documents_queries=documents_queries 326 | ... 
) 327 | 328 | """ 329 | create_queries(database=database, schema=schema, config=config) 330 | 331 | queries = set() 332 | for _, document_queries in documents_queries.items(): 333 | for query in document_queries: 334 | queries.add(query) 335 | 336 | insert_queries( 337 | database=database, schema=schema, queries=list(queries), config=config 338 | ) 339 | create_documents_queries(database=database, schema=schema, config=config) 340 | 341 | document_ids, queries, scores = [], [], [] 342 | for document_id, document_queries in documents_queries.items(): 343 | if isinstance(document_queries, list): 344 | document_queries = {query: 1.0 for query in document_queries} 345 | 346 | for query, score in document_queries.items(): 347 | document_ids.append(str(document_id)) 348 | queries.append(query) 349 | scores.append(score) 350 | 351 | table = pa.Table.from_pydict( 352 | { 353 | "document_id": document_ids, 354 | "query": queries, 355 | "score": scores, 356 | } 357 | ) 358 | 359 | pq.write_table( 360 | table, 361 | "_documents_queries.parquet", 362 | compression="snappy", 363 | ) 364 | 365 | _insert_documents_queries( 366 | database=database, 367 | schema=schema, 368 | parquet_file="_documents_queries.parquet", 369 | config=config, 370 | ) 371 | 372 | if os.path.exists("_documents_queries.parquet"): 373 | os.remove("_documents_queries.parquet") 374 | -------------------------------------------------------------------------------- /ducksearch/tables/insert/documents.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.documents (id, {fields}) 2 | 3 | WITH _distinct_documents AS ( 4 | SELECT DISTINCT 5 | {key_field} AS id, 6 | {df_fields}, 7 | ROW_NUMBER() OVER (PARTITION BY id ORDER BY id, RANDOM() ASC) AS _row_number 8 | FROM read_parquet('{parquet_files}') df 9 | ), 10 | 11 | _new_distinct_documents AS ( 12 | SELECT DISTINCT 13 | dd.*, 14 | d.id AS existing_id 15 | FROM _distinct_documents dd 16 | LEFT JOIN {schema}.documents AS d 17 | ON dd.id = d.id 18 | WHERE _row_number = 1 19 | ) 20 | 21 | SELECT 22 | id, 23 | {fields} 24 | FROM _new_distinct_documents 25 | WHERE existing_id IS NULL; 26 | -------------------------------------------------------------------------------- /ducksearch/tables/insert/documents_queries.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.documents_queries (document_id, query_id, score) 2 | 3 | WITH _documents_queries_scores AS ( 4 | SELECT 5 | document_id, 6 | query, 7 | MAX(score) AS score 8 | FROM parquet_scan('{parquet_file}') 9 | GROUP BY 1, 2 10 | ), 11 | 12 | _distinct_documents_queries AS ( 13 | SELECT 14 | dqw.document_id, 15 | q.id AS query_id, 16 | dqw.score, 17 | dq.document_id AS existing_id 18 | FROM _documents_queries_scores AS dqw 19 | INNER JOIN {schema}.queries AS q 20 | ON dqw.query = q.query 21 | INNER JOIN {schema}.documents AS d 22 | ON dqw.document_id = d.id 23 | LEFT JOIN {schema}.documents_queries AS dq 24 | ON q.id = dq.query_id 25 | AND dqw.document_id = dq.document_id 26 | ) 27 | 28 | SELECT DISTINCT 29 | document_id, 30 | query_id, 31 | score 32 | FROM _distinct_documents_queries 33 | WHERE existing_id IS NULL; 34 | -------------------------------------------------------------------------------- /ducksearch/tables/insert/fast_documents.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.documents ({fields}) 2 | 3 | WITH _distinct_documents AS ( 4 | SELECT DISTINCT 5 | 
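/* Fast path: unlike documents.sql above, there is no anti-join against existing rows and no ROW_NUMBER de-duplication by id, so the caller must guarantee fresh, unique ids. */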
{df_fields} 6 | FROM read_parquet('{parquet_files}') df 7 | ) 8 | 9 | SELECT 10 | * 11 | FROM _distinct_documents; 12 | -------------------------------------------------------------------------------- /ducksearch/tables/insert/queries.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {schema}.queries (query) 2 | 3 | WITH _distinct_queries AS ( 4 | SELECT DISTINCT 5 | df.query, 6 | q.id AS existing_id 7 | FROM parquet_scan('{parquet_file}') AS df 8 | LEFT JOIN {schema}.queries AS q 9 | ON df.query = q.query 10 | ) 11 | 12 | SELECT DISTINCT query 13 | FROM _distinct_queries 14 | WHERE existing_id IS NULL; 15 | -------------------------------------------------------------------------------- /ducksearch/tables/select.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ..decorators import execute_with_duckdb 4 | 5 | 6 | @execute_with_duckdb( 7 | relative_path="tables/select/documents.sql", 8 | read_only=True, 9 | fetch_df=True, 10 | ) 11 | def _select_documents() -> list[dict]: 12 | """Select all documents from the documents table. 13 | 14 | Returns 15 | ------- 16 | list[dict] 17 | A list of dictionaries representing the documents. 18 | 19 | Examples 20 | -------- 21 | >>> from ducksearch import tables 22 | 23 | >>> documents = tables.select_documents( 24 | ... database="test.duckdb", 25 | ... schema="bm25_tables", 26 | ... ) 27 | 28 | >>> assert len(documents) == 3 29 | """ 30 | 31 | 32 | def select_documents( 33 | database: str, 34 | schema: str, 35 | limit: int | None = None, 36 | config: dict | None = None, 37 | ) -> pd.DataFrame: 38 | """Select all documents from the documents table. 39 | 40 | Parameters 41 | ---------- 42 | database 43 | The name of the DuckDB database. 44 | schema 45 | The schema where the documents table is located. limit Optional maximum number of documents to return, applied as a SQL LIMIT clause. 46 | config 47 | Optional configuration options for the DuckDB connection. 48 | 49 | Returns 50 | ------- 51 | pd.DataFrame 52 | A DataFrame containing the selected documents. 53 | 54 | Examples 55 | -------- 56 | >>> from ducksearch import tables 57 | 58 | >>> documents = tables.select_documents( 59 | ... database="test.duckdb", 60 | ... schema="bm25_tables", 61 | ... ) 62 | 63 | >>> assert len(documents) == 3 64 | """ 65 | return pd.DataFrame( 66 | _select_documents( 67 | database=database, 68 | schema=schema, 69 | limit="" if limit is None else f"LIMIT {limit}", 70 | config=config, 71 | ) 72 | ) 73 | 74 | 75 | @execute_with_duckdb( 76 | relative_path="tables/select/queries.sql", 77 | read_only=True, 78 | fetch_df=True, 79 | ) 80 | def select_queries() -> list[dict]: 81 | """Select all queries from the queries table. 82 | 83 | Returns 84 | ------- 85 | list[dict] 86 | A list of dictionaries representing the queries. 87 | 88 | Examples 89 | -------- 90 | >>> from ducksearch import tables 91 | 92 | >>> queries = tables.select_queries( 93 | ... database="test.duckdb", 94 | ... schema="bm25_tables", 95 | ... ) 96 | 97 | >>> assert len(queries) == 3 98 | """ 99 | 100 | 101 | @execute_with_duckdb( 102 | relative_path="tables/select/columns.sql", 103 | read_only=True, 104 | fields=["column"], 105 | ) 106 | def select_columns() -> list[dict]: 107 | """Retrieve the list of columns from a specified table. 108 | 109 | Returns 110 | ------- 111 | list[dict] 112 | A list of dictionaries containing the column names of the table. 
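Notes ----- The database, schema, table_name, and config keyword arguments are forwarded by the execute_with_duckdb decorator, which formats them into the tables/select/columns.sql template before execution.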
113 | """ 114 | 115 | 116 | def select_documents_columns( 117 | database: str, 118 | schema: str, 119 | config: dict | None = None, 120 | ) -> list[str]: 121 | """Select the column names from the documents table, excluding the 'bm25id' column. 122 | 123 | Parameters 124 | ---------- 125 | database 126 | The name of the DuckDB database. 127 | schema 128 | The schema where the documents table is located. 129 | config 130 | Optional configuration options for the DuckDB connection. 131 | 132 | Returns 133 | ------- 134 | list[str] 135 | A list of column names from the documents table. 136 | 137 | Examples 138 | -------- 139 | >>> from ducksearch import tables 140 | 141 | >>> tables.select_documents_columns( 142 | ... database="test.duckdb", 143 | ... schema="bm25_tables", 144 | ... ) 145 | ['id', 'title', 'text'] 146 | """ 147 | return [ 148 | column["column"] 149 | for column in select_columns( 150 | database=database, schema=schema, table_name="documents", config=config 151 | ) 152 | if column["column"] != "bm25id" 153 | ] 154 | -------------------------------------------------------------------------------- /ducksearch/tables/select/columns.sql: -------------------------------------------------------------------------------- 1 | SELECT column_name 2 | FROM information_schema.columns 3 | WHERE 4 | lower(table_name) = '{table_name}' 5 | AND table_schema = '{schema}'; 6 | -------------------------------------------------------------------------------- /ducksearch/tables/select/documents.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {schema}.documents 3 | ORDER BY id ASC 4 | {limit}; 5 | -------------------------------------------------------------------------------- /ducksearch/tables/select/queries.sql: -------------------------------------------------------------------------------- 1 | SELECT * 2 | FROM {schema}.queries 3 | ORDER BY id ASC; 4 | -------------------------------------------------------------------------------- /ducksearch/tables/update.py: -------------------------------------------------------------------------------- 1 | from ..decorators import execute_with_duckdb 2 | 3 | 4 | @execute_with_duckdb( 5 | relative_path="tables/update/documents.sql", 6 | ) 7 | def _add_columns_documents() -> None: 8 | """Add columns to the documents table in the DuckDB database. 9 | 10 | Parameters 11 | ---------- 12 | database: str 13 | The name of the DuckDB database. 14 | config: dict, optional 15 | The configuration options for the DuckDB connection. 16 | """ 17 | 18 | 19 | def add_columns_documents( 20 | database: str, 21 | schema: str, 22 | columns: list[str] | str, 23 | dtypes: dict = None, 24 | config: dict = None, 25 | ) -> None: 26 | """Add columns to the documents table in the DuckDB database. 27 | 28 | Parameters 29 | ---------- 30 | database: 31 | The name of the DuckDB database. 32 | schema: 33 | The schema in which the documents table is located. 34 | columns: 35 | The columns to add to the documents table. 36 | dtypes: 37 | The data types for the columns to add. 38 | config: 39 | The configuration options for the DuckDB connection. 
40 | 41 | """ 42 | if isinstance(columns, str): 43 | columns = [columns] 44 | 45 | if dtypes is None: 46 | dtypes = {} 47 | 48 | _add_columns_documents( 49 | database=database, 50 | schema=schema, 51 | fields=", ".join( 52 | [f"ADD COLUMN {field} {dtypes.get(field, 'VARCHAR')}" for field in columns] 53 | ), 54 | config=config, 55 | ) 56 | -------------------------------------------------------------------------------- /ducksearch/tables/update/documents.sql: -------------------------------------------------------------------------------- 1 | ALTER TABLE {schema}.documents 2 | {fields} 3 | ; -------------------------------------------------------------------------------- /ducksearch/upload/__init__.py: -------------------------------------------------------------------------------- 1 | from .upload import documents, queries 2 | 3 | __all__ = ["documents", "queries"] 4 | -------------------------------------------------------------------------------- /ducksearch/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .batch import batchify 2 | from .columns import get_list_columns_df 3 | from .hash import generate_random_hash 4 | from .parralel_tqdm import ParallelTqdm 5 | from .plot import plot, plot_shards 6 | 7 | __all__ = [ 8 | "batchify", 9 | "get_list_columns_df", 10 | "generate_random_hash", 11 | "plot", 12 | "plot_shards", 13 | "ParallelTqdm", 14 | ] 15 | -------------------------------------------------------------------------------- /ducksearch/utils/batch.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | 3 | 4 | def batchify( 5 | X: list[str], batch_size: int, desc: str = "", tqdm_bar: bool = True 6 | ) -> list: 7 | """Split a list into batches and optionally display a progress bar. 8 | 9 | Parameters 10 | ---------- 11 | X 12 | A list of items to be batched. 13 | batch_size 14 | The number of items in each batch. 15 | desc 16 | A description to display in the progress bar. 17 | tqdm_bar 18 | Whether to display a progress bar using `tqdm`. 19 | 20 | Yields 21 | ------ 22 | list 23 | A list representing a batch of items from `X`. 24 | 25 | Examples 26 | -------- 27 | >>> items = ["a", "b", "c", "d", "e", "f"] 28 | >>> batches = list(batchify(items, batch_size=2)) 29 | >>> for batch in batches: 30 | ... 
print(batch) 31 | ['a', 'b'] 32 | ['c', 'd'] 33 | ['e', 'f'] 34 | 35 | """ 36 | # Split the input list `X` into batches 37 | batches = [X[pos : pos + batch_size] for pos in range(0, len(X), batch_size)] 38 | 39 | # Use tqdm to show a progress bar if `tqdm_bar` is set to True 40 | if tqdm_bar: 41 | for batch in tqdm.tqdm( 42 | batches, 43 | position=0, 44 | total=len(batches), 45 | desc=desc, 46 | ): 47 | yield batch 48 | else: 49 | # If no progress bar is needed, simply yield the batches 50 | yield from batches 51 | -------------------------------------------------------------------------------- /ducksearch/utils/columns.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def get_list_columns_df( 5 | documents: list[dict] | pd.DataFrame, 6 | ) -> list[str]: 7 | """Get a list of columns from a list of dictionaries or a DataFrame.""" 8 | columns = None 9 | if isinstance(documents, pd.DataFrame): 10 | return list(documents.columns) 11 | 12 | if isinstance(documents, list): 13 | columns = set() 14 | for document in documents: 15 | for column in document.keys(): 16 | if column != "id": 17 | columns.add(column) 18 | return list(columns) 19 | 20 | return None 21 | -------------------------------------------------------------------------------- /ducksearch/utils/hash.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import secrets 3 | 4 | 5 | def generate_random_hash() -> str: 6 | """Generate a random SHA-256 hash.""" 7 | random_data = secrets.token_bytes(32) 8 | hash_obj = hashlib.sha256() 9 | hash_obj.update(random_data) 10 | random_hash = hash_obj.hexdigest() 11 | return random_hash 12 | -------------------------------------------------------------------------------- /ducksearch/utils/parralel_tqdm.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | from joblib import Parallel 3 | 4 | 5 | class ParallelTqdm(Parallel): 6 | """joblib.Parallel, but with a tqdm progressbar. 7 | 8 | Parameters 9 | ---------- 10 | total : int 11 | The total number of tasks to complete. 12 | desc : str 13 | A description of the task. 14 | tqdm_bar : bool, optional 15 | Whether to display a tqdm progress bar. Default is True. 16 | show_joblib_header : bool, optional 17 | Whether to display the joblib header. 
Default is False. 18 | 19 | References 20 | ---------- 21 | https://github.com/joblib/joblib/issues/972 22 | """ 23 | 24 | def __init__( 25 | self, 26 | *, 27 | total: int, 28 | desc: str, 29 | tqdm_bar: bool = True, 30 | show_joblib_header: bool = False, 31 | **kwargs, 32 | ) -> None: 33 | super().__init__(verbose=(1 if show_joblib_header else 0), **kwargs) 34 | self.total = total 35 | self.desc = desc 36 | self.tqdm_bar = tqdm_bar 37 | self.progress_bar: tqdm.tqdm | None = None 38 | 39 | def __call__(self, iterable): 40 | try: 41 | return super().__call__(iterable) 42 | finally: 43 | if self.progress_bar is not None: 44 | self.progress_bar.close() 45 | 46 | __call__.__doc__ = Parallel.__call__.__doc__ 47 | 48 | def dispatch_one_batch(self, iterator): 49 | """Dispatch a batch of tasks, and update the progress bar""" 50 | if self.progress_bar is None and self.tqdm_bar: 51 | self.progress_bar = tqdm.tqdm( 52 | desc=self.desc, 53 | total=self.total, 54 | position=0, 55 | disable=not self.tqdm_bar, 56 | unit="tasks", 57 | ) 58 | return super().dispatch_one_batch(iterator=iterator) 59 | 60 | dispatch_one_batch.__doc__ = Parallel.dispatch_one_batch.__doc__ 61 | 62 | def print_progress(self): 63 | """Display the progress of the parallel execution using tqdm""" 64 | if self.progress_bar is None: 65 | return 66 | if self.total is None and self._original_iterator is None: 67 | self.total = self.n_dispatched_tasks 68 | self.progress_bar.total = self.total 69 | self.progress_bar.refresh() 70 | self.progress_bar.update(self.n_completed_tasks - self.progress_bar.n) 71 | -------------------------------------------------------------------------------- /ducksearch/utils/plot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ..decorators import execute_with_duckdb 4 | 5 | 6 | def create_aligned_markdown_table(data: dict) -> str: 7 | """Create an aligned markdown table from a dictionary of data. 8 | 9 | Parameters 10 | ---------- 11 | data 12 | A dictionary where keys are the table names and values are their sizes. 13 | 14 | Returns 15 | ------- 16 | str 17 | A formatted markdown table showing table names and sizes. 18 | """ 19 | # Define the headers 20 | headers = ["Table", "Size"] 21 | 22 | # Find the maximum width for each column 23 | max_key_len = max(len(key) for key in data.keys()) 24 | max_val_len = max(len(str(value)) for value in data.values()) 25 | 26 | # Ensure the headers fit as well 27 | max_key_len = max(max_key_len, len(headers[0])) 28 | max_val_len = max(max_val_len, len(headers[1])) 29 | 30 | # Format the header 31 | header_row = ( 32 | f"| {headers[0].ljust(max_key_len)} | {headers[1].ljust(max_val_len)} |\n" 33 | ) 34 | separator_row = f"|{'-' * (max_key_len + 2)}|{'-' * (max_val_len + 2)}|\n" 35 | 36 | # Format the rows with aligned columns 37 | table_rows = "" 38 | for key, value in data.items(): 39 | table_rows += ( 40 | f"| {key.ljust(max_key_len)} | {str(value).ljust(max_val_len)} |\n" 41 | ) 42 | 43 | # Combine the header, separator, and rows into the final markdown table 44 | table = f"{header_row}{separator_row}{table_rows}".strip() 45 | return f"\n{table}\n" 46 | 47 | 48 | @execute_with_duckdb( 49 | relative_path="utils/plot/plot.sql", 50 | read_only=True, 51 | fetch_df=True, 52 | ) 53 | def _plot_queries_documents(): 54 | """Fetch the table statistics from the DuckDB database. 55 | 56 | Returns 57 | ------- 58 | list[dict] 59 | A list of dictionaries where each dictionary contains table statistics. 
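Notes ----- utils/plot/plot.sql aliases COUNT(*) with the table name itself, so each returned dictionary maps a fully qualified table name to its row count.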
60 | """ 61 | 62 | 63 | def plot( 64 | database: str, 65 | config: None | dict = None, 66 | tables=[ 67 | "bm25_tables.documents", 68 | "bm25_tables.queries", 69 | "bm25_documents.lengths", 70 | "bm25_queries.lengths", 71 | "bm25_tables.documents_queries", 72 | ], 73 | ) -> str: 74 | """Generate and display a markdown table with statistics of the specified dataset tables. 75 | 76 | Parameters 77 | ---------- 78 | database 79 | The name of the DuckDB database. 80 | config 81 | Optional configuration options for the DuckDB connection. 82 | tables 83 | A list of table names to plot statistics for. Defaults to common BM25 tables. 84 | 85 | Returns 86 | ------- 87 | str 88 | A markdown table representing the sizes of the specified tables. 89 | 90 | Examples 91 | -------- 92 | >>> from ducksearch import utils 93 | 94 | >>> utils.plot(database="test.duckdb") 95 | | Table | Size | 96 | |-----------|------| 97 | | documents | 5183 | 98 | | queries | 300 | 99 | """ 100 | data = {} 101 | for table in tables: 102 | try: 103 | # Fetch the table statistics for each specified table 104 | data.update( 105 | _plot_queries_documents(database=database, table=table, config=config)[ 106 | 0 107 | ] 108 | ) 109 | except Exception: 110 | continue 111 | 112 | # Clean up table names and filter out empty tables 113 | data = { 114 | table.replace(".docs", "").replace("bm25_tables.", ""): size 115 | for table, size in data.items() 116 | if size > 0 117 | } 118 | 119 | if len(data) > 0 and data is not None: 120 | return print(create_aligned_markdown_table(data=data)) 121 | 122 | 123 | def plot_shards( 124 | databases: list[str], 125 | config: None | dict = None, 126 | tables=[ 127 | "bm25_tables.documents", 128 | "bm25_tables.queries", 129 | "bm25_documents.lengths", 130 | "bm25_queries.lengths", 131 | "bm25_tables.documents_queries", 132 | ], 133 | ) -> str: 134 | """Generate and display a markdown table with statistics of the specified dataset tables. 135 | 136 | Parameters 137 | ---------- 138 | database 139 | The name of the DuckDB database. 140 | config 141 | Optional configuration options for the DuckDB connection. 142 | tables 143 | A list of table names to plot statistics for. Defaults to common BM25 tables. 144 | 145 | Returns 146 | ------- 147 | str 148 | A markdown table representing the sizes of the specified tables. 
149 | 150 | Examples 151 | -------- 152 | >>> from ducksearch import utils 153 | 154 | >>> utils.plot(database="test.duckdb") 155 | | Table | Size | 156 | |-----------|------| 157 | | documents | 5183 | 158 | | queries | 300 | 159 | """ 160 | statistics = [] 161 | for database in databases: 162 | data = {} 163 | for table in tables: 164 | try: 165 | # Fetch the table statistics for each specified table 166 | data.update( 167 | _plot_queries_documents( 168 | database=database, table=table, config=config 169 | )[0] 170 | ) 171 | except Exception: 172 | continue 173 | 174 | # Clean up table names and filter out empty tables 175 | data = { 176 | table.replace(".docs", "").replace("bm25_tables.", ""): size 177 | for table, size in data.items() 178 | if size > 0 179 | } 180 | 181 | data = { 182 | "Database": database, 183 | **data, 184 | } 185 | 186 | if len(data) > 0 and data is not None: 187 | statistics.append(data) 188 | 189 | try: 190 | statistics = pd.DataFrame(statistics) 191 | total = statistics.sum(numeric_only=True) 192 | total["Database"] = "Total" 193 | statistics = pd.concat([statistics, total.to_frame().T], ignore_index=True) 194 | statistics = "\n" + statistics.to_markdown(index=False) + "\n" 195 | print(statistics) 196 | except Exception: 197 | pass 198 | -------------------------------------------------------------------------------- /ducksearch/utils/plot/plot.sql: -------------------------------------------------------------------------------- 1 | select count(*) as '{table}' 2 | from {table}; 3 | 4 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | # Project information 2 | site_name: ducksearch 3 | site_description: A search engine for ducks 4 | site_author: Raphael Sourty 5 | site_url: https://lightonai.github.io/ducksearch 6 | 7 | # Repository 8 | repo_name: lighton/ducksearch 9 | repo_url: https://github.com/lightonai/ducksearch 10 | edit_uri: "" 11 | 12 | # Copyright 13 | copyright: Copyright © 2023 14 | 15 | # Configuration 16 | theme: 17 | name: material 18 | custom_dir: docs 19 | language: en 20 | 21 | palette: 22 | - scheme: default 23 | primary: green 24 | accent: green 25 | toggle: 26 | icon: material/brightness-7 27 | name: Switch to dark mode 28 | - scheme: slate 29 | primary: green 30 | accent: green 31 | toggle: 32 | icon: material/brightness-4 33 | name: Switch to light mode 34 | 35 | font: 36 | text: Fira Sans 37 | code: Fira Code 38 | logo: img/logo.png 39 | favicon: img/logo.ico 40 | features: 41 | - content.code.copy 42 | - navigation.tabs 43 | - navigation.instant 44 | - navigation.indexes 45 | - navigation.prune 46 | 47 | # Extras 48 | extra: 49 | social: 50 | - icon: fontawesome/brands/github-alt 51 | link: https://github.com/lightonai/ducksearch 52 | 53 | # Extensions 54 | markdown_extensions: 55 | - admonition 56 | - footnotes 57 | - tables 58 | - toc: 59 | permalink: true 60 | toc_depth: "1-3" 61 | - pymdownx.details 62 | - pymdownx.arithmatex: 63 | generic: true 64 | - pymdownx.highlight: 65 | pygments_lang_class: true 66 | - pymdownx.inlinehilite 67 | - pymdownx.tabbed: 68 | alternate_style: true 69 | - pymdownx.superfences: 70 | custom_fences: 71 | - name: vegalite 72 | class: vegalite 73 | format: !!python/name:mkdocs_charts_plugin.fences.fence_vegalite 74 | 75 | 76 | plugins: 77 | - search 78 | - awesome-pages 79 | - mkdocs-jupyter 80 | 81 | extra_javascript: 82 | - javascripts/config.js 83 | - 
https://cdn.jsdelivr.net/npm/mathjax@3.2/es5/tex-mml-chtml.js 84 | - https://cdn.jsdelivr.net/npm/vega@5 85 | - https://cdn.jsdelivr.net/npm/vega-lite@5 86 | - https://cdn.jsdelivr.net/npm/vega-embed@6 87 | - https://unpkg.com/tablesort@5.3.0/dist/tablesort.min.js 88 | - javascripts/tablesort.js 89 | 90 | extra_css: 91 | - stylesheets/extra.css 92 | - css/version-select.css 93 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning 4 | ignore::RuntimeWarning 5 | ignore::UserWarning 6 | addopts = 7 | --doctest-modules 8 | --verbose 9 | -ra 10 | --cov-config=.coveragerc 11 | -m "not web and not slow" 12 | doctest_optionflags = NORMALIZE_WHITESPACE NUMBER 13 | norecursedirs = 14 | build 15 | docs 16 | node_modules 17 | markers = 18 | web: tests that require using the Internet 19 | slow: tests that take a long time to run -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | exclude = [ 2 | ".bzr", 3 | ".direnv", 4 | ".eggs", 5 | ".git", 6 | ".git-rewrite", 7 | ".hg", 8 | ".ipynb_checkpoints", 9 | ".mypy_cache", 10 | ".nox", 11 | ".pants.d", 12 | ".pyenv", 13 | ".pytest_cache", 14 | ".pytype", 15 | ".ruff_cache", 16 | ".svn", 17 | ".tox", 18 | ".venv", 19 | ".vscode", 20 | "__pypackages__", 21 | "_build", 22 | "buck-out", 23 | "build", 24 | "dist", 25 | "node_modules", 26 | "site-packages", 27 | "venv", 28 | ] 29 | 30 | # Same as Black. 31 | line-length = 88 32 | indent-width = 4 33 | 34 | target-version = "py310" 35 | 36 | [lint] 37 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 38 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 39 | # McCabe complexity (`C901`) by default. 40 | select = ["E4", "E7", "E9", "F"] 41 | ignore = [] 42 | 43 | # Allow fix for all enabled rules (when `--fix`) is provided. 44 | fixable = ["ALL"] 45 | unfixable = [] 46 | 47 | # Allow unused variables when underscore-prefixed. 
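# For example, "_", "__", and "_unused" match the pattern below and are exempt from unused-variable (F841) checks.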
48 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 49 | 50 | [format] 51 | quote-style = "double" 52 | indent-style = "space" 53 | skip-magic-trailing-comma = false 54 | line-ending = "auto" 55 | docstring-code-format = false 56 | docstring-code-line-length = "dynamic" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description_file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | from ducksearch.__version__ import __version__ 4 | 5 | with open(file="README.md", mode="r", encoding="utf-8") as fh: 6 | long_description = fh.read() 7 | 8 | base_packages = [ 9 | "pandas >= 2.2.1", 10 | "duckdb >= 1.0.0", 11 | "pyarrow >= 16.1.0", 12 | "tqdm >= 4.66.4", 13 | "joblib >= 1.4.2", 14 | ] 15 | 16 | eval = ["ranx >= 0.3.16", "beir >= 2.0.0"] 17 | 18 | dev = [ 19 | "sqlfluff >= 3.1.0", 20 | "ruff >= 0.4.9", 21 | "pytest-cov >= 5.0.0", 22 | "pytest >= 8.2.1", 23 | "harlequin >= 1.24.0", 24 | "mkdocs-material == 9.5.32", 25 | "mkdocs-awesome-pages-plugin == 2.9.3", 26 | "mkdocs-jupyter == 0.24.8", 27 | "mkdocs_charts_plugin == 0.0.10", 28 | "numpydoc == 1.8.0", 29 | ] 30 | 31 | setuptools.setup( 32 | name="ducksearch", 33 | version=f"{__version__}", 34 | license="MIT", 35 | author="LightOn", 36 | description="DuckSearch: A Python library for efficient search in large collections of text data.", 37 | long_description=long_description, 38 | long_description_content_type="text/markdown", 39 | url="https://github.com/lightonai/ducksearch", 40 | keywords=[], 41 | packages=setuptools.find_packages(), 42 | install_requires=base_packages, 43 | extras_require={ 44 | "eval": base_packages + eval, 45 | "dev": base_packages + dev + eval, 46 | }, 47 | classifiers=[ 48 | "Programming Language :: Python :: 3", 49 | "Operating System :: OS Independent", 50 | ], 51 | python_requires=">=3.10",  # the codebase uses PEP 604 unions (X | None), which require Python 3.10 52 | include_package_data=True, # Ensure package data is included 53 | package_data={ 54 | # Include all .sql files inside the 'ducksearch' package 55 | "ducksearch": ["**/*.sql"], 56 | }, 57 | ) 58 | --------------------------------------------------------------------------------