├── .github └── workflows │ ├── python-publish.yaml │ └── run-tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── api │ ├── backends.md │ ├── documents.md │ └── embeddings.md ├── assets │ ├── favicon.png │ └── logo.png ├── index.md ├── quickstart.md └── usage.md ├── examples └── simple_django_app │ ├── manage.py │ ├── products │ ├── __init__.py │ ├── admin.py │ ├── apps.py │ ├── documents.py │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ └── views.py │ ├── requirements.txt │ └── simple_django_app │ ├── __init__.py │ ├── asgi.py │ ├── settings.py │ ├── urls.py │ └── wsgi.py ├── mkdocs.yaml ├── poetry.lock ├── pyproject.toml ├── src └── django_semantic_search │ ├── __init__.py │ ├── apps.py │ ├── backends │ ├── __init__.py │ ├── base.py │ ├── qdrant.py │ └── types.py │ ├── decorators.py │ ├── default_settings.py │ ├── documents.py │ ├── embeddings │ ├── __init__.py │ ├── base.py │ ├── fastembed.py │ ├── openai.py │ └── sentence_transformers.py │ ├── types.py │ └── utils.py └── tests ├── conftest.py ├── django_semantic_search ├── test_apps.py ├── test_decorators.py ├── test_documents.py ├── test_fastembed.py ├── test_openai_embeddings.py ├── test_sentence_transformers.py └── test_vector_index_embeddings.py └── mocks.py /.github/workflows/python-publish.yaml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | workflow_dispatch: 13 | push: 14 | # Pattern matched against refs/tags 15 | tags: 16 | - 'v*' # Push events to every version tag 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v2 25 | - name: Set up Python 26 | uses: actions/setup-python@v2 27 | with: 28 | python-version: '3.10.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install poetry 32 | poetry install 33 | - name: Build package 34 | run: poetry build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/run-tests.yaml: -------------------------------------------------------------------------------- 1 | name: Run tests 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 8 | defaults: 9 | run: 10 | shell: bash 11 | 12 | jobs: 13 | test: 14 | strategy: 15 | matrix: 16 | python-version: 17 | - '3.10.x' 18 | - '3.11.x' 19 | - '3.12.x' 20 | os: 21 | - ubuntu-22.04 22 | - windows-latest 23 | 24 | runs-on: ${{ matrix.os }} 25 | 26 | name: Python ${{ matrix.python-version }} test on ${{ matrix.os }} 27 | 28 | steps: 29 | - name: Check out repository 30 | uses: actions/checkout@v3 31 | 32 | - name: Set up python 33 | id: setup-python 34 | uses: actions/setup-python@v4 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | 38 | - name: Install Poetry 39 | uses: snok/install-poetry@v1.3.4 40 | with: 41 | virtualenvs-create: true 42 | virtualenvs-in-project: true 43 | 44 | - name: Install dependencies 45 | run: poetry install --no-interaction --extras all --with dev 46 | 47 | - name: Run tests 48 | run: | 49 | source $VENV 50 | poetry run pytest 51 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | .idea/ 163 | 164 | # Project specific settings 165 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | default_language_version: 4 | python: python3.10 5 | 6 | ci: 7 | autofix_prs: true 8 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' 9 | autoupdate_schedule: quarterly 10 | # submodules: true 11 | 12 | repos: 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v4.6.0 15 | hooks: 16 | - id: check-yaml 17 | - id: end-of-file-fixer 18 | - id: trailing-whitespace 19 | - id: check-ast 20 | - id: check-added-large-files 21 | 22 | - repo: https://github.com/astral-sh/ruff-pre-commit 23 | rev: v0.5.0 24 | hooks: 25 | - id: ruff 26 | args: [ --fix ] 27 | - id: ruff-format 28 | 29 | - repo: https://github.com/PyCQA/isort 30 | rev: 5.12.0 31 | hooks: 32 | - id: isort 33 | name: "Sort Imports" 34 | args: [ "--profile", "black" ] 35 | 36 | - repo: https://github.com/pre-commit/mirrors-mypy 37 | rev: v1.9.0 38 | hooks: 39 | - id: mypy 40 | exclude: ^examples/ 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | docs_preview: 2 | echo "Previewing docs..." 3 | mkdocs serve 4 | 5 | docs_deploy: 6 | echo "Building docs..." 7 | mkdocs gh-deploy --force 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [django-semantic-search](https://kacperlukawski.github.io/django-semantic-search/) 2 | 3 | [![Latest PyPI version](https://img.shields.io/pypi/v/django-semantic-search.svg?style=flat-square)](https://pypi.python.org/pypi/django-semantic-search/) 4 | [![GitHub License](https://img.shields.io/github/license/kacperlukawski/django-semantic-search)](LICENSE) 5 | 6 | > Bringing semantic search to Django. Integrates seamlessly with Django ORM. 7 | 8 | **Full documentation for the project is available at https://kacperlukawski.github.io/django-semantic-search/** 9 | 10 | Django built-in search capabilities are rather limited. 
Finding a relevant instance of a model relies on the relational 11 | database's search capabilities, like SQL `LIKE` queries. This is not ideal for high-quality search results. This library 12 | aims to provide a semantic search capability to Django, allowing for more relevant search results. All this is done in 13 | a Django-friendly way, integrating with Django ORM. 14 | 15 | The library does not aim to provide all the features of search engines, but rather to provide a simple way to integrate 16 | Django applications with semantic search capabilities, using existing vector search engines, a.k.a. vector databases, 17 | and embedding models. 18 | 19 | ## Installation 20 | 21 | The `django-semantic-search` library can be installed via your favorite package manager. For example, using `pip`: 22 | 23 | ```shell 24 | pip install django-semantic-search 25 | ``` 26 | 27 | The current version is still experimental, and the API may change in the future. 28 | 29 | ## Quickstart 30 | 31 | Assuming you already have a `Book` model defined in your Django application, you can define a corresponding subclass 32 | of the `Document` class from the `django_semantic_search` package. The `Document` class maps the Django model to the 33 | vector search engine. The document has to be registered with the `register_document` decorator. 34 | 35 | ```python 36 | from django_semantic_search import Document, VectorIndex, register_document 37 | from myapp.models import Book 38 | 39 | @register_document 40 | class BookDocument(Document): 41 | class Meta: 42 | model = Book 43 | indexes = [ 44 | VectorIndex("title"), 45 | VectorIndex("description"), 46 | ] 47 | ``` 48 | 49 | The `BookDocument` class defines the fields that will be indexed in the vector search engine. In this case, the `title` 50 | and `description` fields are indexed as separate vectors. The `VectorIndex` class is used to define the fields that 51 | should be indexed.
52 | 53 | A more detailed guide is available in the [Quickstart](https://kacperlukawski.github.io/django-semantic-search/quickstart/) 54 | section of the documentation. 55 | 56 | ## Usage 57 | 58 | Please refer to the [Usage](https://kacperlukawski.github.io/django-semantic-search/usage/) section in the documentation. 59 | 60 | ## Features 61 | 62 | - Define the search fields for a model. 63 | - Reflect the configuration in your vector search engine. 64 | - Auto-populate the vector search engine with the data from the Django models. 65 | - Support for multiple embedding models: 66 | - Sentence Transformers 67 | - OpenAI 68 | - FastEmbed (both dense and sparse embeddings) 69 | 70 | For the latest documentation, visit [https://kacperlukawski.github.io/django-semantic-search/](https://kacperlukawski.github.io/django-semantic-search/). 71 | 72 | ## Roadmap 73 | 74 | This is a general roadmap for the project. The list is not exhaustive and may change over time. 75 | 76 | - [ ] Allow using multiple fields for a single vector index. 77 | - [ ] Define overriding the default embedding model for each `VectorIndex`. 78 | - [ ] Implement wrappers for embedding models. 79 | - [ ] Add support for modalities other than text. 80 | - [ ] Improve the test coverage. 81 | - [ ] Add metadata filtering to the search method. 82 | 83 | If you have any suggestions or feature requests, feel free to create an issue in the project's repository. 84 | -------------------------------------------------------------------------------- /docs/api/backends.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Backends 3 | --- 4 | 5 | Backends are external tools that provide the semantic search functionality. The library does not assume 6 | any specific backend, but it provides a way to integrate with them. The following backends are supported: 7 | 8 | ## Qdrant 9 | 10 | Qdrant is a high-performance vector search engine written in Rust.
11 | 12 | ::: django_semantic_search.backends.qdrant.QdrantBackend 13 | options: 14 | members: false 15 | -------------------------------------------------------------------------------- /docs/api/documents.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: API Reference 3 | --- 4 | 5 | `django-semantic-search` was designed to mimic some of the patterns used in popular Django libraries, such as 6 | `django-import-export`, to reduce the learning curve for new users. 7 | 8 | The base concept of the library is a `Document` subclass that represents a single searchable entity. The library 9 | provides a way to define a document class for a selected model. The document class is responsible for converting 10 | the model instances into the vector representation and storing them in the vector search engine, as well as for 11 | performing the search queries. 12 | 13 | ## Documents 14 | 15 | ::: django_semantic_search.Document 16 | options: 17 | members: false 18 | -------------------------------------------------------------------------------- /docs/api/embeddings.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Embedding models 3 | --- 4 | 5 | An embedding model is a tool that converts text data into a vector representation. The quality of the embedding model 6 | is crucial for the quality of the search results. You can configure multiple embedding models in your Django settings 7 | and use them for different fields in your documents.
8 | 9 | ## Configuration 10 | 11 | ### Default Embedding Model 12 | 13 | Configure the default embedding model that will be used when no specific model is specified: 14 | 15 | ```python title="settings.py" 16 | SEMANTIC_SEARCH = { 17 | "default_embeddings": { 18 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 19 | "configuration": { 20 | "model_name": "sentence-transformers/all-MiniLM-L6-v2", 21 | }, 22 | }, 23 | } 24 | ``` 25 | 26 | ### Named Embedding Models 27 | 28 | You can define multiple named embedding models to use for different fields: 29 | 30 | ```python title="settings.py" 31 | SEMANTIC_SEARCH = { 32 | "embedding_models": { 33 | "title_model": { 34 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 35 | "configuration": { 36 | "model_name": "sentence-transformers/all-mpnet-base-v2", 37 | "document_prompt": "Title: ", 38 | }, 39 | }, 40 | "content_model": { 41 | "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel", 42 | "configuration": { 43 | "model": "text-embedding-3-small", 44 | }, 45 | }, 46 | }, 47 | ... 48 | } 49 | ``` 50 | 51 | Then reference these models in your document definitions: 52 | 53 | ```python title="documents.py" 54 | @register_document 55 | class BookDocument(Document): 56 | class Meta: 57 | model = Book 58 | indexes = [ 59 | VectorIndex("title", embedding_model="title_model"), 60 | VectorIndex("content", embedding_model="content_model"), 61 | VectorIndex("summary"), # Will use default_embeddings 62 | ] 63 | ``` 64 | 65 | Note: Fields without a specified `embedding_model` will use the model defined in `default_embeddings`. 66 | 67 | ## Supported Models 68 | 69 | Currently, `django-semantic-search` supports the following embedding models: 70 | 71 | ### Sentence Transformers 72 | 73 | The [Sentence Transformers](https://www.sbert.net) library provides a way to convert text data into a vector 74 | representation. 
There are [over 5,000 pre-trained models 75 | available](https://huggingface.co/models?library=sentence-transformers), and you can choose the one that fits your needs the 76 | best. 77 | 78 | One of the available models is `all-MiniLM-L6-v2`, which is a lightweight model that provides a good balance between the 79 | quality of the search results and the resource consumption. 80 | 81 | ::: django_semantic_search.embeddings.SentenceTransformerModel 82 | options: 83 | members: 84 | - __init__ 85 | - embed_document 86 | - embed_query 87 | - vector_size 88 | 89 | ### OpenAI 90 | 91 | [OpenAI](https://platform.openai.com/docs/guides/embeddings) provides powerful embedding models through their API. The default model is `text-embedding-3-small`, which 92 | offers a good balance between quality and cost. 93 | 94 | To use OpenAI embeddings, first install the required dependencies: 95 | 96 | ```bash 97 | pip install django-semantic-search[openai] 98 | ``` 99 | 100 | Then configure it in your Django settings: 101 | 102 | ```python title="settings.py" 103 | SEMANTIC_SEARCH = { 104 | "default_embeddings": { 105 | "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel", 106 | "configuration": { 107 | "model": "text-embedding-3-small", 108 | "api_key": "your-api-key", # Optional if set in env 109 | }, 110 | }, 111 | ... 112 | } 113 | ``` 114 | 115 | The API key can also be provided through the `OPENAI_API_KEY` environment variable. 116 | 117 | ::: django_semantic_search.embeddings.OpenAIEmbeddingModel 118 | options: 119 | members: 120 | - __init__ 121 | - embed_document 122 | - embed_query 123 | - vector_size 124 | 125 | ### FastEmbed 126 | 127 | [FastEmbed](https://github.com/qdrant/fastembed) is a lightweight and efficient embedding library that supports both 128 | dense and sparse embeddings. It provides fast, accurate embeddings suitable for production use. 
129 | 130 | #### Installation 131 | 132 | To use FastEmbed embeddings, install the required dependencies: 133 | 134 | ```bash 135 | pip install django-semantic-search[fastembed] 136 | ``` 137 | 138 | #### Dense Embeddings 139 | 140 | For dense embeddings, configure FastEmbed in your Django settings: 141 | 142 | ```python title="settings.py" 143 | SEMANTIC_SEARCH = { 144 | "default_embeddings": { 145 | "model": "django_semantic_search.embeddings.FastEmbedDenseModel", 146 | "configuration": { 147 | "model_name": "BAAI/bge-small-en-v1.5", 148 | }, 149 | }, 150 | ... 151 | } 152 | ``` 153 | 154 | ::: django_semantic_search.embeddings.FastEmbedDenseModel 155 | options: 156 | members: 157 | - __init__ 158 | - embed_document 159 | - embed_query 160 | - vector_size 161 | 162 | #### Sparse Embeddings (Coming Soon) 163 | 164 | > **Note:** Sparse embeddings support is currently under development and not yet available for use in 165 | > django-semantic-search. This feature will be available in a future release. 166 | 167 | While FastEmbed supports sparse embeddings (like BM25), the integration with django-semantic-search is still in 168 | progress. 
169 | -------------------------------------------------------------------------------- /docs/assets/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/docs/assets/favicon.png -------------------------------------------------------------------------------- /docs/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/docs/assets/logo.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Django semantic search 3 | --- 4 | 5 | # django-semantic-search 6 | 7 | [![Latest PyPI version](https://img.shields.io/pypi/v/django-semantic-search.svg?style=flat-square)](https://pypi.python.org/pypi/django-semantic-search/) 8 | [![GitHub License](https://img.shields.io/github/license/kacperlukawski/django-semantic-search)](https://github.com/kacperlukawski/django-semantic-search/blob/main/LICENSE) 9 | 10 | !!! Note "" 11 | Bringing semantic search to Django. Integrates seamlessly with Django ORM. 12 | 13 | Django built-in search capabilities are rather limited. Finding a relevant instance of a model relies on the relational 14 | database's search capabilities, like SQL `LIKE` queries. This is not ideal for high-quality search results. This library 15 | aims to provide a semantic search capability to Django, allowing for more relevant search results. All this is done in 16 | a Django-friendly way, integrating with Django ORM.
17 | 18 | The library does not aim to provide all the features of search engines, but rather to provide a simple way to integrate 19 | Django applications with semantic search capabilities, using existing vector search engines, a.k.a. vector databases, 20 | and embedding models. 21 | 22 | ## Installation 23 | 24 | The `django-semantic-search` library can be installed via your favorite package manager. For example, using `pip`: 25 | 26 | ```shell 27 | pip install django-semantic-search 28 | ``` 29 | 30 | The current version is still experimental, and the API may change in the future. 31 | 32 | ## Supported tools 33 | 34 | `django-semantic-search` has to cooperate with other tools to provide semantic search capabilities. You have to choose 35 | a vector search engine and an embedding model to use with the library, and configure them in the Django settings. 36 | 37 | ### Vector search engines 38 | 39 | The library supports the following vector search engines: 40 | 41 | - [Qdrant](api/backends.md#qdrant) 42 | 43 | If you would like to contribute support for another vector search engine, feel free to create a pull request. 44 | 45 | ### Embedding models 46 | 47 | Choosing the right embedding model is crucial for the quality of the search results. The current version of the library 48 | focuses on bringing the semantic search capabilities to Django, and provides integrations with the following vector embedding models: 49 | 50 | - [Sentence Transformers](api/embeddings.md#sentence-transformers) 51 | - [OpenAI](api/embeddings.md#openai) 52 | - [FastEmbed](api/embeddings.md#fastembed) (currently supports dense embeddings, sparse embeddings coming soon) 53 | 54 | In web-based applications, it makes a lot of sense to choose an external service for the embedding model, as it can be 55 | resource-intensive. Please do expect that the library will support more embedding models in the future, and will provide 56 | a way to integrate them with Django. 
57 | 58 | Again, if you would like to contribute support for another embedding model, feel free to create a pull request. 59 | 60 | ## Configuration 61 | 62 | As with any Django application, you need to add the library to the `INSTALLED_APPS` list in the `settings.py` file of 63 | your project: 64 | 65 | ```python title="settings.py" 66 | INSTALLED_APPS = [ 67 | ..., # external apps, such as Django Rest Framework 68 | 'django_semantic_search', 69 | ..., # your custom apps, using django-semantic-search 70 | ] 71 | ``` 72 | 73 | All the library configuration is also done in the `settings.py` file of the project, via the `SEMANTIC_SEARCH` 74 | dictionary. Here is a full example of the configuration: 75 | 76 | ```python title="settings.py" 77 | --8<-- "src/django_semantic_search/default_settings.py" 78 | ``` 79 | 80 | ## Quickstart 81 | 82 | If you would like to be guided step-by-step through the installation and configuration process, please refer to the 83 | [Quickstart](quickstart.md) guide. 84 | 85 | ## Examples 86 | 87 | If you prefer going straight to the code, you can check the `examples` folder. In the future it will contain more 88 | examples of how to use the library, but for the time being, it contains just a simple Django project with a single 89 | app that demonstrates how to use the library. 90 | 91 | ### Simple Django App 92 | 93 | The `examples` folder contains a minimal Django `simple_django_app` project using the `django-semantic-search` library. 94 | It shows how to configure semantic search in a Django project. The application defines a simple model and a document 95 | class for it, and demonstrates how to search for instances of the model using the library. 96 | 97 | #### Prerequisites 98 | 99 | By default, the `simple_django_app` project uses the `Qdrant` vector search engine and the `all-MiniLM-L6-v2` Sentence 100 | Transformers model. 
You have to install the `django-semantic-search` library with the `qdrant` and `sentence-transformers` 101 | extras to run the project. The dependencies might be installed from the requirements file: 102 | 103 | ```shell 104 | pip install -r examples/simple_django_app/requirements.txt 105 | ``` 106 | 107 | The default configuration assumes that the Qdrant service is running on `localhost:6333`. Please refer to the Qdrant 108 | documentation on [how to set up the service](https://qdrant.tech/documentation/quickstart/#download-and-run). 109 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quickstart 3 | --- 4 | 5 | This quickstart guide will help you to get started with the `django-semantic-search` library. It will guide you through 6 | the installation process, the configuration of the vector search engine and the embedding model, and the definition of 7 | documents for the selected model. 8 | 9 | Assuming you already have a Django project set up, let's get started. 10 | 11 | ## 1. Install django-semantic-search 12 | 13 | The `django-semantic-search` library can be installed via your favorite package manager. For example, using `pip`: 14 | 15 | ```shell 16 | pip install django-semantic-search 17 | ``` 18 | 19 | The default installation does not include any vector search engine or embedding model, so you typically have to install 20 | the package with the desired support. For example, to install the package with [Qdrant](https://qdrant.tech) and 21 | [Sentence Transformers](https://www.sbert.net) support, you can run: 22 | 23 | ```shell 24 | pip install django-semantic-search[qdrant,sentence-transformers] 25 | ``` 26 | 27 | ## 2. 
Modify the Django settings 28 | 29 | Add the library to the `INSTALLED_APPS` list in the `settings.py` file of your project: 30 | 31 | ```python title="settings.py" 32 | INSTALLED_APPS = [ 33 | ..., # external apps, such as Django Rest Framework 34 | 'django_semantic_search', 35 | ..., # your custom apps, using django-semantic-search 36 | ] 37 | ``` 38 | 39 | ## 3. Choose the vector search engine and the embedding model 40 | 41 | Do not close the `settings.py` file yet. You need to configure the vector search engine and the embedding model. Add the 42 | `SEMANTIC_SEARCH` dictionary to the `settings.py` file of the project. Here is a basic example: 43 | 44 | ```python title="settings.py" 45 | SEMANTIC_SEARCH = { 46 | "vector_store": { 47 | "backend": "django_semantic_search.backends.qdrant.QdrantBackend", 48 | "configuration": { 49 | "location": "http://localhost:6333", 50 | }, 51 | }, 52 | "default_embeddings": { 53 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 54 | "configuration": { 55 | "model_name": "sentence-transformers/all-MiniLM-L6-v2", 56 | }, 57 | }, 58 | } 59 | ``` 60 | 61 | For more advanced configurations, including using different embedding models for different fields, see the [Embedding Models](api/embeddings.md) documentation. 62 | 63 | ## 4. Create a model class (skip if you already have one) 64 | 65 | Our example will use a simple model class, `Book`, with the `title`, `author`, and `description` fields. Here is the 66 | model definition: 67 | 68 | ```python title="books/models.py" 69 | from django.db import models 70 | 71 | class Book(models.Model): 72 | title = models.CharField(max_length=255) 73 | author = models.CharField(max_length=255) 74 | description = models.TextField() 75 | ``` 76 | 77 | A newly created model means we need to create a migration and apply it to the database: 78 | 79 | ```shell 80 | python manage.py makemigrations 81 | python manage.py migrate 82 | ``` 83 | 84 | ## 5. 
Define document class for the selected model 85 | 86 | Once the model is defined, you need to create a document class that inherits from `django_semantic_search.Document`. 87 | 88 | Assuming we have a `Book` model with the `title`, `author`, and `description` fields, here is an example of a document 89 | class for the `Book` model, with the `title` and `description` fields defined as searchable. Please do not forget to 90 | use the `register_document` decorator to register the document class with the library. 91 | 92 | ```python title="books/documents.py" 93 | from django_semantic_search import Document, VectorIndex, register_document 94 | from books.models import Book 95 | 96 | @register_document 97 | class BookDocument(Document): 98 | class Meta: 99 | model = Book 100 | indexes = [ 101 | VectorIndex("title"), 102 | VectorIndex("description"), 103 | ] 104 | ``` 105 | 106 | Currently, only single fields can be used for the vector index. 107 | 108 | The decorator `register_document` takes care of creating the signals for the model, so all the created/updated/deleted 109 | instances of the model will be automatically indexed in the vector search engine. 110 | 111 | ## 6. Create and store the instances of the model 112 | 113 | From now on, whenever you create or update an instance of the `Book` model, the instance will be automatically indexed 114 | in the vector search engine. Here is an example of creating a new instance of the `Book` model: 115 | 116 | ```python title="books/views.py" 117 | from books.models import Book 118 | 119 | def create_book(request): 120 | book = Book.objects.create( 121 | title="The Lord of the Rings", 122 | author="J.R.R. Tolkien", 123 | description="The Lord of the Rings is an epic high-fantasy novel by the English author and scholar J. R. R. Tolkien." 124 | ) 125 | return book 126 | ``` 127 | 128 | The `create_book` function creates a new instance of the `Book` model with the title, author, and description fields 129 | filled in. 
The instance is then returned. Under the hood, a corresponding document is created and indexed in the vector 130 | search engine. It ignores the `author` field, as it is not defined as a searchable field in the `BookDocument` class. 131 | 132 | ## 7. Search for the instances of the model 133 | 134 | The `BookDocument` class serves as a bridge between the Django model and the vector search engine. You can use the 135 | `search` method to find the most relevant instances of the model. Here is an example of searching for the instances of 136 | the `Book` model: 137 | 138 | ```python title="books/views.py" 139 | from books.documents import BookDocument 140 | 141 | results = BookDocument.objects.search(title=query) 142 | ``` 143 | 144 | We specifically chose the `title` field to search for the instances of the `Book` model. The `search` method returns a 145 | queryset of the most relevant instances of the model, based on the search query. Alternatively, you can search for the 146 | instances using the `description` field: 147 | 148 | ```python title="books/views.py" 149 | results = BookDocument.objects.search(description=query) 150 | ``` 151 | 152 | Currently, only a single field can be used for the search query, but we plan to extend this functionality in the future. 153 | 154 | !!!Info 155 | This tutorial covers the happy path of using the `django-semantic-search` library. If you encounter any issues or 156 | have any questions, feel free to create an issue in the project's repository. Please make sure to check the list of 157 | [Frequently Asked Questions](usage.md#frequently-asked-questions) before creating a new issue. 158 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Usage" 3 | --- 4 | 5 | This section focuses on specific usage examples of the `django-semantic-search` library.
If you are looking for 6 | a step-by-step introduction, please refer to the [Quickstart](quickstart.md) guide. 7 | 8 | ## Configuration 9 | 10 | As with any Django application, you need to add the library to the `INSTALLED_APPS` list in the `settings.py` file of 11 | your project: 12 | 13 | ```python title="settings.py" 14 | INSTALLED_APPS = [ 15 | ..., # external apps, such as Django Rest Framework 16 | 'django_semantic_search', 17 | ..., # your custom apps, using django-semantic-search 18 | ] 19 | ``` 20 | 21 | All the library configuration is done in the `settings.py` file of the project, via the `SEMANTIC_SEARCH` 22 | dictionary. Here is a full example of the configuration: 23 | 24 | ```python title="settings.py" 25 | --8<-- "src/django_semantic_search/default_settings.py" 26 | ``` 27 | 28 | ### Using Different Embedding Models 29 | 30 | You can define multiple embedding models in the settings and use them for different fields in your documents: 31 | 32 | ```python title="settings.py" 33 | SEMANTIC_SEARCH = { 34 | "default_embeddings": { 35 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 36 | "configuration": { 37 | "model_name": "sentence-transformers/all-MiniLM-L6-v2", 38 | }, 39 | }, 40 | "embedding_models": { 41 | "title_model": { 42 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 43 | "configuration": { 44 | "model_name": "sentence-transformers/all-mpnet-base-v2", 45 | "document_prompt": "Title: ", 46 | }, 47 | }, 48 | "content_model": { 49 | "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel", 50 | "configuration": { 51 | "model": "text-embedding-3-small", 52 | }, 53 | }, 54 | } 55 | } 56 | ``` 57 | 58 | Then reference these models in your document definitions: 59 | 60 | ```python title="books/documents.py" 61 | @register_document 62 | class BookDocument(Document): 63 | class Meta: 64 | model = Book 65 | indexes = [ 66 | VectorIndex("title", embedding_model="title_model"), # Uses title_model 
67 | VectorIndex("content", embedding_model="content_model"), # Uses content_model 68 | VectorIndex("description"), # Uses default_embeddings 69 | ] 70 | ``` 71 | 72 | If no specific embedding model is specified for a `VectorIndex`, it will use the model defined in `default_embeddings`. 73 | 74 | ## Frequently Asked Questions 75 | 76 | This section describes some common questions and answers related to the `django-semantic-search` library. 77 | 78 | ### How to define which fields are searchable? 79 | 80 | To define the search fields for a model, you need to create a document class that inherits from 81 | `django_semantic_search.Document`. There is no strict requirement for the document class to be put in a specific 82 | package, but it is recommended to put it in the `documents.py` file in the app package. 83 | 84 | Assuming we have a `Book` model with the `title`, `author`, and `description` fields: 85 | 86 | ```python title="books/models.py" 87 | from django.db import models 88 | 89 | class Book(models.Model): 90 | title = models.CharField(max_length=255) 91 | author = models.CharField(max_length=255) 92 | description = models.TextField() 93 | ``` 94 | 95 | Here is an example of a document class for the `Book` model, with the `title` and `description` fields defined as 96 | searchable: 97 | 98 | ```python title="books/documents.py" 99 | from django_semantic_search import Document, VectorIndex 100 | from books.models import Book 101 | 102 | class BookDocument(Document): 103 | class Meta: 104 | model = Book 105 | indexes = [ 106 | VectorIndex("title"), 107 | VectorIndex("description"), 108 | ] 109 | ``` 110 | 111 | In this example, the default embedding model is used for all the fields, because no `embedding_model` is passed to the `VectorIndex` entries. 112 | 113 | ### How to search for documents? 114 | 115 | To search for documents, you can use the `search` method of the document class. The method returns a Django queryset 116 | with the search results.
117 | 118 | Here is an example of searching for books with the title containing the word "Django": 119 | 120 | ```python title="books/views.py" 121 | from books.documents import BookDocument 122 | 123 | def search_books(request): 124 | query = "Django" 125 | books = BookDocument.objects.search(title=query) 126 | return render(request, "books/search_results.html", {"books": books}) 127 | ``` 128 | 129 | Using the named arguments in the `search` method allows you to search for documents with specific fields. 130 | 131 | ### How to index the existing data? 132 | 133 | If you are adding the `django-semantic-search` library to an existing project, you may want to index the existing 134 | instances of the models. To do this, you can use the `index` method of the document class. 135 | 136 | Here is an example of indexing all the existing instances of the `Book` model: 137 | 138 | ```python title="index_models.py" 139 | from books.models import Book 140 | from books.documents import BookDocument 141 | 142 | def index_books(request): 143 | all_books = Book.objects.all() 144 | BookDocument.objects.index(all_books) 145 | return HttpResponse("Books indexed successfully.") 146 | ``` 147 | 148 | !!!Warning 149 | Indexing all the instances of the model can be resource-intensive, as each instance of the model has to be converted 150 | to the vector representation. It is recommended to run the indexing process in a background task or a separate 151 | management command. 
152 | -------------------------------------------------------------------------------- /examples/simple_django_app/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | 4 | import os 5 | import sys 6 | 7 | 8 | def main(): 9 | """Run administrative tasks.""" 10 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_django_app.settings") 11 | try: 12 | from django.core.management import execute_from_command_line 13 | except ImportError as exc: 14 | raise ImportError( 15 | "Couldn't import Django. Are you sure it's installed and " 16 | "available on your PYTHONPATH environment variable? Did you " 17 | "forget to activate a virtual environment?" 18 | ) from exc 19 | execute_from_command_line(sys.argv) 20 | 21 | 22 | if __name__ == "__main__": 23 | main() 24 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/examples/simple_django_app/products/__init__.py -------------------------------------------------------------------------------- /examples/simple_django_app/products/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from products.models import Product 3 | 4 | 5 | class ProductAdmin(admin.ModelAdmin): 6 | pass 7 | 8 | 9 | admin.site.register(Product, ProductAdmin) 10 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class ProductsConfig(AppConfig): 5 | default_auto_field = 
"django.db.models.BigAutoField" 6 | name = "products" 7 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/documents.py: -------------------------------------------------------------------------------- 1 | import django_semantic_search as dss 2 | 3 | from .models import Product 4 | 5 | 6 | @dss.register_document 7 | class ProductDocument(dss.Document): 8 | """ 9 | Maps the Product model to a document for the semantic search engine. 10 | """ 11 | 12 | class Meta: 13 | model = Product 14 | indexes = [ 15 | # One vector index is created for the description field 16 | dss.VectorIndex("description"), 17 | # Another vector index is created just for the name field 18 | dss.VectorIndex("name"), 19 | ] 20 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.1 on 2024-08-28 11:15 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | initial = True 8 | 9 | dependencies = [] 10 | 11 | operations = [ 12 | migrations.CreateModel( 13 | name="Product", 14 | fields=[ 15 | ( 16 | "id", 17 | models.BigAutoField( 18 | auto_created=True, 19 | primary_key=True, 20 | serialize=False, 21 | verbose_name="ID", 22 | ), 23 | ), 24 | ("name", models.CharField(max_length=255)), 25 | ("description", models.TextField()), 26 | ("thumbnail", models.URLField(blank=True, null=True)), 27 | ("price", models.DecimalField(decimal_places=2, max_digits=10)), 28 | ("created_at", models.DateTimeField(auto_now_add=True)), 29 | ("updated_at", models.DateTimeField(auto_now=True)), 30 | ], 31 | ), 32 | ] 33 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/migrations/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/examples/simple_django_app/products/migrations/__init__.py -------------------------------------------------------------------------------- /examples/simple_django_app/products/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | 4 | class Product(models.Model): 5 | """ 6 | Model to store the product information. 7 | """ 8 | 9 | name = models.CharField(max_length=255) 10 | description = models.TextField() 11 | thumbnail = models.URLField(blank=True, null=True) 12 | price = models.DecimalField(max_digits=10, decimal_places=2) 13 | created_at = models.DateTimeField(auto_now_add=True) 14 | updated_at = models.DateTimeField(auto_now=True) 15 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/tests.py: -------------------------------------------------------------------------------- 1 | # Create your tests here. 2 | -------------------------------------------------------------------------------- /examples/simple_django_app/products/views.py: -------------------------------------------------------------------------------- 1 | from django.http import JsonResponse 2 | from products.documents import ProductDocument 3 | 4 | 5 | def index(request): 6 | """ 7 | View for the index page. 8 | :param request: request object. 9 | :return: response object. 
10 | """ 11 | user_query = request.GET.get("query", "hello, world!") 12 | name_results = ProductDocument.objects.search(name=user_query) 13 | description_results = ProductDocument.objects.search(description=user_query) 14 | return JsonResponse( 15 | { 16 | "message": "Hello, world!", 17 | "name_results": list(name_results.values()), 18 | "description_results": list(description_results.values()), 19 | } 20 | ) 21 | -------------------------------------------------------------------------------- /examples/simple_django_app/requirements.txt: -------------------------------------------------------------------------------- 1 | django 2 | django-semantic-search[qdrant,sentence-transformers] 3 | -------------------------------------------------------------------------------- /examples/simple_django_app/simple_django_app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/examples/simple_django_app/simple_django_app/__init__.py -------------------------------------------------------------------------------- /examples/simple_django_app/simple_django_app/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for simple_django_app project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_django_app.settings") 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /examples/simple_django_app/simple_django_app/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for simple_django_app project. 3 | 4 | Generated by 'django-admin startproject' using Django 5.1. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.1/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/5.1/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | 15 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 16 | BASE_DIR = Path(__file__).resolve().parent.parent 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = "django-insecure-kw=7=8-o6j8*s=qar$r951i^7s*@_+%e0!ccz0_#ye7%d)&o(j" 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | "django.contrib.admin", 35 | "django.contrib.auth", 36 | "django.contrib.contenttypes", 37 | "django.contrib.sessions", 38 | "django.contrib.messages", 39 | "django.contrib.staticfiles", 40 | "django_semantic_search", # Make sure it's before all the apps that use it. 
41 | "products", 42 | ] 43 | 44 | MIDDLEWARE = [ 45 | "django.middleware.security.SecurityMiddleware", 46 | "django.contrib.sessions.middleware.SessionMiddleware", 47 | "django.middleware.common.CommonMiddleware", 48 | "django.middleware.csrf.CsrfViewMiddleware", 49 | "django.contrib.auth.middleware.AuthenticationMiddleware", 50 | "django.contrib.messages.middleware.MessageMiddleware", 51 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 52 | ] 53 | 54 | ROOT_URLCONF = "simple_django_app.urls" 55 | 56 | TEMPLATES = [ 57 | { 58 | "BACKEND": "django.template.backends.django.DjangoTemplates", 59 | "DIRS": [], 60 | "APP_DIRS": True, 61 | "OPTIONS": { 62 | "context_processors": [ 63 | "django.template.context_processors.debug", 64 | "django.template.context_processors.request", 65 | "django.contrib.auth.context_processors.auth", 66 | "django.contrib.messages.context_processors.messages", 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = "simple_django_app.wsgi.application" 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/5.1/ref/settings/#databases 77 | 78 | DATABASES = { 79 | "default": { 80 | "ENGINE": "django.db.backends.sqlite3", 81 | "NAME": BASE_DIR / "db.sqlite3", 82 | } 83 | } 84 | 85 | 86 | # Password validation 87 | # https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators 88 | 89 | AUTH_PASSWORD_VALIDATORS = [ 90 | { 91 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", 92 | }, 93 | { 94 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", 95 | }, 96 | { 97 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", 98 | }, 99 | { 100 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", 101 | }, 102 | ] 103 | 104 | 105 | # Internationalization 106 | # https://docs.djangoproject.com/en/5.1/topics/i18n/ 107 | 108 | LANGUAGE_CODE = "en-us" 109 | 110 | TIME_ZONE = "UTC" 111 | 112 | USE_I18N = True 113 | 114 
| USE_TZ = True 115 | 116 | 117 | # Static files (CSS, JavaScript, Images) 118 | # https://docs.djangoproject.com/en/5.1/howto/static-files/ 119 | 120 | STATIC_URL = "static/" 121 | 122 | # Default primary key field type 123 | # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field 124 | 125 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" 126 | 127 | # Semantic search settings 128 | 129 | SEMANTIC_SEARCH = { 130 | "vector_store": { 131 | "backend": "django_semantic_search.backends.qdrant.QdrantBackend", 132 | "configuration": { 133 | "location": "http://localhost:6333", 134 | }, 135 | }, 136 | "default_embeddings": { 137 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 138 | "configuration": { 139 | "model_name": "sentence-transformers/all-MiniLM-L6-v2", 140 | }, 141 | }, 142 | } 143 | -------------------------------------------------------------------------------- /examples/simple_django_app/simple_django_app/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL configuration for simple_django_app project. 3 | 4 | The `urlpatterns` list routes URLs to views. For more information please see: 5 | https://docs.djangoproject.com/en/5.1/topics/http/urls/ 6 | Examples: 7 | Function views 8 | 1. Add an import: from my_app import views 9 | 2. Add a URL to urlpatterns: path('', views.home, name='home') 10 | Class-based views 11 | 1. Add an import: from other_app.views import Home 12 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 13 | Including another URLconf 14 | 1. Import the include() function: from django.urls import include, path 15 | 2. 
Add a URL to urlpatterns: path('blog/', include('blog.urls')) 16 | """ 17 | 18 | from django.contrib import admin 19 | from django.urls import path 20 | from products import views as product_views 21 | 22 | urlpatterns = [ 23 | path("", product_views.index, name="index"), 24 | path("admin/", admin.site.urls), 25 | ] 26 | -------------------------------------------------------------------------------- /examples/simple_django_app/simple_django_app/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for simple_django_app project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_django_app.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /mkdocs.yaml: -------------------------------------------------------------------------------- 1 | site_name: django-semantic-search 2 | site_url: https://kacperlukawski.github.io/django-semantic-search/ 3 | site_description: Bringing semantic search to Django. Integrates seamlessly with Django ORM. 
4 | repo_url: https://github.com/kacperlukawski/django-semantic-search 5 | nav: 6 | - Home: index.md 7 | - Quickstart: quickstart.md 8 | - Usage: usage.md 9 | - API Reference: 10 | - Documents: api/documents.md 11 | - Backends: api/backends.md 12 | - Embeddings: api/embeddings.md 13 | theme: 14 | name: material 15 | logo: assets/logo.png 16 | favicon: assets/favicon.png 17 | palette: 18 | # Palette toggle for light mode 19 | - media: "(prefers-color-scheme: light)" 20 | scheme: default 21 | primary: orange 22 | toggle: 23 | icon: material/brightness-7 24 | name: Switch to dark mode 25 | # Palette toggle for dark mode 26 | - media: "(prefers-color-scheme: dark)" 27 | scheme: slate 28 | primary: deep orange 29 | toggle: 30 | icon: material/brightness-4 31 | name: Switch to light mode 32 | font: 33 | text: Roboto 34 | code: Roboto Mono 35 | features: 36 | - search.suggest 37 | - search.highlight 38 | - toc.integrate 39 | - navigation.tabs 40 | - content.code.copy 41 | plugins: 42 | - search 43 | - mkdocstrings: 44 | handlers: 45 | python: 46 | options: 47 | annotations_path: brief 48 | show_root_heading: true 49 | show_root_toc_entry: true 50 | show_symbol_type_heading: true 51 | heading_level: 3 52 | docstring_style: sphinx 53 | - social: 54 | cards_layout_options: 55 | font_family: Roboto 56 | logo: assets/logo.png 57 | background_color: "#ff6e42" 58 | color: "white" 59 | markdown_extensions: 60 | - attr_list 61 | - admonition 62 | - md_in_html 63 | - pymdownx.details 64 | - pymdownx.highlight: 65 | anchor_linenums: true 66 | line_spans: __span 67 | pygments_lang_class: true 68 | - pymdownx.inlinehilite 69 | - pymdownx.snippets 70 | - pymdownx.superfences 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "django-semantic-search" 3 | version = "0.2.1" 4 | description = "Bringing semantic search to Django. 
Integrates seamlessly with Django ORM." 5 | authors = ["Kacper Łukawski "] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | packages = [ 9 | { include = "django_semantic_search", from = "src" }, 10 | ] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.10" 14 | django = ">=5.0" 15 | qdrant-client = "^1.11.1" 16 | sentence-transformers = { version = "^4.1.0", optional = true } 17 | torch = [ 18 | {version = "^2.0.0", markers = "sys_platform == 'darwin'", source = "pypi", optional = true}, 19 | {version = "^2.0.0", markers = "sys_platform != 'darwin'", source = "pytorch_cpu", optional = true} 20 | ] 21 | openai = { version = "^1.0.0", optional = true } 22 | fastembed = { version = "^0.6.1", optional = true } 23 | 24 | [tool.poetry.extras] 25 | qdrant = ["qdrant-client"] 26 | sentence-transformers = ["sentence-transformers", "torch"] 27 | openai = ["openai"] 28 | fastembed = ["fastembed"] 29 | all = ["qdrant-client", "sentence-transformers", "torch", "openai", "fastembed"] 30 | 31 | [tool.poetry.group.dev] 32 | optional = true 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | pre-commit = "^3.8.0" 36 | ruff = "^0.6.2" 37 | pytest = "^8.3.2" 38 | mkdocs = "^1.6.1" 39 | mkdocstrings-python = "^1.11.1" 40 | mkdocs-material = {extras = ["imaging"], version = "^9.5.34"} 41 | 42 | [[tool.poetry.source]] 43 | name = "pytorch_cpu" 44 | url = "https://download.pytorch.org/whl/cpu" 45 | priority = "explicit" 46 | 47 | [tool.pytest.ini_options] 48 | minversion = "7.1" 49 | pythonpath = [ 50 | "src/" 51 | ] 52 | testpaths = [ 53 | "tests/" 54 | ] 55 | 56 | [tool.ruff] 57 | lint.typing-modules = ["cibuildwheel.typing"] 58 | 59 | [build-system] 60 | requires = ["poetry-core"] 61 | build-backend = "poetry.core.masonry.api" 62 | -------------------------------------------------------------------------------- /src/django_semantic_search/__init__.py: -------------------------------------------------------------------------------- 1 | from .decorators import 
class DjangoSemanticSearchConfig(AppConfig):
    """Application config that back-fills missing settings with the package defaults."""

    name = "django_semantic_search"
    verbose_name = "Django Semantic Search"

    def ready(self):
        """Copy every upper-case name from ``default_settings`` that the project did not define."""
        for key in dir(default_settings):
            if not key.isupper():
                # Only upper-case names are treated as settings.
                continue
            if hasattr(settings, key):
                # The project already provides this setting; keep its value.
                continue
            setattr(settings, key, getattr(default_settings, key))
class BaseVectorSearchBackend(abc.ABC):
    """
    Base class for all the vector search backends, such as Qdrant.

    The constructor stores the index configuration and immediately calls `configure`.
    """

    def __init__(self, index_configuration: IndexConfiguration):
        """
        :param index_configuration: configuration of the index this backend instance serves.
        """
        self.index_configuration = index_configuration
        # `configure` runs during construction, so a subclass has to prepare everything
        # `configure` relies on before delegating to this constructor.
        self.configure()

    @abc.abstractmethod
    def configure(self):
        """
        Configure the indexes for the backend.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def search(
        self, vector_name: str, query: List[float], limit: int = 10
    ) -> List[DocumentID]:
        """
        Search for the documents similar to the query vector in the backend.
        :param vector_name: name of the vector (index) to search in.
        :param query: query vector.
        :param limit: maximum number of document ids to return.
        :return: ids of the matching documents.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def save(self, document: Document):
        """
        Save the document in the backend.
        :param document: document to store.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def delete(self, document_id: DocumentID):
        """
        Delete the document from the backend.
        :param document_id: id of the document to delete.
        """
        raise NotImplementedError
class QdrantBackend(BaseVectorSearchBackend):
    """
    Backend that integrates with Qdrant vector database.

    It handles the configuration of separate collections per each model we want to enable search for. Users rarely
    interact with this backend directly, as backend is usually configured via Django settings.

    **Requirements**:

    ```bash
    pip install django-semantic-search[qdrant]
    ```

    **Usage**:

    ```python title="settings.py"
    SEMANTIC_SEARCH = {
        "vector_store": {
            "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
            "configuration": {
                "location": "http://localhost:6333",
            },
        },
        ...
    }
    ```
    """

    from qdrant_client import models

    # Translation of the library-level distance enum into the Qdrant client one.
    DISTANCE_MAPPING = {
        Distance.COSINE: models.Distance.COSINE,
        Distance.EUCLIDEAN: models.Distance.EUCLID,
        Distance.DOT_PRODUCT: models.Distance.DOT,
    }

    def __init__(self, index_configuration: IndexConfiguration, *args, **kwargs):
        """
        :param index_configuration: configuration of the collection handled by this backend.
        :param args: positional arguments passed to `QdrantClient`.
        :param kwargs: keyword arguments passed to `QdrantClient`.
        """
        from qdrant_client import QdrantClient

        # The client has to exist before the base constructor runs, because it calls `configure`.
        self.client = QdrantClient(*args, **kwargs)
        super().__init__(index_configuration)

    def configure(self):
        """
        Create the collection and the payload index on the id field if the collection cannot be fetched.
        """
        from qdrant_client import models

        namespace = self.index_configuration.namespace
        try:
            self.client.get_collection(collection_name=namespace)
            # TODO: validate if all the vectors are present and with correct types
        except Exception:
            # NOTE(review): any failure (not only a missing collection) ends up here and triggers
            # the creation attempt; narrowing the exception type needs the client's error classes.
            logger.warning(
                f"Collection {self.index_configuration.namespace} does not exist. Creating a new one."
            )
            self.client.create_collection(
                collection_name=namespace,
                vectors_config={
                    vector_name: models.VectorParams(
                        size=vector_config.size,
                        distance=self.DISTANCE_MAPPING.get(vector_config.distance),
                    )
                    for vector_name, vector_config in self.index_configuration.vectors.items()
                },
            )
            self.client.create_payload_index(
                collection_name=namespace,
                field_name=self.index_configuration.id_field,
                field_schema=models.PayloadSchemaType.KEYWORD,
            )

    def search(
        self, vector_name: str, query: List[float], limit: int = 10
    ) -> List[DocumentID]:
        """
        Return the ids of the documents closest to the query vector.
        :param vector_name: name of the vector to search with.
        :param query: query vector.
        :param limit: maximum number of results.
        :return: document ids read from the payload of the returned points.
        """
        results = self.client.query_points(
            collection_name=self.index_configuration.namespace,
            query=query,
            using=vector_name,
            limit=limit,
            with_vectors=False,
            with_payload=True,
        )
        return [
            result.payload.get(self.index_configuration.id_field)
            for result in results.points
        ]

    @staticmethod
    def _point_id(document_id: DocumentID) -> str:
        """
        Derive a stable point id from the document id, so saving a document again targets the same point.
        """
        return uuid.uuid5(uuid.NAMESPACE_OID, str(document_id)).hex

    def save(self, document: Document):
        """
        Upsert the document under a point id derived from the document id.

        Points written earlier under random ids are not touched here; `delete` selects points by the
        id stored in the payload, so it still covers them.
        :param document: document to store.
        """
        from qdrant_client import models

        vectors = document.vectors()
        payload = {
            self.index_configuration.id_field: document.id,
            **document.metadata(),
        }
        self.client.upsert(
            collection_name=self.index_configuration.namespace,
            points=[
                models.PointStruct(
                    id=self._point_id(document.id),
                    vector=vectors,
                    payload=payload,
                )
            ],
        )

    def delete(self, document_id: DocumentID):
        """
        Delete every point whose payload id field matches the document id.
        :param document_id: id of the document to delete.
        """
        from qdrant_client import models

        self.client.delete(
            collection_name=self.index_configuration.namespace,
            points_selector=models.Filter(
                must=[
                    models.FieldCondition(
                        key=self.index_configuration.id_field,
                        match=models.MatchValue(
                            value=document_id,
                        ),
                    )
                ]
            ),
        )
/src/django_semantic_search/backends/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from enum import Enum 3 | from typing import Dict 4 | 5 | 6 | class Distance(str, Enum): 7 | COSINE = "cosine" 8 | EUCLIDEAN = "euclidean" 9 | DOT_PRODUCT = "dot_product" 10 | 11 | 12 | @dataclass(frozen=True, eq=True, slots=True) 13 | class VectorConfiguration: 14 | size: int 15 | distance: Distance 16 | 17 | 18 | @dataclass(frozen=True, eq=True, slots=True) 19 | class IndexConfiguration: 20 | """ 21 | Configuration of the indexes to create in the vector store. 22 | """ 23 | 24 | # Name of the collection representing a particular entity type 25 | namespace: str 26 | # List of indexes to create, along with their configuration 27 | vectors: Dict[str, VectorConfiguration] = field(default_factory=dict) 28 | # Name of the property that contains the document id 29 | id_field: str = "id" 30 | 31 | def __hash__(self): 32 | frozen_vectors = frozenset(sorted(self.vectors.items())) 33 | return hash(self.namespace) + hash(self.id_field) + hash(frozen_vectors) 34 | -------------------------------------------------------------------------------- /src/django_semantic_search/decorators.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Type 3 | 4 | from django.core.exceptions import ImproperlyConfigured 5 | from django.db import models 6 | from django.dispatch import receiver 7 | 8 | from django_semantic_search.documents import Document 9 | from django_semantic_search.utils import load_backend 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def register_document(document_cls: Type[Document]) -> Type[Document]: 15 | """ 16 | Register the document class to be used for the specified model. 
def register_document(document_cls: Type[Document]) -> Type[Document]:
    """
    Class decorator that validates a document class, registers its model handlers and loads its backend.
    :param document_cls: document class to register
    :return: the document class that was passed in
    :raises ImproperlyConfigured: when the document has no meta or the meta has no model
    """
    document_name = document_cls.__name__
    fallback = Document.Meta

    meta = getattr(document_cls, "meta", None)
    if meta is None:
        raise ImproperlyConfigured(
            f"Document class {document_name} does not have a Meta class."
        )

    # The model is read from the meta of the document
    model_cls = getattr(meta, "model", fallback.model)
    if not model_cls:
        raise ImproperlyConfigured(
            f"Meta class for {document_name} does not have a model attribute."
        )

    # Every declared index is validated against the model
    for vector_index in getattr(meta, "indexes", fallback.indexes):
        vector_index.validate(model_cls)

    register_model_handlers(document_cls)

    # Loading the backend here initializes the vector store for the document
    backend = load_backend(document_cls.index_configuration)
    logger.info(
        f"Initializing vector store for {document_cls.meta.model} with backend {backend}"
    )

    return document_cls
def register_model_handlers(document_cls: Type[Document]) -> Type[Document]:
    """
    Connect post_save and post_delete receivers that mirror model changes into the vector store.
    :param document_cls: document class whose model should be observed
    :return: the document class that was passed in
    """
    meta = document_cls.meta
    model = meta.model
    logger.info(f"Registering handlers for {model}")

    if getattr(meta, "disable_signals", Document.Meta.disable_signals):
        logger.warning(
            f"Signals are disabled for {model}. Model changes "
            f"will not be reflected in the vector store."
        )
        return document_cls

    # The flag is stored on the meta instance once the receivers are connected
    if hasattr(meta, "__signals_registered__"):
        logger.warning(f"Signals are already registered for {model}.")
        return document_cls

    @receiver(models.signals.post_save, sender=model, weak=False)
    def save_model(
        sender: Type[models.Model], instance: models.Model, created: bool, **kwargs: Any
    ) -> None:
        logger.debug(f"Saving document for {instance}")
        # TODO: detect the changes in the model and determine if the document should be updated

        # Wrap the model instance in the document and store it
        document_cls(instance).save()

    @receiver(models.signals.post_delete, sender=model, weak=False)
    def delete_model(
        sender: Type[models.Model], instance: models.Model, **kwargs: Any
    ) -> None:
        logger.debug(f"Deleting document for {instance}")
        # Wrap the model instance in the document and remove it
        document_cls(instance).delete()

    setattr(meta, "__signals_registered__", True)

    return document_cls
7 | "configuration": { 8 | "location": "http://localhost:6333", 9 | }, 10 | }, 11 | # Default embeddings are used to generate the embeddings for the documents if no embeddings are provided. 12 | # This model will be used when no specific embedding_model is specified for a VectorIndex. 13 | "default_embeddings": { 14 | # Either the path to the embeddings model class or the class itself 15 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 16 | # Configuration is passed directly to the embeddings model class during initialization. 17 | "configuration": { 18 | "model_name": "sentence-transformers/all-MiniLM-L6-v2", 19 | }, 20 | }, 21 | # Optional named embedding models that can be referenced by VectorIndex instances. 22 | # This allows using different embedding models for different fields in your documents. 23 | "embedding_models": { 24 | # Each key is a unique identifier for the embedding model 25 | "title_model": { 26 | # Either the path to the embeddings model class or the class itself 27 | "model": "django_semantic_search.embeddings.SentenceTransformerModel", 28 | # Configuration is passed directly to the embeddings model class during initialization. 
29 | "configuration": { 30 | "model_name": "sentence-transformers/all-mpnet-base-v2", 31 | "document_prompt": "Title: ", 32 | }, 33 | }, 34 | "content_model": { 35 | "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel", 36 | "configuration": { 37 | "model": "text-embedding-3-small", 38 | }, 39 | }, 40 | }, 41 | } 42 | -------------------------------------------------------------------------------- /src/django_semantic_search/documents.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | from typing import Dict, Generic, Iterable, List, Optional, Type, TypeVar 4 | 5 | from django.db import models 6 | from django.db.models import QuerySet 7 | 8 | from django_semantic_search.backends.types import ( 9 | Distance, 10 | IndexConfiguration, 11 | VectorConfiguration, 12 | ) 13 | from django_semantic_search.types import DocumentID, MetadataValue, Vector 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | T = TypeVar("T", bound=models.Model) 18 | 19 | 20 | class VectorIndex: 21 | """ 22 | A definition of a single vector index. It contains the name of the index and the fields that should be indexed, 23 | but also allows to surpass the default settings of django-semantic-search. 24 | """ 25 | 26 | def __init__( 27 | self, 28 | *fields: str, 29 | index_name: Optional[str] = None, 30 | distance: Distance = Distance.COSINE, 31 | embedding_model: Optional[str] = None, 32 | ): 33 | """ 34 | :param fields: model fields to index together. 35 | :param index_name: name of the index to use in a backend. By default, it is the concatenation of the fields. 36 | :param distance: distance metric to use for similarity search. 37 | :param embedding_model: name of the embedding model to use, must be defined in SEMANTIC_SEARCH settings. 
class VectorIndex:
    """
    Definition of one vector index: the model fields it covers, its name in the backend, the distance metric
    and the embedding model used to produce the vectors.
    """

    def __init__(
        self,
        *fields: str,
        index_name: Optional[str] = None,
        distance: Distance = Distance.COSINE,
        embedding_model: Optional[str] = None,
    ):
        """
        :param fields: model fields to index together.
        :param index_name: name of the index to use in a backend. By default, it is the concatenation of the fields.
        :param distance: distance metric to use for similarity search.
        :param embedding_model: name of the embedding model to use, must be defined in SEMANTIC_SEARCH settings.
        """
        # Imported locally, a module-level import would be circular
        from django_semantic_search.utils import load_embedding_model

        if len(fields) != 1:
            raise ValueError("Only single field indexes are supported at the moment.")

        self._fields: List[str] = list(fields)
        if index_name:
            self._index_name = index_name
        else:
            self._index_name = "_".join(fields)
        self._distance = distance
        self._embedding_model = load_embedding_model(embedding_model)

    def validate(self, model_cls: Type[models.Model]):
        """
        Check that every indexed field exists on the model.
        :param model_cls: model class to validate the index for.
        :raises ValueError: for the first field missing on the model.
        """
        missing = next(
            (name for name in self._fields if not hasattr(model_cls, name)), None
        )
        if missing is not None:
            raise ValueError(
                f"Field {missing} is not present in the model {model_cls.__name__}"
            )

    def is_for_field(self, field: str) -> bool:
        """
        Tell whether the given field is one of the indexed fields.
        :param field: field to check.
        :return: True if the index is for the field, False otherwise.
        """
        return field in self._fields

    @property
    def index_name(self) -> str:
        """
        Name of the index.
        :return: index name.
        """
        return self._index_name

    @property
    def distance(self) -> Distance:
        """
        Distance metric of the index.
        :return: distance metric.
        """
        return self._distance

    @property
    def vector_size(self) -> int:
        """
        Size of a single embedding, as reported by the embedding model.
        :return: size of the embedding.
        """
        return self._embedding_model.vector_size()

    def get_model_embedding(self, instance: models.Model) -> Vector:
        """
        Embed the indexed field values of the instance, joined with a single space.
        :param instance: model instance to get the embedding for.
        :return: embedding for the instance.
        """
        text = " ".join(getattr(instance, name) for name in self._fields)
        return self._embedding_model.embed_document(text)

    def get_query_embedding(self, query: str) -> Vector:
        """
        Embed the query with the embedding model of the index.
        :param query: query to get the embedding for.
        :return: embedding for the query.
        """
        return self._embedding_model.embed_query(query)
class MetaManager:
    """
    A descriptor to store an instance of the Meta class instance on the document class.
    """

    def __get__(self, instance: Optional["Document"], owner: Type["Document"]):
        # Look only at the class' own namespace: `hasattr` would also find the value cached on a
        # parent class, and a subclass would then silently reuse the parent's Meta instance.
        if "_meta" not in vars(owner):
            setattr(owner, "_meta", owner.Meta())
        return vars(owner)["_meta"]


class IndexConfigurationManager:
    """
    A descriptor to store an instance of the IndexConfiguration class instance on the document class. The configuration
    of the index is derived from the Meta class of the document.
    """

    def __get__(
        self, instance: Optional["Document"], owner: Type["Document"]
    ) -> "IndexConfiguration":
        # Cached per class (not inherited), for the same reason as in the meta descriptor.
        if "_index_configuration" not in vars(owner):
            attr_meta = owner.meta
            model = getattr(attr_meta, "model", None)
            model_name = model.__name__ if model else None
            # An explicit `namespace = None` means "use the default", i.e. the model name.
            index_namespace = getattr(attr_meta, "namespace", None) or model_name
            indexes = getattr(attr_meta, "indexes", [])
            config = IndexConfiguration(
                namespace=index_namespace,
                vectors={
                    index.index_name: VectorConfiguration(
                        size=index.vector_size,
                        distance=index.distance,
                    )
                    for index in indexes
                },
            )
            setattr(owner, "_index_configuration", config)
        return vars(owner)["_index_configuration"]
class BackendManager:
    """
    A descriptor to store an instance of the backend on the document class. The backend is derived from the index
    configuration and is loaded dynamically.
    """

    def __get__(self, instance: Optional["Document"], owner: Type["Document"]):
        # Cache on the class itself: `hasattr` would also match a backend cached on a parent
        # class, which was loaded for the parent's index configuration.
        if "_backend" not in vars(owner):
            # Imported locally, as in the rest of this module's use of the utils helpers
            from django_semantic_search.utils import load_backend

            setattr(owner, "_backend", load_backend(owner.index_configuration))
        return vars(owner)["_backend"]
class DocumentManager(Generic[T]):
    """
    Manager attached to a document class. It is used to find similar documents in the vector index and to index
    querysets of the model instances.
    """

    def __init__(self, cls: Type["Document"]):
        """
        :param cls: document class this manager works for.
        """
        self.cls = cls

    def search(
        self,
        limit: int = 10,
        **kwargs,
    ) -> QuerySet[T]:
        """
        Find the model instances similar to the query in the vector index of a single field.
        :param limit: number of results to return.
        :param kwargs: exactly one `field_name=query` pair selecting the index and the query text.
        :return: queryset of the model instances, ordered as returned by the backend.
        :raises ValueError: when not exactly one field is given or no index covers the field.
        """
        if len(kwargs) != 1:
            raise ValueError("Only single field indexes are supported at the moment.")

        field_name, field_value = next(iter(kwargs.items()))
        # The default is required: without it `next` raises StopIteration for an unknown field
        # and the check below is never reached.
        vector_index = next(
            (
                index
                for index in self.cls.meta.indexes
                if index.is_for_field(field_name)
            ),
            None,
        )
        if vector_index is None:
            raise ValueError(f"No index found for field {field_name}")

        query_embedding = vector_index.get_query_embedding(field_value)
        document_ids = self.cls.backend.search(
            vector_index.index_name, query_embedding, limit=limit
        )
        if not document_ids:
            return self.cls.meta.model.objects.none()

        # Map each id to its position in the backend response, so the queryset keeps that order
        preserved_ids = models.Case(
            *[models.When(pk=pk, then=pos) for pos, pk in enumerate(document_ids)]
        )
        queryset = self.cls.meta.model.objects.filter(pk__in=document_ids).order_by(
            preserved_ids
        )
        return queryset

    def index(self, qs: QuerySet[T]):
        """
        Index the queryset of the model instances.
        :param qs: queryset of the model instances to index.
        """
        # TODO: this is the most basic implementation, it should be optimized
        for instance in qs:
            self.cls(instance).save()


class DocumentManagerDescriptor(Generic[T]):
    """
    A descriptor to store the document manager on the document class.
    """

    def __get__(self, instance, owner):
        # Cache on the class itself: `hasattr` would also match the manager of a parent class,
        # which is bound to the parent document class.
        if "_document_manager" not in vars(owner):
            setattr(owner, "_document_manager", DocumentManager[T](owner))
        return vars(owner)["_document_manager"]
class Document(abc.ABC, Generic[T]):
    """
    Base class for all the documents. There is a one-to-one mapping between the document subclass and the model class,
    to configure how a specific model instances should be converted to a document.

    **Usage**:

    ```python title="products/models.py"
    from django.db import models

    class Product(models.Model):
        name = models.CharField(max_length=255)
        description = models.TextField()

    ```

    ```python title="products/documents.py"
    from django_semantic_search import Document, VectorIndex
    from django_semantic_search.decorators import register_document

    @register_document
    class ProductDocument(Document):
        class Meta:
            model = Product
            indexes = [
                VectorIndex("name"),
                VectorIndex("description"),
            ]
    ```

    `django-semantic-search` will automatically handle all the configuration in the backend. The `register_document`
    decorator will register the model signals to update the documents in the vector store when the model is updated
    or deleted. As a user you don't have to manually call the `save` or `delete` methods on the document instances.

    **Search example:**

    ```python title="products/views.py"
    from django.http import JsonResponse
    from products.documents import ProductDocument

    def my_view(request):
        query = "this is a query"
        results = ProductDocument.objects.search(name=query)
        return JsonResponse(
            {
                "results": list(results.values())
            }
        )
    ```

    The `search` method on the `objects` attribute of the document class will return the queryset of the model
    instances that are similar to the query. The search is performed using the selected vector index passed as a
    keyword argument to the `search` method. In our case, we are searching for the query in the `name` field of the
    `Product` model. If we want to search in the `description` field, we would call
    `ProductDocument.objects.search(description=query)`.
    """

    # Important:
    # The following descriptors have to be defined in the specific order, as they depend on each other
    # and the order of the descriptors is the order in which they are executed.
    meta = MetaManager()
    index_configuration = IndexConfigurationManager()
    backend = BackendManager()
    objects = DocumentManagerDescriptor[T]()

    def __init__(self, instance: T):
        """
        :param instance: model instance this document represents.
        """
        self._instance = instance

    def save(self) -> None:
        """
        Save the document in the vector store.
        :raises ValueError: when the model instance has no primary key yet.
        """
        if not self._instance.pk:
            raise ValueError(
                "The model instance has to be saved before creating a document."
            )
        self.backend.save(self)

    def delete(self) -> None:
        """
        Delete the document from the vector store.
        """
        self.backend.delete(self.id)

    @property
    def id(self) -> DocumentID:
        """
        Id of the document, which is the primary key of the model instance.
        :raises ValueError: when the model instance has no primary key yet.
        """
        if not self._instance.pk:
            raise ValueError(
                "The model instance has to be saved before accessing the ID."
            )
        return self._instance.pk

    def vectors(self) -> Dict[str, Vector]:
        """
        Return the vectors for the document.
        :return: dictionary of the vectors, keyed by the index name.
        """
        return {
            index.index_name: index.get_model_embedding(self._instance)
            for index in self.meta.indexes
        }

    def metadata(self) -> Dict[str, MetadataValue]:
        """
        Return the metadata for the document.
        :return: dictionary of the metadata, keyed by the model field name.
        """
        include_fields = getattr(
            self.meta, "include_fields", Document.Meta.include_fields
        )
        # "*" stands for all the fields of the model
        if "*" in include_fields:
            include_fields = [field.name for field in self._instance._meta.fields]
        return {field: getattr(self._instance, field) for field in include_fields}

    class Meta:
        # The model this document is associated with
        model: Optional[Type[models.Model]] = None
        # Namespace for the documents in the vector store, defaults to the model name
        namespace: Optional[str] = None
        # List of vector indexes created out of the model fields
        indexes: Iterable[VectorIndex] = []
        # Model fields that should be included in the metadata
        include_fields: List[str] = ["*"]
        # Flag to disable signals on the model, so the documents are not updated on model changes
        disable_signals: bool = False
class EmbeddingModel(Protocol):
    """Structural interface shared by all embedding models."""

    def vector_size(self) -> int:
        """Size of a single embedding."""
        ...

    def supports_document(self, document: DocumentContent) -> bool:
        """Whether the model is able to embed the given document."""
        ...


class DenseEmbeddingModel(abc.ABC):
    """Abstract base of the models that produce dense vectors."""

    @abc.abstractmethod
    def vector_size(self) -> int:
        """Size of the dense embeddings produced by the model."""
        raise NotImplementedError

    @abc.abstractmethod
    def embed_document(self, document: DocumentContent) -> DenseVector:
        """Turn a document into a dense vector."""
        raise NotImplementedError

    @abc.abstractmethod
    def embed_query(self, query: Query) -> DenseVector:
        """Turn a query into a dense vector."""
        raise NotImplementedError


class SparseEmbeddingModel(abc.ABC):
    """Abstract base of the models that produce sparse vectors."""

    @abc.abstractmethod
    def embed_document(self, document: DocumentContent) -> SparseVector:
        """Turn a document into a sparse vector."""
        raise NotImplementedError

    @abc.abstractmethod
    def embed_query(self, query: Query) -> SparseVector:
        """Turn a query into a sparse vector."""
        raise NotImplementedError


class TextEmbeddingMixin:
    """Mixin for the models that embed plain text."""

    def supports_document(self, document: DocumentContent) -> bool:
        """Only `str` documents are supported."""
        is_text = isinstance(document, str)
        return is_text


class DenseTextEmbeddingModel(TextEmbeddingMixin, DenseEmbeddingModel, abc.ABC):
    """Abstract base of the dense models working on text."""


class SparseTextEmbeddingModel(TextEmbeddingMixin, SparseEmbeddingModel, abc.ABC):
    """Abstract base of the sparse models working on text."""
class FastEmbedDenseModel(DenseTextEmbeddingModel):
    """
    Dense text embedding model backed by the FastEmbed library.

    **Requirements:**

    ```shell
    pip install django-semantic-search[fastembed]
    ```

    **Usage:**

    ```python title="settings.py"
    SEMANTIC_SEARCH = {
        "default_embeddings": {
            "model": "django_semantic_search.embeddings.FastEmbedDenseModel",
            "configuration": {
                "model_name": "BAAI/bge-small-en-v1.5",
            },
        },
        ...
    }
    ```
    """

    def __init__(self, model_name: str, **kwargs):
        """
        Create the underlying FastEmbed ``TextEmbedding`` instance.

        :param model_name: name of the model to use
        :param kwargs: additional kwargs passed to FastEmbed
        """
        # Imported lazily, so FastEmbed is only needed when this model is used.
        from fastembed import TextEmbedding

        self._model = TextEmbedding(model_name=model_name, **kwargs)
        # Filled in by the first call to vector_size().
        self._vector_size: Optional[int] = None

    def vector_size(self) -> int:
        """
        Return the size of the individual embedding.

        The size is measured once by embedding a probe string and then reused.

        :return: size of the embedding.
        """
        if self._vector_size is not None:
            return self._vector_size
        probe = next(self._model.embed(["test"]))
        self._vector_size = len(probe)
        return self._vector_size

    def embed_document(self, document: str) -> DenseVector:
        """
        Embed a document into a vector.

        :param document: document to embed.
        :return: document embedding.
        """
        return next(self._model.passage_embed([document])).tolist()

    def embed_query(self, query: str) -> DenseVector:
        """
        Embed a query into a vector.

        :param query: query to embed.
        :return: query embedding.
        """
        return next(self._model.query_embed([query])).tolist()


class FastEmbedSparseModel(SparseTextEmbeddingModel):
    """
    Sparse text embedding model backed by the FastEmbed library.

    **Requirements:**

    ```shell
    pip install django-semantic-search[fastembed]
    ```

    **Important:** For now, there is no way to use the model in django-semantic-search, but it's on the way.
    """

    def __init__(self, model_name: str, **kwargs):
        """
        Create the underlying FastEmbed ``SparseTextEmbedding`` instance.

        :param model_name: name of the model to use
        :param kwargs: additional kwargs passed to FastEmbed
        """
        # Imported lazily, so FastEmbed is only needed when this model is used.
        from fastembed import SparseTextEmbedding

        self._model = SparseTextEmbedding(model_name=model_name, **kwargs)

    @staticmethod
    def _to_sparse_vector(embedding) -> SparseVector:
        """Pair up the embedding's indices and values into an ``{index: value}`` dict."""
        indices = embedding.indices.tolist()
        weights = embedding.values.tolist()
        return dict(zip(indices, weights))

    def embed_document(self, document: DocumentContent) -> SparseVector:
        """
        Embed a document into a sparse vector.

        :param document: document to embed.
        :return: mapping of index to value.
        """
        return self._to_sparse_vector(next(self._model.passage_embed([document])))

    def embed_query(self, query: Query) -> SparseVector:
        """
        Embed a query into a sparse vector.

        :param query: query to embed.
        :return: mapping of index to value.
        """
        return self._to_sparse_vector(next(self._model.query_embed([query])))
class OpenAIEmbeddingModel(DenseTextEmbeddingModel):
    """
    Dense text embedding model that obtains its vectors from the OpenAI API.

    **Requirements**:

    ```bash
    pip install django-semantic-search[openai]
    ```

    **Usage**:

    ```python title="settings.py"
    SEMANTIC_SEARCH = {
        "default_embeddings": {
            "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel",
            "configuration": {
                "model": "text-embedding-3-small",
                "api_key": "your-api-key",  # Optional if set in env
            },
        },
        ...
    }
    ```
    """

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        api_key: Optional[str] = None,
        **kwargs,
    ):
        """
        Set up the OpenAI client used for all embedding requests.

        :param model: OpenAI model to use for embeddings
        :param api_key: OpenAI API key. If not provided, will look for OPENAI_API_KEY env variable
        :param kwargs: Additional kwargs passed to OpenAI client
        :raises ValueError: when no API key is given and OPENAI_API_KEY is unset or empty
        """
        self._model = model
        # An explicit (non-empty) key wins over the environment variable.
        resolved_key = api_key if api_key else os.getenv("OPENAI_API_KEY")
        if not resolved_key:
            raise ValueError(
                "OpenAI API key must be provided either through api_key parameter "
                "or OPENAI_API_KEY environment variable"
            )
        self._client = OpenAI(api_key=resolved_key, **kwargs)
        # Filled in by the first call to vector_size().
        self._vector_size: Optional[int] = None

    def _request_embedding(self, text: str) -> DenseVector:
        """Send ``text`` to the embeddings endpoint and return the first embedding."""
        response = self._client.embeddings.create(model=self._model, input=text)
        return response.data[0].embedding

    def vector_size(self) -> int:
        """
        Return the size of the individual embedding.

        The size is measured once, by embedding a probe string through the API,
        and reused afterwards.

        :return: size of the embedding.
        """
        if self._vector_size is not None:
            return self._vector_size
        self._vector_size = len(self._request_embedding("test"))
        return self._vector_size

    def embed_document(self, document: str) -> DenseVector:
        """
        Embed a document into a vector.

        :param document: document to embed.
        :return: document embedding.
        """
        return self._request_embedding(document)

    def embed_query(self, query: str) -> DenseVector:
        """
        Embed a query into a vector; queries are embedded exactly like documents.

        :param query: query to embed.
        :return: query embedding.
        """
        return self.embed_document(query)
class SentenceTransformerModel(DenseTextEmbeddingModel):
    """
    Dense text embedding model wrapping the sentence-transformers library.

    This class is rarely instantiated by hand; it is normally referenced from the Django settings.

    **Requirements:**

    ```shell
    pip install django-semantic-search[sentence-transformers]
    ```

    **Usage:**

    ```python title="settings.py"
    SEMANTIC_SEARCH = {
        "default_embeddings": {
            "model": "django_semantic_search.embeddings.SentenceTransformerModel",
            "configuration": {
                "model_name": "sentence-transformers/all-MiniLM-L6-v2",
            },
        },
        ...
    }
    ```

    Some models accept prompts for documents and queries, which act as additional instructions when the
    embeddings are generated. With `document_prompt` set to `"Doc: "`, documents are embedded with the prompt
    `"Doc: "` followed by the document text. The `query_prompt`, if set, is applied to queries the same way.

    ```python title="settings.py"
    SEMANTIC_SEARCH = {
        "default_embeddings": {
            "model": "django_semantic_search.embeddings.SentenceTransformerModel",
            "configuration": {
                "model_name": "sentence-transformers/all-MiniLM-L6-v2",
                "document_prompt": "Doc: ",
                "query_prompt": "Query: ",
            },
        },
        ...
    }
    ```
    """

    def __init__(
        self,
        model_name: str,
        document_prompt: Optional[str] = None,
        query_prompt: Optional[str] = None,
    ):
        """
        Load the sentence-transformers model and remember the optional prompts.

        :param model_name: name of the model to use.
        :param document_prompt: prompt to use for the document, defaults to None.
        :param query_prompt: prompt to use for the query, defaults to None.
        """
        # Imported lazily, so sentence-transformers is only needed when this model is used.
        from sentence_transformers import SentenceTransformer

        self._document_prompt = document_prompt
        self._query_prompt = query_prompt
        self._model = SentenceTransformer(model_name)

    def _encode(self, text: str, prompt: Optional[str]) -> DenseVector:
        """Encode ``text`` with the given prompt and convert the result to a list."""
        encoded = self._model.encode(text, prompt=prompt)
        return encoded.tolist()

    def vector_size(self) -> int:
        """
        Return the size of the individual embedding.

        :return: size of the embedding.
        """
        return self._model.get_sentence_embedding_dimension()

    def embed_document(self, document: str) -> DenseVector:
        """
        Embed a document into a vector, using the document prompt.

        :param document: document to embed.
        :return: document embedding.
        """
        return self._encode(document, self._document_prompt)

    def embed_query(self, query: str) -> DenseVector:
        """
        Embed a query into a vector, using the query prompt.

        :param query: query to embed.
        :return: query embedding.
        """
        return self._encode(query, self._query_prompt)
def _resolve_class(class_or_path):
    """Return the class itself, importing it first when given as a dotted path."""
    if isinstance(class_or_path, str):
        return import_string(class_or_path)
    return class_or_path


@cache
def load_embedding_model(model_name: Optional[str] = None) -> DenseTextEmbeddingModel:
    """
    Load the embedding model specified in settings.

    Results are memoised per ``model_name``, so the settings are read only on
    the first call for a given name; later changes to ``SEMANTIC_SEARCH`` do not
    affect a model that has already been loaded.

    :param model_name: name of the model configuration to use from the
        ``embedding_models`` section of the settings; ``None`` selects
        ``default_embeddings``
    :return: embedding model instance
    :raises ValueError: if ``model_name`` is given but the settings have no
        ``embedding_models`` section or no entry of that name
    """
    semantic_search_settings = settings.SEMANTIC_SEARCH

    if model_name is None:
        model_config = semantic_search_settings["default_embeddings"]
    else:
        if "embedding_models" not in semantic_search_settings:
            raise ValueError("No embedding_models defined in settings")
        if model_name not in semantic_search_settings["embedding_models"]:
            raise ValueError(f"Embedding model {model_name} not found in settings")
        model_config = semantic_search_settings["embedding_models"][model_name]

    model_cls = _resolve_class(model_config["model"])
    # "configuration" is optional: a model that needs no arguments may omit it
    # (or set it to None) instead of failing with a bare KeyError.
    model_configuration = model_config.get("configuration") or {}
    return model_cls(**model_configuration)


@cache
def load_backend(index_configuration: IndexConfiguration):
    """
    Load the backend, as specified in the settings.

    Results are memoised per ``index_configuration``, so each distinct
    configuration gets a single backend instance.

    :param index_configuration: index configuration handed to the backend
        constructor as its first argument.
    :return: backend instance.
    """
    vector_store_settings = settings.SEMANTIC_SEARCH["vector_store"]
    backend_cls = _resolve_class(vector_store_settings["backend"])
    # "configuration" is optional here as well, for backends without arguments.
    backend_config = vector_store_settings.get("configuration") or {}
    return backend_cls(index_configuration, **backend_config)
def test_register_document_fails_on_missing_meta():
    """
    Test that the register_document decorator fails when the document class does not have a Meta class.
    """
    # pytest.raises fails the test when nothing is raised; a plain try/except
    # would pass silently in that case.
    with pytest.raises(ImproperlyConfigured) as exc_info:

        @dss.register_document
        class InvalidDocument:  # noqa
            pass

    assert (
        str(exc_info.value)
        == "Document class InvalidDocument does not have a Meta class."
    )


def test_register_document_fails_on_duplicate_registration():
    """
    Test that the register_document decorator fails when the document class is registered for the same model twice.
    """

    @dss.register_document
    class Document1(dss.Document):  # noqa
        class Meta:
            model = DummyModel

    # The second registration for the same model must be rejected.
    with pytest.raises(ImproperlyConfigured) as exc_info:

        @dss.register_document
        class Document2(dss.Document):  # noqa
            class Meta:
                model = DummyModel

    assert (
        str(exc_info.value)
        == "Document class for model DummyModel is already registered."
    )


@pytest.mark.integration
def test_register_document_creates_update_delete_signals():
    """
    Test that the document registers the post_save and post_delete signals for the model.
    """

    class SingleUseDummyModel(DummyModel):
        """Single use model for testing the document registration in this test only."""

        class Meta:
            app_label = "test_decorators"

    # No listeners may exist before the document is registered.
    assert not models.signals.post_save.has_listeners(SingleUseDummyModel)
    assert not models.signals.post_delete.has_listeners(SingleUseDummyModel)

    @dss.register_document
    class AnotherDummyDocument(dss.Document):  # noqa
        class Meta:
            model = SingleUseDummyModel
            namespace = "dummy"
            indexes = (dss.VectorIndex("name"),)

    assert models.signals.post_save.has_listeners(SingleUseDummyModel)
    assert models.signals.post_delete.has_listeners(SingleUseDummyModel)
@pytest.fixture(scope="module")
def django_test_database():
    """
    Create the table for the dummy model and drop it again after the module's tests.
    :return:
    """
    from django.db import connection

    with connection.schema_editor() as editor:
        created = editor.create_model(DummyModel)
        yield created
        editor.delete_model(DummyModel)


def test_dummy_document_produces_vectors():
    """
    Test that the document produces one vector per configured index.
    """
    instance = DummyModel(
        name="test", description="test description", ignored_field="ignored"
    )
    vectors = DummyDocument(instance).vectors()
    assert len(vectors) == 2
    assert "name" in vectors
    assert "description" in vectors
    assert "name_description" not in vectors


def test_dummy_document_produces_metadata():
    """
    Test that the document metadata carries the model field values.
    """
    instance = DummyModel(
        name="test", description="test description", ignored_field="ignored"
    )
    metadata = DummyDocument(instance).metadata()
    assert "name" in metadata
    assert "description" in metadata
    assert metadata["name"] == "test"
    assert metadata["description"] == "test description"
def test_document_signals_work_correctly(django_test_database):
    """
    Test that saving and deleting a model instance is reflected in the search results.
    """

    def matching_documents() -> int:
        # Query afresh each time so the current state of the index is observed.
        return DummyDocument.objects.search(name="test").count()

    dummy = DummyModel(
        name="test", description="test description", ignored_field="ignored"
    )
    # Nothing has been saved yet, so the search finds nothing.
    assert matching_documents() == 0

    dummy.save()
    assert matching_documents() == 1

    dummy.delete()
    assert matching_documents() == 0
@pytest.mark.integration
class TestFastEmbedDenseModel:
    """Integration tests for the FastEmbed dense embedding model."""

    @pytest.fixture(autouse=True)
    def setup_model(self):
        # A fresh model per test keeps the vector-size cache checks independent.
        self.model = FastEmbedDenseModel(model_name="BAAI/bge-small-en-v1.5")

    def test_initialization(self):
        # Constructing the model already requires fastembed, so importing it
        # here adds no new requirement.
        from fastembed import TextEmbedding

        model = FastEmbedDenseModel(model_name="BAAI/bge-small-en-v1.5")
        # isinstance(..., object) is true for every value; check the real type.
        assert isinstance(model._model, TextEmbedding)
        assert model._vector_size is None  # Size should be initially uncached

    def test_vector_size(self):
        size = self.model.vector_size()
        assert isinstance(size, int)
        assert size > 0
        # Check that size is cached
        assert self.model._vector_size == size
        # Get it again to test cached path
        assert self.model.vector_size() == size

    def test_embed_document(self):
        vector = self.model.embed_document("This is a test document")
        assert isinstance(vector, list)
        assert len(vector) == self.model.vector_size()
        assert all(isinstance(x, float) for x in vector)

    def test_embed_query(self):
        vector = self.model.embed_query("test query")
        assert isinstance(vector, list)
        assert len(vector) == self.model.vector_size()
        assert all(isinstance(x, float) for x in vector)

    def test_consistent_embeddings(self):
        text = "This is a test document"
        vector1 = self.model.embed_document(text)
        vector2 = self.model.embed_document(text)
        # Vectors should be nearly identical for same input
        assert np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
# Applied per test rather than on the whole class, so that the missing-key test
# below, which needs no credentials, still runs when no API key is available.
requires_openai_api_key = pytest.mark.skipif(
    not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set in environment"
)


class TestOpenAIEmbeddingModel:
    """Tests for the OpenAI embedding model; most of them call the real API."""

    @requires_openai_api_key
    def test_initialization(self):
        model = OpenAIEmbeddingModel()
        assert model._model == "text-embedding-3-small"

    def test_initialization_fails_without_api_key(self, monkeypatch):
        # The constructor rejects a missing key before creating any client,
        # so this runs without credentials or network access.
        monkeypatch.delenv("OPENAI_API_KEY", raising=False)
        with pytest.raises(ValueError) as exc_info:
            OpenAIEmbeddingModel()
        assert "OpenAI API key must be provided" in str(exc_info.value)

    @requires_openai_api_key
    def test_vector_size(self):
        model = OpenAIEmbeddingModel()
        size = model.vector_size()
        assert isinstance(size, int)
        assert size > 0
        # Check that size is cached
        assert model._vector_size == size

    @requires_openai_api_key
    def test_embed_document(self):
        model = OpenAIEmbeddingModel()
        vector = model.embed_document("This is a test document")
        assert isinstance(vector, list)
        assert len(vector) == model.vector_size()
        assert all(isinstance(x, float) for x in vector)

    @requires_openai_api_key
    def test_embed_query(self):
        model = OpenAIEmbeddingModel()
        vector = model.embed_query("test query")
        assert isinstance(vector, list)
        assert len(vector) == model.vector_size()
        assert all(isinstance(x, float) for x in vector)

    @requires_openai_api_key
    def test_consistent_embeddings(self):
        model = OpenAIEmbeddingModel()
        text = "This is a test document"
        vector1 = model.embed_document(text)
        vector2 = model.embed_document(text)
        # Vectors should be nearly identical for same input
        assert np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
self.model.vector_size() 45 | assert all(isinstance(x, float) for x in vector) 46 | 47 | def test_embed_query(self): 48 | vector = self.model.embed_query("test query") 49 | assert isinstance(vector, list) 50 | assert len(vector) == self.model.vector_size() 51 | assert all(isinstance(x, float) for x in vector) 52 | 53 | def test_consistent_embeddings(self): 54 | text = "This is a test document" 55 | vector1 = self.model.embed_document(text) 56 | vector2 = self.model.embed_document(text) 57 | # Vectors should be nearly identical for same input 58 | assert np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8) 59 | 60 | def test_document_prompt_affects_embedding(self): 61 | model_with_prompt = SentenceTransformerModel( 62 | model_name="sentence-transformers/all-MiniLM-L6-v2", 63 | document_prompt="Doc: ", 64 | ) 65 | text = "This is a test document" 66 | vector1 = self.model.embed_document(text) 67 | vector2 = model_with_prompt.embed_document(text) 68 | # Vectors should be different when using a prompt 69 | assert not np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8) 70 | 71 | def test_query_prompt_affects_embedding(self): 72 | model_with_prompt = SentenceTransformerModel( 73 | model_name="sentence-transformers/all-MiniLM-L6-v2", 74 | query_prompt="Query: ", 75 | ) 76 | text = "test query" 77 | vector1 = self.model.embed_query(text) 78 | vector2 = model_with_prompt.embed_query(text) 79 | # Vectors should be different when using a prompt 80 | assert not np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8) 81 | -------------------------------------------------------------------------------- /tests/django_semantic_search/test_vector_index_embeddings.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from django.conf import settings 3 | from django.db import models 4 | 5 | from django_semantic_search import Document, VectorIndex, register_document 6 | 7 | 8 | class TestModel(models.Model): 9 | title = 
@pytest.mark.integration
class TestVectorIndexEmbeddings:
    """Integration tests for selecting an embedding model per ``VectorIndex``."""

    @pytest.fixture(autouse=True)
    def setup_settings(self):
        """Install the semantic-search settings for one test, then restore them.

        The configuration is assigned on the global Django ``settings`` object,
        so whatever was there before is put back after the test; otherwise it
        would remain in place for every test that runs afterwards.
        """
        missing = object()  # sentinel: the setting may not have existed at all
        previous = getattr(settings, "SEMANTIC_SEARCH", missing)
        settings.SEMANTIC_SEARCH = {
            "vector_store": {
                "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
                "configuration": {"location": ":memory:"},
            },
            "default_embeddings": {
                "model": "django_semantic_search.embeddings.SentenceTransformerModel",
                "configuration": {
                    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
                },
            },
            "embedding_models": {
                "title_model": {
                    "model": "django_semantic_search.embeddings.SentenceTransformerModel",
                    "configuration": {
                        "model_name": "sentence-transformers/all-mpnet-base-v2",
                        "document_prompt": "Title: ",
                    },
                },
                "content_model": {
                    "model": "django_semantic_search.embeddings.SentenceTransformerModel",
                    "configuration": {
                        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
                        "document_prompt": "Content: ",
                    },
                },
            },
        }
        yield
        # Teardown: undo the override made above.
        if previous is missing:
            del settings.SEMANTIC_SEARCH
        else:
            settings.SEMANTIC_SEARCH = previous

    def test_different_models_for_indexes(self):
        """Indexes bound to different named models produce different-sized vectors."""

        @register_document
        class TestDocument(Document):
            class Meta:
                model = TestModel
                indexes = [
                    VectorIndex("title", embedding_model="title_model"),
                    VectorIndex("content", embedding_model="content_model"),
                ]

        # Create test instances
        instance = TestModel(title="Test Title", content="Test Content")

        # Get embeddings for both fields
        title_embedding = TestDocument.meta.indexes[0].get_model_embedding(instance)
        content_embedding = TestDocument.meta.indexes[1].get_model_embedding(instance)

        # Embeddings should be different sizes due to different models
        assert len(title_embedding) != len(content_embedding)

    def test_default_model_fallback(self):
        """An index without ``embedding_model`` still produces an embedding."""

        @register_document
        class TestDocument(Document):
            class Meta:
                model = TestModel
                indexes = [
                    VectorIndex("title"),  # Uses default model
                    VectorIndex("content", embedding_model="content_model"),
                ]

        instance = TestModel(title="Test Title", content="Test Content")

        # Both embeddings should work
        title_embedding = TestDocument.meta.indexes[0].get_model_embedding(instance)
        content_embedding = TestDocument.meta.indexes[1].get_model_embedding(instance)

        assert isinstance(title_embedding, (list, tuple))
        assert isinstance(content_embedding, (list, tuple))

    def test_invalid_model_name(self):
        """Referencing an unconfigured embedding model raises ``ValueError``."""
        with pytest.raises(ValueError) as exc_info:
            VectorIndex("title", embedding_model="non_existent_model")
        # Checked after the ``with`` block: statements following the raising
        # call inside the block would never execute.
        assert "Embedding model non_existent_model not found in settings" in str(
            exc_info.value
        )
class MockVectorSearchBackend(BaseVectorSearchBackend):
    """
    Mock vector search backend for testing purposes. It keeps the saved documents in memory, grouped by the
    namespace of the index configuration, and answers searches with a pseudo-random selection of them that is
    reproducible for a given query vector.
    """

    def __init__(self, index_configuration: IndexConfiguration):
        super().__init__(index_configuration)
        # namespace -> {document id -> document}; the inner dict is created on first access.
        self._documents: Dict[str, Dict[DocumentID, Document]] = defaultdict(dict)

    def configure(self):
        """No configuration is needed for the mock backend."""
        pass

    def search(
        self, vector_name: str, query: Vector, limit: int = 10
    ) -> List[DocumentID]:
        """
        Return the ids of up to ``limit`` stored documents, chosen pseudo-randomly.

        No similarity is computed and ``vector_name`` is ignored. The selection is seeded with ``sum(query)``, so
        the same query against the same stored documents yields the same ids.

        :param vector_name: name of the vector to search in; unused by the mock.
        :param query: query vector; only the sum of its elements is used, as the seed.
        :param limit: maximum number of ids to return.
        :return: ids of the selected documents.
        """
        stored = self._documents[self.index_configuration.namespace]
        # A private generator gives the same sequence as seeding the module-level one, without resetting the
        # process-wide random state for every other user of ``random``.
        rng = random.Random(sum(query))
        selected_documents = rng.sample(list(stored.values()), k=min(limit, len(stored)))
        return [doc.id for doc in selected_documents]

    def save(self, document: Document) -> None:
        """Store ``document`` under its id, replacing any document already saved with that id."""
        self._documents[self.index_configuration.namespace][document.id] = document

    def delete(self, document_id: DocumentID) -> None:
        """
        Remove the document with the given id from the current namespace.

        :raises KeyError: if no document with this id has been saved.
        """
        del self._documents[self.index_configuration.namespace][document_id]
--------------------------------------------------------------------------------