├── .github
    └── workflows
    │   ├── python-publish.yaml
    │   └── run-tests.yaml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── Makefile
├── README.md
├── docs
    ├── api
    │   ├── backends.md
    │   ├── documents.md
    │   └── embeddings.md
    ├── assets
    │   ├── favicon.png
    │   └── logo.png
    ├── index.md
    ├── quickstart.md
    └── usage.md
├── examples
    └── simple_django_app
    │   ├── manage.py
    │   ├── products
    │       ├── __init__.py
    │       ├── admin.py
    │       ├── apps.py
    │       ├── documents.py
    │       ├── migrations
    │       │   ├── 0001_initial.py
    │       │   └── __init__.py
    │       ├── models.py
    │       ├── tests.py
    │       └── views.py
    │   ├── requirements.txt
    │   └── simple_django_app
    │       ├── __init__.py
    │       ├── asgi.py
    │       ├── settings.py
    │       ├── urls.py
    │       └── wsgi.py
├── mkdocs.yaml
├── poetry.lock
├── pyproject.toml
├── src
    └── django_semantic_search
    │   ├── __init__.py
    │   ├── apps.py
    │   ├── backends
    │       ├── __init__.py
    │       ├── base.py
    │       ├── qdrant.py
    │       └── types.py
    │   ├── decorators.py
    │   ├── default_settings.py
    │   ├── documents.py
    │   ├── embeddings
    │       ├── __init__.py
    │       ├── base.py
    │       ├── fastembed.py
    │       ├── openai.py
    │       └── sentence_transformers.py
    │   ├── types.py
    │   └── utils.py
└── tests
    ├── conftest.py
    ├── django_semantic_search
        ├── test_apps.py
        ├── test_decorators.py
        ├── test_documents.py
        ├── test_fastembed.py
        ├── test_openai_embeddings.py
        ├── test_sentence_transformers.py
        └── test_vector_index_embeddings.py
    └── mocks.py


/.github/workflows/python-publish.yaml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   workflow_dispatch:  # allows the workflow to be started manually
13 |   push:
14 |     # Pattern matched against refs/tags
15 |     tags:
16 |       - 'v*'           # Push events to every version tag
17 | 
18 | jobs:
19 |   deploy:
20 |     # Builds the distribution with Poetry and hands it to the PyPI publish action.
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v3  # same major version as run-tests.yaml
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v4  # same major version as run-tests.yaml
27 |       with:
28 |         python-version: '3.10.x'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install poetry
32 |         poetry install
33 |     - name: Build package
34 |       run: poetry build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29  # pinned to a commit SHA
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}  # read from the repository secrets
40 | 


--------------------------------------------------------------------------------
/.github/workflows/run-tests.yaml:
--------------------------------------------------------------------------------
 1 | name: Run tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]  # pushes trigger the workflow on main only
 6 |   pull_request:  # no branch filter is set for pull requests
 7 | 
 8 | defaults:
 9 |   run:
10 |     shell: bash  # every `run` step uses bash, including on the Windows runner
11 | 
12 | jobs:
13 |   test:
14 |     strategy:
15 |       matrix:  # 3 Python versions x 2 operating systems
16 |         python-version:
17 |           - '3.10.x'
18 |           - '3.11.x'
19 |           - '3.12.x'
20 |         os:
21 |           - ubuntu-22.04
22 |           - windows-latest
23 | 
24 |     runs-on: ${{ matrix.os }}
25 | 
26 |     name: Python ${{ matrix.python-version }} test on ${{ matrix.os }}
27 | 
28 |     steps:
29 |       - name: Check out repository
30 |         uses: actions/checkout@v3
31 | 
32 |       - name: Set up python
33 |         id: setup-python
34 |         uses: actions/setup-python@v4
35 |         with:
36 |           python-version: ${{ matrix.python-version }}
37 | 
38 |       - name: Install Poetry
39 |         uses: snok/install-poetry@v1.3.4
40 |         with:
41 |           virtualenvs-create: true
42 |           virtualenvs-in-project: true
43 |       # Install the package with all extras plus the `dev` dependency group.
44 |       - name: Install dependencies
45 |         run: poetry install --no-interaction --extras all --with dev
46 |       # NOTE(review): $VENV is presumably exported by the install-poetry step - confirm.
47 |       - name: Run tests
48 |         run: |
49 |           source $VENV
50 |           poetry run pytest
51 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | .idea/
163 | 
164 | # Project specific settings
165 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # See https://pre-commit.com for more information
 2 | # See https://pre-commit.com/hooks.html for more hooks
 3 | default_language_version:
 4 |   python: python3.10
 5 | 
 6 | ci:  # NOTE(review): presumably consumed by the pre-commit.ci service (see the commit message prefix) - confirm
 7 |   autofix_prs: true
 8 |   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
 9 |   autoupdate_schedule: quarterly
10 |   # submodules: true
11 | # Hook repositories; each one is pinned to a fixed revision via `rev`.
12 | repos:
13 |   - repo: https://github.com/pre-commit/pre-commit-hooks
14 |     rev: v4.6.0
15 |     hooks:
16 |       - id: check-yaml
17 |       - id: end-of-file-fixer
18 |       - id: trailing-whitespace
19 |       - id: check-ast
20 |       - id: check-added-large-files
21 |   # ruff is run with --fix, followed by ruff-format.
22 |   - repo: https://github.com/astral-sh/ruff-pre-commit
23 |     rev: v0.5.0
24 |     hooks:
25 |       - id: ruff
26 |         args: [ --fix ]
27 |       - id: ruff-format
28 |   # isort is run with its "black" profile.
29 |   - repo: https://github.com/PyCQA/isort
30 |     rev: 5.12.0
31 |     hooks:
32 |       - id: isort
33 |         name: "Sort Imports"
34 |         args: [ "--profile", "black" ]
35 |   # mypy skips every path that starts with examples/.
36 |   - repo: https://github.com/pre-commit/mirrors-mypy
37 |     rev: v1.9.0
38 |     hooks:
39 |       - id: mypy
40 |         exclude: ^examples/
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: docs_preview docs_deploy  # command targets, not files to build
2 | 
3 | docs_preview:  # serve the documentation through `mkdocs serve`
4 | 	echo "Previewing docs..."
5 | 	mkdocs serve
6 | 
7 | docs_deploy:  # publish the documentation through `mkdocs gh-deploy --force`
8 | 	echo "Building docs..."
9 | 	mkdocs gh-deploy --force


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # [django-semantic-search](https://kacperlukawski.github.io/django-semantic-search/)
 2 | 
 3 | [![Latest PyPI version](https://img.shields.io/pypi/v/django-semantic-search.svg?style=flat-square)](https://pypi.python.org/pypi/django-semantic-search/)
 4 | [![GitHub License](https://img.shields.io/github/license/kacperlukawski/django-semantic-search)](LICENSE)
 5 | 
 6 | > Bringing semantic search to Django. Integrates seamlessly with Django ORM.
 7 | 
 8 | **Full documentation for the project is available at https://kacperlukawski.github.io/django-semantic-search/**
 9 | 
10 | Django's built-in search capabilities are rather limited. Finding a relevant instance of a model relies on the relational
11 | database's search capabilities, like SQL `LIKE` queries. This is not ideal for high-quality search results. This library
12 | aims to provide a semantic search capability to Django, allowing for more relevant search results. All this is done in
13 | a Django-friendly way, integrating with Django ORM.
14 | 
15 | The library does not aim to provide all the features of search engines, but rather to provide a simple way to integrate
16 | Django applications with semantic search capabilities, using existing vector search engines, a.k.a. vector databases,
17 | and embedding models.
18 | 
19 | ## Installation
20 | 
21 | The `django-semantic-search` library can be installed via your favorite package manager. For example, using `pip`:
22 | 
23 | ```shell
24 | pip install django-semantic-search
25 | ```
26 | 
27 | The current version is still experimental, and the API may change in the future.
28 | 
29 | ## Quickstart
30 | 
31 | Assuming you already have a `Book` model defined in your Django application, you can define a corresponding subclass
32 | of the `Document` class from the `django_semantic_search` package. The `Document` class maps the Django model to the
33 | vector search engine. The document has to be registered with the `register_document` decorator.
34 | 
35 | ```python
36 | from django_semantic_search import Document, VectorIndex, register_document
37 | from myapp.models import Book
38 | 
39 | @register_document
40 | class BookDocument(Document):
41 |     class Meta:
42 |         model = Book
43 |         indexes = [
44 |             VectorIndex("title"),
45 |             VectorIndex("description"),
46 |         ]
47 | ```
48 | 
49 | The `BookDocument` class defines the fields that will be indexed in the vector search engine. In this case, the `title`
50 | and `description` fields are indexed as separate vectors. The `VectorIndex` class is used to define the fields that
51 | should be indexed.
52 | 
53 | A more detailed guide is available in the [Quickstart](https://kacperlukawski.github.io/django-semantic-search/quickstart/)
54 | section of the documentation.
55 | 
56 | ## Usage
57 | 
58 | Please refer to the [Usage](https://kacperlukawski.github.io/django-semantic-search/usage/) section in the documentation.
59 | 
60 | ## Features
61 | 
62 | - Define the search fields for a model.
63 | - Reflect the configuration in your vector search engine.
64 | - Auto-populate the vector search engine with the data from the Django models.
65 | - Support for multiple embedding models:
66 |   - Sentence Transformers
67 |   - OpenAI
68 |   - FastEmbed (both dense and sparse embeddings)
69 | 
70 | For the latest documentation, visit [https://kacperlukawski.github.io/django-semantic-search/](https://kacperlukawski.github.io/django-semantic-search/).
71 | 
72 | ## Roadmap
73 | 
74 | This is a general roadmap for the project. The list is not exhaustive and may change over time.
75 | 
76 | - [ ] Allow using multiple fields for a single vector index.
77 | - [ ] Define overriding the default embedding model for each `VectorIndex`.
78 | - [ ] Implement wrappers for embedding models.
79 | - [ ] Add support for modalities other than text.
80 | - [ ] Improve the test coverage.
81 | - [ ] Add metadata filtering to the search method.
82 | 
83 | If you have any suggestions or feature requests, feel free to create an issue in the project's repository.
84 | 


--------------------------------------------------------------------------------
/docs/api/backends.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Backends
 3 | ---
 4 | 
 5 | Backends are the external tools that provide the semantic search functionality. The library does not assume
 6 | any specific backend, but it provides a way to integrate with them. The following backends are supported:
 7 | 
 8 | ## Qdrant
 9 | 
10 | Qdrant is a high-performance vector search engine written in Rust.
11 | 
12 | ::: django_semantic_search.backends.qdrant.QdrantBackend
13 |     options:
14 |         members: false
15 | 


--------------------------------------------------------------------------------
/docs/api/documents.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: API Reference
 3 | ---
 4 | 
 5 | `django-semantic-search` was designed to mimic some of the patterns used in popular Django libraries, such as
 6 | `django-import-export`, to reduce the learning curve for new users.
 7 | 
 8 | The base concept of the library is a `Document` subclass that represents a single searchable entity. The library
 9 | provides a way to define a document class for a selected model. The document class is responsible for converting
10 | the model instances into the vector representation and storing them in the vector search engine, as well as for
11 | performing the search queries.
12 | 
13 | ## Documents
14 | 
15 | ::: django_semantic_search.Document
16 |     options:
17 |         members: false
18 | 


--------------------------------------------------------------------------------
/docs/api/embeddings.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Embedding models
  3 | ---
  4 | 
  5 | An embedding model is a tool that converts text data into a vector representation. The quality of the embedding model
  6 | is crucial for the quality of the search results. You can configure multiple embedding models in your Django settings
  7 | and use them for different fields in your documents.
  8 | 
  9 | ## Configuration
 10 | 
 11 | ### Default Embedding Model
 12 | 
 13 | Configure the default embedding model that will be used when no specific model is specified:
 14 | 
 15 | ```python title="settings.py"
 16 | SEMANTIC_SEARCH = {
 17 |     "default_embeddings": {
 18 |         "model": "django_semantic_search.embeddings.SentenceTransformerModel",
 19 |         "configuration": {
 20 |             "model_name": "sentence-transformers/all-MiniLM-L6-v2",
 21 |         },
 22 |     },
 23 | }
 24 | ```
 25 | 
 26 | ### Named Embedding Models
 27 | 
 28 | You can define multiple named embedding models to use for different fields:
 29 | 
 30 | ```python title="settings.py"
 31 | SEMANTIC_SEARCH = {
 32 |     "embedding_models": {
 33 |         "title_model": {
 34 |             "model": "django_semantic_search.embeddings.SentenceTransformerModel",
 35 |             "configuration": {
 36 |                 "model_name": "sentence-transformers/all-mpnet-base-v2",
 37 |                 "document_prompt": "Title: ",
 38 |             },
 39 |         },
 40 |         "content_model": {
 41 |             "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel",
 42 |             "configuration": {
 43 |                 "model": "text-embedding-3-small",
 44 |             },
 45 |         },
 46 |     },
 47 |     ...
 48 | }
 49 | ```
 50 | 
 51 | Then reference these models in your document definitions:
 52 | 
 53 | ```python title="documents.py"
 54 | @register_document
 55 | class BookDocument(Document):
 56 |     class Meta:
 57 |         model = Book
 58 |         indexes = [
 59 |             VectorIndex("title", embedding_model="title_model"),
 60 |             VectorIndex("content", embedding_model="content_model"),
 61 |             VectorIndex("summary"),  # Will use default_embeddings
 62 |         ]
 63 | ```
 64 | 
 65 | Note: Fields without a specified `embedding_model` will use the model defined in `default_embeddings`.
 66 | 
 67 | ## Supported Models
 68 | 
 69 | Currently, `django-semantic-search` supports the following embedding models:
 70 | 
 71 | ### Sentence Transformers
 72 | 
 73 | The [Sentence Transformers](https://www.sbert.net) library provides a way to convert text data into a vector
 74 | representation. There are [over 5,000 pre-trained models
 75 | available](https://huggingface.co/models?library=sentence-transformers), and you can choose the one that fits your needs the
 76 | best.
 77 | 
 78 | One of the available models is `all-MiniLM-L6-v2`, which is a lightweight model that provides a good balance between the
 79 | quality of the search results and the resource consumption.
 80 | 
 81 | ::: django_semantic_search.embeddings.SentenceTransformerModel
 82 |     options:
 83 |         members:
 84 |             - __init__
 85 |             - embed_document
 86 |             - embed_query
 87 |             - vector_size
 88 | 
 89 | ### OpenAI
 90 | 
 91 | [OpenAI](https://platform.openai.com/docs/guides/embeddings) provides powerful embedding models through their API. The default model is `text-embedding-3-small`, which
 92 | offers a good balance between quality and cost.
 93 | 
 94 | To use OpenAI embeddings, first install the required dependencies:
 95 | 
 96 | ```bash
 97 | pip install django-semantic-search[openai]
 98 | ```
 99 | 
100 | Then configure it in your Django settings:
101 | 
102 | ```python title="settings.py"
103 | SEMANTIC_SEARCH = {
104 |     "default_embeddings": {
105 |         "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel",
106 |         "configuration": {
107 |             "model": "text-embedding-3-small",
108 |             "api_key": "your-api-key",  # Optional if set in env
109 |         },
110 |     },
111 |     ...
112 | }
113 | ```
114 | 
115 | The API key can also be provided through the `OPENAI_API_KEY` environment variable.
116 | 
117 | ::: django_semantic_search.embeddings.OpenAIEmbeddingModel
118 |     options:
119 |         members:
120 |             - __init__
121 |             - embed_document
122 |             - embed_query
123 |             - vector_size
124 | 
125 | ### FastEmbed
126 | 
127 | [FastEmbed](https://github.com/qdrant/fastembed) is a lightweight and efficient embedding library that supports both
128 | dense and sparse embeddings. It provides fast, accurate embeddings suitable for production use.
129 | 
130 | #### Installation
131 | 
132 | To use FastEmbed embeddings, install the required dependencies:
133 | 
134 | ```bash
135 | pip install django-semantic-search[fastembed]
136 | ```
137 | 
138 | #### Dense Embeddings
139 | 
140 | For dense embeddings, configure FastEmbed in your Django settings:
141 | 
142 | ```python title="settings.py"
143 | SEMANTIC_SEARCH = {
144 |     "default_embeddings": {
145 |         "model": "django_semantic_search.embeddings.FastEmbedDenseModel",
146 |         "configuration": {
147 |             "model_name": "BAAI/bge-small-en-v1.5",
148 |         },
149 |     },
150 |     ...
151 | }
152 | ```
153 | 
154 | ::: django_semantic_search.embeddings.FastEmbedDenseModel
155 |     options:
156 |         members:
157 |             - __init__
158 |             - embed_document
159 |             - embed_query
160 |             - vector_size
161 | 
162 | #### Sparse Embeddings (Coming Soon)
163 | 
164 | > **Note:** Sparse embeddings support is currently under development and not yet available for use in
165 | > django-semantic-search. This feature will be available in a future release.
166 | 
167 | While FastEmbed supports sparse embeddings (like BM25), the integration with django-semantic-search is still in
168 | progress.
169 | 


--------------------------------------------------------------------------------
/docs/assets/favicon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/docs/assets/favicon.png


--------------------------------------------------------------------------------
/docs/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/docs/assets/logo.png


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Django semantic search
  3 | ---
  4 | 
  5 | # django-semantic-search
  6 | 
  7 | [![Latest PyPI version](https://img.shields.io/pypi/v/django-semantic-search.svg?style=flat-square)](https://pypi.python.org/pypi/django-semantic-search/)
  8 | [![GitHub License](https://img.shields.io/github/license/kacperlukawski/django-semantic-search)](https://github.com/kacperlukawski/django-semantic-search/blob/HEAD/LICENSE)
  9 | 
 10 | !!! Note ""
 11 |     Bringing semantic search to Django. Integrates seamlessly with Django ORM.
 12 | 
 13 | Django's built-in search capabilities are rather limited. Finding a relevant instance of a model relies on the relational
 14 | database's search capabilities, like SQL `LIKE` queries. This is not ideal for high-quality search results. This library
 15 | aims to provide a semantic search capability to Django, allowing for more relevant search results. All this is done in
 16 | a Django-friendly way, integrating with Django ORM.
 17 | 
 18 | The library does not aim to provide all the features of search engines, but rather to provide a simple way to integrate
 19 | Django applications with semantic search capabilities, using existing vector search engines, a.k.a. vector databases,
 20 | and embedding models.
 21 | 
 22 | ## Installation
 23 | 
 24 | The `django-semantic-search` library can be installed via your favorite package manager. For example, using `pip`:
 25 | 
 26 | ```shell
 27 | pip install django-semantic-search
 28 | ```
 29 | 
 30 | The current version is still experimental, and the API may change in the future.
 31 | 
 32 | ## Supported tools
 33 | 
 34 | `django-semantic-search` has to cooperate with other tools to provide semantic search capabilities. You have to choose
 35 | a vector search engine and an embedding model to use with the library, and configure them in the Django settings.
 36 | 
 37 | ### Vector search engines
 38 | 
 39 | The library supports the following vector search engines:
 40 | 
 41 | - [Qdrant](api/backends.md#qdrant)
 42 | 
 43 | If you would like to contribute support for another vector search engine, feel free to create a pull request.
 44 | 
 45 | ### Embedding models
 46 | 
 47 | Choosing the right embedding model is crucial for the quality of the search results. The current version of the library
 48 | focuses on bringing the semantic search capabilities to Django, and provides integrations with the following vector embedding models:
 49 | 
 50 | - [Sentence Transformers](api/embeddings.md#sentence-transformers)
 51 | - [OpenAI](api/embeddings.md#openai)
 52 | - [FastEmbed](api/embeddings.md#fastembed) (currently supports dense embeddings, sparse embeddings coming soon)
 53 | 
 54 | In web-based applications, it makes a lot of sense to choose an external service for the embedding model, as it can be
 55 | resource-intensive. Please do expect that the library will support more embedding models in the future, and will provide
 56 | a way to integrate them with Django.
 57 | 
 58 | Again, if you would like to contribute support for another embedding model, feel free to create a pull request.
 59 | 
 60 | ## Configuration
 61 | 
 62 | As with any Django application, you need to add the library to the `INSTALLED_APPS` list in the `settings.py` file of
 63 | your project:
 64 | 
 65 | ```python title="settings.py"
 66 | INSTALLED_APPS = [
 67 |     ...,  # external apps, such as Django Rest Framework
 68 |     'django_semantic_search',
 69 |     ...,  # your custom apps, using django-semantic-search
 70 | ]
 71 | ```
 72 | 
 73 | All the library configuration is also done in the `settings.py` file of the project, via the `SEMANTIC_SEARCH`
 74 | dictionary. Here is a full example of the configuration:
 75 | 
 76 | ```python title="settings.py"
 77 | --8<-- "src/django_semantic_search/default_settings.py"
 78 | ```
 79 | 
 80 | ## Quickstart
 81 | 
 82 | If you would like to be guided step-by-step through the installation and configuration process, please refer to the
 83 | [Quickstart](quickstart.md) guide.
 84 | 
 85 | ## Examples
 86 | 
 87 | If you prefer going straight to the code, you can check the `examples` folder. In the future it will contain more
 88 | examples of how to use the library, but for the time being, it contains just a simple Django project with a single
 89 | app that demonstrates how to use the library.
 90 | 
 91 | ### Simple Django App
 92 | 
 93 | The `examples` folder contains a minimal Django `simple_django_app` project using the `django-semantic-search` library.
 94 | It shows how to configure semantic search in a Django project. The application defines a simple model and a document
 95 | class for it, and demonstrates how to search for instances of the model using the library.
 96 | 
 97 | #### Prerequisites
 98 | 
 99 | By default, the `simple_django_app` project uses the `Qdrant` vector search engine and the `all-MiniLM-L6-v2` Sentence
100 | Transformers model. You have to install the `django-semantic-search` library with the `qdrant` and `sentence-transformers`
101 | extras to run the project. The dependencies can be installed from the requirements file:
102 | 
103 | ```shell
104 | pip install -r examples/simple_django_app/requirements.txt
105 | ```
106 | 
107 | The default configuration assumes that the Qdrant service is running on `localhost:6333`. Please refer to the Qdrant
108 | documentation on [how to set up the service](https://qdrant.tech/documentation/quickstart/#download-and-run).
109 | 


--------------------------------------------------------------------------------
/docs/quickstart.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Quickstart
  3 | ---
  4 | 
  5 | This quickstart guide will help you to get started with the `django-semantic-search` library. It will guide you through
  6 | the installation process, the configuration of the vector search engine and the embedding model, and the definition of
  7 | documents for the selected model.
  8 | 
  9 | Assuming you already have a Django project set up, let's get started.
 10 | 
 11 | ## 1. Install django-semantic-search
 12 | 
 13 | The `django-semantic-search` library can be installed via your favorite package manager. For example, using `pip`:
 14 | 
 15 | ```shell
 16 | pip install django-semantic-search
 17 | ```
 18 | 
 19 | The default installation does not include any vector search engine or embedding model, so you typically have to install
 20 | the package with the desired support. For example, to install the package with [Qdrant](https://qdrant.tech) and
 21 | [Sentence Transformers](https://www.sbert.net) support, you can run:
 22 | 
 23 | ```shell
 24 | pip install django-semantic-search[qdrant,sentence-transformers]
 25 | ```
 26 | 
 27 | ## 2. Modify the Django settings
 28 | 
 29 | Add the library to the `INSTALLED_APPS` list in the `settings.py` file of your project:
 30 | 
 31 | ```python title="settings.py"
 32 | INSTALLED_APPS = [
 33 |     ...,  # external apps, such as Django Rest Framework
 34 |     'django_semantic_search',
 35 |     ...,  # your custom apps, using django-semantic-search
 36 | ]
 37 | ```
 38 | 
 39 | ## 3. Choose the vector search engine and the embedding model
 40 | 
 41 | Do not close the `settings.py` file yet. You need to configure the vector search engine and the embedding model. Add the
 42 | `SEMANTIC_SEARCH` dictionary to the `settings.py` file of the project. Here is a basic example:
 43 | 
 44 | ```python title="settings.py"
 45 | SEMANTIC_SEARCH = {
 46 |     "vector_store": {
 47 |         "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
 48 |         "configuration": {
 49 |             "location": "http://localhost:6333",
 50 |         },
 51 |     },
 52 |     "default_embeddings": {
 53 |         "model": "django_semantic_search.embeddings.SentenceTransformerModel",
 54 |         "configuration": {
 55 |             "model_name": "sentence-transformers/all-MiniLM-L6-v2",
 56 |         },
 57 |     },
 58 | }
 59 | ```
 60 | 
 61 | For more advanced configurations, including using different embedding models for different fields, see the [Embedding Models](api/embeddings.md) documentation.
 62 | 
 63 | ## 4. Create a model class (skip if you already have one)
 64 | 
 65 | Our example will use a simple model class, `Book`, with the `title`, `author`, and `description` fields. Here is the
 66 | model definition:
 67 | 
 68 | ```python title="books/models.py"
 69 | from django.db import models
 70 | 
 71 | class Book(models.Model):
 72 |     title = models.CharField(max_length=255)
 73 |     author = models.CharField(max_length=255)
 74 |     description = models.TextField()
 75 | ```
 76 | 
 77 | A newly created model means we need to create a migration and apply it to the database:
 78 | 
 79 | ```shell
 80 | python manage.py makemigrations
 81 | python manage.py migrate
 82 | ```
 83 | 
 84 | ## 5. Define document class for the selected model
 85 | 
 86 | Once the model is defined, you need to create a document class that inherits from `django_semantic_search.Document`.
 87 | 
 88 | Assuming we have a `Book` model with the `title`, `author`, and `description` fields, here is an example of a document
 89 | class for the `Book` model, with the `title` and `description` fields defined as searchable. Please do not forget to
 90 | use the `register_document` decorator to register the document class with the library.
 91 | 
 92 | ```python title="books/documents.py"
 93 | from django_semantic_search import Document, VectorIndex, register_document
 94 | from books.models import Book
 95 | 
 96 | @register_document
 97 | class BookDocument(Document):
 98 |     class Meta:
 99 |         model = Book
100 |         indexes = [
101 |             VectorIndex("title"),
102 |             VectorIndex("description"),
103 |         ]
104 | ```
105 | 
106 | Currently, only single fields can be used for the vector index.
107 | 
108 | The decorator `register_document` takes care of creating the signals for the model, so all the created/updated/deleted
109 | instances of the model will be automatically indexed in the vector search engine.
110 | 
111 | ## 6. Create and store the instances of the model
112 | 
113 | From now on, whenever you create or update an instance of the `Book` model, the instance will be automatically indexed
114 | in the vector search engine. Here is an example of creating a new instance of the `Book` model:
115 | 
116 | ```python title="books/views.py"
117 | from books.models import Book
118 | 
119 | def create_book(request):
120 |     book = Book.objects.create(
121 |         title="The Lord of the Rings",
122 |         author="J.R.R. Tolkien",
123 |         description="The Lord of the Rings is an epic high-fantasy novel by the English author and scholar J. R. R. Tolkien."
124 |     )
125 |     return book
126 | ```
127 | 
128 | The `create_book` function creates a new instance of the `Book` model with the title, author, and description fields
129 | filled in. The instance is then returned. Under the hood, a corresponding document is created and indexed in the vector
130 | search engine. It ignores the `author` field, as it is not defined as a searchable field in the `BookDocument` class.
131 | 
132 | ## 7. Search for the instances of the model
133 | 
134 | The `BookDocument` class serves as a bridge between the Django model and the vector search engine. You can use the
135 | `search` method to find the most relevant instances of the model. Here is an example of searching for the instances of
136 | the `Book` model:
137 | 
138 | ```python title="books/views.py"
139 | from books.documents import BookDocument
140 | 
141 | results = BookDocument.objects.search(title=query)
142 | ```
143 | 
144 | We specifically chose the `title` field to search for the instances of the `Book` model. The `search` method returns a
145 | queryset of the most relevant instances of the model, based on the search query. Alternatively, you can search for the
146 | instances using the `description` field:
147 | 
148 | ```python title="books/views.py"
149 | results = BookDocument.objects.search(description=query)
150 | ```
151 | 
152 | Currently, only a single field can be used for the search query, but we plan to extend this functionality in the future.
153 | 
154 | !!!Info
155 |     This tutorial covers the happy path of using the `django-semantic-search` library. If you encounter any issues or
156 |     have any questions, feel free to create an issue in the project's repository. Please make sure to check the list of
157 |     [Frequently Asked Questions](usage.md#frequently-asked-questions) before creating a new issue.
158 | 


--------------------------------------------------------------------------------
/docs/usage.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Usage"
  3 | ---
  4 | 
  5 | This section focuses on specific usage examples of the `django-semantic-search` library. If you are looking for
  6 | a step-by-step introduction, please refer to the [Quickstart](quickstart.md) guide.
  7 | 
  8 | ## Configuration
  9 | 
 10 | As with any Django application, you need to add the library to the `INSTALLED_APPS` list in the `settings.py` file of
 11 | your project:
 12 | 
 13 | ```python title="settings.py"
 14 | INSTALLED_APPS = [
 15 |     ...,  # external apps, such as Django Rest Framework
 16 |     'django_semantic_search',
 17 |     ...,  # your custom apps, using django-semantic-search
 18 | ]
 19 | ```
 20 | 
 21 | All the library configuration is done in the `settings.py` file of the project, via the `SEMANTIC_SEARCH`
 22 | dictionary. Here is a full example of the configuration:
 23 | 
 24 | ```python title="settings.py"
 25 | --8<-- "src/django_semantic_search/default_settings.py"
 26 | ```
 27 | 
 28 | ### Using Different Embedding Models
 29 | 
 30 | You can define multiple embedding models in the settings and use them for different fields in your documents:
 31 | 
 32 | ```python title="settings.py"
 33 | SEMANTIC_SEARCH = {
 34 |     "default_embeddings": {
 35 |         "model": "django_semantic_search.embeddings.SentenceTransformerModel",
 36 |         "configuration": {
 37 |             "model_name": "sentence-transformers/all-MiniLM-L6-v2",
 38 |         },
 39 |     },
 40 |     "embedding_models": {
 41 |         "title_model": {
 42 |             "model": "django_semantic_search.embeddings.SentenceTransformerModel",
 43 |             "configuration": {
 44 |                 "model_name": "sentence-transformers/all-mpnet-base-v2",
 45 |                 "document_prompt": "Title: ",
 46 |             },
 47 |         },
 48 |         "content_model": {
 49 |             "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel",
 50 |             "configuration": {
 51 |                 "model": "text-embedding-3-small",
 52 |             },
 53 |         },
 54 |     }
 55 | }
 56 | ```
 57 | 
 58 | Then reference these models in your document definitions:
 59 | 
 60 | ```python title="books/documents.py"
 61 | @register_document
 62 | class BookDocument(Document):
 63 |     class Meta:
 64 |         model = Book
 65 |         indexes = [
 66 |             VectorIndex("title", embedding_model="title_model"),  # Uses title_model
 67 |             VectorIndex("content", embedding_model="content_model"),  # Uses content_model
 68 |             VectorIndex("description"),  # Uses default_embeddings
 69 |         ]
 70 | ```
 71 | 
 72 | If no specific embedding model is specified for a `VectorIndex`, it will use the model defined in `default_embeddings`.
 73 | 
 74 | ## Frequently Asked Questions
 75 | 
 76 | This section describes some common questions and answers related to the `django-semantic-search` library.
 77 | 
 78 | ### How to define which fields are searchable?
 79 | 
 80 | To define the search fields for a model, you need to create a document class that inherits from
 81 | `django_semantic_search.Document`. There is no strict requirement for the document class to be put in a specific
 82 | package, but it is recommended to put it in the `documents.py` file in the app package.
 83 | 
 84 | Assuming we have a `Book` model with the `title`, `author`, and `description` fields:
 85 | 
 86 | ```python title="books/models.py"
 87 | from django.db import models
 88 | 
 89 | class Book(models.Model):
 90 |     title = models.CharField(max_length=255)
 91 |     author = models.CharField(max_length=255)
 92 |     description = models.TextField()
 93 | ```
 94 | 
 95 | Here is an example of a document class for the `Book` model, with the `title` and `description` fields defined as
 96 | searchable:
 97 | 
 98 | ```python title="books/documents.py"
 99 | from django_semantic_search import Document, VectorIndex
100 | from books.models import Book
101 | 
102 | class BookDocument(Document):
103 |     class Meta:
104 |         model = Book
105 |         indexes = [
106 |             VectorIndex("title"),
107 |             VectorIndex("description"),
108 |         ]
109 | ```
110 | 
111 | Since no `embedding_model` is specified for these indexes, the default embedding model is used for all the fields.
112 | 
113 | ### How to search for documents?
114 | 
115 | To search for documents, you can use the `search` method of the document class. The method returns a Django queryset
116 | with the search results.
117 | 
118 | Here is an example of searching for books with the title containing the word "Django":
119 | 
120 | ```python title="books/views.py"
121 | from books.documents import BookDocument
122 | 
123 | def search_books(request):
124 |     query = "Django"
125 |     books = BookDocument.objects.search(title=query)
126 |     return render(request, "books/search_results.html", {"books": books})
127 | ```
128 | 
129 | Using the named arguments in the `search` method allows you to search for documents with specific fields.
130 | 
131 | ### How to index the existing data?
132 | 
133 | If you are adding the `django-semantic-search` library to an existing project, you may want to index the existing
134 | instances of the models. To do this, you can use the `index` method of the document class.
135 | 
136 | Here is an example of indexing all the existing instances of the `Book` model:
137 | 
138 | ```python title="index_models.py"
139 | from books.models import Book
140 | from books.documents import BookDocument
141 | 
142 | def index_books(request):
143 |     all_books = Book.objects.all()
144 |     BookDocument.objects.index(all_books)
145 |     return HttpResponse("Books indexed successfully.")
146 | ```
147 | 
148 | !!!Warning
149 |     Indexing all the instances of the model can be resource-intensive, as each instance of the model has to be converted
150 |     to the vector representation. It is recommended to run the indexing process in a background task or a separate
151 |     management command.
152 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Django's command-line utility for administrative tasks."""
 3 | 
 4 | import os
 5 | import sys
 6 | 
 7 | 
 8 | def main():
 9 |     """Run administrative tasks for the ``simple_django_app`` project."""
10 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_django_app.settings")  # an already-set value is kept
11 |     try:  # import here so that a missing Django produces the explanatory error below
12 |         from django.core.management import execute_from_command_line
13 |     except ImportError as exc:
14 |         raise ImportError(
15 |             "Couldn't import Django. Are you sure it's installed and "
16 |             "available on your PYTHONPATH environment variable? Did you "
17 |             "forget to activate a virtual environment?"
18 |         ) from exc
19 |     execute_from_command_line(sys.argv)  # hand the command-line arguments over to Django
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     main()
24 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/examples/simple_django_app/products/__init__.py


--------------------------------------------------------------------------------
/examples/simple_django_app/products/admin.py:
--------------------------------------------------------------------------------
 1 | from django.contrib import admin
 2 | from products.models import Product
 3 | 
 4 | 
 5 | class ProductAdmin(admin.ModelAdmin):
 6 |     pass  # no customisation: Product uses the stock ModelAdmin behaviour
 7 | 
 8 | 
 9 | admin.site.register(Product, ProductAdmin)
10 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 | 
3 | 
4 | class ProductsConfig(AppConfig):
5 |     default_auto_field = "django.db.models.BigAutoField"  # type of the implicit primary key
6 |     name = "products"  # dotted Python path of the application package
7 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/documents.py:
--------------------------------------------------------------------------------
 1 | import django_semantic_search as dss
 2 | 
 3 | from .models import Product
 4 | 
 5 | 
 6 | @dss.register_document
 7 | class ProductDocument(dss.Document):
 8 |     """
 9 |     Maps the Product model to a semantic search document with two vector indexes: description and name.
10 |     """
11 | 
12 |     class Meta:
13 |         model = Product
14 |         indexes = [
15 |             # One vector index is created for the description field
16 |             dss.VectorIndex("description"),
17 |             # Another vector index is created just for the name field
18 |             dss.VectorIndex("name"),
19 |         ]
20 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/migrations/0001_initial.py:
--------------------------------------------------------------------------------
 1 | # Generated by Django 5.1 on 2024-08-28 11:15
 2 | 
 3 | from django.db import migrations, models
 4 | 
 5 | 
 6 | class Migration(migrations.Migration):
 7 |     initial = True  # marks this as the app's initial migration
 8 | 
 9 |     dependencies = []  # no other migration has to be applied first
10 | 
11 |     operations = [
12 |         migrations.CreateModel(
13 |             name="Product",
14 |             fields=[
15 |                 (
16 |                     "id",
17 |                     models.BigAutoField(
18 |                         auto_created=True,
19 |                         primary_key=True,
20 |                         serialize=False,
21 |                         verbose_name="ID",
22 |                     ),
23 |                 ),
24 |                 ("name", models.CharField(max_length=255)),
25 |                 ("description", models.TextField()),
26 |                 ("thumbnail", models.URLField(blank=True, null=True)),
27 |                 ("price", models.DecimalField(decimal_places=2, max_digits=10)),
28 |                 ("created_at", models.DateTimeField(auto_now_add=True)),
29 |                 ("updated_at", models.DateTimeField(auto_now=True)),
30 |             ],
31 |         ),
32 |     ]
33 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/examples/simple_django_app/products/migrations/__init__.py


--------------------------------------------------------------------------------
/examples/simple_django_app/products/models.py:
--------------------------------------------------------------------------------
 1 | from django.db import models
 2 | 
 3 | 
 4 | class Product(models.Model):
 5 |     """
 6 |     Model to store the product information: name, description, thumbnail URL, price and timestamps.
 7 |     """
 8 | 
 9 |     name = models.CharField(max_length=255)
10 |     description = models.TextField()
11 |     thumbnail = models.URLField(blank=True, null=True)  # optional
12 |     price = models.DecimalField(max_digits=10, decimal_places=2)
13 |     created_at = models.DateTimeField(auto_now_add=True)  # set once, when the row is created
14 |     updated_at = models.DateTimeField(auto_now=True)  # refreshed on every save
15 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/tests.py:
--------------------------------------------------------------------------------
1 | # Create your tests here.
2 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/products/views.py:
--------------------------------------------------------------------------------
 1 | from django.http import JsonResponse
 2 | from products.documents import ProductDocument
 3 | 
 4 | 
 5 | def index(request):
 6 |     """
 7 |     Search products by name and, separately, by description, using the user's query.
 8 |     :param request: request object; the optional ``query`` GET parameter holds the search text.
 9 |     :return: JSON response with the ``message``, ``name_results`` and ``description_results`` keys.
10 |     """
11 |     user_query = request.GET.get("query", "hello, world!")  # sample query when none is given
12 |     name_results = ProductDocument.objects.search(name=user_query)
13 |     description_results = ProductDocument.objects.search(description=user_query)
14 |     return JsonResponse(
15 |         {
16 |             "message": "Hello, world!",
17 |             "name_results": list(name_results.values()),
18 |             "description_results": list(description_results.values()),
19 |         }
20 |     )
21 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/requirements.txt:
--------------------------------------------------------------------------------
1 | django
2 | django-semantic-search[qdrant,sentence-transformers]
3 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/simple_django_app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/examples/simple_django_app/simple_django_app/__init__.py


--------------------------------------------------------------------------------
/examples/simple_django_app/simple_django_app/asgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | ASGI config for simple_django_app project.
 3 | 
 4 | It exposes the ASGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/5.1/howto/deployment/asgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.asgi import get_asgi_application
13 | 
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_django_app.settings")
15 | 
16 | application = get_asgi_application()
17 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/simple_django_app/settings.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Django settings for simple_django_app project.
  3 | 
  4 | Generated by 'django-admin startproject' using Django 5.1.
  5 | 
  6 | For more information on this file, see
  7 | https://docs.djangoproject.com/en/5.1/topics/settings/
  8 | 
  9 | For the full list of settings and their values, see
 10 | https://docs.djangoproject.com/en/5.1/ref/settings/
 11 | """
 12 | 
 13 | from pathlib import Path
 14 | 
 15 | # Build paths inside the project like this: BASE_DIR / 'subdir'.
 16 | BASE_DIR = Path(__file__).resolve().parent.parent
 17 | 
 18 | 
 19 | # Quick-start development settings - unsuitable for production
 20 | # See https://docs.djangoproject.com/en/5.1/howto/deployment/checklist/
 21 | 
 22 | # SECURITY WARNING: keep the secret key used in production secret!
 23 | SECRET_KEY = "django-insecure-kw=7=8-o6j8*s=qar$r951i^7s*@_+%e0!ccz0_#ye7%d)&o(j"
 24 | 
 25 | # SECURITY WARNING: don't run with debug turned on in production!
 26 | DEBUG = True
 27 | 
 28 | ALLOWED_HOSTS = []
 29 | 
 30 | 
 31 | # Application definition
 32 | 
 33 | INSTALLED_APPS = [
 34 |     "django.contrib.admin",
 35 |     "django.contrib.auth",
 36 |     "django.contrib.contenttypes",
 37 |     "django.contrib.sessions",
 38 |     "django.contrib.messages",
 39 |     "django.contrib.staticfiles",
 40 |     "django_semantic_search",  # Make sure it's before all the apps that use it.
 41 |     "products",
 42 | ]
 43 | 
 44 | MIDDLEWARE = [
 45 |     "django.middleware.security.SecurityMiddleware",
 46 |     "django.contrib.sessions.middleware.SessionMiddleware",
 47 |     "django.middleware.common.CommonMiddleware",
 48 |     "django.middleware.csrf.CsrfViewMiddleware",
 49 |     "django.contrib.auth.middleware.AuthenticationMiddleware",
 50 |     "django.contrib.messages.middleware.MessageMiddleware",
 51 |     "django.middleware.clickjacking.XFrameOptionsMiddleware",
 52 | ]
 53 | 
 54 | ROOT_URLCONF = "simple_django_app.urls"
 55 | 
 56 | TEMPLATES = [
 57 |     {
 58 |         "BACKEND": "django.template.backends.django.DjangoTemplates",
 59 |         "DIRS": [],
 60 |         "APP_DIRS": True,
 61 |         "OPTIONS": {
 62 |             "context_processors": [
 63 |                 "django.template.context_processors.debug",
 64 |                 "django.template.context_processors.request",
 65 |                 "django.contrib.auth.context_processors.auth",
 66 |                 "django.contrib.messages.context_processors.messages",
 67 |             ],
 68 |         },
 69 |     },
 70 | ]
 71 | 
 72 | WSGI_APPLICATION = "simple_django_app.wsgi.application"
 73 | 
 74 | 
 75 | # Database
 76 | # https://docs.djangoproject.com/en/5.1/ref/settings/#databases
 77 | 
 78 | DATABASES = {
 79 |     "default": {
 80 |         "ENGINE": "django.db.backends.sqlite3",
 81 |         "NAME": BASE_DIR / "db.sqlite3",
 82 |     }
 83 | }
 84 | 
 85 | 
 86 | # Password validation
 87 | # https://docs.djangoproject.com/en/5.1/ref/settings/#auth-password-validators
 88 | 
 89 | AUTH_PASSWORD_VALIDATORS = [
 90 |     {
 91 |         "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
 92 |     },
 93 |     {
 94 |         "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
 95 |     },
 96 |     {
 97 |         "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
 98 |     },
 99 |     {
100 |         "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
101 |     },
102 | ]
103 | 
104 | 
105 | # Internationalization
106 | # https://docs.djangoproject.com/en/5.1/topics/i18n/
107 | 
108 | LANGUAGE_CODE = "en-us"
109 | 
110 | TIME_ZONE = "UTC"
111 | 
112 | USE_I18N = True
113 | 
114 | USE_TZ = True
115 | 
116 | 
117 | # Static files (CSS, JavaScript, Images)
118 | # https://docs.djangoproject.com/en/5.1/howto/static-files/
119 | 
120 | STATIC_URL = "static/"
121 | 
122 | # Default primary key field type
123 | # https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
124 | 
125 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
126 | 
127 | # Semantic search settings: a Qdrant vector store at localhost:6333 and a
128 | # Sentence Transformers model as the default embedding model.
129 | SEMANTIC_SEARCH = {
130 |     "vector_store": {
131 |         "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
132 |         "configuration": {
133 |             "location": "http://localhost:6333",
134 |         },
135 |     },
136 |     "default_embeddings": {
137 |         "model": "django_semantic_search.embeddings.SentenceTransformerModel",
138 |         "configuration": {
139 |             "model_name": "sentence-transformers/all-MiniLM-L6-v2",
140 |         },
141 |     },
142 | }
143 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/simple_django_app/urls.py:
--------------------------------------------------------------------------------
 1 | """
 2 | URL configuration for simple_django_app project.
 3 | 
 4 | The `urlpatterns` list routes URLs to views. For more information please see:
 5 |     https://docs.djangoproject.com/en/5.1/topics/http/urls/
 6 | Examples:
 7 | Function views
 8 |     1. Add an import:  from my_app import views
 9 |     2. Add a URL to urlpatterns:  path('', views.home, name='home')
10 | Class-based views
11 |     1. Add an import:  from other_app.views import Home
12 |     2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
13 | Including another URLconf
14 |     1. Import the include() function: from django.urls import include, path
15 |     2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
16 | """
17 | 
18 | from django.contrib import admin
19 | from django.urls import path
20 | from products import views as product_views
21 | 
22 | urlpatterns = [
23 |     path("", product_views.index, name="index"),  # site root -> products index view
24 |     path("admin/", admin.site.urls),  # Django admin site
25 | ]
26 | 


--------------------------------------------------------------------------------
/examples/simple_django_app/simple_django_app/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for simple_django_app project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/5.1/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.wsgi import get_wsgi_application
13 | # Fallback only: a DJANGO_SETTINGS_MODULE that is already set stays untouched.
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "simple_django_app.settings")
15 | 
16 | application = get_wsgi_application()
17 | 


--------------------------------------------------------------------------------
/mkdocs.yaml:
--------------------------------------------------------------------------------
 1 | site_name: django-semantic-search
 2 | site_url: https://kacperlukawski.github.io/django-semantic-search/
 3 | site_description: Bringing semantic search to Django. Integrates seamlessly with Django ORM.
 4 | repo_url: https://github.com/kacperlukawski/django-semantic-search
 5 | nav:
 6 |   - Home: index.md
 7 |   - Quickstart: quickstart.md
 8 |   - Usage: usage.md
 9 |   - API Reference:
10 |       - Documents: api/documents.md
11 |       - Backends: api/backends.md
12 |       - Embeddings: api/embeddings.md
13 | theme:
14 |   name: material
15 |   logo: assets/logo.png
16 |   favicon: assets/favicon.png
17 |   palette:
18 |     # Palette toggle for light mode
19 |     - media: "(prefers-color-scheme: light)"
20 |       scheme: default
21 |       primary: orange
22 |       toggle:
23 |         icon: material/brightness-7
24 |         name: Switch to dark mode
25 |     # Palette toggle for dark mode
26 |     - media: "(prefers-color-scheme: dark)"
27 |       scheme: slate
28 |       primary: deep orange
29 |       toggle:
30 |         icon: material/brightness-4
31 |         name: Switch to light mode
32 |   font:
33 |     text: Roboto
34 |     code: Roboto Mono
35 |   features:
36 |     - search.suggest
37 |     - search.highlight
38 |     - toc.integrate
39 |     - navigation.tabs
40 |     - content.code.copy
41 | plugins:
42 |   - search
43 |   - mkdocstrings:
44 |       handlers:
45 |         python:
46 |           options:
47 |             annotations_path: brief
48 |             show_root_heading: true
49 |             show_root_toc_entry: true
50 |             show_symbol_type_heading: true
51 |             heading_level: 3
52 |             docstring_style: sphinx
53 |   - social:
54 |       cards_layout_options:
55 |         font_family: Roboto
56 |         logo: assets/logo.png
57 |         background_color: "#ff6e42"
58 |         color: "white"
59 | markdown_extensions:
60 |   - attr_list
61 |   - admonition
62 |   - md_in_html
63 |   - pymdownx.details
64 |   - pymdownx.highlight:
65 |       anchor_linenums: true
66 |       line_spans: __span
67 |       pygments_lang_class: true
68 |   - pymdownx.inlinehilite
69 |   - pymdownx.snippets
70 |   - pymdownx.superfences
71 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "django-semantic-search"
 3 | version = "0.2.1"
 4 | description = "Bringing semantic search to Django. Integrates seamlessly with Django ORM."
 5 | authors = ["Kacper Łukawski <lukawski.kacper@gmail.com>"]
 6 | license = "Apache-2.0"
 7 | readme = "README.md"
 8 | packages = [
 9 |     { include = "django_semantic_search", from = "src" },
10 | ]
11 | 
12 | [tool.poetry.dependencies]
13 | python = ">=3.10"
14 | django = ">=5.0"
15 | qdrant-client = "^1.11.1"
16 | sentence-transformers = { version = "^4.1.0", optional = true }
17 | torch = [
18 |   {version = "^2.0.0", markers = "sys_platform == 'darwin'", source = "pypi", optional = true},
19 |   {version = "^2.0.0", markers = "sys_platform != 'darwin'", source = "pytorch_cpu", optional = true}
20 | ]
21 | openai = { version = "^1.0.0", optional = true }
22 | fastembed = { version = "^0.6.1", optional = true }
23 | 
24 | [tool.poetry.extras]
25 | qdrant = ["qdrant-client"]
26 | sentence-transformers = ["sentence-transformers", "torch"]
27 | openai = ["openai"]
28 | fastembed = ["fastembed"]
29 | all = ["qdrant-client", "sentence-transformers", "torch", "openai", "fastembed"]
30 | 
31 | [tool.poetry.group.dev]
32 | optional = true
33 | 
34 | [tool.poetry.group.dev.dependencies]
35 | pre-commit = "^3.8.0"
36 | ruff = "^0.6.2"
37 | pytest = "^8.3.2"
38 | mkdocs = "^1.6.1"
39 | mkdocstrings-python = "^1.11.1"
40 | mkdocs-material = {extras = ["imaging"], version = "^9.5.34"}
41 | 
42 | [[tool.poetry.source]]
43 | name = "pytorch_cpu"
44 | url = "https://download.pytorch.org/whl/cpu"
45 | priority = "explicit"
46 | 
47 | [tool.pytest.ini_options]
48 | minversion = "7.1"
49 | pythonpath = [
50 |     "src/"
51 | ]
52 | testpaths = [
53 |     "tests/"
54 | ]
55 | 
56 | [tool.ruff]
57 | lint.typing-modules = ["cibuildwheel.typing"]
58 | 
59 | [build-system]
60 | requires = ["poetry-core"]
61 | build-backend = "poetry.core.masonry.api"
62 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/__init__.py:
--------------------------------------------------------------------------------
1 | from .decorators import register_document
2 | from .documents import Document, VectorIndex
3 | 
4 | __all__ = [
5 |     "Document",
6 |     "VectorIndex",
7 |     "register_document",
8 | ]
9 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/apps.py:
--------------------------------------------------------------------------------
 1 | from django.apps import AppConfig
 2 | from django.conf import settings
 3 | 
 4 | from django_semantic_search import default_settings
 5 | 
 6 | 
 7 | class DjangoSemanticSearchConfig(AppConfig):
 8 |     name = "django_semantic_search"
 9 |     verbose_name = "Django Semantic Search"
10 | 
11 |     def ready(self):
12 |         # Load the default settings
13 |         for setting in dir(default_settings):
14 |             if setting.isupper() and not hasattr(settings, setting):
15 |                 setattr(settings, setting, getattr(default_settings, setting))
16 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/backends/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kacperlukawski/django-semantic-search/9123670622e55ca66205f1abb2ceae2c91f0bcb9/src/django_semantic_search/backends/__init__.py


--------------------------------------------------------------------------------
/src/django_semantic_search/backends/base.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from typing import List
 3 | 
 4 | from django_semantic_search.backends.types import IndexConfiguration
 5 | from django_semantic_search.documents import Document
 6 | from django_semantic_search.types import DocumentID
 7 | 
 8 | 
 9 | class BaseVectorSearchBackend(abc.ABC):
10 |     """
11 |     Base class for all the vector search backends, such as Qdrant.
12 |     """
13 | 
14 |     def __init__(self, index_configuration: IndexConfiguration):
15 |         self.index_configuration = index_configuration
16 |         self.configure()  # called on construction: subclass state must exist first
17 | 
18 |     @abc.abstractmethod
19 |     def configure(self):
20 |         """
21 |         Configure the indexes for the backend.
22 |         """
23 |         raise NotImplementedError
24 | 
25 |     @abc.abstractmethod
26 |     def search(
27 |         self, vector_name: str, query: List[float], limit: int = 10
28 |     ) -> List[DocumentID]:
29 |         """
30 |         Search for the documents similar to the query vector in the backend.
31 |         :param vector_name: name of the vector index to search in.
32 |         :param query: query vector to compare the stored vectors with.
33 |         :param limit: maximum number of document ids to return.
34 |         :return: ids of the documents found for the query.
35 |         """
36 |         raise NotImplementedError
37 | 
38 |     @abc.abstractmethod
39 |     def save(self, document: Document):
40 |         """
41 |         Save the document in the backend.
42 | 
43 |         The index settings come from ``self.index_configuration``, not from an argument.
44 |         :param document: document to store.
45 |         """
46 |         raise NotImplementedError
47 | 
48 |     @abc.abstractmethod
49 |     def delete(self, document_id: DocumentID):
50 |         """
51 |         Delete the document from the backend. The index settings are taken from
52 |         ``self.index_configuration`` rather than passed in.
53 |         :param document_id: id of the document to delete.
54 |         """
55 |         raise NotImplementedError
56 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/backends/qdrant.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import uuid
  3 | from typing import List
  4 | 
  5 | from django_semantic_search import Document
  6 | from django_semantic_search.backends.base import BaseVectorSearchBackend
  7 | from django_semantic_search.backends.types import Distance, IndexConfiguration
  8 | from django_semantic_search.types import DocumentID
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | class QdrantBackend(BaseVectorSearchBackend):
 14 |     """
 15 |     Backend that integrates with Qdrant vector database.
 16 | 
 17 |     It handles the configuration of separate collections per each model we want to enable search for. Users rarely
 18 |     interact with this backend directly, as backend is usually configured via Django settings.
 19 | 
 20 |     **Requirements**:
 21 | 
 22 |     ```bash
 23 |     pip install django-semantic-search[qdrant]
 24 |     ```
 25 | 
 26 |     **Usage**:
 27 | 
 28 |     ```python title="settings.py"
 29 |     SEMANTIC_SEARCH = {
 30 |         "vector_store": {
 31 |             "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
 32 |             "configuration": {
 33 |                 "host": "http://localhost:6333",
 34 |             },
 35 |         },
 36 |         ...
 37 |     }
 38 |     ```
 39 |     """
 40 | 
 41 |     from qdrant_client import models
 42 | 
 43 |     DISTANCE_MAPPING = {
 44 |         Distance.COSINE: models.Distance.COSINE,
 45 |         Distance.EUCLIDEAN: models.Distance.EUCLID,
 46 |         Distance.DOT_PRODUCT: models.Distance.DOT,
 47 |     }
 48 | 
 49 |     def __init__(self, index_configuration: IndexConfiguration, *args, **kwargs):
 50 |         from qdrant_client import QdrantClient
 51 | 
 52 |         self.client = QdrantClient(*args, **kwargs)
 53 |         super().__init__(index_configuration)
 54 | 
 55 |     def configure(self):
 56 |         from qdrant_client import models
 57 | 
 58 |         try:
 59 |             collection_info = self.client.get_collection(  # noqa
 60 |                 collection_name=self.index_configuration.namespace
 61 |             )
 62 |             # TODO: validate if all the vectors are present and with correct types
 63 |         except Exception:
 64 |             logger.warning(
 65 |                 f"Collection {self.index_configuration.namespace} does not exist. Creating a new one."
 66 |             )
 67 |             self.client.create_collection(
 68 |                 collection_name=self.index_configuration.namespace,
 69 |                 vectors_config={
 70 |                     vector_name: models.VectorParams(
 71 |                         size=vector_config.size,
 72 |                         distance=self.DISTANCE_MAPPING.get(vector_config.distance),
 73 |                     )
 74 |                     for vector_name, vector_config in self.index_configuration.vectors.items()
 75 |                 },
 76 |             )
 77 |             self.client.create_payload_index(
 78 |                 collection_name=self.index_configuration.namespace,
 79 |                 field_name=self.index_configuration.id_field,
 80 |                 field_schema=models.PayloadSchemaType.KEYWORD,
 81 |             )
 82 | 
 83 |     def search(
 84 |         self, vector_name: str, query: List[float], limit: int = 10
 85 |     ) -> List[DocumentID]:
 86 |         results = self.client.query_points(
 87 |             collection_name=self.index_configuration.namespace,
 88 |             query=query,
 89 |             using=vector_name,
 90 |             limit=limit,
 91 |             with_vectors=False,
 92 |             with_payload=True,
 93 |         )
 94 |         return [
 95 |             result.payload.get(self.index_configuration.id_field)
 96 |             for result in results.points
 97 |         ]
 98 | 
 99 |     def save(self, document: Document):
100 |         from qdrant_client import models
101 | 
102 |         vectors = document.vectors()
103 |         payload = {
104 |             self.index_configuration.id_field: document.id,
105 |             **document.metadata(),
106 |         }
107 |         self.client.upsert(
108 |             collection_name=self.index_configuration.namespace,
109 |             points=[
110 |                 models.PointStruct(
111 |                     id=uuid.uuid4().hex,
112 |                     vector=vectors,
113 |                     payload=payload,
114 |                 )
115 |             ],
116 |         )
117 | 
118 |     def delete(self, document_id: DocumentID):
119 |         from qdrant_client import models
120 | 
121 |         self.client.delete(
122 |             collection_name=self.index_configuration.namespace,
123 |             points_selector=models.Filter(
124 |                 must=[
125 |                     models.FieldCondition(
126 |                         key=self.index_configuration.id_field,
127 |                         match=models.MatchValue(
128 |                             value=document_id,
129 |                         ),
130 |                     )
131 |                 ]
132 |             ),
133 |         )
134 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/backends/types.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass, field
 2 | from enum import Enum
 3 | from typing import Dict
 4 | 
 5 | 
 6 | class Distance(str, Enum):  # str mixin: members also behave as plain strings
 7 |     COSINE = "cosine"
 8 |     EUCLIDEAN = "euclidean"
 9 |     DOT_PRODUCT = "dot_product"
10 | 
11 | 
12 | @dataclass(frozen=True, eq=True, slots=True)
13 | class VectorConfiguration:
14 |     size: int  # size of a single embedding stored under this vector
15 |     distance: Distance  # metric used to compare the vectors
16 | 
17 | 
18 | @dataclass(frozen=True, eq=True, slots=True)
19 | class IndexConfiguration:
20 |     """
21 |     Configuration of the indexes to create in the vector store.
22 |     """
23 | 
24 |     # Name of the collection representing a particular entity type
25 |     namespace: str
26 |     # List of indexes to create, along with their configuration
27 |     vectors: Dict[str, VectorConfiguration] = field(default_factory=dict)
28 |     # Name of the property that contains the document id
29 |     id_field: str = "id"
30 | 
31 |     def __hash__(self):
32 |         frozen_vectors = frozenset(sorted(self.vectors.items()))
33 |         return hash(self.namespace) + hash(self.id_field) + hash(frozen_vectors)
34 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/decorators.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from typing import Any, Type
 3 | 
 4 | from django.core.exceptions import ImproperlyConfigured
 5 | from django.db import models
 6 | from django.dispatch import receiver
 7 | 
 8 | from django_semantic_search.documents import Document
 9 | from django_semantic_search.utils import load_backend
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | def register_document(document_cls: Type[Document]) -> Type[Document]:
15 |     """
16 |     Register the document class to be used for the specified model.
17 |     :param document_cls: document class to register
18 |     """
19 |     default_meta = Document.Meta
20 |     meta = getattr(document_cls, "meta", None)
21 |     if meta is None:
22 |         raise ImproperlyConfigured(
23 |             f"Document class {document_cls.__name__} does not have a Meta class."
24 |         )
25 | 
26 |     # Get the model class from the Meta class of the document
27 |     model_cls = getattr(meta, "model", default_meta.model)
28 |     if not model_cls:
29 |         raise ImproperlyConfigured(
30 |             f"Meta class for {document_cls.__name__} does not have a model attribute."
31 |         )
32 | 
33 |     # Validate all the indexes for the document
34 |     indexes = getattr(meta, "indexes", default_meta.indexes)
35 |     for index in indexes:
36 |         index.validate(model_cls)
37 | 
38 |     # Register the model handlers
39 |     register_model_handlers(document_cls)
40 | 
41 |     # Set up the document class to initialize vector store
42 |     index_configuration = document_cls.index_configuration
43 |     backend = load_backend(index_configuration)
44 |     logger.info(
45 |         f"Initializing vector store for {document_cls.meta.model} with backend {backend}"
46 |     )
47 | 
48 |     return document_cls
49 | 
50 | 
51 | def register_model_handlers(document_cls: Type[Document]) -> Type[Document]:
52 |     """
53 |     Register all the model signals to update the documents in the vector store.
54 |     """
55 |     logger.info(f"Registering handlers for {document_cls.meta.model}")
56 | 
57 |     disable_signals = getattr(
58 |         document_cls.meta, "disable_signals", Document.Meta.disable_signals
59 |     )
60 |     if disable_signals:
61 |         logger.warning(
62 |             f"Signals are disabled for {document_cls.meta.model}. Model changes "
63 |             f"will not be reflected in the vector store."
64 |         )
65 |         return document_cls
66 | 
67 |     if hasattr(document_cls.meta, "__signals_registered__"):
68 |         logger.warning(f"Signals are already registered for {document_cls.meta.model}.")
69 |         return document_cls
70 | 
71 |     model = document_cls.meta.model
72 | 
73 |     @receiver(models.signals.post_save, sender=model, weak=False)
74 |     def save_model(
75 |         sender: Type[models.Model], instance: models.Model, created: bool, **kwargs: Any
76 |     ) -> None:
77 |         logger.debug(f"Saving document for {instance}")
78 |         # TODO: detect the changes in the model and determine if the document should be updated
79 | 
80 |         # Create the document instance out of the model instance and save it
81 |         document = document_cls(instance)
82 |         document.save()
83 | 
84 |     @receiver(models.signals.post_delete, sender=model, weak=False)
85 |     def delete_model(
86 |         sender: Type[models.Model], instance: models.Model, **kwargs: Any
87 |     ) -> None:
88 |         logger.debug(f"Deleting document for {instance}")
89 |         # Create the document instance out of the model instance and delete it
90 |         document = document_cls(instance)
91 |         document.delete()
92 | 
93 |     # Mark the signals as registered
94 |     setattr(document_cls.meta, "__signals_registered__", True)
95 | 
96 |     return document_cls
97 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/default_settings.py:
--------------------------------------------------------------------------------
 1 | SEMANTIC_SEARCH = {  # applied only if settings lacks SEMANTIC_SEARCH (no merging)
 2 |     # Vector store is a backend that stores the vectors and provides the search functionality.
 3 |     "vector_store": {
 4 |         # Either the path to the backend class or the class itself
 5 |         "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
 6 |         # Configuration is passed directly to the backend class during initialization.
 7 |         "configuration": {
 8 |             "location": "http://localhost:6333",
 9 |         },
10 |     },
11 |     # Default embeddings are used to generate the embeddings for the documents if no embeddings are provided.
12 |     # This model will be used when no specific embedding_model is specified for a VectorIndex.
13 |     "default_embeddings": {
14 |         # Either the path to the embeddings model class or the class itself
15 |         "model": "django_semantic_search.embeddings.SentenceTransformerModel",
16 |         # Configuration is passed directly to the embeddings model class during initialization.
17 |         "configuration": {
18 |             "model_name": "sentence-transformers/all-MiniLM-L6-v2",
19 |         },
20 |     },
21 |     # Optional named embedding models that can be referenced by VectorIndex instances.
22 |     # This allows using different embedding models for different fields in your documents.
23 |     "embedding_models": {
24 |         # Each key is a unique identifier for the embedding model
25 |         "title_model": {
26 |             # Either the path to the embeddings model class or the class itself
27 |             "model": "django_semantic_search.embeddings.SentenceTransformerModel",
28 |             # Configuration is passed directly to the embeddings model class during initialization.
29 |             "configuration": {
30 |                 "model_name": "sentence-transformers/all-mpnet-base-v2",
31 |                 "document_prompt": "Title: ",
32 |             },
33 |         },
34 |         "content_model": {
35 |             "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel",
36 |             "configuration": {
37 |                 "model": "text-embedding-3-small",
38 |             },
39 |         },
40 |     },
41 | }
42 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/documents.py:
--------------------------------------------------------------------------------
  1 | import abc
  2 | import logging
  3 | from typing import Dict, Generic, Iterable, List, Optional, Type, TypeVar
  4 | 
  5 | from django.db import models
  6 | from django.db.models import QuerySet
  7 | 
  8 | from django_semantic_search.backends.types import (
  9 |     Distance,
 10 |     IndexConfiguration,
 11 |     VectorConfiguration,
 12 | )
 13 | from django_semantic_search.types import DocumentID, MetadataValue, Vector
 14 | 
 15 | logger = logging.getLogger(__name__)
 16 | 
 17 | T = TypeVar("T", bound=models.Model)
 18 | 
 19 | 
 20 | class VectorIndex:
 21 |     """
 22 |     A definition of a single vector index. It contains the name of the index and the fields that should be indexed,
 23 |     but also allows to surpass the default settings of django-semantic-search.
 24 |     """
 25 | 
 26 |     def __init__(
 27 |         self,
 28 |         *fields: str,
 29 |         index_name: Optional[str] = None,
 30 |         distance: Distance = Distance.COSINE,
 31 |         embedding_model: Optional[str] = None,
 32 |     ):
 33 |         """
 34 |         :param fields: model fields to index together.
 35 |         :param index_name: name of the index to use in a backend. By default, it is the concatenation of the fields.
 36 |         :param distance: distance metric to use for similarity search.
 37 |         :param embedding_model: name of the embedding model to use, must be defined in SEMANTIC_SEARCH settings.
 38 |         """
 39 |         # Loading the default embedding model here, as otherwise it would create a circular import
 40 |         from django_semantic_search.utils import load_embedding_model
 41 | 
 42 |         if len(fields) != 1:
 43 |             raise ValueError("Only single field indexes are supported at the moment.")
 44 | 
 45 |         self._fields: List[str] = list(fields)
 46 |         self._index_name = index_name or "_".join(fields)
 47 |         self._distance = distance
 48 |         self._embedding_model = load_embedding_model(embedding_model)
 49 | 
 50 |     def validate(self, model_cls: Type[models.Model]):
 51 |         """
 52 |         Validate the index configuration for the model.
 53 |         :param model_cls: model class to validate the index for.
 54 |         """
 55 |         for field in self._fields:
 56 |             if not hasattr(model_cls, field):
 57 |                 raise ValueError(
 58 |                     f"Field {field} is not present in the model {model_cls.__name__}"
 59 |                 )
 60 | 
 61 |     def is_for_field(self, field: str) -> bool:
 62 |         """
 63 |         Check if the index is for the field.
 64 |         :param field: field to check.
 65 |         :return: True if the index is for the field, False otherwise.
 66 |         """
 67 |         return field in self._fields
 68 | 
 69 |     @property
 70 |     def index_name(self) -> str:
 71 |         """
 72 |         Return the name of the index.
 73 |         :return: index name.
 74 |         """
 75 |         return self._index_name
 76 | 
 77 |     @property
 78 |     def distance(self) -> Distance:
 79 |         """
 80 |         Return the distance metric to use for the index.
 81 |         :return: distance metric.
 82 |         """
 83 |         return self._distance
 84 | 
 85 |     @property
 86 |     def vector_size(self) -> int:
 87 |         """
 88 |         Return the size of the individual embedding.
 89 |         :return: size of the embedding.
 90 |         """
 91 |         return self._embedding_model.vector_size()
 92 | 
 93 |     def get_model_embedding(self, instance: models.Model) -> Vector:
 94 |         """
 95 |         Get the embedding for the instance.
 96 |         :param instance: model instance to get the embedding for.
 97 |         :return: embedding for the instance.
 98 |         """
 99 |         return self._embedding_model.embed_document(
100 |             " ".join(getattr(instance, field) for field in self._fields)
101 |         )
102 | 
103 |     def get_query_embedding(self, query: str) -> Vector:
104 |         """
105 |         Get the embedding for the query.
106 |         :param query: query to get the embedding for.
107 |         :return: embedding for the query.
108 |         """
109 |         return self._embedding_model.embed_query(query)
110 | 
111 | 
class MetaManager:
    """
    A descriptor to store an instance of the Meta class instance on the document class.

    The instance is created lazily on the first access and cached in the `_meta` attribute of the owner class.
    The cache is looked up in the owner's own namespace only, so a subclass never reuses the Meta instance
    that has already been cached on one of its parent classes.
    """

    def __get__(self, instance: Optional["Document"], owner: Type["Document"]):
        # hasattr() would also find the attribute inherited from a parent class,
        # hence the explicit check of the class' own __dict__.
        if "_meta" not in vars(owner):
            setattr(owner, "_meta", owner.Meta())
        return vars(owner)["_meta"]
121 | 
122 | 
class IndexConfigurationManager:
    """
    A descriptor to store an instance of the IndexConfiguration class instance on the document class. The configuration
    of the index is derived from the Meta class of the document.

    The configuration is built lazily on the first access and cached in the `_index_configuration` attribute of the
    owner class. The cache is looked up in the owner's own namespace only, so a subclass does not reuse the
    configuration cached on one of its parent classes.
    """

    def __get__(
        self, instance: Optional["Document"], owner: Type["Document"]
    ) -> IndexConfiguration:
        # hasattr() would also find the attribute inherited from a parent class,
        # hence the explicit check of the class' own __dict__.
        if "_index_configuration" not in vars(owner):
            attr_meta = owner.meta
            model = getattr(attr_meta, "model", None)
            model_name = model.__name__ if model else None
            # Fall back to the model name also when the namespace is explicitly set to None,
            # which is the default value declared on Document.Meta.
            index_namespace = getattr(attr_meta, "namespace", None) or model_name
            indexes = getattr(attr_meta, "indexes", [])
            config = IndexConfiguration(
                namespace=index_namespace,
                vectors={
                    index.index_name: VectorConfiguration(
                        size=index.vector_size,
                        distance=index.distance,
                    )
                    for index in indexes
                },
            )
            setattr(owner, "_index_configuration", config)
        return vars(owner)["_index_configuration"]
150 | 
151 | 
class BackendManager:
    """
    A descriptor to store an instance of the backend on the document class. The backend is derived from the index
    configuration and is loaded dynamically.

    The backend is loaded lazily on the first access and cached in the `_backend` attribute of the owner class.
    The cache is looked up in the owner's own namespace only, so a subclass does not reuse the backend cached on
    one of its parent classes.
    """

    def __get__(self, instance: Optional["Document"], owner: Type["Document"]):
        # hasattr() would also find the attribute inherited from a parent class,
        # hence the explicit check of the class' own __dict__.
        if "_backend" not in vars(owner):
            # Imported at call time rather than at module import time.
            from django_semantic_search.utils import load_backend

            setattr(owner, "_backend", load_backend(owner.index_configuration))
        return vars(owner)["_backend"]
164 | 
165 | 
class DocumentManager(Generic[T]):
    """
    A descriptor to store an instance of the document manager on the document class. The document manager is used to
    find similar documents in the vector index, but also to perform any other operations on the querysets of the
    model instances.
    """

    def __init__(self, cls: Type["Document"]):
        """
        :param cls: document class this manager operates on.
        """
        self.cls = cls

    def search(
        self,
        limit: int = 10,
        **kwargs,
    ) -> QuerySet[T]:
        """
        Find the model instances similar to the query in the vector index. Exactly one keyword argument is expected:
        its name selects the vector index (by the field it was built for) and its value is the query.
        :param limit: number of results to return.
        :param kwargs: single `field=query` pair selecting the index and providing the query.
        :return: queryset of the model instances, ordered as returned by the backend.
        :raises ValueError: if not exactly one keyword argument is passed, or no index covers the field.
        """
        if len(kwargs) != 1:
            raise ValueError("Only single field indexes are supported at the moment.")

        field_name, field_value = next(iter(kwargs.items()))
        # The default value is required: without it next() raises StopIteration
        # when no index matches and the explicit error below is never reached.
        vector_index = next(
            (
                index
                for index in self.cls.meta.indexes
                if index.is_for_field(field_name)
            ),
            None,
        )
        if vector_index is None:
            raise ValueError(f"No index found for field {field_name}")

        query_embedding = vector_index.get_query_embedding(field_value)
        document_ids = self.cls.backend.search(
            vector_index.index_name, query_embedding, limit=limit
        )
        if not document_ids:
            return self.cls.meta.model.objects.none()

        # Filtering with pk__in does not keep the order of the ids, so the position
        # of each id in the backend response is used as an explicit ordering key.
        preserved_ids = models.Case(
            *[models.When(pk=pk, then=pos) for pos, pk in enumerate(document_ids)]
        )
        queryset = self.cls.meta.model.objects.filter(pk__in=document_ids).order_by(
            preserved_ids
        )
        return queryset

    def index(self, qs: QuerySet[T]):
        """
        Index the queryset of the model instances.
        :param qs: queryset of the model instances to index.
        """
        # TODO: this is the most basic implementation, it should be optimized
        for instance in qs:
            self.cls(instance).save()
222 | 
class DocumentManagerDescriptor(Generic[T]):
    """
    A descriptor to store the document manager on the document class.

    The manager is created lazily on the first access and cached in the `_document_manager` attribute of the owner
    class. The cache is looked up in the owner's own namespace only, so a subclass gets a manager bound to itself
    instead of the one cached on one of its parent classes.
    """

    def __get__(self, instance, owner):
        # hasattr() would also find the attribute inherited from a parent class,
        # hence the explicit check of the class' own __dict__.
        if "_document_manager" not in vars(owner):
            setattr(owner, "_document_manager", DocumentManager[T](owner))
        return vars(owner)["_document_manager"]
232 | 
233 | 
class Document(abc.ABC, Generic[T]):
    """
    Base class for all the documents. There is a one-to-one mapping between the document subclass and the model class,
    to configure how a specific model instances should be converted to a document.

    **Usage**:

    ```python title="products/models.py"
    from django.db import models

    class Product(models.Model):
        name = models.CharField(max_length=255)
        description = models.TextField()

    ```

    ```python title="products/documents.py"
    from django_semantic_search import Document, VectorIndex
    from django_semantic_search.decorators import register_document

    @register_document
    class ProductDocument(Document):
        class Meta:
            model = Product
            indexes = [
                VectorIndex("name"),
                VectorIndex("description"),
            ]
    ```

    `django-semantic-search` will automatically handle all the configuration in the backend. The `register_document`
    decorator will register the model signals to update the documents in the vector store when the model is updated
    or deleted. As a user you don't have to manually call the `save` or `delete` methods on the document instances.

    **Search example:**

    ```python title="products/views.py"
    from django.http import JsonResponse
    from products.documents import ProductDocument

    def my_view(request):
        query = "this is a query"
        results = ProductDocument.objects.search(name=query)
        return JsonResponse(
            {
                "results": list(results.values())
            }
        )
    ```

    The `search` method on the `objects` attribute of the document class will return the queryset of the model
    instances that are similar to the query. The search is performed using the selected vector index passed as a
    keyword argument to the `search` method. In our case, we are searching for the query in the `name` field of the
    `Product` model. If we want to search in the `description` field, we would call
    `ProductDocument.objects.search(description=query)`.
    """

    # Important:
    # The following descriptors have to be defined in the specific order, as they depend on each other
    # and the order of the descriptors is the order in which they are executed.
    meta = MetaManager()
    index_configuration = IndexConfigurationManager()
    backend = BackendManager()
    objects = DocumentManagerDescriptor[T]()

    def __init__(self, instance: T):
        """
        Wrap a model instance in a document.
        :param instance: model instance the document is created for.
        """
        self._instance = instance

    def save(self) -> None:
        """
        Save the document in the vector store.
        :raises ValueError: if the wrapped model instance has no primary key yet.
        """
        if not self._instance.pk:
            raise ValueError(
                "The model instance has to be saved before creating a document."
            )
        self.backend.save(self)

    def delete(self) -> None:
        """
        Delete the document from the vector store.
        :raises ValueError: if the wrapped model instance has no primary key (raised while reading `id`).
        """
        self.backend.delete(self.id)

    @property
    def id(self) -> DocumentID:
        """
        Return the identifier of the document, which is the primary key of the wrapped model instance.
        :return: document identifier.
        :raises ValueError: if the wrapped model instance has no primary key yet.
        """
        if not self._instance.pk:
            raise ValueError(
                "The model instance has to be saved before accessing the ID."
            )
        return self._instance.pk

    def vectors(self) -> Dict[str, Vector]:
        """
        Return the vectors for the document, one per index declared in the Meta class.
        :return: dictionary of the vectors, keyed by the index name.
        """
        return {
            index.index_name: index.get_model_embedding(self._instance)
            for index in self.meta.indexes
        }

    def metadata(self) -> Dict[str, MetadataValue]:
        """
        Return the metadata for the document.
        :return: dictionary of the metadata, keyed by the model field name.
        """
        # Fall back to the defaults of the base Meta class if the document's Meta does not set include_fields.
        include_fields = getattr(
            self.meta, "include_fields", Document.Meta.include_fields
        )
        # The "*" wildcard expands to all the fields of the model.
        if "*" in include_fields:
            include_fields = [field.name for field in self._instance._meta.fields]
        return {field: getattr(self._instance, field) for field in include_fields}

    class Meta:
        # The model this document is associated with
        model: Optional[Type[models.Model]] = None
        # Namespace for the documents in the vector store, defaults to the model name
        namespace: Optional[str] = None
        # List of vector indexes created out of the model fields
        indexes: Iterable[VectorIndex] = []
        # Model fields that should be included in the metadata
        include_fields: List[str] = ["*"]
        # Flag to disable signals on the model, so the documents are not updated on model changes
        disable_signals: bool = False
358 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/embeddings/__init__.py:
--------------------------------------------------------------------------------
 1 | from .fastembed import FastEmbedDenseModel, FastEmbedSparseModel
 2 | from .openai import OpenAIEmbeddingModel
 3 | from .sentence_transformers import SentenceTransformerModel
 4 | 
# Public embedding model classes re-exported from this package.
__all__ = [
    "SentenceTransformerModel",
    "OpenAIEmbeddingModel",
    "FastEmbedDenseModel",
    "FastEmbedSparseModel",
]
11 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/embeddings/base.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | from typing import Protocol
 3 | 
 4 | from django_semantic_search.types import (
 5 |     DenseVector,
 6 |     DocumentContent,
 7 |     Query,
 8 |     SparseVector,
 9 | )
10 | 
11 | 
class EmbeddingModel(Protocol):
    """Protocol defining common interface for all embedding models."""

    def vector_size(self) -> int:
        """
        Return the size of the individual embedding.
        :return: size of the embedding.
        """
        ...

    def supports_document(self, document: DocumentContent) -> bool:
        """
        Check if the embedding model supports the document.
        :param document: document content to check.
        :return: True if the model supports the document, False otherwise.
        """
        ...
22 | 
23 | 
class DenseEmbeddingModel(abc.ABC):
    """Base class for models producing dense vector embeddings."""

    @abc.abstractmethod
    def vector_size(self) -> int:
        """
        Return the fixed size of dense embeddings.
        :return: size of the embedding.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def embed_document(self, document: DocumentContent) -> DenseVector:
        """
        Embed a document into a dense vector.
        :param document: document to embed.
        :return: dense embedding of the document.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def embed_query(self, query: Query) -> DenseVector:
        """
        Embed a query into a dense vector.
        :param query: query to embed.
        :return: dense embedding of the query.
        """
        raise NotImplementedError
41 | 
42 | 
class SparseEmbeddingModel(abc.ABC):
    """Base class for models producing sparse vector embeddings."""

    @abc.abstractmethod
    def embed_document(self, document: DocumentContent) -> SparseVector:
        """
        Embed a document into a sparse vector.
        :param document: document to embed.
        :return: sparse embedding of the document.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def embed_query(self, query: Query) -> SparseVector:
        """
        Embed a query into a sparse vector.
        :param query: query to embed.
        :return: sparse embedding of the query.
        """
        raise NotImplementedError
55 | 
56 | 
class TextEmbeddingMixin:
    """Mixin for text-specific embedding functionality."""

    def supports_document(self, document: DocumentContent) -> bool:
        """
        Check if the document is a text document.
        :param document: document content to check.
        :return: True if the document is a string, False otherwise.
        """
        return isinstance(document, str)
62 | 
63 | 
class DenseTextEmbeddingModel(TextEmbeddingMixin, DenseEmbeddingModel, abc.ABC):
    """
    Base class for dense text embedding models.

    Combines the dense embedding interface with the text-only `supports_document` check of the mixin.
    """

    pass
68 | 
69 | 
class SparseTextEmbeddingModel(TextEmbeddingMixin, SparseEmbeddingModel, abc.ABC):
    """
    Base class for sparse text embedding models.

    Combines the sparse embedding interface with the text-only `supports_document` check of the mixin.
    """

    pass
74 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/embeddings/fastembed.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | 
  3 | from django_semantic_search.embeddings.base import (
  4 |     DenseTextEmbeddingModel,
  5 |     SparseTextEmbeddingModel,
  6 | )
  7 | from django_semantic_search.types import (
  8 |     DenseVector,
  9 |     DocumentContent,
 10 |     Query,
 11 |     SparseVector,
 12 | )
 13 | 
 14 | 
 15 | class FastEmbedDenseModel(DenseTextEmbeddingModel):
 16 |     """
 17 |     FastEmbed dense embedding model that uses the FastEmbed library to generate dense embeddings.
 18 | 
 19 |     **Requirements:**
 20 | 
 21 |     ```shell
 22 |     pip install django-semantic-search[fastembed]
 23 |     ```
 24 | 
 25 |     **Usage:**
 26 | 
 27 |     ```python title="settings.py"
 28 |     SEMANTIC_SEARCH = {
 29 |         "default_embeddings": {
 30 |             "model": "django_semantic_search.embeddings.FastEmbedDenseModel",
 31 |             "configuration": {
 32 |                 "model_name": "BAAI/bge-small-en-v1.5",
 33 |             },
 34 |         },
 35 |         ...
 36 |     }
 37 |     ```
 38 |     """
 39 | 
 40 |     def __init__(
 41 |         self,
 42 |         model_name: str,
 43 |         **kwargs,
 44 |     ):
 45 |         """
 46 |         Initialize the FastEmbed dense model.
 47 | 
 48 |         :param model_name: name of the model to use
 49 |         :param kwargs: additional kwargs passed to FastEmbed
 50 |         """
 51 |         from fastembed import TextEmbedding
 52 | 
 53 |         self._model = TextEmbedding(
 54 |             model_name=model_name,
 55 |             **kwargs,
 56 |         )
 57 |         # Cache the vector size after first call
 58 |         self._vector_size: Optional[int] = None
 59 | 
 60 |     def vector_size(self) -> int:
 61 |         """
 62 |         Return the size of the individual embedding.
 63 |         :return: size of the embedding.
 64 |         """
 65 |         if self._vector_size is None:
 66 |             # Get vector size by embedding a test string
 67 |             vector = next(self._model.embed(["test"]))
 68 |             self._vector_size = len(vector)
 69 |         return self._vector_size
 70 | 
 71 |     def embed_document(self, document: str) -> DenseVector:
 72 |         """
 73 |         Embed a document into a vector.
 74 |         :param document: document to embed.
 75 |         :return: document embedding.
 76 |         """
 77 |         vector = next(self._model.passage_embed([document]))
 78 |         return vector.tolist()
 79 | 
 80 |     def embed_query(self, query: str) -> DenseVector:
 81 |         """
 82 |         Embed a query into a vector.
 83 |         :param query: query to embed.
 84 |         :return: query embedding.
 85 |         """
 86 |         vector = next(self._model.query_embed([query]))
 87 |         return vector.tolist()
 88 | 
 89 | 
 90 | class FastEmbedSparseModel(SparseTextEmbeddingModel):
 91 |     """
 92 |     FastEmbed sparse embedding model that uses the FastEmbed library to generate sparse embeddings.
 93 | 
 94 |     **Requirements:**
 95 | 
 96 |     ```shell
 97 |     pip install django-semantic-search[fastembed]
 98 |     ```
 99 | 
100 |     **Important:** For now, there is no way to use the model in django-semantic-search, but it's on the way.
101 |     """
102 | 
103 |     def __init__(
104 |         self,
105 |         model_name: str,
106 |         **kwargs,
107 |     ):
108 |         """
109 |         Initialize the FastEmbed sparse model.
110 | 
111 |         :param model_name: name of the model to use
112 |         :param kwargs: additional kwargs passed to FastEmbed
113 |         """
114 |         from fastembed import SparseTextEmbedding
115 | 
116 |         self._model = SparseTextEmbedding(
117 |             model_name=model_name,
118 |             **kwargs,
119 |         )
120 | 
121 |     def embed_document(self, document: DocumentContent) -> SparseVector:
122 |         vector = next(self._model.passage_embed([document]))
123 |         return dict(zip(vector.indices.tolist(), vector.values.tolist()))
124 | 
125 |     def embed_query(self, query: Query) -> SparseVector:
126 |         vector = next(self._model.query_embed([query]))
127 |         return dict(zip(vector.indices.tolist(), vector.values.tolist()))
128 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/embeddings/openai.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import Optional
 3 | 
 4 | from openai import OpenAI
 5 | 
 6 | from django_semantic_search.embeddings.base import DenseTextEmbeddingModel
 7 | from django_semantic_search.types import DenseVector
 8 | 
 9 | 
class OpenAIEmbeddingModel(DenseTextEmbeddingModel):
    """
    Dense text embedding model that calls the OpenAI API to generate the embeddings.

    **Requirements**:

    ```bash
    pip install django-semantic-search[openai]
    ```

    **Usage**:

    ```python title="settings.py"
    SEMANTIC_SEARCH = {
        "default_embeddings": {
            "model": "django_semantic_search.embeddings.OpenAIEmbeddingModel",
            "configuration": {
                "model": "text-embedding-3-small",
                "api_key": "your-api-key",  # Optional if set in env
            },
        },
        ...
    }
    ```
    """

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        api_key: Optional[str] = None,
        **kwargs,
    ):
        """
        Create the OpenAI client used to generate the embeddings.

        :param model: OpenAI model to use for embeddings
        :param api_key: OpenAI API key. If not provided, will look for OPENAI_API_KEY env variable
        :param kwargs: Additional kwargs passed to OpenAI client
        :raises ValueError: if the API key is neither passed nor set in the environment
        """
        self._model = model
        resolved_key = api_key if api_key else os.getenv("OPENAI_API_KEY")
        if not resolved_key:
            raise ValueError(
                "OpenAI API key must be provided either through api_key parameter or OPENAI_API_KEY environment variable"
            )
        self._client = OpenAI(api_key=resolved_key, **kwargs)
        # Filled in lazily by the first vector_size() call.
        self._vector_size: Optional[int] = None

    def vector_size(self) -> int:
        """
        Return the size of the individual embedding. The size is measured once, by embedding
        a test string through the API, and then cached.
        :return: size of the embedding.
        """
        if self._vector_size is not None:
            return self._vector_size

        probe = self._client.embeddings.create(model=self._model, input="test")
        self._vector_size = len(probe.data[0].embedding)
        return self._vector_size

    def embed_document(self, document: str) -> DenseVector:
        """
        Embed a document into a vector.
        :param document: document to embed.
        :return: document embedding.
        """
        response = self._client.embeddings.create(model=self._model, input=document)
        first_item = response.data[0]
        return first_item.embedding

    def embed_query(self, query: str) -> DenseVector:
        """
        Embed a query into a vector. Queries are embedded exactly the same way as the documents.
        :param query: query to embed.
        :return: query embedding.
        """
        return self.embed_document(query)
77 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/embeddings/sentence_transformers.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | from django_semantic_search.embeddings.base import DenseTextEmbeddingModel
 4 | from django_semantic_search.types import DenseVector
 5 | 
 6 | 
 7 | class SentenceTransformerModel(DenseTextEmbeddingModel):
 8 |     """
 9 |     Sentence-transformers model for embedding text.
10 | 
11 |     It is a wrapper around the sentence-transformers library. Users would rarely need to use this class directly, but
12 |     rather specify it in the Django settings.
13 | 
14 |     **Requirements:**
15 | 
16 |     ```shell
17 |     pip install django-semantic-search[sentence-transformers]
18 |     ```
19 | 
20 |     **Usage:**
21 | 
22 |     ```python title="settings.py"
23 |     SEMANTIC_SEARCH = {
24 |         "default_embeddings": {
25 |             "model": "django_semantic_search.embeddings.SentenceTransformerModel",
26 |             "configuration": {
27 |                 "model_name": "sentence-transformers/all-MiniLM-L6-v2",
28 |             },
29 |         },
30 |         ...
31 |     }
32 |     ```
33 | 
34 |     Some models accept prompts to be used for the document and query. These prompts are used as additional
35 |     instructions for the model to generate embeddings. For example, if the `document_prompt` is set to `"Doc: "`, the
36 |     model will generate embeddings with the prompt `"Doc: "` followed by the document text. Similarly, the
37 |     `query_prompt` is used for the query, if set.
38 | 
39 |     ```python title="settings.py"
40 |     SEMANTIC_SEARCH = {
41 |         "default_embeddings": {
42 |             "model": "django_semantic_search.embeddings.SentenceTransformerModel",
43 |             "configuration": {
44 |                 "model_name": "sentence-transformers/all-MiniLM-L6-v2",
45 |                 "document_prompt": "Doc: ",
46 |                 "query_prompt": "Query: ",
47 |             },
48 |         },
49 |         ...
50 |     }
51 |     ```
52 |     """
53 | 
54 |     def __init__(
55 |         self,
56 |         model_name: str,
57 |         document_prompt: Optional[str] = None,
58 |         query_prompt: Optional[str] = None,
59 |     ):
60 |         """
61 |         Initialize the sentence-transformers model.
62 | 
63 |         Some models accept prompts to be used for the document and query. These prompts are used as additional
64 |         instructions for the model to generate embeddings. For example, if the `document_prompt` is set to "Doc: ", the
65 |         model will generate embeddings with the prompt "Doc: " followed by the document text.
66 | 
67 |         :param model_name: name of the model to use.
68 |         :param document_prompt: prompt to use for the document, defaults to None.
69 |         :param query_prompt: prompt to use for the query, defaults to None.
70 |         """
71 |         from sentence_transformers import SentenceTransformer
72 | 
73 |         self._model = SentenceTransformer(model_name)
74 |         self._document_prompt = document_prompt
75 |         self._query_prompt = query_prompt
76 | 
77 |     def vector_size(self) -> int:
78 |         """
79 |         Return the size of the individual embedding.
80 |         :return: size of the embedding.
81 |         """
82 |         return self._model.get_sentence_embedding_dimension()
83 | 
84 |     def embed_document(self, document: str) -> DenseVector:
85 |         """
86 |         Embed a document into a vector.
87 |         :param document: document to embed.
88 |         :return: document embedding.
89 |         """
90 |         return self._model.encode(document, prompt=self._document_prompt).tolist()
91 | 
92 |     def embed_query(self, query: str) -> DenseVector:
93 |         """
94 |         Embed a query into a vector.
95 |         :param query: query to embed.
96 |         :return: query embedding.
97 |         """
98 |         return self._model.encode(query, prompt=self._query_prompt).tolist()
99 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/types.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List, Union
 2 | 
# Dense vector is a list of floats, one per dimension of the embedding.
DenseVector = List[float]

# Sparse vector is a dictionary of the form {index: value}
# where index is a unique token identifier and value is the weight of the token.
# Different backends may store the sparse vector in a different way.
SparseVector = Dict[int, float]

# Vector is either a dense or a sparse vector for now, but that might
# change in the future, for example, to support multi-vector representations.
Vector = Union[DenseVector, SparseVector]

# Document ID uniquely identifies a document.
DocumentID = Union[int, str]

# Document content might be any supported modality. Currently just text, but that
# might change in the future, when we support images, audio, etc.
# Note: a single-argument Union collapses to the argument itself, so this is plain `str` for now.
DocumentContent = Union[str]

# Each document may have metadata associated with it, that can be used for filtering.
# For now, we support only a few basic types, but that might change in the future.
# TODO: support more types in the metadata value, preferably the same as in the database
MetadataValue = Union[int, str, float, bool]

# Queries may have the same format as the documents, but we keep a separate type for
# them for better readability.
Query = DocumentContent
30 | 


--------------------------------------------------------------------------------
/src/django_semantic_search/utils.py:
--------------------------------------------------------------------------------
 1 | from functools import cache
 2 | from typing import Optional
 3 | 
 4 | from django.conf import settings
 5 | from django.utils.module_loading import import_string
 6 | 
 7 | from django_semantic_search.backends.types import IndexConfiguration
 8 | from django_semantic_search.embeddings.base import DenseTextEmbeddingModel
 9 | 
10 | 
@cache
def load_embedding_model(model_name: Optional[str] = None) -> DenseTextEmbeddingModel:
    """
    Load the embedding model specified in settings.

    The result is cached per `model_name`, so each configured model is instantiated only once.
    :param model_name: name of the model configuration to use from settings. If None, the
        `default_embeddings` configuration is used.
    :return: embedding model instance
    :raises ValueError: if `model_name` is given, but not defined in the `embedding_models` setting.
    """
    semantic_search_settings = settings.SEMANTIC_SEARCH

    if model_name is None:
        model_config = semantic_search_settings["default_embeddings"]
    else:
        if "embedding_models" not in semantic_search_settings:
            raise ValueError("No embedding_models defined in settings")
        if model_name not in semantic_search_settings["embedding_models"]:
            raise ValueError(f"Embedding model {model_name} not found in settings")
        model_config = semantic_search_settings["embedding_models"][model_name]

    # The model may be configured either as a class or as a dotted path to it.
    model_cls = model_config["model"]
    if isinstance(model_cls, str):
        model_cls = import_string(model_cls)
    # The configuration is optional: a model whose parameters all have defaults can be set up without it.
    model_configuration = model_config.get("configuration") or {}
    return model_cls(**model_configuration)
34 | 
35 | 
@cache
def load_backend(index_configuration: IndexConfiguration):
    """
    Load the backend, as specified in the settings.

    The result is cached per index configuration.
    :param index_configuration: configuration of the index, passed to the backend as the first argument.
    :return: backend instance.
    """
    vector_store_settings = settings.SEMANTIC_SEARCH["vector_store"]
    # The backend may be configured either as a class or as a dotted path to it.
    backend_cls = vector_store_settings["backend"]
    if isinstance(backend_cls, str):
        backend_cls = import_string(backend_cls)
    # The configuration is optional: a missing or empty entry means no extra keyword arguments.
    backend_config = vector_store_settings.get("configuration") or {}
    return backend_cls(index_configuration, **backend_config)
48 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import django
 2 | from mocks import test_settings
 3 | 
 4 | 
 5 | def pytest_configure(config):
 6 |     from django.conf import settings
 7 | 
 8 |     settings.configure(
 9 |         DATABASES={
10 |             "default": {
11 |                 "ENGINE": "django.db.backends.sqlite3",
12 |                 "NAME": ":memory:",
13 |                 "AUTOCOMMIT": True,
14 |             }
15 |         },
16 |         SEMANTIC_SEARCH=test_settings,
17 |     )
18 | 
19 |     django.setup()
20 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_apps.py:
--------------------------------------------------------------------------------
 1 | from mocks import test_settings
 2 | 
 3 | import django_semantic_search
 4 | from django_semantic_search import default_settings
 5 | from django_semantic_search.apps import DjangoSemanticSearchConfig
 6 | 
 7 | 
 8 | def test_custom_settings_are_not_overwritten_on_ready():
 9 |     from django.conf import settings
10 | 
11 |     # Save the initial settings and set custom settings
12 |     init_semantic_search_settings = getattr(settings, "SEMANTIC_SEARCH")
13 |     setattr(settings, "SEMANTIC_SEARCH", test_settings)
14 | 
15 |     # Run ready and check that the settings are not overwritten
16 |     config = DjangoSemanticSearchConfig(
17 |         "django_semantic_search", django_semantic_search
18 |     )
19 |     config.ready()
20 | 
21 |     assert hasattr(settings, "SEMANTIC_SEARCH")
22 |     assert settings.SEMANTIC_SEARCH == test_settings
23 | 
24 |     # Restore the initial settings
25 |     setattr(settings, "SEMANTIC_SEARCH", init_semantic_search_settings)
26 | 
27 | 
def test_default_settings_are_set_on_ready():
    """
    Test that the default SEMANTIC_SEARCH settings are set when the app config is ready
    and the project does not define them.
    """
    from django.conf import settings

    # Save the initial settings and delete them so that the default settings are set
    init_semantic_search_settings = getattr(settings, "SEMANTIC_SEARCH")
    delattr(settings, "SEMANTIC_SEARCH")

    try:
        # Run ready and check that the default settings are applied
        config = DjangoSemanticSearchConfig(
            "django_semantic_search", django_semantic_search
        )
        config.ready()

        assert hasattr(settings, "SEMANTIC_SEARCH")
        assert settings.SEMANTIC_SEARCH == default_settings.SEMANTIC_SEARCH
    finally:
        # Restore the initial settings even if an assertion fails, so other tests are not affected
        setattr(settings, "SEMANTIC_SEARCH", init_semantic_search_settings)
46 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_decorators.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from django.core.exceptions import ImproperlyConfigured
 3 | from django.db import models
 4 | 
 5 | import django_semantic_search as dss
 6 | 
 7 | 
class DummyModel(models.Model):
    """Minimal model with a single ``name`` field, used as a registration target in these tests."""

    name = models.CharField(max_length=100)

    class Meta:
        # Explicit app label, because this test module is not part of an installed app
        # (presumably; verify against the test settings).
        app_label = "test_decorators"
13 | 
14 | 
def test_register_document_fails_on_missing_meta():
    """
    Test that the register_document decorator fails when the document class does not have a Meta class.
    """

    # pytest.raises fails the test when no exception is raised at all; a plain
    # try/except would let that case pass silently.
    with pytest.raises(ImproperlyConfigured) as exc_info:

        @dss.register_document
        class InvalidDocument:  # noqa
            pass

    assert (
        str(exc_info.value)
        == "Document class InvalidDocument does not have a Meta class."
    )
27 | 
28 | 
def test_register_document_fails_on_duplicate_registration():
    """
    Test that the register_document decorator fails when the document class is registered for the same model twice.
    """

    # The first registration for DummyModel is expected to succeed.
    @dss.register_document
    class Document1(dss.Document):  # noqa
        class Meta:
            model = DummyModel

    # pytest.raises fails the test when the second registration is accepted; a plain
    # try/except would let that case pass silently.
    with pytest.raises(ImproperlyConfigured) as exc_info:

        @dss.register_document
        class Document2(dss.Document):  # noqa
            class Meta:
                model = DummyModel

    assert (
        str(exc_info.value)
        == "Document class for model DummyModel is already registered."
    )
47 | 
48 | 
@pytest.mark.integration
def test_register_document_creates_update_delete_signals():
    """
    Registering a document must leave the model with post_save and post_delete listeners.
    """
    watched_signals = (models.signals.post_save, models.signals.post_delete)

    class SingleUseDummyModel(DummyModel):
        """Single use model for testing the document registration in this test only."""

        class Meta:
            app_label = "test_decorators"

    # Nothing listens to the fresh model before a document is registered for it.
    for signal in watched_signals:
        assert not signal.has_listeners(SingleUseDummyModel)

    @dss.register_document
    class AnotherDummyDocument(dss.Document):  # noqa
        class Meta:
            model = SingleUseDummyModel
            namespace = "dummy"
            indexes = (dss.VectorIndex("name"),)

    # After the registration both signals have at least one listener for the model.
    for signal in watched_signals:
        assert signal.has_listeners(SingleUseDummyModel)
73 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_documents.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from django.db import models
  3 | 
  4 | import django_semantic_search as dss
  5 | 
  6 | 
class DummyModel(models.Model):
    """Model used by the document tests in this module."""

    name = models.CharField(max_length=255)
    description = models.TextField()
    # Not referenced by any VectorIndex of DummyDocument in this module.
    ignored_field = models.CharField(max_length=255)

    class Meta:
        # Explicit app label, because this test module is not part of an installed app
        # (presumably; verify against the test settings).
        app_label = "test_documents"
 14 | 
 15 | 
@dss.register_document
class DummyDocument(dss.Document):
    """Document for DummyModel with separate vector indexes on ``name`` and ``description``."""

    class Meta:
        model = DummyModel
        namespace = "dummy"
        # One VectorIndex per field; the tests below expect exactly these two vectors.
        indexes = [
            dss.VectorIndex("name"),
            dss.VectorIndex("description"),
        ]
 25 | 
 26 | 
 27 | @pytest.fixture(scope="module")
 28 | def django_test_database():
 29 |     """
 30 |     Create a test database for Django with the dummy model.
 31 |     :return:
 32 |     """
 33 |     from django.db import connection
 34 | 
 35 |     with connection.schema_editor() as schema_editor:
 36 |         yield schema_editor.create_model(DummyModel)
 37 |         schema_editor.delete_model(DummyModel)
 38 | 
 39 | 
 40 | def test_dummy_document_produces_vectors():
 41 |     """
 42 |     Test that the document produces the correct vectors.
 43 |     """
 44 |     dummy = DummyModel(
 45 |         name="test", description="test description", ignored_field="ignored"
 46 |     )
 47 |     document = DummyDocument(dummy)
 48 |     vectors = document.vectors()
 49 |     assert len(vectors) == 2
 50 |     assert "name" in vectors
 51 |     assert "description" in vectors
 52 |     assert "name_description" not in vectors
 53 | 
 54 | 
 55 | def test_dummy_document_produces_metadata():
 56 |     """
 57 |     Test that the document produces the correct metadata.
 58 |     """
 59 |     dummy = DummyModel(
 60 |         name="test", description="test description", ignored_field="ignored"
 61 |     )
 62 |     document = DummyDocument(dummy)
 63 |     metadata = document.metadata()
 64 |     assert "name" in metadata
 65 |     assert "description" in metadata
 66 |     assert metadata["name"] == "test"
 67 |     assert metadata["description"] == "test description"
 68 | 
 69 | 
 70 | def test_two_documents_have_different_backends():
 71 |     """
 72 |     Test that two documents with different indexes have different backends.
 73 |     """
 74 | 
 75 |     class AnotherModel(models.Model):
 76 |         name = models.CharField(max_length=255)
 77 |         description = models.TextField()
 78 | 
 79 |         class Meta:
 80 |             app_label = "test_documents"
 81 | 
 82 |     @dss.register_document
 83 |     class AnotherDocument(dss.Document):
 84 |         class Meta:
 85 |             model = AnotherModel
 86 |             namespace = "another"
 87 |             indexes = [
 88 |                 dss.VectorIndex("name"),
 89 |             ]
 90 | 
 91 |     dummy_index_configuration = DummyDocument.backend.index_configuration
 92 |     another_index_configuration = AnotherDocument.backend.index_configuration
 93 |     assert dummy_index_configuration.namespace == "dummy"
 94 |     assert another_index_configuration.namespace == "another"
 95 | 
 96 | 
 97 | def test_document_signals_work_correctly(django_test_database):
 98 |     """
 99 |     Test that the search manager returns an empty queryset.
100 |     """
101 |     dummy = DummyModel(
102 |         name="test", description="test description", ignored_field="ignored"
103 |     )
104 |     queryset = DummyDocument.objects.search(name="test")
105 |     assert queryset.count() == 0
106 |     dummy.save()
107 |     queryset = DummyDocument.objects.search(name="test")
108 |     assert queryset.count() == 1
109 |     dummy.delete()
110 |     queryset = DummyDocument.objects.search(name="test")
111 |     assert queryset.count() == 0
112 | 
113 | 
def test_model_has_more_entries_than_vector_backend():
    """
    Test that rows saved before the document is registered are missing from the search
    results until they are indexed explicitly.
    """
    from django.db import connection

    class JustAnotherModel(models.Model):
        name = models.CharField(max_length=255)
        description = models.TextField()

        class Meta:
            app_label = "test_documents"

    with connection.schema_editor() as schema_editor:
        schema_editor.create_model(JustAnotherModel)

        try:
            # Create some instances of the model, which won't be in the vector store yet (document is created later)
            JustAnotherModel(name="test1", description="test description 1").save()
            JustAnotherModel(name="test2", description="test description 2").save()

            @dss.register_document
            class JustAnotherDocument(dss.Document):
                class Meta:
                    model = JustAnotherModel
                    namespace = "just_another"
                    indexes = [
                        dss.VectorIndex("name"),
                    ]

            # Both rows exist in the database, but none of them has been indexed
            assert JustAnotherModel.objects.count() == 2
            assert JustAnotherDocument.objects.search(name="a").count() == 0

            # A row saved after the registration is expected to be indexed on save
            JustAnotherModel(name="test3", description="test description 3").save()

            assert JustAnotherModel.objects.count() == 3
            assert JustAnotherDocument.objects.search(name="a").count() == 1

            # Explicit indexing brings the vector store in line with the database
            JustAnotherDocument.objects.index(JustAnotherModel.objects.all())

            assert JustAnotherModel.objects.count() == 3
            assert JustAnotherDocument.objects.search(name="a").count() == 3
        finally:
            # Drop the table even when an assertion above fails, so it cannot leak
            # into other tests
            schema_editor.delete_model(JustAnotherModel)
154 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_fastembed.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pytest
 3 | 
 4 | from django_semantic_search.embeddings.fastembed import (
 5 |     FastEmbedDenseModel,
 6 |     FastEmbedSparseModel,
 7 | )
 8 | 
 9 | 
@pytest.mark.integration
class TestFastEmbedDenseModel:
    """Integration tests for FastEmbedDenseModel using the BAAI/bge-small-en-v1.5 model."""

    @pytest.fixture(autouse=True)
    def setup_model(self):
        """Create a fresh model instance for every test and expose it as ``self.model``."""
        self.model = FastEmbedDenseModel(model_name="BAAI/bge-small-en-v1.5")

    def test_initialization(self):
        model = FastEmbedDenseModel(model_name="BAAI/bge-small-en-v1.5")
        # isinstance(x, object) is true for every value (including None), so check
        # explicitly that an underlying model has been created.
        assert model._model is not None
        assert model._vector_size is None  # Size should be initially uncached

    def test_vector_size(self):
        size = self.model.vector_size()
        assert isinstance(size, int)
        assert size > 0
        # Check that size is cached
        assert self.model._vector_size == size
        # Get it again to test cached path
        assert self.model.vector_size() == size

    def test_embed_document(self):
        vector = self.model.embed_document("This is a test document")
        assert isinstance(vector, list)
        assert len(vector) == self.model.vector_size()
        assert all(isinstance(x, float) for x in vector)

    def test_embed_query(self):
        vector = self.model.embed_query("test query")
        assert isinstance(vector, list)
        assert len(vector) == self.model.vector_size()
        assert all(isinstance(x, float) for x in vector)

    def test_consistent_embeddings(self):
        text = "This is a test document"
        vector1 = self.model.embed_document(text)
        vector2 = self.model.embed_document(text)
        # Vectors should be nearly identical for same input
        assert np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
48 | 
49 | 
@pytest.mark.integration
class TestFastEmbedSparseModel:
    """Integration tests for FastEmbedSparseModel using the Qdrant/bm25 model."""

    @pytest.fixture(autouse=True)
    def setup_model(self):
        """Create a fresh model instance for every test and expose it as ``self.model``."""
        self.model = FastEmbedSparseModel(model_name="Qdrant/bm25")

    def test_initialization(self):
        model = FastEmbedSparseModel(model_name="Qdrant/bm25")
        # isinstance(x, object) is true for every value (including None), so check
        # explicitly that an underlying model has been created.
        assert model._model is not None

    def test_embed_document(self):
        vector = self.model.embed_document("This is a test document")
        assert isinstance(vector, dict)
        # Sparse vectors should have indices and values
        assert len(vector) > 0
        assert all(
            isinstance(k, int) and isinstance(v, float) for k, v in vector.items()
        )

    def test_embed_query(self):
        vector = self.model.embed_query("test query")
        assert isinstance(vector, dict)
        # Sparse vectors should have indices and values
        assert len(vector) > 0
        # NOTE(review): query values may be int or float here, while document values
        # above must be float - confirm the difference is intended.
        assert all(
            isinstance(k, int) and isinstance(v, (int, float))
            for k, v in vector.items()
        )

    def test_consistent_embeddings(self):
        text = "This is a test document"
        vector1 = self.model.embed_document(text)
        vector2 = self.model.embed_document(text)
        # Vectors should be identical for same input
        assert vector1 == vector2

    def test_sparse_vector_format(self):
        vector = self.model.embed_document("This is a test document")
        # Check that indices are unique
        # NOTE(review): when the vector is a dict (as test_embed_document expects) its
        # keys are unique by construction, so this assertion cannot fail.
        assert len(vector.keys()) == len(set(vector.keys()))
        # Values should be non-negative for BM25-like models
        assert all(v >= 0 for v in vector.values())
92 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_openai_embeddings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import numpy as np
 4 | import pytest
 5 | 
 6 | from django_semantic_search.embeddings.openai import OpenAIEmbeddingModel
 7 | 
 8 | 
 9 | @pytest.mark.skipif(
10 |     not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set in environment"
11 | )
12 | class TestOpenAIEmbeddingModel:
13 |     def test_initialization(self):
14 |         model = OpenAIEmbeddingModel()
15 |         assert model._model == "text-embedding-3-small"
16 | 
17 |     def test_initialization_fails_without_api_key(self, monkeypatch):
18 |         monkeypatch.delenv("OPENAI_API_KEY", raising=False)
19 |         with pytest.raises(ValueError) as exc_info:
20 |             OpenAIEmbeddingModel()
21 |         assert "OpenAI API key must be provided" in str(exc_info.value)
22 | 
23 |     def test_vector_size(self):
24 |         model = OpenAIEmbeddingModel()
25 |         size = model.vector_size()
26 |         assert isinstance(size, int)
27 |         assert size > 0
28 |         # Check that size is cached
29 |         assert model._vector_size == size
30 | 
31 |     def test_embed_document(self):
32 |         model = OpenAIEmbeddingModel()
33 |         vector = model.embed_document("This is a test document")
34 |         assert isinstance(vector, list)
35 |         assert len(vector) == model.vector_size()
36 |         assert all(isinstance(x, float) for x in vector)
37 | 
38 |     def test_embed_query(self):
39 |         model = OpenAIEmbeddingModel()
40 |         vector = model.embed_query("test query")
41 |         assert isinstance(vector, list)
42 |         assert len(vector) == model.vector_size()
43 |         assert all(isinstance(x, float) for x in vector)
44 | 
45 |     def test_consistent_embeddings(self):
46 |         model = OpenAIEmbeddingModel()
47 |         text = "This is a test document"
48 |         vector1 = model.embed_document(text)
49 |         vector2 = model.embed_document(text)
50 |         # Vectors should be nearly identical for same input
51 |         assert np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
52 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_sentence_transformers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pytest
 3 | 
 4 | from django_semantic_search.embeddings.sentence_transformers import (  # noqa
 5 |     SentenceTransformerModel,
 6 | )
 7 | 
 8 | 
 9 | @pytest.mark.integration
10 | class TestSentenceTransformerModel:
11 |     @pytest.fixture(autouse=True)
12 |     def setup_model(self):
13 |         self.model = SentenceTransformerModel(
14 |             model_name="sentence-transformers/all-MiniLM-L6-v2"
15 |         )
16 | 
17 |     def test_initialization(self):
18 |         model = SentenceTransformerModel(
19 |             model_name="sentence-transformers/all-MiniLM-L6-v2"
20 |         )
21 |         assert isinstance(model._model, object)  # Check model is initialized
22 |         assert model._document_prompt is None
23 |         assert model._query_prompt is None
24 | 
25 |     def test_initialization_with_prompts(self):
26 |         model = SentenceTransformerModel(
27 |             model_name="sentence-transformers/all-MiniLM-L6-v2",
28 |             document_prompt="Doc: ",
29 |             query_prompt="Query: ",
30 |         )
31 |         assert model._document_prompt == "Doc: "
32 |         assert model._query_prompt == "Query: "
33 | 
34 |     def test_vector_size(self):
35 |         size = self.model.vector_size()
36 |         assert isinstance(size, int)
37 |         assert size > 0
38 |         # Common size for all-MiniLM-L6-v2 model
39 |         assert size == 384
40 | 
41 |     def test_embed_document(self):
42 |         vector = self.model.embed_document("This is a test document")
43 |         assert isinstance(vector, list)
44 |         assert len(vector) == self.model.vector_size()
45 |         assert all(isinstance(x, float) for x in vector)
46 | 
47 |     def test_embed_query(self):
48 |         vector = self.model.embed_query("test query")
49 |         assert isinstance(vector, list)
50 |         assert len(vector) == self.model.vector_size()
51 |         assert all(isinstance(x, float) for x in vector)
52 | 
53 |     def test_consistent_embeddings(self):
54 |         text = "This is a test document"
55 |         vector1 = self.model.embed_document(text)
56 |         vector2 = self.model.embed_document(text)
57 |         # Vectors should be nearly identical for same input
58 |         assert np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
59 | 
60 |     def test_document_prompt_affects_embedding(self):
61 |         model_with_prompt = SentenceTransformerModel(
62 |             model_name="sentence-transformers/all-MiniLM-L6-v2",
63 |             document_prompt="Doc: ",
64 |         )
65 |         text = "This is a test document"
66 |         vector1 = self.model.embed_document(text)
67 |         vector2 = model_with_prompt.embed_document(text)
68 |         # Vectors should be different when using a prompt
69 |         assert not np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
70 | 
71 |     def test_query_prompt_affects_embedding(self):
72 |         model_with_prompt = SentenceTransformerModel(
73 |             model_name="sentence-transformers/all-MiniLM-L6-v2",
74 |             query_prompt="Query: ",
75 |         )
76 |         text = "test query"
77 |         vector1 = self.model.embed_query(text)
78 |         vector2 = model_with_prompt.embed_query(text)
79 |         # Vectors should be different when using a prompt
80 |         assert not np.allclose(vector1, vector2, rtol=1e-5, atol=1e-8)
81 | 


--------------------------------------------------------------------------------
/tests/django_semantic_search/test_vector_index_embeddings.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from django.conf import settings
 3 | from django.db import models
 4 | 
 5 | from django_semantic_search import Document, VectorIndex, register_document
 6 | 
 7 | 
 8 | class TestModel(models.Model):
 9 |     title = models.CharField(max_length=255)
10 |     content = models.TextField()
11 | 
12 |     class Meta:
13 |         app_label = "test_vector_index"
14 | 
15 | 
@pytest.mark.integration
class TestVectorIndexEmbeddings:
    """Integration tests for selecting an embedding model per VectorIndex."""

    @pytest.fixture(autouse=True)
    def setup_settings(self):
        """
        Install a SEMANTIC_SEARCH configuration with two named embedding models for the
        duration of a test and put the previous value back afterwards.
        """
        # Sentinel to tell "setting was absent" apart from "setting was None".
        missing = object()
        previous = getattr(settings, "SEMANTIC_SEARCH", missing)

        settings.SEMANTIC_SEARCH = {
            "vector_store": {
                "backend": "django_semantic_search.backends.qdrant.QdrantBackend",
                "configuration": {"location": ":memory:"},
            },
            "default_embeddings": {
                "model": "django_semantic_search.embeddings.SentenceTransformerModel",
                "configuration": {
                    "model_name": "sentence-transformers/all-MiniLM-L6-v2",
                },
            },
            "embedding_models": {
                "title_model": {
                    "model": "django_semantic_search.embeddings.SentenceTransformerModel",
                    "configuration": {
                        "model_name": "sentence-transformers/all-mpnet-base-v2",
                        "document_prompt": "Title: ",
                    },
                },
                "content_model": {
                    "model": "django_semantic_search.embeddings.SentenceTransformerModel",
                    "configuration": {
                        "model_name": "sentence-transformers/all-MiniLM-L6-v2",
                        "document_prompt": "Content: ",
                    },
                },
            },
        }

        yield

        # Teardown: without this the override would leak into every test that runs
        # after this class in the same session.
        if previous is missing:
            delattr(settings, "SEMANTIC_SEARCH")
        else:
            settings.SEMANTIC_SEARCH = previous

    # NOTE(review): the next two tests both register a document for TestModel. If
    # register_document rejects a second registration for the same model, the test
    # that runs second will fail - confirm.
    def test_different_models_for_indexes(self):
        @register_document
        class TestDocument(Document):
            class Meta:
                model = TestModel
                indexes = [
                    VectorIndex("title", embedding_model="title_model"),
                    VectorIndex("content", embedding_model="content_model"),
                ]

        # Create test instances
        instance = TestModel(title="Test Title", content="Test Content")

        # Get embeddings for both fields
        title_embedding = TestDocument.meta.indexes[0].get_model_embedding(instance)
        content_embedding = TestDocument.meta.indexes[1].get_model_embedding(instance)

        # Embeddings should be different sizes due to different models
        assert len(title_embedding) != len(content_embedding)

    def test_default_model_fallback(self):
        @register_document
        class TestDocument(Document):
            class Meta:
                model = TestModel
                indexes = [
                    VectorIndex("title"),  # Uses default model
                    VectorIndex("content", embedding_model="content_model"),
                ]

        instance = TestModel(title="Test Title", content="Test Content")

        # Both embeddings should work
        title_embedding = TestDocument.meta.indexes[0].get_model_embedding(instance)
        content_embedding = TestDocument.meta.indexes[1].get_model_embedding(instance)

        assert isinstance(title_embedding, (list, tuple))
        assert isinstance(content_embedding, (list, tuple))

    def test_invalid_model_name(self):
        with pytest.raises(ValueError) as exc_info:
            VectorIndex("title", embedding_model="non_existent_model")
        assert "Embedding model non_existent_model not found in settings" in str(
            exc_info.value
        )
94 | 


--------------------------------------------------------------------------------
/tests/mocks.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | from collections import defaultdict
 3 | from hashlib import md5
 4 | from typing import Dict, List
 5 | 
 6 | from django_semantic_search import Document
 7 | from django_semantic_search.backends.base import BaseVectorSearchBackend
 8 | from django_semantic_search.backends.types import IndexConfiguration
 9 | from django_semantic_search.embeddings.base import DenseTextEmbeddingModel
10 | from django_semantic_search.types import DenseVector, DocumentID, Vector
11 | 
12 | 
class MockDenseTextEmbeddingModel(DenseTextEmbeddingModel):
    """
    Mock dense text embedding model for testing purposes. It produces short random vectors,
    but these vectors are consistent for the same input. So it can be used for testing purposes.
    """

    def __init__(self, size: int = 10):
        """
        :param size: number of components in every produced vector.
        """
        self._size = size

    def vector_size(self) -> int:
        """Return the configured vector length."""
        return self._size

    def embed_document(self, document: str) -> DenseVector:
        """
        Return a pseudo-random vector that depends only on the document text.

        :param document: text to embed.
        :return: list of ``size`` floats in the range [0, 1).
        """
        document_hash = md5(document.encode()).hexdigest()
        # Use a private generator seeded with the hash instead of reseeding the
        # process-wide RNG, which would change the random state of unrelated code.
        # The produced values are the same as with random.seed(document_hash).
        rng = random.Random(document_hash)
        return [rng.random() for _ in range(self._size)]

    def embed_query(self, query: str) -> DenseVector:
        """Embed a query exactly like a document, so equal texts give equal vectors."""
        return self.embed_document(query)
33 | 
34 | 
class MockVectorSearchBackend(BaseVectorSearchBackend):
    """
    Mock vector search backend for testing purposes. It stores the documents in memory, grouped by
    namespace. Search does not rank by similarity: it returns a pseudo-random selection of the stored
    documents that is stable for a given query.
    """

    def __init__(self, index_configuration: IndexConfiguration):
        super().__init__(index_configuration)
        # namespace -> (document id -> document)
        self._documents: Dict[str, Dict[DocumentID, Document]] = defaultdict(dict)

    def configure(self):
        """No configuration is needed for the mock backend."""
        pass

    def search(
        self, vector_name: str, query: Vector, limit: int = 10
    ) -> List[DocumentID]:
        """
        Return the ids of up to ``limit`` stored documents of this backend's namespace.

        :param vector_name: accepted for interface compatibility, not used by the mock.
        :param query: query vector; only the sum of its items is used, as the seed of the selection.
        :param limit: maximum number of ids to return.
        :return: ids of the selected documents.
        """
        # Use a private generator instead of reseeding the process-wide RNG, which would
        # change the random state of unrelated code. The selection is the same as with
        # random.seed(sum(query)) followed by random.sample(...).
        rng = random.Random(sum(query))
        stored = list(self._documents[self.index_configuration.namespace].values())
        selected_documents = rng.sample(stored, k=min(limit, len(stored)))
        return [doc.id for doc in selected_documents]

    def save(self, document: Document) -> None:
        """Store the document under its id, replacing any previous version."""
        self._documents[self.index_configuration.namespace][document.id] = document

    def delete(self, document_id: DocumentID) -> None:
        """
        Remove the document with the given id. Unknown ids are ignored, so deleting a
        model instance that was never indexed does not raise.
        """
        self._documents[self.index_configuration.namespace].pop(document_id, None)
67 | 
68 | 
# SEMANTIC_SEARCH configuration for the tests: both entries reference the in-memory
# mock classes defined above, passed as class objects with empty configurations.
test_settings = {
    "vector_store": {
        "backend": MockVectorSearchBackend,
        "configuration": {},
    },
    "default_embeddings": {
        "model": MockDenseTextEmbeddingModel,
        "configuration": {},
    },
}
80 | 


--------------------------------------------------------------------------------