├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── examples ├── README.md ├── dspy-demo │ ├── .env.example │ ├── README.md │ ├── example.py │ ├── requirements.txt │ ├── sample_data.txt │ ├── test.py │ └── utils.py ├── gemini-ai-embeddings-demo │ ├── README.md │ ├── example.ipynb │ ├── example.py │ └── requirements.txt ├── graphrag-demo │ ├── README.md │ ├── graphrag-demo.py │ ├── init.sql │ └── requirements.txt ├── graphrag-step-by-step-tutorial │ ├── README.md │ └── example.ipynb ├── image_search │ ├── README.md │ └── example.ipynb ├── jina-ai-embeddings-demo │ ├── README.md │ ├── jina-ai-embeddings-demo.py │ └── requirements.txt ├── langchain-agent-demo │ ├── .env.example │ ├── README.md │ ├── __init__.py │ ├── example.py │ ├── knowledge_base.py │ ├── requirements.txt │ ├── sample_data.txt │ └── utils.py ├── llamaindex-tidb-vector-with-ui │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── templates │ │ └── index.html ├── llamaindex-tidb-vector │ ├── README.md │ ├── chat_with_url.py │ └── requirements.txt ├── openai_embedding │ ├── README.md │ ├── example.ipynb │ ├── example.py │ └── requirements.txt ├── orm-django-quickstart │ ├── .env.example │ ├── .gitignore │ ├── README.md │ ├── manage.py │ ├── requirements.txt │ └── sample_project │ │ ├── __init__.py │ │ ├── asgi.py │ │ ├── forms.py │ │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ │ ├── models.py │ │ ├── settings.py │ │ ├── urls.py │ │ ├── views.py │ │ └── wsgi.py ├── orm-peewee-quickstart │ ├── .env.example │ ├── README.md │ ├── peewee-quickstart.py │ └── requirements.txt ├── orm-sqlalchemy-quickstart │ ├── .env.example │ ├── README.md │ ├── requirements.txt │ └── sqlalchemy-quickstart.py ├── python-client-quickstart │ ├── .env.example │ ├── README.md │ ├── example.py │ └── requirements.txt ├── semantic-cache │ ├── README.md │ ├── cache.py │ └── requirements.txt └── static │ └── images │ └── 
tidbcloud-connect-parameters.png ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── config.py ├── integrations │ ├── __init__.py │ ├── test_utils.py │ └── test_vector_client.py ├── peewee │ ├── __init__.py │ └── test_peewee.py └── sqlalchemy │ ├── __init__.py │ └── test_sqlalchemy.py ├── tidb_vector ├── __init__.py ├── constants.py ├── integrations │ ├── __init__.py │ ├── utils.py │ └── vector_client.py ├── peewee │ ├── __init__.py │ ├── adaptor.py │ └── vector_type.py ├── sqlalchemy │ ├── __init__.py │ ├── adaptor.py │ └── vector_type.py └── utils.py └── tox.ini /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | push: 4 | branches: 5 | - main 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | lint: 13 | name: lint 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v3 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install tox 23 | 24 | - name: Run lint 25 | run: | 26 | tox -e lint 27 | 28 | tests: 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | python-version: 33 | - "3.12" 34 | name: py${{ matrix.python-version }}_test 35 | runs-on: ubuntu-latest 36 | services: 37 | tidb: 38 | image: wangdi4zm/tind:v8.4.0-vector-index 39 | ports: 40 | - 4000:4000 41 | steps: 42 | - name: Checkout 43 | uses: actions/checkout@v3 44 | 45 | - name: Setup Python 46 | uses: actions/setup-python@v4 47 | with: 48 | python-version: ${{ matrix.python-version }} 49 | 50 | - name: Install dependencies 51 | run: | 52 | python -m pip install --upgrade pip 53 | python -m pip install tox tox-gh-actions 54 | sudo apt-get update 55 | sudo apt-get install -y libmemcached-dev zlib1g-dev 56 | 57 | - name: Run tests 58 | run: tox 59 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .idea/ 141 | django_tests_dir 142 | 143 | *.swp 144 | 145 | .vscode/ 146 | 147 | .DS_Store 148 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: detect-private-key 15 | - id: end-of-file-fixer 16 | - id: mixed-line-ending 17 | - id: trailing-whitespace 18 | - repo: https://github.com/charliermarsh/ruff-pre-commit 19 | rev: v0.1.5 20 | hooks: 21 | - id: ruff 22 | args: [--fix, --exit-non-zero-on-fix] 23 | - repo: https://github.com/psf/black-pre-commit-mirror 24 | rev: 23.10.1 25 | hooks: 26 | - id: black-jupyter 27 | name: black-src 28 | alias: 
black 29 | - repo: https://github.com/pre-commit/mirrors-mypy 30 | rev: v1.0.1 31 | hooks: 32 | - id: mypy 33 | additional_dependencies: 34 | [ 35 | "types-requests", 36 | "types-Deprecated", 37 | "types-redis", 38 | "types-setuptools", 39 | "types-PyYAML", 40 | "types-protobuf==4.24.0.4", 41 | ] 42 | - repo: https://github.com/psf/black-pre-commit-mirror 43 | rev: 23.10.1 44 | hooks: 45 | - id: black-jupyter 46 | name: black-docs-py 47 | alias: black 48 | files: docs/ 49 | # Using PEP 8's line length in docs prevents excess left/right scrolling 50 | args: [--line-length=79] 51 | - repo: https://github.com/adamchainz/blacken-docs 52 | rev: 1.16.0 53 | hooks: 54 | - id: blacken-docs 55 | name: black-docs-text 56 | alias: black 57 | types_or: [rst, markdown, tex] 58 | additional_dependencies: [black==23.10.1] 59 | # Using PEP 8's line length in docs prevents excess left/right scrolling 60 | args: [--line-length=79] 61 | - repo: https://github.com/pre-commit/mirrors-prettier 62 | rev: v3.0.3 63 | hooks: 64 | - id: prettier 65 | - repo: https://github.com/codespell-project/codespell 66 | rev: v2.2.6 67 | hooks: 68 | - id: codespell 69 | additional_dependencies: [tomli] 70 | args: ["--ignore-words-list", "nin"] 71 | - repo: https://github.com/srstevenson/nb-clean 72 | rev: 3.1.0 73 | hooks: 74 | - id: nb-clean 75 | args: [--preserve-cell-outputs, --remove-empty-cells] 76 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | ## Contributing Guidelines 4 | 5 | As TiDB Serverless introduced the Vector Search feature that enables users to access vector data via SQL, we are also starting to build the ORM, SDK or libraries for the users to interact with TiDB Serverless and vector data, such as the Python SDK for TiDB Serverless itself, or new column support for traditional ORM like SQLAlchemy, Django ORM, etc. 
6 | 7 | Here we call for contributions to enhance the ecosystem of TiDB Serverless and vector data. You can contribute to the following areas: 8 | 9 | 10 | 11 | ### Software Prerequisites for Development 12 | * [Python](https://www.python.org/downloads/) 13 | * [TiDB Serverless](https://pingcap.com/ai) for testing the SDK or libraries 14 | * [Visual Studio Code](https://code.visualstudio.com/) or any other code editor 15 | 16 | 17 | ### Components of the Project 18 | 19 | #### Python SDK for TiDB Serverless 20 | 21 | This repo `pingcap/tidb-vector-python` is the Python SDK for TiDB Serverless. You can contribute to this repo by adding new features, fixing bugs, or improving the performance of the SDK. 22 | 23 | 24 | #### Example or Tutorials 25 | 26 | In this repo, there is a directory [examples](https://github.com/pingcap/tidb-vector-python/) that contains examples and tutorials for using TiDB Serverless and vector data. You can contribute to this directory by adding new examples or tutorials. 27 | 28 | Currently, we are looking for the following types of examples or tutorials: 29 | 30 | * Tutorials that enable users to use TiDB Serverless and vector data in different business scenarios, such as suggestion, recommendation system, etc. 31 | * Examples that demonstrate how to use TiDB Serverless and other tools or libraries, such as Dify, Jina AI, Anthropic AI, etc. 32 | * Notebooks that show how to use TiDB Serverless and vector data in different machine learning or deep learning tasks. 33 | 34 | Not limited to the above types, you can also contribute other types of examples or tutorials that you think are helpful for the users. 35 | 36 | 37 | ## Maintainers 38 | 39 | Please feel free to reach out to the maintainers if you have any questions or need help with the project. 
40 | 41 | * [wd0517](https://github.com/wd0517) 42 | * [634750802](https://github.com/634750802) 43 | * [Mini256](https://github.com/Mini256) 44 | * [IANTHEREAL](https://github.com/IANTHEREAL) 45 | * [Cheese](https://github.com/Icemap) 46 | 47 | ## Discussion 48 | 49 | If you have any questions or suggestions, please feel free to open a discussion in the [Discussions](https://github.com/pingcap/tidb-vector-python/) 50 | 51 | or contact us via [@TiDB_Developer](https://twitter.com/TiDB_Developer) on Twitter. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GIT_ROOT ?= $(shell git rev-parse --show-toplevel) 2 | 3 | format: ## Run code autoformatters (black). 4 | pre-commit install 5 | pre-commit run black --all-files 6 | 7 | lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy 8 | tox -e lint 9 | 10 | test: 11 | tox 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tidb-vector-python 2 | 3 | Use TiDB Vector Search with Python. 4 | 5 | ## Usage 6 | 7 | TiDB is a SQL database so that this package introduces Vector Search capability for Python ORMs: 8 | 9 | - [#SQLAlchemy](#sqlalchemy) 10 | - [#Peewee](#peewee) 11 | - [#Django](#django) 12 | 13 | Pick one that you are familiar with to get started. If you are not using any of them, we recommend [#SQLAlchemy](#sqlalchemy). 
14 | 15 | We also provide a Vector Search client for simple usage: 16 | 17 | - [#TiDB Vector Client](#tidb-vector-client) 18 | 19 | ### SQLAlchemy 20 | 21 | Install: 22 | 23 | ```bash 24 | pip install tidb-vector sqlalchemy pymysql 25 | ``` 26 | 27 | Usage: 28 | 29 | ```python 30 | from sqlalchemy import Integer, Column 31 | from sqlalchemy import create_engine, select 32 | from sqlalchemy.dialects.mysql import LONGTEXT 33 | from sqlalchemy.orm import Session, declarative_base 34 | 35 | import tidb_vector 36 | from tidb_vector.sqlalchemy import VectorType, VectorAdaptor 37 | 38 | engine = create_engine("mysql+pymysql://root@127.0.0.1:4000/test") 39 | Base = declarative_base() 40 | 41 | 42 | # Define table schema 43 | class Doc(Base): 44 | __tablename__ = "doc" 45 | id = Column(Integer, primary_key=True) 46 | embedding = Column(VectorType(dim=3)) 47 | content = Column(LONGTEXT) 48 | 49 | 50 | # Create empty table 51 | Base.metadata.drop_all(engine) # clean data from last run 52 | Base.metadata.create_all(engine) 53 | 54 | # Create index for L2 distance 55 | VectorAdaptor(engine).create_vector_index( 56 | Doc.embedding, tidb_vector.DistanceMetric.L2, skip_existing=True 57 | # For cosine distance, use tidb_vector.DistanceMetric.COSINE 58 | ) 59 | 60 | # Insert content with vectors 61 | with Session(engine) as session: 62 | session.add(Doc(id=1, content="dog", embedding=[1, 2, 1])) 63 | session.add(Doc(id=2, content="fish", embedding=[1, 2, 4])) 64 | session.add(Doc(id=3, content="tree", embedding=[1, 0, 0])) 65 | session.commit() 66 | 67 | # Perform Vector Search for Top K=1 68 | with Session(engine) as session: 69 | results = session.execute( 70 | select(Doc.id, Doc.content) 71 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 72 | # For cosine distance, use Doc.embedding.cosine_distance(...) 
73 | .limit(1) 74 | ).all() 75 | print(results) 76 | 77 | # Perform filtered Vector Search by adding a Where Clause: 78 | with Session(engine) as session: 79 | results = session.execute( 80 | select(Doc.id, Doc.content) 81 | .where(Doc.content == "dog") 82 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 83 | .limit(1) 84 | ).all() 85 | print(results) 86 | ``` 87 | 88 | ### Peewee 89 | 90 | Install: 91 | 92 | ```bash 93 | pip install tidb-vector peewee pymysql 94 | ``` 95 | 96 | Usage: 97 | 98 | ```python 99 | import tidb_vector 100 | from peewee import Model, MySQLDatabase, IntegerField, TextField 101 | from tidb_vector.peewee import VectorField, VectorAdaptor 102 | 103 | db = MySQLDatabase( 104 | database="test", 105 | user="root", 106 | password="", 107 | host="127.0.0.1", 108 | port=4000, 109 | ) 110 | 111 | 112 | # Define table schema 113 | class Doc(Model): 114 | class Meta: 115 | database = db 116 | table_name = "peewee_test" 117 | 118 | id = IntegerField(primary_key=True) 119 | embedding = VectorField(3) 120 | content = TextField() 121 | 122 | 123 | # Create empty table and index for L2 distance 124 | db.drop_tables([Doc]) # clean data from last run 125 | db.create_tables([Doc]) 126 | # For cosine distance, use tidb_vector.DistanceMetric.COSINE 127 | VectorAdaptor(db).create_vector_index(Doc.embedding, tidb_vector.DistanceMetric.L2) 128 | 129 | # Insert content with vectors 130 | Doc.insert_many( 131 | [ 132 | {"id": 1, "content": "dog", "embedding": [1, 2, 1]}, 133 | {"id": 2, "content": "fish", "embedding": [1, 2, 4]}, 134 | {"id": 3, "content": "tree", "embedding": [1, 0, 0]}, 135 | ] 136 | ).execute() 137 | 138 | # Perform Vector Search for Top K=1 139 | cursor = ( 140 | Doc.select(Doc.id, Doc.content) 141 | # For cosine distance, use Doc.embedding.cosine_distance(...) 
142 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 143 | .limit(1) 144 | ) 145 | for row in cursor: 146 | print(row.id, row.content) 147 | 148 | 149 | # Perform filtered Vector Search by adding a Where Clause: 150 | cursor = ( 151 | Doc.select(Doc.id, Doc.content) 152 | .where(Doc.content == "dog") 153 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 154 | .limit(1) 155 | ) 156 | for row in cursor: 157 | print(row.id, row.content) 158 | ``` 159 | 160 | ### Django 161 | 162 | > [!TIP] 163 | > 164 | > Django is a full-featured web framework, not just an ORM. The following usage introductions are provided for existing Django users. 165 | > 166 | > For new users to get started, consider using SQLAlchemy or Peewee. 167 | 168 | Install: 169 | 170 | ```bash 171 | pip install 'django-tidb[vector]~=5.0.0' 'django~=5.0.0' mysqlclient 172 | ``` 173 | 174 | Usage: 175 | 176 | 1\. Configure `django_tidb` as engine, like: 177 | 178 | ```python 179 | DATABASES = { 180 | 'default': { 181 | 'ENGINE': 'django_tidb', 182 | 'NAME': 'django', 183 | 'USER': 'root', 184 | 'PASSWORD': '', 185 | 'HOST': '127.0.0.1', 186 | 'PORT': 4000, 187 | }, 188 | } 189 | ``` 190 | 191 | 2\. Define a model with a vector field and vector index: 192 | 193 | ```python 194 | from django.db import models 195 | from django_tidb.fields.vector import VectorField, VectorIndex, L2Distance 196 | 197 | class Doc(models.Model): 198 | id = models.IntegerField(primary_key=True) 199 | embedding = VectorField(dimensions=3) 200 | content = models.TextField() 201 | class Meta: 202 | indexes = [VectorIndex(L2Distance("embedding"), name="idx")] 203 | ``` 204 | 205 | 3\. Insert data: 206 | 207 | ```python 208 | Doc.objects.create(id=1, content="dog", embedding=[1, 2, 1]) 209 | Doc.objects.create(id=2, content="fish", embedding=[1, 2, 4]) 210 | Doc.objects.create(id=3, content="tree", embedding=[1, 0, 0]) 211 | ``` 212 | 213 | 4\. 
Perform Vector Search for Top K=1: 214 | 215 | ```python 216 | queryset = ( 217 | Doc.objects 218 | .order_by(L2Distance("embedding", [1, 2, 3])) 219 | .values("id", "content")[:1] 220 | ) 221 | print(queryset) 222 | ``` 223 | 224 | 5\. Perform filtered Vector Search by adding a Where Clause: 225 | 226 | ```python 227 | queryset = ( 228 | Doc.objects 229 | .filter(content="dog") 230 | .order_by(L2Distance("embedding", [1, 2, 3])) 231 | .values("id", "content")[:1] 232 | ) 233 | print(queryset) 234 | ``` 235 | 236 | For more details, see [django-tidb](https://github.com/pingcap/django-tidb?tab=readme-ov-file#vector-beta). 237 | 238 | ### TiDB Vector Client 239 | 240 | Within the framework, you can directly utilize the built-in `TiDBVectorClient`, as demonstrated by integrations like [Langchain](https://python.langchain.com/docs/integrations/vectorstores/tidb_vector) and [Llama index](https://docs.llamaindex.ai/en/stable/community/integrations/vector_stores.html#using-a-vector-store-as-an-index), to seamlessly interact with TiDB Vector. This approach abstracts away the need to manage the underlying ORM, simplifying your interaction with the vector store. 241 | 242 | We provide `TiDBVectorClient` which is based on sqlalchemy, you need to use `pip install tidb-vector[client]` to install it. 
243 | 244 | Create a `TiDBVectorClient` instance: 245 | 246 | ```python 247 | from tidb_vector.integrations import TiDBVectorClient 248 | 249 | TABLE_NAME = 'vector_test' 250 | CONNECTION_STRING = 'mysql+pymysql://:@:4000/?ssl_verify_cert=true&ssl_verify_identity=true' 251 | 252 | tidb_vs = TiDBVectorClient( 253 | # the table which will store the vector data 254 | table_name=TABLE_NAME, 255 | # tidb connection string 256 | connection_string=CONNECTION_STRING, 257 | # the dimension of the vector, in this example, we use the ada model, which has 1536 dimensions 258 | vector_dimension=1536, 259 | # if recreate the table if it already exists 260 | drop_existing_table=True, 261 | ) 262 | ``` 263 | 264 | Bulk insert: 265 | 266 | ```python 267 | ids = [ 268 | "f8e7dee2-63b6-42f1-8b60-2d46710c1971", 269 | "8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6", 270 | "e4991349-d00b-485c-a481-f61695f2b5ae", 271 | ] 272 | documents = ["foo", "bar", "baz"] 273 | embeddings = [ 274 | text_to_embedding("foo"), 275 | text_to_embedding("bar"), 276 | text_to_embedding("baz"), 277 | ] 278 | metadatas = [ 279 | {"page": 1, "category": "P1"}, 280 | {"page": 2, "category": "P1"}, 281 | {"page": 3, "category": "P2"}, 282 | ] 283 | 284 | tidb_vs.insert( 285 | ids=ids, 286 | texts=documents, 287 | embeddings=embeddings, 288 | metadatas=metadatas, 289 | ) 290 | ``` 291 | 292 | Query: 293 | 294 | ```python 295 | tidb_vs.query(text_to_embedding("foo"), k=3) 296 | 297 | # query with filter 298 | tidb_vs.query(text_to_embedding("foo"), k=3, filter={"category": "P1"}) 299 | ``` 300 | 301 | Bulk delete: 302 | 303 | ```python 304 | tidb_vs.delete(["f8e7dee2-63b6-42f1-8b60-2d46710c1971"]) 305 | 306 | # delete with filter 307 | tidb_vs.delete(["f8e7dee2-63b6-42f1-8b60-2d46710c1971"], filter={"category": "P1"}) 308 | ``` 309 | 310 | ## Examples 311 | 312 | There are some examples to show how to use the tidb-vector-python to interact with TiDB Vector in different scenarios. 
313 | 314 | - [OpenAI Embedding](./examples/openai_embedding/README.md): use the OpenAI embedding model to generate vectors for text data, store them in TiDB Vector, and search for similar text. 315 | - [Image Search](./examples/image_search/README.md): use the OpenAI CLIP model to generate vectors for image and text, store them in TiDB Vector, and search for similar images. 316 | - [LlamaIndex RAG with UI](./examples/llamaindex-tidb-vector-with-ui/README.md): use LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application. 317 | - [Chat with URL](./examples/llamaindex-tidb-vector/README.md): use LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application that can chat with a URL. 318 | - [GraphRAG](./examples/graphrag-demo/README.md): 20 lines of code using TiDB Serverless to build a Knowledge Graph based RAG application. 319 | - [GraphRAG Step by Step Tutorial](./examples/graphrag-step-by-step-tutorial/README.md): Step by step tutorial to build a Knowledge Graph based RAG application with Colab notebook. In this tutorial, you will learn how to extract knowledge from a text corpus, build a Knowledge Graph, store the Knowledge Graph in TiDB Serverless, and search from the Knowledge Graph. 320 | - [Vector Search Notebook with SQLAlchemy](https://colab.research.google.com/drive/1LuJn4mtKsjr3lHbzMa2RM-oroUvpy83y?usp=sharing): use [SQLAlchemy](https://www.sqlalchemy.org/) to interact with TiDB Serverless: connect db, index&store data and then search vectors. 321 | - [Build RAG with Jina AI Embeddings](./examples/jina-ai-embeddings-demo/README.md): use Jina AI to generate embeddings for text data, store the embeddings in TiDB Vector Storage, and search for similar embeddings. 322 | - [Semantic Cache](./examples/semantic-cache/README.md): build a semantic cache with Jina AI and TiDB Vector. 
323 | 324 | for more examples, see the [examples](./examples) directory. 325 | 326 | ## Contributing 327 | 328 | Please feel free to reach out to the maintainers if you have any questions or need help with the project. Before contributing, please read the [CONTRIBUTING.md](./CONTRIBUTING.md) file. 329 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Vector Examples 2 | 3 | This directory contains examples of how to use the TiDB as a vector database. 4 | 5 | ## Prerequisites 6 | 7 | Please follow the instructions below to set up a TiDB Serverless cluster with built-in vector search supported. 8 | 9 | 1. Sign up [TiDB Cloud](https://tidbcloud.com) 10 | 2. Follow this [tutorial](https://docs.pingcap.com/tidbcloud/tidb-cloud-quickstart#step-1-create-a-tidb-cluster) to create a TiDB Serverless cluster 11 | 3. Navigate to the [Clusters](https://tidbcloud.com/console/clusters) page, and then click the name of your target cluster to go to its overview page 12 | 4. Click Connect in the upper-right corner. 13 | 5. In the connection dialog, select General from the Connect With dropdown and keep the default setting of the Endpoint Type as Public. 14 | 6. If you have not set a password yet, click Create password to generate a random password. 15 | 16 |
17 | 18 | The connection dialog of TiDB Serverless 19 | 20 |
The connection dialog of TiDB Serverless
21 |
22 | 7. Save the connection parameters to a safe place. You will need them to connect to the TiDB Serverless cluster in the following examples. 23 | 24 | ## Examples 25 | - [OpenAI Embedding](./openai_embedding/README.md): use the OpenAI embedding model to generate vectors for text data. 26 | - [Image Search](./image_search/README.md): use the OpenAI CLIP model to generate vectors for image and text. 27 | - [LlamaIndex RAG with UI](./llamaindex-tidb-vector-with-ui/README.md): use the LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application. 28 | - [Chat with URL](./llamaindex-tidb-vector/README.md): use LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application that can chat with a URL. 29 | - [GraphRAG](./graphrag-demo/README.md): 20 lines code of using TiDB Serverless to build a Knowledge Graph based RAG application. 30 | - [GraphRAG Step by Step Tutorial](./graphrag-step-by-step-tutorial/README.md): Step by step tutorial to build a Knowledge Graph based RAG application with Colab notebook. In this tutorial, you will learn how to extract knowledge from a text corpus, build a Knowledge Graph, store the Knowledge Graph in TiDB Serverless, and search from the Knowledge Graph. 31 | - [Vector Search Notebook with SQLAlchemy](https://colab.research.google.com/drive/1LuJn4mtKsjr3lHbzMa2RM-oroUvpy83y?usp=sharing): use [SQLAlchemy](https://www.sqlalchemy.org/) to interact with TiDB Serverless: connect db, index&store data and then search vectors. 32 | - [Build RAG with Jina AI Embeddings](./jina-ai-embeddings-demo/README.md): use Jina AI to generate embeddings for text data, store the embeddings in TiDB Vector Storage, and search for similar embeddings. 33 | - [Semantic Cache](./semantic-cache/README.md): build a semantic cache with Jina AI and TiDB Vector. 
34 | 35 | ## Real World Applications 36 | 37 | ### TiDB.ai 38 | 39 | [tidb.ai](https://tidb.ai) is an amazing out-of-the-box Graph RAG (Retrieval Augmented Generation) template project based on the TiDB vector store. It contains UI and server logic; fork it on [GitHub](https://github.com/pingcap/tidb.ai) and deploy your own. 40 | 41 | ![out-of-box-conversational-search](https://github.com/pingcap/tidb.ai/assets/1237528/0784e26e-8392-4bbe-bda1-6a680b12a805 "Image Title") 42 | -------------------------------------------------------------------------------- /examples/dspy-demo/.env.example: -------------------------------------------------------------------------------- 1 | # An example database URL to connect to a TiDB cluster from macOS: 2 | # mysql+pymysql://.root:@gateway01..prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true 3 | TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 4 | 5 | # The name of the language model to use for the language model-based retriever. 6 | LM_MODEL_NAME="" 7 | 8 | # The base URL of the Ollama API. 9 | OLLAMA_BASE_URL="http://:11434" 10 | 11 | # The API key to use for the Ollama API. 12 | OLLAMA_API_KEY="ollama" 13 | 14 | # sentence-transformers model 15 | SENTENCE_TRANSFORMERS_MODEL="" -------------------------------------------------------------------------------- /examples/dspy-demo/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Demo 2 | 3 | This example demonstrates how to use DSPy and TiDB Serverless to build a simple RAG application. 
4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster 8 | - Python 3.10 or later 9 | - Ollama or OpenAI 10 | 11 | ## Run the example 12 | 13 | ### Clone this repo 14 | 15 | ```bash 16 | git clone https://github.com/pingcap/tidb-vector-python.git 17 | ``` 18 | 19 | ### Create a virtual environment 20 | 21 | ```bash 22 | cd tidb-vector-python/examples/dspy-demo 23 | python3 -m venv .venv 24 | source .venv/bin/activate 25 | ``` 26 | 27 | ### Install dependencies 28 | 29 | ```bash 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ### Set the environment variables 34 | 35 | Get the TiDB connection string via `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as 36 | described in the [Prerequisites](../README.md#prerequisites) section. 37 | 38 | The TiDB connection string will look like: 39 | 40 | ``` 41 | mysql+pymysql://{TIDB_USER}:{TIDB_PASSWORD}@{TIDB_HOST}:{TIDB_PORT}/{TIDB_DB_NAME}?ssl_verify_cert=True&ssl_verify_identity=True 42 | ``` 43 | 44 | ### Run this example 45 | 46 | ```text 47 | $ python3 example.py 48 | Connected to TiDB. 49 | describe table: {'success': True, 'result': 6, 'error': None} 50 | Initializing the TidbRM model... 51 | TidbRM model initialized successfully. 52 | Loading sample data... 53 | sample_data.txt found. 54 | Sample data loaded successfully. 55 | Embedding sample data... 56 | 0 At My Wind [-0.27386308 -0.3816067 -0.12257734 0.04750763 -0.12517984] 57 | 1 Little Win [ 0.05535038 -0.2605278 -0.19080743 -0.3411712 -0.0255685 ] 58 | 2 Storm Wind [-0.24868685 -0.21516131 -0.03831396 0.08118728 -0.05171517] 59 | 3 Yes I Am ( [ 0.07458089 -0.31562874 -0.14104412 -0.13799803 -0.02719649] 60 | 4 The Great [-0.08352712 0.12166582 -0.07781561 0.2473993 -0.3156342 ] 61 | 5 Rosario Da [-0.02921938 0.16053236 -0.2157185 -0.14237025 0.3970173 ] 62 | 6 Robert B. [-0.05834749 -0.16446972 -0.00786973 0.02972636 0.03525066] 63 | 7 Richard M. 
[-0.13012317 -0.20805678 0.0064573 0.05393503 0.043081 ] 64 | 8 Everything [ 0.09028038 0.03007011 -0.29266015 -0.27439988 -0.2159805 ] 65 | 9 Everything [-0.02172723 0.22668567 -0.17105839 0.04179271 -0.18812893] 66 | 10 Janick Ger [ 0.179568 -0.4577289 -0.05370283 0.09678644 -0.27309376] 67 | 11 Dave Murra [ 0.01501587 -0.32756883 -0.08704209 -0.07916276 -0.23352458] 68 | 12 Roy Z | Ro [ 0.12954581 -0.27150235 -0.0992474 0.14631633 0.09378276] 69 | 13 Heather Ba [ 0.07651925 -0.23409796 -0.03234328 0.01846722 -0.09262329] 70 | 14 Gianfranco [ 0.00602041 0.29790103 -0.2082347 0.12557846 0.13808164] 71 | Sample data embedded successfully. 72 | Sample data number: 15 73 | Inserting documents into TiDB... 74 | Documents inserted successfully. 75 | Answering the question: 'who write At My Window'... 76 | Townes Van Zandt wrote At My Window, which is an album released in 1987. The reasoning provided explains that Townes Van Zandt was the songwriter behind this album, showcasing his unique style and poetic lyrics. The album features a mix of folk, country, and Americana sounds, exploring themes of love, loss, and self-discovery. At My Window is often cited as one of the greatest albums in American music history, having had a significant influence on many other artists. 77 | 78 | 79 | 80 | Answer questions with short factoid answers. 81 | 82 | --- 83 | 84 | Follow the following format. 85 | 86 | Context: may contain relevant facts 87 | 88 | Question: ${question} 89 | 90 | Reasoning: Let's think step by step in order to ${produce the answer}. We ... 91 | 92 | Answer: often between 1 and 5 words 93 | 94 | --- 95 | 96 | Context: 97 | [1] «{'long_text': 'Rosario Dawson | Rosario Isabel Dawson (born May 9, 1979) is an American actress, producer, singer, comic book writer, and political activist. She made her film debut in the 1995 teen drama "Kids". 
Her subsequent film roles include "He Got Game", "Men in Black II", "25th Hour", "Rent", "Sin City", "Death Proof", "Seven Pounds", "", and "Top Five". Dawson has also provided voice-over work for Disney and DC.'}» 98 | [2] «{'long_text': 'Dave Murray (musician) | David Michael "Dave" Murray (born 23 December 1956) is an English guitarist and songwriter best known as one of the earliest members of the British heavy metal band Iron Maiden. Along with the group\'s bassist and primary songwriter Steve Harris, Murray has appeared on all of the band\'s releases.'}» 99 | [3] «{'long_text': 'Heather Baker | Heather Baker (born October 9, 1984) is a female American songwriter, guitarist, producer and founder of the Electronica band Antiwave. Heather is known for being a session and touring guitar player for the likes of Bonnie Mckee (Pulse Music Publishing), Krewella (Columbia Records), Kerli (Island Records), The Iron Maidens (Powerslave Records) and currently plays with the band Fake Figures (members of Atreyu, Eyelid and Scars of Tomorrow) and NoMBe (TH3RD BRAIN)'}» 100 | [4] «{'long_text': 'Janick Gers | Janick Robert Gers ( ; born 27 January 1957 in Hartlepool, England) is an English musician, best known for being one of the three current guitarists in Iron Maiden, along with Dave Murray and Adrian Smith, as well as his earlier work with Gillan and White Spirit.'}» 101 | [5] «{'long_text': 'Robert B. Sherman | Robert Bernard Sherman (December 19, 1925 – March 6, 2012) was an American songwriter who specialized in musical films with his brother Richard Morton Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." 
Some of the Sherman Brothers\' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "The Slipper and the Rose", and "Charlotte\'s Web". Their most well known work, however, remains the theme park song "It\'s a Small World (After All)". According to Time.com, this song is the most performed song of all time.'}» 102 | [6] «{'long_text': 'Richard M. Sherman | Richard Morton Sherman (born June 12, 1928) is an American songwriter who specialized in musical films with his brother Robert Bernard Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers\' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "Snoopy Come Home", "Bedknobs and Broomsticks", "The Slipper and the Rose", and "Charlotte\'s Web". Their most well known work, however, remains the theme park song "It\'s a Small World (After All)". According to Time.com, this song is the most performed song of all time.'}» 103 | [7] «{'long_text': 'Everything Changes (Julian Lennon album) | Everything Changes is the sixth studio album by English singer-songwriter Julian Lennon. It was released on 2 October 2011.'}» 104 | [8] «{'long_text': 'Roy Z | Roy Z (born February, 1968) is an American guitarist, songwriter and producer, best known for his work with Bruce Dickinson (from Iron Maiden), Halford, and Judas Priest. He also is the founder of Tribe of Gypsies, a Latin influenced hard rock band.'}» 105 | [9] «{'long_text': 'Gianfranco Rosi (director) | Gianfranco Rosi is an Italian director, cinematographer, producer and screenwriter. 
His film "Sacro GRA" won Golden Lion at 70th Venice International Film Festival. "Sacro GRA" is the first documentary film to win Golden Lion in history of the Venice film festival and the first Italian film to win in fifteen years, after Gianni Amelio\'s "The Way We Laughed" won the award in 1998. His 2016 film "Fire at Sea", a documentary focused on European migrant crisis on the Sicilan island of Lampedusa, won the Golden Bear at the 66th Berlin International Film Festival. Rosi is the only documentary filmmaker to win two top prizes at major European film festivals (Cannes, Berlin and Venice) and is currently the only filmmaker besides Michael Haneke, Jafar Panahi, Ang Lee, and Ken Loach to win two top European festival prizes in the 21st century.'}» 106 | [10] «{'long_text': 'The Great Victorian Collection | The Great Victorian Collection, published in 1975, is a novel by Northern Irish-Canadian writer Brian Moore. Set in Carmel, California, it tells the story of a man who dreams that the empty parking lot he can see from his hotel window has been transformed by the arrival of a collection of priceless Victoriana on display in a vast open-air market. When he awakes he finds that he can no longer distinguish the dream from reality.'}» 107 | [11] «{'long_text': 'Everything Has Changed | "Everything Has Changed" is a song written and performed by American singer-songwriter Taylor Swift and English singer-songwriter Ed Sheeran, taken from Swift\'s fourth studio album, "Red" (2012). Produced by Butch Walker, the track was released as the sixth single from the album on July 16, 2013. "Everything Has Changed" is a guitar ballad combining folk and pop genres about "wanting to get to know a new lover better".'}» 108 | [12] «{'long_text': 'Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. 
It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released.'}» 109 | [13] «{'long_text': 'Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee.'}» 110 | [14] «{'long_text': 'Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Records on September 21, 1993 (see 1993 in music). The title is generally thought to refer to Etheridge\'s recent coming out as a lesbian, confirming long-standing rumors about her personal life. This is the album that gave Etheridge national recognition. The rock ballad "Come to My Window" was the first single released from the album, which peaked at No. 25 on the "Billboard" Hot 100, and its video featured actress Juliette Lewis having a nervous breakdown. This single brought the album into the public consciousness and was quickly followed by "I\'m the Only One", which became a major hit and reached No. 8 on the Hot 100, and "If I Wanted To", which hit No. 16.'}» 111 | [15] «{'long_text': 'At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt\'s first studio album in the nine years that followed 1978\'s "Flyin\' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high.'}» 112 | 113 | Question: who write At My Window 114 | 115 | Reasoning: Let's think step by step in order to At My Window is an album written and performed by Townes Van Zandt. The album was released in 1987 and it's considered one of his best works, showcasing his unique songwriting style and poetic lyrics. 
The album features a mix of folk, country, and Americana sounds, with songs that explore themes of love, loss, and self-discovery. At My Window is often cited as one of the greatest albums in the history of American music, and it has had a significant influence on many other artists. Townes Van Zandt was an American singer-songwriter who was active from the 1960s until his death in 1997. He was known for his poetic lyrics and his ability to tell stories through his 116 | 117 | Answer: Townes Van Zandt wrote At My Window, which is an album released in 1987. The reasoning provided explains that Townes Van Zandt was the songwriter behind this album, showcasing his unique style and poetic lyrics. The album features a mix of folk, country, and Americana sounds, exploring themes of love, loss, and self-discovery. At My Window is often cited as one of the greatest albums in American music history, having had a significant influence on many other artists. 118 | ``` 119 | -------------------------------------------------------------------------------- /examples/dspy-demo/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | import dspy 4 | from dotenv import load_dotenv 5 | from dspy.datasets import HotPotQA 6 | from dspy.evaluate import Evaluate 7 | from dspy.teleprompt import BootstrapFewShot 8 | from sentence_transformers import SentenceTransformer 9 | from tidb_vector.integrations import TiDBVectorClient 10 | from utils import sentence_transformer_embedding_function, TidbRM, RAG 11 | 12 | # Load the environment variables from the .env file. 
13 | load_dotenv()  # Read configuration (TIDB_DATABASE_URL, model names, etc.) from a local .env file. 14 | 15 | embed_model = SentenceTransformer(os.environ.get('SENTENCE_TRANSFORMERS_MODEL'), trust_remote_code=True) 16 | embed_model_dim = embed_model.get_sentence_embedding_dimension() 17 | embedding_function = partial(sentence_transformer_embedding_function, embed_model)  # Bind the model so callers only pass sentences. 18 | 19 | # The configuration for the TiDBVectorClient. 20 | tidb_vector_client = TiDBVectorClient( 21 | # The table which will store the TiDB vector data. 22 | table_name=os.environ.get('TIDB_TABLE_NAME', 'embedded_documents'), 23 | # The connection string to the TiDB cluster. 24 | # The connection string should be in the format of: 25 | # mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true 26 | connection_string=os.environ.get('TIDB_DATABASE_URL'), 27 | # The dimension of the vector generated by the embedding model. 28 | vector_dimension=embed_model_dim, 29 | # Determine whether to recreate the table if it already exists. 30 | drop_existing_table=True, 31 | ) 32 | 33 | print("Connected to TiDB.") 34 | print("describe table:", tidb_vector_client.execute("describe embedded_documents;"))  # NOTE(review): table name is hard-coded here but configurable via TIDB_TABLE_NAME above — confirm they match. 35 | 36 | print("Initializing the TidbRM model...") 37 | retriever_model = TidbRM(tidb_vector_client=tidb_vector_client, embedding_function=embedding_function) 38 | print("TidbRM model initialized successfully.") 39 | 40 | print("Loading sample data...") 41 | # Test sample data. 42 | # Load sample_data.txt; if the file is not local, you can use requests.get(url).text instead. 43 | # sample data url: https://raw.githubusercontent.com/wxywb/dspy_dataset_sample/master/sample_data.txt 44 | with open('sample_data.txt', 'r') as f: 45 | # A small data set is used here to speed up embedding; you can replace it with your own data. 46 | print("sample_data.txt found.") 47 | sample_data = f.read() 48 | print("Sample data loaded successfully.") 49 | 50 | print("Embedding sample data...") 51 | documents = [] 52 | for idx, passage in enumerate(sample_data.split('\n')[:3]):  # NOTE(review): only the first 3 lines are embedded, yet the README transcript shows 15 rows — confirm the [:3] slice is intended. 53 | embedding = embedding_function([passage])[0] 54 | print(idx, passage[:10], embedding[:5]) 55 | if len(passage) == 0:  # NOTE(review): this skip happens after embedding, so empty lines are still embedded and printed above. 56 | continue 57 | documents.append({ 58 | "id": str(idx), 59 | "text": passage, 60 | "embedding": embedding, 61 | "metadata": {"category": "album"}, 62 | }) 63 | print("Sample data embedded successfully.") 64 | print("Sample data number:", len(documents)) 65 | 66 | print("Inserting documents into TiDB...") 67 | tidb_vector_client.insert( 68 | ids=[doc["id"] for doc in documents], 69 | texts=[doc["text"] for doc in documents], 70 | embeddings=[doc["embedding"] for doc in documents], 71 | metadatas=[doc["metadata"] for doc in documents], 72 | ) 73 | print("Documents inserted successfully.") 74 | 75 | language_model = dspy.OllamaLocal( 76 | model=os.environ.get('LM_MODEL_NAME', 'llama3:8b'), 77 | base_url=os.environ.get('OLLAMA_BASE_URL'), 78 | api_key=os.environ.get('OLLAMA_API_KEY') 79 | ) 80 | dspy.settings.configure(lm=language_model)  # Make the Ollama LM the default for all DSPy modules. 81 | 82 | rag = RAG(retriever_model) 83 | 84 | dataset = HotPotQA(train_seed=1, train_size=2, eval_seed=2023, dev_size=5, test_size=0) 85 | # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata. 86 | trainset = [x.with_inputs('question') for x in dataset.train] 87 | devset = [x.with_inputs('question') for x in dataset.dev] 88 | 89 | metric = dspy.evaluate.answer_exact_match 90 | evaluate_on_hotpotqa = Evaluate(devset=devset[:], display_progress=True, display_table=False) 91 | score = evaluate_on_hotpotqa(rag, metric=metric) 92 | print('rag:', score) 93 | 94 | 95 | # Validation logic: check that the predicted answer is correct. 96 | # Also check that the retrieved context does contain that answer.
97 | def validate_context_and_answer(example, pred, trace=None): 98 | answer_em = dspy.evaluate.answer_exact_match(example, pred) 99 | answer_pm = dspy.evaluate.answer_passage_match(example, pred) 100 | return answer_em and answer_pm 101 | 102 | 103 | # Set up a basic teleprompter, which will compile our RAG program. 104 | teleprompter = BootstrapFewShot(metric=validate_context_and_answer) 105 | 106 | # Compile! 107 | compiled_rag = teleprompter.compile(rag, trainset=trainset) 108 | # Now compiled_rag is optimized and ready to answer your new question! 109 | score = evaluate_on_hotpotqa(compiled_rag, metric=metric) 110 | print('compile_rag:', score) 111 | 112 | if __name__ == '__main__': 113 | print("Answering the question: 'who write At My Window'...") 114 | print(rag("who write At My Window").answer) 115 | print(language_model.inspect_history(n=1)) 116 | -------------------------------------------------------------------------------- /examples/dspy-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | PyMySQL==1.1.0 2 | SQLAlchemy==2.0.30 3 | dspy-ai==2.4.9 4 | openai==1.35.1 5 | sentence-transformers==3.0.1 6 | tidb-vector -------------------------------------------------------------------------------- /examples/dspy-demo/sample_data.txt: -------------------------------------------------------------------------------- 1 | At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt's first studio album in the nine years that followed 1978's "Flyin' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high. 2 | Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee. 
3 | Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released. 4 | Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Records on September 21, 1993 (see 1993 in music). The title is generally thought to refer to Etheridge's recent coming out as a lesbian, confirming long-standing rumors about her personal life. This is the album that gave Etheridge national recognition. The rock ballad "Come to My Window" was the first single released from the album, which peaked at No. 25 on the "Billboard" Hot 100, and its video featured actress Juliette Lewis having a nervous breakdown. This single brought the album into the public consciousness and was quickly followed by "I'm the Only One", which became a major hit and reached No. 8 on the Hot 100, and "If I Wanted To", which hit No. 16. 5 | The Great Victorian Collection | The Great Victorian Collection, published in 1975, is a novel by Northern Irish-Canadian writer Brian Moore. Set in Carmel, California, it tells the story of a man who dreams that the empty parking lot he can see from his hotel window has been transformed by the arrival of a collection of priceless Victoriana on display in a vast open-air market. When he awakes he finds that he can no longer distinguish the dream from reality. 6 | Rosario Dawson | Rosario Isabel Dawson (born May 9, 1979) is an American actress, producer, singer, comic book writer, and political activist. She made her film debut in the 1995 teen drama "Kids". Her subsequent film roles include "He Got Game", "Men in Black II", "25th Hour", "Rent", "Sin City", "Death Proof", "Seven Pounds", "", and "Top Five". Dawson has also provided voice-over work for Disney and DC. 
7 | Robert B. Sherman | Robert Bernard Sherman (December 19, 1925 – March 6, 2012) was an American songwriter who specialized in musical films with his brother Richard Morton Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "The Slipper and the Rose", and "Charlotte's Web". Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 8 | Richard M. Sherman | Richard Morton Sherman (born June 12, 1928) is an American songwriter who specialized in musical films with his brother Robert Bernard Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "Snoopy Come Home", "Bedknobs and Broomsticks", "The Slipper and the Rose", and "Charlotte's Web". Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 9 | Everything Has Changed | "Everything Has Changed" is a song written and performed by American singer-songwriter Taylor Swift and English singer-songwriter Ed Sheeran, taken from Swift's fourth studio album, "Red" (2012). 
Produced by Butch Walker, the track was released as the sixth single from the album on July 16, 2013. "Everything Has Changed" is a guitar ballad combining folk and pop genres about "wanting to get to know a new lover better". 10 | Everything Changes (Julian Lennon album) | Everything Changes is the sixth studio album by English singer-songwriter Julian Lennon. It was released on 2 October 2011. 11 | Janick Gers | Janick Robert Gers ( ; born 27 January 1957 in Hartlepool, England) is an English musician, best known for being one of the three current guitarists in Iron Maiden, along with Dave Murray and Adrian Smith, as well as his earlier work with Gillan and White Spirit. 12 | Dave Murray (musician) | David Michael "Dave" Murray (born 23 December 1956) is an English guitarist and songwriter best known as one of the earliest members of the British heavy metal band Iron Maiden. Along with the group's bassist and primary songwriter Steve Harris, Murray has appeared on all of the band's releases. 13 | Roy Z | Roy Z (born February, 1968) is an American guitarist, songwriter and producer, best known for his work with Bruce Dickinson (from Iron Maiden), Halford, and Judas Priest. He also is the founder of Tribe of Gypsies, a Latin influenced hard rock band. 14 | Heather Baker | Heather Baker (born October 9, 1984) is a female American songwriter, guitarist, producer and founder of the Electronica band Antiwave. Heather is known for being a session and touring guitar player for the likes of Bonnie Mckee (Pulse Music Publishing), Krewella (Columbia Records), Kerli (Island Records), The Iron Maidens (Powerslave Records) and currently plays with the band Fake Figures (members of Atreyu, Eyelid and Scars of Tomorrow) and NoMBe (TH3RD BRAIN) 15 | Gianfranco Rosi (director) | Gianfranco Rosi is an Italian director, cinematographer, producer and screenwriter. His film "Sacro GRA" won Golden Lion at 70th Venice International Film Festival. 
"Sacro GRA" is the first documentary film to win Golden Lion in history of the Venice film festival and the first Italian film to win in fifteen years, after Gianni Amelio's "The Way We Laughed" won the award in 1998. His 2016 film "Fire at Sea", a documentary focused on European migrant crisis on the Sicilan island of Lampedusa, won the Golden Bear at the 66th Berlin International Film Festival. Rosi is the only documentary filmmaker to win two top prizes at major European film festivals (Cannes, Berlin and Venice) and is currently the only filmmaker besides Michael Haneke, Jafar Panahi, Ang Lee, and Ken Loach to win two top European festival prizes in the 21st century. -------------------------------------------------------------------------------- /examples/dspy-demo/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | import pytest 4 | from functools import partial 5 | from utils import sentence_transformer_embedding_function, Vector, Vectors 6 | from pydantic import BaseModel, ValidationError 7 | from sentence_transformers import SentenceTransformer 8 | 9 | load_dotenv() 10 | 11 | 12 | class VectorModel(BaseModel): 13 | vector: Vector 14 | 15 | 16 | class VectorsModel(BaseModel): 17 | vectors: Vectors 18 | 19 | 20 | @pytest.fixture(scope='module') 21 | def embed_model(): 22 | return SentenceTransformer(os.environ.get('SENTENCE_TRANSFORMERS_MODEL'), trust_remote_code=True) 23 | 24 | 25 | def test_sentence_transformer_embedding_function_return_shape(embed_model: SentenceTransformer): 26 | embed_model_dim = embed_model.get_sentence_embedding_dimension() 27 | 28 | assert embed_model.encode(["Hello, world!"]).shape == (1, embed_model_dim) 29 | assert embed_model.encode(["Hello, world!", "hi"]).shape == (2, embed_model_dim) 30 | assert embed_model.encode("Hello, World!").shape == (embed_model_dim,) 31 | 32 | 33 | def test_embedding_function(embed_model: SentenceTransformer): 34 
def test_embedding_function(embed_model: SentenceTransformer):
    """The embedding function must produce data that validates as Vector / Vectors.

    Constructing the pydantic models directly lets a ``ValidationError``
    propagate and fail the test with its full traceback, instead of the
    previous ``try/except ValidationError: assert False`` pattern, which
    collapsed every failure into an uninformative bare ``AssertionError``.
    """
    embedding_function = partial(sentence_transformer_embedding_function, embed_model)

    # A single sentence yields a single Vector.
    VectorModel(vector=embedding_function(sentences="Hello, world!"))

    # A list of sentences yields a list of Vectors, one per sentence.
    VectorsModel(vectors=embedding_function(sentences=["Hello, world!"]))
    VectorsModel(vectors=embedding_function(sentences=["Hello, world!", "hi"]))
class TidbRM(dspy.Retrieve):
    """
    A retrieval module that uses TiDBVectorClient to return passages for a given query.

    Args:
        tidb_vector_client (TiDBVectorClient): The TiDBVectorClient instance to use for querying TiDB.
        embedding_function (callable): The function to convert a list of text to embeddings.
            The embedding function should take a list of text strings as input and output a list of embeddings.
        k (int, optional): The number of top passages to retrieve. Defaults to 3.

    Returns:
        dspy.Prediction: An object containing the retrieved passages.

    Examples:
        Below is a code snippet that shows how to use this as the default retriever:
        use OpenAI
        ```python
        llm = dspy.OpenAI(model="gpt-3.5-turbo")
        retriever_model = TidbRM(
            tidb_vector_client=tidb_vector_client,
            embedding_function=sentence_transformer_embedding_function
        )
        dspy.settings.configure(rm=retriever_model)
        ```

        use Ollama
        ```python
        llm = dspy.OllamaLocal(model="llama3:8b")
        retriever_model = TidbRM(
            tidb_vector_client=tidb_vector_client,
            embedding_function=llm
        )
        ```
    """

    def __init__(self, tidb_vector_client: TiDBVectorClient, embedding_function: Optional[callable] = None, k: int = 3):
        super().__init__(k)
        self.tidb_vector_client = tidb_vector_client
        self.embedding_function = embedding_function
        self.top_k = k

    def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None, **kwargs) -> dspy.Prediction:
        """
        Retrieve passages for the given query.

        Args:
            query_or_queries (Union[str, List[str]]): The query or queries for which to retrieve passages.
            k (Optional[int]): The number of top passages to retrieve. Defaults to the ``k``
                given at construction time.

        Returns:
            dspy.Prediction: An object containing the retrieved passages, ordered from
            most to least similar to the query.
        """
        query_embeddings = self.embedding_function(query_or_queries)
        k = k or self.top_k
        tidb_vector_res = self.tidb_vector_client.query(query_vector=query_embeddings, k=k)

        # Deduplicate by passage text, keeping the distance reported by TiDB.
        passages_scores = {}
        for res in tidb_vector_res:
            res.metadata = dotdict(res.metadata)
            passages_scores[res.document] = res.distance

        # BUG FIX: `res.distance` is a cosine *distance* (smaller = more similar),
        # so sort ascending. The previous `reverse=True` — an idiom copied from
        # score-based retrievers where higher is better — returned the LEAST
        # relevant passages first.
        sorted_passages = sorted(passages_scores.items(), key=lambda item: item[1])

        return dspy.Prediction(passages=[dotdict({"long_text": passage}) for passage, _ in sorted_passages])
class RAG(dspy.Module):
    """A minimal retrieve-then-generate (RAG) pipeline.

    Args:
        rm: a retrieval module (e.g. TidbRM) that, given a question, returns an
            object with a ``passages`` attribute.
    """

    def __init__(self, rm):
        super().__init__()
        self.retrieve = rm

        # This signature indicates the task imposed on the COT module.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        # Use the configured retriever (e.g. TidbRM) to fetch context for the question.
        # (The original comment said "milvus_rm", left over from the Milvus example.)
        context = self.retrieve(question).passages
        # The COT module takes "context, question" and outputs "answer".
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=[item.long_text for item in context], answer=prediction.answer)
36 | 37 | ```bash 38 | export GEMINI_API_KEY="*******" 39 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 40 | export TIDB_USERNAME="****.root" 41 | export TIDB_PASSWORD="****" 42 | ``` 43 | 44 | ### Run the example 45 | 46 | ```bash 47 | python3 example.py 48 | ``` 49 | -------------------------------------------------------------------------------- /examples/gemini-ai-embeddings-demo/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "ewKGZW06kmIv" 7 | }, 8 | "source": [ 9 | "# Example of Embedding\n", 10 | "\n", 11 | "It is an embedding example that uses `tidb_vector_python` as its library." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "id": "F1fsS576izUl" 18 | }, 19 | "source": [ 20 | "## Install Dependencies" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "id": "pTpKX_lDizUp" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "%%capture\n", 32 | "%pip install google.generativeai peewee pymysql tidb_vector" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "psEHGWiHizUq" 39 | }, 40 | "source": [ 41 | "## Preapre the environment\n", 42 | "\n", 43 | "> **Note:**\n", 44 | ">\n", 45 | "> - You can get the `GEMINI_API_KEY` from [GeminiAI](https://ai.google.dev/gemini-api/docs/quickstart).\n", 46 | "> - You can get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.\n", 47 | "\n", 48 | "Set the embedding model as `models/embedding-001`, and\n", 49 | "the amount of embedding dimensions is `768`." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "id": "MgKOjwmYizUq" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import getpass\n", 61 | "\n", 62 | "GEMINI_API_KEY = getpass.getpass(\"Enter your GeminiAI API key: \")\n", 63 | "TIDB_HOST = input(\"Enter your TiDB host: \")\n", 64 | "TIDB_USERNAME = input(\"Enter your TiDB username: \")\n", 65 | "TIDB_PASSWORD = getpass.getpass(\"Enter your TiDB password: \")\n", 66 | "\n", 67 | "embedding_model = \"models/embedding-001\"\n", 68 | "embedding_dimensions = 768" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "3WbH_BITizUr" 75 | }, 76 | "source": [ 77 | "## Initial the Clients of OpenAI and Database" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "id": "UWtcs58-izUr" 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "import google.generativeai as genai\n", 89 | "from peewee import Model, MySQLDatabase, TextField, SQL\n", 90 | "from tidb_vector.peewee import VectorField\n", 91 | "\n", 92 | "genai.configure(api_key=GEMINI_API_KEY)\n", 93 | "db = MySQLDatabase(\n", 94 | " 'test',\n", 95 | " user=TIDB_USERNAME,\n", 96 | " password=TIDB_PASSWORD,\n", 97 | " host=TIDB_HOST,\n", 98 | " port=4000,\n", 99 | " ssl_verify_cert=True,\n", 100 | " ssl_verify_identity=True\n", 101 | ")\n", 102 | "db.connect()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "uOyjrmWJizUr" 109 | }, 110 | "source": [ 111 | "## Prepare the Context\n", 112 | "\n", 113 | "In this case, contexts are the documents, use the openai embeddings model to get the embeddings of the documents, and store them in the TiDB." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "id": "_e5P_m0MizUs" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "documents = [\n", 125 | " \"TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.\",\n", 126 | " \"TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.\",\n", 127 | " \"TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. \",\n", 128 | "]\n", 129 | "\n", 130 | "class DocModel(Model):\n", 131 | " text = TextField()\n", 132 | " embedding = VectorField(dimensions=embedding_dimensions)\n", 133 | "\n", 134 | " class Meta:\n", 135 | " database = db\n", 136 | " table_name = \"gemini_embedding_test\"\n", 137 | "\n", 138 | " def __str__(self):\n", 139 | " return self.text\n", 140 | "\n", 141 | "db.drop_tables([DocModel])\n", 142 | "db.create_tables([DocModel])\n", 143 | "\n", 144 | "embeddings = genai.embed_content(model=embedding_model, content=documents, task_type=\"retrieval_document\")\n", 145 | "data_source = [\n", 146 | " {\"text\": doc, \"embedding\": emb}\n", 147 | " for doc, emb in zip(documents, embeddings['embedding'])\n", 148 | "]\n", 149 | "DocModel.insert_many(data_source).execute()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "id": "zMP-P1g8izUs" 156 | }, 157 | "source": [ 158 | "## Initial the Vector of Question\n", 159 | "\n", 160 | "Ask a question, use the openai embeddings model to get the embeddings of the question" 161 | ] 162 | }, 163 | { 164 | 
"cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "id": "-zrTOxs4izUt" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "question = \"what is TiKV?\"\n", 172 | "question_embedding = genai.embed_content(model=embedding_model, content=[question], task_type=\"retrieval_query\")['embedding'][0]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "atc0gXVZizUt" 179 | }, 180 | "source": [ 181 | "## Retrieve by Cosine Distance of Vectors\n", 182 | "Get the relevant documents from the TiDB by comparing the embeddings of the question and the documents" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "id": "DTtJRX64izUt" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "related_docs = DocModel.select(\n", 194 | " DocModel.text, DocModel.embedding.cosine_distance(question_embedding).alias(\"distance\")\n", 195 | ").order_by(SQL(\"distance\")).limit(3)\n", 196 | "\n", 197 | "print(\"Question:\", question)\n", 198 | "print(\"Related documents:\")\n", 199 | "for doc in related_docs:\n", 200 | " print(doc.distance, doc.text)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "id": "bYBetPchmNUp" 207 | }, 208 | "source": [ 209 | "## Cleanup" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "id": "Lh27gC7gizUt" 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "db.close()" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "colab": { 226 | "provenance": [], 227 | "toc_visible": true 228 | }, 229 | "kernelspec": { 230 | "display_name": ".venv", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | 
"pygments_lexer": "ipython3", 244 | "version": "3.10.13" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 0 249 | } 250 | -------------------------------------------------------------------------------- /examples/gemini-ai-embeddings-demo/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from peewee import Model, MySQLDatabase, TextField, SQL 3 | from tidb_vector.peewee import VectorField 4 | import google.generativeai as genai # Hypothetical import for Gemini API client 5 | 6 | # Init Gemini client 7 | # Adjust the initialization according to the Gemini API documentation 8 | genai.configure(api_key=os.environ.get('GEMINI_API_KEY')) 9 | embedding_model = 'models/embedding-001' # Replace with the actual model name 10 | embedding_dimensions = 768 # Adjust if different for the Gemini model 11 | 12 | # Init TiDB connection 13 | db = MySQLDatabase( 14 | 'test', 15 | user=os.environ.get('TIDB_USERNAME'), 16 | password=os.environ.get('TIDB_PASSWORD'), 17 | host=os.environ.get('TIDB_HOST'), 18 | port=4000, 19 | ssl_verify_cert=True, 20 | ssl_verify_identity=True 21 | ) 22 | 23 | documents = [ 24 | "TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.", 25 | "TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.", 26 | "TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. 
# Define a peewee model with a VectorField to store the embeddings
class DocModel(Model):
    # Original document text.
    text = TextField()
    # Gemini embedding of `text`; the dimension must match the embedding
    # model configured above (768 for models/embedding-001).
    embedding = VectorField(dimensions=embedding_dimensions)

    class Meta:
        # Reuse the module-level TiDB connection; rows are stored in
        # the `gemini_embedding_test` table.
        database = db
        table_name = "gemini_embedding_test"

    def __str__(self):
        return self.text
With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. 75 | # 0.3317073143109729 TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee. 76 | # 0.3690570695898543 TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. -------------------------------------------------------------------------------- /examples/gemini-ai-embeddings-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | google.generativeai 2 | peewee 3 | tidb-vector -------------------------------------------------------------------------------- /examples/graphrag-demo/README.md: -------------------------------------------------------------------------------- 1 | # GraphRAG Demo 2 | 3 | This example demonstrates how to use the DSPy and TiDB Serverless to build a simple GraphRAG application. It crawled an example webpage and index the content to TiDB Serverless with Graph, then use the Graph and Vector to search the content and generate the answer with OpenAI. 
from sqlalchemy import create_engine, text
import openai
import getpass

# TiDB Connection String Pattern:
# mysql+pymysql://{TIDB_USER}:{TIDB_PASSWORD}@{TIDB_HOST}:{TIDB_PORT}/{TIDB_DB_NAME}?ssl_verify_cert=True&ssl_verify_identity=True

# Prompt for credentials interactively so secrets never appear on the command line.
db_engine = create_engine(getpass.getpass("Input your TIDB connection string:"))
oai_cli = openai.OpenAI(api_key=getpass.getpass("Input your OpenAI API Key:"))
question = input("Enter your question:")
# Embed the question; the vector is passed to SQL as its string representation,
# which TiDB's VEC_* functions accept as a vector literal.
embedding = str(oai_cli.embeddings.create(input=[question], model="text-embedding-3-small").data[0].embedding)

with db_engine.connect() as conn:
    # Graph retrieval in one statement:
    #   1. initial_entity: the single entity whose description vector is closest
    #      (cosine distance) to the question embedding.
    #   2. entities_ids: expand one hop over `relationships` in both directions
    #      (sources pointing at it, targets it points at), plus the entity itself.
    #   3. Return the descriptions of every collected entity as LLM context.
    # The embedding is bound as a parameter (:embedding), not string-interpolated.
    result = conn.execute(text("""
    WITH initial_entity AS (
        SELECT id FROM `entities`
        ORDER BY VEC_Cosine_Distance(description_vec, :embedding) LIMIT 1
    ), entities_ids AS (
        SELECT source_entity_id i FROM relationships r INNER JOIN initial_entity i ON r.target_entity_id = i.id
        UNION SELECT target_entity_id i FROM relationships r INNER JOIN initial_entity i ON r.source_entity_id = i.id
        UNION SELECT initial_entity.id i FROM initial_entity
    ) SELECT description FROM `entities` WHERE id IN (SELECT i FROM entities_ids);"""), {"embedding": embedding}).fetchall()

# Feed the retrieved entity descriptions to the chat model as system context
# and print its answer to the original question.
print(oai_cli.chat.completions.create(model="gpt-4o", messages=[
    {"role": "system", "content": f"Please carefully answer the question by {str(result)}"},
    {"role": "user", "content": question}]).choices[0].message.content)
4 | 5 | ## Table of Contents 6 | 7 | 1. **Setting** 8 | 2. **Dependencies** 9 | 3. **Prerequisites** 10 | 4. **Core Code** 11 | - Part 1: Indexing 12 | - Set OpenAI and DSPy 13 | - Load Raw Wikipedia Page 14 | - Extract Raw Wikipedia Page to Knowledge Graph 15 | - Let's Show the Graph 16 | - Save Graph to TiDB Serverless 17 | - Part 2: Retrieve 18 | - Ask Question 19 | - Find Entities and Relationships 20 | - Part 3: Generate Answer 21 | 22 | 23 | Try it in the [Google colab](https://colab.research.google.com/github/pingcap/tidb-vector-python/blob/main/examples/graphrag-step-by-step-tutorial/example.ipynb). 24 | -------------------------------------------------------------------------------- /examples/image_search/README.md: -------------------------------------------------------------------------------- 1 | # Image Search Example 2 | 3 | This example shows how to use OpenAI CLIP to encode images as embeddings and store them in TiDB Serverless. It also demonstrates how to use the CLIP model to encode query text and search for the most similar images. 4 | 5 | Try it in the [Google colab](https://colab.research.google.com/github/pingcap/tidb-vector-python/blob/main/examples/image_search/example.ipynb). 6 | -------------------------------------------------------------------------------- /examples/image_search/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example of Image Search\n", 8 | "\n", 9 | "It is an example of image search using [OpenAI CLIP](https://huggingface.co/docs/transformers/model_doc/clip) and TiDB Serverless Vector Search.\n", 10 | "\n", 11 | "We will use the CLIP model to encode the image to a 512-dimensional vector and store them in TiDB Serverless. Then use the same model to encode the text query and search for the most similar images in TiDB Serverless." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Install dependencies\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "%pip install -q torch transformers requests ipyplot datasets sqlalchemy pymysql tidb_vector" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Prepare the environment\n", 35 | "\n", 36 | "> **Note:**\n", 37 | ">\n", 38 | "> - You can get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.\n", 39 | "> - In this example, we use CLIP to generate text and image embeddings with 512 dimensions.\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import getpass\n", 49 | "\n", 50 | "TIDB_HOST = input(\"Enter your TiDB host: \")\n", 51 | "TIDB_USERNAME = input(\"Enter your TiDB username: \")\n", 52 | "TIDB_PASSWORD = getpass.getpass(\"Enter your TiDB password: \")\n", 53 | "\n", 54 | "CLIP_DIMENSION = 512" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Initial the Database and Table" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from sqlalchemy import URL, create_engine, Column, Integer\n", 71 | "from sqlalchemy.orm import declarative_base, sessionmaker\n", 72 | "from tidb_vector.sqlalchemy import VectorType\n", 73 | "\n", 74 | "engine = create_engine(URL(\n", 75 | " \"mysql+pymysql\",\n", 76 | " username=TIDB_USERNAME,\n", 77 | " password=TIDB_PASSWORD,\n", 78 | " host=TIDB_HOST,\n", 79 | " port=4000,\n", 80 | " database=\"test\",\n", 81 | " query={\"ssl_verify_cert\": True, \"ssl_verify_identity\": True},\n", 82 | "))\n", 83 
| "\n", 84 | "Session = sessionmaker(bind=engine)\n", 85 | "Base = declarative_base()\n", 86 | "\n", 87 | "class ImageSearchTest(Base):\n", 88 | " __tablename__ = \"image_search_test\"\n", 89 | "\n", 90 | " id = Column(Integer, primary_key=True)\n", 91 | " image_id = Column(Integer)\n", 92 | " embedding = Column(VectorType(CLIP_DIMENSION))\n", 93 | "\n", 94 | "Base.metadata.drop_all(engine)\n", 95 | "Base.metadata.create_all(engine)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Initial CLIP model" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "import torch\n", 112 | "from transformers import CLIPProcessor, CLIPModel\n", 113 | "\n", 114 | "\n", 115 | "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", 116 | "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Load test images" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "import datasets\n", 133 | "\n", 134 | "imagenet_datasets = datasets.load_dataset('theodor1289/imagenet-1k_tiny', split='train')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# inspect the imagenet datasets\n", 144 | "imagenet_datasets[0]" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "extract the images" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "import ipyplot\n", 161 | "\n", 162 | "imagenet_images = [i['image'] for i in imagenet_datasets]\n", 163 | 
"ipyplot.plot_images(imagenet_images, max_images=20, img_width=100)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## Define the encode function and other helper functions" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "def encode_images_to_embeddings(images):\n", 180 | " # accept a list of images and return the image embeddings\n", 181 | " with torch.no_grad():\n", 182 | " inputs = processor(images=images, return_tensors=\"pt\")\n", 183 | " image_features = model.get_image_features(**inputs)\n", 184 | " return image_features.cpu().detach().numpy()\n", 185 | "\n", 186 | "def encode_text_to_embedding(text):\n", 187 | " # accept a text and return the text embedding\n", 188 | " with torch.no_grad():\n", 189 | " inputs = processor(text=text, return_tensors=\"pt\")\n", 190 | " text_features = model.get_text_features(**inputs)\n", 191 | " return text_features.cpu().detach().numpy()[0]\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Store the images and their corresponding image embeddings in TiDB Serverless" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "images_embedding = encode_images_to_embeddings(imagenet_images)\n", 208 | "objects = []\n", 209 | "\n", 210 | "for i, embedding in enumerate(images_embedding):\n", 211 | " img = imagenet_images[i]\n", 212 | " objects.append(\n", 213 | " ImageSearchTest(\n", 214 | " image_id=i,\n", 215 | " embedding=embedding\n", 216 | " )\n", 217 | " )\n", 218 | "\n", 219 | "with Session() as session:\n", 220 | " session.add_all(objects)\n", 221 | " session.commit()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Search for similar images using the text query" 
229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from sqlalchemy import asc\n", 238 | "\n", 239 | "query_text = \"dog\"\n", 240 | "query_text_embedding = encode_text_to_embedding(query_text)\n", 241 | "\n", 242 | "with Session() as session:\n", 243 | " results = session.query(\n", 244 | " ImageSearchTest,\n", 245 | " ImageSearchTest.embedding.cosine_distance(query_text_embedding).label(\"distance\"),\n", 246 | " ).order_by(\n", 247 | " asc(\"distance\")\n", 248 | " ).limit(5).all()\n", 249 | "\n", 250 | "\n", 251 | " similar_images = []\n", 252 | " similarities = []\n", 253 | " for obj, d in results:\n", 254 | " similar_images.append(imagenet_images[obj.image_id])\n", 255 | " similarities.append(round(1 - d, 3))\n", 256 | "\n", 257 | "# display the similar images\n", 258 | "ipyplot.plot_images(similar_images, labels=similarities, img_width=100)\n" 259 | ] 260 | } 261 | ], 262 | "metadata": { 263 | "kernelspec": { 264 | "display_name": ".venv", 265 | "language": "python", 266 | "name": "python3" 267 | }, 268 | "language_info": { 269 | "codemirror_mode": { 270 | "name": "ipython", 271 | "version": 3 272 | }, 273 | "file_extension": ".py", 274 | "mimetype": "text/x-python", 275 | "name": "python", 276 | "nbconvert_exporter": "python", 277 | "pygments_lexer": "ipython3", 278 | "version": "3.12.2" 279 | } 280 | }, 281 | "nbformat": 4, 282 | "nbformat_minor": 2 283 | } 284 | -------------------------------------------------------------------------------- /examples/jina-ai-embeddings-demo/README.md: -------------------------------------------------------------------------------- 1 | # Jina AI Embeddings Demo 2 | This is a simple demo to show how to use Jina AI to generate embeddings for text data. Then store the embeddings in TiDB Vector Storage and search for similar embeddings. 
3 | 4 | ## Prerequisites 5 | 6 | - A running TiDB Serverless cluster with vector search enabled 7 | - Python 3.8 or later 8 | - Jina AI API key 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/jina-ai-embeddings-demo 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Get the Jina AI API key from the [Jina AI Embedding API](https://jina.ai/embeddings/) page 35 | 36 | Get the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then replace the following placeholders to get the `TIDB_DATABASE_URL`. 37 | 38 | ```bash 39 | export JINA_API_KEY="****" 40 | export TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 41 | ``` 42 | or create a `.env` file with the above environment variables. 43 | 44 | 45 | ### Run this example 46 | 47 | ```text 48 | $ python jina-ai-embeddings-demo.py 49 | - Inserting Data to TiDB... 50 | - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. 51 | - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. 52 | - List All Documents and Their Distances to the Query: 53 | - distance: 0.3585317326132522 54 | content: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. 55 | - distance: 0.10858102967720984 56 | content: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. 
import os
import requests
import dotenv

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base
from tidb_vector.sqlalchemy import VectorType

dotenv.load_dotenv()


# Step 1. Define a helper function to generate embeddings using Jina AI's API.
# NOTE: the README documents this variable as JINA_API_KEY while the code read
# JINAAI_API_KEY; accept both spellings so either works.
JINAAI_API_KEY = os.getenv('JINAAI_API_KEY') or os.getenv('JINA_API_KEY')
assert JINAAI_API_KEY is not None, 'Set the JINAAI_API_KEY (or JINA_API_KEY) environment variable.'


def generate_embeddings(text: str):
    """Generate an embedding vector for ``text`` via the Jina AI embeddings API.

    Args:
        text: The input text to embed.

    Returns:
        list[float]: A 768-dimensional embedding produced by the
        ``jina-embeddings-v2-base-en`` model.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status code.
    """
    JINAAI_API_URL = 'https://api.jina.ai/v1/embeddings'
    JINAAI_HEADERS = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {JINAAI_API_KEY}'
    }
    JINAAI_REQUEST_DATA = {
        'input': [text],
        'model': 'jina-embeddings-v2-base-en'  # with dimensions 768.
    }
    response = requests.post(JINAAI_API_URL, headers=JINAAI_HEADERS, json=JINAAI_REQUEST_DATA)
    # Fail fast with a descriptive HTTP error instead of a KeyError on an
    # error payload that lacks the 'data' field.
    response.raise_for_status()
    return response.json()['data'][0]['embedding']


# Step 2. Connect TiDB Serverless
TIDB_DATABASE_URL = os.getenv('TIDB_DATABASE_URL')
assert TIDB_DATABASE_URL is not None
# pool_recycle keeps pooled connections fresh so serverless idle timeouts do
# not surface as "server has gone away" errors.
engine = create_engine(url=TIDB_DATABASE_URL, pool_recycle=300)


# Step 3. Create the vector table.
Base = declarative_base()


class Document(Base):
    """A text document plus its embedding, stored in a TiDB vector column."""

    __tablename__ = "jinaai_tidb_demo_documents"

    id = Column(Integer, primary_key=True)
    content = Column(String(255), nullable=False)
    content_vec = Column(
        # DIMENSIONS is determined by the embedding model,
        # for Jina AI's jina-embeddings-v2-base-en model it's 768.
        VectorType(dim=768),
    )


Base.metadata.create_all(engine)


# Step 4. Generate embeddings for texts via Jina AI API and store them in TiDB.

TEXTS = [
    'Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.',
    'TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.',
]

data = []
for text in TEXTS:
    # Generate the embedding for the text via Jina AI API.
    embedding = generate_embeddings(text)
    data.append({
        'text': text,
        'embedding': embedding
    })

with Session(engine) as session:
    print('- Inserting Data to TiDB...')
    for item in data:
        print(f'  - Inserting: {item["text"]}')
        session.add(Document(
            content=item['text'],
            content_vec=item['embedding']
        ))
    session.commit()


# Step 5. Query the most relevant document based on the query.
query = 'What is TiDB?'
# Generate the embedding for the query via Jina AI API.
query_embedding = generate_embeddings(query)
with Session(engine) as session:
    print('- List All Documents and Their Distances to the Query:')
    for doc, distance in session.query(
        Document,
        Document.content_vec.cosine_distance(query_embedding).label('distance')
    ).all():
        print(f'  - distance: {distance}\n'
              f'    content: {doc.content}')

    print('- The Most Relevant Document and Its Distance to the Query:')
    doc, distance = session.query(
        Document,
        Document.content_vec.cosine_distance(query_embedding).label('distance')
    ).order_by(
        'distance'
    ).limit(1).first()
    print(f'  - distance: {distance}\n'
          f'    content: {doc.content}')

# Expected Output:
#
# - Inserting Data to TiDB...
#   - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.
#   - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.
# - List All Documents and Their Distances to the Query:
#   - distance: 0.3585317326132522
#     content: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.
#   - distance: 0.10858102967720984
#     content: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.
# - The Most Relevant Document and Its Distance to the Query:
#   - distance: 0.10858102967720984
#     content: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.
119 | 120 | -------------------------------------------------------------------------------- /examples/jina-ai-embeddings-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | PyMySQL 3 | openai==1.27.0 4 | SQLAlchemy 5 | tidb-vector>=0.0.9 6 | python-dotenv -------------------------------------------------------------------------------- /examples/langchain-agent-demo/.env.example: -------------------------------------------------------------------------------- 1 | # A example database URL to connect to a TiDB cluster from macOS: 2 | # mysql+pymysql://.root:@gateway01..prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true 3 | TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 4 | 5 | # The name of the language model to use for the language model-based retriever. 6 | LM_MODEL_NAME="" 7 | 8 | # The base URL of the Ollama API. 9 | OLLAMA_BASE_URL="http://:11434" 10 | 11 | # The API key to use for the Ollama API. 12 | OLLAMA_API_KEY="ollama" 13 | 14 | # sentence-transformers model 15 | SENTENCE_TRANSFORMERS_MODEL="" -------------------------------------------------------------------------------- /examples/langchain-agent-demo/README.md: -------------------------------------------------------------------------------- 1 | # LangChain Agent Demo 2 | 3 | An Agent demo, Classify and Extract information from text using TiDBVectorClient, LangChain, and LLM. 4 | 5 | e.g. 6 | input: "At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987." 7 | 8 | query related documents: 9 | - "At My Window (album) | At My Window is an album ... " 10 | - "Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. ... " 11 | - "Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. ... 
" 12 | 13 | classify the input text: `{"category": "album", "reason": "The document is about an album named 'At My Window'."}` 14 | 15 | This demo is similar to the official cookbook, but replaces the knowledge part with tidbVectorClient. It tests the 16 | project's compatibility with both the official features and LangChain. 17 | 18 | - https://cookbook.openai.com/examples/how_to_build_a_tool-using_agent_with_langchain 19 | - https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/ 20 | 21 | 22 | 23 | ## Prerequisites 24 | 25 | - TiDB Serverless cluster 26 | - Python 3.10 or later 27 | - Ollama or OpenAI 28 | - langchain==0.2.10 29 | - langchain-community==0.2.9 30 | 31 | ## Run the example 32 | 33 | ### Clone this repo 34 | 35 | ```bash 36 | git clone https://github.com/pingcap/tidb-vector-python.git 37 | ``` 38 | 39 | ### Create a virtual environment 40 | 41 | ```bash 42 | cd tidb-vector-python/examples/langchain-agent-demo 43 | python3 -m venv .venv 44 | source .venv/bin/activate 45 | ``` 46 | 47 | ### Install dependencies 48 | 49 | ```bash 50 | pip install -r requirements.txt 51 | ``` 52 | 53 | ### Set the environment variables 54 | 55 | Get the TiDB connection string via `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as 56 | described in the [Prerequisites](../README.md#prerequisites) section. 57 | 58 | The TiDB connection string will look like: 59 | 60 | ``` 61 | mysql+pymysql://{TIDB_USER}:{TIDB_PASSWORD}@{TIDB_HOST}:{TIDB_PORT}/{TIDB_DB_NAME}?ssl_verify_cert=True&ssl_verify_identity=True 62 | ``` 63 | 64 | ### Run the example 65 | ```text 66 | python ./tidb-vector-python/examples/langchain-agent-demo/example.py 67 | Connected to TiDB. 68 | describe table: 69 | {'success': True, 'result': 6, 'error': None} 70 | Initializing the retriever... 71 | Retriever initialized successfully. 72 | Loading sample data... 73 | sample_data.txt found. 74 | Sample data loaded successfully. 75 | Embedding sample data... 
76 | 0 At My Wind [-0.14979149401187897, 0.07634416222572327, 0.07299982756376266, 0.153825044631958, 0.04083935171365738] 77 | 1 Little Win [0.32180845737457275, 0.5461692214012146, -0.014786622487008572, 0.03591456636786461, -0.22666659951210022] 78 | 2 Storm Wind [-0.022210828959941864, 0.16006261110305786, 0.14314979314804077, -0.08256750553846359, 0.14658856391906738] 79 | Sample data embedded successfully. 80 | Sample data number: 3 81 | Inserting documents into TiDB... 82 | Documents inserted successfully. 83 | # ---- Init Finish ---- 84 | > Entering new RunnableSequence chain... 85 | > Entering new RunnableParallel chain... 86 | > Entering new RunnableSequence chain... 87 | > Entering new RunnablePassthrough chain... 88 | > Finished chain. 89 | {'At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt\'s first studio album in the nine years that followed 1978\'s "Flyin\' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high.': 0.6090894176961388, 'Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee.': 0.8308758434772159, 'Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released.': 0.9628706551444856} 90 | > Entering new RunnableLambda chain... 91 | > Finished chain. 92 | > Finished chain. 93 | > Finished chain. 94 | > Entering new PromptTemplate chain... 95 | > Finished chain. 96 | > Entering new OpenAIToolsAgentOutputParser chain... 97 | > Finished chain. 98 | > Finished chain. 
99 | [ToolAgentAction(tool='Classification', tool_input={'category': 'album', 'reason': "The document is about an album named 'At My Window'."}, log='\nInvoking: `Classification` with `{\'category\': \'album\', \'reason\': "The document is about an album named \'At My Window\'."}`\n\n\n', message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_wqo6r2px', 'function': {'arguments': '{"category":"album","reason":"The document is about an album named \'At My Window\'."}', 'name': 'Classification'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 397, 'total_tokens': 430}, 'model_name': 'mistral:latest', 'system_fingerprint': 'fp_ollama', 'finish_reason': 'stop', 'logprobs': None}, id='run-cb5b41e6-5978-4164-8ac8-16a9116e47bd-0', tool_calls=[{'name': 'Classification', 'args': {'category': 'album', 'reason': "The document is about an album named 'At My Window'."}, 'id': 'call_wqo6r2px', 'type': 'tool_call'}], usage_metadata={'input_tokens': 397, 'output_tokens': 33, 'total_tokens': 430})], tool_call_id='call_wqo6r2px')] 100 | ``` 101 | -------------------------------------------------------------------------------- /examples/langchain-agent-demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/langchain-agent-demo/__init__.py -------------------------------------------------------------------------------- /examples/langchain-agent-demo/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from langchain_core.pydantic_v1 import BaseModel, Field 4 | from utils import format_docs 5 | from knowledge_base import retriever 6 | from langchain_core.prompts import PromptTemplate 7 | from langchain_core.runnables import RunnablePassthrough 8 | from 
from langchain_core.messages import HumanMessage
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.callbacks import FileCallbackHandler, StdOutCallbackHandler
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from dotenv import find_dotenv
from loguru import logger

# Mirror all LangChain callback traffic both to a log file and to stdout.
logfile = "output.log"
logger.add(logfile, colorize=True, enqueue=True)
handler_file = FileCallbackHandler(logfile)
handler_strout = StdOutCallbackHandler()

_ = load_dotenv(find_dotenv())


class ClassEnum(str, Enum):
    """The closed set of categories a document may be classified into."""

    album = "album"
    director = "director"
    actor = "actor"
    book = "book"
    songwriter = "songwriter"
    musician = "musician"
    others = "others"


class Classification(BaseModel):
    """Classify the document into a category."""

    # ! Only Hinting category is not work for 'convert method', need to specify the values of the category in desc,
    category: ClassEnum = Field(
        description=f"The category of the document, should be one of the following values: {[e.value for e in ClassEnum]}"
    )
    reason: str = Field(description="The reason for the classification.")


# The chat model is served by Ollama through its OpenAI-compatible endpoint.
# The chosen model must support instruction function-calling.
model = ChatOpenAI(
    base_url=os.environ.get('OLLAMA_BASE_URL'),
    api_key=os.environ.get('OLLAMA_API_KEY'),
    model=os.environ.get('LM_MODEL_NAME'),
    temperature=0,
)

# Expose the Classification schema to the model as a callable tool and force
# the model to invoke it on every turn.
tools = [convert_to_openai_tool(Classification)]
model_with_tools = model.bind_tools(tools=tools, tool_choice='required')

parser = OpenAIToolsAgentOutputParser()
prompt = PromptTemplate(
    template="""
    You are an intelligent assistant, you will receive some documents about input, base on these info,
    tasked with classifying items based on their descriptions, use function calling 'Classification'.

    related documents: {documents}
    input: {input}
    """,
    input_variables=["documents", "input"],
)

# Retrieval fills the prompt, the tool-bound model emits a function call, and
# the parser turns that call into agent actions.
context_inputs = {"documents": retriever | format_docs, "input": RunnablePassthrough()}
chain = context_inputs | prompt | model_with_tools | parser

resp = chain.invoke(HumanMessage(content="At My Window"), {"callbacks": [handler_file, handler_strout]})
print(resp)


if __name__ == '__main__':
    pass
class TiRetriever(BaseRetriever):
    """A retriever that returns the top-k documents most relevant to the user query.

    This retriever only implements the sync method ``_get_relevant_documents``.

    If the retriever were to involve file access or network access, it could
    benefit from a native async implementation of ``_aget_relevant_documents``.

    As usual, with Runnables, there's a default async implementation that's
    provided that delegates to the sync implementation running on another
    thread.
    """

    # Vector database client, e.g. a TiDBVectorClient.
    rm: TiDBVectorClient
    # Maps a query string to its embedding vector.
    embedding_function: Callable[[str], Vector]
    # The number of top documents to return.
    k: int

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Sync implementation: embed the query, then return the k nearest documents."""
        query_embeddings = self.embedding_function(str(query))
        tidb_vector_res = self.rm.query(query_embeddings, k=self.k)
        # Keyed by document text, so duplicate passages collapse to one entry.
        passages_scores = {}
        for res in tidb_vector_res:
            passages_scores[res.document] = res.distance
        # Smaller cosine distance means more similar, so sort ascending to put
        # the most relevant documents first. (The previous reverse=True sort
        # returned the *least* similar documents first.)
        sorted_passages = sorted(passages_scores.items(), key=lambda x: x[1])
        return [Document(text) for (text, _score) in sorted_passages]
print("Embedding sample data...")
documents = []
# Only the first 3 passages are embedded to keep the demo fast.
for idx, passage in enumerate(sample_data.split('\n')[:3]):
    # Skip blank lines *before* paying for an embedding call; the previous
    # version embedded (and printed) empty passages and only skipped them
    # afterwards.
    if len(passage) == 0:
        continue
    embedding = embedding_function([passage])[0]
    print(idx, passage[:10], embedding[:5])
    documents.append(
        {
            "id": str(idx),
            "text": passage,
            "embedding": embedding,
            "metadata": {"category": "album"},
        }
    )
print("Sample data embedded successfully.")
print("Sample data number:", len(documents))

print("Inserting documents into TiDB...")
tidb_vector_client.insert(
    ids=[doc["id"] for doc in documents],
    texts=[doc["text"] for doc in documents],
    embeddings=[doc["embedding"] for doc in documents],
    metadatas=[doc["metadata"] for doc in documents],
)
print("Documents inserted successfully.")

print("# ---- Init Finish ----")
Although the songwriter had become less prolific, this release showed that the quality of his material remained high. 2 | Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee. 3 | Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released. 4 | Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Records on September 21, 1993 (see 1993 in music). The title is generally thought to refer to Etheridge's recent coming out as a lesbian, confirming long-standing rumors about her personal life. This is the album that gave Etheridge national recognition. The rock ballad "Come to My Window" was the first single released from the album, which peaked at No. 25 on the "Billboard" Hot 100, and its video featured actress Juliette Lewis having a nervous breakdown. This single brought the album into the public consciousness and was quickly followed by "I'm the Only One", which became a major hit and reached No. 8 on the Hot 100, and "If I Wanted To", which hit No. 16. 5 | The Great Victorian Collection | The Great Victorian Collection, published in 1975, is a novel by Northern Irish-Canadian writer Brian Moore. Set in Carmel, California, it tells the story of a man who dreams that the empty parking lot he can see from his hotel window has been transformed by the arrival of a collection of priceless Victoriana on display in a vast open-air market. When he awakes he finds that he can no longer distinguish the dream from reality. 
6 | Rosario Dawson | Rosario Isabel Dawson (born May 9, 1979) is an American actress, producer, singer, comic book writer, and political activist. She made her film debut in the 1995 teen drama "Kids". Her subsequent film roles include "He Got Game", "Men in Black II", "25th Hour", "Rent", "Sin City", "Death Proof", "Seven Pounds", "", and "Top Five". Dawson has also provided voice-over work for Disney and DC. 7 | Robert B. Sherman | Robert Bernard Sherman (December 19, 1925 – March 6, 2012) was an American songwriter who specialized in musical films with his brother Richard Morton Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "The Slipper and the Rose", and "Charlotte's Web". Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 8 | Richard M. Sherman | Richard Morton Sherman (born June 12, 1928) is an American songwriter who specialized in musical films with his brother Robert Bernard Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "Snoopy Come Home", "Bedknobs and Broomsticks", "The Slipper and the Rose", and "Charlotte's Web". 
Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 9 | Everything Has Changed | "Everything Has Changed" is a song written and performed by American singer-songwriter Taylor Swift and English singer-songwriter Ed Sheeran, taken from Swift's fourth studio album, "Red" (2012). Produced by Butch Walker, the track was released as the sixth single from the album on July 16, 2013. "Everything Has Changed" is a guitar ballad combining folk and pop genres about "wanting to get to know a new lover better". 10 | Everything Changes (Julian Lennon album) | Everything Changes is the sixth studio album by English singer-songwriter Julian Lennon. It was released on 2 October 2011. 11 | Janick Gers | Janick Robert Gers ( ; born 27 January 1957 in Hartlepool, England) is an English musician, best known for being one of the three current guitarists in Iron Maiden, along with Dave Murray and Adrian Smith, as well as his earlier work with Gillan and White Spirit. 12 | Dave Murray (musician) | David Michael "Dave" Murray (born 23 December 1956) is an English guitarist and songwriter best known as one of the earliest members of the British heavy metal band Iron Maiden. Along with the group's bassist and primary songwriter Steve Harris, Murray has appeared on all of the band's releases. 13 | Roy Z | Roy Z (born February, 1968) is an American guitarist, songwriter and producer, best known for his work with Bruce Dickinson (from Iron Maiden), Halford, and Judas Priest. He also is the founder of Tribe of Gypsies, a Latin influenced hard rock band. 14 | Heather Baker | Heather Baker (born October 9, 1984) is a female American songwriter, guitarist, producer and founder of the Electronica band Antiwave. 
def format_docs(docs):
    """Join the page contents of *docs*, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)
4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | - OpenAI [API key](https://platform.openai.com/docs/quickstart) 10 | 11 | ## Run the example 12 | 13 | ### Clone this repo 14 | 15 | ```bash 16 | git clone https://github.com/pingcap/tidb-vector-python.git 17 | ``` 18 | 19 | ### Create a virtual environment 20 | 21 | ```bash 22 | cd tidb-vector-python/examples/llamaindex-tidb-vector-with-ui 23 | python3 -m venv .venv 24 | source .venv/bin/activate 25 | ``` 26 | 27 | ### Install dependencies 28 | 29 | ```bash 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ### Set the environment variables 34 | 35 | Get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart) 36 | 37 | Get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. 38 | 39 | ```bash 40 | export OPENAI_API_KEY="sk-*******" 41 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 42 | export TIDB_USERNAME="****.root" 43 | export TIDB_PASSWORD="****" 44 | ``` 45 | 46 | ### Prepare data and run the server 47 | 48 | ```bash 49 | # prepare the data 50 | python app.py prepare 51 | 52 | # runserver 53 | python app.py runserver 54 | ``` 55 | 56 | Now you can visit [http://127.0.0.1:3000/](http://127.0.0.1:3000/) to interact with the RAG application. 
57 | -------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector-with-ui/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import logging 5 | import click 6 | import uvicorn 7 | import fastapi 8 | import asyncio 9 | from enum import Enum 10 | from sqlalchemy import URL 11 | from fastapi.encoders import jsonable_encoder 12 | from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse 13 | from fastapi.templating import Jinja2Templates 14 | from llama_index.core import VectorStoreIndex, StorageContext 15 | from llama_index.core.base.response.schema import StreamingResponse as llamaStreamingResponse 16 | from llama_index.vector_stores.tidbvector import TiDBVectorStore 17 | from llama_index.readers.web import SimpleWebPageReader 18 | 19 | 20 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 21 | logger = logging.getLogger() 22 | 23 | 24 | class EventType(Enum): 25 | META = 1 26 | ANSWER = 2 27 | 28 | 29 | logger.info("Initializing TiDB Vector Store....") 30 | tidb_connection_url = URL( 31 | "mysql+pymysql", 32 | username=os.environ['TIDB_USERNAME'], 33 | password=os.environ['TIDB_PASSWORD'], 34 | host=os.environ['TIDB_HOST'], 35 | port=4000, 36 | database="test", 37 | query={"ssl_verify_cert": True, "ssl_verify_identity": True}, 38 | ) 39 | tidbvec = TiDBVectorStore( 40 | connection_string=tidb_connection_url, 41 | table_name="llama_index_rag_test", 42 | distance_strategy="cosine", 43 | vector_dimension=1536, # Length of the vectors returned by the model 44 | drop_existing_table=False, 45 | ) 46 | tidb_vec_index = VectorStoreIndex.from_vector_store(tidbvec) 47 | storage_context = StorageContext.from_defaults(vector_store=tidbvec) 48 | query_engine = tidb_vec_index.as_query_engine(streaming=True) 49 | logger.info("TiDB Vector Store initialized successfully") 50 | 51 | 52 | def do_prepare_data(): 53 | 
logger.info("Preparing the data for the application") 54 | documents = SimpleWebPageReader(html_to_text=True).load_data( 55 | ["http://paulgraham.com/worked.html"] 56 | ) 57 | tidb_vec_index.from_documents(documents, storage_context=storage_context, show_progress=True) 58 | logger.info("Data preparation complete") 59 | 60 | 61 | # https://stackoverflow.com/questions/76288582/is-there-a-way-to-stream-output-in-fastapi-from-the-response-i-get-from-llama-in 62 | async def astreamer(response: llamaStreamingResponse): 63 | try: 64 | meta = json.dumps(jsonable_encoder(list(vars(node) for node in response.source_nodes))) 65 | yield f'{EventType.META.value}: {meta}\n\n' 66 | for i in response.response_gen: 67 | yield f'{EventType.ANSWER.value}: {i}\n\n' 68 | await asyncio.sleep(.1) 69 | except asyncio.CancelledError as e: 70 | print('cancelled') 71 | 72 | 73 | app = fastapi.FastAPI() 74 | templates = Jinja2Templates(directory="templates") 75 | 76 | 77 | @app.get('/', response_class=HTMLResponse) 78 | def index(request: fastapi.Request): 79 | return templates.TemplateResponse("index.html", {"request": request}) 80 | 81 | 82 | @app.get('/ask') 83 | async def ask(q: str): 84 | response = query_engine.query(q) 85 | return StreamingResponse(astreamer(response), media_type='text/event-stream') 86 | 87 | 88 | @click.group(context_settings={'max_content_width': 150}) 89 | def cli(): 90 | pass 91 | 92 | 93 | @cli.command() 94 | @click.option('--host', default='127.0.0.1', help="Host, default=127.0.0.1") 95 | @click.option('--port', default=3000, help="Port, default=3000") 96 | @click.option('--reload', is_flag=True, help="Enable auto-reload") 97 | def runserver(host, port, reload): 98 | uvicorn.run( 99 | "__main__:app", host=host, port=port, reload=reload, 100 | log_level="debug", workers=1, 101 | ) 102 | 103 | 104 | @cli.command() 105 | def prepare(): 106 | do_prepare_data() 107 | 108 | 109 | if __name__ == '__main__': 110 | cli() 111 | 
-------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector-with-ui/requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | fastapi 3 | uvicorn 4 | Jinja2 5 | llama-index 6 | llama-index-readers-web 7 | llama-index-vector-stores-tidbvector -------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector-with-ui/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LlamaIndex & TiDB RAG Demo 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |
17 |
18 | 22 | 27 |
28 |
29 |

Answer Body

30 |
31 |
Empty
32 |
33 |
34 |
35 |

Chunks Retrieved

36 |
37 |
    38 |
  • 39 |
40 |
41 |
42 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/examples/llamaindex-tidb-vector/README.md:
--------------------------------------------------------------------------------
1 | # LlamaIndex RAG Example
2 |
3 | This example demonstrates how to use LlamaIndex and TiDB Serverless to build a simple RAG (Retrieval-Augmented Generation) application. It crawls an example webpage and indexes the content into TiDB Serverless with LlamaIndex, then uses LlamaIndex to search the content and generate the answer with OpenAI.
4 |
5 | ## Prerequisites
6 |
7 | - A running TiDB Serverless cluster with vector search enabled
8 | - Python 3.8 or later
9 | - OpenAI [API key](https://platform.openai.com/docs/quickstart)
10 |
11 | ## Run the example
12 |
13 | ### Clone this repo
14 |
15 | ```bash
16 | git clone https://github.com/pingcap/tidb-vector-python.git
17 | ```
18 |
19 | ### Create a virtual environment
20 |
21 | ```bash
22 | cd tidb-vector-python/examples/llamaindex-tidb-vector
23 | python3 -m venv .venv
24 | source .venv/bin/activate
25 | ```
26 |
27 | ### Install dependencies
28 |
29 | ```bash
30 | pip install -r requirements.txt
31 | ```
32 |
33 | ### Set the environment variables
34 |
35 | Get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart)
36 |
37 | Get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.
38 | 39 | ```bash 40 | export OPENAI_API_KEY="sk-*******" 41 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 42 | export TIDB_USERNAME="****.root" 43 | export TIDB_PASSWORD="****" 44 | ``` 45 | 46 | ### Run this example 47 | 48 | ```text 49 | $ python chat_with_url.py --help 50 | Usage: chat_with_url.py [OPTIONS] 51 | 52 | Options: 53 | --url TEXT URL you want to talk to, 54 | default=https://docs.pingcap.com/tidb/stable/overview 55 | --help Show this message and exit. 56 | $ 57 | $ python chat_with_url.py 58 | Enter your question: tidb vs mysql 59 | TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. It is MySQL compatible and features horizontal scalability, strong consistency, and high availability. TiDB is designed to provide users with a one-stop database solution that covers OLTP, OLAP, and HTAP services. It offers easy horizontal scaling, financial-grade high availability, real-time HTAP capabilities, cloud-native features, and compatibility with the MySQL protocol and ecosystem. 
60 | Enter your question: 61 | ``` -------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector/chat_with_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | import click 5 | from sqlalchemy import URL 6 | from llama_index.core import VectorStoreIndex, StorageContext 7 | from llama_index.vector_stores.tidbvector import TiDBVectorStore # type: ignore 8 | from llama_index.readers.web import SimpleWebPageReader 9 | 10 | 11 | tidb_connection_url = URL( 12 | "mysql+pymysql", 13 | username=os.environ['TIDB_USERNAME'], 14 | password=os.environ['TIDB_PASSWORD'], 15 | host=os.environ['TIDB_HOST'], 16 | port=4000, 17 | database="test", 18 | query={"ssl_verify_cert": True, "ssl_verify_identity": True}, 19 | ) 20 | tidbvec = TiDBVectorStore( 21 | connection_string=tidb_connection_url, 22 | table_name="llama_index_rag_test", 23 | distance_strategy="cosine", 24 | vector_dimension=1536, # The dimension is decided by the model 25 | drop_existing_table=False, 26 | ) 27 | tidb_vec_index = VectorStoreIndex.from_vector_store(tidbvec) 28 | storage_context = StorageContext.from_defaults(vector_store=tidbvec) 29 | query_engine = tidb_vec_index.as_query_engine(streaming=True) 30 | 31 | 32 | def do_prepare_data(url): 33 | documents = SimpleWebPageReader(html_to_text=True).load_data([url,]) 34 | tidb_vec_index.from_documents(documents, storage_context=storage_context, show_progress=True) 35 | 36 | 37 | _default_url = 'https://docs.pingcap.com/tidb/stable/overview' 38 | 39 | @click.command() 40 | @click.option('--url',default=_default_url, 41 | help=f'URL you want to talk to, default={_default_url}') 42 | def chat_with_url(url): 43 | do_prepare_data(url) 44 | while True: 45 | question = click.prompt("Enter your question") 46 | response = query_engine.query(question) 47 | click.echo(response) 48 | 49 | if __name__ == '__main__': 50 | chat_with_url() 51 | 
-------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector/requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.1.7 2 | SQLAlchemy==2.0.29 3 | llama-index==0.10.29 4 | llama-index-readers-web==0.1.8 5 | llama-index-vector-stores-tidbvector==0.1.2 6 | -------------------------------------------------------------------------------- /examples/openai_embedding/README.md: -------------------------------------------------------------------------------- 1 | # OpenAI Embedding Example 2 | 3 | This example demonstrates how to utilize OpenAI embedding for semantic search. According to OpenAI's [documentation](https://platform.openai.com/docs/guides/embeddings/which-distance-function-should-i-use), we will use cosine similarity to calculate vector distance. 4 | 5 | You can run this example in two ways: 6 | 7 | - [Run in Jupyter Notebook](#jupyter-notebook) 8 | - [Run in Local](#run-in-local) 9 | 10 | ## Jupyter Notebook 11 | 12 | Notebook: [example.ipynb](./example.ipynb) 13 | 14 | Try it in the [Google colab](https://colab.research.google.com/github/pingcap/tidb-vector-python/blob/main/examples/openai_embedding/example.ipynb). 15 | 16 | ## Run in Local 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | python3 -m venv .venv 22 | source .venv/bin/activate 23 | ``` 24 | 25 | ### Install the requirements 26 | 27 | ```bash 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | ### Set the environment variables 32 | 33 | Get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart) 34 | 35 | Get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. 
36 | 37 | ```bash 38 | export OPENAI_API_KEY="sk-*******" 39 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 40 | export TIDB_USERNAME="****.root" 41 | export TIDB_PASSWORD="****" 42 | ``` 43 | 44 | ### Run the example 45 | 46 | ```bash 47 | python3 example.py 48 | ``` 49 | -------------------------------------------------------------------------------- /examples/openai_embedding/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Example of Embedding\n", 7 | "\n", 8 | "It is an embedding example that uses `tidb_vector_python` as its library." 9 | ], 10 | "metadata": { 11 | "id": "ewKGZW06kmIv" 12 | } 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "id": "F1fsS576izUl" 18 | }, 19 | "source": [ 20 | "## Install Dependencies" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "id": "pTpKX_lDizUp" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "%%capture\n", 32 | "%pip install openai peewee pymysql tidb_vector" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "psEHGWiHizUq" 39 | }, 40 | "source": [ 41 | "## Preapre the environment\n", 42 | "\n", 43 | "> **Note:**\n", 44 | ">\n", 45 | "> - You can get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart).\n", 46 | "> - You can get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.\n", 47 | "\n", 48 | "Set the embedding model as `text-embedding-3-small`, and\n", 49 | "the amount of embedding dimensions is `1536`." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "id": "MgKOjwmYizUq" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import getpass\n", 61 | "\n", 62 | "OPENAI_API_KEY = getpass.getpass(\"Enter your OpenAI API key: \")\n", 63 | "TIDB_HOST = input(\"Enter your TiDB host: \")\n", 64 | "TIDB_USERNAME = input(\"Enter your TiDB username: \")\n", 65 | "TIDB_PASSWORD = getpass.getpass(\"Enter your TiDB password: \")\n", 66 | "\n", 67 | "embedding_model = \"text-embedding-3-small\"\n", 68 | "embedding_dimensions = 1536" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "3WbH_BITizUr" 75 | }, 76 | "source": [ 77 | "## Initial the Clients of OpenAI and Database" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "id": "UWtcs58-izUr" 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "from openai import OpenAI\n", 89 | "from peewee import Model, MySQLDatabase, TextField, SQL\n", 90 | "from tidb_vector.peewee import VectorField\n", 91 | "\n", 92 | "client = OpenAI(api_key=OPENAI_API_KEY)\n", 93 | "db = MySQLDatabase(\n", 94 | " 'test',\n", 95 | " user=TIDB_USERNAME,\n", 96 | " password=TIDB_PASSWORD,\n", 97 | " host=TIDB_HOST,\n", 98 | " port=4000,\n", 99 | " ssl_verify_cert=True,\n", 100 | " ssl_verify_identity=True\n", 101 | ")\n", 102 | "db.connect()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "uOyjrmWJizUr" 109 | }, 110 | "source": [ 111 | "## Prepare the Context\n", 112 | "\n", 113 | "In this case, contexts are the documents, use the openai embeddings model to get the embeddings of the documents, and store them in the TiDB." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "id": "_e5P_m0MizUs" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "documents = [\n", 125 | " \"TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.\",\n", 126 | " \"TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.\",\n", 127 | " \"TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. \",\n", 128 | "]\n", 129 | "\n", 130 | "class DocModel(Model):\n", 131 | " text = TextField()\n", 132 | " embedding = VectorField(dimensions=embedding_dimensions)\n", 133 | "\n", 134 | " class Meta:\n", 135 | " database = db\n", 136 | " table_name = \"openai_embedding_test\"\n", 137 | "\n", 138 | " def __str__(self):\n", 139 | " return self.text\n", 140 | "\n", 141 | "db.drop_tables([DocModel])\n", 142 | "db.create_tables([DocModel])\n", 143 | "\n", 144 | "embeddings = [\n", 145 | " r.embedding\n", 146 | " for r in client.embeddings.create(\n", 147 | " input=documents, model=embedding_model\n", 148 | " ).data\n", 149 | "]\n", 150 | "data_source = [\n", 151 | " {\"text\": doc, \"embedding\": emb}\n", 152 | " for doc, emb in zip(documents, embeddings)\n", 153 | "]\n", 154 | "DocModel.insert_many(data_source).execute()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "zMP-P1g8izUs" 161 | }, 162 | "source": [ 163 | "## Initial the Vector of Question\n", 164 | "\n", 165 | "Ask a question, use the openai embeddings model to get the embeddings 
of the question" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "id": "-zrTOxs4izUt" 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "question = \"what is TiKV?\"\n", 177 | "question_embedding = client.embeddings.create(input=question, model=embedding_model).data[0].embedding" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "id": "atc0gXVZizUt" 184 | }, 185 | "source": [ 186 | "## Retrieve by Cosine Distance of Vectors\n", 187 | "Get the relevant documents from the TiDB by comparing the embeddings of the question and the documents" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "id": "DTtJRX64izUt" 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "related_docs = DocModel.select(\n", 199 | " DocModel.text, DocModel.embedding.cosine_distance(question_embedding).alias(\"distance\")\n", 200 | ").order_by(SQL(\"distance\")).limit(3)\n", 201 | "\n", 202 | "print(\"Question:\", question)\n", 203 | "print(\"Related documents:\")\n", 204 | "for doc in related_docs:\n", 205 | " print(doc.distance, doc.text)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "source": [ 211 | "## Cleanup" 212 | ], 213 | "metadata": { 214 | "id": "bYBetPchmNUp" 215 | } 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "id": "Lh27gC7gizUt" 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "db.close()" 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": ".venv", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.11.6" 246 | }, 247 | 
"colab": { 248 | "provenance": [], 249 | "toc_visible": true 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 0 254 | } 255 | -------------------------------------------------------------------------------- /examples/openai_embedding/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from openai import OpenAI 3 | from peewee import Model, MySQLDatabase, TextField, SQL 4 | from tidb_vector.peewee import VectorField 5 | 6 | # Init OpenAI client 7 | # In this example, we use the text-embedding-3-small model to generate embeddings 8 | client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY')) 9 | embedding_model = "text-embedding-3-small" 10 | embedding_dimensions = 1536 11 | 12 | # Init TiDB connection 13 | # Note: TiDB Serverless requires secure connection, so we need to set ssl_verify_cert and ssl_verify_identity to True 14 | # Remember to set the environment variables with your own TiDB credentials 15 | db = MySQLDatabase( 16 | 'test', 17 | user=os.environ.get('TIDB_USERNAME'), 18 | password=os.environ.get('TIDB_PASSWORD'), 19 | host=os.environ.get('TIDB_HOST'), 20 | port=4000, 21 | ssl_verify_cert=True, 22 | ssl_verify_identity=True 23 | ) 24 | 25 | documents = [ 26 | "TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.", 27 | "TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.", 28 | "TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. 
", 29 | ] 30 | 31 | # Define a model with a VectorField to store the embeddings 32 | class DocModel(Model): 33 | text = TextField() 34 | embedding = VectorField(dimensions=embedding_dimensions) 35 | 36 | class Meta: 37 | database = db 38 | table_name = "openai_embedding_test" 39 | 40 | def __str__(self): 41 | return self.text 42 | 43 | db.connect() 44 | db.drop_tables([DocModel]) 45 | db.create_tables([DocModel]) 46 | 47 | # Insert the documents and their embeddings into TiDB 48 | embeddings = [ 49 | r.embedding 50 | for r in client.embeddings.create( 51 | input=documents, model=embedding_model 52 | ).data 53 | ] 54 | data_source = [ 55 | {"text": doc, "embedding": emb} 56 | for doc, emb in zip(documents, embeddings) 57 | ] 58 | DocModel.insert_many(data_source).execute() 59 | 60 | # Query the most similar documents to a question 61 | # 1. Generate the embedding of the question 62 | # 2. Query the most similar documents based on the cosine distance in TiDB 63 | # 3. Print the results 64 | question = "what is TiKV?" 
65 | question_embedding = client.embeddings.create(input=question, model=embedding_model).data[0].embedding 66 | related_docs = DocModel.select( 67 | DocModel.text, DocModel.embedding.cosine_distance(question_embedding).alias("distance") 68 | ).order_by(SQL("distance")).limit(3) 69 | 70 | print("Question:", question) 71 | print("Related documents:") 72 | for doc in related_docs: 73 | print(doc.distance, doc.text) 74 | 75 | db.close() -------------------------------------------------------------------------------- /examples/openai_embedding/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | peewee 3 | pymysql 4 | tidb-vector -------------------------------------------------------------------------------- /examples/orm-django-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | TIDB_HOST='xxxxxxxx.aws.tidbcloud.com' 2 | TIDB_PORT='4000' 3 | TIDB_USERNAME='xxxxxxxxxxx.root' 4 | TIDB_PASSWORD='xxxxxxx' 5 | TIDB_DATABASE='test' 6 | # The CA certificate file path. 7 | # The example path is for macOS. 8 | # For other platforms, please refer https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path. 
9 | TIDB_CA_PATH='/etc/ssl/cert.pem' -------------------------------------------------------------------------------- /examples/orm-django-quickstart/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /examples/orm-django-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Integrate TiDB Vector Search with Django ORM 2 | 3 | This is a simple demo to show how to integrate TiDB Vector Search with the Django ORM to search for similar text in a TiDB Serverless cluster. 4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/orm-django-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Create a `.env` file via the following command. 35 | 36 | ```shell 37 | cp .env.example .env 38 | ``` 39 | 40 | Copy the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then set up the following environment variables in the `.env` file. 41 | 42 | ```bash 43 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 44 | TIDB_PORT=4000 45 | TIDB_USERNAME=******.root 46 | TIDB_PASSWORD=******** 47 | TIDB_DATABASE=test 48 | # For macOS. For other platforms, please refer https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path . 
49 | TIDB_CA_PATH=/etc/ssl/cert.pem
50 | ```
51 |
52 | ### Run this example
53 |
54 | Migrate the table schema:
55 |
56 | ```shell
57 | python manage.py migrate
58 | ```
59 |
60 | Run the server:
61 |
62 | ```shell
63 | python manage.py runserver
64 | ```
65 |
66 | Open your browser and visit `http://localhost:8000/`.
67 |
--------------------------------------------------------------------------------
/examples/orm-django-quickstart/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Django's command-line utility for administrative tasks."""
3 | import os
4 | import sys
5 |
6 |
7 | def main():
8 |     """Run administrative tasks."""
9 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sample_project.settings")
10 |     try:
11 |         from django.core.management import execute_from_command_line
12 |     except ImportError as exc:
13 |         raise ImportError(
14 |             "Couldn't import Django. Are you sure it's installed and "
15 |             "available on your PYTHONPATH environment variable? Did you "
16 |             "forget to activate a virtual environment?"
17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | Django==4.2.4 2 | django-tidb>=5.0.1 3 | mysqlclient==2.2.0 4 | python-dotenv==1.0.0 5 | tidb-vector>=0.0.9 6 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/orm-django-quickstart/sample_project/__init__.py -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for sample_project project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sample_project.settings") 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/forms.py: -------------------------------------------------------------------------------- 1 | from django import forms 2 | from django.db import models, transaction 3 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 4.2.4 on 2024-11-05 05:36 2 | 3 | from django.db import migrations, models 4 | import django_tidb.fields.vector 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | initial = True 10 | 11 | dependencies = [] 12 | 13 | operations = [ 14 | migrations.CreateModel( 15 | name="Document", 16 | fields=[ 17 | ( 18 | "id", 19 | models.BigAutoField( 20 | auto_created=True, 21 | primary_key=True, 22 | serialize=False, 23 | verbose_name="ID", 24 | ), 25 | ), 26 | ("content", models.TextField()), 27 | ("embedding", django_tidb.fields.vector.VectorField(dimensions=3)), 28 | ], 29 | options={ 30 | "indexes": [ 31 | django_tidb.fields.vector.VectorIndex( 32 | django_tidb.fields.vector.L2Distance("embedding"), name="idx_l2" 33 | ), 34 | django_tidb.fields.vector.VectorIndex( 35 | django_tidb.fields.vector.CosineDistance("embedding"), 36 | name="idx_cos", 37 | ), 38 | ], 39 | }, 40 | ), 41 | ] 42 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/migrations/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/orm-django-quickstart/sample_project/migrations/__init__.py -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django_tidb.fields.vector import VectorField, VectorIndex, CosineDistance, L2Distance 3 | 4 | 5 | class Document(models.Model): 6 | content = models.TextField() 7 | embedding = VectorField(dimensions=3) 8 | class Meta: 9 | indexes = [ 10 | VectorIndex(L2Distance("embedding"), name='idx_l2'), 11 | VectorIndex(CosineDistance("embedding"), name='idx_cos'), 12 | ] 13 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for sample_project project. 3 | 4 | Generated by 'django-admin startproject' using Django 4.2.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/4.2/ref/settings/ 11 | """ 12 | import os 13 | from pathlib import Path 14 | 15 | import dotenv 16 | 17 | dotenv.load_dotenv() 18 | 19 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 20 | BASE_DIR = Path(__file__).resolve().parent.parent 21 | 22 | 23 | # Quick-start development settings - unsuitable for production 24 | # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ 25 | 26 | # SECURITY WARNING: keep the secret key used in production secret! 
27 | SECRET_KEY = "django-insecure-axer15+p=pea&u%9e4t^3314jagd+1e$5!i9%oh1^yu@1hf6w4" 28 | 29 | # SECURITY WARNING: don't run with debug turned on in production! 30 | DEBUG = True 31 | 32 | ALLOWED_HOSTS = [] 33 | 34 | 35 | # Application definition 36 | 37 | INSTALLED_APPS = [ 38 | "django.contrib.admin", 39 | "django.contrib.auth", 40 | "django.contrib.contenttypes", 41 | "django.contrib.sessions", 42 | "django.contrib.messages", 43 | "django.contrib.staticfiles", 44 | "sample_project", 45 | ] 46 | 47 | MIDDLEWARE = [ 48 | "django.middleware.security.SecurityMiddleware", 49 | "django.contrib.sessions.middleware.SessionMiddleware", 50 | "django.middleware.common.CommonMiddleware", 51 | "django.middleware.csrf.CsrfViewMiddleware", 52 | "django.contrib.auth.middleware.AuthenticationMiddleware", 53 | "django.contrib.messages.middleware.MessageMiddleware", 54 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 55 | ] 56 | 57 | ROOT_URLCONF = "sample_project.urls" 58 | 59 | TEMPLATES = [ 60 | { 61 | "BACKEND": "django.template.backends.django.DjangoTemplates", 62 | "DIRS": [], 63 | "APP_DIRS": True, 64 | "OPTIONS": { 65 | "context_processors": [ 66 | "django.template.context_processors.debug", 67 | "django.template.context_processors.request", 68 | "django.contrib.auth.context_processors.auth", 69 | "django.contrib.messages.context_processors.messages", 70 | ], 71 | }, 72 | }, 73 | ] 74 | 75 | WSGI_APPLICATION = "sample_project.wsgi.application" 76 | 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/4.2/ref/settings/#databases 80 | 81 | DATABASES = { 82 | "default": { 83 | # https://github.com/pingcap/django-tidb 84 | "ENGINE": "django_tidb", 85 | "HOST": os.environ.get("TIDB_HOST", "127.0.0.1"), 86 | "PORT": int(os.environ.get("TIDB_PORT", 4000)), 87 | "USER": os.environ.get("TIDB_USERNAME", "root"), 88 | "PASSWORD": os.environ.get("TIDB_PASSWORD", ""), 89 | "NAME": os.environ.get("TIDB_DATABASE", "test"), 90 | "OPTIONS": { 91 | "charset": 
"utf8mb4", 92 | }, 93 | } 94 | } 95 | 96 | TIDB_CA_PATH = os.environ.get("TIDB_CA_PATH", "") 97 | if TIDB_CA_PATH: 98 | DATABASES["default"]["OPTIONS"]["ssl_mode"] = "VERIFY_IDENTITY" 99 | DATABASES["default"]["OPTIONS"]["ssl"] = { 100 | "ca": TIDB_CA_PATH, 101 | } 102 | 103 | # Password validation 104 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators 105 | 106 | AUTH_PASSWORD_VALIDATORS = [ 107 | { 108 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", 109 | }, 110 | { 111 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", 112 | }, 113 | { 114 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", 115 | }, 116 | { 117 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", 118 | }, 119 | ] 120 | 121 | 122 | # Internationalization 123 | # https://docs.djangoproject.com/en/4.2/topics/i18n/ 124 | 125 | LANGUAGE_CODE = "en-us" 126 | 127 | TIME_ZONE = "UTC" 128 | 129 | USE_I18N = True 130 | 131 | USE_TZ = True 132 | 133 | 134 | # Static files (CSS, JavaScript, Images) 135 | # https://docs.djangoproject.com/en/4.2/howto/static-files/ 136 | 137 | STATIC_URL = "static/" 138 | 139 | # Default primary key field type 140 | # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field 141 | 142 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" 143 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL configuration for sample_project project. 3 | 4 | The `urlpatterns` list routes URLs to views. For more information please see: 5 | https://docs.djangoproject.com/en/4.2/topics/http/urls/ 6 | Examples: 7 | Function views 8 | 1. Add an import: from my_app import views 9 | 2. 
Add a URL to urlpatterns: path('', views.home, name='home') 10 | Class-based views 11 | 1. Add an import: from other_app.views import Home 12 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 13 | Including another URLconf 14 | 1. Import the include() function: from django.urls import include, path 15 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 16 | """ 17 | from django.urls import path 18 | 19 | from . import views 20 | 21 | urlpatterns = [ 22 | path("", views.list_routes, name="index"), 23 | path("insert_documents", views.insert_documents, name="insert_documents"), 24 | path("get_nearest_neighbors_documents", views.get_nearest_neighbors_documents, name="get_nearest_neighbors_documents"), 25 | path("get_documents_within_distance", views.get_documents_within_distance, name="get_documents_within_distance"), 26 | ] 27 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/views.py: -------------------------------------------------------------------------------- 1 | from django.http import HttpResponse, JsonResponse 2 | from django_tidb.fields.vector import CosineDistance 3 | 4 | from .models import Document 5 | 6 | 7 | # Insert 3 documents. 8 | def insert_documents(request): 9 | Document.objects.create(content="dog", embedding=[1, 2, 1]) 10 | Document.objects.create(content="fish", embedding=[1, 2, 4]) 11 | Document.objects.create(content="tree", embedding=[1, 0, 0]) 12 | 13 | return HttpResponse("Insert documents successfully.") 14 | 15 | 16 | # Get 3-nearest neighbor documents. 
17 | def get_nearest_neighbors_documents(request): 18 | results = Document.objects.annotate( 19 | distance=CosineDistance('embedding', [1, 2, 3]) 20 | ).order_by('distance')[:3] 21 | response = [] 22 | for doc in results: 23 | response.append({ 24 | 'distance': doc.distance, 25 | 'document': doc.content 26 | }) 27 | 28 | return JsonResponse(response, safe=False) 29 | 30 | 31 | # Get documents within a certain distance. 32 | def get_documents_within_distance(request): 33 | results = Document.objects.annotate( 34 | distance=CosineDistance('embedding', [1, 2, 3]) 35 | ).filter(distance__lt=0.2).order_by('distance')[:3] 36 | response = [] 37 | for doc in results: 38 | response.append({ 39 | 'distance': doc.distance, 40 | 'document': doc.content 41 | }) 42 | 43 | return JsonResponse(response, safe=False) 44 | 45 | 46 | def list_routes(request): 47 | return JsonResponse({ 48 | 'routes': [ 49 | '/insert_documents', 50 | '/get_nearest_neighbors_documents', 51 | '/get_documents_within_distance' 52 | ] 53 | }) 54 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for sample_project project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sample_project.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 2 | TIDB_PORT=4000 3 | TIDB_USERNAME=******.root 4 | TIDB_PASSWORD=******** 5 | TIDB_DATABASE=test 6 | # TiDB Serverless Cluster requires SSL connection for public network access. 7 | # For local TiDB cluster, please set TIDB_SSL=false to disable SSL. 8 | TIDB_SSL=true -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Integrate TiDB Vector Search with Peewee ORM 2 | 3 | This is a simple demo to show how to integrate TiDB Vector Search with the Peewee ORM to search for similar text in a TiDB Serverless cluster. 4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/orm-peewee-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Create a `.env` file via the following command. 
35 | 36 | ```shell 37 | cp .env.example .env 38 | ``` 39 | 40 | Copy the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then set up the following environment variables in the `.env` file. 41 | 42 | ```bash 43 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 44 | TIDB_PORT=4000 45 | TIDB_USERNAME=******.root 46 | TIDB_PASSWORD=******** 47 | TIDB_DATABASE=test 48 | # For macOS. For other platforms, please refer https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path . 49 | TIDB_CA_PATH=/etc/ssl/cert.pem 50 | ``` 51 | 52 | ### Run this example 53 | 54 | ```text 55 | $ python peewee-quickstart.py 56 | Get 3-nearest neighbor documents: 57 | - distance: 0.00853986601633272 58 | document: fish 59 | - distance: 0.12712843905603044 60 | document: dog 61 | - distance: 0.7327387580875756 62 | document: tree 63 | Get documents within a certain distance: 64 | - distance: 0.00853986601633272 65 | document: fish 66 | - distance: 0.12712843905603044 67 | document: dog 68 | ``` -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/peewee-quickstart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dotenv 3 | 4 | from tidb_vector.peewee import VectorField, VectorAdaptor 5 | from tidb_vector.constants import DistanceMetric 6 | from peewee import Model, MySQLDatabase, TextField 7 | 8 | dotenv.load_dotenv() 9 | 10 | # Step 1: Connect to TiDB using Peewee. 11 | 12 | # Using `pymysql` as the driver. 13 | ssl_kwargs = { 14 | 'ssl_verify_cert': True, 15 | 'ssl_verify_identity': True, 16 | } 17 | 18 | # Using `mysqlclient` as the driver. 
19 | # ssl_kwargs = { 20 | # 'ssl_mode': 'VERIFY_IDENTITY', 21 | # 'ssl': { 22 | # # Root certificate default path 23 | # # https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters/#root-certificate-default-path 24 | # 'ca': os.environ.get('TIDB_CA_PATH', '/path/to/ca.pem'), 25 | # }, 26 | # } 27 | 28 | db = MySQLDatabase( 29 | database=os.environ.get('TIDB_DATABASE', 'test'), 30 | user=os.environ.get('TIDB_USERNAME', 'root'), 31 | password=os.environ.get('TIDB_PASSWORD', ''), 32 | host=os.environ.get('TIDB_HOST', 'localhost'), 33 | port=int(os.environ.get('TIDB_PORT', '4000')), 34 | **ssl_kwargs if os.environ.get('TIDB_SSL', 'false').lower() == 'true' else {}, 35 | ) 36 | 37 | 38 | # Step 2: Define a table with a vector column. 39 | 40 | # Create table without HNSW index. 41 | class Document(Model): 42 | class Meta: 43 | database = db 44 | table_name = 'peewee_demo_documents' 45 | 46 | content = TextField() 47 | embedding = VectorField(3) 48 | 49 | 50 | # Create table with HNSW index. 51 | class DocumentWithIndex(Model): 52 | class Meta: 53 | database = db 54 | table_name = 'peewee_demo_documents_with_index' 55 | 56 | content = TextField() 57 | embedding = VectorField(3) 58 | 59 | 60 | db.connect() 61 | db.drop_tables([Document, DocumentWithIndex]) 62 | db.create_tables([Document, DocumentWithIndex]) 63 | VectorAdaptor(db).create_vector_index( 64 | DocumentWithIndex.embedding, 65 | DistanceMetric.COSINE, 66 | ) 67 | 68 | # Step 3. Insert embeddings into the table. 69 | Document.create(content='dog', embedding=[1, 2, 1]) 70 | Document.create(content='fish', embedding=[1, 2, 4]) 71 | Document.create(content='tree', embedding=[1, 0, 0]) 72 | 73 | # Step 4. Get the 3-nearest neighbor documents. 
74 | print('Get 3-nearest neighbor documents:') 75 | distance = Document.embedding.cosine_distance([1, 2, 3]).alias('distance') 76 | results = Document.select(Document, distance).order_by(distance).limit(3) 77 | 78 | for doc in results: 79 | print(f' - distance: {doc.distance}\n' 80 | f' document: {doc.content}') 81 | 82 | # Step 5. Get documents within a certain distance. 83 | print('Get documents within a certain distance:') 84 | distance_expression = Document.embedding.cosine_distance([1, 2, 3]) 85 | distance = distance_expression.alias('distance') 86 | results = Document.select(Document, distance).where(distance_expression < 0.2).order_by(distance).limit(3) 87 | 88 | for doc in results: 89 | print(f' - distance: {doc.distance}\n' 90 | f' document: {doc.content}') 91 | -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | PyMySQL==1.1.0 2 | python-dotenv==1.0.0 3 | peewee==3.17.5 4 | tidb-vector>=0.0.9 -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 2 | TIDB_PORT=4000 3 | TIDB_USERNAME=******.root 4 | TIDB_PASSWORD=******** 5 | TIDB_DATABASE=test 6 | # TiDB Serverless Cluster requires SSL connection for public network access. 7 | # For local TiDB cluster, please set TIDB_SSL=false to disable SSL. 
8 | TIDB_SSL=true -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Integrate TiDB Vector Search with SQLAlchemy ORM 2 | 3 | This is a simple demo to show how to integrate TiDB Vector Search with the SQLAlchemy ORM to search for similar text in a TiDB Serverless cluster. 4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/orm-sqlalchemy-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Create a `.env` file via the following command. 35 | 36 | ```shell 37 | cp .env.example .env 38 | ``` 39 | 40 | Copy the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then replace the placeholders in the `.env` file. 
41 | 42 | ```bash 43 | TIDB_DATABASE_URL=mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true 44 | ``` 45 | 46 | ### Run this example 47 | 48 | ```text 49 | $ python sqlalchemy-quickstart.py 50 | Get 3-nearest neighbor documents: 51 | - distance: 0.00853986601633272 52 | document: fish 53 | - distance: 0.12712843905603044 54 | document: dog 55 | - distance: 0.7327387580875756 56 | document: tree 57 | Get documents within a certain distance: 58 | - distance: 0.00853986601633272 59 | document: fish 60 | - distance: 0.12712843905603044 61 | document: dog 62 | ``` -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | PyMySQL==1.1.0 2 | python-dotenv==1.0.0 3 | SQLAlchemy==2.0.30 4 | tidb-vector>=0.0.9 -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/sqlalchemy-quickstart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dotenv 3 | 4 | from sqlalchemy import Column, Integer, create_engine, Text, URL 5 | from sqlalchemy.orm import declarative_base, Session 6 | from tidb_vector.sqlalchemy import VectorType, VectorAdaptor 7 | from tidb_vector.constants import DistanceMetric 8 | 9 | dotenv.load_dotenv() 10 | 11 | # Step 1: Connect to TiDB using SQLAlchemy. 12 | 13 | # Using `pymysql` as the driver. 14 | drivername = 'mysql+pymysql' 15 | ssl_kwargs = { 16 | 'ssl_verify_cert': 'true', 17 | 'ssl_verify_identity': 'true', 18 | } 19 | 20 | # Using `mysqlclient` as the driver. 
21 | # drivername = 'mysql+mysqldb' 22 | # ssl_kwargs = { 23 | # 'ssl_mode': 'VERIFY_IDENTITY', 24 | # 'ssl': { 25 | # # Root certificate default path 26 | # # https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters/#root-certificate-default-path 27 | # 'ca': os.environ.get('TIDB_CA_PATH', '/path/to/ca.pem'), 28 | # }, 29 | # } 30 | 31 | engine = create_engine(URL.create( 32 | drivername=drivername, 33 | username=os.environ['TIDB_USERNAME'], 34 | password=os.environ['TIDB_PASSWORD'], 35 | host=os.environ['TIDB_HOST'], 36 | port=os.environ['TIDB_PORT'], 37 | database=os.environ['TIDB_DATABASE'], 38 | query=ssl_kwargs if os.environ.get('TIDB_SSL', 'false').lower() == 'true' else {}, 39 | )) 40 | 41 | 42 | # Step 2: Define a table with a vector column. 43 | Base = declarative_base() 44 | 45 | 46 | class Document(Base): 47 | __tablename__ = 'sqlalchemy_demo_documents' 48 | id = Column(Integer, primary_key=True) 49 | content = Column(Text) 50 | embedding = Column(VectorType(3)) 51 | 52 | 53 | # Or add HNSW index when creating table. 54 | class DocumentWithIndex(Base): 55 | __tablename__ = 'sqlalchemy_demo_documents_with_index' 56 | id = Column(Integer, primary_key=True) 57 | content = Column(Text) 58 | embedding = Column(VectorType(3)) 59 | 60 | 61 | Base.metadata.drop_all(engine) 62 | Base.metadata.create_all(engine) 63 | VectorAdaptor(engine).create_vector_index( 64 | DocumentWithIndex.embedding, 65 | DistanceMetric.COSINE, 66 | skip_existing=True, 67 | ) 68 | 69 | 70 | # Step 3: Insert embeddings into the table. 71 | with Session(engine) as session: 72 | session.add(Document(content="dog", embedding=[1, 2, 1])) 73 | session.add(Document(content="fish", embedding=[1, 2, 4])) 74 | session.add(Document(content="tree", embedding=[1, 0, 0])) 75 | session.commit() 76 | 77 | 78 | # Step 4: Get the 3-nearest neighbor documents. 
79 | print('Get 3-nearest neighbor documents:') 80 | with Session(engine) as session: 81 | distance = Document.embedding.cosine_distance([1, 2, 3]).label('distance') 82 | results = session.query(Document, distance).order_by(distance).limit(3).all() 83 | 84 | for doc, distance in results: 85 | print(f' - distance: {distance}\n' 86 | f' document: {doc.content}') 87 | 88 | # Step 5: Get documents within a certain distance. 89 | print('Get documents within a certain distance:') 90 | with (Session(engine) as session): 91 | distance = Document.embedding.cosine_distance([1, 2, 3]).label('distance') 92 | results = session.query( 93 | Document, distance 94 | ).filter(distance < 0.2).order_by(distance).limit(3).all() 95 | 96 | for doc, distance in results: 97 | print(f' - distance: {distance}\n' 98 | f' document: {doc.content}') 99 | -------------------------------------------------------------------------------- /examples/python-client-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | # A example database URL to connect to a TiDB cluster from macOS: 2 | # mysql+pymysql://.root:@gateway01..prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true 3 | TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 4 | -------------------------------------------------------------------------------- /examples/python-client-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # TiDB Vector Search Python Client Quickstart 2 | 3 | This is a simple demo to show how to use the TiDB Vector Search Python Client to search for similar text in a TiDB Serverless cluster. 
4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/python-client-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Get the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then replace the following placeholders to get the `TIDB_DATABASE_URL`. 35 | 36 | ```bash 37 | export TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 38 | ``` 39 | or create a `.env` file with the above environment variables. 40 | 41 | ### Run this example 42 | 43 | ```text 44 | $ python example.py 45 | Downloading and loading the embedding model... 46 | Search result ("a swimming animal"): 47 | - text: "fish", distance: 0.4562914811223072 48 | - text: "dog", distance: 0.6469335836410557 49 | - text: "tree", distance: 0.798545178640937 50 | ``` -------------------------------------------------------------------------------- /examples/python-client-quickstart/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tidb_vector.integrations import TiDBVectorClient 4 | from sentence_transformers import SentenceTransformer 5 | from dotenv import load_dotenv 6 | 7 | # Step 1. 
Initialize embedding model 8 | 9 | print("Downloading and loading the embedding model...") 10 | embed_model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5", trust_remote_code=True) 11 | embed_model_dims = embed_model.get_sentence_embedding_dimension() 12 | 13 | 14 | def text_to_embedding(text): 15 | """Generates vector embeddings for the given text.""" 16 | embedding = embed_model.encode(text) 17 | return embedding.tolist() 18 | 19 | 20 | # Step 2. Initialize TiDBVectorClient instance 21 | 22 | load_dotenv() 23 | 24 | vector_store = TiDBVectorClient( 25 | # The table which will store the vector data. 26 | table_name='embedded_documents', 27 | # The connection string to the TiDB cluster. 28 | # The connection string should be in the format of: 29 | # mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true 30 | connection_string=os.environ.get('TIDB_DATABASE_URL'), 31 | # The dimension of the vector generated by the embedding model. 32 | vector_dimension=embed_model_dims, 33 | # Determine whether to recreate the table if it already exists. 34 | drop_existing_table=True, 35 | ) 36 | 37 | # Step 3. 
Bulk insert objects and their embeddings 38 | 39 | documents = [ 40 | { 41 | "id": "f8e7dee2-63b6-42f1-8b60-2d46710c1971", 42 | "text": "dog", 43 | "embedding": text_to_embedding("dog"), 44 | "metadata": {"category": "animal"}, 45 | }, 46 | { 47 | "id": "8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6", 48 | "text": "fish", 49 | "embedding": text_to_embedding("fish"), 50 | "metadata": {"category": "animal"}, 51 | }, 52 | { 53 | "id": "e4991349-d00b-485c-a481-f61695f2b5ae", 54 | "text": "tree", 55 | "embedding": text_to_embedding("tree"), 56 | "metadata": {"category": "plant"}, 57 | }, 58 | ] 59 | 60 | vector_store.insert( 61 | ids=[doc["id"] for doc in documents], 62 | texts=[doc["text"] for doc in documents], 63 | embeddings=[doc["embedding"] for doc in documents], 64 | metadatas=[doc["metadata"] for doc in documents], 65 | ) 66 | 67 | # Step 4. Perform vector search to find the most semantically similar documents to the query. 68 | 69 | 70 | def print_result(query, result): 71 | print(f"Search result (\"{query}\"):") 72 | for r in result: 73 | print(f"- text: \"{r.document}\", distance: {r.distance}") 74 | 75 | 76 | query = "a swimming animal" 77 | query_embedding = text_to_embedding(query) 78 | search_result = vector_store.query(query_embedding, k=3) 79 | print_result(query, search_result) 80 | -------------------------------------------------------------------------------- /examples/python-client-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv==1.0.0 2 | PyMySQL==1.1.0 3 | sentence-transformers==3.0.1 4 | SQLAlchemy==2.0.30 5 | tidb-vector 6 | -------------------------------------------------------------------------------- /examples/semantic-cache/README.md: -------------------------------------------------------------------------------- 1 | # Semantic Cache with Jina AI and TiDB Vector 2 | Semantic cache is a cache that stores the semantic information of the data. 
It can be used to speed up the search process by storing the embeddings of the data and searching for similar embeddings. This example demonstrates how to use Jina AI to generate embeddings for text data and store the embeddings in TiDB Vector Storage. It also shows how to search for similar embeddings in TiDB Vector Storage. 3 | 4 | ## Prerequisites 5 | 6 | - A running TiDB Serverless cluster with vector search enabled 7 | - Python 3.8 or later 8 | - Jina AI API key 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/semantic-cache 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Get the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, and `DATABASE` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. Then set the following environment variables: 35 | 36 | ```bash 37 | export DATABASE_URI="mysql+pymysql://34u7xMnnDLSkjV1.root:@gateway01.eu-central-1.prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true" 38 | ``` 39 | or create a `.env` file with the above environment variables. 40 | 41 | 42 | ### Run this example 43 | 44 | 45 | #### Start the semantic cache server 46 | 47 | ```bash 48 | fastapi dev cache.py 49 | ``` 50 | 51 | #### Test the API 52 | 53 | Get the Jina AI API key from the [Jina AI Embedding API](https://jina.ai/embeddings/) page, and save it somewhere safe for later use. 
54 | 55 | `POST /set` 56 | 57 | ```bash 58 | curl --location ':8000/set' \ 59 | --header 'Content-Type: application/json' \ 60 | --header 'Authorization: Bearer ' \ 61 | --data '{ 62 | "key": "what is tidb", 63 | "value": "tidb is a mysql-compatible and htap database" 64 | }' 65 | ``` 66 | 67 | `GET /get/` 68 | 69 | ```bash 70 | curl --location ':8000/get/what%27s%20tidb%20and%20tikv?max_distance=0.5' \ 71 | --header 'Content-Type: application/json' \ 72 | --header 'Authorization: Bearer ' 73 | ``` -------------------------------------------------------------------------------- /examples/semantic-cache/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from typing import Optional, Annotated 4 | 5 | import requests 6 | import dotenv 7 | from fastapi import Depends, FastAPI 8 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 9 | from sqlmodel import ( 10 | SQLModel, 11 | Session, 12 | create_engine, 13 | select, 14 | Field, 15 | Column, 16 | String, 17 | Text, 18 | DateTime, 19 | ) 20 | from sqlalchemy import func 21 | from tidb_vector.sqlalchemy import VectorType 22 | dotenv.load_dotenv() 23 | 24 | 25 | # Configuration from .env 26 | # Example: "mysql+pymysql://:@:/?ssl_mode=VERIFY_IDENTITY&ssl_ca=/etc/ssl/cert.pem" 27 | DATABASE_URI = os.getenv('DATABASE_URI') 28 | # Ref: https://docs.pingcap.com/tidb/stable/time-to-live 29 | # Default: 604800 SECOND (1 week) 30 | TIME_TO_LIVE = os.getenv('TIME_TO_LIVE') 31 | 32 | 33 | # Get Embeddings from Jina AI 34 | def generate_embeddings(jinaai_api_key: str, text: str): 35 | JINAAI_API_URL = 'https://api.jina.ai/v1/embeddings' 36 | JINAAI_HEADERS = { 37 | 'Content-Type': 'application/json', 38 | 'Authorization': f'Bearer {jinaai_api_key}' 39 | } 40 | JINAAI_REQUEST_DATA = { 41 | 'input': [text], 42 | 'model': 'jina-embeddings-v2-base-en' # with dimisions 768 43 | } 44 | response = requests.post(JINAAI_API_URL, 
headers=JINAAI_HEADERS, json=JINAAI_REQUEST_DATA) 45 | return response.json()['data'][0]['embedding'] 46 | 47 | 48 | class Cache(SQLModel, table=True): 49 | __table_args__ = { 50 | # Ref: https://docs.pingcap.com/tidb/stable/time-to-live 51 | 'mysql_TTL': f'created_at + INTERVAL {TIME_TO_LIVE} SECOND', 52 | } 53 | 54 | id: Optional[int] = Field(default=None, primary_key=True) 55 | key: str = Field(sa_column=Column(String(255), unique=True, nullable=False)) 56 | key_vec: Optional[list[float]]= Field( 57 | sa_column=Column( 58 | VectorType(768), 59 | default=None, 60 | nullable=False, 61 | ) 62 | ) 63 | value: Optional[str] = Field(sa_column=Column(Text)) 64 | created_at: datetime = Field( 65 | sa_column=Column(DateTime, server_default=func.now(), nullable=False) 66 | ) 67 | updated_at: datetime = Field( 68 | sa_column=Column( 69 | DateTime, server_default=func.now(), onupdate=func.now(), nullable=False 70 | ) 71 | ) 72 | 73 | engine = create_engine(DATABASE_URI) 74 | SQLModel.metadata.create_all(engine) 75 | 76 | app = FastAPI() 77 | security = HTTPBearer() 78 | 79 | @app.get("/") 80 | def index(): 81 | return { 82 | "message": "Welcome to Semantic Cache API, it is built using Jina AI Embeddings API and TiDB Vector", 83 | "docs": "/docs", 84 | "redoc": "/redoc", 85 | "about": "https://github.com/pingcap/tidb-vector-python/blob/main/examples/semantic-cache/README.md", 86 | "config": { 87 | "TIME_TO_LIVE": int(TIME_TO_LIVE), 88 | "EMBEDDING_DIMENSIONS": 768, 89 | "EMBEDDING_PROVIDER": "Jina AI", 90 | "EMBEDDING_MODEL": "jina-embeddings-v2-base-en", 91 | } 92 | } 93 | 94 | 95 | # /set method of Semantic Cache 96 | @app.post("/set") 97 | def set( 98 | credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], 99 | cache: Cache, 100 | ): 101 | cache.key_vec = generate_embeddings(credentials.credentials, cache.key) 102 | 103 | with Session(engine) as session: 104 | session.add(cache) 105 | session.commit() 106 | 107 | return {'message': 'Cache has been set'} 
108 | 109 | 110 | @app.get("/get/{key}") 111 | def get( 112 | credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], 113 | key: str, 114 | max_distance: Optional[float] = 0.1, 115 | ): 116 | key_vec = generate_embeddings(credentials.credentials, key) 117 | # The max value of distance is 0.3 118 | max_distance = min(max_distance, 0.3) 119 | 120 | with Session(engine) as session: 121 | result = session.exec( 122 | select( 123 | Cache, 124 | Cache.key_vec.cosine_distance(key_vec).label('distance') 125 | ).order_by( 126 | 'distance' 127 | ).limit(1) 128 | ).first() 129 | 130 | if result is None: 131 | return {"message": "Cache not found"}, 404 132 | 133 | cache, distance = result 134 | if distance > max_distance: 135 | return {"message": "Cache not found"}, 404 136 | 137 | return { 138 | "key": cache.key, 139 | "value": cache.value, 140 | "distance": distance 141 | } -------------------------------------------------------------------------------- /examples/semantic-cache/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | PyMySQL 3 | sqlmodel==0.0.19 4 | tidb-vector>=0.0.9 5 | python-dotenv 6 | fastapi 7 | -------------------------------------------------------------------------------- /examples/static/images/tidbcloud-connect-parameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/static/images/tidbcloud-connect-parameters.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "tidb-vector" 3 | # this version is usless, now read the version from __init__.py 4 | version = "0.0.0" 5 | description = "A Python client for TiDB Vector" 6 | authors = ["IANTHEREAL "] 7 | license = 
"Apache-2.0"
readme = "README.md"
packages = [{include = "tidb_vector"}]

[tool.poetry-version-plugin]
source = "init"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
numpy = "^1"
SQLAlchemy = {version = ">=1.4,<3", optional = true}

[tool.poetry.extras]
client = ["SQLAlchemy"]

[tool.poetry.group.test.dependencies]
# The only dependencies that should be added are
# dependencies used for running tests.
# Any dependencies that do not meet that criteria will be removed.
pytest = "^7.3.0"
pytest-cov = "^4.0.0"
pytest-dotenv = "^0.5.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/__init__.py
-------------------------------------------------------------------------------- /tests/config.py: --------------------------------------------------------------------------------
import os


class TestConfig:
    """Connection settings for the TiDB instance used by the test suite.

    Every value is read from a ``TEST_TIDB_*`` environment variable, with
    defaults matching the CI service container (127.0.0.1:4000, root, no
    password, no SSL).
    """

    TIDB_HOST = os.getenv("TEST_TIDB_HOST", "127.0.0.1")
    TIDB_USER = os.getenv("TEST_TIDB_USER", "root")
    TIDB_PASSWORD = os.getenv("TEST_TIDB_PASSWORD", "")
    TIDB_PORT = int(os.getenv("TEST_TIDB_PORT", "4000"))
    # "true"/"1" (case-insensitive) enable SSL; any other value disables it.
    TIDB_SSL = os.getenv("TEST_TIDB_SSL", "false").lower() in ["true", "1"]
-------------------------------------------------------------------------------- /tests/integrations/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/integrations/__init__.py
--------------------------------------------------------------------------------
/tests/integrations/test_utils.py: -------------------------------------------------------------------------------- 1 | """Test TiDB Vector Search functionality.""" 2 | from __future__ import annotations 3 | 4 | from tidb_vector.integrations.utils import extract_info_from_column_definition 5 | 6 | 7 | def test_extract_info_from_column_definition(): 8 | # Test case with dimension and distance metric 9 | column_type = "VECTOR(128)" 10 | column_comment = "hnsw(distance=cosine)" 11 | expected_result = (128, "cosine") 12 | assert ( 13 | extract_info_from_column_definition(column_type, column_comment) 14 | == expected_result 15 | ) 16 | 17 | # Test case with dimension but no distance metric 18 | column_type = "VECTOR(256)" 19 | column_comment = "some comment" 20 | expected_result = (256, None) 21 | assert ( 22 | extract_info_from_column_definition(column_type, column_comment) 23 | == expected_result 24 | ) 25 | 26 | # Test case with no dimension and no distance metric 27 | column_type = "VECTOR" 28 | column_comment = "another comment" 29 | expected_result = (None, None) 30 | assert ( 31 | extract_info_from_column_definition(column_type, column_comment) 32 | == expected_result 33 | ) 34 | 35 | # Test case with no dimension and no comment 36 | column_type = "VECTOR" 37 | column_comment = "" 38 | expected_result = (None, None) 39 | assert ( 40 | extract_info_from_column_definition(column_type, column_comment) 41 | == expected_result 42 | ) 43 | 44 | # Test case with dimension but no comment 45 | column_type = "VECTOR(256)" 46 | column_comment = "" 47 | expected_result = (256, None) 48 | assert ( 49 | extract_info_from_column_definition(column_type, column_comment) 50 | == expected_result 51 | ) 52 | 53 | # Test case without index type 54 | column_type = "VECTOR" 55 | column_comment = "distance=l2" 56 | expected_result = (None, "l2") 57 | assert ( 58 | extract_info_from_column_definition(column_type, column_comment) 59 | == expected_result 60 | ) 61 | 62 | # Test case with 
addition comment content 63 | column_type = "VECTOR(128)" 64 | column_comment = "test, hnsw(distance=l2)" 65 | expected_result = (128, "l2") 66 | assert ( 67 | extract_info_from_column_definition(column_type, column_comment) 68 | == expected_result 69 | ) 70 | -------------------------------------------------------------------------------- /tests/peewee/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/peewee/__init__.py -------------------------------------------------------------------------------- /tests/sqlalchemy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/sqlalchemy/__init__.py -------------------------------------------------------------------------------- /tidb_vector/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import MAX_DIM, MIN_DIM, DistanceMetric, VectorDataType 2 | 3 | __version__ = "0.0.14" 4 | __all__ = ["MAX_DIM", "MIN_DIM", "DistanceMetric", "VectorDataType"] 5 | -------------------------------------------------------------------------------- /tidb_vector/constants.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import typing 3 | 4 | import numpy 5 | 6 | # TiDB Vector has a limitation on the dimension length 7 | MAX_DIM = 16000 8 | MIN_DIM = 1 9 | 10 | 11 | VectorDataType = typing.Union[numpy.ndarray, typing.List[float]] 12 | 13 | 14 | class DistanceMetric(enum.Enum): 15 | """ 16 | An enumeration representing different types of distance metrics. 17 | 18 | - `DistanceMetric.L2`: L2 (Euclidean) distance metric. 19 | - `DistanceMetric.COSINE`: Cosine distance metric. 
20 | """ 21 | 22 | L2 = "L2" 23 | COSINE = "COSINE" 24 | 25 | def to_sql_func(self): 26 | """ 27 | Converts the DistanceMetric to its corresponding SQL function name. 28 | 29 | Returns: 30 | str: The SQL function name. 31 | 32 | Raises: 33 | ValueError: If the DistanceMetric enum member is not supported. 34 | """ 35 | if self == DistanceMetric.L2: 36 | return "VEC_L2_DISTANCE" 37 | elif self == DistanceMetric.COSINE: 38 | return "VEC_COSINE_DISTANCE" 39 | else: 40 | raise ValueError("unsupported distance metric") 41 | -------------------------------------------------------------------------------- /tidb_vector/integrations/__init__.py: -------------------------------------------------------------------------------- 1 | from tidb_vector.integrations.vector_client import TiDBVectorClient 2 | from tidb_vector.integrations.utils import ( 3 | EmbeddingColumnMismatchError, 4 | check_table_existence, 5 | get_embedding_column_definition, 6 | ) 7 | 8 | __all__ = [ 9 | "TiDBVectorClient", 10 | "EmbeddingColumnMismatchError", 11 | "check_table_existence", 12 | "get_embedding_column_definition", 13 | ] 14 | -------------------------------------------------------------------------------- /tidb_vector/integrations/utils.py: -------------------------------------------------------------------------------- 1 | import sqlalchemy 2 | import re 3 | from typing import Any, Dict, Optional 4 | 5 | 6 | class EmbeddingColumnMismatchError(ValueError): 7 | """ 8 | Exception raised when the existing embedding column does not match the expected dimension. 9 | 10 | Attributes: 11 | existing_col (str): The definition of the existing embedding column. 12 | expected_col (str): The definition of the expected embedding column. 13 | """ 14 | 15 | def __init__(self, existing_col, expected_col): 16 | self.existing_col = existing_col 17 | self.expected_col = expected_col 18 | super().__init__( 19 | f"The existing embedding column ({existing_col}) does not match the expected dimension ({expected_col})." 
20 | ) 21 | 22 | 23 | def check_table_existence( 24 | connection_string: str, 25 | table_name: str, 26 | engine_args: Optional[Dict[str, Any]] = None, 27 | ) -> bool: 28 | """ 29 | Check if the vector table exists in the database 30 | 31 | Args: 32 | connection_string (str): The connection string for the database. 33 | table_name (str): The name of the table to check. 34 | engine_args (Optional[Dict[str, Any]]): Additional arguments for the engine. 35 | 36 | Returns: 37 | bool: True if the table exists, False otherwise. 38 | """ 39 | engine = sqlalchemy.create_engine(connection_string, **(engine_args or {})) 40 | try: 41 | inspector = sqlalchemy.inspect(engine) 42 | return table_name in inspector.get_table_names() 43 | finally: 44 | engine.dispose() 45 | 46 | 47 | def get_embedding_column_definition( 48 | connection_string: str, 49 | table_name: str, 50 | column_name: str, 51 | engine_args: Optional[Dict[str, Any]] = None, 52 | ): 53 | """ 54 | Retrieves the column definition of an embedding column from a database table. 55 | 56 | Args: 57 | connection_string (str): The connection string to the database. 58 | table_name (str): The name of the table. 59 | column_name (str): The name of the column. 60 | engine_args (Optional[Dict[str, Any]]): Additional arguments for the engine. 61 | 62 | Returns: 63 | tuple: A tuple containing the dimension (int or None) and distance metric (str or None). 
64 | """ 65 | engine = sqlalchemy.create_engine(connection_string, **(engine_args or {})) 66 | try: 67 | with engine.connect() as connection: 68 | query = f"""SELECT COLUMN_TYPE, COLUMN_COMMENT 69 | FROM INFORMATION_SCHEMA.COLUMNS 70 | WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{column_name}'""" 71 | result = connection.execute(sqlalchemy.text(query)).fetchone() 72 | if result: 73 | return extract_info_from_column_definition(result[0], result[1]) 74 | finally: 75 | engine.dispose() 76 | 77 | return None, None 78 | 79 | 80 | def extract_info_from_column_definition(column_type, column_comment): 81 | """ 82 | Extracts the dimension and distance metric from a column definition, 83 | supporting both optional dimension and optional comment. 84 | 85 | Args: 86 | column_type (str): The column definition, possibly including dimension and a comment. 87 | 88 | Returns: 89 | tuple: A tuple containing the dimension (int or None) and the distance metric (str or None). 90 | """ 91 | # Try to extract the dimension, which is optional. 92 | dimension_match = re.search(r"VECTOR(?:\((\d+)\))?", column_type, re.IGNORECASE) 93 | dimension = ( 94 | int(dimension_match.group(1)) 95 | if dimension_match and dimension_match.group(1) 96 | else None 97 | ) 98 | 99 | # Extracting index type and distance metric from the comment, supporting both single and double quotes. 
100 | distance_match = re.search(r"distance=([^,\)]+)", column_comment) 101 | distance = distance_match.group(1) if distance_match else None 102 | 103 | return dimension, distance 104 | -------------------------------------------------------------------------------- /tidb_vector/peewee/__init__.py: -------------------------------------------------------------------------------- 1 | from .vector_type import VectorField 2 | from .adaptor import VectorAdaptor 3 | 4 | __all__ = ["VectorField", "VectorAdaptor"] 5 | -------------------------------------------------------------------------------- /tidb_vector/peewee/adaptor.py: -------------------------------------------------------------------------------- 1 | import peewee 2 | import tidb_vector 3 | from .vector_type import VectorField 4 | 5 | 6 | class VectorAdaptor: 7 | """ 8 | A wrapper over existing Peewee Database to provide additional vector search capabilities. 9 | """ 10 | 11 | engine: peewee.Database 12 | 13 | def __init__(self, engine: peewee.Database): 14 | self.engine = engine 15 | 16 | def _check_vector_column(self, field: VectorField): 17 | if not isinstance(field, VectorField): 18 | raise ValueError("Not a vector field") 19 | 20 | def has_vector_index(self, field: VectorField) -> bool: 21 | """ 22 | Check if the index for the vector column exists. 
        """

        self._check_vector_column(field)

        table_name = field.model._meta.table_name

        # TODO: Better quote
        cursor: peewee.CursorWrapper = self.engine.execute_sql(
            f"SHOW INDEX FROM `{table_name}`"
        )
        # Locate the "Column_name" column in the SHOW INDEX result set by header name.
        column_name_idx = None
        for idx, column in enumerate(cursor.description):
            if column[0].lower() == "column_name":
                column_name_idx = idx
                break
        if column_name_idx is None:
            raise ValueError("Failed to parse SHOW INDEX result")

        # NOTE(review): any index covering this column counts as "has an index",
        # not only vector indexes — confirm this is the intended semantics.
        for row in cursor:
            column_name = row[column_name_idx]
            if column_name.lower() == field.name.lower():
                return True

        return False

    def create_vector_index(
        self,
        field: VectorField,
        distance_metric: tidb_vector.DistanceMetric,
        skip_existing: bool = False,
    ):
        """
        Create vector index for the vector column.

        Parameters
        ----------
        field : peewee.Field
            The field for which the vector index is to be created.

        distance_metric : tidb_vector.DistanceMetric
            The distance metric to be used for the vector index.
            Available values are:
            - tidb_vector.DistanceMetric.L2
            - tidb_vector.DistanceMetric.COSINE

        skip_existing : bool
            If True, skips creating the index if it already exists. Default is False.

        Raises
        ------
        ValueError
            If the vector field does not have a fixed dimension.

        ValueError
            If the field is not a vector field.

        Note
        ----
        If you want to use the high-availability columnar storage feature, use raw SQL instead.

        """

        self._check_vector_column(field)

        if field.dimensions is None:
            raise ValueError(
                "Vector index is only supported for fixed dimension vectors"
            )

        if skip_existing:
            if self.has_vector_index(field):
                # TODO: Currently there is no easy way to verify whether the distance
                # metric is correct. We should check it and throw error if distance metric is not matching
                return

        table_name = field.model._meta.table_name
        column_name = field.name
        index_name = f"vec_idx_{field.name}"

        # Ordering matters: the TiFlash replica is set up before ADD VECTOR INDEX.
        self.engine.execute_sql(f"ALTER TABLE `{table_name}` SET TIFLASH REPLICA 1")
        self.engine.execute_sql(
            f"""
            ALTER TABLE `{table_name}`
            ADD VECTOR INDEX `{index_name}` (({distance_metric.to_sql_func()}(`{column_name}`)))
            """
        )
-------------------------------------------------------------------------------- /tidb_vector/peewee/vector_type.py: --------------------------------------------------------------------------------
from typing import Optional
from peewee import Field, fn

from tidb_vector.utils import decode_vector, encode_vector


class VectorField(Field):
    """Peewee field type for TiDB VECTOR columns, with distance-function helpers."""

    # Number of dimensions; None means the column accepts vectors of any length.
    dimensions: Optional[int]

    field_type = "VECTOR"

    def __init__(self, dimensions: Optional[int] = None, *args, **kwargs):
        self.dimensions = dimensions
        super(VectorField, self).__init__(*args, **kwargs)

    def get_modifiers(self):
        # Renders the DDL as VECTOR(<dimensions>) when set, plain VECTOR otherwise.
        return self.dimensions and [self.dimensions] or None

    def db_value(self, value):
        # Serialize to TiDB's textual vector form for storage.
        return encode_vector(value)

    def python_value(self, value):
        # Parse the stored textual form back into an array.
        return decode_vector(value)

    def l1_distance(self, vector):
        return fn.VEC_L1_DISTANCE(self, self.to_value(vector))

    def l2_distance(self, vector):
        return fn.VEC_L2_DISTANCE(self, self.to_value(vector))

    def cosine_distance(self, vector):
        return fn.VEC_COSINE_DISTANCE(self, self.to_value(vector))

    def negative_inner_product(self, vector):
        return fn.VEC_NEGATIVE_INNER_PRODUCT(self, self.to_value(vector))
-------------------------------------------------------------------------------- /tidb_vector/sqlalchemy/__init__.py: --------------------------------------------------------------------------------
from .vector_type import
VectorType
from .adaptor import VectorAdaptor

__all__ = ["VectorType", "VectorAdaptor"]
-------------------------------------------------------------------------------- /tidb_vector/sqlalchemy/adaptor.py: --------------------------------------------------------------------------------
import sqlalchemy
import tidb_vector
from .vector_type import VectorType


class VectorAdaptor:
    """
    A wrapper over existing SQLAlchemy engine to provide additional vector search capabilities.
    """

    engine: sqlalchemy.Engine

    def __init__(self, engine: sqlalchemy.Engine):
        self.engine = engine

    def _check_vector_column(self, column: sqlalchemy.Column):
        if not isinstance(column.type, VectorType):
            raise ValueError("Not a vector column")

    def has_vector_index(self, column: sqlalchemy.Column) -> bool:
        """
        Check if the index for the vector column exists.
        """

        self._check_vector_column(column)

        with self.engine.begin() as conn:
            table_name = conn.dialect.identifier_preparer.format_table(column.table)
            query = sqlalchemy.text(f"SHOW INDEX FROM {table_name}")
            result = conn.execute(query)
            result_dict = result.mappings().all()
            # NOTE(review): any index covering this column counts, regardless of
            # index kind — confirm this is the intended semantics.
            for row in result_dict:
                if row["Column_name"].lower() == column.name.lower():
                    return True
        return False

    def create_vector_index(
        self,
        column: sqlalchemy.Column,
        distance_metric: tidb_vector.DistanceMetric,
        skip_existing: bool = False,
    ):
        """
        Create vector index for the vector column.

        Parameters
        ----------
        column : sqlalchemy.Column
            The column for which the vector index is to be created.

        distance_metric : tidb_vector.DistanceMetric
            The distance metric to be used for the vector index.
            Available values are:
            - tidb_vector.DistanceMetric.L2
            - tidb_vector.DistanceMetric.COSINE

        skip_existing : bool
            If True, skips creating the index if it already exists. Default is False.

        Raises
        ------
        ValueError
            If the vector column does not have a fixed dimension.

        ValueError
            If the column is not a vector column.

        Note
        ----
        If you want to use the high-availability columnar storage feature, use raw SQL instead.

        """

        self._check_vector_column(column)

        if column.type.dim is None:
            raise ValueError(
                "Vector index is only supported for fixed dimension vectors"
            )

        if skip_existing:
            if self.has_vector_index(column):
                # TODO: Currently there is no easy way to verify whether the distance
                # metric is correct. We should check it and throw error if distance metric is not matching
                return

        with self.engine.begin() as conn:
            table_name = conn.dialect.identifier_preparer.format_table(column.table)
            column_name = conn.dialect.identifier_preparer.format_column(column)
            index_name = conn.dialect.identifier_preparer.quote(
                f"vec_idx_{column.name}"
            )

            # Ordering matters: the TiFlash replica is set up before ADD VECTOR INDEX.
            query = sqlalchemy.text(f"ALTER TABLE {table_name} SET TIFLASH REPLICA 1")
            conn.execute(query)

            query = sqlalchemy.text(
                f"""
                ALTER TABLE {table_name}
                ADD VECTOR INDEX {index_name} (({distance_metric.to_sql_func()}({column_name})))
                """
            )
            conn.execute(query)
-------------------------------------------------------------------------------- /tidb_vector/sqlalchemy/vector_type.py: --------------------------------------------------------------------------------
from typing import Optional
import sqlalchemy
import tidb_vector
import tidb_vector.utils


class VectorType(sqlalchemy.types.UserDefinedType):
    """
    Represents a vector column type in TiDB.
    """

    # Number of dimensions; None means the column accepts vectors of any length.
    dim: Optional[int]

    cache_ok = True

    def __init__(self, dim: Optional[int] = None):
        if dim is not None and not isinstance(dim, int):
            raise ValueError("expected dimension to be an integer or None")

        # TiDB vector dimension length has limitations
        if dim is not None and (dim < tidb_vector.MIN_DIM or dim > tidb_vector.MAX_DIM):
            raise ValueError(
                f"expected dimension to be in [{tidb_vector.MIN_DIM}, {tidb_vector.MAX_DIM}]"
            )

        super(sqlalchemy.types.UserDefinedType, self).__init__()
        self.dim = dim

    def get_col_spec(self, **kw):
        """
        Returns the column specification for the vector column.

        If the dimension is not specified, it returns "VECTOR".
        Otherwise, it returns "VECTOR(<dim>)".

        :param kw: Additional keyword arguments.
        :return: The column specification string.
        """

        if self.dim is None:
            return "VECTOR"
        return f"VECTOR({self.dim})"

    def bind_processor(self, dialect):
        """Convert the vector float array to a string representation suitable for binding to a database column."""

        def process(value):
            return tidb_vector.utils.encode_vector(value, self.dim)

        return process

    def result_processor(self, dialect, coltype):
        """Convert the vector data from the database into vector array."""

        def process(value):
            return tidb_vector.utils.decode_vector(value)

        return process

    class comparator_factory(sqlalchemy.types.UserDefinedType.Comparator):
        """Returns a comparator factory that provides the distance functions."""

        def l1_distance(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_L1_DISTANCE(self, formatted_other).label(
                "l1_distance"
            )

        def l2_distance(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_L2_DISTANCE(self, formatted_other).label(
                "l2_distance"
            )

        def cosine_distance(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_COSINE_DISTANCE(self, formatted_other).label(
                "cosine_distance"
            )

        def negative_inner_product(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_NEGATIVE_INNER_PRODUCT(
                self, formatted_other
            ).label("negative_inner_product")
-------------------------------------------------------------------------------- /tidb_vector/utils.py: --------------------------------------------------------------------------------
import numpy as np
import tidb_vector


def encode_vector(value: tidb_vector.VectorDataType, dim=None):
    """Serialize a vector into TiDB's textual "[v1,v2,...]" form; None passes through.

    Raises ValueError when *dim* is given and the length differs, or when a
    numpy array is not 1-dimensional.
    """
    if value is None:
        return value

    if dim is not None and len(value) != dim:
        raise ValueError(f"expected {dim} dimensions, but got {len(value)}")

    if isinstance(value, np.ndarray):
        if value.ndim != 1:
            raise ValueError("expected ndim to be 1")
        return f"[{','.join(map(str, value))}]"

    # Lists already render as "[v1, v2, ...]" via str().
    return str(value)


def decode_vector(value: str) -> np.ndarray:
    """Parse TiDB's textual "[v1,v2,...]" form into a float32 numpy array; None passes through."""
    if value is None:
        return value

    if value == "[]":
        return np.array([], dtype=np.float32)

    # Strip the surrounding brackets and split on commas.
    return np.array(value[1:-1].split(","), dtype=np.float32)
-------------------------------------------------------------------------------- /tox.ini: --------------------------------------------------------------------------------
[tox]
alwayscopy=true
envlist = py311,py310,py39,py38,lint

[gh-actions]
python =
    3.8: py38
    3.9: py39
    3.10: py310
    3.11: py311

[testenv]
passenv = *
deps =
    pytest
    peewee
    sqlalchemy
    pymysql
commands =
    pytest tests
setenv =
    LANG = en_US.utf-8

# Lint environment: flake8 plus black in check-only (non-mutating) mode.
[testenv:lint]
skip_install = True
allowlist_externals = bash
deps =
    flake8==6.0.0
    black==23.7.0
commands =
    bash -c "flake8 --max-line-length 130 tidb_vector tests"
    bash -c "black --diff --check tidb_vector tests"
--------------------------------------------------------------------------------