├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── examples ├── README.md ├── dspy-demo │ ├── .env.example │ ├── README.md │ ├── example.py │ ├── requirements.txt │ ├── sample_data.txt │ ├── test.py │ └── utils.py ├── gemini-ai-embeddings-demo │ ├── README.md │ ├── example.ipynb │ ├── example.py │ └── requirements.txt ├── graphrag-demo │ ├── README.md │ ├── graphrag-demo.py │ ├── init.sql │ └── requirements.txt ├── graphrag-step-by-step-tutorial │ ├── README.md │ └── example.ipynb ├── image_search │ ├── README.md │ └── example.ipynb ├── jina-ai-embeddings-demo │ ├── README.md │ ├── jina-ai-embeddings-demo.py │ └── requirements.txt ├── langchain-agent-demo │ ├── .env.example │ ├── README.md │ ├── __init__.py │ ├── example.py │ ├── knowledge_base.py │ ├── requirements.txt │ ├── sample_data.txt │ └── utils.py ├── llamaindex-tidb-vector-with-ui │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── templates │ │ └── index.html ├── llamaindex-tidb-vector │ ├── README.md │ ├── chat_with_url.py │ └── requirements.txt ├── openai_embedding │ ├── README.md │ ├── example.ipynb │ ├── example.py │ └── requirements.txt ├── orm-django-quickstart │ ├── .env.example │ ├── .gitignore │ ├── README.md │ ├── manage.py │ ├── requirements.txt │ └── sample_project │ │ ├── __init__.py │ │ ├── asgi.py │ │ ├── forms.py │ │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ │ ├── models.py │ │ ├── settings.py │ │ ├── urls.py │ │ ├── views.py │ │ └── wsgi.py ├── orm-peewee-quickstart │ ├── .env.example │ ├── README.md │ ├── peewee-quickstart.py │ └── requirements.txt ├── orm-sqlalchemy-quickstart │ ├── .env.example │ ├── README.md │ ├── requirements.txt │ └── sqlalchemy-quickstart.py ├── python-client-quickstart │ ├── .env.example │ ├── README.md │ ├── example.py │ └── requirements.txt ├── semantic-cache │ ├── README.md │ ├── cache.py │ └── requirements.txt └── static │ └── images │ └── 
tidbcloud-connect-parameters.png ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── config.py ├── integrations │ ├── __init__.py │ ├── test_utils.py │ └── test_vector_client.py ├── peewee │ ├── __init__.py │ └── test_peewee.py └── sqlalchemy │ ├── __init__.py │ └── test_sqlalchemy.py ├── tidb_vector ├── __init__.py ├── constants.py ├── integrations │ ├── __init__.py │ ├── utils.py │ └── vector_client.py ├── peewee │ ├── __init__.py │ ├── adaptor.py │ └── vector_type.py ├── sqlalchemy │ ├── __init__.py │ ├── adaptor.py │ └── vector_type.py └── utils.py └── tox.ini /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | push: 4 | branches: 5 | - main 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | lint: 13 | name: lint 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v3 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | python -m pip install tox 23 | 24 | - name: Run lint 25 | run: | 26 | tox -e lint 27 | 28 | tests: 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | python-version: 33 | - "3.12" 34 | name: py${{ matrix.python-version }}_test 35 | runs-on: ubuntu-latest 36 | services: 37 | tidb: 38 | image: wangdi4zm/tind:v8.4.0-vector-index 39 | ports: 40 | - 4000:4000 41 | steps: 42 | - name: Checkout 43 | uses: actions/checkout@v3 44 | 45 | - name: Setup Python 46 | uses: actions/setup-python@v4 47 | with: 48 | python-version: ${{ matrix.python-version }} 49 | 50 | - name: Install dependencies 51 | run: | 52 | python -m pip install --upgrade pip 53 | python -m pip install tox tox-gh-actions 54 | sudo apt-get update 55 | sudo apt-get install -y libmemcached-dev zlib1g-dev 56 | 57 | - name: Run tests 58 | run: tox 59 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | .idea/ 141 | django_tests_dir 142 | 143 | *.swp 144 | 145 | .vscode/ 146 | 147 | .DS_Store 148 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | default_language_version: 3 | python: python3 4 | 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.5.0 8 | hooks: 9 | - id: check-byte-order-marker 10 | - id: check-merge-conflict 11 | - id: check-symlinks 12 | - id: check-toml 13 | - id: check-yaml 14 | - id: detect-private-key 15 | - id: end-of-file-fixer 16 | - id: mixed-line-ending 17 | - id: trailing-whitespace 18 | - repo: https://github.com/charliermarsh/ruff-pre-commit 19 | rev: v0.1.5 20 | hooks: 21 | - id: ruff 22 | args: [--fix, --exit-non-zero-on-fix] 23 | - repo: https://github.com/psf/black-pre-commit-mirror 24 | rev: 23.10.1 25 | hooks: 26 | - id: black-jupyter 27 | name: black-src 28 | alias: 
black 29 | - repo: https://github.com/pre-commit/mirrors-mypy 30 | rev: v1.0.1 31 | hooks: 32 | - id: mypy 33 | additional_dependencies: 34 | [ 35 | "types-requests", 36 | "types-Deprecated", 37 | "types-redis", 38 | "types-setuptools", 39 | "types-PyYAML", 40 | "types-protobuf==4.24.0.4", 41 | ] 42 | - repo: https://github.com/psf/black-pre-commit-mirror 43 | rev: 23.10.1 44 | hooks: 45 | - id: black-jupyter 46 | name: black-docs-py 47 | alias: black 48 | files: docs/ 49 | # Using PEP 8's line length in docs prevents excess left/right scrolling 50 | args: [--line-length=79] 51 | - repo: https://github.com/adamchainz/blacken-docs 52 | rev: 1.16.0 53 | hooks: 54 | - id: blacken-docs 55 | name: black-docs-text 56 | alias: black 57 | types_or: [rst, markdown, tex] 58 | additional_dependencies: [black==23.10.1] 59 | # Using PEP 8's line length in docs prevents excess left/right scrolling 60 | args: [--line-length=79] 61 | - repo: https://github.com/pre-commit/mirrors-prettier 62 | rev: v3.0.3 63 | hooks: 64 | - id: prettier 65 | - repo: https://github.com/codespell-project/codespell 66 | rev: v2.2.6 67 | hooks: 68 | - id: codespell 69 | additional_dependencies: [tomli] 70 | args: ["--ignore-words-list", "nin"] 71 | - repo: https://github.com/srstevenson/nb-clean 72 | rev: 3.1.0 73 | hooks: 74 | - id: nb-clean 75 | args: [--preserve-cell-outputs, --remove-empty-cells] 76 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | ## Contributing Guidelines 4 | 5 | As TiDB Serverless introduced the Vector Search feature that enables users to access vector data via SQL, we are also starting to build the ORM, SDK or libraries for the users to interact with TiDB Serverless and vector data, such as the Python SDK for TiDB Serverless itself, or new column support for traditional ORM like SQLAlchemy, Django ORM, etc. 
6 | 7 | Here we call for contributions to enhance the ecosystem of TiDB Serverless and vector data. You can contribute to the following areas: 8 | 9 | 10 | 11 | ### Software Prerequisites for Development 12 | * [Python](https://www.python.org/downloads/) 13 | * [TiDB Serverless](https://pingcap.com/ai) for testing the SDK or libraries 14 | * [Visual Studio Code](https://code.visualstudio.com/) or any other code editor 15 | 16 | 17 | ### Components of the Project 18 | 19 | #### Python SDK for TiDB Serverless 20 | 21 | This repo `pingcap/tidb-vector-python` is the Python SDK for TiDB Serverless. You can contribute to this repo by adding new features, fixing bugs, or improving the performance of the SDK. 22 | 23 | 24 | #### Example or Tutorials 25 | 26 | In this repo, there is a directory [examples](https://github.com/pingcap/tidb-vector-python/) that contains examples and tutorials for using TiDB Serverless and vector data. You can contribute to this directory by adding new examples or tutorials. 27 | 28 | Currently, we are looking for the following types of examples or tutorials: 29 | 30 | * Tutorials that enable users to use TiDB Serverless and vector data in different business scenarios, such as suggestion, recommendation system, etc. 31 | * Examples that demonstrate how to use TiDB Serverless and other tools or libraries, such as Dify, Jina AI, Anthropic AI, etc. 32 | * Notebooks that show how to use TiDB Serverless and vector data in different machine learning or deep learning tasks. 33 | 34 | Not limited to the above types, you can also contribute other types of examples or tutorials that you think are helpful for the users. 35 | 36 | 37 | ## Maintainers 38 | 39 | Please feel free to reach out to the maintainers if you have any questions or need help with the project. 
40 | 41 | * [wd0517](https://github.com/wd0517) 42 | * [634750802](https://github.com/634750802) 43 | * [Mini256](https://github.com/Mini256) 44 | * [IANTHEREAL](https://github.com/IANTHEREAL) 45 | * [Cheese](https://github.com/Icemap) 46 | 47 | ## Discussion 48 | 49 | If you have any questions or suggestions, please feel free to open a discussion in the [Discussions](https://github.com/pingcap/tidb-vector-python/) 50 | 51 | or contact us via [@TiDB_Developer](https://twitter.com/TiDB_Developer) on Twitter. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | GIT_ROOT ?= $(shell git rev-parse --show-toplevel) 2 | 3 | format: ## Run code autoformatters (black). 4 | pre-commit install 5 | pre-commit run black --all-files 6 | 7 | lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy 8 | tox -e lint 9 | 10 | test: 11 | tox 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tidb-vector-python 2 | 3 | Use TiDB Vector Search with Python. 4 | 5 | ## Usage 6 | 7 | TiDB is a SQL database so that this package introduces Vector Search capability for Python ORMs: 8 | 9 | - [#SQLAlchemy](#sqlalchemy) 10 | - [#Peewee](#peewee) 11 | - [#Django](#django) 12 | 13 | Pick one that you are familiar with to get started. If you are not using any of them, we recommend [#SQLAlchemy](#sqlalchemy). 
14 | 15 | We also provide a Vector Search client for simple usage: 16 | 17 | - [#TiDB Vector Client](#tidb-vector-client) 18 | 19 | ### SQLAlchemy 20 | 21 | Install: 22 | 23 | ```bash 24 | pip install tidb-vector sqlalchemy pymysql 25 | ``` 26 | 27 | Usage: 28 | 29 | ```python 30 | from sqlalchemy import Integer, Column 31 | from sqlalchemy import create_engine, select 32 | from sqlalchemy.dialects.mysql import LONGTEXT 33 | from sqlalchemy.orm import Session, declarative_base 34 | 35 | import tidb_vector 36 | from tidb_vector.sqlalchemy import VectorType, VectorAdaptor 37 | 38 | engine = create_engine("mysql+pymysql://root@127.0.0.1:4000/test") 39 | Base = declarative_base() 40 | 41 | 42 | # Define table schema 43 | class Doc(Base): 44 | __tablename__ = "doc" 45 | id = Column(Integer, primary_key=True) 46 | embedding = Column(VectorType(dim=3)) 47 | content = Column(LONGTEXT) 48 | 49 | 50 | # Create empty table 51 | Base.metadata.drop_all(engine) # clean data from last run 52 | Base.metadata.create_all(engine) 53 | 54 | # Create index for L2 distance 55 | VectorAdaptor(engine).create_vector_index( 56 | Doc.embedding, tidb_vector.DistanceMetric.L2, skip_existing=True 57 | # For cosine distance, use tidb_vector.DistanceMetric.COSINE 58 | ) 59 | 60 | # Insert content with vectors 61 | with Session(engine) as session: 62 | session.add(Doc(id=1, content="dog", embedding=[1, 2, 1])) 63 | session.add(Doc(id=2, content="fish", embedding=[1, 2, 4])) 64 | session.add(Doc(id=3, content="tree", embedding=[1, 0, 0])) 65 | session.commit() 66 | 67 | # Perform Vector Search for Top K=1 68 | with Session(engine) as session: 69 | results = session.execute( 70 | select(Doc.id, Doc.content) 71 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 72 | # For cosine distance, use Doc.embedding.cosine_distance(...) 
73 | .limit(1) 74 | ).all() 75 | print(results) 76 | 77 | # Perform filtered Vector Search by adding a Where Clause: 78 | with Session(engine) as session: 79 | results = session.execute( 80 | select(Doc.id, Doc.content) 81 | .where(Doc.content == "dog") 82 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 83 | .limit(1) 84 | ).all() 85 | print(results) 86 | ``` 87 | 88 | ### Peewee 89 | 90 | Install: 91 | 92 | ```bash 93 | pip install tidb-vector peewee pymysql 94 | ``` 95 | 96 | Usage: 97 | 98 | ```python 99 | import tidb_vector 100 | from peewee import Model, MySQLDatabase, IntegerField, TextField 101 | from tidb_vector.peewee import VectorField, VectorAdaptor 102 | 103 | db = MySQLDatabase( 104 | database="test", 105 | user="root", 106 | password="", 107 | host="127.0.0.1", 108 | port=4000, 109 | ) 110 | 111 | 112 | # Define table schema 113 | class Doc(Model): 114 | class Meta: 115 | database = db 116 | table_name = "peewee_test" 117 | 118 | id = IntegerField(primary_key=True) 119 | embedding = VectorField(3) 120 | content = TextField() 121 | 122 | 123 | # Create empty table and index for L2 distance 124 | db.drop_tables([Doc]) # clean data from last run 125 | db.create_tables([Doc]) 126 | # For cosine distance, use tidb_vector.DistanceMetric.COSINE 127 | VectorAdaptor(db).create_vector_index(Doc.embedding, tidb_vector.DistanceMetric.L2) 128 | 129 | # Insert content with vectors 130 | Doc.insert_many( 131 | [ 132 | {"id": 1, "content": "dog", "embedding": [1, 2, 1]}, 133 | {"id": 2, "content": "fish", "embedding": [1, 2, 4]}, 134 | {"id": 3, "content": "tree", "embedding": [1, 0, 0]}, 135 | ] 136 | ).execute() 137 | 138 | # Perform Vector Search for Top K=1 139 | cursor = ( 140 | Doc.select(Doc.id, Doc.content) 141 | # For cosine distance, use Doc.embedding.cosine_distance(...) 
142 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 143 | .limit(1) 144 | ) 145 | for row in cursor: 146 | print(row.id, row.content) 147 | 148 | 149 | # Perform filtered Vector Search by adding a Where Clause: 150 | cursor = ( 151 | Doc.select(Doc.id, Doc.content) 152 | .where(Doc.content == "dog") 153 | .order_by(Doc.embedding.l2_distance([1, 2, 3])) 154 | .limit(1) 155 | ) 156 | for row in cursor: 157 | print(row.id, row.content) 158 | ``` 159 | 160 | ### Django 161 | 162 | > [!TIP] 163 | > 164 | > Django is a full-featured web framework, not just an ORM. The following usage introductions are provided for existing Django users. 165 | > 166 | > For new users to get started, consider using SQLAlchemy or Peewee. 167 | 168 | Install: 169 | 170 | ```bash 171 | pip install 'django-tidb[vector]~=5.0.0' 'django~=5.0.0' mysqlclient 172 | ``` 173 | 174 | Usage: 175 | 176 | 1\. Configure `django_tidb` as engine, like: 177 | 178 | ```python 179 | DATABASES = { 180 | 'default': { 181 | 'ENGINE': 'django_tidb', 182 | 'NAME': 'django', 183 | 'USER': 'root', 184 | 'PASSWORD': '', 185 | 'HOST': '127.0.0.1', 186 | 'PORT': 4000, 187 | }, 188 | } 189 | ``` 190 | 191 | 2\. Define a model with a vector field and vector index: 192 | 193 | ```python 194 | from django.db import models 195 | from django_tidb.fields.vector import VectorField, VectorIndex, L2Distance 196 | 197 | class Doc(models.Model): 198 | id = models.IntegerField(primary_key=True) 199 | embedding = VectorField(dimensions=3) 200 | content = models.TextField() 201 | class Meta: 202 | indexes = [VectorIndex(L2Distance("embedding"), name="idx")] 203 | ``` 204 | 205 | 3\. Insert data: 206 | 207 | ```python 208 | Doc.objects.create(id=1, content="dog", embedding=[1, 2, 1]) 209 | Doc.objects.create(id=2, content="fish", embedding=[1, 2, 4]) 210 | Doc.objects.create(id=3, content="tree", embedding=[1, 0, 0]) 211 | ``` 212 | 213 | 4\. 
Perform Vector Search for Top K=1: 214 | 215 | ```python 216 | queryset = ( 217 | Doc.objects 218 | .order_by(L2Distance("embedding", [1, 2, 3])) 219 | .values("id", "content")[:1] 220 | ) 221 | print(queryset) 222 | ``` 223 | 224 | 5\. Perform filtered Vector Search by adding a Where Clause: 225 | 226 | ```python 227 | queryset = ( 228 | Doc.objects 229 | .filter(content="dog") 230 | .order_by(L2Distance("embedding", [1, 2, 3])) 231 | .values("id", "content")[:1] 232 | ) 233 | print(queryset) 234 | ``` 235 | 236 | For more details, see [django-tidb](https://github.com/pingcap/django-tidb?tab=readme-ov-file#vector-beta). 237 | 238 | ### TiDB Vector Client 239 | 240 | Within the framework, you can directly utilize the built-in `TiDBVectorClient`, as demonstrated by integrations like [Langchain](https://python.langchain.com/docs/integrations/vectorstores/tidb_vector) and [Llama index](https://docs.llamaindex.ai/en/stable/community/integrations/vector_stores.html#using-a-vector-store-as-an-index), to seamlessly interact with TiDB Vector. This approach abstracts away the need to manage the underlying ORM, simplifying your interaction with the vector store. 241 | 242 | We provide `TiDBVectorClient` which is based on sqlalchemy, you need to use `pip install tidb-vector[client]` to install it. 
243 | 244 | Create a `TiDBVectorClient` instance: 245 | 246 | ```python 247 | from tidb_vector.integrations import TiDBVectorClient 248 | 249 | TABLE_NAME = 'vector_test' 250 | CONNECTION_STRING = 'mysql+pymysql://:@:4000/?ssl_verify_cert=true&ssl_verify_identity=true' 251 | 252 | tidb_vs = TiDBVectorClient( 253 | # the table which will store the vector data 254 | table_name=TABLE_NAME, 255 | # tidb connection string 256 | connection_string=CONNECTION_STRING, 257 | # the dimension of the vector, in this example, we use the ada model, which has 1536 dimensions 258 | vector_dimension=1536, 259 | # if recreate the table if it already exists 260 | drop_existing_table=True, 261 | ) 262 | ``` 263 | 264 | Bulk insert: 265 | 266 | ```python 267 | ids = [ 268 | "f8e7dee2-63b6-42f1-8b60-2d46710c1971", 269 | "8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6", 270 | "e4991349-d00b-485c-a481-f61695f2b5ae", 271 | ] 272 | documents = ["foo", "bar", "baz"] 273 | embeddings = [ 274 | text_to_embedding("foo"), 275 | text_to_embedding("bar"), 276 | text_to_embedding("baz"), 277 | ] 278 | metadatas = [ 279 | {"page": 1, "category": "P1"}, 280 | {"page": 2, "category": "P1"}, 281 | {"page": 3, "category": "P2"}, 282 | ] 283 | 284 | tidb_vs.insert( 285 | ids=ids, 286 | texts=documents, 287 | embeddings=embeddings, 288 | metadatas=metadatas, 289 | ) 290 | ``` 291 | 292 | Query: 293 | 294 | ```python 295 | tidb_vs.query(text_to_embedding("foo"), k=3) 296 | 297 | # query with filter 298 | tidb_vs.query(text_to_embedding("foo"), k=3, filter={"category": "P1"}) 299 | ``` 300 | 301 | Bulk delete: 302 | 303 | ```python 304 | tidb_vs.delete(["f8e7dee2-63b6-42f1-8b60-2d46710c1971"]) 305 | 306 | # delete with filter 307 | tidb_vs.delete(["f8e7dee2-63b6-42f1-8b60-2d46710c1971"], filter={"category": "P1"}) 308 | ``` 309 | 310 | ## Examples 311 | 312 | There are some examples to show how to use the tidb-vector-python to interact with TiDB Vector in different scenarios. 
313 | 314 | - [OpenAI Embedding](./examples/openai_embedding/README.md): use the OpenAI embedding model to generate vectors for text data, store them in TiDB Vector, and search for similar text. 315 | - [Image Search](./examples/image_search/README.md): use the OpenAI CLIP model to generate vectors for image and text, store them in TiDB Vector, and search for similar images. 316 | - [LlamaIndex RAG with UI](./examples/llamaindex-tidb-vector-with-ui/README.md): use LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application. 317 | - [Chat with URL](./examples/llamaindex-tidb-vector/README.md): use LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application that can chat with a URL. 318 | - [GraphRAG](./examples/graphrag-demo/README.md): 20 lines of code using TiDB Serverless to build a Knowledge Graph based RAG application. 319 | - [GraphRAG Step by Step Tutorial](./examples/graphrag-step-by-step-tutorial/README.md): Step by step tutorial to build a Knowledge Graph based RAG application with Colab notebook. In this tutorial, you will learn how to extract knowledge from a text corpus, build a Knowledge Graph, store the Knowledge Graph in TiDB Serverless, and search from the Knowledge Graph. 320 | - [Vector Search Notebook with SQLAlchemy](https://colab.research.google.com/drive/1LuJn4mtKsjr3lHbzMa2RM-oroUvpy83y?usp=sharing): use [SQLAlchemy](https://www.sqlalchemy.org/) to interact with TiDB Serverless: connect db, index&store data and then search vectors. 321 | - [Build RAG with Jina AI Embeddings](./examples/jina-ai-embeddings-demo/README.md): use Jina AI to generate embeddings for text data, store the embeddings in TiDB Vector Storage, and search for similar embeddings. 322 | - [Semantic Cache](./examples/semantic-cache/README.md): build a semantic cache with Jina AI and TiDB Vector. 
323 | 324 | for more examples, see the [examples](./examples) directory. 325 | 326 | ## Contributing 327 | 328 | Please feel free to reach out to the maintainers if you have any questions or need help with the project. Before contributing, please read the [CONTRIBUTING.md](./CONTRIBUTING.md) file. 329 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Vector Examples 2 | 3 | This directory contains examples of how to use the TiDB as a vector database. 4 | 5 | ## Prerequisites 6 | 7 | Please follow the instructions below to set up a TiDB Serverless cluster with built-in vector search supported. 8 | 9 | 1. Sign up [TiDB Cloud](https://tidbcloud.com) 10 | 2. Follow this [tutorial](https://docs.pingcap.com/tidbcloud/tidb-cloud-quickstart#step-1-create-a-tidb-cluster) to create a TiDB Serverless cluster 11 | 3. Navigate to the [Clusters](https://tidbcloud.com/console/clusters) page, and then click the name of your target cluster to go to its overview page 12 | 4. Click Connect in the upper-right corner. 13 | 5. In the connection dialog, select General from the Connect With dropdown and keep the default setting of the Endpoint Type as Public. 14 | 6. If you have not set a password yet, click Create password to generate a random password. 15 | 16 |
17 | 18 | The connection dialog of TiDB Serverless 19 | 20 |
The connection dialog of TiDB Serverless
21 |
22 | 7. Save the connection parameters to a safe place. You will need them to connect to the TiDB Serverless cluster in the following examples. 23 | 24 | ## Examples 25 | - [OpenAI Embedding](./openai_embedding/README.md): use the OpenAI embedding model to generate vectors for text data. 26 | - [Image Search](./image_search/README.md): use the OpenAI CLIP model to generate vectors for image and text. 27 | - [LlamaIndex RAG with UI](./llamaindex-tidb-vector-with-ui/README.md): use the LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application. 28 | - [Chat with URL](./llamaindex-tidb-vector/README.md): use LlamaIndex to build an [RAG(Retrieval-Augmented Generation)](https://docs.llamaindex.ai/en/latest/getting_started/concepts/) application that can chat with a URL. 29 | - [GraphRAG](./graphrag-demo/README.md): 20 lines code of using TiDB Serverless to build a Knowledge Graph based RAG application. 30 | - [GraphRAG Step by Step Tutorial](./graphrag-step-by-step-tutorial/README.md): Step by step tutorial to build a Knowledge Graph based RAG application with Colab notebook. In this tutorial, you will learn how to extract knowledge from a text corpus, build a Knowledge Graph, store the Knowledge Graph in TiDB Serverless, and search from the Knowledge Graph. 31 | - [Vector Search Notebook with SQLAlchemy](https://colab.research.google.com/drive/1LuJn4mtKsjr3lHbzMa2RM-oroUvpy83y?usp=sharing): use [SQLAlchemy](https://www.sqlalchemy.org/) to interact with TiDB Serverless: connect db, index&store data and then search vectors. 32 | - [Build RAG with Jina AI Embeddings](./jina-ai-embeddings-demo/README.md): use Jina AI to generate embeddings for text data, store the embeddings in TiDB Vector Storage, and search for similar embeddings. 33 | - [Semantic Cache](./semantic-cache/README.md): build a semantic cache with Jina AI and TiDB Vector. 
34 | 35 | ## Real World Applications 36 | 37 | ### TiDB.ai 38 | 39 | [tidb.ai](https://tidb.ai) is an amazing out-of-the-box Graph RAG (Retrieval Augmented Generation) template project based on the TiDB vector store. It contains UI and server logic; fork it on [GitHub](https://github.com/pingcap/tidb.ai) and deploy your own. 40 | 41 | ![out-of-box-conversational-search](https://github.com/pingcap/tidb.ai/assets/1237528/0784e26e-8392-4bbe-bda1-6a680b12a805 "Image Title") 42 | -------------------------------------------------------------------------------- /examples/dspy-demo/.env.example: -------------------------------------------------------------------------------- 1 | # An example database URL to connect to a TiDB cluster from macOS: 2 | # mysql+pymysql://.root:@gateway01..prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true 3 | TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 4 | 5 | # The name of the language model to use for the language model-based retriever. 6 | LM_MODEL_NAME="" 7 | 8 | # The base URL of the Ollama API. 9 | OLLAMA_BASE_URL="http://:11434" 10 | 11 | # The API key to use for the Ollama API. 12 | OLLAMA_API_KEY="ollama" 13 | 14 | # sentence-transformers model 15 | SENTENCE_TRANSFORMERS_MODEL="" -------------------------------------------------------------------------------- /examples/dspy-demo/README.md: -------------------------------------------------------------------------------- 1 | # DSPy Demo 2 | 3 | This example demonstrates how to use DSPy and TiDB Serverless to build a simple RAG application. 
4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster 8 | - Python 3.10 or later 9 | - Ollama or OpenAI 10 | 11 | ## Run the example 12 | 13 | ### Clone this repo 14 | 15 | ```bash 16 | git clone https://github.com/pingcap/tidb-vector-python.git 17 | ``` 18 | 19 | ### Create a virtual environment 20 | 21 | ```bash 22 | cd tidb-vector-python/examples/dspy-demo 23 | python3 -m venv .venv 24 | source .venv/bin/activate 25 | ``` 26 | 27 | ### Install dependencies 28 | 29 | ```bash 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ### Set the environment variables 34 | 35 | Get the TiDB connection string via `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as 36 | described in the [Prerequisites](../README.md#prerequisites) section. 37 | 38 | The TiDB connection string will look like: 39 | 40 | ``` 41 | mysql+pymysql://{TIDB_USER}:{TIDB_PASSWORD}@{TIDB_HOST}:{TIDB_PORT}/{TIDB_DB_NAME}?ssl_verify_cert=True&ssl_verify_identity=True 42 | ``` 43 | 44 | ### Run this example 45 | 46 | ```text 47 | $ python3 example.py 48 | Connected to TiDB. 49 | describe table: {'success': True, 'result': 6, 'error': None} 50 | Initializing the TidbRM model... 51 | TidbRM model initialized successfully. 52 | Loading sample data... 53 | sample_data.txt found. 54 | Sample data loaded successfully. 55 | Embedding sample data... 56 | 0 At My Wind [-0.27386308 -0.3816067 -0.12257734 0.04750763 -0.12517984] 57 | 1 Little Win [ 0.05535038 -0.2605278 -0.19080743 -0.3411712 -0.0255685 ] 58 | 2 Storm Wind [-0.24868685 -0.21516131 -0.03831396 0.08118728 -0.05171517] 59 | 3 Yes I Am ( [ 0.07458089 -0.31562874 -0.14104412 -0.13799803 -0.02719649] 60 | 4 The Great [-0.08352712 0.12166582 -0.07781561 0.2473993 -0.3156342 ] 61 | 5 Rosario Da [-0.02921938 0.16053236 -0.2157185 -0.14237025 0.3970173 ] 62 | 6 Robert B. [-0.05834749 -0.16446972 -0.00786973 0.02972636 0.03525066] 63 | 7 Richard M. 
[-0.13012317 -0.20805678 0.0064573 0.05393503 0.043081 ] 64 | 8 Everything [ 0.09028038 0.03007011 -0.29266015 -0.27439988 -0.2159805 ] 65 | 9 Everything [-0.02172723 0.22668567 -0.17105839 0.04179271 -0.18812893] 66 | 10 Janick Ger [ 0.179568 -0.4577289 -0.05370283 0.09678644 -0.27309376] 67 | 11 Dave Murra [ 0.01501587 -0.32756883 -0.08704209 -0.07916276 -0.23352458] 68 | 12 Roy Z | Ro [ 0.12954581 -0.27150235 -0.0992474 0.14631633 0.09378276] 69 | 13 Heather Ba [ 0.07651925 -0.23409796 -0.03234328 0.01846722 -0.09262329] 70 | 14 Gianfranco [ 0.00602041 0.29790103 -0.2082347 0.12557846 0.13808164] 71 | Sample data embedded successfully. 72 | Sample data number: 15 73 | Inserting documents into TiDB... 74 | Documents inserted successfully. 75 | Answering the question: 'who write At My Window'... 76 | Townes Van Zandt wrote At My Window, which is an album released in 1987. The reasoning provided explains that Townes Van Zandt was the songwriter behind this album, showcasing his unique style and poetic lyrics. The album features a mix of folk, country, and Americana sounds, exploring themes of love, loss, and self-discovery. At My Window is often cited as one of the greatest albums in American music history, having had a significant influence on many other artists. 77 | 78 | 79 | 80 | Answer questions with short factoid answers. 81 | 82 | --- 83 | 84 | Follow the following format. 85 | 86 | Context: may contain relevant facts 87 | 88 | Question: ${question} 89 | 90 | Reasoning: Let's think step by step in order to ${produce the answer}. We ... 91 | 92 | Answer: often between 1 and 5 words 93 | 94 | --- 95 | 96 | Context: 97 | [1] «{'long_text': 'Rosario Dawson | Rosario Isabel Dawson (born May 9, 1979) is an American actress, producer, singer, comic book writer, and political activist. She made her film debut in the 1995 teen drama "Kids". 
Her subsequent film roles include "He Got Game", "Men in Black II", "25th Hour", "Rent", "Sin City", "Death Proof", "Seven Pounds", "", and "Top Five". Dawson has also provided voice-over work for Disney and DC.'}» 98 | [2] «{'long_text': 'Dave Murray (musician) | David Michael "Dave" Murray (born 23 December 1956) is an English guitarist and songwriter best known as one of the earliest members of the British heavy metal band Iron Maiden. Along with the group\'s bassist and primary songwriter Steve Harris, Murray has appeared on all of the band\'s releases.'}» 99 | [3] «{'long_text': 'Heather Baker | Heather Baker (born October 9, 1984) is a female American songwriter, guitarist, producer and founder of the Electronica band Antiwave. Heather is known for being a session and touring guitar player for the likes of Bonnie Mckee (Pulse Music Publishing), Krewella (Columbia Records), Kerli (Island Records), The Iron Maidens (Powerslave Records) and currently plays with the band Fake Figures (members of Atreyu, Eyelid and Scars of Tomorrow) and NoMBe (TH3RD BRAIN)'}» 100 | [4] «{'long_text': 'Janick Gers | Janick Robert Gers ( ; born 27 January 1957 in Hartlepool, England) is an English musician, best known for being one of the three current guitarists in Iron Maiden, along with Dave Murray and Adrian Smith, as well as his earlier work with Gillan and White Spirit.'}» 101 | [5] «{'long_text': 'Robert B. Sherman | Robert Bernard Sherman (December 19, 1925 – March 6, 2012) was an American songwriter who specialized in musical films with his brother Richard Morton Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." 
Some of the Sherman Brothers\' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "The Slipper and the Rose", and "Charlotte\'s Web". Their most well known work, however, remains the theme park song "It\'s a Small World (After All)". According to Time.com, this song is the most performed song of all time.'}» 102 | [6] «{'long_text': 'Richard M. Sherman | Richard Morton Sherman (born June 12, 1928) is an American songwriter who specialized in musical films with his brother Robert Bernard Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers\' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "Snoopy Come Home", "Bedknobs and Broomsticks", "The Slipper and the Rose", and "Charlotte\'s Web". Their most well known work, however, remains the theme park song "It\'s a Small World (After All)". According to Time.com, this song is the most performed song of all time.'}» 103 | [7] «{'long_text': 'Everything Changes (Julian Lennon album) | Everything Changes is the sixth studio album by English singer-songwriter Julian Lennon. It was released on 2 October 2011.'}» 104 | [8] «{'long_text': 'Roy Z | Roy Z (born February, 1968) is an American guitarist, songwriter and producer, best known for his work with Bruce Dickinson (from Iron Maiden), Halford, and Judas Priest. He also is the founder of Tribe of Gypsies, a Latin influenced hard rock band.'}» 105 | [9] «{'long_text': 'Gianfranco Rosi (director) | Gianfranco Rosi is an Italian director, cinematographer, producer and screenwriter. 
His film "Sacro GRA" won Golden Lion at 70th Venice International Film Festival. "Sacro GRA" is the first documentary film to win Golden Lion in history of the Venice film festival and the first Italian film to win in fifteen years, after Gianni Amelio\'s "The Way We Laughed" won the award in 1998. His 2016 film "Fire at Sea", a documentary focused on European migrant crisis on the Sicilan island of Lampedusa, won the Golden Bear at the 66th Berlin International Film Festival. Rosi is the only documentary filmmaker to win two top prizes at major European film festivals (Cannes, Berlin and Venice) and is currently the only filmmaker besides Michael Haneke, Jafar Panahi, Ang Lee, and Ken Loach to win two top European festival prizes in the 21st century.'}» 106 | [10] «{'long_text': 'The Great Victorian Collection | The Great Victorian Collection, published in 1975, is a novel by Northern Irish-Canadian writer Brian Moore. Set in Carmel, California, it tells the story of a man who dreams that the empty parking lot he can see from his hotel window has been transformed by the arrival of a collection of priceless Victoriana on display in a vast open-air market. When he awakes he finds that he can no longer distinguish the dream from reality.'}» 107 | [11] «{'long_text': 'Everything Has Changed | "Everything Has Changed" is a song written and performed by American singer-songwriter Taylor Swift and English singer-songwriter Ed Sheeran, taken from Swift\'s fourth studio album, "Red" (2012). Produced by Butch Walker, the track was released as the sixth single from the album on July 16, 2013. "Everything Has Changed" is a guitar ballad combining folk and pop genres about "wanting to get to know a new lover better".'}» 108 | [12] «{'long_text': 'Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. 
It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released.'}» 109 | [13] «{'long_text': 'Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee.'}» 110 | [14] «{'long_text': 'Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Records on September 21, 1993 (see 1993 in music). The title is generally thought to refer to Etheridge\'s recent coming out as a lesbian, confirming long-standing rumors about her personal life. This is the album that gave Etheridge national recognition. The rock ballad "Come to My Window" was the first single released from the album, which peaked at No. 25 on the "Billboard" Hot 100, and its video featured actress Juliette Lewis having a nervous breakdown. This single brought the album into the public consciousness and was quickly followed by "I\'m the Only One", which became a major hit and reached No. 8 on the Hot 100, and "If I Wanted To", which hit No. 16.'}» 111 | [15] «{'long_text': 'At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt\'s first studio album in the nine years that followed 1978\'s "Flyin\' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high.'}» 112 | 113 | Question: who write At My Window 114 | 115 | Reasoning: Let's think step by step in order to At My Window is an album written and performed by Townes Van Zandt. The album was released in 1987 and it's considered one of his best works, showcasing his unique songwriting style and poetic lyrics. 
The album features a mix of folk, country, and Americana sounds, with songs that explore themes of love, loss, and self-discovery. At My Window is often cited as one of the greatest albums in the history of American music, and it has had a significant influence on many other artists. Townes Van Zandt was an American singer-songwriter who was active from the 1960s until his death in 1997. He was known for his poetic lyrics and his ability to tell stories through his 116 | 117 | Answer: Townes Van Zandt wrote At My Window, which is an album released in 1987. The reasoning provided explains that Townes Van Zandt was the songwriter behind this album, showcasing his unique style and poetic lyrics. The album features a mix of folk, country, and Americana sounds, exploring themes of love, loss, and self-discovery. At My Window is often cited as one of the greatest albums in American music history, having had a significant influence on many other artists. 118 | ``` 119 | -------------------------------------------------------------------------------- /examples/dspy-demo/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import partial 3 | import dspy 4 | from dotenv import load_dotenv 5 | from dspy.datasets import HotPotQA 6 | from dspy.evaluate import Evaluate 7 | from dspy.teleprompt import BootstrapFewShot 8 | from sentence_transformers import SentenceTransformer 9 | from tidb_vector.integrations import TiDBVectorClient 10 | from utils import sentence_transformer_embedding_function, TidbRM, RAG 11 | 12 | # Load the environment variables from the .env file. 
13 | load_dotenv()  # Read configuration (TIDB_DATABASE_URL, model names, etc.) from a local .env file. 14 | 15 | embed_model = SentenceTransformer(os.environ.get('SENTENCE_TRANSFORMERS_MODEL'), trust_remote_code=True) 16 | embed_model_dim = embed_model.get_sentence_embedding_dimension() 17 | embedding_function = partial(sentence_transformer_embedding_function, embed_model)  # Bind the model so callers only pass sentences. 18 | 19 | # The configuration for the TiDBVectorClient. 20 | tidb_vector_client = TiDBVectorClient( 21 | # The table which will store the TiDB vector data. 22 | table_name=os.environ.get('TIDB_TABLE_NAME', 'embedded_documents'), 23 | # The connection string to the TiDB cluster. 24 | # The connection string should be in the format of: 25 | # mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true 26 | connection_string=os.environ.get('TIDB_DATABASE_URL'), 27 | # The dimension of the vector generated by the embedding model. 28 | vector_dimension=embed_model_dim, 29 | # Determine whether to recreate the table if it already exists. 30 | drop_existing_table=True, 31 | ) 32 | 33 | print("Connected to TiDB.") 34 | print("describe table:", tidb_vector_client.execute("describe embedded_documents;"))  # NOTE(review): table name is hard-coded here but configurable via TIDB_TABLE_NAME above — confirm they match. 35 | 36 | print("Initializing the TidbRM model...") 37 | retriever_model = TidbRM(tidb_vector_client=tidb_vector_client, embedding_function=embedding_function) 38 | print("TidbRM model initialized successfully.") 39 | 40 | print("Loading sample data...") 41 | # Test sample data. 42 | # Load sample_data.txt; if the file is not local, you can use requests.get(url).text instead. 43 | # sample data url: https://raw.githubusercontent.com/wxywb/dspy_dataset_sample/master/sample_data.txt 44 | with open('sample_data.txt', 'r') as f: 45 | # A small data set is used here to speed up embedding; you can replace it with your own data. 46 | print("sample_data.txt found.") 47 | sample_data = f.read() 48 | print("Sample data loaded successfully.") 49 | 50 | print("Embedding sample data...") 51 | documents = [] 52 | for idx, passage in enumerate(sample_data.split('\n')[:3]):  # NOTE(review): only the first 3 lines are embedded, yet the README transcript shows 15 rows — confirm the [:3] slice is intended. 53 | embedding = embedding_function([passage])[0] 54 | print(idx, passage[:10], embedding[:5]) 55 | if len(passage) == 0:  # NOTE(review): this skip happens after embedding, so empty lines are still embedded and printed above. 56 | continue 57 | documents.append({ 58 | "id": str(idx), 59 | "text": passage, 60 | "embedding": embedding, 61 | "metadata": {"category": "album"}, 62 | }) 63 | print("Sample data embedded successfully.") 64 | print("Sample data number:", len(documents)) 65 | 66 | print("Inserting documents into TiDB...") 67 | tidb_vector_client.insert( 68 | ids=[doc["id"] for doc in documents], 69 | texts=[doc["text"] for doc in documents], 70 | embeddings=[doc["embedding"] for doc in documents], 71 | metadatas=[doc["metadata"] for doc in documents], 72 | ) 73 | print("Documents inserted successfully.") 74 | 75 | language_model = dspy.OllamaLocal( 76 | model=os.environ.get('LM_MODEL_NAME', 'llama3:8b'), 77 | base_url=os.environ.get('OLLAMA_BASE_URL'), 78 | api_key=os.environ.get('OLLAMA_API_KEY') 79 | ) 80 | dspy.settings.configure(lm=language_model)  # Make the Ollama LM the default for all DSPy modules. 81 | 82 | rag = RAG(retriever_model) 83 | 84 | dataset = HotPotQA(train_seed=1, train_size=2, eval_seed=2023, dev_size=5, test_size=0) 85 | # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata. 86 | trainset = [x.with_inputs('question') for x in dataset.train] 87 | devset = [x.with_inputs('question') for x in dataset.dev] 88 | 89 | metric = dspy.evaluate.answer_exact_match 90 | evaluate_on_hotpotqa = Evaluate(devset=devset[:], display_progress=True, display_table=False) 91 | score = evaluate_on_hotpotqa(rag, metric=metric) 92 | print('rag:', score) 93 | 94 | 95 | # Validation logic: check that the predicted answer is correct. 96 | # Also check that the retrieved context does contain that answer.
97 | def validate_context_and_answer(example, pred, trace=None): 98 | answer_em = dspy.evaluate.answer_exact_match(example, pred) 99 | answer_pm = dspy.evaluate.answer_passage_match(example, pred) 100 | return answer_em and answer_pm 101 | 102 | 103 | # Set up a basic teleprompter, which will compile our RAG program. 104 | teleprompter = BootstrapFewShot(metric=validate_context_and_answer) 105 | 106 | # Compile! 107 | compiled_rag = teleprompter.compile(rag, trainset=trainset) 108 | # Now compiled_rag is optimized and ready to answer your new question! 109 | score = evaluate_on_hotpotqa(compiled_rag, metric=metric) 110 | print('compile_rag:', score) 111 | 112 | if __name__ == '__main__': 113 | print("Answering the question: 'who write At My Window'...") 114 | print(rag("who write At My Window").answer) 115 | print(language_model.inspect_history(n=1)) 116 | -------------------------------------------------------------------------------- /examples/dspy-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | PyMySQL==1.1.0 2 | SQLAlchemy==2.0.30 3 | dspy-ai==2.4.9 4 | openai==1.35.1 5 | sentence-transformers==3.0.1 6 | tidb-vector -------------------------------------------------------------------------------- /examples/dspy-demo/sample_data.txt: -------------------------------------------------------------------------------- 1 | At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt's first studio album in the nine years that followed 1978's "Flyin' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high. 2 | Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee. 
3 | Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released. 4 | Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Records on September 21, 1993 (see 1993 in music). The title is generally thought to refer to Etheridge's recent coming out as a lesbian, confirming long-standing rumors about her personal life. This is the album that gave Etheridge national recognition. The rock ballad "Come to My Window" was the first single released from the album, which peaked at No. 25 on the "Billboard" Hot 100, and its video featured actress Juliette Lewis having a nervous breakdown. This single brought the album into the public consciousness and was quickly followed by "I'm the Only One", which became a major hit and reached No. 8 on the Hot 100, and "If I Wanted To", which hit No. 16. 5 | The Great Victorian Collection | The Great Victorian Collection, published in 1975, is a novel by Northern Irish-Canadian writer Brian Moore. Set in Carmel, California, it tells the story of a man who dreams that the empty parking lot he can see from his hotel window has been transformed by the arrival of a collection of priceless Victoriana on display in a vast open-air market. When he awakes he finds that he can no longer distinguish the dream from reality. 6 | Rosario Dawson | Rosario Isabel Dawson (born May 9, 1979) is an American actress, producer, singer, comic book writer, and political activist. She made her film debut in the 1995 teen drama "Kids". Her subsequent film roles include "He Got Game", "Men in Black II", "25th Hour", "Rent", "Sin City", "Death Proof", "Seven Pounds", "", and "Top Five". Dawson has also provided voice-over work for Disney and DC. 
7 | Robert B. Sherman | Robert Bernard Sherman (December 19, 1925 – March 6, 2012) was an American songwriter who specialized in musical films with his brother Richard Morton Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "The Slipper and the Rose", and "Charlotte's Web". Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 8 | Richard M. Sherman | Richard Morton Sherman (born June 12, 1928) is an American songwriter who specialized in musical films with his brother Robert Bernard Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "Snoopy Come Home", "Bedknobs and Broomsticks", "The Slipper and the Rose", and "Charlotte's Web". Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 9 | Everything Has Changed | "Everything Has Changed" is a song written and performed by American singer-songwriter Taylor Swift and English singer-songwriter Ed Sheeran, taken from Swift's fourth studio album, "Red" (2012). 
Produced by Butch Walker, the track was released as the sixth single from the album on July 16, 2013. "Everything Has Changed" is a guitar ballad combining folk and pop genres about "wanting to get to know a new lover better". 10 | Everything Changes (Julian Lennon album) | Everything Changes is the sixth studio album by English singer-songwriter Julian Lennon. It was released on 2 October 2011. 11 | Janick Gers | Janick Robert Gers ( ; born 27 January 1957 in Hartlepool, England) is an English musician, best known for being one of the three current guitarists in Iron Maiden, along with Dave Murray and Adrian Smith, as well as his earlier work with Gillan and White Spirit. 12 | Dave Murray (musician) | David Michael "Dave" Murray (born 23 December 1956) is an English guitarist and songwriter best known as one of the earliest members of the British heavy metal band Iron Maiden. Along with the group's bassist and primary songwriter Steve Harris, Murray has appeared on all of the band's releases. 13 | Roy Z | Roy Z (born February, 1968) is an American guitarist, songwriter and producer, best known for his work with Bruce Dickinson (from Iron Maiden), Halford, and Judas Priest. He also is the founder of Tribe of Gypsies, a Latin influenced hard rock band. 14 | Heather Baker | Heather Baker (born October 9, 1984) is a female American songwriter, guitarist, producer and founder of the Electronica band Antiwave. Heather is known for being a session and touring guitar player for the likes of Bonnie Mckee (Pulse Music Publishing), Krewella (Columbia Records), Kerli (Island Records), The Iron Maidens (Powerslave Records) and currently plays with the band Fake Figures (members of Atreyu, Eyelid and Scars of Tomorrow) and NoMBe (TH3RD BRAIN) 15 | Gianfranco Rosi (director) | Gianfranco Rosi is an Italian director, cinematographer, producer and screenwriter. His film "Sacro GRA" won Golden Lion at 70th Venice International Film Festival. 
"Sacro GRA" is the first documentary film to win Golden Lion in history of the Venice film festival and the first Italian film to win in fifteen years, after Gianni Amelio's "The Way We Laughed" won the award in 1998. His 2016 film "Fire at Sea", a documentary focused on European migrant crisis on the Sicilan island of Lampedusa, won the Golden Bear at the 66th Berlin International Film Festival. Rosi is the only documentary filmmaker to win two top prizes at major European film festivals (Cannes, Berlin and Venice) and is currently the only filmmaker besides Michael Haneke, Jafar Panahi, Ang Lee, and Ken Loach to win two top European festival prizes in the 21st century. -------------------------------------------------------------------------------- /examples/dspy-demo/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | import pytest 4 | from functools import partial 5 | from utils import sentence_transformer_embedding_function, Vector, Vectors 6 | from pydantic import BaseModel, ValidationError 7 | from sentence_transformers import SentenceTransformer 8 | 9 | load_dotenv() 10 | 11 | 12 | class VectorModel(BaseModel): 13 | vector: Vector 14 | 15 | 16 | class VectorsModel(BaseModel): 17 | vectors: Vectors 18 | 19 | 20 | @pytest.fixture(scope='module') 21 | def embed_model(): 22 | return SentenceTransformer(os.environ.get('SENTENCE_TRANSFORMERS_MODEL'), trust_remote_code=True) 23 | 24 | 25 | def test_sentence_transformer_embedding_function_return_shape(embed_model: SentenceTransformer): 26 | embed_model_dim = embed_model.get_sentence_embedding_dimension() 27 | 28 | assert embed_model.encode(["Hello, world!"]).shape == (1, embed_model_dim) 29 | assert embed_model.encode(["Hello, world!", "hi"]).shape == (2, embed_model_dim) 30 | assert embed_model.encode("Hello, World!").shape == (embed_model_dim,) 31 | 32 | 33 | def test_embedding_function(embed_model: SentenceTransformer): 34 
def test_embedding_function(embed_model: SentenceTransformer):
    """The embedding function must produce data that validates as Vector / Vectors.

    Constructing the pydantic models directly lets a ``ValidationError``
    propagate and fail the test with its full traceback, instead of the
    previous ``try/except ValidationError: assert False`` pattern, which
    collapsed every failure into an uninformative bare ``AssertionError``.
    """
    embedding_function = partial(sentence_transformer_embedding_function, embed_model)

    # A single sentence yields a single Vector.
    VectorModel(vector=embedding_function(sentences="Hello, world!"))

    # A list of sentences yields a list of Vectors, one per sentence.
    VectorsModel(vectors=embedding_function(sentences=["Hello, world!"]))
    VectorsModel(vectors=embedding_function(sentences=["Hello, world!", "hi"]))
class TidbRM(dspy.Retrieve):
    """
    A retrieval module that uses TiDBVectorClient to return passages for a given query.

    Args:
        tidb_vector_client (TiDBVectorClient): The TiDBVectorClient instance to use for querying TiDB.
        embedding_function (callable): The function to convert a list of text to embeddings.
            The embedding function should take a list of text strings as input and output a list of embeddings.
        k (int, optional): The number of top passages to retrieve. Defaults to 3.

    Returns:
        dspy.Prediction: An object containing the retrieved passages.

    Examples:
        Below is a code snippet that shows how to use this as the default retriever:
        use OpenAI
        ```python
        llm = dspy.OpenAI(model="gpt-3.5-turbo")
        retriever_model = TidbRM(
            tidb_vector_client=tidb_vector_client,
            embedding_function=sentence_transformer_embedding_function
        )
        dspy.settings.configure(rm=retriever_model)
        ```

        use Ollama
        ```python
        llm = dspy.OllamaLocal(model="llama3:8b")
        retriever_model = TidbRM(
            tidb_vector_client=tidb_vector_client,
            embedding_function=llm
        )
        ```
    """

    def __init__(self, tidb_vector_client: TiDBVectorClient, embedding_function: Optional[callable] = None, k: int = 3):
        super().__init__(k)
        self.tidb_vector_client = tidb_vector_client
        self.embedding_function = embedding_function
        self.top_k = k

    def forward(self, query_or_queries: Union[str, List[str]], k: Optional[int] = None, **kwargs) -> dspy.Prediction:
        """
        Retrieve passages for the given query.

        Args:
            query_or_queries (Union[str, List[str]]): The query or queries for which to retrieve passages.
            k (Optional[int]): The number of top passages to retrieve. Defaults to the ``k``
                given at construction time.

        Returns:
            dspy.Prediction: An object containing the retrieved passages, ordered from
            most to least similar to the query.
        """
        query_embeddings = self.embedding_function(query_or_queries)
        k = k or self.top_k
        tidb_vector_res = self.tidb_vector_client.query(query_vector=query_embeddings, k=k)

        # Deduplicate by passage text, keeping the distance reported by TiDB.
        passages_scores = {}
        for res in tidb_vector_res:
            res.metadata = dotdict(res.metadata)
            passages_scores[res.document] = res.distance

        # BUG FIX: `res.distance` is a cosine *distance* (smaller = more similar),
        # so sort ascending. The previous `reverse=True` — an idiom copied from
        # score-based retrievers where higher is better — returned the LEAST
        # relevant passages first.
        sorted_passages = sorted(passages_scores.items(), key=lambda item: item[1])

        return dspy.Prediction(passages=[dotdict({"long_text": passage}) for passage, _ in sorted_passages])
class RAG(dspy.Module):
    """A minimal retrieve-then-generate (RAG) pipeline.

    Args:
        rm: a retrieval module (e.g. TidbRM) that, given a question, returns an
            object with a ``passages`` attribute.
    """

    def __init__(self, rm):
        super().__init__()
        self.retrieve = rm

        # This signature indicates the task imposed on the COT module.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        # Use the configured retriever (e.g. TidbRM) to fetch context for the question.
        # (The original comment said "milvus_rm", left over from the Milvus example.)
        context = self.retrieve(question).passages
        # The COT module takes "context, question" and outputs "answer".
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(context=[item.long_text for item in context], answer=prediction.answer)
36 | 37 | ```bash 38 | export GEMINI_API_KEY="*******" 39 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 40 | export TIDB_USERNAME="****.root" 41 | export TIDB_PASSWORD="****" 42 | ``` 43 | 44 | ### Run the example 45 | 46 | ```bash 47 | python3 example.py 48 | ``` 49 | -------------------------------------------------------------------------------- /examples/gemini-ai-embeddings-demo/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "ewKGZW06kmIv" 7 | }, 8 | "source": [ 9 | "# Example of Embedding\n", 10 | "\n", 11 | "It is an embedding example that uses `tidb_vector_python` as its library." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "id": "F1fsS576izUl" 18 | }, 19 | "source": [ 20 | "## Install Dependencies" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "id": "pTpKX_lDizUp" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "%%capture\n", 32 | "%pip install google.generativeai peewee pymysql tidb_vector" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "psEHGWiHizUq" 39 | }, 40 | "source": [ 41 | "## Preapre the environment\n", 42 | "\n", 43 | "> **Note:**\n", 44 | ">\n", 45 | "> - You can get the `GEMINI_API_KEY` from [GeminiAI](https://ai.google.dev/gemini-api/docs/quickstart).\n", 46 | "> - You can get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.\n", 47 | "\n", 48 | "Set the embedding model as `models/embedding-001`, and\n", 49 | "the amount of embedding dimensions is `768`." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "id": "MgKOjwmYizUq" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import getpass\n", 61 | "\n", 62 | "GEMINI_API_KEY = getpass.getpass(\"Enter your GeminiAI API key: \")\n", 63 | "TIDB_HOST = input(\"Enter your TiDB host: \")\n", 64 | "TIDB_USERNAME = input(\"Enter your TiDB username: \")\n", 65 | "TIDB_PASSWORD = getpass.getpass(\"Enter your TiDB password: \")\n", 66 | "\n", 67 | "embedding_model = \"models/embedding-001\"\n", 68 | "embedding_dimensions = 768" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "3WbH_BITizUr" 75 | }, 76 | "source": [ 77 | "## Initial the Clients of OpenAI and Database" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "id": "UWtcs58-izUr" 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "import google.generativeai as genai\n", 89 | "from peewee import Model, MySQLDatabase, TextField, SQL\n", 90 | "from tidb_vector.peewee import VectorField\n", 91 | "\n", 92 | "genai.configure(api_key=GEMINI_API_KEY)\n", 93 | "db = MySQLDatabase(\n", 94 | " 'test',\n", 95 | " user=TIDB_USERNAME,\n", 96 | " password=TIDB_PASSWORD,\n", 97 | " host=TIDB_HOST,\n", 98 | " port=4000,\n", 99 | " ssl_verify_cert=True,\n", 100 | " ssl_verify_identity=True\n", 101 | ")\n", 102 | "db.connect()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "uOyjrmWJizUr" 109 | }, 110 | "source": [ 111 | "## Prepare the Context\n", 112 | "\n", 113 | "In this case, contexts are the documents, use the openai embeddings model to get the embeddings of the documents, and store them in the TiDB." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "id": "_e5P_m0MizUs" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "documents = [\n", 125 | " \"TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.\",\n", 126 | " \"TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.\",\n", 127 | " \"TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. \",\n", 128 | "]\n", 129 | "\n", 130 | "class DocModel(Model):\n", 131 | " text = TextField()\n", 132 | " embedding = VectorField(dimensions=embedding_dimensions)\n", 133 | "\n", 134 | " class Meta:\n", 135 | " database = db\n", 136 | " table_name = \"gemini_embedding_test\"\n", 137 | "\n", 138 | " def __str__(self):\n", 139 | " return self.text\n", 140 | "\n", 141 | "db.drop_tables([DocModel])\n", 142 | "db.create_tables([DocModel])\n", 143 | "\n", 144 | "embeddings = genai.embed_content(model=embedding_model, content=documents, task_type=\"retrieval_document\")\n", 145 | "data_source = [\n", 146 | " {\"text\": doc, \"embedding\": emb}\n", 147 | " for doc, emb in zip(documents, embeddings['embedding'])\n", 148 | "]\n", 149 | "DocModel.insert_many(data_source).execute()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "id": "zMP-P1g8izUs" 156 | }, 157 | "source": [ 158 | "## Initial the Vector of Question\n", 159 | "\n", 160 | "Ask a question, use the openai embeddings model to get the embeddings of the question" 161 | ] 162 | }, 163 | { 164 | 
"cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "id": "-zrTOxs4izUt" 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "question = \"what is TiKV?\"\n", 172 | "question_embedding = genai.embed_content(model=embedding_model, content=[question], task_type=\"retrieval_query\")['embedding'][0]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "atc0gXVZizUt" 179 | }, 180 | "source": [ 181 | "## Retrieve by Cosine Distance of Vectors\n", 182 | "Get the relevant documents from the TiDB by comparing the embeddings of the question and the documents" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "id": "DTtJRX64izUt" 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "related_docs = DocModel.select(\n", 194 | " DocModel.text, DocModel.embedding.cosine_distance(question_embedding).alias(\"distance\")\n", 195 | ").order_by(SQL(\"distance\")).limit(3)\n", 196 | "\n", 197 | "print(\"Question:\", question)\n", 198 | "print(\"Related documents:\")\n", 199 | "for doc in related_docs:\n", 200 | " print(doc.distance, doc.text)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "id": "bYBetPchmNUp" 207 | }, 208 | "source": [ 209 | "## Cleanup" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "id": "Lh27gC7gizUt" 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "db.close()" 221 | ] 222 | } 223 | ], 224 | "metadata": { 225 | "colab": { 226 | "provenance": [], 227 | "toc_visible": true 228 | }, 229 | "kernelspec": { 230 | "display_name": ".venv", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | 
"pygments_lexer": "ipython3", 244 | "version": "3.10.13" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 0 249 | } 250 | -------------------------------------------------------------------------------- /examples/gemini-ai-embeddings-demo/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from peewee import Model, MySQLDatabase, TextField, SQL 3 | from tidb_vector.peewee import VectorField 4 | import google.generativeai as genai # Hypothetical import for Gemini API client 5 | 6 | # Init Gemini client 7 | # Adjust the initialization according to the Gemini API documentation 8 | genai.configure(api_key=os.environ.get('GEMINI_API_KEY')) 9 | embedding_model = 'models/embedding-001' # Replace with the actual model name 10 | embedding_dimensions = 768 # Adjust if different for the Gemini model 11 | 12 | # Init TiDB connection 13 | db = MySQLDatabase( 14 | 'test', 15 | user=os.environ.get('TIDB_USERNAME'), 16 | password=os.environ.get('TIDB_PASSWORD'), 17 | host=os.environ.get('TIDB_HOST'), 18 | port=4000, 19 | ssl_verify_cert=True, 20 | ssl_verify_identity=True 21 | ) 22 | 23 | documents = [ 24 | "TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.", 25 | "TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.", 26 | "TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. 
# Define a peewee model with a VectorField to store the embeddings
class DocModel(Model):
    # Original document text.
    text = TextField()
    # Gemini embedding of `text`; the dimension must match the embedding
    # model configured above (768 for models/embedding-001).
    embedding = VectorField(dimensions=embedding_dimensions)

    class Meta:
        # Reuse the module-level TiDB connection; rows are stored in
        # the `gemini_embedding_test` table.
        database = db
        table_name = "gemini_embedding_test"

    def __str__(self):
        return self.text
With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. 75 | # 0.3317073143109729 TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee. 76 | # 0.3690570695898543 TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. -------------------------------------------------------------------------------- /examples/gemini-ai-embeddings-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | google.generativeai 2 | peewee 3 | tidb-vector -------------------------------------------------------------------------------- /examples/graphrag-demo/README.md: -------------------------------------------------------------------------------- 1 | # GraphRAG Demo 2 | 3 | This example demonstrates how to use the DSPy and TiDB Serverless to build a simple GraphRAG application. It crawled an example webpage and index the content to TiDB Serverless with Graph, then use the Graph and Vector to search the content and generate the answer with OpenAI. 
from sqlalchemy import create_engine, text
import openai
import getpass

# TiDB Connection String Pattern:
# mysql+pymysql://{TIDB_USER}:{TIDB_PASSWORD}@{TIDB_HOST}:{TIDB_PORT}/{TIDB_DB_NAME}?ssl_verify_cert=True&ssl_verify_identity=True

# Prompt for credentials interactively so secrets never appear on the command line.
db_engine = create_engine(getpass.getpass("Input your TIDB connection string:"))
oai_cli = openai.OpenAI(api_key=getpass.getpass("Input your OpenAI API Key:"))
question = input("Enter your question:")
# Embed the question; the vector is passed to SQL as its string representation,
# which TiDB's VEC_* functions accept as a vector literal.
embedding = str(oai_cli.embeddings.create(input=[question], model="text-embedding-3-small").data[0].embedding)

with db_engine.connect() as conn:
    # Graph retrieval in one statement:
    #   1. initial_entity: the single entity whose description vector is closest
    #      (cosine distance) to the question embedding.
    #   2. entities_ids: expand one hop over `relationships` in both directions
    #      (sources pointing at it, targets it points at), plus the entity itself.
    #   3. Return the descriptions of every collected entity as LLM context.
    # The embedding is bound as a parameter (:embedding), not string-interpolated.
    result = conn.execute(text("""
    WITH initial_entity AS (
        SELECT id FROM `entities`
        ORDER BY VEC_Cosine_Distance(description_vec, :embedding) LIMIT 1
    ), entities_ids AS (
        SELECT source_entity_id i FROM relationships r INNER JOIN initial_entity i ON r.target_entity_id = i.id
        UNION SELECT target_entity_id i FROM relationships r INNER JOIN initial_entity i ON r.source_entity_id = i.id
        UNION SELECT initial_entity.id i FROM initial_entity
    ) SELECT description FROM `entities` WHERE id IN (SELECT i FROM entities_ids);"""), {"embedding": embedding}).fetchall()

# Feed the retrieved entity descriptions to the chat model as system context
# and print its answer to the original question.
print(oai_cli.chat.completions.create(model="gpt-4o", messages=[
    {"role": "system", "content": f"Please carefully answer the question by {str(result)}"},
    {"role": "user", "content": question}]).choices[0].message.content)
4 | 5 | ## Table of Contents 6 | 7 | 1. **Setting** 8 | 2. **Dependencies** 9 | 3. **Prerequisites** 10 | 4. **Core Code** 11 | - Part 1: Indexing 12 | - Set OpenAI and DSPy 13 | - Load Raw Wikipedia Page 14 | - Extract Raw Wikipedia Page to Knowledge Graph 15 | - Let's Show the Graph 16 | - Save Graph to TiDB Serverless 17 | - Part 2: Retrieve 18 | - Ask Question 19 | - Find Entities and Relationships 20 | - Part 3: Generate Answer 21 | 22 | 23 | Try it in the [Google colab](https://colab.research.google.com/github/pingcap/tidb-vector-python/blob/main/examples/graphrag-step-by-step-tutorial/example.ipynb). 24 | -------------------------------------------------------------------------------- /examples/image_search/README.md: -------------------------------------------------------------------------------- 1 | # Image Search Example 2 | 3 | This example shows how to use OpenAI CLIP to encode images as embeddings and store them in TiDB Serverless. It also demonstrates how to use the CLIP model to encode query text and search for the most similar images. 4 | 5 | Try it in the [Google colab](https://colab.research.google.com/github/pingcap/tidb-vector-python/blob/main/examples/image_search/example.ipynb). 6 | -------------------------------------------------------------------------------- /examples/image_search/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example of Image Search\n", 8 | "\n", 9 | "It is an example of image search using [OpenAI CLIP](https://huggingface.co/docs/transformers/model_doc/clip) and TiDB Serverless Vector Search.\n", 10 | "\n", 11 | "We will use the CLIP model to encode the image to a 512-dimensional vector and store them in TiDB Serverless. Then use the same model to encode the text query and search for the most similar images in TiDB Serverless." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Install dependencies\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "%pip install -q torch transformers requests ipyplot datasets sqlalchemy pymysql tidb_vector" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Prepare the environment\n", 35 | "\n", 36 | "> **Note:**\n", 37 | ">\n", 38 | "> - You can get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.\n", 39 | "> - In this example, we use CLIP to generate text and image embeddings with 512 dimensions.\n" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import getpass\n", 49 | "\n", 50 | "TIDB_HOST = input(\"Enter your TiDB host: \")\n", 51 | "TIDB_USERNAME = input(\"Enter your TiDB username: \")\n", 52 | "TIDB_PASSWORD = getpass.getpass(\"Enter your TiDB password: \")\n", 53 | "\n", 54 | "CLIP_DIMENSION = 512" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Initial the Database and Table" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from sqlalchemy import URL, create_engine, Column, Integer\n", 71 | "from sqlalchemy.orm import declarative_base, sessionmaker\n", 72 | "from tidb_vector.sqlalchemy import VectorType\n", 73 | "\n", 74 | "engine = create_engine(URL(\n", 75 | " \"mysql+pymysql\",\n", 76 | " username=TIDB_USERNAME,\n", 77 | " password=TIDB_PASSWORD,\n", 78 | " host=TIDB_HOST,\n", 79 | " port=4000,\n", 80 | " database=\"test\",\n", 81 | " query={\"ssl_verify_cert\": True, \"ssl_verify_identity\": True},\n", 82 | "))\n", 83 
| "\n", 84 | "Session = sessionmaker(bind=engine)\n", 85 | "Base = declarative_base()\n", 86 | "\n", 87 | "class ImageSearchTest(Base):\n", 88 | " __tablename__ = \"image_search_test\"\n", 89 | "\n", 90 | " id = Column(Integer, primary_key=True)\n", 91 | " image_id = Column(Integer)\n", 92 | " embedding = Column(VectorType(CLIP_DIMENSION))\n", 93 | "\n", 94 | "Base.metadata.drop_all(engine)\n", 95 | "Base.metadata.create_all(engine)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Initial CLIP model" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "import torch\n", 112 | "from transformers import CLIPProcessor, CLIPModel\n", 113 | "\n", 114 | "\n", 115 | "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n", 116 | "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Load test images" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "import datasets\n", 133 | "\n", 134 | "imagenet_datasets = datasets.load_dataset('theodor1289/imagenet-1k_tiny', split='train')" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# inspect the imagenet datasets\n", 144 | "imagenet_datasets[0]" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "extract the images" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "import ipyplot\n", 161 | "\n", 162 | "imagenet_images = [i['image'] for i in imagenet_datasets]\n", 163 | 
"ipyplot.plot_images(imagenet_images, max_images=20, img_width=100)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## Define the encode function and other helper functions" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "def encode_images_to_embeddings(images):\n", 180 | " # accept a list of images and return the image embeddings\n", 181 | " with torch.no_grad():\n", 182 | " inputs = processor(images=images, return_tensors=\"pt\")\n", 183 | " image_features = model.get_image_features(**inputs)\n", 184 | " return image_features.cpu().detach().numpy()\n", 185 | "\n", 186 | "def encode_text_to_embedding(text):\n", 187 | " # accept a text and return the text embedding\n", 188 | " with torch.no_grad():\n", 189 | " inputs = processor(text=text, return_tensors=\"pt\")\n", 190 | " text_features = model.get_text_features(**inputs)\n", 191 | " return text_features.cpu().detach().numpy()[0]\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "## Store the images and their corresponding image embeddings in TiDB Serverless" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "images_embedding = encode_images_to_embeddings(imagenet_images)\n", 208 | "objects = []\n", 209 | "\n", 210 | "for i, embedding in enumerate(images_embedding):\n", 211 | " img = imagenet_images[i]\n", 212 | " objects.append(\n", 213 | " ImageSearchTest(\n", 214 | " image_id=i,\n", 215 | " embedding=embedding\n", 216 | " )\n", 217 | " )\n", 218 | "\n", 219 | "with Session() as session:\n", 220 | " session.add_all(objects)\n", 221 | " session.commit()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Search for similar images using the text query" 
229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "from sqlalchemy import asc\n", 238 | "\n", 239 | "query_text = \"dog\"\n", 240 | "query_text_embedding = encode_text_to_embedding(query_text)\n", 241 | "\n", 242 | "with Session() as session:\n", 243 | " results = session.query(\n", 244 | " ImageSearchTest,\n", 245 | " ImageSearchTest.embedding.cosine_distance(query_text_embedding).label(\"distance\"),\n", 246 | " ).order_by(\n", 247 | " asc(\"distance\")\n", 248 | " ).limit(5).all()\n", 249 | "\n", 250 | "\n", 251 | " similar_images = []\n", 252 | " similarities = []\n", 253 | " for obj, d in results:\n", 254 | " similar_images.append(imagenet_images[obj.image_id])\n", 255 | " similarities.append(round(1 - d, 3))\n", 256 | "\n", 257 | "# display the similar images\n", 258 | "ipyplot.plot_images(similar_images, labels=similarities, img_width=100)\n" 259 | ] 260 | } 261 | ], 262 | "metadata": { 263 | "kernelspec": { 264 | "display_name": ".venv", 265 | "language": "python", 266 | "name": "python3" 267 | }, 268 | "language_info": { 269 | "codemirror_mode": { 270 | "name": "ipython", 271 | "version": 3 272 | }, 273 | "file_extension": ".py", 274 | "mimetype": "text/x-python", 275 | "name": "python", 276 | "nbconvert_exporter": "python", 277 | "pygments_lexer": "ipython3", 278 | "version": "3.12.2" 279 | } 280 | }, 281 | "nbformat": 4, 282 | "nbformat_minor": 2 283 | } 284 | -------------------------------------------------------------------------------- /examples/jina-ai-embeddings-demo/README.md: -------------------------------------------------------------------------------- 1 | # Jina AI Embeddings Demo 2 | This is a simple demo to show how to use Jina AI to generate embeddings for text data. Then store the embeddings in TiDB Vector Storage and search for similar embeddings. 
3 | 4 | ## Prerequisites 5 | 6 | - A running TiDB Serverless cluster with vector search enabled 7 | - Python 3.8 or later 8 | - Jina AI API key 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/jina-ai-embeddings-demo 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Get the Jina AI API key from the [Jina AI Embedding API](https://jina.ai/embeddings/) page 35 | 36 | Get the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then replace the following placeholders to get the `TIDB_DATABASE_URL`. 37 | 38 | ```bash 39 | export JINA_API_KEY="****" 40 | export TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 41 | ``` 42 | or create a `.env` file with the above environment variables. 43 | 44 | 45 | ### Run this example 46 | 47 | ```text 48 | $ python jina-ai-embeddings-demo.py 49 | - Inserting Data to TiDB... 50 | - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. 51 | - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. 52 | - List All Documents and Their Distances to the Query: 53 | - distance: 0.3585317326132522 54 | content: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI. 55 | - distance: 0.10858102967720984 56 | content: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. 
import os
import requests
import dotenv

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base
from tidb_vector.sqlalchemy import VectorType

dotenv.load_dotenv()


# Step 1. Define a helper function to generate embeddings using Jina AI's API.
# NOTE: the README documents this variable as JINA_API_KEY while the code read
# JINAAI_API_KEY; accept both spellings so either works.
JINAAI_API_KEY = os.getenv('JINAAI_API_KEY') or os.getenv('JINA_API_KEY')
assert JINAAI_API_KEY is not None, 'Set the JINAAI_API_KEY (or JINA_API_KEY) environment variable.'


def generate_embeddings(text: str):
    """Generate an embedding vector for ``text`` via the Jina AI embeddings API.

    Args:
        text: The input text to embed.

    Returns:
        list[float]: A 768-dimensional embedding produced by the
        ``jina-embeddings-v2-base-en`` model.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status code.
    """
    JINAAI_API_URL = 'https://api.jina.ai/v1/embeddings'
    JINAAI_HEADERS = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {JINAAI_API_KEY}'
    }
    JINAAI_REQUEST_DATA = {
        'input': [text],
        'model': 'jina-embeddings-v2-base-en'  # with dimensions 768.
    }
    response = requests.post(JINAAI_API_URL, headers=JINAAI_HEADERS, json=JINAAI_REQUEST_DATA)
    # Fail fast with a descriptive HTTP error instead of a KeyError on an
    # error payload that lacks the 'data' field.
    response.raise_for_status()
    return response.json()['data'][0]['embedding']


# Step 2. Connect TiDB Serverless
TIDB_DATABASE_URL = os.getenv('TIDB_DATABASE_URL')
assert TIDB_DATABASE_URL is not None
# pool_recycle keeps pooled connections fresh so serverless idle timeouts do
# not surface as "server has gone away" errors.
engine = create_engine(url=TIDB_DATABASE_URL, pool_recycle=300)


# Step 3. Create the vector table.
Base = declarative_base()


class Document(Base):
    """A text document plus its embedding, stored in a TiDB vector column."""

    __tablename__ = "jinaai_tidb_demo_documents"

    id = Column(Integer, primary_key=True)
    content = Column(String(255), nullable=False)
    content_vec = Column(
        # DIMENSIONS is determined by the embedding model,
        # for Jina AI's jina-embeddings-v2-base-en model it's 768.
        VectorType(dim=768),
    )


Base.metadata.create_all(engine)


# Step 4. Generate embeddings for texts via Jina AI API and store them in TiDB.

TEXTS = [
    'Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.',
    'TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.',
]

data = []
for text in TEXTS:
    # Generate the embedding for the text via Jina AI API.
    embedding = generate_embeddings(text)
    data.append({
        'text': text,
        'embedding': embedding
    })

with Session(engine) as session:
    print('- Inserting Data to TiDB...')
    for item in data:
        print(f'  - Inserting: {item["text"]}')
        session.add(Document(
            content=item['text'],
            content_vec=item['embedding']
        ))
    session.commit()


# Step 5. Query the most relevant document based on the query.
query = 'What is TiDB?'
# Generate the embedding for the query via Jina AI API.
query_embedding = generate_embeddings(query)
with Session(engine) as session:
    print('- List All Documents and Their Distances to the Query:')
    for doc, distance in session.query(
        Document,
        Document.content_vec.cosine_distance(query_embedding).label('distance')
    ).all():
        print(f'  - distance: {distance}\n'
              f'    content: {doc.content}')

    print('- The Most Relevant Document and Its Distance to the Query:')
    doc, distance = session.query(
        Document,
        Document.content_vec.cosine_distance(query_embedding).label('distance')
    ).order_by(
        'distance'
    ).limit(1).first()
    print(f'  - distance: {distance}\n'
          f'    content: {doc.content}')

# Expected Output:
#
# - Inserting Data to TiDB...
#   - Inserting: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.
#   - Inserting: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.
# - List All Documents and Their Distances to the Query:
#   - distance: 0.3585317326132522
#     content: Jina AI offers best-in-class embeddings, reranker and prompt optimizer, enabling advanced multimodal AI.
#   - distance: 0.10858102967720984
#     content: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.
# - The Most Relevant Document and Its Distance to the Query:
#   - distance: 0.10858102967720984
#     content: TiDB is an open-source MySQL-compatible database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.
119 | 120 | -------------------------------------------------------------------------------- /examples/jina-ai-embeddings-demo/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | PyMySQL 3 | openai==1.27.0 4 | SQLAlchemy 5 | tidb-vector>=0.0.9 6 | python-dotenv -------------------------------------------------------------------------------- /examples/langchain-agent-demo/.env.example: -------------------------------------------------------------------------------- 1 | # A example database URL to connect to a TiDB cluster from macOS: 2 | # mysql+pymysql://.root:@gateway01..prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true 3 | TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 4 | 5 | # The name of the language model to use for the language model-based retriever. 6 | LM_MODEL_NAME="" 7 | 8 | # The base URL of the Ollama API. 9 | OLLAMA_BASE_URL="http://:11434" 10 | 11 | # The API key to use for the Ollama API. 12 | OLLAMA_API_KEY="ollama" 13 | 14 | # sentence-transformers model 15 | SENTENCE_TRANSFORMERS_MODEL="" -------------------------------------------------------------------------------- /examples/langchain-agent-demo/README.md: -------------------------------------------------------------------------------- 1 | # LangChain Agent Demo 2 | 3 | An Agent demo, Classify and Extract information from text using TiDBVectorClient, LangChain, and LLM. 4 | 5 | e.g. 6 | input: "At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987." 7 | 8 | query related documents: 9 | - "At My Window (album) | At My Window is an album ... " 10 | - "Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. ... " 11 | - "Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. ... 
" 12 | 13 | classify the input text: `{"category": "album", "reason": "The document is about an album named 'At My Window'."}` 14 | 15 | This demo is similar to the official cookbook, but replaces the knowledge part with tidbVectorClient. It tests the 16 | project's compatibility with both the official features and LangChain. 17 | 18 | - https://cookbook.openai.com/examples/how_to_build_a_tool-using_agent_with_langchain 19 | - https://learn.deeplearning.ai/courses/functions-tools-agents-langchain/ 20 | 21 | 22 | 23 | ## Prerequisites 24 | 25 | - TiDB Serverless cluster 26 | - Python 3.10 or later 27 | - Ollama or OpenAI 28 | - langchain==0.2.10 29 | - langchain-community==0.2.9 30 | 31 | ## Run the example 32 | 33 | ### Clone this repo 34 | 35 | ```bash 36 | git clone https://github.com/pingcap/tidb-vector-python.git 37 | ``` 38 | 39 | ### Create a virtual environment 40 | 41 | ```bash 42 | cd tidb-vector-python/examples/langchain-agent-demo 43 | python3 -m venv .venv 44 | source .venv/bin/activate 45 | ``` 46 | 47 | ### Install dependencies 48 | 49 | ```bash 50 | pip install -r requirements.txt 51 | ``` 52 | 53 | ### Set the environment variables 54 | 55 | Get the TiDB connection string via `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as 56 | described in the [Prerequisites](../README.md#prerequisites) section. 57 | 58 | The TiDB connection string will look like: 59 | 60 | ``` 61 | mysql+pymysql://{TIDB_USER}:{TIDB_PASSWORD}@{TIDB_HOST}:{TIDB_PORT}/{TIDB_DB_NAME}?ssl_verify_cert=True&ssl_verify_identity=True 62 | ``` 63 | 64 | ### Run the example 65 | ```text 66 | python ./tidb-vector-python/examples/langchain-agent-demo/example.py 67 | Connected to TiDB. 68 | describe table: 69 | {'success': True, 'result': 6, 'error': None} 70 | Initializing the retriever... 71 | Retriever initialized successfully. 72 | Loading sample data... 73 | sample_data.txt found. 74 | Sample data loaded successfully. 75 | Embedding sample data... 
76 | 0 At My Wind [-0.14979149401187897, 0.07634416222572327, 0.07299982756376266, 0.153825044631958, 0.04083935171365738] 77 | 1 Little Win [0.32180845737457275, 0.5461692214012146, -0.014786622487008572, 0.03591456636786461, -0.22666659951210022] 78 | 2 Storm Wind [-0.022210828959941864, 0.16006261110305786, 0.14314979314804077, -0.08256750553846359, 0.14658856391906738] 79 | Sample data embedded successfully. 80 | Sample data number: 3 81 | Inserting documents into TiDB... 82 | Documents inserted successfully. 83 | # ---- Init Finish ---- 84 | > Entering new RunnableSequence chain... 85 | > Entering new RunnableParallel chain... 86 | > Entering new RunnableSequence chain... 87 | > Entering new RunnablePassthrough chain... 88 | > Finished chain. 89 | {'At My Window (album) | At My Window is an album released by Folk/country singer-songwriter Townes Van Zandt in 1987. This was Van Zandt\'s first studio album in the nine years that followed 1978\'s "Flyin\' Shoes", and his only studio album recorded in the 1980s. Although the songwriter had become less prolific, this release showed that the quality of his material remained high.': 0.6090894176961388, 'Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee.': 0.8308758434772159, 'Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released.': 0.9628706551444856} 90 | > Entering new RunnableLambda chain... 91 | > Finished chain. 92 | > Finished chain. 93 | > Finished chain. 94 | > Entering new PromptTemplate chain... 95 | > Finished chain. 96 | > Entering new OpenAIToolsAgentOutputParser chain... 97 | > Finished chain. 98 | > Finished chain. 
99 | [ToolAgentAction(tool='Classification', tool_input={'category': 'album', 'reason': "The document is about an album named 'At My Window'."}, log='\nInvoking: `Classification` with `{\'category\': \'album\', \'reason\': "The document is about an album named \'At My Window\'."}`\n\n\n', message_log=[AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_wqo6r2px', 'function': {'arguments': '{"category":"album","reason":"The document is about an album named \'At My Window\'."}', 'name': 'Classification'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 33, 'prompt_tokens': 397, 'total_tokens': 430}, 'model_name': 'mistral:latest', 'system_fingerprint': 'fp_ollama', 'finish_reason': 'stop', 'logprobs': None}, id='run-cb5b41e6-5978-4164-8ac8-16a9116e47bd-0', tool_calls=[{'name': 'Classification', 'args': {'category': 'album', 'reason': "The document is about an album named 'At My Window'."}, 'id': 'call_wqo6r2px', 'type': 'tool_call'}], usage_metadata={'input_tokens': 397, 'output_tokens': 33, 'total_tokens': 430})], tool_call_id='call_wqo6r2px')] 100 | ``` 101 | -------------------------------------------------------------------------------- /examples/langchain-agent-demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/langchain-agent-demo/__init__.py -------------------------------------------------------------------------------- /examples/langchain-agent-demo/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from langchain_core.pydantic_v1 import BaseModel, Field 4 | from utils import format_docs 5 | from knowledge_base import retriever 6 | from langchain_core.prompts import PromptTemplate 7 | from langchain_core.runnables import RunnablePassthrough 8 | from 
from langchain_core.messages import HumanMessage
from langchain_core.utils.function_calling import convert_to_openai_tool
from langchain_core.callbacks import FileCallbackHandler, StdOutCallbackHandler
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from dotenv import find_dotenv
from loguru import logger

# Mirror all LangChain callback traffic both to a log file and to stdout.
logfile = "output.log"
logger.add(logfile, colorize=True, enqueue=True)
handler_file = FileCallbackHandler(logfile)
handler_strout = StdOutCallbackHandler()

_ = load_dotenv(find_dotenv())


class ClassEnum(str, Enum):
    """The closed set of categories a document may be classified into."""

    album = "album"
    director = "director"
    actor = "actor"
    book = "book"
    songwriter = "songwriter"
    musician = "musician"
    others = "others"


class Classification(BaseModel):
    """Classify the document into a category."""

    # ! Only Hinting category is not work for 'convert method', need to specify the values of the category in desc,
    category: ClassEnum = Field(
        description=f"The category of the document, should be one of the following values: {[e.value for e in ClassEnum]}"
    )
    reason: str = Field(description="The reason for the classification.")


# The chat model is served by Ollama through its OpenAI-compatible endpoint.
# The chosen model must support instruction function-calling.
model = ChatOpenAI(
    base_url=os.environ.get('OLLAMA_BASE_URL'),
    api_key=os.environ.get('OLLAMA_API_KEY'),
    model=os.environ.get('LM_MODEL_NAME'),
    temperature=0,
)

# Expose the Classification schema to the model as a callable tool and force
# the model to invoke it on every turn.
tools = [convert_to_openai_tool(Classification)]
model_with_tools = model.bind_tools(tools=tools, tool_choice='required')

parser = OpenAIToolsAgentOutputParser()
prompt = PromptTemplate(
    template="""
    You are an intelligent assistant, you will receive some documents about input, base on these info,
    tasked with classifying items based on their descriptions, use function calling 'Classification'.

    related documents: {documents}
    input: {input}
    """,
    input_variables=["documents", "input"],
)

# Retrieval fills the prompt, the tool-bound model emits a function call, and
# the parser turns that call into agent actions.
context_inputs = {"documents": retriever | format_docs, "input": RunnablePassthrough()}
chain = context_inputs | prompt | model_with_tools | parser

resp = chain.invoke(HumanMessage(content="At My Window"), {"callbacks": [handler_file, handler_strout]})
print(resp)


if __name__ == '__main__':
    pass
class TiRetriever(BaseRetriever):
    """A retriever that returns the top-k documents most relevant to the user query.

    This retriever only implements the sync method ``_get_relevant_documents``.

    If the retriever were to involve file access or network access, it could
    benefit from a native async implementation of ``_aget_relevant_documents``.

    As usual, with Runnables, there's a default async implementation that's
    provided that delegates to the sync implementation running on another
    thread.
    """

    # Vector database client, e.g. a TiDBVectorClient.
    rm: TiDBVectorClient
    # Maps a query string to its embedding vector.
    embedding_function: Callable[[str], Vector]
    # The number of top documents to return.
    k: int

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Sync implementation: embed the query, then return the k nearest documents."""
        query_embeddings = self.embedding_function(str(query))
        tidb_vector_res = self.rm.query(query_embeddings, k=self.k)
        # Keyed by document text, so duplicate passages collapse to one entry.
        passages_scores = {}
        for res in tidb_vector_res:
            passages_scores[res.document] = res.distance
        # Smaller cosine distance means more similar, so sort ascending to put
        # the most relevant documents first. (The previous reverse=True sort
        # returned the *least* similar documents first.)
        sorted_passages = sorted(passages_scores.items(), key=lambda x: x[1])
        return [Document(text) for (text, _score) in sorted_passages]
print("Embedding sample data...")
documents = []
# Only the first 3 passages are embedded to keep the demo fast.
for idx, passage in enumerate(sample_data.split('\n')[:3]):
    # Skip blank lines *before* paying for an embedding call; the previous
    # version embedded (and printed) empty passages and only skipped them
    # afterwards.
    if len(passage) == 0:
        continue
    embedding = embedding_function([passage])[0]
    print(idx, passage[:10], embedding[:5])
    documents.append(
        {
            "id": str(idx),
            "text": passage,
            "embedding": embedding,
            "metadata": {"category": "album"},
        }
    )
print("Sample data embedded successfully.")
print("Sample data number:", len(documents))

print("Inserting documents into TiDB...")
tidb_vector_client.insert(
    ids=[doc["id"] for doc in documents],
    texts=[doc["text"] for doc in documents],
    embeddings=[doc["embedding"] for doc in documents],
    metadatas=[doc["metadata"] for doc in documents],
)
print("Documents inserted successfully.")

print("# ---- Init Finish ----")
Although the songwriter had become less prolific, this release showed that the quality of his material remained high. 2 | Little Window | Little Window is the debut album of American singer-songwriter Baby Dee. The album was released in 2002 on the Durtro label. It was produced, composed, and performed entirely by Dee. 3 | Storm Windows | Storm Windows is the seventh album by American folk singer and songwriter John Prine, released in 1980. It was his last release on a major label – he would next join Al Bunetta and Dan Einstein to form Oh Boy Records on which all his subsequent recordings were released. 4 | Yes I Am (Melissa Etheridge album) | Yes I Am is the fourth studio album by American singer-songwriter Melissa Etheridge, released by Island Records on September 21, 1993 (see 1993 in music). The title is generally thought to refer to Etheridge's recent coming out as a lesbian, confirming long-standing rumors about her personal life. This is the album that gave Etheridge national recognition. The rock ballad "Come to My Window" was the first single released from the album, which peaked at No. 25 on the "Billboard" Hot 100, and its video featured actress Juliette Lewis having a nervous breakdown. This single brought the album into the public consciousness and was quickly followed by "I'm the Only One", which became a major hit and reached No. 8 on the Hot 100, and "If I Wanted To", which hit No. 16. 5 | The Great Victorian Collection | The Great Victorian Collection, published in 1975, is a novel by Northern Irish-Canadian writer Brian Moore. Set in Carmel, California, it tells the story of a man who dreams that the empty parking lot he can see from his hotel window has been transformed by the arrival of a collection of priceless Victoriana on display in a vast open-air market. When he awakes he finds that he can no longer distinguish the dream from reality. 
6 | Rosario Dawson | Rosario Isabel Dawson (born May 9, 1979) is an American actress, producer, singer, comic book writer, and political activist. She made her film debut in the 1995 teen drama "Kids". Her subsequent film roles include "He Got Game", "Men in Black II", "25th Hour", "Rent", "Sin City", "Death Proof", "Seven Pounds", "", and "Top Five". Dawson has also provided voice-over work for Disney and DC. 7 | Robert B. Sherman | Robert Bernard Sherman (December 19, 1925 – March 6, 2012) was an American songwriter who specialized in musical films with his brother Richard Morton Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "The Slipper and the Rose", and "Charlotte's Web". Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 8 | Richard M. Sherman | Richard Morton Sherman (born June 12, 1928) is an American songwriter who specialized in musical films with his brother Robert Bernard Sherman. According to the official Walt Disney Company website and independent fact checkers, "the Sherman Brothers were responsible for more motion picture musical song scores than any other songwriting team in film history." Some of the Sherman Brothers' best known songs were incorporated into live action and animation musical films including: "Mary Poppins", "The Jungle Book", "The Many Adventures of Winnie the Pooh", "Chitty Chitty Bang Bang", "Snoopy Come Home", "Bedknobs and Broomsticks", "The Slipper and the Rose", and "Charlotte's Web". 
Their most well known work, however, remains the theme park song "It's a Small World (After All)". According to Time.com, this song is the most performed song of all time. 9 | Everything Has Changed | "Everything Has Changed" is a song written and performed by American singer-songwriter Taylor Swift and English singer-songwriter Ed Sheeran, taken from Swift's fourth studio album, "Red" (2012). Produced by Butch Walker, the track was released as the sixth single from the album on July 16, 2013. "Everything Has Changed" is a guitar ballad combining folk and pop genres about "wanting to get to know a new lover better". 10 | Everything Changes (Julian Lennon album) | Everything Changes is the sixth studio album by English singer-songwriter Julian Lennon. It was released on 2 October 2011. 11 | Janick Gers | Janick Robert Gers ( ; born 27 January 1957 in Hartlepool, England) is an English musician, best known for being one of the three current guitarists in Iron Maiden, along with Dave Murray and Adrian Smith, as well as his earlier work with Gillan and White Spirit. 12 | Dave Murray (musician) | David Michael "Dave" Murray (born 23 December 1956) is an English guitarist and songwriter best known as one of the earliest members of the British heavy metal band Iron Maiden. Along with the group's bassist and primary songwriter Steve Harris, Murray has appeared on all of the band's releases. 13 | Roy Z | Roy Z (born February, 1968) is an American guitarist, songwriter and producer, best known for his work with Bruce Dickinson (from Iron Maiden), Halford, and Judas Priest. He also is the founder of Tribe of Gypsies, a Latin influenced hard rock band. 14 | Heather Baker | Heather Baker (born October 9, 1984) is a female American songwriter, guitarist, producer and founder of the Electronica band Antiwave. 
def format_docs(docs):
    """Join the page contents of *docs*, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)
4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | - OpenAI [API key](https://platform.openai.com/docs/quickstart) 10 | 11 | ## Run the example 12 | 13 | ### Clone this repo 14 | 15 | ```bash 16 | git clone https://github.com/pingcap/tidb-vector-python.git 17 | ``` 18 | 19 | ### Create a virtual environment 20 | 21 | ```bash 22 | cd tidb-vector-python/examples/llamaindex-tidb-vector-with-ui 23 | python3 -m venv .venv 24 | source .venv/bin/activate 25 | ``` 26 | 27 | ### Install dependencies 28 | 29 | ```bash 30 | pip install -r requirements.txt 31 | ``` 32 | 33 | ### Set the environment variables 34 | 35 | Get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart) 36 | 37 | Get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. 38 | 39 | ```bash 40 | export OPENAI_API_KEY="sk-*******" 41 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 42 | export TIDB_USERNAME="****.root" 43 | export TIDB_PASSWORD="****" 44 | ``` 45 | 46 | ### Prepare data and run the server 47 | 48 | ```bash 49 | # prepare the data 50 | python app.py prepare 51 | 52 | # runserver 53 | python app.py runserver 54 | ``` 55 | 56 | Now you can visit [http://127.0.0.1:3000/](http://127.0.0.1:3000/) to interact with the RAG application. 
57 | -------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector-with-ui/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import logging 5 | import click 6 | import uvicorn 7 | import fastapi 8 | import asyncio 9 | from enum import Enum 10 | from sqlalchemy import URL 11 | from fastapi.encoders import jsonable_encoder 12 | from fastapi.responses import StreamingResponse, HTMLResponse, JSONResponse 13 | from fastapi.templating import Jinja2Templates 14 | from llama_index.core import VectorStoreIndex, StorageContext 15 | from llama_index.core.base.response.schema import StreamingResponse as llamaStreamingResponse 16 | from llama_index.vector_stores.tidbvector import TiDBVectorStore 17 | from llama_index.readers.web import SimpleWebPageReader 18 | 19 | 20 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 21 | logger = logging.getLogger() 22 | 23 | 24 | class EventType(Enum): 25 | META = 1 26 | ANSWER = 2 27 | 28 | 29 | logger.info("Initializing TiDB Vector Store....") 30 | tidb_connection_url = URL( 31 | "mysql+pymysql", 32 | username=os.environ['TIDB_USERNAME'], 33 | password=os.environ['TIDB_PASSWORD'], 34 | host=os.environ['TIDB_HOST'], 35 | port=4000, 36 | database="test", 37 | query={"ssl_verify_cert": True, "ssl_verify_identity": True}, 38 | ) 39 | tidbvec = TiDBVectorStore( 40 | connection_string=tidb_connection_url, 41 | table_name="llama_index_rag_test", 42 | distance_strategy="cosine", 43 | vector_dimension=1536, # Length of the vectors returned by the model 44 | drop_existing_table=False, 45 | ) 46 | tidb_vec_index = VectorStoreIndex.from_vector_store(tidbvec) 47 | storage_context = StorageContext.from_defaults(vector_store=tidbvec) 48 | query_engine = tidb_vec_index.as_query_engine(streaming=True) 49 | logger.info("TiDB Vector Store initialized successfully") 50 | 51 | 52 | def do_prepare_data(): 53 | 
logger.info("Preparing the data for the application") 54 | documents = SimpleWebPageReader(html_to_text=True).load_data( 55 | ["http://paulgraham.com/worked.html"] 56 | ) 57 | tidb_vec_index.from_documents(documents, storage_context=storage_context, show_progress=True) 58 | logger.info("Data preparation complete") 59 | 60 | 61 | # https://stackoverflow.com/questions/76288582/is-there-a-way-to-stream-output-in-fastapi-from-the-response-i-get-from-llama-in 62 | async def astreamer(response: llamaStreamingResponse): 63 | try: 64 | meta = json.dumps(jsonable_encoder(list(vars(node) for node in response.source_nodes))) 65 | yield f'{EventType.META.value}: {meta}\n\n' 66 | for i in response.response_gen: 67 | yield f'{EventType.ANSWER.value}: {i}\n\n' 68 | await asyncio.sleep(.1) 69 | except asyncio.CancelledError as e: 70 | print('cancelled') 71 | 72 | 73 | app = fastapi.FastAPI() 74 | templates = Jinja2Templates(directory="templates") 75 | 76 | 77 | @app.get('/', response_class=HTMLResponse) 78 | def index(request: fastapi.Request): 79 | return templates.TemplateResponse("index.html", {"request": request}) 80 | 81 | 82 | @app.get('/ask') 83 | async def ask(q: str): 84 | response = query_engine.query(q) 85 | return StreamingResponse(astreamer(response), media_type='text/event-stream') 86 | 87 | 88 | @click.group(context_settings={'max_content_width': 150}) 89 | def cli(): 90 | pass 91 | 92 | 93 | @cli.command() 94 | @click.option('--host', default='127.0.0.1', help="Host, default=127.0.0.1") 95 | @click.option('--port', default=3000, help="Port, default=3000") 96 | @click.option('--reload', is_flag=True, help="Enable auto-reload") 97 | def runserver(host, port, reload): 98 | uvicorn.run( 99 | "__main__:app", host=host, port=port, reload=reload, 100 | log_level="debug", workers=1, 101 | ) 102 | 103 | 104 | @cli.command() 105 | def prepare(): 106 | do_prepare_data() 107 | 108 | 109 | if __name__ == '__main__': 110 | cli() 111 | 
-------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector-with-ui/requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | fastapi 3 | uvicorn 4 | Jinja2 5 | llama-index 6 | llama-index-readers-web 7 | llama-index-vector-stores-tidbvector -------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector-with-ui/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | LlamaIndex & TiDB RAG Demo 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 |
17 |
18 | 22 | 27 |
28 |
29 |

Answer Body

30 |
31 |
Empty
32 |
33 |
34 |
35 |

Chunks Retrieved

36 |
37 |
    38 |
  • 39 |
40 |
41 |
42 |
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/examples/llamaindex-tidb-vector/README.md:
--------------------------------------------------------------------------------
1 | # LlamaIndex RAG Example
2 |
3 | This example demonstrates how to use LlamaIndex and TiDB Serverless to build a simple RAG (Retrieval-Augmented Generation) application. It crawls an example webpage and indexes the content into TiDB Serverless with LlamaIndex, then uses LlamaIndex to search the content and generate the answer with OpenAI.
4 |
5 | ## Prerequisites
6 |
7 | - A running TiDB Serverless cluster with vector search enabled
8 | - Python 3.8 or later
9 | - OpenAI [API key](https://platform.openai.com/docs/quickstart)
10 |
11 | ## Run the example
12 |
13 | ### Clone this repo
14 |
15 | ```bash
16 | git clone https://github.com/pingcap/tidb-vector-python.git
17 | ```
18 |
19 | ### Create a virtual environment
20 |
21 | ```bash
22 | cd tidb-vector-python/examples/llamaindex-tidb-vector
23 | python3 -m venv .venv
24 | source .venv/bin/activate
25 | ```
26 |
27 | ### Install dependencies
28 |
29 | ```bash
30 | pip install -r requirements.txt
31 | ```
32 |
33 | ### Set the environment variables
34 |
35 | Get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart)
36 |
37 | Get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.
38 | 39 | ```bash 40 | export OPENAI_API_KEY="sk-*******" 41 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 42 | export TIDB_USERNAME="****.root" 43 | export TIDB_PASSWORD="****" 44 | ``` 45 | 46 | ### Run this example 47 | 48 | ```text 49 | $ python chat_with_url.py --help 50 | Usage: chat_with_url.py [OPTIONS] 51 | 52 | Options: 53 | --url TEXT URL you want to talk to, 54 | default=https://docs.pingcap.com/tidb/stable/overview 55 | --help Show this message and exit. 56 | $ 57 | $ python chat_with_url.py 58 | Enter your question: tidb vs mysql 59 | TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads. It is MySQL compatible and features horizontal scalability, strong consistency, and high availability. TiDB is designed to provide users with a one-stop database solution that covers OLTP, OLAP, and HTAP services. It offers easy horizontal scaling, financial-grade high availability, real-time HTAP capabilities, cloud-native features, and compatibility with the MySQL protocol and ecosystem. 
60 | Enter your question: 61 | ``` -------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector/chat_with_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | 4 | import click 5 | from sqlalchemy import URL 6 | from llama_index.core import VectorStoreIndex, StorageContext 7 | from llama_index.vector_stores.tidbvector import TiDBVectorStore # type: ignore 8 | from llama_index.readers.web import SimpleWebPageReader 9 | 10 | 11 | tidb_connection_url = URL( 12 | "mysql+pymysql", 13 | username=os.environ['TIDB_USERNAME'], 14 | password=os.environ['TIDB_PASSWORD'], 15 | host=os.environ['TIDB_HOST'], 16 | port=4000, 17 | database="test", 18 | query={"ssl_verify_cert": True, "ssl_verify_identity": True}, 19 | ) 20 | tidbvec = TiDBVectorStore( 21 | connection_string=tidb_connection_url, 22 | table_name="llama_index_rag_test", 23 | distance_strategy="cosine", 24 | vector_dimension=1536, # The dimension is decided by the model 25 | drop_existing_table=False, 26 | ) 27 | tidb_vec_index = VectorStoreIndex.from_vector_store(tidbvec) 28 | storage_context = StorageContext.from_defaults(vector_store=tidbvec) 29 | query_engine = tidb_vec_index.as_query_engine(streaming=True) 30 | 31 | 32 | def do_prepare_data(url): 33 | documents = SimpleWebPageReader(html_to_text=True).load_data([url,]) 34 | tidb_vec_index.from_documents(documents, storage_context=storage_context, show_progress=True) 35 | 36 | 37 | _default_url = 'https://docs.pingcap.com/tidb/stable/overview' 38 | 39 | @click.command() 40 | @click.option('--url',default=_default_url, 41 | help=f'URL you want to talk to, default={_default_url}') 42 | def chat_with_url(url): 43 | do_prepare_data(url) 44 | while True: 45 | question = click.prompt("Enter your question") 46 | response = query_engine.query(question) 47 | click.echo(response) 48 | 49 | if __name__ == '__main__': 50 | chat_with_url() 51 | 
-------------------------------------------------------------------------------- /examples/llamaindex-tidb-vector/requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.1.7 2 | SQLAlchemy==2.0.29 3 | llama-index==0.10.29 4 | llama-index-readers-web==0.1.8 5 | llama-index-vector-stores-tidbvector==0.1.2 6 | -------------------------------------------------------------------------------- /examples/openai_embedding/README.md: -------------------------------------------------------------------------------- 1 | # OpenAI Embedding Example 2 | 3 | This example demonstrates how to utilize OpenAI embedding for semantic search. According to OpenAI's [documentation](https://platform.openai.com/docs/guides/embeddings/which-distance-function-should-i-use), we will use cosine similarity to calculate vector distance. 4 | 5 | You can run this example in two ways: 6 | 7 | - [Run in Jupyter Notebook](#jupyter-notebook) 8 | - [Run in Local](#run-in-local) 9 | 10 | ## Jupyter Notebook 11 | 12 | Notebook: [example.ipynb](./example.ipynb) 13 | 14 | Try it in the [Google colab](https://colab.research.google.com/github/pingcap/tidb-vector-python/blob/main/examples/openai_embedding/example.ipynb). 15 | 16 | ## Run in Local 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | python3 -m venv .venv 22 | source .venv/bin/activate 23 | ``` 24 | 25 | ### Install the requirements 26 | 27 | ```bash 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | ### Set the environment variables 32 | 33 | Get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart) 34 | 35 | Get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. 
36 | 37 | ```bash 38 | export OPENAI_API_KEY="sk-*******" 39 | export TIDB_HOST="gateway01.*******.shared.aws.tidbcloud.com" 40 | export TIDB_USERNAME="****.root" 41 | export TIDB_PASSWORD="****" 42 | ``` 43 | 44 | ### Run the example 45 | 46 | ```bash 47 | python3 example.py 48 | ``` 49 | -------------------------------------------------------------------------------- /examples/openai_embedding/example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Example of Embedding\n", 7 | "\n", 8 | "It is an embedding example that uses `tidb_vector_python` as its library." 9 | ], 10 | "metadata": { 11 | "id": "ewKGZW06kmIv" 12 | } 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "id": "F1fsS576izUl" 18 | }, 19 | "source": [ 20 | "## Install Dependencies" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "id": "pTpKX_lDizUp" 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "%%capture\n", 32 | "%pip install openai peewee pymysql tidb_vector" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "psEHGWiHizUq" 39 | }, 40 | "source": [ 41 | "## Preapre the environment\n", 42 | "\n", 43 | "> **Note:**\n", 44 | ">\n", 45 | "> - You can get the `OPENAI_API_KEY` from [OpenAI](https://platform.openai.com/docs/quickstart).\n", 46 | "> - You can get the `TIDB_HOST`, `TIDB_USERNAME`, and `TIDB_PASSWORD` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section.\n", 47 | "\n", 48 | "Set the embedding model as `text-embedding-3-small`, and\n", 49 | "the amount of embedding dimensions is `1536`." 
50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "id": "MgKOjwmYizUq" 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "import getpass\n", 61 | "\n", 62 | "OPENAI_API_KEY = getpass.getpass(\"Enter your OpenAI API key: \")\n", 63 | "TIDB_HOST = input(\"Enter your TiDB host: \")\n", 64 | "TIDB_USERNAME = input(\"Enter your TiDB username: \")\n", 65 | "TIDB_PASSWORD = getpass.getpass(\"Enter your TiDB password: \")\n", 66 | "\n", 67 | "embedding_model = \"text-embedding-3-small\"\n", 68 | "embedding_dimensions = 1536" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "3WbH_BITizUr" 75 | }, 76 | "source": [ 77 | "## Initial the Clients of OpenAI and Database" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "id": "UWtcs58-izUr" 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "from openai import OpenAI\n", 89 | "from peewee import Model, MySQLDatabase, TextField, SQL\n", 90 | "from tidb_vector.peewee import VectorField\n", 91 | "\n", 92 | "client = OpenAI(api_key=OPENAI_API_KEY)\n", 93 | "db = MySQLDatabase(\n", 94 | " 'test',\n", 95 | " user=TIDB_USERNAME,\n", 96 | " password=TIDB_PASSWORD,\n", 97 | " host=TIDB_HOST,\n", 98 | " port=4000,\n", 99 | " ssl_verify_cert=True,\n", 100 | " ssl_verify_identity=True\n", 101 | ")\n", 102 | "db.connect()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "id": "uOyjrmWJizUr" 109 | }, 110 | "source": [ 111 | "## Prepare the Context\n", 112 | "\n", 113 | "In this case, contexts are the documents, use the openai embeddings model to get the embeddings of the documents, and store them in the TiDB." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "id": "_e5P_m0MizUs" 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "documents = [\n", 125 | " \"TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.\",\n", 126 | " \"TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.\",\n", 127 | " \"TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. \",\n", 128 | "]\n", 129 | "\n", 130 | "class DocModel(Model):\n", 131 | " text = TextField()\n", 132 | " embedding = VectorField(dimensions=embedding_dimensions)\n", 133 | "\n", 134 | " class Meta:\n", 135 | " database = db\n", 136 | " table_name = \"openai_embedding_test\"\n", 137 | "\n", 138 | " def __str__(self):\n", 139 | " return self.text\n", 140 | "\n", 141 | "db.drop_tables([DocModel])\n", 142 | "db.create_tables([DocModel])\n", 143 | "\n", 144 | "embeddings = [\n", 145 | " r.embedding\n", 146 | " for r in client.embeddings.create(\n", 147 | " input=documents, model=embedding_model\n", 148 | " ).data\n", 149 | "]\n", 150 | "data_source = [\n", 151 | " {\"text\": doc, \"embedding\": emb}\n", 152 | " for doc, emb in zip(documents, embeddings)\n", 153 | "]\n", 154 | "DocModel.insert_many(data_source).execute()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "zMP-P1g8izUs" 161 | }, 162 | "source": [ 163 | "## Initial the Vector of Question\n", 164 | "\n", 165 | "Ask a question, use the openai embeddings model to get the embeddings 
of the question" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "id": "-zrTOxs4izUt" 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "question = \"what is TiKV?\"\n", 177 | "question_embedding = client.embeddings.create(input=question, model=embedding_model).data[0].embedding" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "id": "atc0gXVZizUt" 184 | }, 185 | "source": [ 186 | "## Retrieve by Cosine Distance of Vectors\n", 187 | "Get the relevant documents from the TiDB by comparing the embeddings of the question and the documents" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "id": "DTtJRX64izUt" 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "related_docs = DocModel.select(\n", 199 | " DocModel.text, DocModel.embedding.cosine_distance(question_embedding).alias(\"distance\")\n", 200 | ").order_by(SQL(\"distance\")).limit(3)\n", 201 | "\n", 202 | "print(\"Question:\", question)\n", 203 | "print(\"Related documents:\")\n", 204 | "for doc in related_docs:\n", 205 | " print(doc.distance, doc.text)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "source": [ 211 | "## Cleanup" 212 | ], 213 | "metadata": { 214 | "id": "bYBetPchmNUp" 215 | } 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "id": "Lh27gC7gizUt" 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "db.close()" 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "kernelspec": { 231 | "display_name": ".venv", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.11.6" 246 | }, 247 | 
"colab": { 248 | "provenance": [], 249 | "toc_visible": true 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 0 254 | } 255 | -------------------------------------------------------------------------------- /examples/openai_embedding/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | from openai import OpenAI 3 | from peewee import Model, MySQLDatabase, TextField, SQL 4 | from tidb_vector.peewee import VectorField 5 | 6 | # Init OpenAI client 7 | # In this example, we use the text-embedding-3-small model to generate embeddings 8 | client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY')) 9 | embedding_model = "text-embedding-3-small" 10 | embedding_dimensions = 1536 11 | 12 | # Init TiDB connection 13 | # Note: TiDB Serverless requires secure connection, so we need to set ssl_verify_cert and ssl_verify_identity to True 14 | # Remember to set the environment variables with your own TiDB credentials 15 | db = MySQLDatabase( 16 | 'test', 17 | user=os.environ.get('TIDB_USERNAME'), 18 | password=os.environ.get('TIDB_PASSWORD'), 19 | host=os.environ.get('TIDB_HOST'), 20 | port=4000, 21 | ssl_verify_cert=True, 22 | ssl_verify_identity=True 23 | ) 24 | 25 | documents = [ 26 | "TiDB is an open-source distributed SQL database that supports Hybrid Transactional and Analytical Processing (HTAP) workloads.", 27 | "TiFlash is the key component that makes TiDB essentially an Hybrid Transactional/Analytical Processing (HTAP) database. As a columnar storage extension of TiKV, TiFlash provides both good isolation level and strong consistency guarantee.", 28 | "TiKV is a distributed and transactional key-value database, which provides transactional APIs with ACID compliance. With the implementation of the Raft consensus algorithm and consensus state stored in RocksDB, TiKV guarantees data consistency between multiple replicas and high availability. 
", 29 | ] 30 | 31 | # Define a model with a VectorField to store the embeddings 32 | class DocModel(Model): 33 | text = TextField() 34 | embedding = VectorField(dimensions=embedding_dimensions) 35 | 36 | class Meta: 37 | database = db 38 | table_name = "openai_embedding_test" 39 | 40 | def __str__(self): 41 | return self.text 42 | 43 | db.connect() 44 | db.drop_tables([DocModel]) 45 | db.create_tables([DocModel]) 46 | 47 | # Insert the documents and their embeddings into TiDB 48 | embeddings = [ 49 | r.embedding 50 | for r in client.embeddings.create( 51 | input=documents, model=embedding_model 52 | ).data 53 | ] 54 | data_source = [ 55 | {"text": doc, "embedding": emb} 56 | for doc, emb in zip(documents, embeddings) 57 | ] 58 | DocModel.insert_many(data_source).execute() 59 | 60 | # Query the most similar documents to a question 61 | # 1. Generate the embedding of the question 62 | # 2. Query the most similar documents based on the cosine distance in TiDB 63 | # 3. Print the results 64 | question = "what is TiKV?" 
65 | question_embedding = client.embeddings.create(input=question, model=embedding_model).data[0].embedding 66 | related_docs = DocModel.select( 67 | DocModel.text, DocModel.embedding.cosine_distance(question_embedding).alias("distance") 68 | ).order_by(SQL("distance")).limit(3) 69 | 70 | print("Question:", question) 71 | print("Related documents:") 72 | for doc in related_docs: 73 | print(doc.distance, doc.text) 74 | 75 | db.close() -------------------------------------------------------------------------------- /examples/openai_embedding/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | peewee 3 | pymysql 4 | tidb-vector -------------------------------------------------------------------------------- /examples/orm-django-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | TIDB_HOST='xxxxxxxx.aws.tidbcloud.com' 2 | TIDB_PORT='4000' 3 | TIDB_USERNAME='xxxxxxxxxxx.root' 4 | TIDB_PASSWORD='xxxxxxx' 5 | TIDB_DATABASE='test' 6 | # The CA certificate file path. 7 | # The example path is for macOS. 8 | # For other platforms, please refer https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path. 
9 | TIDB_CA_PATH='/etc/ssl/cert.pem' -------------------------------------------------------------------------------- /examples/orm-django-quickstart/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /examples/orm-django-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Integrate TiDB Vector Search with Django ORM 2 | 3 | This is a simple demo to show how to integrate TiDB Vector Search with the Django ORM to search for similar text in a TiDB Serverless cluster. 4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/orm-django-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Create a `.env` file via the following command. 35 | 36 | ```shell 37 | cp .env.example .env 38 | ``` 39 | 40 | Copy the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then set up the following environment variables in the `.env` file. 41 | 42 | ```bash 43 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 44 | TIDB_PORT=4000 45 | TIDB_USERNAME=******.root 46 | TIDB_PASSWORD=******** 47 | TIDB_DATABASE=test 48 | # For macOS. For other platforms, please refer https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path . 
49 | TIDB_CA_PATH=/etc/ssl/cert.pem
50 | ```
51 |
52 | ### Run this example
53 |
54 | Migrate the table schema:
55 |
56 | ```shell
57 | python manage.py migrate
58 | ```
59 |
60 | Run the server:
61 |
62 | ```shell
63 | python manage.py runserver
64 | ```
65 |
66 | Open your browser and visit `http://localhost:8000/`.
67 |
--------------------------------------------------------------------------------
/examples/orm-django-quickstart/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Django's command-line utility for administrative tasks."""
3 | import os
4 | import sys
5 |
6 |
7 | def main():
8 |     """Run administrative tasks."""
9 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sample_project.settings")
10 |     try:
11 |         from django.core.management import execute_from_command_line
12 |     except ImportError as exc:
13 |         raise ImportError(
14 |             "Couldn't import Django. Are you sure it's installed and "
15 |             "available on your PYTHONPATH environment variable? Did you "
16 |             "forget to activate a virtual environment?"
17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | Django==4.2.4 2 | django-tidb>=5.0.1 3 | mysqlclient==2.2.0 4 | python-dotenv==1.0.0 5 | tidb-vector>=0.0.9 6 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/orm-django-quickstart/sample_project/__init__.py -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for sample_project project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sample_project.settings") 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/forms.py: -------------------------------------------------------------------------------- 1 | from django import forms 2 | from django.db import models, transaction 3 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 4.2.4 on 2024-11-05 05:36 2 | 3 | from django.db import migrations, models 4 | import django_tidb.fields.vector 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | initial = True 10 | 11 | dependencies = [] 12 | 13 | operations = [ 14 | migrations.CreateModel( 15 | name="Document", 16 | fields=[ 17 | ( 18 | "id", 19 | models.BigAutoField( 20 | auto_created=True, 21 | primary_key=True, 22 | serialize=False, 23 | verbose_name="ID", 24 | ), 25 | ), 26 | ("content", models.TextField()), 27 | ("embedding", django_tidb.fields.vector.VectorField(dimensions=3)), 28 | ], 29 | options={ 30 | "indexes": [ 31 | django_tidb.fields.vector.VectorIndex( 32 | django_tidb.fields.vector.L2Distance("embedding"), name="idx_l2" 33 | ), 34 | django_tidb.fields.vector.VectorIndex( 35 | django_tidb.fields.vector.CosineDistance("embedding"), 36 | name="idx_cos", 37 | ), 38 | ], 39 | }, 40 | ), 41 | ] 42 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/migrations/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/orm-django-quickstart/sample_project/migrations/__init__.py -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django_tidb.fields.vector import VectorField, VectorIndex, CosineDistance, L2Distance 3 | 4 | 5 | class Document(models.Model): 6 | content = models.TextField() 7 | embedding = VectorField(dimensions=3) 8 | class Meta: 9 | indexes = [ 10 | VectorIndex(L2Distance("embedding"), name='idx_l2'), 11 | VectorIndex(CosineDistance("embedding"), name='idx_cos'), 12 | ] 13 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for sample_project project. 3 | 4 | Generated by 'django-admin startproject' using Django 4.2.4. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/4.2/ref/settings/ 11 | """ 12 | import os 13 | from pathlib import Path 14 | 15 | import dotenv 16 | 17 | dotenv.load_dotenv() 18 | 19 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 20 | BASE_DIR = Path(__file__).resolve().parent.parent 21 | 22 | 23 | # Quick-start development settings - unsuitable for production 24 | # See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/ 25 | 26 | # SECURITY WARNING: keep the secret key used in production secret! 
27 | SECRET_KEY = "django-insecure-axer15+p=pea&u%9e4t^3314jagd+1e$5!i9%oh1^yu@1hf6w4" 28 | 29 | # SECURITY WARNING: don't run with debug turned on in production! 30 | DEBUG = True 31 | 32 | ALLOWED_HOSTS = [] 33 | 34 | 35 | # Application definition 36 | 37 | INSTALLED_APPS = [ 38 | "django.contrib.admin", 39 | "django.contrib.auth", 40 | "django.contrib.contenttypes", 41 | "django.contrib.sessions", 42 | "django.contrib.messages", 43 | "django.contrib.staticfiles", 44 | "sample_project", 45 | ] 46 | 47 | MIDDLEWARE = [ 48 | "django.middleware.security.SecurityMiddleware", 49 | "django.contrib.sessions.middleware.SessionMiddleware", 50 | "django.middleware.common.CommonMiddleware", 51 | "django.middleware.csrf.CsrfViewMiddleware", 52 | "django.contrib.auth.middleware.AuthenticationMiddleware", 53 | "django.contrib.messages.middleware.MessageMiddleware", 54 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 55 | ] 56 | 57 | ROOT_URLCONF = "sample_project.urls" 58 | 59 | TEMPLATES = [ 60 | { 61 | "BACKEND": "django.template.backends.django.DjangoTemplates", 62 | "DIRS": [], 63 | "APP_DIRS": True, 64 | "OPTIONS": { 65 | "context_processors": [ 66 | "django.template.context_processors.debug", 67 | "django.template.context_processors.request", 68 | "django.contrib.auth.context_processors.auth", 69 | "django.contrib.messages.context_processors.messages", 70 | ], 71 | }, 72 | }, 73 | ] 74 | 75 | WSGI_APPLICATION = "sample_project.wsgi.application" 76 | 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/4.2/ref/settings/#databases 80 | 81 | DATABASES = { 82 | "default": { 83 | # https://github.com/pingcap/django-tidb 84 | "ENGINE": "django_tidb", 85 | "HOST": os.environ.get("TIDB_HOST", "127.0.0.1"), 86 | "PORT": int(os.environ.get("TIDB_PORT", 4000)), 87 | "USER": os.environ.get("TIDB_USERNAME", "root"), 88 | "PASSWORD": os.environ.get("TIDB_PASSWORD", ""), 89 | "NAME": os.environ.get("TIDB_DATABASE", "test"), 90 | "OPTIONS": { 91 | "charset": 
"utf8mb4", 92 | }, 93 | } 94 | } 95 | 96 | TIDB_CA_PATH = os.environ.get("TIDB_CA_PATH", "") 97 | if TIDB_CA_PATH: 98 | DATABASES["default"]["OPTIONS"]["ssl_mode"] = "VERIFY_IDENTITY" 99 | DATABASES["default"]["OPTIONS"]["ssl"] = { 100 | "ca": TIDB_CA_PATH, 101 | } 102 | 103 | # Password validation 104 | # https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators 105 | 106 | AUTH_PASSWORD_VALIDATORS = [ 107 | { 108 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", 109 | }, 110 | { 111 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", 112 | }, 113 | { 114 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", 115 | }, 116 | { 117 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", 118 | }, 119 | ] 120 | 121 | 122 | # Internationalization 123 | # https://docs.djangoproject.com/en/4.2/topics/i18n/ 124 | 125 | LANGUAGE_CODE = "en-us" 126 | 127 | TIME_ZONE = "UTC" 128 | 129 | USE_I18N = True 130 | 131 | USE_TZ = True 132 | 133 | 134 | # Static files (CSS, JavaScript, Images) 135 | # https://docs.djangoproject.com/en/4.2/howto/static-files/ 136 | 137 | STATIC_URL = "static/" 138 | 139 | # Default primary key field type 140 | # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field 141 | 142 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" 143 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/urls.py: -------------------------------------------------------------------------------- 1 | """ 2 | URL configuration for sample_project project. 3 | 4 | The `urlpatterns` list routes URLs to views. For more information please see: 5 | https://docs.djangoproject.com/en/4.2/topics/http/urls/ 6 | Examples: 7 | Function views 8 | 1. Add an import: from my_app import views 9 | 2. 
Add a URL to urlpatterns: path('', views.home, name='home') 10 | Class-based views 11 | 1. Add an import: from other_app.views import Home 12 | 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') 13 | Including another URLconf 14 | 1. Import the include() function: from django.urls import include, path 15 | 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) 16 | """ 17 | from django.urls import path 18 | 19 | from . import views 20 | 21 | urlpatterns = [ 22 | path("", views.list_routes, name="index"), 23 | path("insert_documents", views.insert_documents, name="insert_documents"), 24 | path("get_nearest_neighbors_documents", views.get_nearest_neighbors_documents, name="get_nearest_neighbors_documents"), 25 | path("get_documents_within_distance", views.get_documents_within_distance, name="get_documents_within_distance"), 26 | ] 27 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/views.py: -------------------------------------------------------------------------------- 1 | from django.http import HttpResponse, JsonResponse 2 | from django_tidb.fields.vector import CosineDistance 3 | 4 | from .models import Document 5 | 6 | 7 | # Insert 3 documents. 8 | def insert_documents(request): 9 | Document.objects.create(content="dog", embedding=[1, 2, 1]) 10 | Document.objects.create(content="fish", embedding=[1, 2, 4]) 11 | Document.objects.create(content="tree", embedding=[1, 0, 0]) 12 | 13 | return HttpResponse("Insert documents successfully.") 14 | 15 | 16 | # Get 3-nearest neighbor documents. 
17 | def get_nearest_neighbors_documents(request): 18 | results = Document.objects.annotate( 19 | distance=CosineDistance('embedding', [1, 2, 3]) 20 | ).order_by('distance')[:3] 21 | response = [] 22 | for doc in results: 23 | response.append({ 24 | 'distance': doc.distance, 25 | 'document': doc.content 26 | }) 27 | 28 | return JsonResponse(response, safe=False) 29 | 30 | 31 | # Get documents within a certain distance. 32 | def get_documents_within_distance(request): 33 | results = Document.objects.annotate( 34 | distance=CosineDistance('embedding', [1, 2, 3]) 35 | ).filter(distance__lt=0.2).order_by('distance')[:3] 36 | response = [] 37 | for doc in results: 38 | response.append({ 39 | 'distance': doc.distance, 40 | 'document': doc.content 41 | }) 42 | 43 | return JsonResponse(response, safe=False) 44 | 45 | 46 | def list_routes(request): 47 | return JsonResponse({ 48 | 'routes': [ 49 | '/insert_documents', 50 | '/get_nearest_neighbors_documents', 51 | '/get_documents_within_distance' 52 | ] 53 | }) 54 | -------------------------------------------------------------------------------- /examples/orm-django-quickstart/sample_project/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for sample_project project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "sample_project.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 2 | TIDB_PORT=4000 3 | TIDB_USERNAME=******.root 4 | TIDB_PASSWORD=******** 5 | TIDB_DATABASE=test 6 | # TiDB Serverless Cluster requires SSL connection for public network access. 7 | # For local TiDB cluster, please set TIDB_SSL=false to disable SSL. 8 | TIDB_SSL=true -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Integrate TiDB Vector Search with Peewee ORM 2 | 3 | This is a simple demo to show how to integrate TiDB Vector Search with the Peewee ORM to search for similar text in a TiDB Serverless cluster. 4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/orm-peewee-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Create a `.env` file via the following command. 
35 | 36 | ```shell 37 | cp .env.example .env 38 | ``` 39 | 40 | Copy the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then set up the following environment variables in the `.env` file. 41 | 42 | ```bash 43 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 44 | TIDB_PORT=4000 45 | TIDB_USERNAME=******.root 46 | TIDB_PASSWORD=******** 47 | TIDB_DATABASE=test 48 | # For macOS. For other platforms, please refer https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters#root-certificate-default-path . 49 | TIDB_CA_PATH=/etc/ssl/cert.pem 50 | ``` 51 | 52 | ### Run this example 53 | 54 | ```text 55 | $ python peewee-quickstart.py 56 | Get 3-nearest neighbor documents: 57 | - distance: 0.00853986601633272 58 | document: fish 59 | - distance: 0.12712843905603044 60 | document: dog 61 | - distance: 0.7327387580875756 62 | document: tree 63 | Get documents within a certain distance: 64 | - distance: 0.00853986601633272 65 | document: fish 66 | - distance: 0.12712843905603044 67 | document: dog 68 | ``` -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/peewee-quickstart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dotenv 3 | 4 | from tidb_vector.peewee import VectorField, VectorAdaptor 5 | from tidb_vector.constants import DistanceMetric 6 | from peewee import Model, MySQLDatabase, TextField 7 | 8 | dotenv.load_dotenv() 9 | 10 | # Step 1: Connect to TiDB using Peewee. 11 | 12 | # Using `pymysql` as the driver. 13 | ssl_kwargs = { 14 | 'ssl_verify_cert': True, 15 | 'ssl_verify_identity': True, 16 | } 17 | 18 | # Using `mysqlclient` as the driver. 
19 | # ssl_kwargs = { 20 | # 'ssl_mode': 'VERIFY_IDENTITY', 21 | # 'ssl': { 22 | # # Root certificate default path 23 | # # https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters/#root-certificate-default-path 24 | # 'ca': os.environ.get('TIDB_CA_PATH', '/path/to/ca.pem'), 25 | # }, 26 | # } 27 | 28 | db = MySQLDatabase( 29 | database=os.environ.get('TIDB_DATABASE', 'test'), 30 | user=os.environ.get('TIDB_USERNAME', 'root'), 31 | password=os.environ.get('TIDB_PASSWORD', ''), 32 | host=os.environ.get('TIDB_HOST', 'localhost'), 33 | port=int(os.environ.get('TIDB_PORT', '4000')), 34 | **ssl_kwargs if os.environ.get('TIDB_SSL', 'false').lower() == 'true' else {}, 35 | ) 36 | 37 | 38 | # Step 2: Define a table with a vector column. 39 | 40 | # Create table without HNSW index. 41 | class Document(Model): 42 | class Meta: 43 | database = db 44 | table_name = 'peewee_demo_documents' 45 | 46 | content = TextField() 47 | embedding = VectorField(3) 48 | 49 | 50 | # Create table with HNSW index. 51 | class DocumentWithIndex(Model): 52 | class Meta: 53 | database = db 54 | table_name = 'peewee_demo_documents_with_index' 55 | 56 | content = TextField() 57 | embedding = VectorField(3) 58 | 59 | 60 | db.connect() 61 | db.drop_tables([Document, DocumentWithIndex]) 62 | db.create_tables([Document, DocumentWithIndex]) 63 | VectorAdaptor(db).create_vector_index( 64 | DocumentWithIndex.embedding, 65 | DistanceMetric.COSINE, 66 | ) 67 | 68 | # Step 3. Insert embeddings into the table. 69 | Document.create(content='dog', embedding=[1, 2, 1]) 70 | Document.create(content='fish', embedding=[1, 2, 4]) 71 | Document.create(content='tree', embedding=[1, 0, 0]) 72 | 73 | # Step 4. Get the 3-nearest neighbor documents. 
74 | print('Get 3-nearest neighbor documents:') 75 | distance = Document.embedding.cosine_distance([1, 2, 3]).alias('distance') 76 | results = Document.select(Document, distance).order_by(distance).limit(3) 77 | 78 | for doc in results: 79 | print(f' - distance: {doc.distance}\n' 80 | f' document: {doc.content}') 81 | 82 | # Step 5. Get documents within a certain distance. 83 | print('Get documents within a certain distance:') 84 | distance_expression = Document.embedding.cosine_distance([1, 2, 3]) 85 | distance = distance_expression.alias('distance') 86 | results = Document.select(Document, distance).where(distance_expression < 0.2).order_by(distance).limit(3) 87 | 88 | for doc in results: 89 | print(f' - distance: {doc.distance}\n' 90 | f' document: {doc.content}') 91 | -------------------------------------------------------------------------------- /examples/orm-peewee-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | PyMySQL==1.1.0 2 | python-dotenv==1.0.0 3 | peewee==3.17.5 4 | tidb-vector>=0.0.9 -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | TIDB_HOST=gateway01.****.prod.aws.tidbcloud.com 2 | TIDB_PORT=4000 3 | TIDB_USERNAME=******.root 4 | TIDB_PASSWORD=******** 5 | TIDB_DATABASE=test 6 | # TiDB Serverless Cluster requires SSL connection for public network access. 7 | # For local TiDB cluster, please set TIDB_SSL=false to disable SSL. 
8 | TIDB_SSL=true -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # Integrate TiDB Vector Search with SQLAlchemy ORM 2 | 3 | This is a simple demo to show how to integrate TiDB Vector Search with the SQLAlchemy ORM to search for similar text in a TiDB Serverless cluster. 4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/orm-sqlalchemy-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Create a `.env` file via the following command. 35 | 36 | ```shell 37 | cp .env.example .env 38 | ``` 39 | 40 | Copy the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then replace the placeholders in the `.env` file. 
41 | 42 | ```bash 43 | TIDB_DATABASE_URL=mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true 44 | ``` 45 | 46 | ### Run this example 47 | 48 | ```text 49 | $ python sqlalchemy-quickstart.py 50 | Get 3-nearest neighbor documents: 51 | - distance: 0.00853986601633272 52 | document: fish 53 | - distance: 0.12712843905603044 54 | document: dog 55 | - distance: 0.7327387580875756 56 | document: tree 57 | Get documents within a certain distance: 58 | - distance: 0.00853986601633272 59 | document: fish 60 | - distance: 0.12712843905603044 61 | document: dog 62 | ``` -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | PyMySQL==1.1.0 2 | python-dotenv==1.0.0 3 | SQLAlchemy==2.0.30 4 | tidb-vector>=0.0.9 -------------------------------------------------------------------------------- /examples/orm-sqlalchemy-quickstart/sqlalchemy-quickstart.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dotenv 3 | 4 | from sqlalchemy import Column, Integer, create_engine, Text, URL 5 | from sqlalchemy.orm import declarative_base, Session 6 | from tidb_vector.sqlalchemy import VectorType, VectorAdaptor 7 | from tidb_vector.constants import DistanceMetric 8 | 9 | dotenv.load_dotenv() 10 | 11 | # Step 1: Connect to TiDB using SQLAlchemy. 12 | 13 | # Using `pymysql` as the driver. 14 | drivername = 'mysql+pymysql' 15 | ssl_kwargs = { 16 | 'ssl_verify_cert': 'true', 17 | 'ssl_verify_identity': 'true', 18 | } 19 | 20 | # Using `mysqlclient` as the driver. 
21 | # drivername = 'mysql+mysqldb' 22 | # ssl_kwargs = { 23 | # 'ssl_mode': 'VERIFY_IDENTITY', 24 | # 'ssl': { 25 | # # Root certificate default path 26 | # # https://docs.pingcap.com/tidbcloud/secure-connections-to-serverless-clusters/#root-certificate-default-path 27 | # 'ca': os.environ.get('TIDB_CA_PATH', '/path/to/ca.pem'), 28 | # }, 29 | # } 30 | 31 | engine = create_engine(URL.create( 32 | drivername=drivername, 33 | username=os.environ['TIDB_USERNAME'], 34 | password=os.environ['TIDB_PASSWORD'], 35 | host=os.environ['TIDB_HOST'], 36 | port=os.environ['TIDB_PORT'], 37 | database=os.environ['TIDB_DATABASE'], 38 | query=ssl_kwargs if os.environ.get('TIDB_SSL', 'false').lower() == 'true' else {}, 39 | )) 40 | 41 | 42 | # Step 2: Define a table with a vector column. 43 | Base = declarative_base() 44 | 45 | 46 | class Document(Base): 47 | __tablename__ = 'sqlalchemy_demo_documents' 48 | id = Column(Integer, primary_key=True) 49 | content = Column(Text) 50 | embedding = Column(VectorType(3)) 51 | 52 | 53 | # Or add HNSW index when creating table. 54 | class DocumentWithIndex(Base): 55 | __tablename__ = 'sqlalchemy_demo_documents_with_index' 56 | id = Column(Integer, primary_key=True) 57 | content = Column(Text) 58 | embedding = Column(VectorType(3)) 59 | 60 | 61 | Base.metadata.drop_all(engine) 62 | Base.metadata.create_all(engine) 63 | VectorAdaptor(engine).create_vector_index( 64 | DocumentWithIndex.embedding, 65 | DistanceMetric.COSINE, 66 | skip_existing=True, 67 | ) 68 | 69 | 70 | # Step 3: Insert embeddings into the table. 71 | with Session(engine) as session: 72 | session.add(Document(content="dog", embedding=[1, 2, 1])) 73 | session.add(Document(content="fish", embedding=[1, 2, 4])) 74 | session.add(Document(content="tree", embedding=[1, 0, 0])) 75 | session.commit() 76 | 77 | 78 | # Step 4: Get the 3-nearest neighbor documents. 
79 | print('Get 3-nearest neighbor documents:') 80 | with Session(engine) as session: 81 | distance = Document.embedding.cosine_distance([1, 2, 3]).label('distance') 82 | results = session.query(Document, distance).order_by(distance).limit(3).all() 83 | 84 | for doc, distance in results: 85 | print(f' - distance: {distance}\n' 86 | f' document: {doc.content}') 87 | 88 | # Step 5: Get documents within a certain distance. 89 | print('Get documents within a certain distance:') 90 | with (Session(engine) as session): 91 | distance = Document.embedding.cosine_distance([1, 2, 3]).label('distance') 92 | results = session.query( 93 | Document, distance 94 | ).filter(distance < 0.2).order_by(distance).limit(3).all() 95 | 96 | for doc, distance in results: 97 | print(f' - distance: {distance}\n' 98 | f' document: {doc.content}') 99 | -------------------------------------------------------------------------------- /examples/python-client-quickstart/.env.example: -------------------------------------------------------------------------------- 1 | # A example database URL to connect to a TiDB cluster from macOS: 2 | # mysql+pymysql://.root:@gateway01..prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true 3 | TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 4 | -------------------------------------------------------------------------------- /examples/python-client-quickstart/README.md: -------------------------------------------------------------------------------- 1 | # TiDB Vector Search Python Client Quickstart 2 | 3 | This is a simple demo to show how to use the TiDB Vector Search Python Client to search for similar text in a TiDB Serverless cluster. 
4 | 5 | ## Prerequisites 6 | 7 | - A running TiDB Serverless cluster with vector search enabled 8 | - Python 3.8 or later 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/python-client-quickstart 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Get the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, `DATABASE`, and `CA` parameters from the TiDB Cloud console (see [Prerequisites](../README.md#prerequisites)), and then replace the following placeholders to get the `TIDB_DATABASE_URL`. 35 | 36 | ```bash 37 | export TIDB_DATABASE_URL="mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true" 38 | ``` 39 | or create a `.env` file with the above environment variables. 40 | 41 | ### Run this example 42 | 43 | ```text 44 | $ python example.py 45 | Downloading and loading the embedding model... 46 | Search result ("a swimming animal"): 47 | - text: "fish", distance: 0.4562914811223072 48 | - text: "dog", distance: 0.6469335836410557 49 | - text: "tree", distance: 0.798545178640937 50 | ``` -------------------------------------------------------------------------------- /examples/python-client-quickstart/example.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from tidb_vector.integrations import TiDBVectorClient 4 | from sentence_transformers import SentenceTransformer 5 | from dotenv import load_dotenv 6 | 7 | # Step 1. 
Initialize embedding model 8 | 9 | print("Downloading and loading the embedding model...") 10 | embed_model = SentenceTransformer("sentence-transformers/msmarco-MiniLM-L12-cos-v5", trust_remote_code=True) 11 | embed_model_dims = embed_model.get_sentence_embedding_dimension() 12 | 13 | 14 | def text_to_embedding(text): 15 | """Generates vector embeddings for the given text.""" 16 | embedding = embed_model.encode(text) 17 | return embedding.tolist() 18 | 19 | 20 | # Step 2. Initialize TiDBVectorClient instance 21 | 22 | load_dotenv() 23 | 24 | vector_store = TiDBVectorClient( 25 | # The table which will store the vector data. 26 | table_name='embedded_documents', 27 | # The connection string to the TiDB cluster. 28 | # The connection string should be in the format of: 29 | # mysql+pymysql://:@:4000/?ssl_ca=&ssl_verify_cert=true&ssl_verify_identity=true 30 | connection_string=os.environ.get('TIDB_DATABASE_URL'), 31 | # The dimension of the vector generated by the embedding model. 32 | vector_dimension=embed_model_dims, 33 | # Determine whether to recreate the table if it already exists. 34 | drop_existing_table=True, 35 | ) 36 | 37 | # Step 3. 
Bulk insert objects and their embeddings 38 | 39 | documents = [ 40 | { 41 | "id": "f8e7dee2-63b6-42f1-8b60-2d46710c1971", 42 | "text": "dog", 43 | "embedding": text_to_embedding("dog"), 44 | "metadata": {"category": "animal"}, 45 | }, 46 | { 47 | "id": "8dde1fbc-2522-4ca2-aedf-5dcb2966d1c6", 48 | "text": "fish", 49 | "embedding": text_to_embedding("fish"), 50 | "metadata": {"category": "animal"}, 51 | }, 52 | { 53 | "id": "e4991349-d00b-485c-a481-f61695f2b5ae", 54 | "text": "tree", 55 | "embedding": text_to_embedding("tree"), 56 | "metadata": {"category": "plant"}, 57 | }, 58 | ] 59 | 60 | vector_store.insert( 61 | ids=[doc["id"] for doc in documents], 62 | texts=[doc["text"] for doc in documents], 63 | embeddings=[doc["embedding"] for doc in documents], 64 | metadatas=[doc["metadata"] for doc in documents], 65 | ) 66 | 67 | # Step 4. Perform vector search to find the most semantically similar documents to the query. 68 | 69 | 70 | def print_result(query, result): 71 | print(f"Search result (\"{query}\"):") 72 | for r in result: 73 | print(f"- text: \"{r.document}\", distance: {r.distance}") 74 | 75 | 76 | query = "a swimming animal" 77 | query_embedding = text_to_embedding(query) 78 | search_result = vector_store.query(query_embedding, k=3) 79 | print_result(query, search_result) 80 | -------------------------------------------------------------------------------- /examples/python-client-quickstart/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv==1.0.0 2 | PyMySQL==1.1.0 3 | sentence-transformers==3.0.1 4 | SQLAlchemy==2.0.30 5 | tidb-vector 6 | -------------------------------------------------------------------------------- /examples/semantic-cache/README.md: -------------------------------------------------------------------------------- 1 | # Semantic Cache with Jina AI and TiDB Vector 2 | Semantic cache is a cache that stores the semantic information of the data. 
It can be used to speed up the search process by storing the embeddings of the data and searching for similar embeddings. This example demonstrates how to use Jina AI to generate embeddings for text data and store the embeddings in TiDB Vector Storage. It also shows how to search for similar embeddings in TiDB Vector Storage. 3 | 4 | ## Prerequisites 5 | 6 | - A running TiDB Serverless cluster with vector search enabled 7 | - Python 3.8 or later 8 | - Jina AI API key 9 | 10 | ## Run the example 11 | 12 | ### Clone this repo 13 | 14 | ```bash 15 | git clone https://github.com/pingcap/tidb-vector-python.git 16 | ``` 17 | 18 | ### Create a virtual environment 19 | 20 | ```bash 21 | cd tidb-vector-python/examples/semantic-cache 22 | python3 -m venv .venv 23 | source .venv/bin/activate 24 | ``` 25 | 26 | ### Install dependencies 27 | 28 | ```bash 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | ### Set the environment variables 33 | 34 | Get the `HOST`, `PORT`, `USERNAME`, `PASSWORD`, and `DATABASE` from the TiDB Cloud console, as described in the [Prerequisites](../README.md#prerequisites) section. Then set the following environment variables: 35 | 36 | ```bash 37 | export DATABASE_URI="mysql+pymysql://34u7xMnnDLSkjV1.root:@gateway01.eu-central-1.prod.aws.tidbcloud.com:4000/test?ssl_ca=/etc/ssl/cert.pem&ssl_verify_cert=true&ssl_verify_identity=true" 38 | ``` 39 | or create a `.env` file with the above environment variables. 40 | 41 | 42 | ### Run this example 43 | 44 | 45 | #### Start the semantic cache server 46 | 47 | ```bash 48 | fastapi dev cache.py 49 | ``` 50 | 51 | #### Test the API 52 | 53 | Get the Jina AI API key from the [Jina AI Embedding API](https://jina.ai/embeddings/) page, and save it somewhere safe for later use. 
54 | 55 | `POST /set` 56 | 57 | ```bash 58 | curl --location ':8000/set' \ 59 | --header 'Content-Type: application/json' \ 60 | --header 'Authorization: Bearer ' \ 61 | --data '{ 62 | "key": "what is tidb", 63 | "value": "tidb is a mysql-compatible and htap database" 64 | }' 65 | ``` 66 | 67 | `GET /get/` 68 | 69 | ```bash 70 | curl --location ':8000/get/what%27s%20tidb%20and%20tikv?max_distance=0.5' \ 71 | --header 'Content-Type: application/json' \ 72 | --header 'Authorization: Bearer ' 73 | ``` -------------------------------------------------------------------------------- /examples/semantic-cache/cache.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from typing import Optional, Annotated 4 | 5 | import requests 6 | import dotenv 7 | from fastapi import Depends, FastAPI 8 | from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer 9 | from sqlmodel import ( 10 | SQLModel, 11 | Session, 12 | create_engine, 13 | select, 14 | Field, 15 | Column, 16 | String, 17 | Text, 18 | DateTime, 19 | ) 20 | from sqlalchemy import func 21 | from tidb_vector.sqlalchemy import VectorType 22 | dotenv.load_dotenv() 23 | 24 | 25 | # Configuration from .env 26 | # Example: "mysql+pymysql://:@:/?ssl_mode=VERIFY_IDENTITY&ssl_ca=/etc/ssl/cert.pem" 27 | DATABASE_URI = os.getenv('DATABASE_URI') 28 | # Ref: https://docs.pingcap.com/tidb/stable/time-to-live 29 | # Default: 604800 SECOND (1 week) 30 | TIME_TO_LIVE = os.getenv('TIME_TO_LIVE') 31 | 32 | 33 | # Get Embeddings from Jina AI 34 | def generate_embeddings(jinaai_api_key: str, text: str): 35 | JINAAI_API_URL = 'https://api.jina.ai/v1/embeddings' 36 | JINAAI_HEADERS = { 37 | 'Content-Type': 'application/json', 38 | 'Authorization': f'Bearer {jinaai_api_key}' 39 | } 40 | JINAAI_REQUEST_DATA = { 41 | 'input': [text], 42 | 'model': 'jina-embeddings-v2-base-en' # with dimisions 768 43 | } 44 | response = requests.post(JINAAI_API_URL, 
headers=JINAAI_HEADERS, json=JINAAI_REQUEST_DATA) 45 | return response.json()['data'][0]['embedding'] 46 | 47 | 48 | class Cache(SQLModel, table=True): 49 | __table_args__ = { 50 | # Ref: https://docs.pingcap.com/tidb/stable/time-to-live 51 | 'mysql_TTL': f'created_at + INTERVAL {TIME_TO_LIVE} SECOND', 52 | } 53 | 54 | id: Optional[int] = Field(default=None, primary_key=True) 55 | key: str = Field(sa_column=Column(String(255), unique=True, nullable=False)) 56 | key_vec: Optional[list[float]]= Field( 57 | sa_column=Column( 58 | VectorType(768), 59 | default=None, 60 | nullable=False, 61 | ) 62 | ) 63 | value: Optional[str] = Field(sa_column=Column(Text)) 64 | created_at: datetime = Field( 65 | sa_column=Column(DateTime, server_default=func.now(), nullable=False) 66 | ) 67 | updated_at: datetime = Field( 68 | sa_column=Column( 69 | DateTime, server_default=func.now(), onupdate=func.now(), nullable=False 70 | ) 71 | ) 72 | 73 | engine = create_engine(DATABASE_URI) 74 | SQLModel.metadata.create_all(engine) 75 | 76 | app = FastAPI() 77 | security = HTTPBearer() 78 | 79 | @app.get("/") 80 | def index(): 81 | return { 82 | "message": "Welcome to Semantic Cache API, it is built using Jina AI Embeddings API and TiDB Vector", 83 | "docs": "/docs", 84 | "redoc": "/redoc", 85 | "about": "https://github.com/pingcap/tidb-vector-python/blob/main/examples/semantic-cache/README.md", 86 | "config": { 87 | "TIME_TO_LIVE": int(TIME_TO_LIVE), 88 | "EMBEDDING_DIMENSIONS": 768, 89 | "EMBEDDING_PROVIDER": "Jina AI", 90 | "EMBEDDING_MODEL": "jina-embeddings-v2-base-en", 91 | } 92 | } 93 | 94 | 95 | # /set method of Semantic Cache 96 | @app.post("/set") 97 | def set( 98 | credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], 99 | cache: Cache, 100 | ): 101 | cache.key_vec = generate_embeddings(credentials.credentials, cache.key) 102 | 103 | with Session(engine) as session: 104 | session.add(cache) 105 | session.commit() 106 | 107 | return {'message': 'Cache has been set'} 
108 | 109 | 110 | @app.get("/get/{key}") 111 | def get( 112 | credentials: Annotated[HTTPAuthorizationCredentials, Depends(security)], 113 | key: str, 114 | max_distance: Optional[float] = 0.1, 115 | ): 116 | key_vec = generate_embeddings(credentials.credentials, key) 117 | # The max value of distance is 0.3 118 | max_distance = min(max_distance, 0.3) 119 | 120 | with Session(engine) as session: 121 | result = session.exec( 122 | select( 123 | Cache, 124 | Cache.key_vec.cosine_distance(key_vec).label('distance') 125 | ).order_by( 126 | 'distance' 127 | ).limit(1) 128 | ).first() 129 | 130 | if result is None: 131 | return {"message": "Cache not found"}, 404 132 | 133 | cache, distance = result 134 | if distance > max_distance: 135 | return {"message": "Cache not found"}, 404 136 | 137 | return { 138 | "key": cache.key, 139 | "value": cache.value, 140 | "distance": distance 141 | } -------------------------------------------------------------------------------- /examples/semantic-cache/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | PyMySQL 3 | sqlmodel==0.0.19 4 | tidb-vector>=0.0.9 5 | python-dotenv 6 | fastapi 7 | -------------------------------------------------------------------------------- /examples/static/images/tidbcloud-connect-parameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/examples/static/images/tidbcloud-connect-parameters.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "tidb-vector" 3 | # this version is usless, now read the version from __init__.py 4 | version = "0.0.0" 5 | description = "A Python client for TiDB Vector" 6 | authors = ["IANTHEREAL "] 7 | license = 
"Apache-2.0"
readme = "README.md"
packages = [{include = "tidb_vector"}]

[tool.poetry-version-plugin]
source = "init"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
numpy = "^1"
SQLAlchemy = {version = ">=1.4,<3", optional = true}

[tool.poetry.extras]
client = ["SQLAlchemy"]

[tool.poetry.group.test.dependencies]
# The only dependencies that should be added are
# dependencies used for running tests.
# Any dependencies that do not meet that criteria will be removed.
pytest = "^7.3.0"
pytest-cov = "^4.0.0"
pytest-dotenv = "^0.5.2"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
-------------------------------------------------------------------------------- /tests/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/__init__.py
-------------------------------------------------------------------------------- /tests/config.py: --------------------------------------------------------------------------------
import os


class TestConfig:
    """Connection settings for the TiDB instance used by the test suite.

    Every value is read from a ``TEST_TIDB_*`` environment variable, with
    defaults matching the CI service container (127.0.0.1:4000, root, no
    password, no SSL).
    """

    TIDB_HOST = os.getenv("TEST_TIDB_HOST", "127.0.0.1")
    TIDB_USER = os.getenv("TEST_TIDB_USER", "root")
    TIDB_PASSWORD = os.getenv("TEST_TIDB_PASSWORD", "")
    TIDB_PORT = int(os.getenv("TEST_TIDB_PORT", "4000"))
    # "true"/"1" (case-insensitive) enable SSL; any other value disables it.
    TIDB_SSL = os.getenv("TEST_TIDB_SSL", "false").lower() in ["true", "1"]
-------------------------------------------------------------------------------- /tests/integrations/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/integrations/__init__.py
--------------------------------------------------------------------------------
/tests/integrations/test_utils.py: -------------------------------------------------------------------------------- 1 | """Test TiDB Vector Search functionality.""" 2 | from __future__ import annotations 3 | 4 | from tidb_vector.integrations.utils import extract_info_from_column_definition 5 | 6 | 7 | def test_extract_info_from_column_definition(): 8 | # Test case with dimension and distance metric 9 | column_type = "VECTOR(128)" 10 | column_comment = "hnsw(distance=cosine)" 11 | expected_result = (128, "cosine") 12 | assert ( 13 | extract_info_from_column_definition(column_type, column_comment) 14 | == expected_result 15 | ) 16 | 17 | # Test case with dimension but no distance metric 18 | column_type = "VECTOR(256)" 19 | column_comment = "some comment" 20 | expected_result = (256, None) 21 | assert ( 22 | extract_info_from_column_definition(column_type, column_comment) 23 | == expected_result 24 | ) 25 | 26 | # Test case with no dimension and no distance metric 27 | column_type = "VECTOR" 28 | column_comment = "another comment" 29 | expected_result = (None, None) 30 | assert ( 31 | extract_info_from_column_definition(column_type, column_comment) 32 | == expected_result 33 | ) 34 | 35 | # Test case with no dimension and no comment 36 | column_type = "VECTOR" 37 | column_comment = "" 38 | expected_result = (None, None) 39 | assert ( 40 | extract_info_from_column_definition(column_type, column_comment) 41 | == expected_result 42 | ) 43 | 44 | # Test case with dimension but no comment 45 | column_type = "VECTOR(256)" 46 | column_comment = "" 47 | expected_result = (256, None) 48 | assert ( 49 | extract_info_from_column_definition(column_type, column_comment) 50 | == expected_result 51 | ) 52 | 53 | # Test case without index type 54 | column_type = "VECTOR" 55 | column_comment = "distance=l2" 56 | expected_result = (None, "l2") 57 | assert ( 58 | extract_info_from_column_definition(column_type, column_comment) 59 | == expected_result 60 | ) 61 | 62 | # Test case with 
addition comment content 63 | column_type = "VECTOR(128)" 64 | column_comment = "test, hnsw(distance=l2)" 65 | expected_result = (128, "l2") 66 | assert ( 67 | extract_info_from_column_definition(column_type, column_comment) 68 | == expected_result 69 | ) 70 | -------------------------------------------------------------------------------- /tests/peewee/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/peewee/__init__.py -------------------------------------------------------------------------------- /tests/sqlalchemy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pingcap/tidb-vector-python/3740022a4aac62891650abc8160fcef97fc26be7/tests/sqlalchemy/__init__.py -------------------------------------------------------------------------------- /tidb_vector/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import MAX_DIM, MIN_DIM, DistanceMetric, VectorDataType 2 | 3 | __version__ = "0.0.14" 4 | __all__ = ["MAX_DIM", "MIN_DIM", "DistanceMetric", "VectorDataType"] 5 | -------------------------------------------------------------------------------- /tidb_vector/constants.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import typing 3 | 4 | import numpy 5 | 6 | # TiDB Vector has a limitation on the dimension length 7 | MAX_DIM = 16000 8 | MIN_DIM = 1 9 | 10 | 11 | VectorDataType = typing.Union[numpy.ndarray, typing.List[float]] 12 | 13 | 14 | class DistanceMetric(enum.Enum): 15 | """ 16 | An enumeration representing different types of distance metrics. 17 | 18 | - `DistanceMetric.L2`: L2 (Euclidean) distance metric. 19 | - `DistanceMetric.COSINE`: Cosine distance metric. 
20 | """ 21 | 22 | L2 = "L2" 23 | COSINE = "COSINE" 24 | 25 | def to_sql_func(self): 26 | """ 27 | Converts the DistanceMetric to its corresponding SQL function name. 28 | 29 | Returns: 30 | str: The SQL function name. 31 | 32 | Raises: 33 | ValueError: If the DistanceMetric enum member is not supported. 34 | """ 35 | if self == DistanceMetric.L2: 36 | return "VEC_L2_DISTANCE" 37 | elif self == DistanceMetric.COSINE: 38 | return "VEC_COSINE_DISTANCE" 39 | else: 40 | raise ValueError("unsupported distance metric") 41 | -------------------------------------------------------------------------------- /tidb_vector/integrations/__init__.py: -------------------------------------------------------------------------------- 1 | from tidb_vector.integrations.vector_client import TiDBVectorClient 2 | from tidb_vector.integrations.utils import ( 3 | EmbeddingColumnMismatchError, 4 | check_table_existence, 5 | get_embedding_column_definition, 6 | ) 7 | 8 | __all__ = [ 9 | "TiDBVectorClient", 10 | "EmbeddingColumnMismatchError", 11 | "check_table_existence", 12 | "get_embedding_column_definition", 13 | ] 14 | -------------------------------------------------------------------------------- /tidb_vector/integrations/utils.py: -------------------------------------------------------------------------------- 1 | import sqlalchemy 2 | import re 3 | from typing import Any, Dict, Optional 4 | 5 | 6 | class EmbeddingColumnMismatchError(ValueError): 7 | """ 8 | Exception raised when the existing embedding column does not match the expected dimension. 9 | 10 | Attributes: 11 | existing_col (str): The definition of the existing embedding column. 12 | expected_col (str): The definition of the expected embedding column. 13 | """ 14 | 15 | def __init__(self, existing_col, expected_col): 16 | self.existing_col = existing_col 17 | self.expected_col = expected_col 18 | super().__init__( 19 | f"The existing embedding column ({existing_col}) does not match the expected dimension ({expected_col})." 
20 | ) 21 | 22 | 23 | def check_table_existence( 24 | connection_string: str, 25 | table_name: str, 26 | engine_args: Optional[Dict[str, Any]] = None, 27 | ) -> bool: 28 | """ 29 | Check if the vector table exists in the database 30 | 31 | Args: 32 | connection_string (str): The connection string for the database. 33 | table_name (str): The name of the table to check. 34 | engine_args (Optional[Dict[str, Any]]): Additional arguments for the engine. 35 | 36 | Returns: 37 | bool: True if the table exists, False otherwise. 38 | """ 39 | engine = sqlalchemy.create_engine(connection_string, **(engine_args or {})) 40 | try: 41 | inspector = sqlalchemy.inspect(engine) 42 | return table_name in inspector.get_table_names() 43 | finally: 44 | engine.dispose() 45 | 46 | 47 | def get_embedding_column_definition( 48 | connection_string: str, 49 | table_name: str, 50 | column_name: str, 51 | engine_args: Optional[Dict[str, Any]] = None, 52 | ): 53 | """ 54 | Retrieves the column definition of an embedding column from a database table. 55 | 56 | Args: 57 | connection_string (str): The connection string to the database. 58 | table_name (str): The name of the table. 59 | column_name (str): The name of the column. 60 | engine_args (Optional[Dict[str, Any]]): Additional arguments for the engine. 61 | 62 | Returns: 63 | tuple: A tuple containing the dimension (int or None) and distance metric (str or None). 
64 | """ 65 | engine = sqlalchemy.create_engine(connection_string, **(engine_args or {})) 66 | try: 67 | with engine.connect() as connection: 68 | query = f"""SELECT COLUMN_TYPE, COLUMN_COMMENT 69 | FROM INFORMATION_SCHEMA.COLUMNS 70 | WHERE TABLE_NAME = '{table_name}' AND COLUMN_NAME = '{column_name}'""" 71 | result = connection.execute(sqlalchemy.text(query)).fetchone() 72 | if result: 73 | return extract_info_from_column_definition(result[0], result[1]) 74 | finally: 75 | engine.dispose() 76 | 77 | return None, None 78 | 79 | 80 | def extract_info_from_column_definition(column_type, column_comment): 81 | """ 82 | Extracts the dimension and distance metric from a column definition, 83 | supporting both optional dimension and optional comment. 84 | 85 | Args: 86 | column_type (str): The column definition, possibly including dimension and a comment. 87 | 88 | Returns: 89 | tuple: A tuple containing the dimension (int or None) and the distance metric (str or None). 90 | """ 91 | # Try to extract the dimension, which is optional. 92 | dimension_match = re.search(r"VECTOR(?:\((\d+)\))?", column_type, re.IGNORECASE) 93 | dimension = ( 94 | int(dimension_match.group(1)) 95 | if dimension_match and dimension_match.group(1) 96 | else None 97 | ) 98 | 99 | # Extracting index type and distance metric from the comment, supporting both single and double quotes. 
100 | distance_match = re.search(r"distance=([^,\)]+)", column_comment) 101 | distance = distance_match.group(1) if distance_match else None 102 | 103 | return dimension, distance 104 | -------------------------------------------------------------------------------- /tidb_vector/peewee/__init__.py: -------------------------------------------------------------------------------- 1 | from .vector_type import VectorField 2 | from .adaptor import VectorAdaptor 3 | 4 | __all__ = ["VectorField", "VectorAdaptor"] 5 | -------------------------------------------------------------------------------- /tidb_vector/peewee/adaptor.py: -------------------------------------------------------------------------------- 1 | import peewee 2 | import tidb_vector 3 | from .vector_type import VectorField 4 | 5 | 6 | class VectorAdaptor: 7 | """ 8 | A wrapper over existing Peewee Database to provide additional vector search capabilities. 9 | """ 10 | 11 | engine: peewee.Database 12 | 13 | def __init__(self, engine: peewee.Database): 14 | self.engine = engine 15 | 16 | def _check_vector_column(self, field: VectorField): 17 | if not isinstance(field, VectorField): 18 | raise ValueError("Not a vector field") 19 | 20 | def has_vector_index(self, field: VectorField) -> bool: 21 | """ 22 | Check if the index for the vector column exists. 
        """

        self._check_vector_column(field)

        table_name = field.model._meta.table_name

        # TODO: Better quote
        cursor: peewee.CursorWrapper = self.engine.execute_sql(
            f"SHOW INDEX FROM `{table_name}`"
        )
        # Locate the "Column_name" column in the SHOW INDEX result set by header name.
        column_name_idx = None
        for idx, column in enumerate(cursor.description):
            if column[0].lower() == "column_name":
                column_name_idx = idx
                break
        if column_name_idx is None:
            raise ValueError("Failed to parse SHOW INDEX result")

        # NOTE(review): any index covering this column counts as "has an index",
        # not only vector indexes — confirm this is the intended semantics.
        for row in cursor:
            column_name = row[column_name_idx]
            if column_name.lower() == field.name.lower():
                return True

        return False

    def create_vector_index(
        self,
        field: VectorField,
        distance_metric: tidb_vector.DistanceMetric,
        skip_existing: bool = False,
    ):
        """
        Create vector index for the vector column.

        Parameters
        ----------
        field : peewee.Field
            The field for which the vector index is to be created.

        distance_metric : tidb_vector.DistanceMetric
            The distance metric to be used for the vector index.
            Available values are:
            - tidb_vector.DistanceMetric.L2
            - tidb_vector.DistanceMetric.COSINE

        skip_existing : bool
            If True, skips creating the index if it already exists. Default is False.

        Raises
        ------
        ValueError
            If the vector field does not have a fixed dimension.

        ValueError
            If the field is not a vector field.

        Note
        ----
        If you want to use the high-availability columnar storage feature, use raw SQL instead.

        """

        self._check_vector_column(field)

        if field.dimensions is None:
            raise ValueError(
                "Vector index is only supported for fixed dimension vectors"
            )

        if skip_existing:
            if self.has_vector_index(field):
                # TODO: Currently there is no easy way to verify whether the distance
                # metric is correct. We should check it and throw error if distance metric is not matching
                return

        table_name = field.model._meta.table_name
        column_name = field.name
        index_name = f"vec_idx_{field.name}"

        # Ordering matters: the TiFlash replica is set up before ADD VECTOR INDEX.
        self.engine.execute_sql(f"ALTER TABLE `{table_name}` SET TIFLASH REPLICA 1")
        self.engine.execute_sql(
            f"""
            ALTER TABLE `{table_name}`
            ADD VECTOR INDEX `{index_name}` (({distance_metric.to_sql_func()}(`{column_name}`)))
            """
        )
-------------------------------------------------------------------------------- /tidb_vector/peewee/vector_type.py: --------------------------------------------------------------------------------
from typing import Optional
from peewee import Field, fn

from tidb_vector.utils import decode_vector, encode_vector


class VectorField(Field):
    """Peewee field type for TiDB VECTOR columns, with distance-function helpers."""

    # Number of dimensions; None means the column accepts vectors of any length.
    dimensions: Optional[int]

    field_type = "VECTOR"

    def __init__(self, dimensions: Optional[int] = None, *args, **kwargs):
        self.dimensions = dimensions
        super(VectorField, self).__init__(*args, **kwargs)

    def get_modifiers(self):
        # Renders the DDL as VECTOR(<dimensions>) when set, plain VECTOR otherwise.
        return self.dimensions and [self.dimensions] or None

    def db_value(self, value):
        # Serialize to TiDB's textual vector form for storage.
        return encode_vector(value)

    def python_value(self, value):
        # Parse the stored textual form back into an array.
        return decode_vector(value)

    def l1_distance(self, vector):
        return fn.VEC_L1_DISTANCE(self, self.to_value(vector))

    def l2_distance(self, vector):
        return fn.VEC_L2_DISTANCE(self, self.to_value(vector))

    def cosine_distance(self, vector):
        return fn.VEC_COSINE_DISTANCE(self, self.to_value(vector))

    def negative_inner_product(self, vector):
        return fn.VEC_NEGATIVE_INNER_PRODUCT(self, self.to_value(vector))
-------------------------------------------------------------------------------- /tidb_vector/sqlalchemy/__init__.py: --------------------------------------------------------------------------------
from .vector_type import
VectorType
from .adaptor import VectorAdaptor

__all__ = ["VectorType", "VectorAdaptor"]
-------------------------------------------------------------------------------- /tidb_vector/sqlalchemy/adaptor.py: --------------------------------------------------------------------------------
import sqlalchemy
import tidb_vector
from .vector_type import VectorType


class VectorAdaptor:
    """
    A wrapper over existing SQLAlchemy engine to provide additional vector search capabilities.
    """

    engine: sqlalchemy.Engine

    def __init__(self, engine: sqlalchemy.Engine):
        self.engine = engine

    def _check_vector_column(self, column: sqlalchemy.Column):
        if not isinstance(column.type, VectorType):
            raise ValueError("Not a vector column")

    def has_vector_index(self, column: sqlalchemy.Column) -> bool:
        """
        Check if the index for the vector column exists.
        """

        self._check_vector_column(column)

        with self.engine.begin() as conn:
            table_name = conn.dialect.identifier_preparer.format_table(column.table)
            query = sqlalchemy.text(f"SHOW INDEX FROM {table_name}")
            result = conn.execute(query)
            result_dict = result.mappings().all()
            # NOTE(review): any index covering this column counts, regardless of
            # index kind — confirm this is the intended semantics.
            for row in result_dict:
                if row["Column_name"].lower() == column.name.lower():
                    return True
        return False

    def create_vector_index(
        self,
        column: sqlalchemy.Column,
        distance_metric: tidb_vector.DistanceMetric,
        skip_existing: bool = False,
    ):
        """
        Create vector index for the vector column.

        Parameters
        ----------
        column : sqlalchemy.Column
            The column for which the vector index is to be created.

        distance_metric : tidb_vector.DistanceMetric
            The distance metric to be used for the vector index.
            Available values are:
            - tidb_vector.DistanceMetric.L2
            - tidb_vector.DistanceMetric.COSINE

        skip_existing : bool
            If True, skips creating the index if it already exists. Default is False.

        Raises
        ------
        ValueError
            If the vector column does not have a fixed dimension.

        ValueError
            If the column is not a vector column.

        Note
        ----
        If you want to use the high-availability columnar storage feature, use raw SQL instead.

        """

        self._check_vector_column(column)

        if column.type.dim is None:
            raise ValueError(
                "Vector index is only supported for fixed dimension vectors"
            )

        if skip_existing:
            if self.has_vector_index(column):
                # TODO: Currently there is no easy way to verify whether the distance
                # metric is correct. We should check it and throw error if distance metric is not matching
                return

        with self.engine.begin() as conn:
            table_name = conn.dialect.identifier_preparer.format_table(column.table)
            column_name = conn.dialect.identifier_preparer.format_column(column)
            index_name = conn.dialect.identifier_preparer.quote(
                f"vec_idx_{column.name}"
            )

            # Ordering matters: the TiFlash replica is set up before ADD VECTOR INDEX.
            query = sqlalchemy.text(f"ALTER TABLE {table_name} SET TIFLASH REPLICA 1")
            conn.execute(query)

            query = sqlalchemy.text(
                f"""
                ALTER TABLE {table_name}
                ADD VECTOR INDEX {index_name} (({distance_metric.to_sql_func()}({column_name})))
                """
            )
            conn.execute(query)
-------------------------------------------------------------------------------- /tidb_vector/sqlalchemy/vector_type.py: --------------------------------------------------------------------------------
from typing import Optional
import sqlalchemy
import tidb_vector
import tidb_vector.utils


class VectorType(sqlalchemy.types.UserDefinedType):
    """
    Represents a vector column type in TiDB.
    """

    # Number of dimensions; None means the column accepts vectors of any length.
    dim: Optional[int]

    cache_ok = True

    def __init__(self, dim: Optional[int] = None):
        if dim is not None and not isinstance(dim, int):
            raise ValueError("expected dimension to be an integer or None")

        # TiDB vector dimension length has limitations
        if dim is not None and (dim < tidb_vector.MIN_DIM or dim > tidb_vector.MAX_DIM):
            raise ValueError(
                f"expected dimension to be in [{tidb_vector.MIN_DIM}, {tidb_vector.MAX_DIM}]"
            )

        super(sqlalchemy.types.UserDefinedType, self).__init__()
        self.dim = dim

    def get_col_spec(self, **kw):
        """
        Returns the column specification for the vector column.

        If the dimension is not specified, it returns "VECTOR".
        Otherwise, it returns "VECTOR(<dim>)".

        :param kw: Additional keyword arguments.
        :return: The column specification string.
        """

        if self.dim is None:
            return "VECTOR"
        return f"VECTOR({self.dim})"

    def bind_processor(self, dialect):
        """Convert the vector float array to a string representation suitable for binding to a database column."""

        def process(value):
            return tidb_vector.utils.encode_vector(value, self.dim)

        return process

    def result_processor(self, dialect, coltype):
        """Convert the vector data from the database into vector array."""

        def process(value):
            return tidb_vector.utils.decode_vector(value)

        return process

    class comparator_factory(sqlalchemy.types.UserDefinedType.Comparator):
        """Returns a comparator factory that provides the distance functions."""

        def l1_distance(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_L1_DISTANCE(self, formatted_other).label(
                "l1_distance"
            )

        def l2_distance(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_L2_DISTANCE(self, formatted_other).label(
                "l2_distance"
            )

        def cosine_distance(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_COSINE_DISTANCE(self, formatted_other).label(
                "cosine_distance"
            )

        def negative_inner_product(self, other: tidb_vector.VectorDataType):
            formatted_other = tidb_vector.utils.encode_vector(other)
            return sqlalchemy.func.VEC_NEGATIVE_INNER_PRODUCT(
                self, formatted_other
            ).label("negative_inner_product")
-------------------------------------------------------------------------------- /tidb_vector/utils.py: --------------------------------------------------------------------------------
import numpy as np
import tidb_vector


def encode_vector(value: tidb_vector.VectorDataType, dim=None):
    """Serialize a vector into TiDB's textual "[v1,v2,...]" form; None passes through.

    Raises ValueError when *dim* is given and the length differs, or when a
    numpy array is not 1-dimensional.
    """
    if value is None:
        return value

    if dim is not None and len(value) != dim:
        raise ValueError(f"expected {dim} dimensions, but got {len(value)}")

    if isinstance(value, np.ndarray):
        if value.ndim != 1:
            raise ValueError("expected ndim to be 1")
        return f"[{','.join(map(str, value))}]"

    # Lists already render as "[v1, v2, ...]" via str().
    return str(value)


def decode_vector(value: str) -> np.ndarray:
    """Parse TiDB's textual "[v1,v2,...]" form into a float32 numpy array; None passes through."""
    if value is None:
        return value

    if value == "[]":
        return np.array([], dtype=np.float32)

    # Strip the surrounding brackets and split on commas.
    return np.array(value[1:-1].split(","), dtype=np.float32)
-------------------------------------------------------------------------------- /tox.ini: --------------------------------------------------------------------------------
[tox]
alwayscopy=true
envlist = py311,py310,py39,py38,lint

[gh-actions]
python =
    3.8: py38
    3.9: py39
    3.10: py310
    3.11: py311

[testenv]
passenv = *
deps =
    pytest
    peewee
    sqlalchemy
    pymysql
commands =
    pytest tests
setenv =
    LANG = en_US.utf-8

# Lint environment: flake8 plus black in check-only (non-mutating) mode.
[testenv:lint]
skip_install = True
allowlist_externals = bash
deps =
    flake8==6.0.0
    black==23.7.0
commands =
    bash -c "flake8 --max-line-length 130 tidb_vector tests"
    bash -c "black --diff --check tidb_vector tests"
--------------------------------------------------------------------------------