├── tests ├── __init__.py ├── utils.py ├── conftest.py ├── pg_vectorizer_test.py ├── sync_client_test.py └── async_client_test.py ├── timescale_vector ├── typings │ ├── __init__.py │ ├── pgvector.pyi │ ├── psycopg2 │ │ ├── __init__.pyi │ │ ├── extras.pyi │ │ ├── pool.pyi │ │ └── extensions.pyi │ ├── langchain │ │ └── docstore │ │ │ └── document.pyi │ ├── asyncpg │ │ ├── pool.pyi │ │ ├── connection.pyi │ │ └── __init__.pyi │ ├── langchain_community │ │ └── vectorstores │ │ │ └── timescalevector.pyi │ └── vcr.pyi ├── __init__.py ├── client │ ├── __init__.py │ ├── utils.py │ ├── uuid_time_range.py │ ├── index.py │ ├── predicates.py │ ├── async_client.py │ ├── sync_client.py │ └── query_builder.py └── pgvectorizer.py ├── nbs ├── requirements.txt ├── sidebar.yml ├── nbdev.yml ├── _quarto.yml ├── styles.css └── 01_pgvectorizer.ipynb ├── MANIFEST.in ├── .github └── workflows │ ├── deploy.yaml │ ├── pyright.yaml │ ├── ruff.yaml │ └── test.yaml ├── docker-compose.yaml ├── CHANGELOG.md ├── NOTICE ├── Untitled.ipynb ├── .gitignore ├── pyproject.toml ├── LICENSE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /timescale_vector/typings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /timescale_vector/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.9" 2 | -------------------------------------------------------------------------------- /nbs/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | asyncpg 3 | psycopg2 4 | pgvector 5 | numpy -------------------------------------------------------------------------------- /timescale_vector/typings/pgvector.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | def register_vector(conn_or_curs: Any) -> None: ... 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | -------------------------------------------------------------------------------- /nbs/sidebar.yml: -------------------------------------------------------------------------------- 1 | website: 2 | sidebar: 3 | contents: 4 | - index.ipynb 5 | - 00_vector.ipynb 6 | - tsv_python_getting_started_tutorial.ipynb 7 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, TypeVar 2 | 3 | from psycopg2.extensions import connection 4 | 5 | T = TypeVar("T") 6 | 7 | def connect(dsn: str = "", **kwargs: Any) -> connection: ... 8 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/extras.pyi: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | from psycopg2.extensions import cursor 4 | 5 | class DictCursor(cursor, Protocol): 6 | def __init__(self) -> None: ... 
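# Usage sketch, not part of the stub (the DSN below is a placeholder): this cursor
# factory is combined with the connect() stub above the same way pgvectorizer.py
# does further down in this repo:
#
#   conn = psycopg2.connect("postgres://postgres:postgres@localhost:5432/postgres")
#   with conn.cursor(cursor_factory=DictCursor) as cursor:
#       cursor.execute("SELECT 1")
#       rows = cursor.fetchall()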
7 | 8 | def register_uuid(oids: int | None = None, conn_or_curs: cursor | None = None) -> None: ... 9 | -------------------------------------------------------------------------------- /nbs/nbdev.yml: -------------------------------------------------------------------------------- 1 | project: 2 | output-dir: _docs 3 | 4 | website: 5 | title: "timescale-vector" 6 | site-url: "https://timescale.github.io/python-vector" 7 | description: "Python library for storing vector data in Postgres" 8 | repo-branch: main 9 | repo-url: "https://github.com/timescale/python-vector" 10 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | permissions: 4 | contents: write 5 | pages: write 6 | 7 | on: 8 | push: 9 | branches: [ "main", "master" ] 10 | workflow_dispatch: 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | steps: [uses: fastai/workflows/quarto-ghp@master] 15 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | db: 3 | image: timescale/timescaledb-ha:pg16 4 | ports: 5 | - "5432:5432" 6 | environment: 7 | - POSTGRES_PASSWORD=postgres 8 | - POSTGRES_USER=postgres 9 | - POSTGRES_DB=postgres 10 | - TIMESCALEDB_TELEMETRY=off 11 | volumes: 12 | - ./data:/var/lib/postgresql/data -------------------------------------------------------------------------------- /nbs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | format: 5 | html: 6 | theme: cosmo 7 | css: styles.css 8 | toc: true 9 | 10 | website: 11 | twitter-card: true 12 | open-graph: true 13 | repo-actions: [issue] 14 | navbar: 15 | background: primary 16 | search: true 17 | sidebar: 18 | style: floating 19 | 20 | metadata-files: [nbdev.yml, sidebar.yml] -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Release notes 2 | 3 | 4 | 5 | ## 0.0.5 6 | 7 | Added contains operator to support metadata array values. 8 | 9 | ## 0.0.4 10 | 11 | Various usability improvements. 12 | 13 | ## 0.0.3 14 | 15 | Add PgVectorizer 16 | Add ability to have predicates on uuid timestamp 17 | 18 | ## 0.0.2 19 | 20 | Add ability to infer start and end date from filters 21 | 22 | 23 | ## 0.0.1 24 | 25 | First Release! 26 | 27 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/pool.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Hashable 2 | from typing import Any 3 | 4 | from psycopg2.extensions import connection 5 | 6 | class SimpleConnectionPool: 7 | def __init__(self, minconn: int, maxconn: int, dsn: str, **kwargs: Any) -> None: ... 8 | def getconn(self, key: Hashable | None = None) -> connection: ... 9 | def putconn(self, conn: connection, key: Hashable | None = None, close: bool = False) -> None: ... 10 | def closeall(self) -> None: ... 
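# Usage sketch, not part of the stub (the DSN below is a placeholder): connections
# are checked out with getconn() and handed back with putconn():
#
#   pool = SimpleConnectionPool(1, 5, "postgres://postgres:postgres@localhost:5432/postgres")
#   conn = pool.getconn()
#   try:
#       with conn.cursor() as cursor:
#           cursor.execute("SELECT 1")
#   finally:
#       pool.putconn(conn)
#   pool.closeall()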
11 | -------------------------------------------------------------------------------- /.github/workflows/pyright.yaml: -------------------------------------------------------------------------------- 1 | name: Type Checking 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | pyright: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install uv 17 | run: pip install uv 18 | - name: Create venv 19 | run: uv venv 20 | - name: Install dependencies 21 | run: | 22 | uv sync 23 | - name: Run Pyright 24 | run: uv run pyright -------------------------------------------------------------------------------- /.github/workflows/ruff.yaml: -------------------------------------------------------------------------------- 1 | name: Ruff Linting and Formatting 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | ruff: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install uv 17 | run: pip install uv 18 | - name: Create venv 19 | run: uv venv 20 | - name: Install dependencies 21 | run: | 22 | uv sync 23 | - name: Run Ruff linter 24 | run: uv run ruff check . 25 | - name: Run Ruff formatter 26 | run: uv run ruff format . --check -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Python client libraries for TimescaleDB (TM) Vector 2 | 3 | Copyright (c) 2018-2022 Timescale, Inc. All Rights Reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
-------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | import vcr 5 | 6 | vcr_cassette_path = os.path.join(os.path.dirname(__file__), "vcr_cassettes") 7 | 8 | 9 | def remove_set_cookie_header(response: dict[str, Any]): 10 | headers = response["headers"] 11 | headers_to_remove = ["set-cookie", "Set-Cookie"] 12 | 13 | for header in headers_to_remove: 14 | if header in headers: 15 | del headers[header] 16 | 17 | return response 18 | 19 | 20 | http_recorder = vcr.VCR( 21 | cassette_library_dir=vcr_cassette_path, 22 | record_mode="once", 23 | filter_headers=["authorization", "cookie"], 24 | before_record_response=remove_set_cookie_header, 25 | ) 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | pytest: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install uv 17 | run: pip install uv 18 | - name: Create venv 19 | run: uv venv 20 | - name: Install dependencies 21 | run: | 22 | uv sync 23 | - name: Start docker-compose 24 | run: docker compose up -d 25 | - name: Run Test 26 | run: uv run pytest 27 | - name: Logs 28 | run: docker compose logs 29 | - name: Stop docker-compose 30 | run: docker compose down -------------------------------------------------------------------------------- /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "d02cf1aa-4cf0-4656-a2f8-100f39233f37", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [] 10 | } 11 | ], 12 | "metadata": { 13 | "kernelspec": { 14 | "display_name": "Python 3 (ipykernel)", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython3", 28 | "version": "3.10.12" 29 | } 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 5 33 | } 34 | -------------------------------------------------------------------------------- /timescale_vector/typings/langchain/docstore/document.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, TypeVar 2 | 3 | from typing_extensions import TypedDict 4 | 5 | class Metadata(TypedDict, total=False): 6 | id: str 7 | blog_id: str 8 | author: str 9 | category: str 10 | published_time: str 11 | 12 | T = TypeVar("T") 13 | 14 | class Document: 15 | """Documents are the basic unit of text in LangChain.""" 16 | 17 | page_content: str 18 | metadata: dict[str, Any] 19 | 20 | def __init__( 21 | self, 22 | page_content: str, 23 | metadata: dict[str, Any] | None = None, 24 | ) -> None: ... 25 | @property 26 | def lc_kwargs(self) -> dict[str, Any]: ... 27 | @classmethod 28 | def is_lc_serializable(cls) -> bool: ... 
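# Usage sketch, not part of the stub (values are illustrative): pg_vectorizer_test.py
# further down builds Documents whose metadata uses the keys declared in the Metadata
# TypedDict above:
#
#   doc = Document(
#       page_content="Author mat, title: first, contents: first_post",
#       metadata={"blog_id": 1, "author": "mat", "category": "personal"},
#   )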
29 | -------------------------------------------------------------------------------- /nbs/styles.css: -------------------------------------------------------------------------------- 1 | .cell { 2 | margin-bottom: 1rem; 3 | } 4 | 5 | .cell > .sourceCode { 6 | margin-bottom: 0; 7 | } 8 | 9 | .cell-output > pre { 10 | margin-bottom: 0; 11 | } 12 | 13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 14 | margin-left: 0.8rem; 15 | margin-top: 0; 16 | background: none; 17 | border-left: 2px solid lightsalmon; 18 | border-top-left-radius: 0; 19 | border-top-right-radius: 0; 20 | } 21 | 22 | .cell-output > .sourceCode { 23 | border: none; 24 | } 25 | 26 | .cell-output > .sourceCode { 27 | background: none; 28 | margin-top: 0; 29 | } 30 | 31 | div.description { 32 | padding-left: 2px; 33 | padding-top: 5px; 34 | font-style: italic; 35 | font-size: 135%; 36 | opacity: 70%; 37 | } 38 | -------------------------------------------------------------------------------- /timescale_vector/typings/asyncpg/pool.pyi: -------------------------------------------------------------------------------- 1 | from contextlib import AbstractAsyncContextManager 2 | from typing import Any 3 | 4 | from . import connection 5 | 6 | class Pool: 7 | def __init__(self) -> None: ... 8 | def acquire(self, *, timeout: float | None = None) -> PoolAcquireContext: ... 9 | def release(self, connection: connection.Connection) -> None: ... 10 | async def close(self) -> None: ... 11 | def terminate(self) -> None: ... 12 | 13 | # Context manager support 14 | async def __aenter__(self) -> Pool: ... 15 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 16 | 17 | class PoolAcquireContext(AbstractAsyncContextManager["connection.Connection"]): 18 | async def __aenter__(self) -> connection.Connection: ... 19 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 20 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/extensions.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Protocol 2 | 3 | class cursor(Protocol): 4 | def execute(self, query: str, vars: Any | None = None) -> Any: ... 5 | def executemany(self, query: str, vars_list: list[Any]) -> Any: ... 6 | def fetchone(self) -> tuple[Any, ...] | None: ... 7 | def fetchall(self) -> list[tuple[Any, ...]]: ... 8 | def __enter__(self) -> cursor: ... 9 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 10 | 11 | class connection(Protocol): 12 | def cursor(self, cursor_factory: Any | None = None) -> cursor: ... 13 | def commit(self) -> None: ... 14 | def close(self) -> None: ... 15 | def __enter__(self) -> connection: ... 16 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 17 | 18 | def register_uuid(oids: Any | None = None, conn_or_curs: Any | None = None) -> None: ... 
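# Usage sketch, not part of the stub (the DSN below is a placeholder): the connection
# and cursor protocols above are exercised the same way conftest.py further down uses
# the real psycopg2 objects:
#
#   conn = psycopg2.connect("postgres://postgres:postgres@localhost:5432/postgres")
#   with conn.cursor() as cursor:
#       cursor.execute("CREATE SCHEMA IF NOT EXISTS temp;")
#   conn.commit()
#   conn.close()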
19 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import psycopg2 4 | import pytest 5 | 6 | # from dotenv import find_dotenv, load_dotenv 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def setup_env_variables() -> None: 11 | os.environ.clear() 12 | os.environ["TIMESCALE_SERVICE_URL"] = "postgres://postgres:postgres@localhost:5432/postgres" 13 | os.environ["OPENAI_API_KEY"] = "fake key" 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def service_url(setup_env_variables: None) -> str: # noqa: ARG001 18 | # _ = load_dotenv(find_dotenv(), override=True) 19 | return os.environ["TIMESCALE_SERVICE_URL"] 20 | 21 | 22 | @pytest.fixture(scope="module", autouse=True) 23 | def setup_db(service_url: str) -> None: 24 | conn = psycopg2.connect(service_url) 25 | with conn.cursor() as cursor: 26 | cursor.execute("CREATE EXTENSION IF NOT EXISTS ai CASCADE;") 27 | cursor.execute("CREATE SCHEMA IF NOT EXISTS temp;") 28 | conn.commit() 29 | conn.close() 30 | -------------------------------------------------------------------------------- /timescale_vector/typings/langchain_community/vectorstores/timescalevector.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from datetime import timedelta 3 | from typing import Any 4 | 5 | from langchain.docstore.document import Document 6 | from langchain.schema.embeddings import Embeddings 7 | 8 | class TimescaleVector: 9 | def __init__( 10 | self, 11 | collection_name: str, 12 | service_url: str, 13 | embedding: Embeddings, 14 | time_partition_interval: timedelta | None = None, 15 | ) -> None: ... 16 | def add_texts( 17 | self, 18 | texts: Sequence[str], 19 | metadatas: list[dict[str, Any]] | None = None, 20 | ids: list[str] | None = None, 21 | **kwargs: Any, 22 | ) -> list[str]: ... 23 | def delete_by_metadata( 24 | self, 25 | metadata_filter: dict[str, Any] | list[dict[str, Any]], 26 | ) -> None: ... 27 | def similarity_search_with_score( 28 | self, 29 | query: str, 30 | k: int = 4, 31 | filter: dict[str, Any] | list[dict[str, Any]] | None = None, 32 | predicates: Any | None = None, 33 | **kwargs: Any, 34 | ) -> list[tuple[Document, float]]: ... 
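# Usage sketch, not part of the stub: how pg_vectorizer_test.py further down drives
# this interface. The service URL is a placeholder, published_time is a placeholder
# datetime, and OpenAIEmbeddings (used in the tests) is only one possible Embeddings
# implementation:
#
#   store = TimescaleVector(
#       collection_name="blog_embedding",
#       service_url="postgres://postgres:postgres@localhost:5432/postgres",
#       embedding=OpenAIEmbeddings(),
#       time_partition_interval=timedelta(days=30),
#   )
#   store.add_texts(
#       ["first_post"],
#       metadatas=[{"blog_id": 1, "author": "mat"}],
#       ids=[str(client.uuid_from_time(published_time))],  # time-based UUID strings
#   )
#   store.similarity_search_with_score("first", 10)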
35 | -------------------------------------------------------------------------------- /timescale_vector/client/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "SEARCH_RESULT_ID_IDX", 3 | "SEARCH_RESULT_METADATA_IDX", 4 | "SEARCH_RESULT_CONTENTS_IDX", 5 | "SEARCH_RESULT_EMBEDDING_IDX", 6 | "SEARCH_RESULT_DISTANCE_IDX", 7 | "uuid_from_time", 8 | "BaseIndex", 9 | "IvfflatIndex", 10 | "HNSWIndex", 11 | "DiskAnnIndex", 12 | "QueryParams", 13 | "DiskAnnIndexParams", 14 | "IvfflatIndexParams", 15 | "HNSWIndexParams", 16 | "UUIDTimeRange", 17 | "Predicates", 18 | "QueryBuilder", 19 | "Async", 20 | "Sync", 21 | ] 22 | 23 | from timescale_vector.client.async_client import Async 24 | from timescale_vector.client.index import ( 25 | BaseIndex, 26 | DiskAnnIndex, 27 | DiskAnnIndexParams, 28 | HNSWIndex, 29 | HNSWIndexParams, 30 | IvfflatIndex, 31 | IvfflatIndexParams, 32 | QueryParams, 33 | ) 34 | from timescale_vector.client.predicates import Predicates 35 | from timescale_vector.client.query_builder import QueryBuilder 36 | from timescale_vector.client.sync_client import Sync 37 | from timescale_vector.client.utils import uuid_from_time 38 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 39 | 40 | SEARCH_RESULT_ID_IDX = 0 41 | SEARCH_RESULT_METADATA_IDX = 1 42 | SEARCH_RESULT_CONTENTS_IDX = 2 43 | SEARCH_RESULT_EMBEDDING_IDX = 3 44 | SEARCH_RESULT_DISTANCE_IDX = 4 45 | -------------------------------------------------------------------------------- /timescale_vector/typings/asyncpg/connection.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Any 3 | 4 | from . import Record 5 | 6 | class Connection: 7 | # Transaction management 8 | async def execute(self, query: str, *args: Any, timeout: float | None = None) -> str: ... 9 | async def executemany( 10 | self, command: str, args: Sequence[Sequence[Any]], *, timeout: float | None = None 11 | ) -> str: ... 12 | async def fetch(self, query: str, *args: Any, timeout: float | None = None) -> list[Record]: ... 13 | async def fetchval(self, query: str, *args: Any, column: int = 0, timeout: float | None = None) -> Any: ... 14 | async def fetchrow(self, query: str, *args: Any, timeout: float | None = None) -> Record | None: ... 15 | async def set_type_codec( 16 | self, typename: str, *, schema: str = "public", encoder: Any, decoder: Any, format: str = "text" 17 | ) -> None: ... 18 | 19 | # Transaction context 20 | def transaction(self, *, isolation: str = "read_committed") -> Transaction: ... 21 | async def close(self, *, timeout: float | None = None) -> None: ... 22 | 23 | class Transaction: 24 | async def __aenter__(self) -> Transaction: ... 25 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 26 | async def start(self) -> None: ... 27 | async def commit(self) -> None: ... 28 | async def rollback(self) -> None: ... 29 | -------------------------------------------------------------------------------- /timescale_vector/typings/asyncpg/__init__.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Any, Protocol, TypeVar 3 | 4 | from . import connection, pool 5 | 6 | # Core types 7 | T = TypeVar("T") 8 | 9 | class Record(Protocol): 10 | def __getitem__(self, key: int | str) -> Any: ... 11 | def __iter__(self) -> Any: ... 12 | def __len__(self) -> int: ... 
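    # Note (assumption inferred from the constant names): rows returned by this
    # library's search APIs are Records like this one, indexed either by position
    # with the SEARCH_RESULT_*_IDX constants from timescale_vector.client
    # (e.g. row[SEARCH_RESULT_ID_IDX]) or by column name (e.g. row["id"]).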
13 | def get(self, key: str, default: T = None) -> T | None: ... 14 | def keys(self) -> Sequence[str]: ... 15 | def values(self) -> Sequence[Any]: ... 16 | def items(self) -> Sequence[tuple[str, Any]]: ... 17 | 18 | # Allow dictionary-style access to fields 19 | def __getattr__(self, name: str) -> Any: ... 20 | 21 | # Re-exports 22 | Connection = connection.Connection 23 | Pool = pool.Pool 24 | Record = Record 25 | 26 | # Functions 27 | async def connect( 28 | dsn: str | None = None, 29 | *, 30 | host: str | None = None, 31 | port: int | None = None, 32 | user: str | None = None, 33 | password: str | None = None, 34 | database: str | None = None, 35 | timeout: int = 60, 36 | ) -> Connection: ... 37 | async def create_pool( 38 | dsn: str | None = None, 39 | *, 40 | min_size: int = 10, 41 | max_size: int = 10, 42 | max_queries: int = 50000, 43 | max_inactive_connection_lifetime: float = 300.0, 44 | setup: Any | None = None, 45 | init: Any | None = None, 46 | **connect_kwargs: Any, 47 | ) -> Pool: ... 48 | -------------------------------------------------------------------------------- /timescale_vector/typings/vcr.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import Any, Literal, Protocol, TypeAlias, TypeVar, overload 3 | 4 | _T = TypeVar("_T") 5 | _F = TypeVar("_F", bound=Callable[..., Any]) 6 | 7 | class VCRConfig(Protocol): 8 | filter_headers: list[str] 9 | ignore_localhost: bool 10 | ignore_hosts: list[str] 11 | record_mode: Literal["once", "new_episodes", "none", "all"] 12 | match_on: list[str] 13 | 14 | class _Cassette: 15 | def __init__(self, path: str) -> None: ... 16 | def play_response(self, request: Any) -> Any: ... 17 | def append(self, request: Any, response: Any) -> None: ... 18 | def responses_of(self, request: Any) -> list[Any]: ... 19 | 20 | class VCR: 21 | def __init__(self, **kwargs: Any) -> None: ... 22 | @overload 23 | def use_cassette(self, path: str) -> Callable[[_F], _F]: ... 24 | @overload 25 | def use_cassette(self, path: str, **kwargs: Any) -> Callable[[_F], _F]: ... 26 | def record_mode(self) -> str: ... 27 | def turn_off(self, *, allow_playback: bool = ...) -> None: ... 28 | def turn_on(self) -> None: ... 29 | def serialize(self) -> dict[str, Any]: ... 30 | 31 | @overload 32 | def use_cassette(path: str) -> Callable[[_F], _F]: ... 33 | @overload 34 | def use_cassette(path: str, **kwargs: Any) -> Callable[[_F], _F]: ... 35 | def use_cassette(path: str, **kwargs: Any) -> _Cassette: ... 36 | 37 | default_vcr: VCR 38 | 39 | class VCRError(Exception): ... 40 | class CannotOverwriteExistingCassetteException(VCRError): ... 41 | class UnhandledHTTPRequestError(VCRError): ... 42 | 43 | # Common kwargs for reference (these aren't actually part of the type system) 44 | COMMON_KWARGS: TypeAlias = Literal[ 45 | "record_mode", # : Literal["once", "new_episodes", "none", "all"] 46 | "match_on", # : list[str] - e.g. 
["uri", "method", "body"] 47 | "filter_headers", # : list[str] - headers to filter out 48 | "before_record_response", # : Callable[[Any], Any] 49 | "before_record_request", # : Callable[[Any], Any] 50 | "ignore_localhost", # : bool 51 | "ignore_hosts", # : list[str] 52 | ] 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _docs/ 2 | _proc/ 3 | 4 | *.bak 5 | .gitattributes 6 | .last_checked 7 | .gitconfig 8 | *.bak 9 | *.log 10 | *~ 11 | ~* 12 | _tmp* 13 | tmp* 14 | tags 15 | *.pkg 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | env/ 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # dotenv 99 | .env 100 | 101 | # virtualenv 102 | .venv 103 | venv/ 104 | ENV/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | 119 | .vscode 120 | *.swp 121 | 122 | # osx generated files 123 | .DS_Store 124 | .DS_Store? 
125 | .Trashes 126 | ehthumbs.db 127 | Thumbs.db 128 | .idea 129 | 130 | # pytest 131 | .pytest_cache 132 | 133 | # tools/trust-doc-nbs 134 | docs_src/.last_checked 135 | 136 | # symlinks to fastai 137 | docs_src/fastai 138 | tools/fastai 139 | 140 | # link checker 141 | checklink/cookies.txt 142 | 143 | # .gitconfig is now autogenerated 144 | .gitconfig 145 | 146 | # Quarto installer 147 | .deb 148 | .pkg 149 | 150 | # Quarto 151 | .quarto 152 | token 153 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "timescale-vector" 3 | version = "0.0.9" 4 | description = "Python library for storing vector data in Postgres" 5 | authors = [ 6 | {name = "Matvey Arye", email = "mat@timescale.com"}, 7 | ] 8 | requires-python = ">=3.10" 9 | license = {text = "Apache-2.0"} 10 | readme = "README.md" 11 | classifiers = [ 12 | "Development Status :: 3 - Alpha", 13 | "Intended Audience :: Developers", 14 | "License :: OSI Approved :: Apache Software License", 15 | "Programming Language :: Python :: 3.10", 16 | ] 17 | 18 | dependencies = [ 19 | "python-dotenv>=1.0.1", 20 | "asyncpg>=0.29.0", 21 | "psycopg2>=2.9.9", 22 | "pgvector>=0.3.5", 23 | "numpy>=1,<2", 24 | ] 25 | 26 | [project.urls] 27 | repository = "https://github.com/timescale/python-vector" 28 | documentation = "https://timescale.github.io/python-vector" 29 | 30 | [build-system] 31 | requires = ["hatchling"] 32 | build-backend = "hatchling.build" 33 | 34 | [tool.hatch.build.targets.wheel] 35 | packages = ["timescale_vector"] 36 | 37 | [tool.pytest.ini_options] 38 | addopts = [ 39 | "--import-mode=importlib", 40 | ] 41 | 42 | 43 | [tool.mypy] 44 | strict = true 45 | ignore_missing_imports = true 46 | namespace_packages = true 47 | 48 | [tool.pyright] 49 | typeCheckingMode = "strict" 50 | stubPath = "timescale_vector/typings" 51 | 52 | [tool.ruff] 53 | line-length = 120 54 | indent-width = 4 55 | output-format = "grouped" 56 | target-version = "py310" 57 | 58 | exclude = [ 59 | ".bzr", 60 | ".direnv", 61 | ".eggs", 62 | ".git", 63 | ".git-rewrite", 64 | ".hg", 65 | ".ipynb_checkpoints", 66 | ".mypy_cache", 67 | ".nox", 68 | ".pants.d", 69 | ".pyenv", 70 | ".pytest_cache", 71 | ".pytype", 72 | ".ruff_cache", 73 | ".svn", 74 | ".tox", 75 | ".venv", 76 | ".vscode", 77 | "__pypackages__", 78 | "_build", 79 | "buck-out", 80 | "build", 81 | "dist", 82 | "node_modules", 83 | "site-packages", 84 | "venv", 85 | "nbs" 86 | ] 87 | 88 | [tool.ruff.format] 89 | docstring-code-format = true 90 | quote-style = "double" 91 | indent-style = "space" 92 | skip-magic-trailing-comma = false 93 | line-ending = "auto" 94 | 95 | [tool.ruff.lint] 96 | select = [ 97 | "E", 98 | "F", 99 | "UP", 100 | "B", 101 | "SIM", 102 | "I", 103 | "ARG", 104 | "W291", 105 | "PIE", 106 | "Q" 107 | ] 108 | 109 | [tool.uv] 110 | dev-dependencies = [ 111 | "ruff>=0.6.9", 112 | "pytest>=8.3.3", 113 | "langchain>=0.3.3", 114 | "langchain-openai>=0.2.2", 115 | "langchain-community>=0.3.2", 116 | "pandas>=2.2.3", 117 | "pytest-asyncio>=0.24.0", 118 | "pyright>=1.1.386", 119 | "vcrpy>=6.0.2", 120 | ] 121 | -------------------------------------------------------------------------------- /timescale_vector/client/utils.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | import random 3 | import uuid 4 | from datetime import datetime, timezone 5 | from typing import Any 6 | 7 | 8 | # copied 
from Cassandra: https://docs.datastax.com/en/drivers/python/3.2/_modules/cassandra/util.html#uuid_from_time 9 | def uuid_from_time( 10 | time_arg: float | datetime | None = None, node: Any = None, clock_seq: int | None = None 11 | ) -> uuid.UUID: 12 | """ 13 | Converts a datetime or timestamp to a type 1 `uuid.UUID`. 14 | 15 | Parameters 16 | ---------- 17 | time_arg 18 | The time to use for the timestamp portion of the UUID. 19 | This can either be a `datetime` object or a timestamp in seconds 20 | (as returned from `time.time()`). 21 | node 22 | Bytes for the UUID (up to 48 bits). If not specified, this 23 | field is randomized. 24 | clock_seq 25 | Clock sequence field for the UUID (up to 14 bits). If not specified, 26 | a random sequence is generated. 27 | 28 | Returns 29 | ------- 30 | uuid.UUID: For the given time, node, and clock sequence 31 | """ 32 | if time_arg is None: 33 | return uuid.uuid1(node, clock_seq) 34 | if isinstance(time_arg, datetime): 35 | # this is different from the Cassandra version, 36 | # we assume that a naive datetime is in system time and convert it to UTC 37 | # we do this because naive datetimes are interpreted as timestamps (without timezone) in postgres 38 | time_arg_dt: datetime = time_arg # type: ignore 39 | if time_arg_dt.tzinfo is None: 40 | time_arg_dt = time_arg_dt.astimezone(timezone.utc) 41 | seconds = int(calendar.timegm(time_arg_dt.utctimetuple())) 42 | microseconds = (seconds * 1e6) + time_arg_dt.time().microsecond 43 | else: 44 | microseconds = int(float(time_arg) * 1e6) 45 | 46 | # 0x01b21dd213814000 is the number of 100-ns intervals between the 47 | # UUID epoch 1582-10-15 00:00:00 and the Unix epoch 1970-01-01 00:00:00. 48 | intervals = int(microseconds * 10) + 0x01B21DD213814000 49 | 50 | time_low = intervals & 0xFFFFFFFF 51 | time_mid = (intervals >> 32) & 0xFFFF 52 | time_hi_version = (intervals >> 48) & 0x0FFF 53 | 54 | if clock_seq is None: 55 | clock_seq = random.getrandbits(14) 56 | else: 57 | if clock_seq > 0x3FFF: 58 | raise ValueError("clock_seq is out of range (need a 14-bit value)") 59 | 60 | clock_seq_low = clock_seq & 0xFF 61 | clock_seq_hi_variant = 0x80 | ((clock_seq >> 8) & 0x3F) 62 | 63 | if node is None: 64 | node = random.getrandbits(48) 65 | 66 | return uuid.UUID( 67 | fields=( 68 | time_low, 69 | time_mid, 70 | time_hi_version, 71 | clock_seq_hi_variant, 72 | clock_seq_low, 73 | node, 74 | ), 75 | version=1, 76 | ) 77 | -------------------------------------------------------------------------------- /timescale_vector/client/uuid_time_range.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from typing import Any 3 | 4 | 5 | class UUIDTimeRange: 6 | @staticmethod 7 | def _parse_datetime(input_datetime: datetime | str | None | Any) -> datetime | None: 8 | """ 9 | Parse a datetime object or string representation of a datetime. 10 | 11 | Args: 12 | input_datetime (datetime or str): Input datetime or string. 13 | 14 | Returns: 15 | datetime: Parsed datetime object. 16 | 17 | Raises: 18 | ValueError: If the input cannot be parsed as a datetime. 
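        Example:
            >>> UUIDTimeRange._parse_datetime("2021-01-01T00:00:00")
            datetime.datetime(2021, 1, 1, 0, 0)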
19 | """ 20 | if input_datetime is None or input_datetime == "None": 21 | return None 22 | 23 | if isinstance(input_datetime, datetime): 24 | # If input is already a datetime object, return it as is 25 | return input_datetime 26 | 27 | if isinstance(input_datetime, str): 28 | try: 29 | # Attempt to parse the input string into a datetime 30 | return datetime.fromisoformat(input_datetime) 31 | except ValueError: 32 | raise ValueError(f"Invalid datetime string format: {input_datetime}") from None 33 | 34 | raise ValueError("Input must be a datetime object or string") 35 | 36 | def __init__( 37 | self, 38 | start_date: datetime | str | None = None, 39 | end_date: datetime | str | None = None, 40 | time_delta: timedelta | None = None, 41 | start_inclusive: bool = True, 42 | end_inclusive: bool = False, 43 | ): 44 | """ 45 | A UUIDTimeRange is a time range predicate on the UUID Version 1 timestamps. 46 | 47 | Note that naive datetime objects are interpreted as local time on the python client side 48 | and converted to UTC before being sent to the database. 49 | """ 50 | start_date = UUIDTimeRange._parse_datetime(start_date) 51 | end_date = UUIDTimeRange._parse_datetime(end_date) 52 | 53 | if start_date is not None and end_date is not None and start_date > end_date: 54 | raise Exception("start_date must be before end_date") 55 | 56 | if start_date is None and end_date is None: 57 | raise Exception("start_date and end_date cannot both be None") 58 | 59 | if start_date is not None and start_date.tzinfo is None: 60 | start_date = start_date.astimezone(timezone.utc) 61 | 62 | if end_date is not None and end_date.tzinfo is None: 63 | end_date = end_date.astimezone(timezone.utc) 64 | 65 | if time_delta is not None: 66 | if end_date is None and start_date is not None: 67 | end_date = start_date + time_delta 68 | elif start_date is None and end_date is not None: 69 | start_date = end_date - time_delta 70 | else: 71 | raise Exception("time_delta, start_date and end_date cannot all be specified at the same time") 72 | 73 | self.start_date: datetime | None = start_date 74 | self.end_date: datetime | None = end_date 75 | self.start_inclusive: bool = start_inclusive 76 | self.end_inclusive: bool = end_inclusive 77 | 78 | def __str__(self) -> str: 79 | start_str = f"[{self.start_date}" if self.start_inclusive else f"({self.start_date}" 80 | end_str = f"{self.end_date}]" if self.end_inclusive else f"{self.end_date})" 81 | 82 | return f"UUIDTimeRange {start_str}, {end_str}" 83 | 84 | def build_query(self, params: list[Any]) -> tuple[str, list[Any]]: 85 | column = "uuid_timestamp(id)" 86 | queries: list[str] = [] 87 | if self.start_date is not None: 88 | if self.start_inclusive: 89 | queries.append(f"{column} >= ${len(params)+1}") 90 | else: 91 | queries.append(f"{column} > ${len(params)+1}") 92 | params.append(self.start_date) 93 | if self.end_date is not None: 94 | if self.end_inclusive: 95 | queries.append(f"{column} <= ${len(params)+1}") 96 | else: 97 | queries.append(f"{column} < ${len(params)+1}") 98 | params.append(self.end_date) 99 | return " AND ".join(queries), params 100 | -------------------------------------------------------------------------------- /timescale_vector/pgvectorizer.py: -------------------------------------------------------------------------------- 1 | # pyright: reportPrivateUsage=false 2 | __all__ = ["Vectorize"] 3 | 4 | import re 5 | from collections.abc import Callable 6 | from typing import Any 7 | 8 | import psycopg2.extras 9 | import psycopg2.pool 10 | 11 | from . 
import client 12 | 13 | 14 | def _create_ident(base: str, suffix: str) -> str: 15 | if len(base) + len(suffix) > 62: 16 | base = base[: 62 - len(suffix)] 17 | return re.sub(r"[^a-zA-Z0-9_]", "_", f"{base}_{suffix}") 18 | 19 | 20 | class Vectorize: 21 | def __init__( 22 | self, 23 | service_url: str, 24 | table_name: str, 25 | schema_name: str = "public", 26 | id_column_name: str = "id", 27 | work_queue_table_name: str | None = None, 28 | trigger_name: str = "track_changes_for_embedding", 29 | trigger_name_fn: str | None = None, 30 | ) -> None: 31 | self.service_url = service_url 32 | self.table_name_unquoted = table_name 33 | self.schema_name_unquoted = schema_name 34 | self.table_name = client.QueryBuilder._quote_ident(table_name) 35 | self.schema_name = client.QueryBuilder._quote_ident(schema_name) 36 | self.id_column_name = client.QueryBuilder._quote_ident(id_column_name) 37 | if work_queue_table_name is None: 38 | work_queue_table_name = _create_ident(table_name, "embedding_work_queue") 39 | self.work_queue_table_name = client.QueryBuilder._quote_ident(work_queue_table_name) 40 | 41 | self.trigger_name = client.QueryBuilder._quote_ident(trigger_name) 42 | 43 | if trigger_name_fn is None: 44 | trigger_name_fn = _create_ident(table_name, "wq_for_embedding") 45 | self.trigger_name_fn = client.QueryBuilder._quote_ident(trigger_name_fn) 46 | 47 | def register(self) -> None: 48 | with psycopg2.connect(self.service_url) as conn, conn.cursor() as cursor: 49 | cursor.execute(f""" 50 | SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}') is not null; 51 | """) 52 | table_exists = cursor.fetchone() 53 | if table_exists and table_exists[0]: 54 | return 55 | 56 | cursor.execute(f""" 57 | CREATE TABLE {self.schema_name}.{self.work_queue_table_name} ( 58 | id int 59 | ); 60 | 61 | CREATE INDEX ON {self.schema_name}.{self.work_queue_table_name}(id); 62 | 63 | CREATE OR REPLACE FUNCTION {self.schema_name}.{self.trigger_name_fn}() 64 | RETURNS TRIGGER LANGUAGE PLPGSQL AS $$ 65 | BEGIN 66 | IF (TG_OP = 'DELETE') THEN 67 | INSERT INTO {self.work_queue_table_name} 68 | VALUES (OLD.{self.id_column_name}); 69 | ELSE 70 | INSERT INTO {self.work_queue_table_name} 71 | VALUES (NEW.{self.id_column_name}); 72 | END IF; 73 | RETURN NULL; 74 | END; 75 | $$; 76 | 77 | CREATE TRIGGER {self.trigger_name} 78 | AFTER INSERT OR UPDATE OR DELETE 79 | ON {self.schema_name}.{self.table_name} 80 | FOR EACH ROW EXECUTE PROCEDURE {self.schema_name}.{self.trigger_name_fn}(); 81 | 82 | INSERT INTO {self.schema_name}.{self.work_queue_table_name} SELECT {self.id_column_name} 83 | FROM {self.schema_name}.{self.table_name}; 84 | """) 85 | 86 | def process( 87 | self, 88 | embed_and_write_cb: Callable[[list[Any], "Vectorize"], None], 89 | batch_size: int = 10, 90 | autoregister: bool = True, 91 | ) -> int: 92 | if autoregister: 93 | self.register() 94 | 95 | with ( 96 | psycopg2.connect(self.service_url) as conn, 97 | conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor, 98 | ): 99 | cursor.execute(f""" 100 | SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}')::oid; 101 | """) 102 | table_oid = cursor.fetchone() 103 | if table_oid is None: 104 | return 0 105 | 106 | cursor.execute(f""" 107 | WITH selected_rows AS ( 108 | SELECT id 109 | FROM {self.schema_name}.{self.work_queue_table_name} 110 | LIMIT {int(batch_size)} 111 | FOR UPDATE SKIP LOCKED 112 | ), 113 | locked_items AS ( 114 | SELECT id, pg_try_advisory_xact_lock({int(table_oid[0])}, id) AS locked 115 | FROM (SELECT DISTINCT id 
FROM selected_rows ORDER BY id) as ids 116 | ), 117 | deleted_rows AS ( 118 | DELETE FROM {self.schema_name}.{self.work_queue_table_name} 119 | WHERE id IN (SELECT id FROM locked_items WHERE locked = true ORDER BY id) 120 | ) 121 | SELECT locked_items.id as locked_id, {self.table_name}.* 122 | FROM locked_items 123 | LEFT JOIN {self.schema_name}.{self.table_name} 124 | ON {self.table_name}.{self.id_column_name} = locked_items.id 125 | WHERE locked = true 126 | ORDER BY locked_items.id 127 | """) 128 | res = cursor.fetchall() 129 | if len(res) > 0: 130 | embed_and_write_cb(res, self) 131 | return len(res) 132 | -------------------------------------------------------------------------------- /timescale_vector/client/index.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections.abc import Callable 3 | from typing import Any 4 | 5 | from typing_extensions import override 6 | 7 | 8 | class BaseIndex: 9 | def get_index_method(self, distance_type: str) -> str: 10 | index_method = "invalid" 11 | if distance_type == "<->": 12 | index_method = "vector_l2_ops" 13 | elif distance_type == "<#>": 14 | index_method = "vector_ip_ops" 15 | elif distance_type == "<=>": 16 | index_method = "vector_cosine_ops" 17 | else: 18 | raise ValueError(f"Unknown distance type {distance_type}") 19 | return index_method 20 | 21 | def create_index_query( 22 | self, 23 | table_name_quoted: str, 24 | column_name_quoted: str, 25 | index_name_quoted: str, 26 | distance_type: str, 27 | num_records_callback: Callable[[], int], 28 | ) -> str: 29 | raise NotImplementedError() 30 | 31 | 32 | class IvfflatIndex(BaseIndex): 33 | def __init__(self, num_records: int | None = None, num_lists: int | None = None) -> None: 34 | """ 35 | Pgvector's ivfflat index. 36 | """ 37 | self.num_records: int | None = num_records 38 | self.num_lists: int | None = num_lists 39 | 40 | def get_num_records(self, num_record_callback: Callable[[], int]) -> int: 41 | if self.num_records is not None: 42 | return self.num_records 43 | return num_record_callback() 44 | 45 | def get_num_lists(self, num_records_callback: Callable[[], int]) -> int: 46 | if self.num_lists is not None: 47 | return self.num_lists 48 | 49 | num_records = self.get_num_records(num_records_callback) 50 | num_lists = num_records / 1000 51 | if num_lists < 10: 52 | num_lists = 10 53 | if num_records > 1000000: 54 | num_lists = math.sqrt(num_records) 55 | return int(num_lists) 56 | 57 | def create_index_query( 58 | self, 59 | table_name_quoted: str, 60 | column_name_quoted: str, 61 | index_name_quoted: str, 62 | distance_type: str, 63 | num_records_callback: Callable[[], int], 64 | ) -> str: 65 | index_method = self.get_index_method(distance_type) 66 | num_lists = self.get_num_lists(num_records_callback) 67 | 68 | return ( 69 | f"CREATE INDEX {index_name_quoted} ON {table_name_quoted}" 70 | f"USING ivfflat ({column_name_quoted} {index_method}) WITH (lists = {num_lists});" 71 | ) 72 | 73 | 74 | class HNSWIndex(BaseIndex): 75 | def __init__(self, m: int | None = None, ef_construction: int | None = None) -> None: 76 | """ 77 | Pgvector's hnsw index. 
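        `m` (maximum connections per graph layer) and `ef_construction` (size of the
        candidate list used while building the graph) map onto pgvector's hnsw build
        options; any parameter left as None is simply omitted from the WITH clause
        below, so the server-side default applies.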
78 | """ 79 | self.m: int | None = m 80 | self.ef_construction: int | None = ef_construction 81 | 82 | @override 83 | def create_index_query( 84 | self, 85 | table_name_quoted: str, 86 | column_name_quoted: str, 87 | index_name_quoted: str, 88 | distance_type: str, 89 | num_records_callback: Callable[[], int], 90 | ) -> str: 91 | index_method = self.get_index_method(distance_type) 92 | 93 | with_clauses: list[str] = [] 94 | if self.m is not None: 95 | with_clauses.append(f"m = {self.m}") 96 | if self.ef_construction is not None: 97 | with_clauses.append(f"ef_construction = {self.ef_construction}") 98 | 99 | with_clause = "" 100 | if len(with_clauses) > 0: 101 | with_clause = "WITH (" + ", ".join(with_clauses) + ")" 102 | 103 | return ( 104 | f"CREATE INDEX {index_name_quoted} ON {table_name_quoted}" 105 | f"USING hnsw ({column_name_quoted} {index_method}) {with_clause};" 106 | ) 107 | 108 | 109 | class DiskAnnIndex(BaseIndex): 110 | def __init__( 111 | self, 112 | search_list_size: int | None = None, 113 | num_neighbors: int | None = None, 114 | max_alpha: float | None = None, 115 | storage_layout: str | None = None, 116 | num_dimensions: int | None = None, 117 | num_bits_per_dimension: int | None = None, 118 | ) -> None: 119 | """ 120 | Timescale's vector index. 121 | """ 122 | self.search_list_size: int | None = search_list_size 123 | self.num_neighbors: int | None = num_neighbors 124 | self.max_alpha: float | None = max_alpha 125 | self.storage_layout: str | None = storage_layout 126 | self.num_dimensions: int | None = num_dimensions 127 | self.num_bits_per_dimension: int | None = num_bits_per_dimension 128 | 129 | @override 130 | def create_index_query( 131 | self, 132 | table_name_quoted: str, 133 | column_name_quoted: str, 134 | index_name_quoted: str, 135 | distance_type: str, 136 | num_records_callback: Callable[[], int], 137 | ) -> str: 138 | if distance_type != "<=>": 139 | raise ValueError( 140 | f"Timescale's vector index only supports cosine distance, but distance_type was {distance_type}" 141 | ) 142 | 143 | with_clauses: list[str] = [] 144 | if self.search_list_size is not None: 145 | with_clauses.append(f"search_list_size = {self.search_list_size}") 146 | if self.num_neighbors is not None: 147 | with_clauses.append(f"num_neighbors = {self.num_neighbors}") 148 | if self.max_alpha is not None: 149 | with_clauses.append(f"max_alpha = {self.max_alpha}") 150 | if self.storage_layout is not None: 151 | with_clauses.append(f"storage_layout = {self.storage_layout}") 152 | if self.num_dimensions is not None: 153 | with_clauses.append(f"num_dimensions = {self.num_dimensions}") 154 | if self.num_bits_per_dimension is not None: 155 | with_clauses.append(f"num_bits_per_dimension = {self.num_bits_per_dimension}") 156 | 157 | with_clause = "" 158 | if len(with_clauses) > 0: 159 | with_clause = "WITH (" + ", ".join(with_clauses) + ")" 160 | 161 | return ( 162 | f"CREATE INDEX {index_name_quoted} ON {table_name_quoted}" 163 | f"USING diskann ({column_name_quoted}) {with_clause};" 164 | ) 165 | 166 | 167 | class QueryParams: 168 | def __init__(self, params: dict[str, Any]) -> None: 169 | self.params: dict[str, Any] = params 170 | 171 | def get_statements(self) -> list[str]: 172 | return ["SET LOCAL " + key + " = " + str(value) for key, value in self.params.items()] 173 | 174 | 175 | class DiskAnnIndexParams(QueryParams): 176 | def __init__(self, search_list_size: int | None = None, rescore: int | None = None) -> None: 177 | params: dict[str, Any] = {} 178 | if search_list_size is not None: 
179 | params["diskann.query_search_list_size"] = search_list_size 180 | if rescore is not None: 181 | params["diskann.query_rescore"] = rescore 182 | super().__init__(params) 183 | 184 | 185 | class IvfflatIndexParams(QueryParams): 186 | def __init__(self, probes: int) -> None: 187 | super().__init__({"ivfflat.probes": probes}) 188 | 189 | 190 | class HNSWIndexParams(QueryParams): 191 | def __init__(self, ef_search: int) -> None: 192 | super().__init__({"hnsw.ef_search": ef_search}) 193 | -------------------------------------------------------------------------------- /tests/pg_vectorizer_test.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from typing import Any 3 | 4 | import psycopg2 5 | from langchain.docstore.document import Document 6 | from langchain.text_splitter import CharacterTextSplitter 7 | from langchain_community.vectorstores.timescalevector import TimescaleVector 8 | from langchain_openai import OpenAIEmbeddings 9 | 10 | from tests.utils import http_recorder 11 | from timescale_vector import client 12 | from timescale_vector.pgvectorizer import Vectorize 13 | 14 | 15 | def get_document(blog: dict[str, Any]) -> list[Document]: 16 | text_splitter = CharacterTextSplitter( 17 | chunk_size=1000, 18 | chunk_overlap=200, 19 | ) 20 | docs: list[Document] = [] 21 | for chunk in text_splitter.split_text(blog["contents"]): 22 | content = f"Author {blog['author']}, title: {blog['title']}, contents:{chunk}" 23 | metadata = { 24 | "id": str(client.uuid_from_time(blog["published_time"])), 25 | "blog_id": blog["id"], 26 | "author": blog["author"], 27 | "category": blog["category"], 28 | "published_time": blog["published_time"].isoformat(), 29 | } 30 | docs.append(Document(page_content=content, metadata=metadata)) 31 | return docs 32 | 33 | 34 | @http_recorder.use_cassette("pg_vectorizer.yaml") 35 | def test_pg_vectorizer(service_url: str) -> None: 36 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 37 | for item in ["blog", "blog_embedding_work_queue", "blog_embedding"]: 38 | cursor.execute(f"DROP TABLE IF EXISTS {item};") 39 | 40 | for item in ["public", "test"]: 41 | cursor.execute(f"DROP SCHEMA IF EXISTS {item} CASCADE;") 42 | cursor.execute(f"CREATE SCHEMA {item};") 43 | 44 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 45 | cursor.execute(""" 46 | CREATE TABLE IF NOT EXISTS blog ( 47 | id SERIAL PRIMARY KEY NOT NULL, 48 | title TEXT NOT NULL, 49 | author TEXT NOT NULL, 50 | contents TEXT NOT NULL, 51 | category TEXT NOT NULL, 52 | published_time TIMESTAMPTZ NULL --NULL if not yet published 53 | ); 54 | """) 55 | cursor.execute(""" 56 | insert into blog (title, author, contents, category, published_time) 57 | VALUES ('first', 'mat', 'first_post', 'personal', '2021-01-01'); 58 | """) 59 | 60 | def embed_and_write(blog_instances: list[Any], vectorizer: Vectorize) -> None: 61 | TABLE_NAME = vectorizer.table_name_unquoted + "_embedding" 62 | embedding = OpenAIEmbeddings() 63 | vector_store = TimescaleVector( 64 | collection_name=TABLE_NAME, 65 | service_url=service_url, 66 | embedding=embedding, 67 | time_partition_interval=timedelta(days=30), 68 | ) 69 | 70 | # delete old embeddings for all ids in the work queue 71 | metadata_for_delete = [{"blog_id": blog["locked_id"]} for blog in blog_instances] 72 | vector_store.delete_by_metadata(metadata_for_delete) 73 | 74 | documents: list[Document] = [] 75 | for blog in blog_instances: 76 | # skip blogs that are not published 
yet, or are deleted (will be None because of left join) 77 | if blog["published_time"] is not None: 78 | documents.extend(get_document(blog)) 79 | 80 | if len(documents) == 0: 81 | return 82 | 83 | texts = [d.page_content for d in documents] 84 | metadatas = [d.metadata for d in documents] 85 | ids = [d.metadata["id"] for d in documents] 86 | vector_store.add_texts(texts, metadatas, ids) 87 | 88 | vectorizer = Vectorize(service_url, "blog") 89 | vectorizer.register() 90 | # should be idempotent 91 | vectorizer.register() 92 | 93 | assert vectorizer.process(embed_and_write) == 1 94 | assert vectorizer.process(embed_and_write) == 0 95 | 96 | TABLE_NAME = "blog_embedding" 97 | embedding = OpenAIEmbeddings() 98 | vector_store = TimescaleVector( 99 | collection_name=TABLE_NAME, 100 | service_url=service_url, 101 | embedding=embedding, 102 | time_partition_interval=timedelta(days=30), 103 | ) 104 | 105 | res = vector_store.similarity_search_with_score("first", 10) 106 | assert len(res) == 1 107 | 108 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 109 | cursor.execute(""" 110 | insert into blog 111 | (title, author, contents, category, published_time) 112 | VALUES 113 | ('2', 'mat', 'second_post', 'personal', '2021-01-01'); 114 | insert into blog 115 | (title, author, contents, category, published_time) 116 | VALUES 117 | ('3', 'mat', 'third_post', 'personal', '2021-01-01'); 118 | """) 119 | assert vectorizer.process(embed_and_write) == 2 120 | assert vectorizer.process(embed_and_write) == 0 121 | 122 | res = vector_store.similarity_search_with_score("first", 10) 123 | assert len(res) == 3 124 | 125 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 126 | cursor.execute(""" 127 | DELETE FROM blog WHERE title = '3'; 128 | """) 129 | assert vectorizer.process(embed_and_write) == 1 130 | assert vectorizer.process(embed_and_write) == 0 131 | res = vector_store.similarity_search_with_score("first", 10) 132 | assert len(res) == 2 133 | 134 | res = vector_store.similarity_search_with_score("second", 10) 135 | assert len(res) == 2 136 | content = res[0][0].page_content 137 | assert "new version" not in content 138 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 139 | cursor.execute(""" 140 | update blog set contents = 'second post new version' WHERE title = '2'; 141 | """) 142 | assert vectorizer.process(embed_and_write) == 1 143 | assert vectorizer.process(embed_and_write) == 0 144 | res = vector_store.similarity_search_with_score("second", 10) 145 | assert len(res) == 2 146 | content = res[0][0].page_content 147 | assert "new version" in content 148 | 149 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 150 | cursor.execute(""" 151 | CREATE TABLE IF NOT EXISTS test.blog_table_name_that_is_really_really_long_and_i_mean_long ( 152 | id SERIAL PRIMARY KEY NOT NULL, 153 | title TEXT NOT NULL, 154 | author TEXT NOT NULL, 155 | contents TEXT NOT NULL, 156 | category TEXT NOT NULL, 157 | published_time TIMESTAMPTZ NULL --NULL if not yet published 158 | ); 159 | """) 160 | cursor.execute(""" 161 | insert into test.blog_table_name_that_is_really_really_long_and_i_mean_long 162 | (title, author, contents, category, published_time) 163 | VALUES 164 | ('first', 'mat', 'first_post', 'personal', '2021-01-01'); 165 | """) 166 | 167 | vectorizer = Vectorize( 168 | service_url, 169 | "blog_table_name_that_is_really_really_long_and_i_mean_long", 170 | schema_name="test", 171 | ) 172 | assert vectorizer.process(embed_and_write) == 1 
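    # the single row inserted above is picked up from the work queue on the first
    # pass; the queue is then empty, so a second pass processes nothing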
173 | assert vectorizer.process(embed_and_write) == 0 174 | -------------------------------------------------------------------------------- /timescale_vector/client/predicates.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from typing import Any, Literal, Union, get_args, get_origin 4 | 5 | 6 | def get_runtime_types(typ) -> tuple[type, ...]: # type: ignore 7 | """Convert a type with generic parameters to runtime types. 8 | Necessary because Generic types cant be passed to isinstance in python 3.10""" 9 | return tuple(get_origin(t) or t for t in get_args(typ)) # type: ignore 10 | 11 | 12 | class Predicates: 13 | logical_operators: dict[str, str] = { 14 | "AND": "AND", 15 | "OR": "OR", 16 | "NOT": "NOT", 17 | } 18 | 19 | operators_mapping: dict[str, str] = { 20 | "=": "=", 21 | "==": "=", 22 | ">=": ">=", 23 | ">": ">", 24 | "<=": "<=", 25 | "<": "<", 26 | "!=": "<>", 27 | "@>": "@>", # array contains 28 | } 29 | 30 | PredicateValue = str | int | float | datetime | list[Any] | tuple[Any] 31 | 32 | def __init__( 33 | self, 34 | *clauses: Union[ 35 | "Predicates", 36 | tuple[str, PredicateValue], 37 | tuple[str, str, PredicateValue], 38 | str, 39 | PredicateValue, 40 | ], 41 | operator: Literal["AND", "OR", "NOT"] = "AND", 42 | ): 43 | """ 44 | Predicates class defines predicates on the object metadata. 45 | Predicates can be combined using logical operators (&, |, and ~). 46 | 47 | Parameters 48 | ---------- 49 | clauses 50 | Predicate clauses. Can be either another Predicates object 51 | or a tuple of the form (field, operator, value) or (field, value). 52 | Operator 53 | Logical operator to use when combining the clauses. 54 | Can be one of 'AND', 'OR', 'NOT'. Defaults to 'AND'. 55 | """ 56 | if operator not in self.logical_operators: 57 | raise ValueError(f"invalid operator: {operator}") 58 | self.operator: str = operator 59 | if isinstance(clauses[0], str): 60 | if len(clauses) != 3 or not ( 61 | isinstance(clauses[1], str) and isinstance(clauses[2], get_runtime_types(self.PredicateValue)) 62 | ): 63 | raise ValueError(f"Invalid clause format: {clauses}") 64 | self.clauses = [clauses] 65 | else: 66 | self.clauses = list(clauses) 67 | 68 | def add_clause( 69 | self, 70 | *clause: Union[ 71 | "Predicates", 72 | tuple[str, PredicateValue], 73 | tuple[str, str, PredicateValue], 74 | str, 75 | PredicateValue, 76 | ], 77 | ) -> None: 78 | """ 79 | Add a clause to the predicates object. 80 | 81 | Parameters 82 | ---------- 83 | clause: 'Predicates' or Tuple[str, str] or Tuple[str, str, str] 84 | Predicate clause. Can be either another Predicates object or a tuple of the form (field, operator, value) 85 | or (field, value). 
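        Examples
        --------
        >>> p = Predicates("author", "==", "mat")
        >>> p.add_clause("category", "==", "personal")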
86 | """ 87 | if isinstance(clause[0], str): 88 | if len(clause) != 3 or not ( 89 | isinstance(clause[1], str) and isinstance(clause[2], get_runtime_types(self.PredicateValue)) 90 | ): 91 | raise ValueError(f"Invalid clause format: {clause}") 92 | self.clauses.append(clause) # type: ignore 93 | else: 94 | self.clauses.extend(list(clause)) # type: ignore 95 | 96 | def __and__(self, other: "Predicates") -> "Predicates": 97 | new_predicates = Predicates(self, other, operator="AND") 98 | return new_predicates 99 | 100 | def __or__(self, other: "Predicates") -> "Predicates": 101 | new_predicates = Predicates(self, other, operator="OR") 102 | return new_predicates 103 | 104 | def __invert__(self) -> "Predicates": 105 | new_predicates = Predicates(self, operator="NOT") 106 | return new_predicates 107 | 108 | def __eq__(self, other: object) -> bool: 109 | if not isinstance(other, Predicates): 110 | return False 111 | 112 | return self.operator == other.operator and self.clauses == other.clauses 113 | 114 | def __repr__(self) -> str: 115 | if self.operator: 116 | return f"{self.operator}({', '.join(repr(clause) for clause in self.clauses)})" 117 | else: 118 | return repr(self.clauses) 119 | 120 | def build_query(self, params: list[Any]) -> tuple[str, list[Any]]: 121 | """ 122 | Build the SQL query string and parameters for the predicates object. 123 | """ 124 | if not self.clauses: 125 | return "", [] 126 | 127 | where_conditions: list[str] = [] 128 | 129 | for clause in self.clauses: 130 | if isinstance(clause, Predicates): 131 | child_where_clause, params = clause.build_query(params) 132 | where_conditions.append(f"({child_where_clause})") 133 | elif isinstance(clause, tuple): 134 | if len(clause) == 2: 135 | field, value = clause 136 | operator = "=" # Default operator 137 | elif len(clause) == 3: 138 | field, operator, value = clause 139 | if operator not in self.operators_mapping: 140 | raise ValueError(f"Invalid operator: {operator}") 141 | operator = self.operators_mapping[operator] 142 | else: 143 | raise ValueError("Invalid clause format") 144 | 145 | index = len(params) + 1 146 | param_name = f"${index}" 147 | 148 | if field == "__uuid_timestamp": 149 | # convert str to timestamp in the database, it's better at it than python 150 | if isinstance(value, str): 151 | where_conditions.append(f"uuid_timestamp(id) {operator} ({param_name}::text)::timestamptz") 152 | else: 153 | where_conditions.append(f"uuid_timestamp(id) {operator} {param_name}") 154 | params.append(value) 155 | 156 | elif operator == "@>" and isinstance(value, list | tuple): 157 | if len(value) == 0: 158 | raise ValueError("Invalid value. 
Empty lists and empty tuples are not supported.") 159 | json_value = json.dumps(value) 160 | where_conditions.append(f"metadata @> jsonb_build_object('{field}', {param_name}::jsonb)") 161 | params.append(json_value) 162 | 163 | else: 164 | field_cast = "" 165 | if isinstance(value, int): 166 | field_cast = "::int" 167 | elif isinstance(value, float): 168 | field_cast = "::numeric" 169 | elif isinstance(value, datetime): 170 | field_cast = "::timestamptz" 171 | where_conditions.append(f"(metadata->>'{field}'){field_cast} {operator} {param_name}") 172 | params.append(value) 173 | 174 | if self.operator == "NOT": 175 | or_clauses = " OR ".join(where_conditions) 176 | # use IS DISTINCT FROM to treat all-null clauses as False and pass the filter 177 | where_clause = f"TRUE IS DISTINCT FROM ({or_clauses})" 178 | else: 179 | where_clause = (" " + self.operator + " ").join(where_conditions) 180 | return where_clause, params 181 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022, fastai 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /timescale_vector/client/async_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | from collections.abc import Mapping 4 | from datetime import datetime, timedelta 5 | from typing import Any, Literal, cast 6 | 7 | from asyncpg import Connection, Pool, Record, connect, create_pool 8 | from asyncpg.pool import PoolAcquireContext 9 | from pgvector.asyncpg import register_vector # type: ignore 10 | 11 | from timescale_vector.client.index import BaseIndex, QueryParams 12 | from timescale_vector.client.predicates import Predicates 13 | from timescale_vector.client.query_builder import QueryBuilder 14 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 15 | 16 | 17 | class Async(QueryBuilder): 18 | def __init__( 19 | self, 20 | service_url: str, 21 | table_name: str, 22 | num_dimensions: int, 23 | distance_type: str = "cosine", 24 | id_type: Literal["UUID"] | Literal["TEXT"] = "UUID", 25 | time_partition_interval: timedelta | None = None, 26 | max_db_connections: int | None = None, 27 | infer_filters: bool = True, 28 | schema_name: str | None = None, 29 | ) -> None: 30 | """ 31 | Initializes an async client for storing vector data. 32 | 33 | Parameters 34 | ---------- 35 | service_url 36 | The connection string for the database. 37 | table_name 38 | The name of the table. 39 | num_dimensions 40 | The number of dimensions for the embedding vector. 41 | distance_type 42 | The distance type for indexing. 43 | id_type 44 | The type of the id column. Can be either 'UUID' or 'TEXT'. 45 | time_partition_interval 46 | The time interval for partitioning the table (optional). 47 | infer_filters 48 | Whether to infer start and end times from the special __start_date and __end_date filters. 49 | schema_name 50 | The schema name for the table (optional, uses the database's default schema if not specified). 51 | """ 52 | self.builder = QueryBuilder( 53 | table_name, 54 | num_dimensions, 55 | distance_type, 56 | id_type, 57 | time_partition_interval, 58 | infer_filters, 59 | schema_name, 60 | ) 61 | self.service_url: str = service_url 62 | self.pool: Pool | None = None 63 | self.max_db_connections: int | None = max_db_connections 64 | self.time_partition_interval: timedelta | None = time_partition_interval 65 | 66 | async def _default_max_db_connections(self) -> int: 67 | """ 68 | Gets a default value for the number of max db connections to use. 69 | 70 | Returns 71 | ------- 72 | int 73 | """ 74 | query = self.builder.default_max_db_connection_query() 75 | conn: Connection = await connect(dsn=self.service_url) 76 | num_connections = await conn.fetchval(query) 77 | await conn.close() 78 | if num_connections is None: 79 | return 10 80 | return cast(int, num_connections) 81 | 82 | async def connect(self) -> PoolAcquireContext: 83 | """ 84 | Establishes a connection to a PostgreSQL database using asyncpg. 85 | 86 | Returns 87 | ------- 88 | asyncpg.pool.PoolAcquireContext: A context manager that acquires a database connection from the pool.
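        A minimal usage sketch (it assumes vec is an already-initialized Async client):

            async with await vec.connect() as conn:
                row = await conn.fetchrow("SELECT 1")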
89 | """ 90 | if self.pool is None: 91 | if self.max_db_connections is None: 92 | self.max_db_connections = await self._default_max_db_connections() 93 | 94 | async def init(conn: Connection) -> None: 95 | schema = await self._detect_vector_schema(conn) 96 | if schema is None: 97 | raise ValueError("pg_vector extension not found") 98 | await register_vector(conn, schema=schema) 99 | # decode to a dict, but accept a string as input in upsert 100 | await conn.set_type_codec("jsonb", encoder=str, decoder=json.loads, schema="pg_catalog") 101 | 102 | self.pool = await create_pool( 103 | dsn=self.service_url, 104 | init=init, 105 | min_size=1, 106 | max_size=self.max_db_connections, 107 | ) 108 | 109 | return self.pool.acquire() 110 | 111 | async def close(self) -> None: 112 | if self.pool is not None: 113 | await self.pool.close() 114 | 115 | async def table_is_empty(self) -> bool: 116 | """ 117 | Checks if the table is empty. 118 | 119 | Returns 120 | ------- 121 | bool: True if the table is empty, False otherwise. 122 | """ 123 | query = self.builder.get_row_exists_query() 124 | async with await self.connect() as pool: 125 | rec = await pool.fetchrow(query) 126 | return rec is None 127 | 128 | def munge_record(self, records: list[tuple[Any, ...]]) -> list[tuple[uuid.UUID, str, str, list[float]]]: 129 | metadata_is_dict = isinstance(records[0][1], dict) 130 | if metadata_is_dict: 131 | return list(map(lambda item: Async._convert_record_meta_to_json(item), records)) 132 | return records 133 | 134 | async def _detect_vector_schema(self, conn: Connection) -> str | None: 135 | query = """ 136 | select n.nspname 137 | from pg_extension x 138 | inner join pg_namespace n on (x.extnamespace = n.oid) 139 | where x.extname = 'vector'; 140 | """ 141 | 142 | return await conn.fetchval(query) 143 | 144 | @staticmethod 145 | def _convert_record_meta_to_json(item: tuple[Any, ...]) -> tuple[uuid.UUID, str, str, list[float]]: 146 | if not isinstance(item[1], dict): 147 | raise ValueError("Cannot mix dictionary and string metadata fields in the same upsert") 148 | return item[0], json.dumps(item[1]), item[2], item[3] 149 | 150 | async def upsert(self, records: list[tuple[Any, ...]]) -> None: 151 | """ 152 | Performs upsert operation for multiple records. 153 | 154 | Parameters 155 | ---------- 156 | records 157 | List of records to upsert. Each record is a tuple of the form (id, metadata, contents, embedding). 158 | 159 | Returns 160 | ------- 161 | None 162 | """ 163 | munged_records = self.munge_record(records) 164 | query = self.builder.get_upsert_query() 165 | async with await self.connect() as pool: 166 | await pool.executemany(query, munged_records) 167 | 168 | async def create_tables(self) -> None: 169 | """ 170 | Creates necessary tables. 171 | 172 | Returns 173 | ------- 174 | None 175 | """ 176 | query = self.builder.get_create_query() 177 | # don't use a connection pool for this because the vector extension may not be installed yet 178 | # and if it's not installed, register_vector will fail. 179 | conn = await connect(dsn=self.service_url) 180 | await conn.execute(query) 181 | await conn.close() 182 | 183 | async def delete_all(self, drop_index: bool = True) -> None: 184 | """ 185 | Deletes all data. Also drops the index if `drop_index` is true. 
186 | 187 | Returns 188 | ------- 189 | None 190 | """ 191 | if drop_index: 192 | await self.drop_embedding_index() 193 | query = self.builder.delete_all_query() 194 | async with await self.connect() as pool: 195 | await pool.execute(query) 196 | 197 | async def delete_by_ids(self, ids: list[uuid.UUID] | list[str]) -> list[Record]: 198 | """ 199 | Delete records by id. 200 | """ 201 | (query, params) = self.builder.delete_by_ids_query(ids) 202 | async with await self.connect() as pool: 203 | return await pool.fetch(query, *params) 204 | 205 | async def delete_by_metadata(self, filter: dict[str, str] | list[dict[str, str]]) -> list[Record]: 206 | """ 207 | Delete records by metadata filters. 208 | """ 209 | (query, params) = self.builder.delete_by_metadata_query(filter) 210 | async with await self.connect() as pool: 211 | return await pool.fetch(query, *params) 212 | 213 | async def drop_table(self) -> None: 214 | """ 215 | Drops the table 216 | 217 | Returns 218 | ------- 219 | None 220 | """ 221 | query = self.builder.drop_table_query() 222 | async with await self.connect() as pool: 223 | await pool.execute(query) 224 | 225 | async def _get_approx_count(self) -> int: 226 | """ 227 | Retrieves an approximate count of records in the table. 228 | 229 | Returns 230 | ------- 231 | int: Approximate count of records. 232 | """ 233 | query = self.builder.get_approx_count_query() 234 | async with await self.connect() as pool: 235 | rec = await pool.fetchrow(query) 236 | return cast(int, rec[0] if rec is not None else 0) 237 | 238 | async def drop_embedding_index(self) -> None: 239 | """ 240 | Drop any index on the embedding 241 | 242 | Returns 243 | ------- 244 | None 245 | """ 246 | query = self.builder.drop_embedding_index_query() 247 | async with await self.connect() as pool: 248 | await pool.execute(query) 249 | 250 | async def create_embedding_index(self, index: BaseIndex) -> None: 251 | """ 252 | Creates an index for the table. 253 | 254 | Parameters 255 | ---------- 256 | index 257 | The index to create. 258 | 259 | Returns 260 | ------- 261 | None 262 | """ 263 | num_records = await self._get_approx_count() 264 | query = self.builder.create_embedding_index_query(index, lambda: num_records) 265 | 266 | async with await self.connect() as pool: 267 | await pool.execute(query) 268 | 269 | async def search( 270 | self, 271 | query_embedding: list[float] | None = None, 272 | limit: int = 10, 273 | filter: Mapping[str, datetime | str] | list[dict[str, str]] | None = None, 274 | predicates: Predicates | None = None, 275 | uuid_time_filter: UUIDTimeRange | None = None, 276 | query_params: QueryParams | None = None, 277 | ) -> list[Record]: 278 | """ 279 | Retrieves similar records using a similarity query. 280 | 281 | Parameters 282 | ---------- 283 | query_embedding 284 | The query embedding vector. 285 | limit 286 | The number of nearest neighbors to retrieve. 287 | filter 288 | A filter for metadata. Should be specified as a key-value object or a list of key-value objects 289 | (where any objects in the list are matched). 290 | predicates 291 | A Predicates object to filter the results. Predicates support more complex queries than the filter 292 | parameter. Predicates can be combined using logical operators (&, |, and ~). 293 | uuid_time_filter 294 | A UUIDTimeRange object to filter the results by time using the id column. 295 | query_params 296 | Optional query-time index parameters (for example, DiskAnnIndexParams); their statements are executed before the search query. 297 | Returns 298 | ------- 299 | List: List of similar records.
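        Illustrative calls (the embeddings and metadata keys are example data only,
        and vec is assumed to be an initialized Async client):

            await vec.search([1.0, 2.0], limit=5, filter={"category": "personal"})
            await vec.search([1.0, 2.0], predicates=Predicates("views", ">", 10))
            await vec.search([1.0, 2.0], query_params=DiskAnnIndexParams(100, rescore=50))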
300 | """ 301 | (query, params) = self.builder.search_query(query_embedding, limit, filter, predicates, uuid_time_filter) 302 | if query_params is not None: 303 | async with await self.connect() as pool, pool.transaction(): 304 | # Looks like there is no way to pipeline this: https://github.com/MagicStack/asyncpg/issues/588 305 | statements = query_params.get_statements() 306 | for statement in statements: 307 | await pool.execute(statement) 308 | return await pool.fetch(query, *params) 309 | else: 310 | async with await self.connect() as pool: 311 | return await pool.fetch(query, *params) 312 | -------------------------------------------------------------------------------- /tests/sync_client_test.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime, timedelta 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from timescale_vector.client import ( 8 | SEARCH_RESULT_CONTENTS_IDX, 9 | SEARCH_RESULT_DISTANCE_IDX, 10 | SEARCH_RESULT_ID_IDX, 11 | SEARCH_RESULT_METADATA_IDX, 12 | DiskAnnIndex, 13 | DiskAnnIndexParams, 14 | HNSWIndex, 15 | IvfflatIndex, 16 | Predicates, 17 | Sync, 18 | UUIDTimeRange, 19 | uuid_from_time, 20 | ) 21 | 22 | 23 | @pytest.mark.parametrize("schema", ["temp", None]) 24 | def test_sync_client(service_url: str, schema: str) -> None: 25 | vec = Sync(service_url, "data_table", 2, schema_name=schema) 26 | vec.create_tables() 27 | empty = vec.table_is_empty() 28 | 29 | assert empty 30 | vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 31 | empty = vec.table_is_empty() 32 | assert not empty 33 | 34 | vec.upsert( 35 | [ 36 | (uuid.uuid4(), """{"key":"val"}""", "the brown fox", [1.0, 1.3]), 37 | (uuid.uuid4(), """{"key":"val2"}""", "the brown fox", [1.0, 1.4]), 38 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.5]), 39 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 40 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 41 | (uuid.uuid4(), """{"key2":"val2"}""", "the brown fox", [1.0, 1.7]), 42 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 43 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.9]), 44 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 100.8]), 45 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 101.8]), 46 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 47 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 48 | ( 49 | uuid.uuid4(), 50 | """{"key_1":"val_1", "key_2":"val_2"}""", 51 | "the brown fox", 52 | [1.0, 1.8], 53 | ), 54 | (uuid.uuid4(), """{"key0": [1,2,3,4]}""", "the brown fox", [1.0, 1.8]), 55 | ( 56 | uuid.uuid4(), 57 | """{"key0": [5,6,7], "key3": 3}""", 58 | "the brown fox", 59 | [1.0, 1.8], 60 | ), 61 | ] 62 | ) 63 | 64 | vec.create_embedding_index(IvfflatIndex()) 65 | vec.drop_embedding_index() 66 | vec.create_embedding_index(IvfflatIndex(100)) 67 | vec.drop_embedding_index() 68 | vec.create_embedding_index(HNSWIndex()) 69 | vec.drop_embedding_index() 70 | vec.create_embedding_index(HNSWIndex(20, 125)) 71 | vec.drop_embedding_index() 72 | vec.create_embedding_index(DiskAnnIndex()) 73 | vec.drop_embedding_index() 74 | vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5)) 75 | 76 | rec = vec.search([1.0, 2.0]) 77 | assert len(rec) == 10 78 | rec = vec.search(np.array([1.0, 2.0])) 79 | assert len(rec) == 10 80 | rec = vec.search([1.0, 2.0], limit=4) 81 | assert len(rec) == 4 82 | rec = 
vec.search(limit=4) 83 | assert len(rec) == 4 84 | rec = vec.search([1.0, 2.0], limit=4, filter={"key2": "val2"}) 85 | assert len(rec) == 1 86 | rec = vec.search([1.0, 2.0], limit=4, filter={"key2": "does not exist"}) 87 | assert len(rec) == 0 88 | rec = vec.search(limit=4, filter={"key2": "does not exist"}) 89 | assert len(rec) == 0 90 | rec = vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1"}) 91 | assert len(rec) == 1 92 | rec = vec.search([1.0, 2.0], filter={"key_1": "val_1", "key_2": "val_2"}) 93 | assert len(rec) == 1 94 | rec = vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1", "key_2": "val_3"}) 95 | assert len(rec) == 0 96 | 97 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 98 | assert len(rec) == 2 99 | 100 | rec = vec.search( 101 | [1.0, 2.0], 102 | limit=4, 103 | filter=[ 104 | {"key_1": "val_1"}, 105 | {"key2": "val2"}, 106 | {"no such key": "no such val"}, 107 | ], 108 | ) 109 | assert len(rec) == 2 110 | 111 | raised = False 112 | try: 113 | # can't upsert using both keys and dictionaries 114 | vec.upsert( 115 | [ 116 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 117 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 118 | ] 119 | ) 120 | except ValueError: 121 | raised = True 122 | assert raised 123 | 124 | raised = False 125 | try: 126 | # can't upsert using both keys and dictionaries opposite order 127 | vec.upsert( 128 | [ 129 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 130 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 131 | ] 132 | ) 133 | except BaseException: 134 | raised = True 135 | assert raised 136 | 137 | rec = vec.search([1.0, 2.0], filter={"key_1": "val_1", "key_2": "val_2"}) 138 | assert rec[0][SEARCH_RESULT_CONTENTS_IDX] == "the brown fox" 139 | assert rec[0]["contents"] == "the brown fox" # type: ignore 140 | assert rec[0][SEARCH_RESULT_METADATA_IDX] == { 141 | "key_1": "val_1", 142 | "key_2": "val_2", 143 | } 144 | assert rec[0]["metadata"] == {"key_1": "val_1", "key_2": "val_2"} # type: ignore 145 | assert isinstance(rec[0][SEARCH_RESULT_METADATA_IDX], dict) 146 | assert rec[0][SEARCH_RESULT_DISTANCE_IDX] == 0.0009438353921149556 147 | assert rec[0]["distance"] == 0.0009438353921149556 # type: ignore 148 | 149 | rec = vec.search([1.0, 2.0], limit=4, predicates=Predicates("key", "==", "val2")) 150 | assert len(rec) == 1 151 | 152 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 153 | assert len(rec) == 2 154 | vec.delete_by_ids([rec[0][SEARCH_RESULT_ID_IDX]]) 155 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 156 | assert len(rec) == 1 157 | vec.delete_by_metadata([{"key_1": "val_1"}, {"key2": "val2"}]) 158 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 159 | assert len(rec) == 0 160 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 161 | assert len(rec) == 4 162 | vec.delete_by_metadata([{"key2": "val"}]) 163 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 164 | assert len(rec) == 0 165 | 166 | assert not vec.table_is_empty() 167 | vec.delete_all() 168 | assert vec.table_is_empty() 169 | 170 | vec.drop_table() 171 | vec.close() 172 | 173 | vec = Sync(service_url, "data_table", 2, id_type="TEXT", schema_name=schema) 174 | vec.create_tables() 175 | assert vec.table_is_empty() 176 | vec.upsert([("Not a valid UUID", {"key": "val"}, "the brown fox", [1.0, 1.2])]) 177 | assert not 
vec.table_is_empty() 178 | vec.delete_by_ids(["Not a valid UUID"]) 179 | assert vec.table_is_empty() 180 | vec.drop_table() 181 | vec.close() 182 | 183 | vec = Sync( 184 | service_url, 185 | "data_table", 186 | 2, 187 | time_partition_interval=timedelta(seconds=60), 188 | schema_name=schema, 189 | ) 190 | vec.create_tables() 191 | assert vec.table_is_empty() 192 | id = uuid.uuid1() 193 | vec.upsert([(id, {"key": "val"}, "the brown fox", [1.0, 1.2])]) 194 | assert not vec.table_is_empty() 195 | vec.delete_by_ids([id]) 196 | assert vec.table_is_empty() 197 | raised = False 198 | try: 199 | # can't upsert with uuid type 4 in time partitioned table 200 | vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 201 | # pass 202 | except BaseException: 203 | raised = True 204 | assert raised 205 | 206 | specific_datetime = datetime(2018, 8, 10, 15, 30, 0) 207 | vec.upsert( 208 | [ 209 | # current time 210 | (uuid.uuid1(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 211 | # time in 2018 212 | ( 213 | uuid_from_time(specific_datetime), 214 | {"key": "val"}, 215 | "the brown fox", 216 | [1.0, 1.2], 217 | ), 218 | ] 219 | ) 220 | 221 | def search_date(start_date: datetime | str | None, end_date: datetime | str | None, expected: int) -> None: 222 | # using uuid_time_filter 223 | rec = vec.search( 224 | [1.0, 2.0], 225 | limit=4, 226 | uuid_time_filter=UUIDTimeRange(start_date, end_date), 227 | ) 228 | assert len(rec) == expected 229 | rec = vec.search( 230 | [1.0, 2.0], 231 | limit=4, 232 | uuid_time_filter=UUIDTimeRange(str(start_date), str(end_date)), 233 | ) 234 | assert len(rec) == expected 235 | 236 | # using filters 237 | filter: dict[str, str | datetime] = {} 238 | if start_date is not None: 239 | filter["__start_date"] = start_date 240 | if end_date is not None: 241 | filter["__end_date"] = end_date 242 | rec = vec.search([1.0, 2.0], limit=4, filter=filter) 243 | assert len(rec) == expected 244 | # using filters with string dates 245 | filter = {} 246 | if start_date is not None: 247 | filter["__start_date"] = str(start_date) 248 | if end_date is not None: 249 | filter["__end_date"] = str(end_date) 250 | rec = vec.search([1.0, 2.0], limit=4, filter=filter) 251 | assert len(rec) == expected 252 | # using predicates 253 | predicates: list[tuple[str, str, str | datetime]] = [] 254 | if start_date is not None: 255 | predicates.append(("__uuid_timestamp", ">=", start_date)) 256 | if end_date is not None: 257 | predicates.append(("__uuid_timestamp", "<", end_date)) 258 | rec = vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 259 | assert len(rec) == expected 260 | # using predicates with string dates 261 | predicates = [] 262 | if start_date is not None: 263 | predicates.append(("__uuid_timestamp", ">=", str(start_date))) 264 | if end_date is not None: 265 | predicates.append(("__uuid_timestamp", "<", str(end_date))) 266 | rec = vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 267 | assert len(rec) == expected 268 | 269 | assert not vec.table_is_empty() 270 | 271 | search_date( 272 | specific_datetime - timedelta(days=7), 273 | specific_datetime + timedelta(days=7), 274 | 1, 275 | ) 276 | search_date(specific_datetime - timedelta(days=7), None, 2) 277 | search_date(None, specific_datetime + timedelta(days=7), 1) 278 | search_date( 279 | specific_datetime - timedelta(days=7), 280 | specific_datetime - timedelta(days=2), 281 | 0, 282 | ) 283 | 284 | # check timedelta handling 285 | rec = vec.search( 286 | [1.0, 2.0], 287 | limit=4, 288 | 
uuid_time_filter=UUIDTimeRange(start_date=specific_datetime, time_delta=timedelta(days=7)), 289 | ) 290 | assert len(rec) == 1 291 | # end is exclusive 292 | rec = vec.search( 293 | [1.0, 2.0], 294 | limit=4, 295 | uuid_time_filter=UUIDTimeRange(end_date=specific_datetime, time_delta=timedelta(days=7)), 296 | ) 297 | assert len(rec) == 0 298 | rec = vec.search( 299 | [1.0, 2.0], 300 | limit=4, 301 | uuid_time_filter=UUIDTimeRange( 302 | end_date=specific_datetime + timedelta(seconds=1), 303 | time_delta=timedelta(days=7), 304 | ), 305 | ) 306 | assert len(rec) == 1 307 | rec = vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(10, 5)) 308 | assert len(rec) == 2 309 | rec = vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(100, rescore=2)) 310 | assert len(rec) == 2 311 | vec.drop_table() 312 | vec.close() 313 | -------------------------------------------------------------------------------- /timescale_vector/client/sync_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import uuid 4 | from collections.abc import Iterator, Mapping 5 | from contextlib import contextmanager 6 | from datetime import datetime, timedelta 7 | from typing import Any, Literal 8 | 9 | import numpy as np 10 | from numpy import ndarray 11 | from pgvector.psycopg2 import register_vector # type: ignore 12 | from psycopg2 import connect 13 | from psycopg2.extensions import connection as PSYConnection 14 | from psycopg2.extras import DictCursor, register_uuid 15 | from psycopg2.pool import SimpleConnectionPool 16 | 17 | from timescale_vector.client.index import BaseIndex, QueryParams 18 | from timescale_vector.client.predicates import Predicates 19 | from timescale_vector.client.query_builder import QueryBuilder 20 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 21 | 22 | 23 | class Sync: 24 | translated_queries: dict[str, str] = {} 25 | 26 | def __init__( 27 | self, 28 | service_url: str, 29 | table_name: str, 30 | num_dimensions: int, 31 | distance_type: str = "cosine", 32 | id_type: Literal["UUID"] | Literal["TEXT"] = "UUID", 33 | time_partition_interval: timedelta | None = None, 34 | max_db_connections: int | None = None, 35 | infer_filters: bool = True, 36 | schema_name: str | None = None, 37 | ) -> None: 38 | """ 39 | Initializes a sync client for storing vector data. 40 | 41 | Parameters 42 | ---------- 43 | service_url 44 | The connection string for the database. 45 | table_name 46 | The name of the table. 47 | num_dimensions 48 | The number of dimensions for the embedding vector. 49 | distance_type 50 | The distance type for indexing. 51 | id_type 52 | The type of the primary id column. Can be either 'UUID' or 'TEXT'. 53 | time_partition_interval 54 | The time interval for partitioning the table (optional). 55 | infer_filters 56 | Whether to infer start and end times from the special __start_date and __end_date filters. 57 | schema_name 58 | The schema name for the table (optional, uses the database's default schema if not specified). 
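        A construction sketch (the service URL and table name are placeholders):

            vec = Sync("postgres://user:password@localhost:5432/postgres", "my_table", 2)
            vec.create_tables()
            vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])])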
59 | """ 60 | self.builder = QueryBuilder( 61 | table_name, 62 | num_dimensions, 63 | distance_type, 64 | id_type, 65 | time_partition_interval, 66 | infer_filters, 67 | schema_name, 68 | ) 69 | self.service_url: str = service_url 70 | self.pool: SimpleConnectionPool | None = None 71 | self.max_db_connections: int | None = max_db_connections 72 | self.time_partition_interval: timedelta | None = time_partition_interval 73 | register_uuid() 74 | 75 | def default_max_db_connections(self) -> int: 76 | """ 77 | Gets a default value for the number of max db connections to use. 78 | """ 79 | query = self.builder.default_max_db_connection_query() 80 | conn = connect(dsn=self.service_url) 81 | with conn.cursor() as cur: 82 | cur.execute(query) 83 | num_connections = cur.fetchone() 84 | conn.close() 85 | return num_connections[0] # type: ignore 86 | 87 | @contextmanager 88 | def connect(self) -> Iterator[PSYConnection]: 89 | """ 90 | Establishes a connection to a PostgreSQL database using psycopg2 and allows its 91 | use in a context manager. 92 | """ 93 | if self.pool is None: 94 | if self.max_db_connections is None: 95 | self.max_db_connections = self.default_max_db_connections() 96 | 97 | self.pool = SimpleConnectionPool( 98 | 1, 99 | self.max_db_connections, 100 | dsn=self.service_url, 101 | cursor_factory=DictCursor, 102 | ) 103 | 104 | connection = self.pool.getconn() 105 | register_vector(connection) 106 | try: 107 | yield connection 108 | connection.commit() 109 | finally: 110 | self.pool.putconn(connection) 111 | 112 | def close(self) -> None: 113 | if self.pool is not None: 114 | self.pool.closeall() 115 | 116 | def _translate_to_pyformat(self, query_string: str, params: list[Any] | None) -> tuple[str, dict[str, Any]]: 117 | """ 118 | Translates dollar sign number parameters and list parameters to pyformat strings. 119 | 120 | Args: 121 | query_string (str): The query string with parameters. 122 | params (list|None): List of parameter values. 123 | 124 | Returns: 125 | str: The query string with translated pyformat parameters. 126 | dict: A dictionary mapping parameter numbers to their values. 127 | """ 128 | 129 | translated_params: dict[str, Any] = {} 130 | if params is not None: 131 | for idx, param in enumerate(params): 132 | translated_params[str(idx + 1)] = param 133 | 134 | if query_string in self.translated_queries: 135 | return self.translated_queries[query_string], translated_params 136 | 137 | dollar_params = re.findall(r"\$[0-9]+", query_string) 138 | translated_string = query_string 139 | for dollar_param in dollar_params: 140 | # Extract the number after the $ 141 | param_number = int(dollar_param[1:]) 142 | pyformat_param = ("%s" if param_number == 0 else f"%({param_number})s") if params is not None else "%s" 143 | translated_string = translated_string.replace(dollar_param, pyformat_param) 144 | 145 | self.translated_queries[query_string] = translated_string 146 | return self.translated_queries[query_string], translated_params 147 | 148 | def table_is_empty(self) -> bool: 149 | """ 150 | Checks if the table is empty. 151 | 152 | Returns 153 | ------- 154 | bool: True if the table is empty, False otherwise.
155 | """ 156 | query = self.builder.get_row_exists_query() 157 | with self.connect() as conn, conn.cursor() as cur: 158 | cur.execute(query) 159 | rec = cur.fetchone() 160 | return rec is None 161 | 162 | def munge_record(self, records: list[tuple[Any, ...]]) -> list[tuple[uuid.UUID, str, str, list[float]]]: 163 | metadata_is_dict = isinstance(records[0][1], dict) 164 | if metadata_is_dict: 165 | return list(map(lambda item: Sync._convert_record_meta_to_json(item), records)) 166 | 167 | return records 168 | 169 | @staticmethod 170 | def _convert_record_meta_to_json(item: tuple[Any, ...]) -> tuple[uuid.UUID, str, str, list[float]]: 171 | if not isinstance(item[1], dict): 172 | raise ValueError("Cannot mix dictionary and string metadata fields in the same upsert") 173 | return item[0], json.dumps(item[1]), item[2], item[3] 174 | 175 | def upsert(self, records: list[tuple[Any, ...]]) -> None: 176 | """ 177 | Performs upsert operation for multiple records. 178 | 179 | Parameters 180 | ---------- 181 | records 182 | Records to upsert. 183 | 184 | Returns 185 | ------- 186 | None 187 | """ 188 | munged_records = self.munge_record(records) 189 | query = self.builder.get_upsert_query() 190 | query, _ = self._translate_to_pyformat(query, None) 191 | with self.connect() as conn, conn.cursor() as cur: 192 | cur.executemany(query, munged_records) 193 | 194 | def create_tables(self) -> None: 195 | """ 196 | Creates necessary tables. 197 | 198 | Returns 199 | ------- 200 | None 201 | """ 202 | query = self.builder.get_create_query() 203 | # don't use a connection pool for this because the vector extension may not be installed yet 204 | # and if it's not installed, register_vector will fail. 205 | conn = connect(dsn=self.service_url) 206 | with conn.cursor() as cur: 207 | cur.execute(query) 208 | conn.commit() 209 | conn.close() 210 | 211 | def delete_all(self, drop_index: bool = True) -> None: 212 | """ 213 | Deletes all data. Also drops the index if `drop_index` is true. 214 | 215 | Returns 216 | ------- 217 | None 218 | """ 219 | if drop_index: 220 | self.drop_embedding_index() 221 | query = self.builder.delete_all_query() 222 | with self.connect() as conn, conn.cursor() as cur: 223 | cur.execute(query) 224 | 225 | def delete_by_ids(self, ids: list[uuid.UUID] | list[str]) -> None: 226 | """ 227 | Delete records by id. 228 | 229 | Parameters 230 | ---------- 231 | ids 232 | List of ids to delete. 233 | """ 234 | (query, params) = self.builder.delete_by_ids_query(ids) 235 | translated_query, translated_params = self._translate_to_pyformat(query, params) 236 | with self.connect() as conn, conn.cursor() as cur: 237 | cur.execute(translated_query, translated_params) 238 | 239 | def delete_by_metadata(self, filter: dict[str, str] | list[dict[str, str]]) -> None: 240 | """ 241 | Delete records by metadata filters. 242 | """ 243 | (query, params) = self.builder.delete_by_metadata_query(filter) 244 | translated_query, translated_params = self._translate_to_pyformat(query, params) 245 | with self.connect() as conn, conn.cursor() as cur: 246 | cur.execute(translated_query, translated_params) 247 | 248 | def drop_table(self) -> None: 249 | """ 250 | Drops the table 251 | 252 | Returns 253 | ------- 254 | None 255 | """ 256 | query = self.builder.drop_table_query() 257 | with self.connect() as conn, conn.cursor() as cur: 258 | cur.execute(query) 259 | 260 | def _get_approx_count(self) -> int: 261 | """ 262 | Retrieves an approximate count of records in the table. 
263 | 264 | Returns 265 | ------- 266 | int: Approximate count of records. 267 | """ 268 | query = self.builder.get_approx_count_query() 269 | with self.connect() as conn, conn.cursor() as cur: 270 | cur.execute(query) 271 | rec = cur.fetchone() 272 | return rec[0] if rec is not None else 0 273 | 274 | def drop_embedding_index(self) -> None: 275 | """ 276 | Drop any index on the embedding 277 | 278 | Returns 279 | -------- 280 | None 281 | """ 282 | query = self.builder.drop_embedding_index_query() 283 | with self.connect() as conn, conn.cursor() as cur: 284 | cur.execute(query) 285 | 286 | def create_embedding_index(self, index: BaseIndex) -> None: 287 | """ 288 | Creates an index on the embedding for the table. 289 | 290 | Parameters 291 | ---------- 292 | index 293 | The index to create. 294 | 295 | Returns 296 | -------- 297 | None 298 | """ 299 | query = self.builder.create_embedding_index_query(index, lambda: self._get_approx_count()) 300 | with self.connect() as conn, conn.cursor() as cur: 301 | cur.execute(query) 302 | 303 | def search( 304 | self, 305 | query_embedding: ndarray[Any, Any] | list[float] | None = None, 306 | limit: int = 10, 307 | filter: Mapping[str, datetime | str] | list[dict[str, str]] | None = None, 308 | predicates: Predicates | None = None, 309 | uuid_time_filter: UUIDTimeRange | None = None, 310 | query_params: QueryParams | None = None, 311 | ) -> list[tuple[Any, ...]]: 312 | """ 313 | Retrieves similar records using a similarity query. 314 | 315 | Parameters 316 | ---------- 317 | query_embedding 318 | The query embedding vector. 319 | limit 320 | The number of nearest neighbors to retrieve. 321 | filter 322 | A filter for metadata. Should be specified as a key-value object or a list of key-value objects 323 | (where any objects in the list are matched). 324 | predicates 325 | A Predicates object to filter the results. Predicates support more complex queries 326 | than the filter parameter. Predicates can be combined using logical operators (&, |, and ~). 327 | 328 | Returns 329 | -------- 330 | List: List of similar records.
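        Illustrative calls mirroring the test suite (example values only; start_date
        and end_date are datetimes chosen by the caller):

            vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1"})
            vec.search([1.0, 2.0], limit=4, predicates=Predicates("key", "==", "val2"))
            vec.search([1.0, 2.0], limit=4, uuid_time_filter=UUIDTimeRange(start_date, end_date))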
331 | """ 332 | query_embedding_np = np.array(query_embedding) if query_embedding is not None else None 333 | 334 | (query, params) = self.builder.search_query(query_embedding_np, limit, filter, predicates, uuid_time_filter) 335 | translated_query, translated_params = self._translate_to_pyformat(query, params) 336 | 337 | if query_params is not None: 338 | prefix = "; ".join(query_params.get_statements()) 339 | translated_query = f"{prefix}; {translated_query}" 340 | 341 | with self.connect() as conn, conn.cursor() as cur: 342 | cur.execute(translated_query, translated_params) 343 | return cur.fetchall() 344 | -------------------------------------------------------------------------------- /timescale_vector/client/query_builder.py: -------------------------------------------------------------------------------- 1 | # pyright: reportPrivateUsage=false 2 | import json 3 | import uuid 4 | from collections.abc import Callable, Mapping 5 | from datetime import datetime, timedelta 6 | from typing import Any 7 | 8 | import numpy as np 9 | 10 | from timescale_vector.client.index import BaseIndex 11 | from timescale_vector.client.predicates import Predicates 12 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 13 | 14 | 15 | class QueryBuilder: 16 | def __init__( 17 | self, 18 | table_name: str, 19 | num_dimensions: int, 20 | distance_type: str, 21 | id_type: str, 22 | time_partition_interval: timedelta | None, 23 | infer_filters: bool, 24 | schema_name: str | None, 25 | ) -> None: 26 | """ 27 | Initializes a base Vector object to generate queries for vector clients. 28 | 29 | Parameters 30 | ---------- 31 | table_name 32 | The name of the table. 33 | num_dimensions 34 | The number of dimensions for the embedding vector. 35 | distance_type 36 | The distance type for indexing. 37 | id_type 38 | The type of the id column. Can be either 'UUID' or 'TEXT'. 39 | time_partition_interval 40 | The time interval for partitioning the table (optional). 41 | infer_filters 42 | Whether to infer start and end times from the special __start_date and __end_date filters. 43 | schema_name 44 | The schema name for the table (optional, uses the database's default schema if not specified). 45 | """ 46 | self.table_name: str = table_name 47 | self.schema_name: str | None = schema_name 48 | self.num_dimensions: int = num_dimensions 49 | if distance_type == "cosine" or distance_type == "<=>": 50 | self.distance_type: str = "<=>" 51 | elif distance_type == "euclidean" or distance_type == "<->" or distance_type == "l2": 52 | self.distance_type = "<->" 53 | else: 54 | raise ValueError(f"unrecognized distance_type {distance_type}") 55 | 56 | if id_type.lower() != "uuid" and id_type.lower() != "text": 57 | raise ValueError(f"unrecognized id_type {id_type}") 58 | 59 | if time_partition_interval is not None and id_type.lower() != "uuid": 60 | raise ValueError("time partitioning is only supported for uuid id_type") 61 | 62 | self.id_type: str = id_type.lower() 63 | self.time_partition_interval: timedelta | None = time_partition_interval 64 | self.infer_filters: bool = infer_filters 65 | 66 | @staticmethod 67 | def _quote_ident(ident: str) -> str: 68 | """ 69 | Quotes an identifier to prevent SQL injection. 70 | 71 | Parameters 72 | ---------- 73 | ident 74 | The identifier to be quoted. 75 | 76 | Returns 77 | ------- 78 | str: The quoted identifier. 
79 | """ 80 | return '"{}"'.format(ident.replace('"', '""')) 81 | 82 | def _quoted_table_name(self) -> str: 83 | if self.schema_name is not None: 84 | return self._quote_ident(self.schema_name) + "." + self._quote_ident(self.table_name) 85 | else: 86 | return self._quote_ident(self.table_name) 87 | 88 | def get_row_exists_query(self) -> str: 89 | """ 90 | Generates a query to check if any rows exist in the table. 91 | 92 | Returns 93 | ------- 94 | str: The query to check for row existence. 95 | """ 96 | return f"SELECT 1 FROM {self._quoted_table_name()} LIMIT 1" 97 | 98 | def get_upsert_query(self) -> str: 99 | """ 100 | Generates an upsert query. 101 | 102 | Returns 103 | ------- 104 | str: The upsert query. 105 | """ 106 | return ( 107 | f"INSERT INTO {self._quoted_table_name()} (id, metadata, contents, embedding) " 108 | f"VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING" 109 | ) 110 | 111 | def get_approx_count_query(self) -> str: 112 | """ 113 | Generate a query to find the approximate count of records in the table. 114 | 115 | Returns 116 | ------- 117 | str: the query. 118 | """ 119 | # todo optimize with approx 120 | return f"SELECT COUNT(*) as cnt FROM {self._quoted_table_name()}" 121 | 122 | def get_create_query(self) -> str: 123 | """ 124 | Generates a query to create the tables, indexes, and extensions needed to store the vector data. 125 | 126 | Returns 127 | ------- 128 | str: The create table query. 129 | """ 130 | hypertable_sql = "" 131 | if self.time_partition_interval is not None: 132 | hypertable_sql = f""" 133 | CREATE EXTENSION IF NOT EXISTS timescaledb; 134 | 135 | CREATE OR REPLACE FUNCTION public.uuid_timestamp(uuid UUID) RETURNS TIMESTAMPTZ AS $$ 136 | DECLARE 137 | bytes bytea; 138 | BEGIN 139 | bytes := uuid_send(uuid); 140 | if (get_byte(bytes, 6) >> 4)::int2 != 1 then 141 | RAISE EXCEPTION 'UUID version is not 1'; 142 | end if; 143 | RETURN to_timestamp( 144 | ( 145 | ( 146 | (get_byte(bytes, 0)::bigint << 24) | 147 | (get_byte(bytes, 1)::bigint << 16) | 148 | (get_byte(bytes, 2)::bigint << 8) | 149 | (get_byte(bytes, 3)::bigint << 0) 150 | ) + ( 151 | ((get_byte(bytes, 4)::bigint << 8 | 152 | get_byte(bytes, 5)::bigint)) << 32 153 | ) + ( 154 | (((get_byte(bytes, 6)::bigint & 15) << 8 | get_byte(bytes, 7)::bigint) & 4095) << 48 155 | ) - 122192928000000000 156 | ) / 10000 / 1000::double precision 157 | ); 158 | END 159 | $$ LANGUAGE plpgsql 160 | IMMUTABLE PARALLEL SAFE 161 | RETURNS NULL ON NULL INPUT; 162 | 163 | SELECT create_hypertable('{self._quoted_table_name()}', 164 | 'id', 165 | if_not_exists=> true, 166 | time_partitioning_func=>'public.uuid_timestamp', 167 | chunk_time_interval => '{str(self.time_partition_interval.total_seconds())} seconds'::interval); 168 | """ 169 | return f""" 170 | CREATE EXTENSION IF NOT EXISTS vector; 171 | CREATE EXTENSION IF NOT EXISTS vectorscale; 172 | 173 | 174 | CREATE TABLE IF NOT EXISTS {self._quoted_table_name()} ( 175 | id {self.id_type} PRIMARY KEY, 176 | metadata JSONB, 177 | contents TEXT, 178 | embedding VECTOR({self.num_dimensions}) 179 | ); 180 | 181 | CREATE INDEX IF NOT EXISTS {self._quote_ident(self.table_name + "_meta_idx")} ON {self._quoted_table_name()} 182 | USING GIN(metadata jsonb_path_ops); 183 | 184 | {hypertable_sql} 185 | """ 186 | 187 | def _get_embedding_index_name_quoted(self) -> str: 188 | return self._quote_ident(self.table_name + "_embedding_idx") 189 | 190 | def _get_schema_qualified_embedding_index_name_quoted(self) -> str: 191 | if self.schema_name is not None: 192 | return 
self._quote_ident(self.schema_name) + "." + self._get_embedding_index_name_quoted() 193 | else: 194 | return self._get_embedding_index_name_quoted() 195 | 196 | def drop_embedding_index_query(self) -> str: 197 | return f"DROP INDEX IF EXISTS {self._get_schema_qualified_embedding_index_name_quoted()};" 198 | 199 | def delete_all_query(self) -> str: 200 | return f"TRUNCATE {self._quoted_table_name()};" 201 | 202 | def delete_by_ids_query(self, ids: list[uuid.UUID] | list[str]) -> tuple[str, list[Any]]: 203 | query = f"DELETE FROM {self._quoted_table_name()} WHERE id = ANY($1::{self.id_type}[]);" 204 | return (query, [ids]) 205 | 206 | def delete_by_metadata_query( 207 | self, filter_conditions: dict[str, str] | list[dict[str, str]] 208 | ) -> tuple[str, list[Any]]: 209 | params: list[Any] = [] 210 | (where, params) = self._where_clause_for_filter(params, filter_conditions) 211 | query = f"DELETE FROM {self._quoted_table_name()} WHERE {where};" 212 | return (query, params) 213 | 214 | def drop_table_query(self) -> str: 215 | return f"DROP TABLE IF EXISTS {self._quoted_table_name()};" 216 | 217 | def default_max_db_connection_query(self) -> str: 218 | """ 219 | Generates a query to get the default max db connections. This uses a heuristic to determine the max connections 220 | based on the max_connections setting in postgres 221 | and the number of currently used connections. This heuristic leaves 4 connections in reserve. 222 | """ 223 | return ( 224 | "SELECT greatest(1, ((SELECT setting::int FROM pg_settings " 225 | "WHERE name='max_connections')-(SELECT count(*) FROM pg_stat_activity) - 4)::int)" 226 | ) 227 | 228 | def create_embedding_index_query(self, index: BaseIndex, num_records_callback: Callable[[], int]) -> str: 229 | """ 230 | Generates an embedding index creation query. 231 | 232 | Parameters 233 | ---------- 234 | index 235 | The index to create. 236 | num_records_callback 237 | A callback function to get the number of records in the table. 238 | 239 | Returns 240 | ------- 241 | str: The index creation query. 
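        In practice this query is generated through the client's create_embedding_index
        method; a rough sketch of the index objects it accepts (parameter values are
        arbitrary examples, and vec is an initialized Sync or Async client):

            vec.create_embedding_index(IvfflatIndex(100))
            vec.create_embedding_index(HNSWIndex(20, 125))
            vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5))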
242 | """ 243 | column_name = "embedding" 244 | index_name_quoted = self._get_embedding_index_name_quoted() 245 | query = index.create_index_query( 246 | self._quoted_table_name(), 247 | self._quote_ident(column_name), 248 | index_name_quoted, 249 | self.distance_type, 250 | num_records_callback, 251 | ) 252 | return query 253 | 254 | def _where_clause_for_filter( 255 | self, params: list[Any], filter: Mapping[str, datetime | str] | list[dict[str, str]] | None 256 | ) -> tuple[str, list[Any]]: 257 | if filter is None: 258 | return "TRUE", params 259 | 260 | if isinstance(filter, dict): 261 | where = f"metadata @> ${len(params)+1}" 262 | json_object = json.dumps(filter) 263 | params = params + [json_object] 264 | elif isinstance(filter, list): 265 | any_params: list[str] = [] 266 | for _idx, filter_dict in enumerate(filter, start=len(params) + 1): 267 | any_params.append(json.dumps(filter_dict)) 268 | where = f"metadata @> ANY(${len(params) + 1}::jsonb[])" 269 | params = params + [any_params] 270 | else: 271 | raise ValueError(f"Unknown filter type: {type(filter)}") 272 | 273 | return where, params 274 | 275 | def search_query( 276 | self, 277 | query_embedding: list[float] | np.ndarray[Any, Any] | None, 278 | limit: int = 10, 279 | filter: Mapping[str, datetime | str] | list[dict[str, str]] | None = None, 280 | predicates: Predicates | None = None, 281 | uuid_time_filter: UUIDTimeRange | None = None, 282 | ) -> tuple[str, list[Any]]: 283 | """ 284 | Generates a similarity query. 285 | 286 | Returns: 287 | Tuple[str, List]: A tuple containing the query and parameters. 288 | """ 289 | params: list[Any] = [] 290 | if query_embedding is not None: 291 | distance = f"embedding {self.distance_type} ${len(params)+1}" 292 | params = params + [query_embedding] 293 | order_by_clause = f"ORDER BY {distance} ASC" 294 | else: 295 | distance = "-1.0" 296 | order_by_clause = "" 297 | 298 | if ( 299 | self.infer_filters 300 | and uuid_time_filter is None 301 | and isinstance(filter, dict) 302 | and ("__start_date" in filter or "__end_date" in filter) 303 | ): 304 | start_date = UUIDTimeRange._parse_datetime(filter.get("__start_date")) 305 | end_date = UUIDTimeRange._parse_datetime(filter.get("__end_date")) 306 | 307 | uuid_time_filter = UUIDTimeRange(start_date, end_date) 308 | 309 | if start_date is not None: 310 | del filter["__start_date"] 311 | if end_date is not None: 312 | del filter["__end_date"] 313 | 314 | where_clauses: list[str] = [] 315 | if filter is not None: 316 | (where_filter, params) = self._where_clause_for_filter(params, filter) 317 | where_clauses.append(where_filter) 318 | 319 | if predicates is not None: 320 | (where_predicates, params) = predicates.build_query(params) 321 | where_clauses.append(where_predicates) 322 | 323 | if uuid_time_filter is not None: 324 | (where_time, params) = uuid_time_filter.build_query(params) 325 | where_clauses.append(where_time) 326 | 327 | where = " AND ".join(where_clauses) if len(where_clauses) > 0 else "TRUE" 328 | 329 | query = f""" 330 | SELECT 331 | id, metadata, contents, embedding, {distance} as distance 332 | FROM 333 | {self._quoted_table_name()} 334 | WHERE 335 | {where} 336 | {order_by_clause} 337 | LIMIT {limit} 338 | """ 339 | return query, params 340 | -------------------------------------------------------------------------------- /tests/async_client_test.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime, timedelta 3 | 4 | import pytest 5 | 6 | from 
timescale_vector.client import ( 7 | SEARCH_RESULT_METADATA_IDX, 8 | Async, 9 | DiskAnnIndex, 10 | DiskAnnIndexParams, 11 | HNSWIndex, 12 | IvfflatIndex, 13 | Predicates, 14 | UUIDTimeRange, 15 | uuid_from_time, 16 | ) 17 | 18 | 19 | @pytest.mark.asyncio 20 | @pytest.mark.parametrize("schema", ["temp", None]) 21 | async def test_vector(service_url: str, schema: str) -> None: 22 | vec = Async(service_url, "data_table", 2, schema_name=schema) 23 | await vec.drop_table() 24 | await vec.create_tables() 25 | empty = await vec.table_is_empty() 26 | assert empty 27 | await vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 28 | empty = await vec.table_is_empty() 29 | assert not empty 30 | 31 | await vec.upsert( 32 | [ 33 | (uuid.uuid4(), """{"key":"val"}""", "the brown fox", [1.0, 1.3]), 34 | ( 35 | uuid.uuid4(), 36 | """{"key":"val2", "key_10": "10", "key_11": "11.3"}""", 37 | "the brown fox", 38 | [1.0, 1.4], 39 | ), 40 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.5]), 41 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 42 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 43 | (uuid.uuid4(), """{"key2":"val2"}""", "the brown fox", [1.0, 1.7]), 44 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 45 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.9]), 46 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 100.8]), 47 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 101.8]), 48 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 49 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 50 | ( 51 | uuid.uuid4(), 52 | """{"key_1":"val_1", "key_2":"val_2"}""", 53 | "the brown fox", 54 | [1.0, 1.8], 55 | ), 56 | (uuid.uuid4(), """{"key0": [1,2,3,4]}""", "the brown fox", [1.0, 1.8]), 57 | ( 58 | uuid.uuid4(), 59 | """{"key0": [8,9,"A"]}""", 60 | "the brown fox", 61 | [1.0, 1.8], 62 | ), # mixed types 63 | ( 64 | uuid.uuid4(), 65 | """{"key0": [5,6,7], "key3": 3}""", 66 | "the brown fox", 67 | [1.0, 1.8], 68 | ), 69 | (uuid.uuid4(), """{"key0": ["B", "C"]}""", "the brown fox", [1.0, 1.8]), 70 | ] 71 | ) 72 | 73 | await vec.create_embedding_index(IvfflatIndex()) 74 | await vec.drop_embedding_index() 75 | await vec.create_embedding_index(IvfflatIndex(100)) 76 | await vec.drop_embedding_index() 77 | await vec.create_embedding_index(HNSWIndex()) 78 | await vec.drop_embedding_index() 79 | await vec.create_embedding_index(HNSWIndex(20, 125)) 80 | await vec.drop_embedding_index() 81 | await vec.create_embedding_index(DiskAnnIndex()) 82 | await vec.drop_embedding_index() 83 | await vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5, "memory_optimized", 2, 1)) 84 | 85 | rec = await vec.search([1.0, 2.0]) 86 | assert len(rec) == 10 87 | rec = await vec.search([1.0, 2.0], limit=4) 88 | assert len(rec) == 4 89 | rec = await vec.search(limit=4) 90 | assert len(rec) == 4 91 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key2": "val2"}) 92 | assert len(rec) == 1 93 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key2": "does not exist"}) 94 | assert len(rec) == 0 95 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1"}) 96 | assert len(rec) == 1 97 | rec = await vec.search([1.0, 2.0], filter={"key_1": "val_1", "key_2": "val_2"}) 98 | assert len(rec) == 1 99 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1", "key_2": "val_3"}) 100 | assert len(rec) == 0 101 | rec = await vec.search(limit=4, 
filter={"key_1": "val_1", "key_2": "val_3"}) 102 | assert len(rec) == 0 103 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 104 | assert len(rec) == 2 105 | rec = await vec.search(limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 106 | assert len(rec) == 2 107 | 108 | rec = await vec.search( 109 | [1.0, 2.0], 110 | limit=4, 111 | filter=[ 112 | {"key_1": "val_1"}, 113 | {"key2": "val2"}, 114 | {"no such key": "no such val"}, 115 | ], 116 | ) 117 | assert len(rec) == 2 118 | 119 | assert isinstance(rec[0][SEARCH_RESULT_METADATA_IDX], dict) 120 | assert isinstance(rec[0]["metadata"], dict) 121 | assert rec[0]["contents"] == "the brown fox" 122 | 123 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(("key", "val2"))) 124 | assert len(rec) == 1 125 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(("key", "==", "val2"))) 126 | assert len(rec) == 1 127 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key", "==", "val2")) 128 | assert len(rec) == 1 129 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<", 100)) 130 | assert len(rec) == 1 131 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<", 10)) 132 | assert len(rec) == 0 133 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<=", 10)) 134 | assert len(rec) == 1 135 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<=", 10.0)) 136 | assert len(rec) == 1 137 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_11", "<=", 11.3)) 138 | assert len(rec) == 1 139 | rec = await vec.search(limit=4, predicates=Predicates("key_11", ">=", 11.29999)) 140 | assert len(rec) == 1 141 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_11", "<", 11.299999)) 142 | assert len(rec) == 0 143 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [1, 2])) 144 | assert len(rec) == 1 145 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [3, 7])) 146 | assert len(rec) == 0 147 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [42])) 148 | assert len(rec) == 0 149 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [4])) 150 | assert len(rec) == 1 151 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [9, "A"])) 152 | assert len(rec) == 1 153 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", ["A"])) 154 | assert len(rec) == 1 155 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", ("C", "B"))) 156 | assert len(rec) == 1 157 | 158 | rec = await vec.search( 159 | [1.0, 2.0], 160 | limit=4, 161 | predicates=Predicates(*[("key", "val2"), ("key_10", "<", 100)]), 162 | ) 163 | assert len(rec) == 1 164 | rec = await vec.search( 165 | [1.0, 2.0], 166 | limit=4, 167 | predicates=Predicates(("key", "val2"), ("key_10", "<", 100), operator="AND"), 168 | ) 169 | assert len(rec) == 1 170 | rec = await vec.search( 171 | [1.0, 2.0], 172 | limit=4, 173 | predicates=Predicates(("key", "val2"), ("key_2", "val_2"), operator="OR"), 174 | ) 175 | assert len(rec) == 2 176 | rec = await vec.search( 177 | [1.0, 2.0], 178 | limit=4, 179 | predicates=Predicates("key_10", "<", 100) 180 | & ( 181 | Predicates( 182 | "key", 183 | "==", 184 | "val2", 185 | ) 186 | | Predicates("key_2", "==", "val_2") 187 | ), 188 | ) 189 | 
assert len(rec) == 1 190 | rec = await vec.search( 191 | [1.0, 2.0], 192 | limit=4, 193 | predicates=Predicates("key_10", "<", 100) 194 | and (Predicates("key", "==", "val2") or Predicates("key_2", "==", "val_2")), 195 | ) 196 | assert len(rec) == 1 197 | rec = await vec.search( 198 | [1.0, 2.0], 199 | limit=4, 200 | predicates=Predicates("key0", "@>", [6, 7]) and Predicates("key3", "==", 3), 201 | ) 202 | assert len(rec) == 1 203 | rec = await vec.search( 204 | [1.0, 2.0], 205 | limit=4, 206 | predicates=Predicates("key0", "@>", [6, 7]) and Predicates("key3", "==", 6), 207 | ) 208 | assert len(rec) == 0 209 | rec = await vec.search(limit=4, predicates=~Predicates(("key", "val2"), ("key_10", "<", 100))) 210 | assert len(rec) == 4 211 | 212 | raised = False 213 | try: 214 | # can't upsert using both keys and dictionaries 215 | await vec.upsert( 216 | [ 217 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 218 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 219 | ] 220 | ) 221 | except ValueError: 222 | raised = True 223 | assert raised 224 | 225 | raised = False 226 | try: 227 | # can't upsert using both keys and dictionaries opposite order 228 | await vec.upsert( 229 | [ 230 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 231 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 232 | ] 233 | ) 234 | except BaseException: 235 | raised = True 236 | assert raised 237 | 238 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 239 | assert len(rec) == 2 240 | await vec.delete_by_ids([rec[0]["id"]]) 241 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 242 | assert len(rec) == 1 243 | await vec.delete_by_metadata([{"key_1": "val_1"}, {"key2": "val2"}]) 244 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 245 | assert len(rec) == 0 246 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 247 | assert len(rec) == 4 248 | await vec.delete_by_metadata([{"key2": "val"}]) 249 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 250 | assert len(rec) == 0 251 | 252 | assert not await vec.table_is_empty() 253 | await vec.delete_all() 254 | assert await vec.table_is_empty() 255 | 256 | await vec.drop_table() 257 | await vec.close() 258 | 259 | vec = Async(service_url, "data_table", 2, id_type="TEXT") 260 | await vec.create_tables() 261 | empty = await vec.table_is_empty() 262 | assert empty 263 | await vec.upsert([("Not a valid UUID", {"key": "val"}, "the brown fox", [1.0, 1.2])]) 264 | empty = await vec.table_is_empty() 265 | assert not empty 266 | await vec.delete_by_ids(["Not a valid UUID"]) 267 | empty = await vec.table_is_empty() 268 | assert empty 269 | await vec.drop_table() 270 | await vec.close() 271 | 272 | vec = Async(service_url, "data_table", 2, time_partition_interval=timedelta(seconds=60)) 273 | await vec.create_tables() 274 | empty = await vec.table_is_empty() 275 | assert empty 276 | id = uuid.uuid1() 277 | await vec.upsert([(id, {"key": "val"}, "the brown fox", [1.0, 1.2])]) 278 | empty = await vec.table_is_empty() 279 | assert not empty 280 | await vec.delete_by_ids([id]) 281 | empty = await vec.table_is_empty() 282 | assert empty 283 | 284 | raised = False 285 | try: 286 | # can't upsert with uuid type 4 in time partitioned table 287 | await vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 288 | except BaseException: 289 | raised = 
True 290 | assert raised 291 | 292 | specific_datetime = datetime(2018, 8, 10, 15, 30, 0) 293 | await vec.upsert( 294 | [ 295 | # current time 296 | (uuid.uuid1(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 297 | # time in 2018 298 | ( 299 | uuid_from_time(specific_datetime), 300 | {"key": "val"}, 301 | "the brown fox", 302 | [1.0, 1.2], 303 | ), 304 | ] 305 | ) 306 | assert not await vec.table_is_empty() 307 | 308 | # check all the possible ways to specify a date range 309 | async def search_date(start_date: datetime | str | None, end_date: datetime | str | None, expected: int) -> None: 310 | # using uuid_time_filter 311 | rec = await vec.search( 312 | [1.0, 2.0], 313 | limit=4, 314 | uuid_time_filter=UUIDTimeRange(start_date, end_date), 315 | ) 316 | assert len(rec) == expected 317 | rec = await vec.search( 318 | [1.0, 2.0], 319 | limit=4, 320 | uuid_time_filter=UUIDTimeRange(str(start_date), str(end_date)), 321 | ) 322 | assert len(rec) == expected 323 | 324 | # using filters 325 | filter: dict[str, str | datetime] = {} 326 | if start_date is not None: 327 | filter["__start_date"] = start_date 328 | if end_date is not None: 329 | filter["__end_date"] = end_date 330 | rec = await vec.search([1.0, 2.0], limit=4, filter=filter) 331 | assert len(rec) == expected 332 | # using filters with string dates 333 | filter = {} 334 | if start_date is not None: 335 | filter["__start_date"] = str(start_date) 336 | if end_date is not None: 337 | filter["__end_date"] = str(end_date) 338 | rec = await vec.search([1.0, 2.0], limit=4, filter=filter) 339 | assert len(rec) == expected 340 | # using predicates 341 | predicates: list[tuple[str, str, str | datetime]] = [] 342 | if start_date is not None: 343 | predicates.append(("__uuid_timestamp", ">=", start_date)) 344 | if end_date is not None: 345 | predicates.append(("__uuid_timestamp", "<", end_date)) 346 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 347 | assert len(rec) == expected 348 | # using predicates with string dates 349 | predicates = [] 350 | if start_date is not None: 351 | predicates.append(("__uuid_timestamp", ">=", str(start_date))) 352 | if end_date is not None: 353 | predicates.append(("__uuid_timestamp", "<", str(end_date))) 354 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 355 | assert len(rec) == expected 356 | 357 | await search_date( 358 | specific_datetime - timedelta(days=7), 359 | specific_datetime + timedelta(days=7), 360 | 1, 361 | ) 362 | await search_date(specific_datetime - timedelta(days=7), None, 2) 363 | await search_date(None, specific_datetime + timedelta(days=7), 1) 364 | await search_date( 365 | specific_datetime - timedelta(days=7), 366 | specific_datetime - timedelta(days=2), 367 | 0, 368 | ) 369 | 370 | # check timedelta handling 371 | rec = await vec.search( 372 | [1.0, 2.0], 373 | limit=4, 374 | uuid_time_filter=UUIDTimeRange(start_date=specific_datetime, time_delta=timedelta(days=7)), 375 | ) 376 | assert len(rec) == 1 377 | # end is exclusive 378 | rec = await vec.search( 379 | [1.0, 2.0], 380 | limit=4, 381 | uuid_time_filter=UUIDTimeRange(end_date=specific_datetime, time_delta=timedelta(days=7)), 382 | ) 383 | assert len(rec) == 0 384 | rec = await vec.search( 385 | [1.0, 2.0], 386 | limit=4, 387 | uuid_time_filter=UUIDTimeRange( 388 | end_date=specific_datetime + timedelta(seconds=1), 389 | time_delta=timedelta(days=7), 390 | ), 391 | ) 392 | assert len(rec) == 1 393 | rec = await vec.search([1.0, 2.0], limit=4, 
query_params=DiskAnnIndexParams(10, 5)) 394 | assert len(rec) == 2 395 | rec = await vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(100)) 396 | assert len(rec) == 2 397 | await vec.drop_table() 398 | await vec.close() 399 | -------------------------------------------------------------------------------- /nbs/01_pgvectorizer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# PgVectorizer" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "#| default_exp pgvectorizer" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "#| export\n", 27 | "import psycopg2.pool\n", 28 | "from contextlib import contextmanager\n", 29 | "import psycopg2.extras\n", 30 | "import pgvector.psycopg2\n", 31 | "import numpy as np\n", 32 | "import re\n", 33 | "\n", 34 | "from timescale_vector import client" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#| export\n", 44 | "def _create_ident(base: str, suffix: str):\n", 45 | " if len(base) + len(suffix) > 62:\n", 46 | " base = base[:62 - len(suffix)]\n", 47 | " return re.sub(r'[^a-zA-Z0-9_]', '_', f\"{base}_{suffix}\")\n", 48 | "\n", 49 | "class Vectorize:\n", 50 | " def __init__(self,\n", 51 | " service_url: str, \n", 52 | " table_name: str,\n", 53 | " schema_name: str='public',\n", 54 | " id_column_name: str='id', \n", 55 | " work_queue_table_name: str=None, \n", 56 | " trigger_name: str='track_changes_for_embedding', \n", 57 | " trigger_name_fn: str=None) -> None:\n", 58 | " self.service_url = service_url\n", 59 | " self.table_name_unquoted = table_name\n", 60 | " self.schema_name_unquoted = schema_name\n", 61 | " self.table_name = client.QueryBuilder._quote_ident(table_name)\n", 62 | " self.schema_name = client.QueryBuilder._quote_ident(schema_name)\n", 63 | " self.id_column_name = client.QueryBuilder._quote_ident(id_column_name)\n", 64 | " if work_queue_table_name is None:\n", 65 | " work_queue_table_name = _create_ident(table_name, 'embedding_work_queue')\n", 66 | " self.work_queue_table_name = client.QueryBuilder._quote_ident(work_queue_table_name)\n", 67 | " \n", 68 | " self.trigger_name = client.QueryBuilder._quote_ident(trigger_name)\n", 69 | "\n", 70 | " if trigger_name_fn is None:\n", 71 | " trigger_name_fn = _create_ident(table_name, 'wq_for_embedding')\n", 72 | " self.trigger_name_fn = client.QueryBuilder._quote_ident(trigger_name_fn) \n", 73 | "\n", 74 | "\n", 75 | " def register(self): \n", 76 | " with psycopg2.connect(self.service_url) as conn:\n", 77 | " with conn.cursor() as cursor:\n", 78 | " cursor.execute(f\"\"\"\n", 79 | " SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}') is not null; \n", 80 | " \"\"\")\n", 81 | " table_exists = cursor.fetchone()[0]\n", 82 | " if table_exists:\n", 83 | " return\n", 84 | " \n", 85 | " cursor.execute(f\"\"\"\n", 86 | " CREATE TABLE {self.schema_name}.{self.work_queue_table_name} (\n", 87 | " id int\n", 88 | " );\n", 89 | "\n", 90 | " CREATE INDEX ON {self.schema_name}.{self.work_queue_table_name}(id);\n", 91 | "\n", 92 | " CREATE OR REPLACE FUNCTION {self.schema_name}.{self.trigger_name_fn}() RETURNS TRIGGER LANGUAGE PLPGSQL AS $$ \n", 93 | " BEGIN \n", 
94 | " IF (TG_OP = 'DELETE') THEN\n", 95 | " INSERT INTO {self.work_queue_table_name} \n", 96 | " VALUES (OLD.{self.id_column_name});\n", 97 | " ELSE\n", 98 | " INSERT INTO {self.work_queue_table_name} \n", 99 | " VALUES (NEW.{self.id_column_name});\n", 100 | " END IF;\n", 101 | " RETURN NULL;\n", 102 | " END; \n", 103 | " $$;\n", 104 | "\n", 105 | " CREATE TRIGGER {self.trigger_name} \n", 106 | " AFTER INSERT OR UPDATE OR DELETE\n", 107 | " ON {self.schema_name}.{self.table_name} \n", 108 | " FOR EACH ROW EXECUTE PROCEDURE {self.schema_name}.{self.trigger_name_fn}();\n", 109 | "\n", 110 | " INSERT INTO {self.schema_name}.{self.work_queue_table_name} SELECT {self.id_column_name} FROM {self.schema_name}.{self.table_name};\n", 111 | " \"\"\")\n", 112 | "\n", 113 | " def process(self, embed_and_write_cb, batch_size:int=10, autoregister=True):\n", 114 | " if autoregister:\n", 115 | " self.register()\n", 116 | " \n", 117 | " with psycopg2.connect(self.service_url) as conn:\n", 118 | " with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:\n", 119 | " cursor.execute(f\"\"\"\n", 120 | " SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}')::oid; \n", 121 | " \"\"\")\n", 122 | " table_oid = cursor.fetchone()[0]\n", 123 | " \n", 124 | " cursor.execute(f\"\"\"\n", 125 | " WITH selected_rows AS (\n", 126 | " SELECT id\n", 127 | " FROM {self.schema_name}.{self.work_queue_table_name}\n", 128 | " LIMIT {int(batch_size)}\n", 129 | " FOR UPDATE SKIP LOCKED\n", 130 | " ), \n", 131 | " locked_items AS (\n", 132 | " SELECT id, pg_try_advisory_xact_lock({int(table_oid)}, id) AS locked\n", 133 | " FROM (SELECT DISTINCT id FROM selected_rows ORDER BY id) as ids\n", 134 | " ),\n", 135 | " deleted_rows AS (\n", 136 | " DELETE FROM {self.schema_name}.{self.work_queue_table_name}\n", 137 | " WHERE id IN (SELECT id FROM locked_items WHERE locked = true ORDER BY id)\n", 138 | " )\n", 139 | " SELECT locked_items.id as locked_id, {self.table_name}.*\n", 140 | " FROM locked_items\n", 141 | " LEFT JOIN {self.schema_name}.{self.table_name} ON {self.table_name}.{self.id_column_name} = locked_items.id\n", 142 | " WHERE locked = true\n", 143 | " ORDER BY locked_items.id\n", 144 | " \"\"\")\n", 145 | " res = cursor.fetchall()\n", 146 | " if len(res) > 0:\n", 147 | " embed_and_write_cb(res, self)\n", 148 | " return len(res)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "#| hide\n", 158 | "from dotenv import load_dotenv, find_dotenv\n", 159 | "import os" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "_ = load_dotenv(find_dotenv(), override=True)\n", 169 | "service_url = os.environ['TIMESCALE_SERVICE_URL']" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "\n", 179 | "#| hide\n", 180 | "with psycopg2.connect(service_url) as conn:\n", 181 | " with conn.cursor() as cursor:\n", 182 | " for item in ['blog', 'blog_embedding_work_queue', 'blog_embedding']:\n", 183 | " cursor.execute(f\"DROP TABLE IF EXISTS {item};\")\n", 184 | " \n", 185 | " for item in ['public','test']:\n", 186 | " cursor.execute(f\"DROP SCHEMA IF EXISTS {item} CASCADE;\")\n", 187 | " cursor.execute(f\"CREATE SCHEMA {item};\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 
194 | "outputs": [], 195 | "source": [ 196 | "with psycopg2.connect(service_url) as conn:\n", 197 | " with conn.cursor() as cursor:\n", 198 | " cursor.execute('''\n", 199 | " CREATE TABLE IF NOT EXISTS blog (\n", 200 | " id SERIAL PRIMARY KEY NOT NULL,\n", 201 | " title TEXT NOT NULL,\n", 202 | " author TEXT NOT NULL,\n", 203 | " contents TEXT NOT NULL,\n", 204 | " category TEXT NOT NULL,\n", 205 | " published_time TIMESTAMPTZ NULL --NULL if not yet published\n", 206 | " );\n", 207 | " ''')\n", 208 | " cursor.execute('''\n", 209 | " insert into blog (title, author, contents, category, published_time) VALUES ('first', 'mat', 'first_post', 'personal', '2021-01-01');\n", 210 | " ''')\n", 211 | "\n", 212 | "\n", 213 | "vectorizer = Vectorize(service_url, 'blog')\n", 214 | "vectorizer.register()\n", 215 | "# should be idempotent\n", 216 | "vectorizer.register()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "from langchain.docstore.document import Document\n", 226 | "from langchain.text_splitter import CharacterTextSplitter\n", 227 | "from timescale_vector import client\n", 228 | "from langchain_openai import OpenAIEmbeddings\n", 229 | "from langchain_community.vectorstores.timescalevector import TimescaleVector\n", 230 | "from datetime import timedelta" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "def get_document(blog):\n", 240 | " text_splitter = CharacterTextSplitter(\n", 241 | " chunk_size=1000,\n", 242 | " chunk_overlap=200,\n", 243 | " )\n", 244 | " docs = []\n", 245 | " for chunk in text_splitter.split_text(blog['contents']):\n", 246 | " content = f\"Author {blog['author']}, title: {blog['title']}, contents:{chunk}\"\n", 247 | " metadata = {\n", 248 | " \"id\": str(client.uuid_from_time(blog['published_time'])),\n", 249 | " \"blog_id\": blog['id'], \n", 250 | " \"author\": blog['author'], \n", 251 | " \"category\": blog['category'],\n", 252 | " \"published_time\": blog['published_time'].isoformat(),\n", 253 | " }\n", 254 | " docs.append(Document(page_content=content, metadata=metadata))\n", 255 | " return docs\n", 256 | "\n", 257 | "def embed_and_write(blog_instances, vectorizer):\n", 258 | " TABLE_NAME = vectorizer.table_name_unquoted +\"_embedding\"\n", 259 | " embedding = OpenAIEmbeddings()\n", 260 | " vector_store = TimescaleVector(\n", 261 | " collection_name=TABLE_NAME,\n", 262 | " service_url=service_url,\n", 263 | " embedding=embedding,\n", 264 | " time_partition_interval=timedelta(days=30),\n", 265 | " )\n", 266 | "\n", 267 | " # delete old embeddings for all ids in the work queue\n", 268 | " metadata_for_delete = [{\"blog_id\": blog['locked_id']} for blog in blog_instances]\n", 269 | " vector_store.delete_by_metadata(metadata_for_delete)\n", 270 | "\n", 271 | " documents = []\n", 272 | " for blog in blog_instances:\n", 273 | " # skip blogs that are not published yet, or are deleted (will be None because of left join)\n", 274 | " if blog['published_time'] != None:\n", 275 | " documents.extend(get_document(blog))\n", 276 | "\n", 277 | " if len(documents) == 0:\n", 278 | " return\n", 279 | "\n", 280 | " texts = [d.page_content for d in documents]\n", 281 | " metadatas = [d.metadata for d in documents]\n", 282 | " ids = [d.metadata[\"id\"] for d in documents]\n", 283 | " vector_store.add_texts(texts, metadatas, ids)\n", 284 | "\n", 285 | "vectorizer = Vectorize(service_url, 
'blog')\n", 286 | "assert vectorizer.process(embed_and_write) == 1\n", 287 | "assert vectorizer.process(embed_and_write) == 0\n", 288 | "\n", 289 | "TABLE_NAME = \"blog_embedding\"\n", 290 | "embedding = OpenAIEmbeddings()\n", 291 | "vector_store = TimescaleVector(\n", 292 | " collection_name=TABLE_NAME,\n", 293 | " service_url=service_url,\n", 294 | " embedding=embedding,\n", 295 | " time_partition_interval=timedelta(days=30),\n", 296 | ")\n", 297 | "\n", 298 | "res = vector_store.similarity_search_with_score(\"first\", 10)\n", 299 | "assert len(res) == 1\n", 300 | "\n", 301 | "\n", 302 | "with psycopg2.connect(service_url) as conn:\n", 303 | " with conn.cursor() as cursor:\n", 304 | " cursor.execute('''\n", 305 | " insert into blog (title, author, contents, category, published_time) VALUES ('2', 'mat', 'second_post', 'personal', '2021-01-01');\n", 306 | " insert into blog (title, author, contents, category, published_time) VALUES ('3', 'mat', 'third_post', 'personal', '2021-01-01');\n", 307 | " ''')\n", 308 | "assert vectorizer.process(embed_and_write) == 2\n", 309 | "assert vectorizer.process(embed_and_write) == 0\n", 310 | "\n", 311 | "res = vector_store.similarity_search_with_score(\"first\", 10)\n", 312 | "assert len(res) == 3\n", 313 | "\n", 314 | "with psycopg2.connect(service_url) as conn:\n", 315 | " with conn.cursor() as cursor:\n", 316 | " cursor.execute('''\n", 317 | " DELETE FROM blog WHERE title = '3';\n", 318 | " ''')\n", 319 | "assert vectorizer.process(embed_and_write) == 1\n", 320 | "assert vectorizer.process(embed_and_write) == 0\n", 321 | "res = vector_store.similarity_search_with_score(\"first\", 10)\n", 322 | "assert len(res) == 2\n", 323 | "\n", 324 | "res = vector_store.similarity_search_with_score(\"second\", 10)\n", 325 | "assert len(res) == 2\n", 326 | "content = res[0][0].page_content\n", 327 | "assert \"new version\" not in content\n", 328 | "with psycopg2.connect(service_url) as conn:\n", 329 | " with conn.cursor() as cursor:\n", 330 | " cursor.execute('''\n", 331 | " update blog set contents = 'second post new version' WHERE title = '2';\n", 332 | " ''')\n", 333 | "assert vectorizer.process(embed_and_write) == 1\n", 334 | "assert vectorizer.process(embed_and_write) == 0\n", 335 | "res = vector_store.similarity_search_with_score(\"second\", 10)\n", 336 | "assert len(res) == 2\n", 337 | "content = res[0][0].page_content\n", 338 | "assert \"new version\" in content\n", 339 | "\n", 340 | "\n", 341 | "with psycopg2.connect(service_url) as conn:\n", 342 | " with conn.cursor() as cursor:\n", 343 | " cursor.execute('''\n", 344 | " CREATE TABLE IF NOT EXISTS test.blog_table_name_that_is_really_really_long_and_i_mean_long (\n", 345 | " id SERIAL PRIMARY KEY NOT NULL,\n", 346 | " title TEXT NOT NULL,\n", 347 | " author TEXT NOT NULL,\n", 348 | " contents TEXT NOT NULL,\n", 349 | " category TEXT NOT NULL,\n", 350 | " published_time TIMESTAMPTZ NULL --NULL if not yet published\n", 351 | " );\n", 352 | " ''')\n", 353 | " cursor.execute('''\n", 354 | " insert into test.blog_table_name_that_is_really_really_long_and_i_mean_long (title, author, contents, category, published_time) VALUES ('first', 'mat', 'first_post', 'personal', '2021-01-01');\n", 355 | " ''')\n", 356 | "\n", 357 | "vectorizer = Vectorize(service_url, 'blog_table_name_that_is_really_really_long_and_i_mean_long', schema_name='test')\n", 358 | "assert vectorizer.process(embed_and_write) == 1\n", 359 | "assert vectorizer.process(embed_and_write) == 0" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | 
"kernelspec": { 365 | "display_name": "python3", 366 | "language": "python", 367 | "name": "python3" 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 4 372 | } 373 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Timescale Vector 2 | 3 | 4 | 5 | PostgreSQL++ for AI Applications. 6 | 7 | - [Signup for Timescale 8 | Vector](https://console.cloud.timescale.com/signup?utm_campaign=vectorlaunch&utm_source=github&utm_medium=direct): 9 | Get 90 days free to try Timescale Vector on the Timescale cloud data 10 | platform. There is no self-managed version at this time. 11 | - [Documentation](https://timescale.github.io/python-vector/): Learn the 12 | key features of Timescale Vector and how to use them. 13 | - [Getting Started 14 | Tutorial](https://timescale.github.io/python-vector/tsv_python_getting_started_tutorial.html): 15 | Learn how to use Timescale Vector for semantic search on a real-world 16 | dataset. 17 | - [Learn 18 | more](https://www.timescale.com/blog/how-we-made-postgresql-the-best-vector-database/?utm_campaign=vectorlaunch&utm_source=github&utm_medium=direct): 19 | Learn more about Timescale Vector, how it works and why we built it. 20 | 21 | If you prefer to use an LLM development or data framework, see Timescale 22 | Vector’s integrations with 23 | [LangChain](https://python.langchain.com/docs/integrations/vectorstores/timescalevector) 24 | and 25 | [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/vector_stores/Timescalevector.html) 26 | 27 | ## Install 28 | 29 | To install the main library use: 30 | 31 | ``` sh 32 | pip install timescale_vector 33 | ``` 34 | 35 | We also use `dotenv` in our examples for passing around secrets and 36 | keys. You can install that with: 37 | 38 | ``` sh 39 | pip install python-dotenv 40 | ``` 41 | 42 | If you run into installation errors related to the psycopg2 package, you 43 | will need to install some prerequisites. The timescale-vector package 44 | explicitly depends on psycopg2 (the non-binary version). This adheres to 45 | [the advice provided by 46 | psycopg2](https://www.psycopg.org/docs/install.html#psycopg-vs-psycopg-binary). 47 | Building psycopg from source [requires a few prerequisites to be 48 | installed](https://www.psycopg.org/docs/install.html#build-prerequisites). 49 | Make sure these are installed before trying to 50 | `pip install timescale_vector`. 51 | 52 | ## Basic usage 53 | 54 | First, import all the necessary libraries: 55 | 56 | ``` python 57 | from dotenv import load_dotenv, find_dotenv 58 | import os 59 | from timescale_vector import client 60 | import uuid 61 | from datetime import datetime, timedelta 62 | ``` 63 | 64 | Load up your PostgreSQL credentials. Safest way is with a .env file: 65 | 66 | ``` python 67 | _ = load_dotenv(find_dotenv(), override=True) 68 | service_url = os.environ['TIMESCALE_SERVICE_URL'] 69 | ``` 70 | 71 | Next, create the client. In this tutorial, we will use the sync client. 72 | But we have an async client as well (with an identical interface that 73 | uses async functions). 74 | 75 | The client constructor takes three required arguments: 76 | 77 | | name | description | 78 | |----------------|-------------------------------------------------------------------------------------------| 79 | | service_url | Timescale service URL / connection string | 80 | | table_name | Name of the table to use for storing the embeddings. 
Think of this as the collection name | 81 | | num_dimensions | Number of dimensions in the vector | 82 | 83 | You can also specify the schema name, distance type, primary key type, 84 | etc. as optional parameters. Please see the documentation for details. 85 | 86 | ``` python 87 | vec = client.Sync(service_url, "my_data", 2) 88 | ``` 89 | 90 | Next, create the tables for the collection: 91 | 92 | ``` python 93 | vec.create_tables() 94 | ``` 95 | 96 | Next, insert some data. The data record contains: 97 | 98 | - A UUID to uniquely identify the embedding 99 | - A JSON blob of metadata about the embedding 100 | - The text the embedding represents 101 | - The embedding itself 102 | 103 | Because this data includes UUIDs which become primary keys, we ingest 104 | with upserts. 105 | 106 | ``` python 107 | vec.upsert([\ 108 | (uuid.uuid1(), {"animal": "fox"}, "the brown fox", [1.0,1.3]),\ 109 | (uuid.uuid1(), {"animal": "fox", "action":"jump"}, "jumped over the", [1.0,10.8]),\ 110 | ]) 111 | ``` 112 | 113 | You can now create a vector index to speed up similarity search: 114 | 115 | ``` python 116 | vec.create_embedding_index(client.DiskAnnIndex()) 117 | ``` 118 | 119 | Now, you can query for similar items: 120 | 121 | ``` python 122 | vec.search([1.0, 9.0]) 123 | ``` 124 | 125 | [[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), 126 | {'action': 'jump', 'animal': 'fox'}, 127 | 'jumped over the', 128 | array([ 1. , 10.8], dtype=float32), 129 | 0.00016793422934946456], 130 | [UUID('4494c12c-4a0d-11ef-94a3-6ee10b77fd09'), 131 | {'animal': 'fox'}, 132 | 'the brown fox', 133 | array([1. , 1.3], dtype=float32), 134 | 0.14489260377438218]] 135 | 136 | There are many search options which we will cover below in the 137 | `Advanced search` section. 138 | 139 | As one example, we will return one item using a similarity search 140 | constrained by a metadata filter. 141 | 142 | ``` python 143 | vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) 144 | ``` 145 | 146 | [[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), 147 | {'action': 'jump', 'animal': 'fox'}, 148 | 'jumped over the', 149 | array([ 1. , 10.8], dtype=float32), 150 | 0.00016793422934946456]] 151 | 152 | The returned records contain 5 fields: 153 | 154 | | name | description | 155 | |-----------|---------------------------------------------------------| 156 | | id | The UUID of the record | 157 | | metadata | The JSON metadata associated with the record | 158 | | contents | the text content that was embedded | 159 | | embedding | The vector embedding | 160 | | distance | The distance between the query embedding and the vector | 161 | 162 | You can access the fields by simply using the record as a dictionary 163 | keyed on the field name: 164 | 165 | ``` python 166 | records = vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) 167 | (records[0]["id"],records[0]["metadata"], records[0]["contents"], records[0]["embedding"], records[0]["distance"]) 168 | ``` 169 | 170 | (UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), 171 | {'action': 'jump', 'animal': 'fox'}, 172 | 'jumped over the', 173 | array([ 1. 
, 10.8], dtype=float32), 174 | 0.00016793422934946456) 175 | 176 | You can delete by ID: 177 | 178 | ``` python 179 | vec.delete_by_ids([records[0]["id"]]) 180 | ``` 181 | 182 | Or you can delete by metadata filters: 183 | 184 | ``` python 185 | vec.delete_by_metadata({"action": "jump"}) 186 | ``` 187 | 188 | To delete all records use: 189 | 190 | ``` python 191 | vec.delete_all() 192 | ``` 193 | 194 | ## Advanced usage 195 | 196 | In this section, we will go into more detail about our feature. We will 197 | cover: 198 | 199 | 1. Search filter options - how to narrow your search by additional 200 | constraints 201 | 2. Indexing - how to speed up your similarity queries 202 | 3. Time-based partitioning - how to optimize similarity queries that 203 | filter on time 204 | 4. Setting different distance types to use in distance calculations 205 | 206 | ### Search options 207 | 208 | The `search` function is very versatile and allows you to search for the 209 | right vector in a wide variety of ways. We’ll describe the search option 210 | in 3 parts: 211 | 212 | 1. We’ll cover basic similarity search. 213 | 2. Then, we’ll describe how to filter your search based on the 214 | associated metadata. 215 | 3. Finally, we’ll talk about filtering on time when time-partitioning 216 | is enabled. 217 | 218 | Let’s use the following data for our example: 219 | 220 | ``` python 221 | vec.upsert([\ 222 | (uuid.uuid1(), {"animal":"fox", "action": "sit", "times":1}, "the brown fox", [1.0,1.3]),\ 223 | (uuid.uuid1(), {"animal":"fox", "action": "jump", "times":100}, "jumped over the", [1.0,10.8]),\ 224 | ]) 225 | ``` 226 | 227 | The basic query looks like: 228 | 229 | ``` python 230 | vec.search([1.0, 9.0]) 231 | ``` 232 | 233 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 234 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 235 | 'jumped over the', 236 | array([ 1. , 10.8], dtype=float32), 237 | 0.00016793422934946456], 238 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 239 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 240 | 'the brown fox', 241 | array([1. , 1.3], dtype=float32), 242 | 0.14489260377438218]] 243 | 244 | You could provide a limit for the number of items returned: 245 | 246 | ``` python 247 | vec.search([1.0, 9.0], limit=1) 248 | ``` 249 | 250 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 251 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 252 | 'jumped over the', 253 | array([ 1. , 10.8], dtype=float32), 254 | 0.00016793422934946456]] 255 | 256 | #### Narrowing your search by metadata 257 | 258 | We have two main ways to filter results by metadata: - `filters` for 259 | equality matches on metadata. - `predicates` for complex conditions on 260 | metadata. 261 | 262 | Filters are more likely to be performant but are more limited in what 263 | they can express, so we suggest using those if your use case allows it. 264 | 265 | ##### Filters 266 | 267 | You could specify a match on the metadata as a dictionary where all keys 268 | have to match the provided values (keys not in the filter are 269 | unconstrained): 270 | 271 | ``` python 272 | vec.search([1.0, 9.0], limit=1, filter={"action": "sit"}) 273 | ``` 274 | 275 | [[UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 276 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 277 | 'the brown fox', 278 | array([1. 
, 1.3], dtype=float32), 279 | 0.14489260377438218]] 280 | 281 | You can also specify a list of filter dictionaries, where an item is 282 | returned if it matches any dict: 283 | 284 | ``` python 285 | vec.search([1.0, 9.0], limit=2, filter=[{"action": "jump"}, {"animal": "fox"}]) 286 | ``` 287 | 288 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 289 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 290 | 'jumped over the', 291 | array([ 1. , 10.8], dtype=float32), 292 | 0.00016793422934946456], 293 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 294 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 295 | 'the brown fox', 296 | array([1. , 1.3], dtype=float32), 297 | 0.14489260377438218]] 298 | 299 | ##### Predicates 300 | 301 | Predicates allow for more complex search conditions. For example, you 302 | could use greater than and less than conditions on numeric values. 303 | 304 | ``` python 305 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("times", ">", 1)) 306 | ``` 307 | 308 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 309 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 310 | 'jumped over the', 311 | array([ 1. , 10.8], dtype=float32), 312 | 0.00016793422934946456]] 313 | 314 | [`Predicates`](https://timescale.github.io/python-vector/vector.html#predicates) 315 | objects are defined by the name of the metadata key, an operator, and a 316 | value. 317 | 318 | The supported operators are: `==`, `!=`, `<`, `<=`, `>`, `>=` 319 | 320 | The type of the values determines the type of comparison to perform. For 321 | example, passing in `"Sam"` (a string) will do a string comparison while 322 | a `10` (an int) will perform an integer comparison while a `10.0` 323 | (float) will do a float comparison. It is important to note that using a 324 | value of `"10"` will do a string comparison as well so it’s important to 325 | use the right type. Supported Python types are: `str`, `int`, and 326 | `float`. One more example with a string comparison: 327 | 328 | ``` python 329 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump")) 330 | ``` 331 | 332 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 333 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 334 | 'jumped over the', 335 | array([ 1. , 10.8], dtype=float32), 336 | 0.00016793422934946456]] 337 | 338 | The real power of predicates is that they can also be combined using the 339 | `&` operator (for combining predicates with AND semantics) and `|`(for 340 | combining using OR semantic). So you can do: 341 | 342 | ``` python 343 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump") & client.Predicates("times", ">", 1)) 344 | ``` 345 | 346 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 347 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 348 | 'jumped over the', 349 | array([ 1. 
, 10.8], dtype=float32), 350 | 0.00016793422934946456]] 351 | 352 | Just for sanity, let’s show a case where no results are returned because 353 | of our predicates: 354 | 355 | ``` python 356 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump") & client.Predicates("times", "==", 1)) 357 | ``` 358 | 359 | [] 360 | 361 | And one more example where we define the predicates as a variable and 362 | use grouping with parentheses: 363 | 364 | ``` python 365 | my_predicates = client.Predicates("action", "==", "jump") & (client.Predicates("times", "==", 1) | client.Predicates("times", ">", 1)) 366 | vec.search([1.0, 9.0], limit=2, predicates=my_predicates) 367 | ``` 368 | 369 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 370 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 371 | 'jumped over the', 372 | array([ 1. , 10.8], dtype=float32), 373 | 0.00016793422934946456]] 374 | 375 | We also have some syntactic sugar for combining many predicates with AND 376 | semantics. You can pass in multiple 3-tuples to 377 | [`Predicates`](https://timescale.github.io/python-vector/vector.html#predicates): 378 | 379 | ``` python 380 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates(("action", "==", "jump"), ("times", ">", 10))) 381 | ``` 382 | 383 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 384 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 385 | 'jumped over the', 386 | array([ 1. , 10.8], dtype=float32), 387 | 0.00016793422934946456]] 388 | 389 | #### Filter your search by time 390 | 391 | When using `time-partitioning` (see below), you can very efficiently 392 | filter your search by time. Time-partitioning embeds a timestamp in the 393 | UUID-based ID associated with an embedding. Let us first 394 | create a collection with time partitioning and insert some data (one 395 | item from January 2018 and another in January 2019): 396 | 397 | ``` python 398 | tpvec = client.Sync(service_url, "time_partitioned_table", 2, time_partition_interval=timedelta(hours=6)) 399 | tpvec.create_tables() 400 | 401 | specific_datetime = datetime(2018, 1, 1, 12, 0, 0) 402 | tpvec.upsert([\ 403 | (client.uuid_from_time(specific_datetime), {"animal":"fox", "action": "sit", "times":1}, "the brown fox", [1.0,1.3]),\ 404 | (client.uuid_from_time(specific_datetime+timedelta(days=365)), {"animal":"fox", "action": "jump", "times":100}, "jumped over the", [1.0,10.8]),\ 405 | ]) 406 | ``` 407 | 408 | Then, you can filter using the timestamps by specifying a 409 | `uuid_time_filter`: 410 | 411 | ``` python 412 | tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime, specific_datetime+timedelta(days=1))) 413 | ``` 414 | 415 | [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 416 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 417 | 'the brown fox', 418 | array([1. , 1.3], dtype=float32), 419 | 0.14489260377438218]] 420 | 421 | A 422 | [`UUIDTimeRange`](https://timescale.github.io/python-vector/vector.html#uuidtimerange) 423 | can specify a start_date or end_date or both (as in the example above). 424 | Specifying only the start_date or end_date leaves the other end 425 | unconstrained. 426 | 427 | ``` python 428 | tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime)) 429 | ``` 430 | 431 | [[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'), 432 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 433 | 'jumped over the', 434 | array([ 1. 
, 10.8], dtype=float32), 435 | 0.00016793422934946456], 436 | [UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 437 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 438 | 'the brown fox', 439 | array([1. , 1.3], dtype=float32), 440 | 0.14489260377438218]] 441 | 442 | You have the option to define the inclusivity of the start and end dates 443 | with the `start_inclusive` and `end_inclusive` parameters. Setting 444 | `start_inclusive` to true results in comparisons using the `>=` 445 | operator, whereas setting it to false applies the `>` operator. By 446 | default, the start date is inclusive, while the end date is exclusive. 447 | One example: 448 | 449 | ``` python 450 | tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime, start_inclusive=False)) 451 | ``` 452 | 453 | [[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'), 454 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 455 | 'jumped over the', 456 | array([ 1. , 10.8], dtype=float32), 457 | 0.00016793422934946456]] 458 | 459 | Notice how the results are different when we use the 460 | `start_inclusive=False` option because the first row has the exact 461 | timestamp specified by `start_date`. 462 | 463 | We’ve also made it easy to integrate time filters using the `filter` and 464 | `predicates` parameters described above using special reserved key names 465 | to make it appear that the timestamps are part of your metadata. We 466 | found this useful when integrating with other systems that just want to 467 | specify a set of filters (often these are “auto retriever” type 468 | systems). The reserved key names are `__start_date` and `__end_date` for 469 | filters and `__uuid_timestamp` for predicates. Some examples below: 470 | 471 | ``` python 472 | tpvec.search([1.0, 9.0], limit=4, filter={ "__start_date": specific_datetime, "__end_date": specific_datetime+timedelta(days=1)}) 473 | ``` 474 | 475 | [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 476 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 477 | 'the brown fox', 478 | array([1. , 1.3], dtype=float32), 479 | 0.14489260377438218]] 480 | 481 | ``` python 482 | tpvec.search([1.0, 9.0], limit=4, 483 | predicates=client.Predicates("__uuid_timestamp", ">=", specific_datetime) & client.Predicates("__uuid_timestamp", "<", specific_datetime+timedelta(days=1))) 484 | ``` 485 | 486 | [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 487 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 488 | 'the brown fox', 489 | array([1. , 1.3], dtype=float32), 490 | 0.14489260377438218]] 491 | 492 | ### Indexing 493 | 494 | Indexing speeds up queries over your data. By default, we set up indexes 495 | to query your data by the UUID and the metadata. 496 | 497 | But to speed up similarity search based on the embeddings, you have to 498 | create additional indexes. 499 | 500 | Note that if performing a query without an index, you will always get an 501 | exact result, but the query will be slow (it has to read all of the data 502 | you store for every query). With an index, your queries will be 503 | order-of-magnitude faster, but the results are approximate (because 504 | there are no known indexing techniques that are exact). 505 | 506 | Nevertheless, there are excellent approximate algorithms. There are 3 507 | different indexing algorithms available on the Timescale platform: 508 | Timescale Vector index, pgvector HNSW, and pgvector ivfflat. 
Below are 509 | the trade-offs between these algorithms: 510 | 511 | | Algorithm | Build speed | Query speed | Need to rebuild after updates | 512 | |------------------|-------------|-------------|-------------------------------| 513 | | StreamingDiskANN | Fast | Fastest | No | 514 | | pgvector hnsw | Slowest | Faster | No | 515 | | pgvector ivfflat | Fastest | Slowest | Yes | 516 | 517 | You can see 518 | [benchmarks](https://www.timescale.com/blog/how-we-made-postgresql-the-best-vector-database/) 519 | on our blog. 520 | 521 | We recommend using the Timescale Vector index for most use cases. This 522 | can be created with: 523 | 524 | ``` python 525 | vec.create_embedding_index(client.DiskAnnIndex()) 526 | ``` 527 | 528 | Indexes are created for a particular distance metric type. So it is 529 | important that the same distance metric is set on the client during 530 | index creation as it is during queries. See the `distance type` section 531 | below. 532 | 533 | Each of these indexes has a set of build-time options for controlling 534 | the speed/accuracy trade-off when creating the index and an additional 535 | query-time option for controlling accuracy during a particular query. We 536 | have smart defaults for all of these options but will also describe the 537 | details below so that you can adjust these options manually. 538 | 539 | #### StreamingDiskANN index 540 | 541 | The StreamingDiskANN index from pgvectorscale is a graph-based algorithm 542 | that uses the [DiskANN](https://github.com/microsoft/DiskANN) algorithm. 543 | You can read more about it on our 544 | [blog](https://www.timescale.com/blog/how-we-made-postgresql-as-fast-as-pinecone-for-vector-data/) 545 | announcing its release. 546 | 547 | To create this index, run: 548 | 549 | ``` python 550 | vec.create_embedding_index(client.DiskAnnIndex()) 551 | ``` 552 | 553 | The above command will create the index using smart defaults. There are 554 | a number of parameters you could tune to adjust the accuracy/speed 555 | trade-off. 556 | 557 | The parameters you can set at index build time are: 558 | 559 | | Parameter name | Description | Default value | 560 | |--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------| 561 | | `storage_layout` | `memory_optimized` which uses SBQ to compress vector data or `plain` which stores data uncompressed | memory_optimized | 562 | | `num_neighbors` | Sets the maximum number of neighbors per node. Higher values increase accuracy but make the graph traversal slower. | 50 | 563 | | `search_list_size` | This is the S parameter used in the greedy search algorithm used during construction. Higher values improve graph quality at the cost of slower index builds. | 100 | 564 | | `max_alpha` | Is the alpha parameter in the algorithm. Higher values improve graph quality at the cost of slower index builds. | 1.2 | 565 | | `num_dimensions` | The number of dimensions to index. By default, all dimensions are indexed. 
But you can also index fewer dimensions to make use of [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) | 0 (all dimensions) | 566 | | `num_bits_per_dimension` | Number of bits used to encode each dimension when using SBQ | 2 for less than 900 dimensions, 1 otherwise | 567 | 568 | To set these parameters, you could run: 569 | 570 | ``` python 571 | vec.create_embedding_index(client.DiskAnnIndex(num_neighbors=50, search_list_size=100, max_alpha=1.0, storage_layout="memory_optimized", num_dimensions=0, num_bits_per_dimension=1)) 572 | ``` 573 | 574 | You can also set a parameter to control the accuracy vs. query speed 575 | trade-off at query time. The parameter is set in the `search()` function 576 | using the `query_params` argument. 577 | 578 | | Parameter name | Description | Default value | 579 | |--------------------|-------------------------------------------------------------------------|---------------| 580 | | `search_list_size` | The number of additional candidates considered during the graph search. | 100 | 581 | | `rescore` | The number of elements rescored (0 to disable rescoring) | 50 | 582 | 583 | We suggest using the `rescore` parameter to fine-tune accuracy. 584 | 585 | ``` python 586 | vec.search([1.0, 9.0], limit=4, query_params=client.DiskAnnIndexParams(rescore=400, search_list_size=10)) 587 | ``` 588 | 589 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 590 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 591 | 'jumped over the', 592 | array([ 1. , 10.8], dtype=float32), 593 | 0.00016793422934946456], 594 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 595 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 596 | 'the brown fox', 597 | array([1. , 1.3], dtype=float32), 598 | 0.14489260377438218]] 599 | 600 | To drop the index, run: 601 | 602 | ``` python 603 | vec.drop_embedding_index() 604 | ``` 605 | 606 | #### pgvector HNSW index 607 | 608 | Pgvector provides a graph-based indexing algorithm based on the popular 609 | [HNSW algorithm](https://arxiv.org/abs/1603.09320). 610 | 611 | To create this index, run: 612 | 613 | ``` python 614 | vec.create_embedding_index(client.HNSWIndex()) 615 | ``` 616 | 617 | The above command will create the index using smart defaults. There are 618 | a number of parameters you could tune to adjust the accuracy/speed 619 | trade-off. 620 | 621 | The parameters you can set at index build time are: 622 | 623 | | Parameter name | Description | Default value | 624 | |-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| 625 | | m | Represents the maximum number of connections per layer. Think of these connections as edges created for each node during graph construction. Increasing m increases accuracy but also increases index build time and size. | 16 | 626 | | ef_construction | Represents the size of the dynamic candidate list for constructing the graph. It influences the trade-off between index quality and construction speed. Increasing ef_construction enables more accurate search results at the expense of lengthier index build times. | 64 | 627 | 628 | To set these parameters, you could run: 629 | 630 | ``` python 631 | vec.create_embedding_index(client.HNSWIndex(m=16, ef_construction=64)) 632 | ``` 633 | 634 | You can also set a parameter to control the accuracy vs. 
query speed 635 | trade-off at query time. The parameter is set in the `search()` function 636 | using the `query_params` argument. You can set the `ef_search` (default: 637 | 40). This parameter specifies the size of the dynamic candidate list 638 | used during search. Higher values improve query accuracy while making 639 | the query slower. 640 | 641 | You can specify this value during search as follows: 642 | 643 | ``` python 644 | vec.search([1.0, 9.0], limit=4, query_params=client.HNSWIndexParams(ef_search=10)) 645 | ``` 646 | 647 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 648 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 649 | 'jumped over the', 650 | array([ 1. , 10.8], dtype=float32), 651 | 0.00016793422934946456], 652 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 653 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 654 | 'the brown fox', 655 | array([1. , 1.3], dtype=float32), 656 | 0.14489260377438218]] 657 | 658 | To drop the index, run: 659 | 660 | ``` python 661 | vec.drop_embedding_index() 662 | ``` 663 | 664 | #### pgvector ivfflat index 665 | 666 | Pgvector provides a clustering-based indexing algorithm. Our [blog 667 | post](https://www.timescale.com/blog/nearest-neighbor-indexes-what-are-ivfflat-indexes-in-pgvector-and-how-do-they-work/) 668 | describes how it works in detail. It provides the fastest index-build 669 | speed but the slowest query speeds of any indexing algorithm. 670 | 671 | To create this index, run: 672 | 673 | ``` python 674 | vec.create_embedding_index(client.IvfflatIndex()) 675 | ``` 676 | 677 | Note: *ivfflat should never be created on empty tables* because it needs 678 | to cluster data, and that only happens when an index is first created, 679 | not when new rows are inserted or modified. Also, if your table 680 | undergoes a lot of modifications, you will need to rebuild this index 681 | occasionally to maintain good accuracy. See our [blog 682 | post](https://www.timescale.com/blog/nearest-neighbor-indexes-what-are-ivfflat-indexes-in-pgvector-and-how-do-they-work/) 683 | for details. 684 | 685 | Pgvector ivfflat has a `lists` index parameter that is automatically set 686 | with a smart default based on the number of rows in your table. If you 687 | know that you’ll have a different table size, you can specify the number 688 | of records to use for calculating the `lists` parameter as follows: 689 | 690 | ``` python 691 | vec.create_embedding_index(client.IvfflatIndex(num_records=1000000)) 692 | ``` 693 | 694 | You can also set the `lists` parameter directly: 695 | 696 | ``` python 697 | vec.create_embedding_index(client.IvfflatIndex(num_lists=100)) 698 | ``` 699 | 700 | You can also set a parameter to control the accuracy vs. query speed 701 | trade-off at query time. The parameter is set in the `search()` function 702 | using the `query_params` argument. You can set the `probes` parameter. This 703 | parameter specifies the number of clusters searched during a query. It 704 | is recommended to set this parameter to `sqrt(lists)` where lists is the 705 | `num_lists` parameter used above during index creation. Higher values 706 | improve query accuracy while making the query slower. 707 | 708 | You can specify this value during search as follows: 709 | 710 | ``` python 711 | vec.search([1.0, 9.0], limit=4, query_params=client.IvfflatIndexParams(probes=10)) 712 | ``` 713 | 714 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 715 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 716 | 'jumped over the', 717 | array([ 1. 
You can specify this value during search as follows:

``` python
vec.search([1.0, 9.0], limit=4, query_params=client.IvfflatIndexParams(probes=10))
```

    [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),
      {'times': 100, 'action': 'jump', 'animal': 'fox'},
      'jumped over the',
      array([ 1. , 10.8], dtype=float32),
      0.00016793422934946456],
     [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),
      {'times': 1, 'action': 'sit', 'animal': 'fox'},
      'the brown fox',
      array([1. , 1.3], dtype=float32),
      0.14489260377438218]]

To drop the index, run:

``` python
vec.drop_embedding_index()
```

### Time partitioning

In many use cases where you have many embeddings, time is an important
component associated with the embeddings. For example, when embedding
news stories, you often search by time as well as similarity (e.g.,
stories related to Bitcoin in the past week or stories about Clinton in
November 2016).

Yet, traditionally, searching by the two components “similarity” and “time”
is challenging for Approximate Nearest Neighbor (ANN) indexes and makes
the similarity-search index less effective.

One approach to solving this is partitioning the data by time and
creating ANN indexes on each partition individually. Then, during
search, you can:

- Step 1: filter out partitions that don’t match the time predicate.
- Step 2: perform the similarity search on all matching partitions.
- Step 3: combine all the results from each partition in step 2, rerank,
  and filter out results by time.

Step 1 makes the search a lot more efficient by filtering out whole
swaths of data in one go.

Timescale-vector supports time partitioning using TimescaleDB’s
hypertables. To use this feature, simply indicate the length of time for
each partition when creating the client:

``` python
from datetime import timedelta
from datetime import datetime
```

``` python
vec = client.Async(service_url, "my_data_with_time_partition", 2, time_partition_interval=timedelta(hours=6))
await vec.create_tables()
```

Then, insert data where the IDs are version 1 UUIDs, so that the time
component of each UUID specifies the time of the embedding. For example,
to create an embedding for the current time, simply do:

``` python
id = uuid.uuid1()
await vec.upsert([(id, {"key": "val"}, "the brown fox", [1.0, 1.2])])
```

To insert data for a specific time in the past, create the UUID using
our
[`uuid_from_time`](https://timescale.github.io/python-vector/vector.html#uuid_from_time)
function:

``` python
specific_datetime = datetime(2018, 8, 10, 15, 30, 0)
await vec.upsert([(client.uuid_from_time(specific_datetime), {"key": "val"}, "the brown fox", [1.0, 1.2])])
```

You can then query the data by specifying a `uuid_time_filter` in the
search call:

``` python
rec = await vec.search([1.0, 2.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime-timedelta(days=7), specific_datetime+timedelta(days=7)))
```
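
For instance, to mirror the “stories from the past week” use case described
above, you can anchor the range to the current time. This is a sketch that
reuses the same `vec` client and `UUIDTimeRange` filter shown above:

``` python
# Only partitions overlapping the last 7 days need to be searched.
now = datetime.now()
recent = await vec.search(
    [1.0, 2.0],
    limit=4,
    uuid_time_filter=client.UUIDTimeRange(now - timedelta(days=7), now),
)
```
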
### Distance metrics

By default, we use cosine distance to measure how similar an embedding
is to a given query. In addition to cosine distance, we also support
Euclidean/L2 distance. The distance type is set when creating the client
using the `distance_type` parameter. For example, to use the Euclidean
distance metric, you can create the client with:

``` python
vec = client.Sync(service_url, "my_data", 2, distance_type="euclidean")
```

Valid values for `distance_type` are `cosine` and `euclidean`.

It is important to note that you should use consistent distance types on
the clients that create indexes and the clients that perform queries,
because an index is only valid for one particular type of distance
measure.

Please note that the Timescale Vector index only supports cosine distance
at this time.

# LangChain integration

[LangChain](https://www.langchain.com/) is a popular framework for
developing applications powered by LLMs. Timescale Vector has a native
LangChain integration, enabling you to use Timescale Vector as a
vectorstore and leverage all its capabilities in your applications built
with LangChain. A short usage sketch follows the list of resources below.

Here are resources about using Timescale Vector with LangChain:

- [Getting started with LangChain and Timescale
  Vector](https://python.langchain.com/docs/integrations/vectorstores/timescalevector):
  You’ll learn how to use Timescale Vector for (1) semantic search, (2)
  time-based vector search, (3) self-querying, and (4) how to create
  indexes to speed up queries.
- [PostgreSQL Self
  Querying](https://python.langchain.com/docs/integrations/retrievers/self_query/timescalevector_self_query):
  Learn how to use Timescale Vector with self-querying in LangChain.
- [LangChain template: RAG with conversational
  retrieval](https://github.com/langchain-ai/langchain/tree/master/templates/rag-timescale-conversation):
  This template is used for conversational retrieval, which is one of
  the most popular LLM use-cases. It passes both a conversation history
  and retrieved documents into an LLM for synthesis.
- [LangChain template: RAG with time-based search and self-query
  retrieval](https://github.com/langchain-ai/langchain/tree/master/templates/rag-timescale-hybrid-search-time):
  This template shows how to use timescale-vector with the self-query
  retriever to perform hybrid search on similarity and time. This is
  useful any time your data has a strong time-based component.
- [Learn more about Timescale Vector and
  LangChain](https://blog.langchain.dev/timescale-vector-x-langchain-making-postgresql-a-better-vector-database-for-ai-applications/)
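
As a rough sketch of basic usage (this assumes the `langchain-openai` and
`langchain-community` packages are installed, an OpenAI API key is
configured, `service_url` points at your database, and the collection name
is just a placeholder; see the getting-started guide above for the
authoritative walkthrough):

``` python
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.timescalevector import TimescaleVector

# Create (or connect to) a collection backed by Timescale Vector.
vector_store = TimescaleVector(
    collection_name="my_langchain_collection",  # placeholder name
    service_url=service_url,
    embedding=OpenAIEmbeddings(),
)

# Add a couple of documents and run a similarity search.
vector_store.add_texts(["the brown fox", "jumped over the"])
results = vector_store.similarity_search_with_score("quick fox", k=2)
```
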
# LlamaIndex integration

LlamaIndex is a popular data framework for connecting custom data
sources to large language models (LLMs). Timescale Vector has a native
LlamaIndex integration, enabling you to use Timescale Vector as a
vectorstore and leverage all its capabilities in your applications built
with LlamaIndex.

Here are resources about using Timescale Vector with LlamaIndex:

- [Getting started with LlamaIndex and Timescale
  Vector](https://docs.llamaindex.ai/en/stable/examples/vector_stores/Timescalevector.html):
  You’ll learn how to use Timescale Vector for (1) similarity
  search, (2) time-based vector search, (3) faster search with indexes,
  and (4) retrieval and query engine.
- [Time-based
  retrieval](https://youtu.be/EYMZVfKcRzM?si=I0H3uUPgzKbQw__W): Learn
  how to power RAG applications with time-based retrieval.
- [Llama Pack: Auto Retrieval with time-based
  search](https://github.com/run-llama/llama-hub/tree/main/llama_hub/llama_packs/timescale_vector_autoretrieval):
  This pack demonstrates performing auto-retrieval for hybrid search
  based on both similarity and time, using the timescale-vector
  (PostgreSQL) vectorstore.
- [Learn more about Timescale Vector and
  LlamaIndex](https://www.timescale.com/blog/timescale-vector-x-llamaindex-making-postgresql-a-better-vector-database-for-ai-applications/)

# PgVectorize

PgVectorize enables you to create vector embeddings from any data that
you already have stored in PostgreSQL. You can get more background
information in our [blog
post](https://www.timescale.com/blog/a-complete-guide-to-creating-and-storing-embeddings-for-postgresql-data/)
announcing this feature, as well as a [“how we built
it”](https://www.timescale.com/blog/how-we-designed-a-resilient-vector-embedding-creation-system-for-postgresql-data/)
post going into the details of the design.

To create vector embeddings, simply attach PgVectorize to any PostgreSQL
table, and it will automatically sync that table’s data with a set of
embeddings stored in Timescale Vector. For example, let’s say you have a
blog table defined in the following way:

``` python
import psycopg2
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from timescale_vector import client, pgvectorizer
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.timescalevector import TimescaleVector
from datetime import timedelta
```

``` python
with psycopg2.connect(service_url) as conn:
    with conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS blog (
                id              SERIAL PRIMARY KEY NOT NULL,
                title           TEXT NOT NULL,
                author          TEXT NOT NULL,
                contents        TEXT NOT NULL,
                category        TEXT NOT NULL,
                published_time  TIMESTAMPTZ NULL --NULL if not yet published
            );
        ''')
```

You can insert some data as follows:

``` python
with psycopg2.connect(service_url) as conn:
    with conn.cursor() as cursor:
        cursor.execute('''
            INSERT INTO blog (title, author, contents, category, published_time)
            VALUES ('First Post', 'Matvey Arye', 'some super interesting content about cats.', 'AI', '2021-01-01');
        ''')
```

Now, say you want to embed these blogs in Timescale Vector. First, you
need to define an `embed_and_write` function that takes a set of blog
posts, creates the embeddings, and writes them into TimescaleVector. For
example, if using LangChain, it could look something like the following.
``` python
def get_document(blog):
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    docs = []
    for chunk in text_splitter.split_text(blog['contents']):
        content = f"Author {blog['author']}, title: {blog['title']}, contents:{chunk}"
        metadata = {
            "id": str(client.uuid_from_time(blog['published_time'])),
            "blog_id": blog['id'],
            "author": blog['author'],
            "category": blog['category'],
            "published_time": blog['published_time'].isoformat(),
        }
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

def embed_and_write(blog_instances, vectorizer):
    embedding = OpenAIEmbeddings()
    vector_store = TimescaleVector(
        collection_name="blog_embedding",
        service_url=service_url,
        embedding=embedding,
        time_partition_interval=timedelta(days=30),
    )

    # Delete old embeddings for all ids in the work queue. locked_id is a special
    # column that is set to the primary key of the table being embedded.
    # For items that are deleted, it is the only key that is set.
    metadata_for_delete = [{"blog_id": blog['locked_id']} for blog in blog_instances]
    vector_store.delete_by_metadata(metadata_for_delete)

    documents = []
    for blog in blog_instances:
        # Skip blogs that are not published yet, or are deleted (in which case published_time will be NULL).
        if blog['published_time'] is not None:
            documents.extend(get_document(blog))

    if len(documents) == 0:
        return

    texts = [d.page_content for d in documents]
    metadatas = [d.metadata for d in documents]
    ids = [d.metadata["id"] for d in documents]
    vector_store.add_texts(texts, metadatas, ids)
```

Then, all you have to do is run the following code in a scheduled job
(cron job, Lambda job, etc.):

``` python
# This job should be run on a schedule.
vectorizer = pgvectorizer.Vectorize(service_url, 'blog')
while vectorizer.process(embed_and_write) > 0:
    pass
```

Every time that job runs, it will sync the table with your embeddings.
It will sync all inserts, updates, and deletes to an embeddings table
called `blog_embedding`.

Now, you can simply search the embeddings as follows (again, using
LangChain in the example):

``` python
embedding = OpenAIEmbeddings()
vector_store = TimescaleVector(
    collection_name="blog_embedding",
    service_url=service_url,
    embedding=embedding,
    time_partition_interval=timedelta(days=30),
)

res = vector_store.similarity_search_with_score("Blogs about cats")
res
```

    [(Document(metadata={'id': '334e4800-4bee-11eb-a52a-57b3c4a96ccb', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00-05:00'}, page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.'),
      0.12680577303752072)]

## Development

This project is developed with [nbdev](https://nbdev.fast.ai/). Please
see that website for the development process.
--------------------------------------------------------------------------------