├── tests ├── __init__.py ├── utils.py ├── conftest.py ├── pg_vectorizer_test.py ├── sync_client_test.py └── async_client_test.py ├── timescale_vector ├── typings │ ├── __init__.py │ ├── pgvector.pyi │ ├── psycopg2 │ │ ├── __init__.pyi │ │ ├── extras.pyi │ │ ├── pool.pyi │ │ └── extensions.pyi │ ├── langchain │ │ └── docstore │ │ │ └── document.pyi │ ├── asyncpg │ │ ├── pool.pyi │ │ ├── connection.pyi │ │ └── __init__.pyi │ ├── langchain_community │ │ └── vectorstores │ │ │ └── timescalevector.pyi │ └── vcr.pyi ├── __init__.py ├── client │ ├── __init__.py │ ├── utils.py │ ├── uuid_time_range.py │ ├── index.py │ ├── predicates.py │ ├── async_client.py │ ├── sync_client.py │ └── query_builder.py └── pgvectorizer.py ├── nbs ├── requirements.txt ├── sidebar.yml ├── nbdev.yml ├── _quarto.yml ├── styles.css └── 01_pgvectorizer.ipynb ├── MANIFEST.in ├── .github └── workflows │ ├── deploy.yaml │ ├── pyright.yaml │ ├── ruff.yaml │ └── test.yaml ├── docker-compose.yaml ├── CHANGELOG.md ├── NOTICE ├── Untitled.ipynb ├── .gitignore ├── pyproject.toml ├── LICENSE └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /timescale_vector/typings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /timescale_vector/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.9" 2 | -------------------------------------------------------------------------------- /nbs/requirements.txt: -------------------------------------------------------------------------------- 1 | python-dotenv 2 | asyncpg 3 | psycopg2 4 | pgvector 5 | numpy -------------------------------------------------------------------------------- /timescale_vector/typings/pgvector.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | def register_vector(conn_or_curs: Any) -> None: ... 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include settings.ini 2 | include LICENSE 3 | include CONTRIBUTING.md 4 | include README.md 5 | recursive-exclude * __pycache__ 6 | -------------------------------------------------------------------------------- /nbs/sidebar.yml: -------------------------------------------------------------------------------- 1 | website: 2 | sidebar: 3 | contents: 4 | - index.ipynb 5 | - 00_vector.ipynb 6 | - tsv_python_getting_started_tutorial.ipynb 7 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, TypeVar 2 | 3 | from psycopg2.extensions import connection 4 | 5 | T = TypeVar("T") 6 | 7 | def connect(dsn: str = "", **kwargs: Any) -> connection: ... 8 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/extras.pyi: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | from psycopg2.extensions import cursor 4 | 5 | class DictCursor(cursor, Protocol): 6 | def __init__(self) -> None: ... 
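# Usage sketch, not part of the stub (the DSN below is a placeholder): this cursor
# factory is combined with the connect() stub above the same way pgvectorizer.py
# does further down in this repo:
#
#   conn = psycopg2.connect("postgres://postgres:postgres@localhost:5432/postgres")
#   with conn.cursor(cursor_factory=DictCursor) as cursor:
#       cursor.execute("SELECT 1")
#       rows = cursor.fetchall()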
7 | 8 | def register_uuid(oids: int | None = None, conn_or_curs: cursor | None = None) -> None: ... 9 | -------------------------------------------------------------------------------- /nbs/nbdev.yml: -------------------------------------------------------------------------------- 1 | project: 2 | output-dir: _docs 3 | 4 | website: 5 | title: "timescale-vector" 6 | site-url: "https://timescale.github.io/python-vector" 7 | description: "Python library for storing vector data in Postgres" 8 | repo-branch: main 9 | repo-url: "https://github.com/timescale/python-vector" 10 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy to GitHub Pages 2 | 3 | permissions: 4 | contents: write 5 | pages: write 6 | 7 | on: 8 | push: 9 | branches: [ "main", "master" ] 10 | workflow_dispatch: 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | steps: [uses: fastai/workflows/quarto-ghp@master] 15 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | db: 3 | image: timescale/timescaledb-ha:pg16 4 | ports: 5 | - "5432:5432" 6 | environment: 7 | - POSTGRES_PASSWORD=postgres 8 | - POSTGRES_USER=postgres 9 | - POSTGRES_DB=postgres 10 | - TIMESCALEDB_TELEMETRY=off 11 | volumes: 12 | - ./data:/var/lib/postgresql/data -------------------------------------------------------------------------------- /nbs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | 4 | format: 5 | html: 6 | theme: cosmo 7 | css: styles.css 8 | toc: true 9 | 10 | website: 11 | twitter-card: true 12 | open-graph: true 13 | repo-actions: [issue] 14 | navbar: 15 | background: primary 16 | search: true 17 | sidebar: 18 | style: floating 19 | 20 | metadata-files: [nbdev.yml, sidebar.yml] -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Release notes 2 | 3 | 4 | 5 | ## 0.0.5 6 | 7 | Added contains operator to support metadata array values. 8 | 9 | ## 0.0.4 10 | 11 | Various usability improvements. 12 | 13 | ## 0.0.3 14 | 15 | Add PgVectorizer 16 | Add ability to have predicates on uuid timestamp 17 | 18 | ## 0.0.2 19 | 20 | Add ability to infer start and end date from filters 21 | 22 | 23 | ## 0.0.1 24 | 25 | First Release! 26 | 27 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/pool.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Hashable 2 | from typing import Any 3 | 4 | from psycopg2.extensions import connection 5 | 6 | class SimpleConnectionPool: 7 | def __init__(self, minconn: int, maxconn: int, dsn: str, **kwargs: Any) -> None: ... 8 | def getconn(self, key: Hashable | None = None) -> connection: ... 9 | def putconn(self, conn: connection, key: Hashable | None = None, close: bool = False) -> None: ... 10 | def closeall(self) -> None: ... 
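# Usage sketch, not part of the stub (the DSN below is a placeholder): connections
# are checked out with getconn() and handed back with putconn():
#
#   pool = SimpleConnectionPool(1, 5, "postgres://postgres:postgres@localhost:5432/postgres")
#   conn = pool.getconn()
#   try:
#       with conn.cursor() as cursor:
#           cursor.execute("SELECT 1")
#   finally:
#       pool.putconn(conn)
#   pool.closeall()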
11 | -------------------------------------------------------------------------------- /.github/workflows/pyright.yaml: -------------------------------------------------------------------------------- 1 | name: Type Checking 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | pyright: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install uv 17 | run: pip install uv 18 | - name: Create venv 19 | run: uv venv 20 | - name: Install dependencies 21 | run: | 22 | uv sync 23 | - name: Run Pyright 24 | run: uv run pyright -------------------------------------------------------------------------------- /.github/workflows/ruff.yaml: -------------------------------------------------------------------------------- 1 | name: Ruff Linting and Formatting 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | ruff: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install uv 17 | run: pip install uv 18 | - name: Create venv 19 | run: uv venv 20 | - name: Install dependencies 21 | run: | 22 | uv sync 23 | - name: Run Ruff linter 24 | run: uv run ruff check . 25 | - name: Run Ruff formatter 26 | run: uv run ruff format . --check -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Python client libraries for TimescaleDB (TM) Vector 2 | 3 | Copyright (c) 2018-2022 Timescale, Inc. All Rights Reserved. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
-------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | 4 | import vcr 5 | 6 | vcr_cassette_path = os.path.join(os.path.dirname(__file__), "vcr_cassettes") 7 | 8 | 9 | def remove_set_cookie_header(response: dict[str, Any]): 10 | headers = response["headers"] 11 | headers_to_remove = ["set-cookie", "Set-Cookie"] 12 | 13 | for header in headers_to_remove: 14 | if header in headers: 15 | del headers[header] 16 | 17 | return response 18 | 19 | 20 | http_recorder = vcr.VCR( 21 | cassette_library_dir=vcr_cassette_path, 22 | record_mode="once", 23 | filter_headers=["authorization", "cookie"], 24 | before_record_response=remove_set_cookie_header, 25 | ) 26 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | pytest: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: '3.10' 16 | - name: Install uv 17 | run: pip install uv 18 | - name: Create venv 19 | run: uv venv 20 | - name: Install dependencies 21 | run: | 22 | uv sync 23 | - name: Start docker-compose 24 | run: docker compose up -d 25 | - name: Run Test 26 | run: uv run pytest 27 | - name: Logs 28 | run: docker compose logs 29 | - name: Stop docker-compose 30 | run: docker compose down -------------------------------------------------------------------------------- /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "d02cf1aa-4cf0-4656-a2f8-100f39233f37", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [] 10 | } 11 | ], 12 | "metadata": { 13 | "kernelspec": { 14 | "display_name": "Python 3 (ipykernel)", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython3", 28 | "version": "3.10.12" 29 | } 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 5 33 | } 34 | -------------------------------------------------------------------------------- /timescale_vector/typings/langchain/docstore/document.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, TypeVar 2 | 3 | from typing_extensions import TypedDict 4 | 5 | class Metadata(TypedDict, total=False): 6 | id: str 7 | blog_id: str 8 | author: str 9 | category: str 10 | published_time: str 11 | 12 | T = TypeVar("T") 13 | 14 | class Document: 15 | """Documents are the basic unit of text in LangChain.""" 16 | 17 | page_content: str 18 | metadata: dict[str, Any] 19 | 20 | def __init__( 21 | self, 22 | page_content: str, 23 | metadata: dict[str, Any] | None = None, 24 | ) -> None: ... 25 | @property 26 | def lc_kwargs(self) -> dict[str, Any]: ... 27 | @classmethod 28 | def is_lc_serializable(cls) -> bool: ... 
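# Usage sketch, not part of the stub (values are illustrative): pg_vectorizer_test.py
# further down builds Documents whose metadata uses the keys declared in the Metadata
# TypedDict above:
#
#   doc = Document(
#       page_content="Author mat, title: first, contents: first_post",
#       metadata={"blog_id": 1, "author": "mat", "category": "personal"},
#   )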
29 | -------------------------------------------------------------------------------- /nbs/styles.css: -------------------------------------------------------------------------------- 1 | .cell { 2 | margin-bottom: 1rem; 3 | } 4 | 5 | .cell > .sourceCode { 6 | margin-bottom: 0; 7 | } 8 | 9 | .cell-output > pre { 10 | margin-bottom: 0; 11 | } 12 | 13 | .cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { 14 | margin-left: 0.8rem; 15 | margin-top: 0; 16 | background: none; 17 | border-left: 2px solid lightsalmon; 18 | border-top-left-radius: 0; 19 | border-top-right-radius: 0; 20 | } 21 | 22 | .cell-output > .sourceCode { 23 | border: none; 24 | } 25 | 26 | .cell-output > .sourceCode { 27 | background: none; 28 | margin-top: 0; 29 | } 30 | 31 | div.description { 32 | padding-left: 2px; 33 | padding-top: 5px; 34 | font-style: italic; 35 | font-size: 135%; 36 | opacity: 70%; 37 | } 38 | -------------------------------------------------------------------------------- /timescale_vector/typings/asyncpg/pool.pyi: -------------------------------------------------------------------------------- 1 | from contextlib import AbstractAsyncContextManager 2 | from typing import Any 3 | 4 | from . import connection 5 | 6 | class Pool: 7 | def __init__(self) -> None: ... 8 | def acquire(self, *, timeout: float | None = None) -> PoolAcquireContext: ... 9 | def release(self, connection: connection.Connection) -> None: ... 10 | async def close(self) -> None: ... 11 | def terminate(self) -> None: ... 12 | 13 | # Context manager support 14 | async def __aenter__(self) -> Pool: ... 15 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 16 | 17 | class PoolAcquireContext(AbstractAsyncContextManager["connection.Connection"]): 18 | async def __aenter__(self) -> connection.Connection: ... 19 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 20 | -------------------------------------------------------------------------------- /timescale_vector/typings/psycopg2/extensions.pyi: -------------------------------------------------------------------------------- 1 | from typing import Any, Protocol 2 | 3 | class cursor(Protocol): 4 | def execute(self, query: str, vars: Any | None = None) -> Any: ... 5 | def executemany(self, query: str, vars_list: list[Any]) -> Any: ... 6 | def fetchone(self) -> tuple[Any, ...] | None: ... 7 | def fetchall(self) -> list[tuple[Any, ...]]: ... 8 | def __enter__(self) -> cursor: ... 9 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 10 | 11 | class connection(Protocol): 12 | def cursor(self, cursor_factory: Any | None = None) -> cursor: ... 13 | def commit(self) -> None: ... 14 | def close(self) -> None: ... 15 | def __enter__(self) -> connection: ... 16 | def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 17 | 18 | def register_uuid(oids: Any | None = None, conn_or_curs: Any | None = None) -> None: ... 
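# Usage sketch, not part of the stub (the DSN below is a placeholder): the connection
# and cursor protocols above are exercised the same way conftest.py further down uses
# the real psycopg2 objects:
#
#   conn = psycopg2.connect("postgres://postgres:postgres@localhost:5432/postgres")
#   with conn.cursor() as cursor:
#       cursor.execute("CREATE SCHEMA IF NOT EXISTS temp;")
#   conn.commit()
#   conn.close()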
19 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import psycopg2 4 | import pytest 5 | 6 | # from dotenv import find_dotenv, load_dotenv 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def setup_env_variables() -> None: 11 | os.environ.clear() 12 | os.environ["TIMESCALE_SERVICE_URL"] = "postgres://postgres:postgres@localhost:5432/postgres" 13 | os.environ["OPENAI_API_KEY"] = "fake key" 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def service_url(setup_env_variables: None) -> str: # noqa: ARG001 18 | # _ = load_dotenv(find_dotenv(), override=True) 19 | return os.environ["TIMESCALE_SERVICE_URL"] 20 | 21 | 22 | @pytest.fixture(scope="module", autouse=True) 23 | def setup_db(service_url: str) -> None: 24 | conn = psycopg2.connect(service_url) 25 | with conn.cursor() as cursor: 26 | cursor.execute("CREATE EXTENSION IF NOT EXISTS ai CASCADE;") 27 | cursor.execute("CREATE SCHEMA IF NOT EXISTS temp;") 28 | conn.commit() 29 | conn.close() 30 | -------------------------------------------------------------------------------- /timescale_vector/typings/langchain_community/vectorstores/timescalevector.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from datetime import timedelta 3 | from typing import Any 4 | 5 | from langchain.docstore.document import Document 6 | from langchain.schema.embeddings import Embeddings 7 | 8 | class TimescaleVector: 9 | def __init__( 10 | self, 11 | collection_name: str, 12 | service_url: str, 13 | embedding: Embeddings, 14 | time_partition_interval: timedelta | None = None, 15 | ) -> None: ... 16 | def add_texts( 17 | self, 18 | texts: Sequence[str], 19 | metadatas: list[dict[str, Any]] | None = None, 20 | ids: list[str] | None = None, 21 | **kwargs: Any, 22 | ) -> list[str]: ... 23 | def delete_by_metadata( 24 | self, 25 | metadata_filter: dict[str, Any] | list[dict[str, Any]], 26 | ) -> None: ... 27 | def similarity_search_with_score( 28 | self, 29 | query: str, 30 | k: int = 4, 31 | filter: dict[str, Any] | list[dict[str, Any]] | None = None, 32 | predicates: Any | None = None, 33 | **kwargs: Any, 34 | ) -> list[tuple[Document, float]]: ... 
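# Usage sketch, not part of the stub: how pg_vectorizer_test.py further down drives
# this interface. The service URL is a placeholder, published_time is a placeholder
# datetime, and OpenAIEmbeddings (used in the tests) is only one possible Embeddings
# implementation:
#
#   store = TimescaleVector(
#       collection_name="blog_embedding",
#       service_url="postgres://postgres:postgres@localhost:5432/postgres",
#       embedding=OpenAIEmbeddings(),
#       time_partition_interval=timedelta(days=30),
#   )
#   store.add_texts(
#       ["first_post"],
#       metadatas=[{"blog_id": 1, "author": "mat"}],
#       ids=[str(client.uuid_from_time(published_time))],  # time-based UUID strings
#   )
#   store.similarity_search_with_score("first", 10)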
35 | -------------------------------------------------------------------------------- /timescale_vector/client/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "SEARCH_RESULT_ID_IDX", 3 | "SEARCH_RESULT_METADATA_IDX", 4 | "SEARCH_RESULT_CONTENTS_IDX", 5 | "SEARCH_RESULT_EMBEDDING_IDX", 6 | "SEARCH_RESULT_DISTANCE_IDX", 7 | "uuid_from_time", 8 | "BaseIndex", 9 | "IvfflatIndex", 10 | "HNSWIndex", 11 | "DiskAnnIndex", 12 | "QueryParams", 13 | "DiskAnnIndexParams", 14 | "IvfflatIndexParams", 15 | "HNSWIndexParams", 16 | "UUIDTimeRange", 17 | "Predicates", 18 | "QueryBuilder", 19 | "Async", 20 | "Sync", 21 | ] 22 | 23 | from timescale_vector.client.async_client import Async 24 | from timescale_vector.client.index import ( 25 | BaseIndex, 26 | DiskAnnIndex, 27 | DiskAnnIndexParams, 28 | HNSWIndex, 29 | HNSWIndexParams, 30 | IvfflatIndex, 31 | IvfflatIndexParams, 32 | QueryParams, 33 | ) 34 | from timescale_vector.client.predicates import Predicates 35 | from timescale_vector.client.query_builder import QueryBuilder 36 | from timescale_vector.client.sync_client import Sync 37 | from timescale_vector.client.utils import uuid_from_time 38 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 39 | 40 | SEARCH_RESULT_ID_IDX = 0 41 | SEARCH_RESULT_METADATA_IDX = 1 42 | SEARCH_RESULT_CONTENTS_IDX = 2 43 | SEARCH_RESULT_EMBEDDING_IDX = 3 44 | SEARCH_RESULT_DISTANCE_IDX = 4 45 | -------------------------------------------------------------------------------- /timescale_vector/typings/asyncpg/connection.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Any 3 | 4 | from . import Record 5 | 6 | class Connection: 7 | # Transaction management 8 | async def execute(self, query: str, *args: Any, timeout: float | None = None) -> str: ... 9 | async def executemany( 10 | self, command: str, args: Sequence[Sequence[Any]], *, timeout: float | None = None 11 | ) -> str: ... 12 | async def fetch(self, query: str, *args: Any, timeout: float | None = None) -> list[Record]: ... 13 | async def fetchval(self, query: str, *args: Any, column: int = 0, timeout: float | None = None) -> Any: ... 14 | async def fetchrow(self, query: str, *args: Any, timeout: float | None = None) -> Record | None: ... 15 | async def set_type_codec( 16 | self, typename: str, *, schema: str = "public", encoder: Any, decoder: Any, format: str = "text" 17 | ) -> None: ... 18 | 19 | # Transaction context 20 | def transaction(self, *, isolation: str = "read_committed") -> Transaction: ... 21 | async def close(self, *, timeout: float | None = None) -> None: ... 22 | 23 | class Transaction: 24 | async def __aenter__(self) -> Transaction: ... 25 | async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: ... 26 | async def start(self) -> None: ... 27 | async def commit(self) -> None: ... 28 | async def rollback(self) -> None: ... 29 | -------------------------------------------------------------------------------- /timescale_vector/typings/asyncpg/__init__.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Any, Protocol, TypeVar 3 | 4 | from . import connection, pool 5 | 6 | # Core types 7 | T = TypeVar("T") 8 | 9 | class Record(Protocol): 10 | def __getitem__(self, key: int | str) -> Any: ... 11 | def __iter__(self) -> Any: ... 12 | def __len__(self) -> int: ... 
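    # Note (assumption inferred from the constant names): rows returned by this
    # library's search APIs are Records like this one, indexed either by position
    # with the SEARCH_RESULT_*_IDX constants from timescale_vector.client
    # (e.g. row[SEARCH_RESULT_ID_IDX]) or by column name (e.g. row["id"]).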
13 | def get(self, key: str, default: T = None) -> T | None: ... 14 | def keys(self) -> Sequence[str]: ... 15 | def values(self) -> Sequence[Any]: ... 16 | def items(self) -> Sequence[tuple[str, Any]]: ... 17 | 18 | # Allow dictionary-style access to fields 19 | def __getattr__(self, name: str) -> Any: ... 20 | 21 | # Re-exports 22 | Connection = connection.Connection 23 | Pool = pool.Pool 24 | Record = Record 25 | 26 | # Functions 27 | async def connect( 28 | dsn: str | None = None, 29 | *, 30 | host: str | None = None, 31 | port: int | None = None, 32 | user: str | None = None, 33 | password: str | None = None, 34 | database: str | None = None, 35 | timeout: int = 60, 36 | ) -> Connection: ... 37 | async def create_pool( 38 | dsn: str | None = None, 39 | *, 40 | min_size: int = 10, 41 | max_size: int = 10, 42 | max_queries: int = 50000, 43 | max_inactive_connection_lifetime: float = 300.0, 44 | setup: Any | None = None, 45 | init: Any | None = None, 46 | **connect_kwargs: Any, 47 | ) -> Pool: ... 48 | -------------------------------------------------------------------------------- /timescale_vector/typings/vcr.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import Any, Literal, Protocol, TypeAlias, TypeVar, overload 3 | 4 | _T = TypeVar("_T") 5 | _F = TypeVar("_F", bound=Callable[..., Any]) 6 | 7 | class VCRConfig(Protocol): 8 | filter_headers: list[str] 9 | ignore_localhost: bool 10 | ignore_hosts: list[str] 11 | record_mode: Literal["once", "new_episodes", "none", "all"] 12 | match_on: list[str] 13 | 14 | class _Cassette: 15 | def __init__(self, path: str) -> None: ... 16 | def play_response(self, request: Any) -> Any: ... 17 | def append(self, request: Any, response: Any) -> None: ... 18 | def responses_of(self, request: Any) -> list[Any]: ... 19 | 20 | class VCR: 21 | def __init__(self, **kwargs: Any) -> None: ... 22 | @overload 23 | def use_cassette(self, path: str) -> Callable[[_F], _F]: ... 24 | @overload 25 | def use_cassette(self, path: str, **kwargs: Any) -> Callable[[_F], _F]: ... 26 | def record_mode(self) -> str: ... 27 | def turn_off(self, *, allow_playback: bool = ...) -> None: ... 28 | def turn_on(self) -> None: ... 29 | def serialize(self) -> dict[str, Any]: ... 30 | 31 | @overload 32 | def use_cassette(path: str) -> Callable[[_F], _F]: ... 33 | @overload 34 | def use_cassette(path: str, **kwargs: Any) -> Callable[[_F], _F]: ... 35 | def use_cassette(path: str, **kwargs: Any) -> _Cassette: ... 36 | 37 | default_vcr: VCR 38 | 39 | class VCRError(Exception): ... 40 | class CannotOverwriteExistingCassetteException(VCRError): ... 41 | class UnhandledHTTPRequestError(VCRError): ... 42 | 43 | # Common kwargs for reference (these aren't actually part of the type system) 44 | COMMON_KWARGS: TypeAlias = Literal[ 45 | "record_mode", # : Literal["once", "new_episodes", "none", "all"] 46 | "match_on", # : list[str] - e.g. 
["uri", "method", "body"] 47 | "filter_headers", # : list[str] - headers to filter out 48 | "before_record_response", # : Callable[[Any], Any] 49 | "before_record_request", # : Callable[[Any], Any] 50 | "ignore_localhost", # : bool 51 | "ignore_hosts", # : list[str] 52 | ] 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _docs/ 2 | _proc/ 3 | 4 | *.bak 5 | .gitattributes 6 | .last_checked 7 | .gitconfig 8 | *.bak 9 | *.log 10 | *~ 11 | ~* 12 | _tmp* 13 | tmp* 14 | tags 15 | *.pkg 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | env/ 28 | build/ 29 | develop-eggs/ 30 | dist/ 31 | downloads/ 32 | eggs/ 33 | .eggs/ 34 | lib/ 35 | lib64/ 36 | parts/ 37 | sdist/ 38 | var/ 39 | wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # celery beat schedule file 93 | celerybeat-schedule 94 | 95 | # SageMath parsed files 96 | *.sage.py 97 | 98 | # dotenv 99 | .env 100 | 101 | # virtualenv 102 | .venv 103 | venv/ 104 | ENV/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | 119 | .vscode 120 | *.swp 121 | 122 | # osx generated files 123 | .DS_Store 124 | .DS_Store? 
125 | .Trashes 126 | ehthumbs.db 127 | Thumbs.db 128 | .idea 129 | 130 | # pytest 131 | .pytest_cache 132 | 133 | # tools/trust-doc-nbs 134 | docs_src/.last_checked 135 | 136 | # symlinks to fastai 137 | docs_src/fastai 138 | tools/fastai 139 | 140 | # link checker 141 | checklink/cookies.txt 142 | 143 | # .gitconfig is now autogenerated 144 | .gitconfig 145 | 146 | # Quarto installer 147 | .deb 148 | .pkg 149 | 150 | # Quarto 151 | .quarto 152 | token 153 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "timescale-vector" 3 | version = "0.0.9" 4 | description = "Python library for storing vector data in Postgres" 5 | authors = [ 6 | {name = "Matvey Arye", email = "mat@timescale.com"}, 7 | ] 8 | requires-python = ">=3.10" 9 | license = {text = "Apache-2.0"} 10 | readme = "README.md" 11 | classifiers = [ 12 | "Development Status :: 3 - Alpha", 13 | "Intended Audience :: Developers", 14 | "License :: OSI Approved :: Apache Software License", 15 | "Programming Language :: Python :: 3.10", 16 | ] 17 | 18 | dependencies = [ 19 | "python-dotenv>=1.0.1", 20 | "asyncpg>=0.29.0", 21 | "psycopg2>=2.9.9", 22 | "pgvector>=0.3.5", 23 | "numpy>=1,<2", 24 | ] 25 | 26 | [project.urls] 27 | repository = "https://github.com/timescale/python-vector" 28 | documentation = "https://timescale.github.io/python-vector" 29 | 30 | [build-system] 31 | requires = ["hatchling"] 32 | build-backend = "hatchling.build" 33 | 34 | [tool.hatch.build.targets.wheel] 35 | packages = ["timescale_vector"] 36 | 37 | [tool.pytest.ini_options] 38 | addopts = [ 39 | "--import-mode=importlib", 40 | ] 41 | 42 | 43 | [tool.mypy] 44 | strict = true 45 | ignore_missing_imports = true 46 | namespace_packages = true 47 | 48 | [tool.pyright] 49 | typeCheckingMode = "strict" 50 | stubPath = "timescale_vector/typings" 51 | 52 | [tool.ruff] 53 | line-length = 120 54 | indent-width = 4 55 | output-format = "grouped" 56 | target-version = "py310" 57 | 58 | exclude = [ 59 | ".bzr", 60 | ".direnv", 61 | ".eggs", 62 | ".git", 63 | ".git-rewrite", 64 | ".hg", 65 | ".ipynb_checkpoints", 66 | ".mypy_cache", 67 | ".nox", 68 | ".pants.d", 69 | ".pyenv", 70 | ".pytest_cache", 71 | ".pytype", 72 | ".ruff_cache", 73 | ".svn", 74 | ".tox", 75 | ".venv", 76 | ".vscode", 77 | "__pypackages__", 78 | "_build", 79 | "buck-out", 80 | "build", 81 | "dist", 82 | "node_modules", 83 | "site-packages", 84 | "venv", 85 | "nbs" 86 | ] 87 | 88 | [tool.ruff.format] 89 | docstring-code-format = true 90 | quote-style = "double" 91 | indent-style = "space" 92 | skip-magic-trailing-comma = false 93 | line-ending = "auto" 94 | 95 | [tool.ruff.lint] 96 | select = [ 97 | "E", 98 | "F", 99 | "UP", 100 | "B", 101 | "SIM", 102 | "I", 103 | "ARG", 104 | "W291", 105 | "PIE", 106 | "Q" 107 | ] 108 | 109 | [tool.uv] 110 | dev-dependencies = [ 111 | "ruff>=0.6.9", 112 | "pytest>=8.3.3", 113 | "langchain>=0.3.3", 114 | "langchain-openai>=0.2.2", 115 | "langchain-community>=0.3.2", 116 | "pandas>=2.2.3", 117 | "pytest-asyncio>=0.24.0", 118 | "pyright>=1.1.386", 119 | "vcrpy>=6.0.2", 120 | ] 121 | -------------------------------------------------------------------------------- /timescale_vector/client/utils.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | import random 3 | import uuid 4 | from datetime import datetime, timezone 5 | from typing import Any 6 | 7 | 8 | # copied 
from Cassandra: https://docs.datastax.com/en/drivers/python/3.2/_modules/cassandra/util.html#uuid_from_time 9 | def uuid_from_time( 10 | time_arg: float | datetime | None = None, node: Any = None, clock_seq: int | None = None 11 | ) -> uuid.UUID: 12 | """ 13 | Converts a datetime or timestamp to a type 1 `uuid.UUID`. 14 | 15 | Parameters 16 | ---------- 17 | time_arg 18 | The time to use for the timestamp portion of the UUID. 19 | This can either be a `datetime` object or a timestamp in seconds 20 | (as returned from `time.time()`). 21 | node 22 | Bytes for the UUID (up to 48 bits). If not specified, this 23 | field is randomized. 24 | clock_seq 25 | Clock sequence field for the UUID (up to 14 bits). If not specified, 26 | a random sequence is generated. 27 | 28 | Returns 29 | ------- 30 | uuid.UUID: For the given time, node, and clock sequence 31 | """ 32 | if time_arg is None: 33 | return uuid.uuid1(node, clock_seq) 34 | if isinstance(time_arg, datetime): 35 | # this is different from the Cassandra version, 36 | # we assume that a naive datetime is in system time and convert it to UTC 37 | # we do this because naive datetimes are interpreted as timestamps (without timezone) in postgres 38 | time_arg_dt: datetime = time_arg # type: ignore 39 | if time_arg_dt.tzinfo is None: 40 | time_arg_dt = time_arg_dt.astimezone(timezone.utc) 41 | seconds = int(calendar.timegm(time_arg_dt.utctimetuple())) 42 | microseconds = (seconds * 1e6) + time_arg_dt.time().microsecond 43 | else: 44 | microseconds = int(float(time_arg) * 1e6) 45 | 46 | # 0x01b21dd213814000 is the number of 100-ns intervals between the 47 | # UUID epoch 1582-10-15 00:00:00 and the Unix epoch 1970-01-01 00:00:00. 48 | intervals = int(microseconds * 10) + 0x01B21DD213814000 49 | 50 | time_low = intervals & 0xFFFFFFFF 51 | time_mid = (intervals >> 32) & 0xFFFF 52 | time_hi_version = (intervals >> 48) & 0x0FFF 53 | 54 | if clock_seq is None: 55 | clock_seq = random.getrandbits(14) 56 | else: 57 | if clock_seq > 0x3FFF: 58 | raise ValueError("clock_seq is out of range (need a 14-bit value)") 59 | 60 | clock_seq_low = clock_seq & 0xFF 61 | clock_seq_hi_variant = 0x80 | ((clock_seq >> 8) & 0x3F) 62 | 63 | if node is None: 64 | node = random.getrandbits(48) 65 | 66 | return uuid.UUID( 67 | fields=( 68 | time_low, 69 | time_mid, 70 | time_hi_version, 71 | clock_seq_hi_variant, 72 | clock_seq_low, 73 | node, 74 | ), 75 | version=1, 76 | ) 77 | -------------------------------------------------------------------------------- /timescale_vector/client/uuid_time_range.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta, timezone 2 | from typing import Any 3 | 4 | 5 | class UUIDTimeRange: 6 | @staticmethod 7 | def _parse_datetime(input_datetime: datetime | str | None | Any) -> datetime | None: 8 | """ 9 | Parse a datetime object or string representation of a datetime. 10 | 11 | Args: 12 | input_datetime (datetime or str): Input datetime or string. 13 | 14 | Returns: 15 | datetime: Parsed datetime object. 16 | 17 | Raises: 18 | ValueError: If the input cannot be parsed as a datetime. 
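        Example:
            >>> UUIDTimeRange._parse_datetime("2021-01-01T00:00:00")
            datetime.datetime(2021, 1, 1, 0, 0)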
19 | """ 20 | if input_datetime is None or input_datetime == "None": 21 | return None 22 | 23 | if isinstance(input_datetime, datetime): 24 | # If input is already a datetime object, return it as is 25 | return input_datetime 26 | 27 | if isinstance(input_datetime, str): 28 | try: 29 | # Attempt to parse the input string into a datetime 30 | return datetime.fromisoformat(input_datetime) 31 | except ValueError: 32 | raise ValueError(f"Invalid datetime string format: {input_datetime}") from None 33 | 34 | raise ValueError("Input must be a datetime object or string") 35 | 36 | def __init__( 37 | self, 38 | start_date: datetime | str | None = None, 39 | end_date: datetime | str | None = None, 40 | time_delta: timedelta | None = None, 41 | start_inclusive: bool = True, 42 | end_inclusive: bool = False, 43 | ): 44 | """ 45 | A UUIDTimeRange is a time range predicate on the UUID Version 1 timestamps. 46 | 47 | Note that naive datetime objects are interpreted as local time on the python client side 48 | and converted to UTC before being sent to the database. 49 | """ 50 | start_date = UUIDTimeRange._parse_datetime(start_date) 51 | end_date = UUIDTimeRange._parse_datetime(end_date) 52 | 53 | if start_date is not None and end_date is not None and start_date > end_date: 54 | raise Exception("start_date must be before end_date") 55 | 56 | if start_date is None and end_date is None: 57 | raise Exception("start_date and end_date cannot both be None") 58 | 59 | if start_date is not None and start_date.tzinfo is None: 60 | start_date = start_date.astimezone(timezone.utc) 61 | 62 | if end_date is not None and end_date.tzinfo is None: 63 | end_date = end_date.astimezone(timezone.utc) 64 | 65 | if time_delta is not None: 66 | if end_date is None and start_date is not None: 67 | end_date = start_date + time_delta 68 | elif start_date is None and end_date is not None: 69 | start_date = end_date - time_delta 70 | else: 71 | raise Exception("time_delta, start_date and end_date cannot all be specified at the same time") 72 | 73 | self.start_date: datetime | None = start_date 74 | self.end_date: datetime | None = end_date 75 | self.start_inclusive: bool = start_inclusive 76 | self.end_inclusive: bool = end_inclusive 77 | 78 | def __str__(self) -> str: 79 | start_str = f"[{self.start_date}" if self.start_inclusive else f"({self.start_date}" 80 | end_str = f"{self.end_date}]" if self.end_inclusive else f"{self.end_date})" 81 | 82 | return f"UUIDTimeRange {start_str}, {end_str}" 83 | 84 | def build_query(self, params: list[Any]) -> tuple[str, list[Any]]: 85 | column = "uuid_timestamp(id)" 86 | queries: list[str] = [] 87 | if self.start_date is not None: 88 | if self.start_inclusive: 89 | queries.append(f"{column} >= ${len(params)+1}") 90 | else: 91 | queries.append(f"{column} > ${len(params)+1}") 92 | params.append(self.start_date) 93 | if self.end_date is not None: 94 | if self.end_inclusive: 95 | queries.append(f"{column} <= ${len(params)+1}") 96 | else: 97 | queries.append(f"{column} < ${len(params)+1}") 98 | params.append(self.end_date) 99 | return " AND ".join(queries), params 100 | -------------------------------------------------------------------------------- /timescale_vector/pgvectorizer.py: -------------------------------------------------------------------------------- 1 | # pyright: reportPrivateUsage=false 2 | __all__ = ["Vectorize"] 3 | 4 | import re 5 | from collections.abc import Callable 6 | from typing import Any 7 | 8 | import psycopg2.extras 9 | import psycopg2.pool 10 | 11 | from . 
import client 12 | 13 | 14 | def _create_ident(base: str, suffix: str) -> str: 15 | if len(base) + len(suffix) > 62: 16 | base = base[: 62 - len(suffix)] 17 | return re.sub(r"[^a-zA-Z0-9_]", "_", f"{base}_{suffix}") 18 | 19 | 20 | class Vectorize: 21 | def __init__( 22 | self, 23 | service_url: str, 24 | table_name: str, 25 | schema_name: str = "public", 26 | id_column_name: str = "id", 27 | work_queue_table_name: str | None = None, 28 | trigger_name: str = "track_changes_for_embedding", 29 | trigger_name_fn: str | None = None, 30 | ) -> None: 31 | self.service_url = service_url 32 | self.table_name_unquoted = table_name 33 | self.schema_name_unquoted = schema_name 34 | self.table_name = client.QueryBuilder._quote_ident(table_name) 35 | self.schema_name = client.QueryBuilder._quote_ident(schema_name) 36 | self.id_column_name = client.QueryBuilder._quote_ident(id_column_name) 37 | if work_queue_table_name is None: 38 | work_queue_table_name = _create_ident(table_name, "embedding_work_queue") 39 | self.work_queue_table_name = client.QueryBuilder._quote_ident(work_queue_table_name) 40 | 41 | self.trigger_name = client.QueryBuilder._quote_ident(trigger_name) 42 | 43 | if trigger_name_fn is None: 44 | trigger_name_fn = _create_ident(table_name, "wq_for_embedding") 45 | self.trigger_name_fn = client.QueryBuilder._quote_ident(trigger_name_fn) 46 | 47 | def register(self) -> None: 48 | with psycopg2.connect(self.service_url) as conn, conn.cursor() as cursor: 49 | cursor.execute(f""" 50 | SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}') is not null; 51 | """) 52 | table_exists = cursor.fetchone() 53 | if table_exists and table_exists[0]: 54 | return 55 | 56 | cursor.execute(f""" 57 | CREATE TABLE {self.schema_name}.{self.work_queue_table_name} ( 58 | id int 59 | ); 60 | 61 | CREATE INDEX ON {self.schema_name}.{self.work_queue_table_name}(id); 62 | 63 | CREATE OR REPLACE FUNCTION {self.schema_name}.{self.trigger_name_fn}() 64 | RETURNS TRIGGER LANGUAGE PLPGSQL AS $$ 65 | BEGIN 66 | IF (TG_OP = 'DELETE') THEN 67 | INSERT INTO {self.work_queue_table_name} 68 | VALUES (OLD.{self.id_column_name}); 69 | ELSE 70 | INSERT INTO {self.work_queue_table_name} 71 | VALUES (NEW.{self.id_column_name}); 72 | END IF; 73 | RETURN NULL; 74 | END; 75 | $$; 76 | 77 | CREATE TRIGGER {self.trigger_name} 78 | AFTER INSERT OR UPDATE OR DELETE 79 | ON {self.schema_name}.{self.table_name} 80 | FOR EACH ROW EXECUTE PROCEDURE {self.schema_name}.{self.trigger_name_fn}(); 81 | 82 | INSERT INTO {self.schema_name}.{self.work_queue_table_name} SELECT {self.id_column_name} 83 | FROM {self.schema_name}.{self.table_name}; 84 | """) 85 | 86 | def process( 87 | self, 88 | embed_and_write_cb: Callable[[list[Any], "Vectorize"], None], 89 | batch_size: int = 10, 90 | autoregister: bool = True, 91 | ) -> int: 92 | if autoregister: 93 | self.register() 94 | 95 | with ( 96 | psycopg2.connect(self.service_url) as conn, 97 | conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor, 98 | ): 99 | cursor.execute(f""" 100 | SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}')::oid; 101 | """) 102 | table_oid = cursor.fetchone() 103 | if table_oid is None: 104 | return 0 105 | 106 | cursor.execute(f""" 107 | WITH selected_rows AS ( 108 | SELECT id 109 | FROM {self.schema_name}.{self.work_queue_table_name} 110 | LIMIT {int(batch_size)} 111 | FOR UPDATE SKIP LOCKED 112 | ), 113 | locked_items AS ( 114 | SELECT id, pg_try_advisory_xact_lock({int(table_oid[0])}, id) AS locked 115 | FROM (SELECT DISTINCT id 
FROM selected_rows ORDER BY id) as ids 116 | ), 117 | deleted_rows AS ( 118 | DELETE FROM {self.schema_name}.{self.work_queue_table_name} 119 | WHERE id IN (SELECT id FROM locked_items WHERE locked = true ORDER BY id) 120 | ) 121 | SELECT locked_items.id as locked_id, {self.table_name}.* 122 | FROM locked_items 123 | LEFT JOIN {self.schema_name}.{self.table_name} 124 | ON {self.table_name}.{self.id_column_name} = locked_items.id 125 | WHERE locked = true 126 | ORDER BY locked_items.id 127 | """) 128 | res = cursor.fetchall() 129 | if len(res) > 0: 130 | embed_and_write_cb(res, self) 131 | return len(res) 132 | -------------------------------------------------------------------------------- /timescale_vector/client/index.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections.abc import Callable 3 | from typing import Any 4 | 5 | from typing_extensions import override 6 | 7 | 8 | class BaseIndex: 9 | def get_index_method(self, distance_type: str) -> str: 10 | index_method = "invalid" 11 | if distance_type == "<->": 12 | index_method = "vector_l2_ops" 13 | elif distance_type == "<#>": 14 | index_method = "vector_ip_ops" 15 | elif distance_type == "<=>": 16 | index_method = "vector_cosine_ops" 17 | else: 18 | raise ValueError(f"Unknown distance type {distance_type}") 19 | return index_method 20 | 21 | def create_index_query( 22 | self, 23 | table_name_quoted: str, 24 | column_name_quoted: str, 25 | index_name_quoted: str, 26 | distance_type: str, 27 | num_records_callback: Callable[[], int], 28 | ) -> str: 29 | raise NotImplementedError() 30 | 31 | 32 | class IvfflatIndex(BaseIndex): 33 | def __init__(self, num_records: int | None = None, num_lists: int | None = None) -> None: 34 | """ 35 | Pgvector's ivfflat index. 36 | """ 37 | self.num_records: int | None = num_records 38 | self.num_lists: int | None = num_lists 39 | 40 | def get_num_records(self, num_record_callback: Callable[[], int]) -> int: 41 | if self.num_records is not None: 42 | return self.num_records 43 | return num_record_callback() 44 | 45 | def get_num_lists(self, num_records_callback: Callable[[], int]) -> int: 46 | if self.num_lists is not None: 47 | return self.num_lists 48 | 49 | num_records = self.get_num_records(num_records_callback) 50 | num_lists = num_records / 1000 51 | if num_lists < 10: 52 | num_lists = 10 53 | if num_records > 1000000: 54 | num_lists = math.sqrt(num_records) 55 | return int(num_lists) 56 | 57 | def create_index_query( 58 | self, 59 | table_name_quoted: str, 60 | column_name_quoted: str, 61 | index_name_quoted: str, 62 | distance_type: str, 63 | num_records_callback: Callable[[], int], 64 | ) -> str: 65 | index_method = self.get_index_method(distance_type) 66 | num_lists = self.get_num_lists(num_records_callback) 67 | 68 | return ( 69 | f"CREATE INDEX {index_name_quoted} ON {table_name_quoted}" 70 | f"USING ivfflat ({column_name_quoted} {index_method}) WITH (lists = {num_lists});" 71 | ) 72 | 73 | 74 | class HNSWIndex(BaseIndex): 75 | def __init__(self, m: int | None = None, ef_construction: int | None = None) -> None: 76 | """ 77 | Pgvector's hnsw index. 
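        `m` (maximum connections per graph layer) and `ef_construction` (size of the
        candidate list used while building the graph) map onto pgvector's hnsw build
        options; any parameter left as None is simply omitted from the WITH clause
        below, so the server-side default applies.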
78 | """ 79 | self.m: int | None = m 80 | self.ef_construction: int | None = ef_construction 81 | 82 | @override 83 | def create_index_query( 84 | self, 85 | table_name_quoted: str, 86 | column_name_quoted: str, 87 | index_name_quoted: str, 88 | distance_type: str, 89 | num_records_callback: Callable[[], int], 90 | ) -> str: 91 | index_method = self.get_index_method(distance_type) 92 | 93 | with_clauses: list[str] = [] 94 | if self.m is not None: 95 | with_clauses.append(f"m = {self.m}") 96 | if self.ef_construction is not None: 97 | with_clauses.append(f"ef_construction = {self.ef_construction}") 98 | 99 | with_clause = "" 100 | if len(with_clauses) > 0: 101 | with_clause = "WITH (" + ", ".join(with_clauses) + ")" 102 | 103 | return ( 104 | f"CREATE INDEX {index_name_quoted} ON {table_name_quoted}" 105 | f"USING hnsw ({column_name_quoted} {index_method}) {with_clause};" 106 | ) 107 | 108 | 109 | class DiskAnnIndex(BaseIndex): 110 | def __init__( 111 | self, 112 | search_list_size: int | None = None, 113 | num_neighbors: int | None = None, 114 | max_alpha: float | None = None, 115 | storage_layout: str | None = None, 116 | num_dimensions: int | None = None, 117 | num_bits_per_dimension: int | None = None, 118 | ) -> None: 119 | """ 120 | Timescale's vector index. 121 | """ 122 | self.search_list_size: int | None = search_list_size 123 | self.num_neighbors: int | None = num_neighbors 124 | self.max_alpha: float | None = max_alpha 125 | self.storage_layout: str | None = storage_layout 126 | self.num_dimensions: int | None = num_dimensions 127 | self.num_bits_per_dimension: int | None = num_bits_per_dimension 128 | 129 | @override 130 | def create_index_query( 131 | self, 132 | table_name_quoted: str, 133 | column_name_quoted: str, 134 | index_name_quoted: str, 135 | distance_type: str, 136 | num_records_callback: Callable[[], int], 137 | ) -> str: 138 | if distance_type != "<=>": 139 | raise ValueError( 140 | f"Timescale's vector index only supports cosine distance, but distance_type was {distance_type}" 141 | ) 142 | 143 | with_clauses: list[str] = [] 144 | if self.search_list_size is not None: 145 | with_clauses.append(f"search_list_size = {self.search_list_size}") 146 | if self.num_neighbors is not None: 147 | with_clauses.append(f"num_neighbors = {self.num_neighbors}") 148 | if self.max_alpha is not None: 149 | with_clauses.append(f"max_alpha = {self.max_alpha}") 150 | if self.storage_layout is not None: 151 | with_clauses.append(f"storage_layout = {self.storage_layout}") 152 | if self.num_dimensions is not None: 153 | with_clauses.append(f"num_dimensions = {self.num_dimensions}") 154 | if self.num_bits_per_dimension is not None: 155 | with_clauses.append(f"num_bits_per_dimension = {self.num_bits_per_dimension}") 156 | 157 | with_clause = "" 158 | if len(with_clauses) > 0: 159 | with_clause = "WITH (" + ", ".join(with_clauses) + ")" 160 | 161 | return ( 162 | f"CREATE INDEX {index_name_quoted} ON {table_name_quoted}" 163 | f"USING diskann ({column_name_quoted}) {with_clause};" 164 | ) 165 | 166 | 167 | class QueryParams: 168 | def __init__(self, params: dict[str, Any]) -> None: 169 | self.params: dict[str, Any] = params 170 | 171 | def get_statements(self) -> list[str]: 172 | return ["SET LOCAL " + key + " = " + str(value) for key, value in self.params.items()] 173 | 174 | 175 | class DiskAnnIndexParams(QueryParams): 176 | def __init__(self, search_list_size: int | None = None, rescore: int | None = None) -> None: 177 | params: dict[str, Any] = {} 178 | if search_list_size is not None: 
179 | params["diskann.query_search_list_size"] = search_list_size 180 | if rescore is not None: 181 | params["diskann.query_rescore"] = rescore 182 | super().__init__(params) 183 | 184 | 185 | class IvfflatIndexParams(QueryParams): 186 | def __init__(self, probes: int) -> None: 187 | super().__init__({"ivfflat.probes": probes}) 188 | 189 | 190 | class HNSWIndexParams(QueryParams): 191 | def __init__(self, ef_search: int) -> None: 192 | super().__init__({"hnsw.ef_search": ef_search}) 193 | -------------------------------------------------------------------------------- /tests/pg_vectorizer_test.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from typing import Any 3 | 4 | import psycopg2 5 | from langchain.docstore.document import Document 6 | from langchain.text_splitter import CharacterTextSplitter 7 | from langchain_community.vectorstores.timescalevector import TimescaleVector 8 | from langchain_openai import OpenAIEmbeddings 9 | 10 | from tests.utils import http_recorder 11 | from timescale_vector import client 12 | from timescale_vector.pgvectorizer import Vectorize 13 | 14 | 15 | def get_document(blog: dict[str, Any]) -> list[Document]: 16 | text_splitter = CharacterTextSplitter( 17 | chunk_size=1000, 18 | chunk_overlap=200, 19 | ) 20 | docs: list[Document] = [] 21 | for chunk in text_splitter.split_text(blog["contents"]): 22 | content = f"Author {blog['author']}, title: {blog['title']}, contents:{chunk}" 23 | metadata = { 24 | "id": str(client.uuid_from_time(blog["published_time"])), 25 | "blog_id": blog["id"], 26 | "author": blog["author"], 27 | "category": blog["category"], 28 | "published_time": blog["published_time"].isoformat(), 29 | } 30 | docs.append(Document(page_content=content, metadata=metadata)) 31 | return docs 32 | 33 | 34 | @http_recorder.use_cassette("pg_vectorizer.yaml") 35 | def test_pg_vectorizer(service_url: str) -> None: 36 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 37 | for item in ["blog", "blog_embedding_work_queue", "blog_embedding"]: 38 | cursor.execute(f"DROP TABLE IF EXISTS {item};") 39 | 40 | for item in ["public", "test"]: 41 | cursor.execute(f"DROP SCHEMA IF EXISTS {item} CASCADE;") 42 | cursor.execute(f"CREATE SCHEMA {item};") 43 | 44 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 45 | cursor.execute(""" 46 | CREATE TABLE IF NOT EXISTS blog ( 47 | id SERIAL PRIMARY KEY NOT NULL, 48 | title TEXT NOT NULL, 49 | author TEXT NOT NULL, 50 | contents TEXT NOT NULL, 51 | category TEXT NOT NULL, 52 | published_time TIMESTAMPTZ NULL --NULL if not yet published 53 | ); 54 | """) 55 | cursor.execute(""" 56 | insert into blog (title, author, contents, category, published_time) 57 | VALUES ('first', 'mat', 'first_post', 'personal', '2021-01-01'); 58 | """) 59 | 60 | def embed_and_write(blog_instances: list[Any], vectorizer: Vectorize) -> None: 61 | TABLE_NAME = vectorizer.table_name_unquoted + "_embedding" 62 | embedding = OpenAIEmbeddings() 63 | vector_store = TimescaleVector( 64 | collection_name=TABLE_NAME, 65 | service_url=service_url, 66 | embedding=embedding, 67 | time_partition_interval=timedelta(days=30), 68 | ) 69 | 70 | # delete old embeddings for all ids in the work queue 71 | metadata_for_delete = [{"blog_id": blog["locked_id"]} for blog in blog_instances] 72 | vector_store.delete_by_metadata(metadata_for_delete) 73 | 74 | documents: list[Document] = [] 75 | for blog in blog_instances: 76 | # skip blogs that are not published 
yet, or are deleted (will be None because of left join) 77 | if blog["published_time"] is not None: 78 | documents.extend(get_document(blog)) 79 | 80 | if len(documents) == 0: 81 | return 82 | 83 | texts = [d.page_content for d in documents] 84 | metadatas = [d.metadata for d in documents] 85 | ids = [d.metadata["id"] for d in documents] 86 | vector_store.add_texts(texts, metadatas, ids) 87 | 88 | vectorizer = Vectorize(service_url, "blog") 89 | vectorizer.register() 90 | # should be idempotent 91 | vectorizer.register() 92 | 93 | assert vectorizer.process(embed_and_write) == 1 94 | assert vectorizer.process(embed_and_write) == 0 95 | 96 | TABLE_NAME = "blog_embedding" 97 | embedding = OpenAIEmbeddings() 98 | vector_store = TimescaleVector( 99 | collection_name=TABLE_NAME, 100 | service_url=service_url, 101 | embedding=embedding, 102 | time_partition_interval=timedelta(days=30), 103 | ) 104 | 105 | res = vector_store.similarity_search_with_score("first", 10) 106 | assert len(res) == 1 107 | 108 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 109 | cursor.execute(""" 110 | insert into blog 111 | (title, author, contents, category, published_time) 112 | VALUES 113 | ('2', 'mat', 'second_post', 'personal', '2021-01-01'); 114 | insert into blog 115 | (title, author, contents, category, published_time) 116 | VALUES 117 | ('3', 'mat', 'third_post', 'personal', '2021-01-01'); 118 | """) 119 | assert vectorizer.process(embed_and_write) == 2 120 | assert vectorizer.process(embed_and_write) == 0 121 | 122 | res = vector_store.similarity_search_with_score("first", 10) 123 | assert len(res) == 3 124 | 125 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 126 | cursor.execute(""" 127 | DELETE FROM blog WHERE title = '3'; 128 | """) 129 | assert vectorizer.process(embed_and_write) == 1 130 | assert vectorizer.process(embed_and_write) == 0 131 | res = vector_store.similarity_search_with_score("first", 10) 132 | assert len(res) == 2 133 | 134 | res = vector_store.similarity_search_with_score("second", 10) 135 | assert len(res) == 2 136 | content = res[0][0].page_content 137 | assert "new version" not in content 138 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 139 | cursor.execute(""" 140 | update blog set contents = 'second post new version' WHERE title = '2'; 141 | """) 142 | assert vectorizer.process(embed_and_write) == 1 143 | assert vectorizer.process(embed_and_write) == 0 144 | res = vector_store.similarity_search_with_score("second", 10) 145 | assert len(res) == 2 146 | content = res[0][0].page_content 147 | assert "new version" in content 148 | 149 | with psycopg2.connect(service_url) as conn, conn.cursor() as cursor: 150 | cursor.execute(""" 151 | CREATE TABLE IF NOT EXISTS test.blog_table_name_that_is_really_really_long_and_i_mean_long ( 152 | id SERIAL PRIMARY KEY NOT NULL, 153 | title TEXT NOT NULL, 154 | author TEXT NOT NULL, 155 | contents TEXT NOT NULL, 156 | category TEXT NOT NULL, 157 | published_time TIMESTAMPTZ NULL --NULL if not yet published 158 | ); 159 | """) 160 | cursor.execute(""" 161 | insert into test.blog_table_name_that_is_really_really_long_and_i_mean_long 162 | (title, author, contents, category, published_time) 163 | VALUES 164 | ('first', 'mat', 'first_post', 'personal', '2021-01-01'); 165 | """) 166 | 167 | vectorizer = Vectorize( 168 | service_url, 169 | "blog_table_name_that_is_really_really_long_and_i_mean_long", 170 | schema_name="test", 171 | ) 172 | assert vectorizer.process(embed_and_write) == 1 
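    # the single row inserted above is picked up from the work queue on the first
    # pass; the queue is then empty, so a second pass processes nothing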
173 | assert vectorizer.process(embed_and_write) == 0 174 | -------------------------------------------------------------------------------- /timescale_vector/client/predicates.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from typing import Any, Literal, Union, get_args, get_origin 4 | 5 | 6 | def get_runtime_types(typ) -> tuple[type, ...]: # type: ignore 7 | """Convert a type with generic parameters to runtime types. 8 | Necessary because Generic types cant be passed to isinstance in python 3.10""" 9 | return tuple(get_origin(t) or t for t in get_args(typ)) # type: ignore 10 | 11 | 12 | class Predicates: 13 | logical_operators: dict[str, str] = { 14 | "AND": "AND", 15 | "OR": "OR", 16 | "NOT": "NOT", 17 | } 18 | 19 | operators_mapping: dict[str, str] = { 20 | "=": "=", 21 | "==": "=", 22 | ">=": ">=", 23 | ">": ">", 24 | "<=": "<=", 25 | "<": "<", 26 | "!=": "<>", 27 | "@>": "@>", # array contains 28 | } 29 | 30 | PredicateValue = str | int | float | datetime | list[Any] | tuple[Any] 31 | 32 | def __init__( 33 | self, 34 | *clauses: Union[ 35 | "Predicates", 36 | tuple[str, PredicateValue], 37 | tuple[str, str, PredicateValue], 38 | str, 39 | PredicateValue, 40 | ], 41 | operator: Literal["AND", "OR", "NOT"] = "AND", 42 | ): 43 | """ 44 | Predicates class defines predicates on the object metadata. 45 | Predicates can be combined using logical operators (&, |, and ~). 46 | 47 | Parameters 48 | ---------- 49 | clauses 50 | Predicate clauses. Can be either another Predicates object 51 | or a tuple of the form (field, operator, value) or (field, value). 52 | Operator 53 | Logical operator to use when combining the clauses. 54 | Can be one of 'AND', 'OR', 'NOT'. Defaults to 'AND'. 55 | """ 56 | if operator not in self.logical_operators: 57 | raise ValueError(f"invalid operator: {operator}") 58 | self.operator: str = operator 59 | if isinstance(clauses[0], str): 60 | if len(clauses) != 3 or not ( 61 | isinstance(clauses[1], str) and isinstance(clauses[2], get_runtime_types(self.PredicateValue)) 62 | ): 63 | raise ValueError(f"Invalid clause format: {clauses}") 64 | self.clauses = [clauses] 65 | else: 66 | self.clauses = list(clauses) 67 | 68 | def add_clause( 69 | self, 70 | *clause: Union[ 71 | "Predicates", 72 | tuple[str, PredicateValue], 73 | tuple[str, str, PredicateValue], 74 | str, 75 | PredicateValue, 76 | ], 77 | ) -> None: 78 | """ 79 | Add a clause to the predicates object. 80 | 81 | Parameters 82 | ---------- 83 | clause: 'Predicates' or Tuple[str, str] or Tuple[str, str, str] 84 | Predicate clause. Can be either another Predicates object or a tuple of the form (field, operator, value) 85 | or (field, value). 
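        Examples
        --------
        >>> p = Predicates("author", "==", "mat")
        >>> p.add_clause("category", "==", "personal")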
86 | """ 87 | if isinstance(clause[0], str): 88 | if len(clause) != 3 or not ( 89 | isinstance(clause[1], str) and isinstance(clause[2], get_runtime_types(self.PredicateValue)) 90 | ): 91 | raise ValueError(f"Invalid clause format: {clause}") 92 | self.clauses.append(clause) # type: ignore 93 | else: 94 | self.clauses.extend(list(clause)) # type: ignore 95 | 96 | def __and__(self, other: "Predicates") -> "Predicates": 97 | new_predicates = Predicates(self, other, operator="AND") 98 | return new_predicates 99 | 100 | def __or__(self, other: "Predicates") -> "Predicates": 101 | new_predicates = Predicates(self, other, operator="OR") 102 | return new_predicates 103 | 104 | def __invert__(self) -> "Predicates": 105 | new_predicates = Predicates(self, operator="NOT") 106 | return new_predicates 107 | 108 | def __eq__(self, other: object) -> bool: 109 | if not isinstance(other, Predicates): 110 | return False 111 | 112 | return self.operator == other.operator and self.clauses == other.clauses 113 | 114 | def __repr__(self) -> str: 115 | if self.operator: 116 | return f"{self.operator}({', '.join(repr(clause) for clause in self.clauses)})" 117 | else: 118 | return repr(self.clauses) 119 | 120 | def build_query(self, params: list[Any]) -> tuple[str, list[Any]]: 121 | """ 122 | Build the SQL query string and parameters for the predicates object. 123 | """ 124 | if not self.clauses: 125 | return "", [] 126 | 127 | where_conditions: list[str] = [] 128 | 129 | for clause in self.clauses: 130 | if isinstance(clause, Predicates): 131 | child_where_clause, params = clause.build_query(params) 132 | where_conditions.append(f"({child_where_clause})") 133 | elif isinstance(clause, tuple): 134 | if len(clause) == 2: 135 | field, value = clause 136 | operator = "=" # Default operator 137 | elif len(clause) == 3: 138 | field, operator, value = clause 139 | if operator not in self.operators_mapping: 140 | raise ValueError(f"Invalid operator: {operator}") 141 | operator = self.operators_mapping[operator] 142 | else: 143 | raise ValueError("Invalid clause format") 144 | 145 | index = len(params) + 1 146 | param_name = f"${index}" 147 | 148 | if field == "__uuid_timestamp": 149 | # convert str to timestamp in the database, it's better at it than python 150 | if isinstance(value, str): 151 | where_conditions.append(f"uuid_timestamp(id) {operator} ({param_name}::text)::timestamptz") 152 | else: 153 | where_conditions.append(f"uuid_timestamp(id) {operator} {param_name}") 154 | params.append(value) 155 | 156 | elif operator == "@>" and isinstance(value, list | tuple): 157 | if len(value) == 0: 158 | raise ValueError("Invalid value. 
Empty lists and empty tuples are not supported.") 159 | json_value = json.dumps(value) 160 | where_conditions.append(f"metadata @> jsonb_build_object('{field}', {param_name}::jsonb)") 161 | params.append(json_value) 162 | 163 | else: 164 | field_cast = "" 165 | if isinstance(value, int): 166 | field_cast = "::int" 167 | elif isinstance(value, float): 168 | field_cast = "::numeric" 169 | elif isinstance(value, datetime): 170 | field_cast = "::timestamptz" 171 | where_conditions.append(f"(metadata->>'{field}'){field_cast} {operator} {param_name}") 172 | params.append(value) 173 | 174 | if self.operator == "NOT": 175 | or_clauses = " OR ".join(where_conditions) 176 | # use IS DISTINCT FROM to treat all-null clauses as False and pass the filter 177 | where_clause = f"TRUE IS DISTINCT FROM ({or_clauses})" 178 | else: 179 | where_clause = (" " + self.operator + " ").join(where_conditions) 180 | return where_clause, params 181 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2022, fastai 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /timescale_vector/client/async_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import uuid 3 | from collections.abc import Mapping 4 | from datetime import datetime, timedelta 5 | from typing import Any, Literal, cast 6 | 7 | from asyncpg import Connection, Pool, Record, connect, create_pool 8 | from asyncpg.pool import PoolAcquireContext 9 | from pgvector.asyncpg import register_vector # type: ignore 10 | 11 | from timescale_vector.client.index import BaseIndex, QueryParams 12 | from timescale_vector.client.predicates import Predicates 13 | from timescale_vector.client.query_builder import QueryBuilder 14 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 15 | 16 | 17 | class Async(QueryBuilder): 18 | def __init__( 19 | self, 20 | service_url: str, 21 | table_name: str, 22 | num_dimensions: int, 23 | distance_type: str = "cosine", 24 | id_type: Literal["UUID"] | Literal["TEXT"] = "UUID", 25 | time_partition_interval: timedelta | None = None, 26 | max_db_connections: int | None = None, 27 | infer_filters: bool = True, 28 | schema_name: str | None = None, 29 | ) -> None: 30 | """ 31 | Initializes an async client for storing vector data. 32 | 33 | Parameters 34 | ---------- 35 | service_url 36 | The connection string for the database. 37 | table_name 38 | The name of the table. 39 | num_dimensions 40 | The number of dimensions for the embedding vector. 41 | distance_type 42 | The distance type for indexing. 43 | id_type 44 | The type of the id column. Can be either 'UUID' or 'TEXT'. 45 | time_partition_interval 46 | The time interval for partitioning the table (optional). 47 | infer_filters 48 | Whether to infer start and end times from the special __start_date and __end_date filters. 49 | schema_name 50 | The schema name for the table (optional, uses the database's default schema if not specified). 51 | """ 52 | self.builder = QueryBuilder( 53 | table_name, 54 | num_dimensions, 55 | distance_type, 56 | id_type, 57 | time_partition_interval, 58 | infer_filters, 59 | schema_name, 60 | ) 61 | self.service_url: str = service_url 62 | self.pool: Pool | None = None 63 | self.max_db_connections: int | None = max_db_connections 64 | self.time_partition_interval: timedelta | None = time_partition_interval 65 | 66 | async def _default_max_db_connections(self) -> int: 67 | """ 68 | Gets a default value for the number of max db connections to use. 69 | 70 | Returns 71 | ------- 72 | int 73 | """ 74 | query = self.builder.default_max_db_connection_query() 75 | conn: Connection = await connect(dsn=self.service_url) 76 | num_connections = await conn.fetchval(query) 77 | await conn.close() 78 | if num_connections is None: 79 | return 10 80 | return cast(int, num_connections) 81 | 82 | async def connect(self) -> PoolAcquireContext: 83 | """ 84 | Establishes a connection to a PostgreSQL database using asyncpg. 85 | 86 | Returns 87 | ------- 88 | asyncpg.pool.PoolAcquireContext: A context manager that acquires a database connection from the pool.
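        A minimal usage sketch (it assumes vec is an already-initialized Async client):

            async with await vec.connect() as conn:
                row = await conn.fetchrow("SELECT 1")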
89 | """ 90 | if self.pool is None: 91 | if self.max_db_connections is None: 92 | self.max_db_connections = await self._default_max_db_connections() 93 | 94 | async def init(conn: Connection) -> None: 95 | schema = await self._detect_vector_schema(conn) 96 | if schema is None: 97 | raise ValueError("pg_vector extension not found") 98 | await register_vector(conn, schema=schema) 99 | # decode to a dict, but accept a string as input in upsert 100 | await conn.set_type_codec("jsonb", encoder=str, decoder=json.loads, schema="pg_catalog") 101 | 102 | self.pool = await create_pool( 103 | dsn=self.service_url, 104 | init=init, 105 | min_size=1, 106 | max_size=self.max_db_connections, 107 | ) 108 | 109 | return self.pool.acquire() 110 | 111 | async def close(self) -> None: 112 | if self.pool is not None: 113 | await self.pool.close() 114 | 115 | async def table_is_empty(self) -> bool: 116 | """ 117 | Checks if the table is empty. 118 | 119 | Returns 120 | ------- 121 | bool: True if the table is empty, False otherwise. 122 | """ 123 | query = self.builder.get_row_exists_query() 124 | async with await self.connect() as pool: 125 | rec = await pool.fetchrow(query) 126 | return rec is None 127 | 128 | def munge_record(self, records: list[tuple[Any, ...]]) -> list[tuple[uuid.UUID, str, str, list[float]]]: 129 | metadata_is_dict = isinstance(records[0][1], dict) 130 | if metadata_is_dict: 131 | return list(map(lambda item: Async._convert_record_meta_to_json(item), records)) 132 | return records 133 | 134 | async def _detect_vector_schema(self, conn: Connection) -> str | None: 135 | query = """ 136 | select n.nspname 137 | from pg_extension x 138 | inner join pg_namespace n on (x.extnamespace = n.oid) 139 | where x.extname = 'vector'; 140 | """ 141 | 142 | return await conn.fetchval(query) 143 | 144 | @staticmethod 145 | def _convert_record_meta_to_json(item: tuple[Any, ...]) -> tuple[uuid.UUID, str, str, list[float]]: 146 | if not isinstance(item[1], dict): 147 | raise ValueError("Cannot mix dictionary and string metadata fields in the same upsert") 148 | return item[0], json.dumps(item[1]), item[2], item[3] 149 | 150 | async def upsert(self, records: list[tuple[Any, ...]]) -> None: 151 | """ 152 | Performs upsert operation for multiple records. 153 | 154 | Parameters 155 | ---------- 156 | records 157 | List of records to upsert. Each record is a tuple of the form (id, metadata, contents, embedding). 158 | 159 | Returns 160 | ------- 161 | None 162 | """ 163 | munged_records = self.munge_record(records) 164 | query = self.builder.get_upsert_query() 165 | async with await self.connect() as pool: 166 | await pool.executemany(query, munged_records) 167 | 168 | async def create_tables(self) -> None: 169 | """ 170 | Creates necessary tables. 171 | 172 | Returns 173 | ------- 174 | None 175 | """ 176 | query = self.builder.get_create_query() 177 | # don't use a connection pool for this because the vector extension may not be installed yet 178 | # and if it's not installed, register_vector will fail. 179 | conn = await connect(dsn=self.service_url) 180 | await conn.execute(query) 181 | await conn.close() 182 | 183 | async def delete_all(self, drop_index: bool = True) -> None: 184 | """ 185 | Deletes all data. Also drops the index if `drop_index` is true. 
186 | 187 | Returns 188 | ------- 189 | None 190 | """ 191 | if drop_index: 192 | await self.drop_embedding_index() 193 | query = self.builder.delete_all_query() 194 | async with await self.connect() as pool: 195 | await pool.execute(query) 196 | 197 | async def delete_by_ids(self, ids: list[uuid.UUID] | list[str]) -> list[Record]: 198 | """ 199 | Delete records by id. 200 | """ 201 | (query, params) = self.builder.delete_by_ids_query(ids) 202 | async with await self.connect() as pool: 203 | return await pool.fetch(query, *params) 204 | 205 | async def delete_by_metadata(self, filter: dict[str, str] | list[dict[str, str]]) -> list[Record]: 206 | """ 207 | Delete records by metadata filters. 208 | """ 209 | (query, params) = self.builder.delete_by_metadata_query(filter) 210 | async with await self.connect() as pool: 211 | return await pool.fetch(query, *params) 212 | 213 | async def drop_table(self) -> None: 214 | """ 215 | Drops the table 216 | 217 | Returns 218 | ------- 219 | None 220 | """ 221 | query = self.builder.drop_table_query() 222 | async with await self.connect() as pool: 223 | await pool.execute(query) 224 | 225 | async def _get_approx_count(self) -> int: 226 | """ 227 | Retrieves an approximate count of records in the table. 228 | 229 | Returns 230 | ------- 231 | int: Approximate count of records. 232 | """ 233 | query = self.builder.get_approx_count_query() 234 | async with await self.connect() as pool: 235 | rec = await pool.fetchrow(query) 236 | return cast(int, rec[0] if rec is not None else 0) 237 | 238 | async def drop_embedding_index(self) -> None: 239 | """ 240 | Drop any index on the embedding 241 | 242 | Returns 243 | ------- 244 | None 245 | """ 246 | query = self.builder.drop_embedding_index_query() 247 | async with await self.connect() as pool: 248 | await pool.execute(query) 249 | 250 | async def create_embedding_index(self, index: BaseIndex) -> None: 251 | """ 252 | Creates an index for the table. 253 | 254 | Parameters 255 | ---------- 256 | index 257 | The index to create. 258 | 259 | Returns 260 | ------- 261 | None 262 | """ 263 | num_records = await self._get_approx_count() 264 | query = self.builder.create_embedding_index_query(index, lambda: num_records) 265 | 266 | async with await self.connect() as pool: 267 | await pool.execute(query) 268 | 269 | async def search( 270 | self, 271 | query_embedding: list[float] | None = None, 272 | limit: int = 10, 273 | filter: Mapping[str, datetime | str] | list[dict[str, str]] | None = None, 274 | predicates: Predicates | None = None, 275 | uuid_time_filter: UUIDTimeRange | None = None, 276 | query_params: QueryParams | None = None, 277 | ) -> list[Record]: 278 | """ 279 | Retrieves similar records using a similarity query. 280 | 281 | Parameters 282 | ---------- 283 | query_embedding 284 | The query embedding vector. 285 | limit 286 | The number of nearest neighbors to retrieve. 287 | filter 288 | A filter for metadata. Should be specified as a key-value object or a list of key-value objects 289 | (where any objects in the list are matched). 290 | predicates 291 | A Predicates object to filter the results. Predicates support more complex queries than the filter 292 | parameter. Predicates can be combined using logical operators (&, |, and ~). 293 | uuid_time_filter 294 | A UUIDTimeRange object to filter the results by time using the id column. 295 | query_params 296 | Optional query-time index parameters (for example, DiskAnnIndexParams); their statements are executed before the search query. 297 | Returns 298 | ------- 299 | List: List of similar records.
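        Illustrative calls (the embeddings and metadata keys are example data only,
        and vec is assumed to be an initialized Async client):

            await vec.search([1.0, 2.0], limit=5, filter={"category": "personal"})
            await vec.search([1.0, 2.0], predicates=Predicates("views", ">", 10))
            await vec.search([1.0, 2.0], query_params=DiskAnnIndexParams(100, rescore=50))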
300 | """ 301 | (query, params) = self.builder.search_query(query_embedding, limit, filter, predicates, uuid_time_filter) 302 | if query_params is not None: 303 | async with await self.connect() as pool, pool.transaction(): 304 | # Looks like there is no way to pipeline this: https://github.com/MagicStack/asyncpg/issues/588 305 | statements = query_params.get_statements() 306 | for statement in statements: 307 | await pool.execute(statement) 308 | return await pool.fetch(query, *params) 309 | else: 310 | async with await self.connect() as pool: 311 | return await pool.fetch(query, *params) 312 | -------------------------------------------------------------------------------- /tests/sync_client_test.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime, timedelta 3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from timescale_vector.client import ( 8 | SEARCH_RESULT_CONTENTS_IDX, 9 | SEARCH_RESULT_DISTANCE_IDX, 10 | SEARCH_RESULT_ID_IDX, 11 | SEARCH_RESULT_METADATA_IDX, 12 | DiskAnnIndex, 13 | DiskAnnIndexParams, 14 | HNSWIndex, 15 | IvfflatIndex, 16 | Predicates, 17 | Sync, 18 | UUIDTimeRange, 19 | uuid_from_time, 20 | ) 21 | 22 | 23 | @pytest.mark.parametrize("schema", ["temp", None]) 24 | def test_sync_client(service_url: str, schema: str) -> None: 25 | vec = Sync(service_url, "data_table", 2, schema_name=schema) 26 | vec.create_tables() 27 | empty = vec.table_is_empty() 28 | 29 | assert empty 30 | vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 31 | empty = vec.table_is_empty() 32 | assert not empty 33 | 34 | vec.upsert( 35 | [ 36 | (uuid.uuid4(), """{"key":"val"}""", "the brown fox", [1.0, 1.3]), 37 | (uuid.uuid4(), """{"key":"val2"}""", "the brown fox", [1.0, 1.4]), 38 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.5]), 39 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 40 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 41 | (uuid.uuid4(), """{"key2":"val2"}""", "the brown fox", [1.0, 1.7]), 42 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 43 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.9]), 44 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 100.8]), 45 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 101.8]), 46 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 47 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 48 | ( 49 | uuid.uuid4(), 50 | """{"key_1":"val_1", "key_2":"val_2"}""", 51 | "the brown fox", 52 | [1.0, 1.8], 53 | ), 54 | (uuid.uuid4(), """{"key0": [1,2,3,4]}""", "the brown fox", [1.0, 1.8]), 55 | ( 56 | uuid.uuid4(), 57 | """{"key0": [5,6,7], "key3": 3}""", 58 | "the brown fox", 59 | [1.0, 1.8], 60 | ), 61 | ] 62 | ) 63 | 64 | vec.create_embedding_index(IvfflatIndex()) 65 | vec.drop_embedding_index() 66 | vec.create_embedding_index(IvfflatIndex(100)) 67 | vec.drop_embedding_index() 68 | vec.create_embedding_index(HNSWIndex()) 69 | vec.drop_embedding_index() 70 | vec.create_embedding_index(HNSWIndex(20, 125)) 71 | vec.drop_embedding_index() 72 | vec.create_embedding_index(DiskAnnIndex()) 73 | vec.drop_embedding_index() 74 | vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5)) 75 | 76 | rec = vec.search([1.0, 2.0]) 77 | assert len(rec) == 10 78 | rec = vec.search(np.array([1.0, 2.0])) 79 | assert len(rec) == 10 80 | rec = vec.search([1.0, 2.0], limit=4) 81 | assert len(rec) == 4 82 | rec = 
vec.search(limit=4) 83 | assert len(rec) == 4 84 | rec = vec.search([1.0, 2.0], limit=4, filter={"key2": "val2"}) 85 | assert len(rec) == 1 86 | rec = vec.search([1.0, 2.0], limit=4, filter={"key2": "does not exist"}) 87 | assert len(rec) == 0 88 | rec = vec.search(limit=4, filter={"key2": "does not exist"}) 89 | assert len(rec) == 0 90 | rec = vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1"}) 91 | assert len(rec) == 1 92 | rec = vec.search([1.0, 2.0], filter={"key_1": "val_1", "key_2": "val_2"}) 93 | assert len(rec) == 1 94 | rec = vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1", "key_2": "val_3"}) 95 | assert len(rec) == 0 96 | 97 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 98 | assert len(rec) == 2 99 | 100 | rec = vec.search( 101 | [1.0, 2.0], 102 | limit=4, 103 | filter=[ 104 | {"key_1": "val_1"}, 105 | {"key2": "val2"}, 106 | {"no such key": "no such val"}, 107 | ], 108 | ) 109 | assert len(rec) == 2 110 | 111 | raised = False 112 | try: 113 | # can't upsert using both keys and dictionaries 114 | vec.upsert( 115 | [ 116 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 117 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 118 | ] 119 | ) 120 | except ValueError: 121 | raised = True 122 | assert raised 123 | 124 | raised = False 125 | try: 126 | # can't upsert using both keys and dictionaries opposite order 127 | vec.upsert( 128 | [ 129 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 130 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 131 | ] 132 | ) 133 | except BaseException: 134 | raised = True 135 | assert raised 136 | 137 | rec = vec.search([1.0, 2.0], filter={"key_1": "val_1", "key_2": "val_2"}) 138 | assert rec[0][SEARCH_RESULT_CONTENTS_IDX] == "the brown fox" 139 | assert rec[0]["contents"] == "the brown fox" # type: ignore 140 | assert rec[0][SEARCH_RESULT_METADATA_IDX] == { 141 | "key_1": "val_1", 142 | "key_2": "val_2", 143 | } 144 | assert rec[0]["metadata"] == {"key_1": "val_1", "key_2": "val_2"} # type: ignore 145 | assert isinstance(rec[0][SEARCH_RESULT_METADATA_IDX], dict) 146 | assert rec[0][SEARCH_RESULT_DISTANCE_IDX] == 0.0009438353921149556 147 | assert rec[0]["distance"] == 0.0009438353921149556 # type: ignore 148 | 149 | rec = vec.search([1.0, 2.0], limit=4, predicates=Predicates("key", "==", "val2")) 150 | assert len(rec) == 1 151 | 152 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 153 | assert len(rec) == 2 154 | vec.delete_by_ids([rec[0][SEARCH_RESULT_ID_IDX]]) 155 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 156 | assert len(rec) == 1 157 | vec.delete_by_metadata([{"key_1": "val_1"}, {"key2": "val2"}]) 158 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 159 | assert len(rec) == 0 160 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 161 | assert len(rec) == 4 162 | vec.delete_by_metadata([{"key2": "val"}]) 163 | rec = vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 164 | assert len(rec) == 0 165 | 166 | assert not vec.table_is_empty() 167 | vec.delete_all() 168 | assert vec.table_is_empty() 169 | 170 | vec.drop_table() 171 | vec.close() 172 | 173 | vec = Sync(service_url, "data_table", 2, id_type="TEXT", schema_name=schema) 174 | vec.create_tables() 175 | assert vec.table_is_empty() 176 | vec.upsert([("Not a valid UUID", {"key": "val"}, "the brown fox", [1.0, 1.2])]) 177 | assert not 
vec.table_is_empty() 178 | vec.delete_by_ids(["Not a valid UUID"]) 179 | assert vec.table_is_empty() 180 | vec.drop_table() 181 | vec.close() 182 | 183 | vec = Sync( 184 | service_url, 185 | "data_table", 186 | 2, 187 | time_partition_interval=timedelta(seconds=60), 188 | schema_name=schema, 189 | ) 190 | vec.create_tables() 191 | assert vec.table_is_empty() 192 | id = uuid.uuid1() 193 | vec.upsert([(id, {"key": "val"}, "the brown fox", [1.0, 1.2])]) 194 | assert not vec.table_is_empty() 195 | vec.delete_by_ids([id]) 196 | assert vec.table_is_empty() 197 | raised = False 198 | try: 199 | # can't upsert with uuid type 4 in time partitioned table 200 | vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 201 | # pass 202 | except BaseException: 203 | raised = True 204 | assert raised 205 | 206 | specific_datetime = datetime(2018, 8, 10, 15, 30, 0) 207 | vec.upsert( 208 | [ 209 | # current time 210 | (uuid.uuid1(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 211 | # time in 2018 212 | ( 213 | uuid_from_time(specific_datetime), 214 | {"key": "val"}, 215 | "the brown fox", 216 | [1.0, 1.2], 217 | ), 218 | ] 219 | ) 220 | 221 | def search_date(start_date: datetime | str | None, end_date: datetime | str | None, expected: int) -> None: 222 | # using uuid_time_filter 223 | rec = vec.search( 224 | [1.0, 2.0], 225 | limit=4, 226 | uuid_time_filter=UUIDTimeRange(start_date, end_date), 227 | ) 228 | assert len(rec) == expected 229 | rec = vec.search( 230 | [1.0, 2.0], 231 | limit=4, 232 | uuid_time_filter=UUIDTimeRange(str(start_date), str(end_date)), 233 | ) 234 | assert len(rec) == expected 235 | 236 | # using filters 237 | filter: dict[str, str | datetime] = {} 238 | if start_date is not None: 239 | filter["__start_date"] = start_date 240 | if end_date is not None: 241 | filter["__end_date"] = end_date 242 | rec = vec.search([1.0, 2.0], limit=4, filter=filter) 243 | assert len(rec) == expected 244 | # using filters with string dates 245 | filter = {} 246 | if start_date is not None: 247 | filter["__start_date"] = str(start_date) 248 | if end_date is not None: 249 | filter["__end_date"] = str(end_date) 250 | rec = vec.search([1.0, 2.0], limit=4, filter=filter) 251 | assert len(rec) == expected 252 | # using predicates 253 | predicates: list[tuple[str, str, str | datetime]] = [] 254 | if start_date is not None: 255 | predicates.append(("__uuid_timestamp", ">=", start_date)) 256 | if end_date is not None: 257 | predicates.append(("__uuid_timestamp", "<", end_date)) 258 | rec = vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 259 | assert len(rec) == expected 260 | # using predicates with string dates 261 | predicates = [] 262 | if start_date is not None: 263 | predicates.append(("__uuid_timestamp", ">=", str(start_date))) 264 | if end_date is not None: 265 | predicates.append(("__uuid_timestamp", "<", str(end_date))) 266 | rec = vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 267 | assert len(rec) == expected 268 | 269 | assert not vec.table_is_empty() 270 | 271 | search_date( 272 | specific_datetime - timedelta(days=7), 273 | specific_datetime + timedelta(days=7), 274 | 1, 275 | ) 276 | search_date(specific_datetime - timedelta(days=7), None, 2) 277 | search_date(None, specific_datetime + timedelta(days=7), 1) 278 | search_date( 279 | specific_datetime - timedelta(days=7), 280 | specific_datetime - timedelta(days=2), 281 | 0, 282 | ) 283 | 284 | # check timedelta handling 285 | rec = vec.search( 286 | [1.0, 2.0], 287 | limit=4, 288 | 
uuid_time_filter=UUIDTimeRange(start_date=specific_datetime, time_delta=timedelta(days=7)), 289 | ) 290 | assert len(rec) == 1 291 | # end is exclusive 292 | rec = vec.search( 293 | [1.0, 2.0], 294 | limit=4, 295 | uuid_time_filter=UUIDTimeRange(end_date=specific_datetime, time_delta=timedelta(days=7)), 296 | ) 297 | assert len(rec) == 0 298 | rec = vec.search( 299 | [1.0, 2.0], 300 | limit=4, 301 | uuid_time_filter=UUIDTimeRange( 302 | end_date=specific_datetime + timedelta(seconds=1), 303 | time_delta=timedelta(days=7), 304 | ), 305 | ) 306 | assert len(rec) == 1 307 | rec = vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(10, 5)) 308 | assert len(rec) == 2 309 | rec = vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(100, rescore=2)) 310 | assert len(rec) == 2 311 | vec.drop_table() 312 | vec.close() 313 | -------------------------------------------------------------------------------- /timescale_vector/client/sync_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import uuid 4 | from collections.abc import Iterator, Mapping 5 | from contextlib import contextmanager 6 | from datetime import datetime, timedelta 7 | from typing import Any, Literal 8 | 9 | import numpy as np 10 | from numpy import ndarray 11 | from pgvector.psycopg2 import register_vector # type: ignore 12 | from psycopg2 import connect 13 | from psycopg2.extensions import connection as PSYConnection 14 | from psycopg2.extras import DictCursor, register_uuid 15 | from psycopg2.pool import SimpleConnectionPool 16 | 17 | from timescale_vector.client.index import BaseIndex, QueryParams 18 | from timescale_vector.client.predicates import Predicates 19 | from timescale_vector.client.query_builder import QueryBuilder 20 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 21 | 22 | 23 | class Sync: 24 | translated_queries: dict[str, str] = {} 25 | 26 | def __init__( 27 | self, 28 | service_url: str, 29 | table_name: str, 30 | num_dimensions: int, 31 | distance_type: str = "cosine", 32 | id_type: Literal["UUID"] | Literal["TEXT"] = "UUID", 33 | time_partition_interval: timedelta | None = None, 34 | max_db_connections: int | None = None, 35 | infer_filters: bool = True, 36 | schema_name: str | None = None, 37 | ) -> None: 38 | """ 39 | Initializes a sync client for storing vector data. 40 | 41 | Parameters 42 | ---------- 43 | service_url 44 | The connection string for the database. 45 | table_name 46 | The name of the table. 47 | num_dimensions 48 | The number of dimensions for the embedding vector. 49 | distance_type 50 | The distance type for indexing. 51 | id_type 52 | The type of the primary id column. Can be either 'UUID' or 'TEXT'. 53 | time_partition_interval 54 | The time interval for partitioning the table (optional). 55 | infer_filters 56 | Whether to infer start and end times from the special __start_date and __end_date filters. 57 | schema_name 58 | The schema name for the table (optional, uses the database's default schema if not specified). 
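        A construction sketch (the service URL and table name are placeholders):

            vec = Sync("postgres://user:password@localhost:5432/postgres", "my_table", 2)
            vec.create_tables()
            vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])])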
59 | """ 60 | self.builder = QueryBuilder( 61 | table_name, 62 | num_dimensions, 63 | distance_type, 64 | id_type, 65 | time_partition_interval, 66 | infer_filters, 67 | schema_name, 68 | ) 69 | self.service_url: str = service_url 70 | self.pool: SimpleConnectionPool | None = None 71 | self.max_db_connections: int | None = max_db_connections 72 | self.time_partition_interval: timedelta | None = time_partition_interval 73 | register_uuid() 74 | 75 | def default_max_db_connections(self) -> int: 76 | """ 77 | Gets a default value for the number of max db connections to use. 78 | """ 79 | query = self.builder.default_max_db_connection_query() 80 | conn = connect(dsn=self.service_url) 81 | with conn.cursor() as cur: 82 | cur.execute(query) 83 | num_connections = cur.fetchone() 84 | conn.close() 85 | return num_connections[0] # type: ignore 86 | 87 | @contextmanager 88 | def connect(self) -> Iterator[PSYConnection]: 89 | """ 90 | Establishes a connection to a PostgreSQL database using psycopg2 and allows its 91 | use in a context manager. 92 | """ 93 | if self.pool is None: 94 | if self.max_db_connections is None: 95 | self.max_db_connections = self.default_max_db_connections() 96 | 97 | self.pool = SimpleConnectionPool( 98 | 1, 99 | self.max_db_connections, 100 | dsn=self.service_url, 101 | cursor_factory=DictCursor, 102 | ) 103 | 104 | connection = self.pool.getconn() 105 | register_vector(connection) 106 | try: 107 | yield connection 108 | connection.commit() 109 | finally: 110 | self.pool.putconn(connection) 111 | 112 | def close(self) -> None: 113 | if self.pool is not None: 114 | self.pool.closeall() 115 | 116 | def _translate_to_pyformat(self, query_string: str, params: list[Any] | None) -> tuple[str, dict[str, Any]]: 117 | """ 118 | Translates dollar sign number parameters and list parameters to pyformat strings. 119 | 120 | Args: 121 | query_string (str): The query string with parameters. 122 | params (list|None): List of parameter values. 123 | 124 | Returns: 125 | str: The query string with translated pyformat parameters. 126 | dict: A dictionary mapping parameter numbers to their values. 127 | """ 128 | 129 | translated_params: dict[str, Any] = {} 130 | if params is not None: 131 | for idx, param in enumerate(params): 132 | translated_params[str(idx + 1)] = param 133 | 134 | if query_string in self.translated_queries: 135 | return self.translated_queries[query_string], translated_params 136 | 137 | dollar_params = re.findall(r"\$[0-9]+", query_string) 138 | translated_string = query_string 139 | for dollar_param in dollar_params: 140 | # Extract the number after the $ 141 | param_number = int(dollar_param[1:]) 142 | pyformat_param = ("%s" if param_number == 0 else f"%({param_number})s") if params is not None else "%s" 143 | translated_string = translated_string.replace(dollar_param, pyformat_param) 144 | 145 | self.translated_queries[query_string] = translated_string 146 | return self.translated_queries[query_string], translated_params 147 | 148 | def table_is_empty(self) -> bool: 149 | """ 150 | Checks if the table is empty. 151 | 152 | Returns 153 | ------- 154 | bool: True if the table is empty, False otherwise.
155 | """ 156 | query = self.builder.get_row_exists_query() 157 | with self.connect() as conn, conn.cursor() as cur: 158 | cur.execute(query) 159 | rec = cur.fetchone() 160 | return rec is None 161 | 162 | def munge_record(self, records: list[tuple[Any, ...]]) -> list[tuple[uuid.UUID, str, str, list[float]]]: 163 | metadata_is_dict = isinstance(records[0][1], dict) 164 | if metadata_is_dict: 165 | return list(map(lambda item: Sync._convert_record_meta_to_json(item), records)) 166 | 167 | return records 168 | 169 | @staticmethod 170 | def _convert_record_meta_to_json(item: tuple[Any, ...]) -> tuple[uuid.UUID, str, str, list[float]]: 171 | if not isinstance(item[1], dict): 172 | raise ValueError("Cannot mix dictionary and string metadata fields in the same upsert") 173 | return item[0], json.dumps(item[1]), item[2], item[3] 174 | 175 | def upsert(self, records: list[tuple[Any, ...]]) -> None: 176 | """ 177 | Performs upsert operation for multiple records. 178 | 179 | Parameters 180 | ---------- 181 | records 182 | Records to upsert. 183 | 184 | Returns 185 | ------- 186 | None 187 | """ 188 | munged_records = self.munge_record(records) 189 | query = self.builder.get_upsert_query() 190 | query, _ = self._translate_to_pyformat(query, None) 191 | with self.connect() as conn, conn.cursor() as cur: 192 | cur.executemany(query, munged_records) 193 | 194 | def create_tables(self) -> None: 195 | """ 196 | Creates necessary tables. 197 | 198 | Returns 199 | ------- 200 | None 201 | """ 202 | query = self.builder.get_create_query() 203 | # don't use a connection pool for this because the vector extension may not be installed yet 204 | # and if it's not installed, register_vector will fail. 205 | conn = connect(dsn=self.service_url) 206 | with conn.cursor() as cur: 207 | cur.execute(query) 208 | conn.commit() 209 | conn.close() 210 | 211 | def delete_all(self, drop_index: bool = True) -> None: 212 | """ 213 | Deletes all data. Also drops the index if `drop_index` is true. 214 | 215 | Returns 216 | ------- 217 | None 218 | """ 219 | if drop_index: 220 | self.drop_embedding_index() 221 | query = self.builder.delete_all_query() 222 | with self.connect() as conn, conn.cursor() as cur: 223 | cur.execute(query) 224 | 225 | def delete_by_ids(self, ids: list[uuid.UUID] | list[str]) -> None: 226 | """ 227 | Delete records by id. 228 | 229 | Parameters 230 | ---------- 231 | ids 232 | List of ids to delete. 233 | """ 234 | (query, params) = self.builder.delete_by_ids_query(ids) 235 | translated_query, translated_params = self._translate_to_pyformat(query, params) 236 | with self.connect() as conn, conn.cursor() as cur: 237 | cur.execute(translated_query, translated_params) 238 | 239 | def delete_by_metadata(self, filter: dict[str, str] | list[dict[str, str]]) -> None: 240 | """ 241 | Delete records by metadata filters. 242 | """ 243 | (query, params) = self.builder.delete_by_metadata_query(filter) 244 | translated_query, translated_params = self._translate_to_pyformat(query, params) 245 | with self.connect() as conn, conn.cursor() as cur: 246 | cur.execute(translated_query, translated_params) 247 | 248 | def drop_table(self) -> None: 249 | """ 250 | Drops the table 251 | 252 | Returns 253 | ------- 254 | None 255 | """ 256 | query = self.builder.drop_table_query() 257 | with self.connect() as conn, conn.cursor() as cur: 258 | cur.execute(query) 259 | 260 | def _get_approx_count(self) -> int: 261 | """ 262 | Retrieves an approximate count of records in the table. 
263 | 264 | Returns 265 | ------- 266 | int: Approximate count of records. 267 | """ 268 | query = self.builder.get_approx_count_query() 269 | with self.connect() as conn, conn.cursor() as cur: 270 | cur.execute(query) 271 | rec = cur.fetchone() 272 | return rec[0] if rec is not None else 0 273 | 274 | def drop_embedding_index(self) -> None: 275 | """ 276 | Drop any index on the embedding 277 | 278 | Returns 279 | -------- 280 | None 281 | """ 282 | query = self.builder.drop_embedding_index_query() 283 | with self.connect() as conn, conn.cursor() as cur: 284 | cur.execute(query) 285 | 286 | def create_embedding_index(self, index: BaseIndex) -> None: 287 | """ 288 | Creates an index on the embedding for the table. 289 | 290 | Parameters 291 | ---------- 292 | index 293 | The index to create. 294 | 295 | Returns 296 | -------- 297 | None 298 | """ 299 | query = self.builder.create_embedding_index_query(index, lambda: self._get_approx_count()) 300 | with self.connect() as conn, conn.cursor() as cur: 301 | cur.execute(query) 302 | 303 | def search( 304 | self, 305 | query_embedding: ndarray[Any, Any] | list[float] | None = None, 306 | limit: int = 10, 307 | filter: Mapping[str, datetime | str] | list[dict[str, str]] | None = None, 308 | predicates: Predicates | None = None, 309 | uuid_time_filter: UUIDTimeRange | None = None, 310 | query_params: QueryParams | None = None, 311 | ) -> list[tuple[Any, ...]]: 312 | """ 313 | Retrieves similar records using a similarity query. 314 | 315 | Parameters 316 | ---------- 317 | query_embedding 318 | The query embedding vector. 319 | limit 320 | The number of nearest neighbors to retrieve. 321 | filter 322 | A filter for metadata. Should be specified as a key-value object or a list of key-value objects 323 | (where any objects in the list are matched). 324 | predicates 325 | A Predicates object to filter the results. Predicates support more complex queries 326 | than the filter parameter. Predicates can be combined using logical operators (&, |, and ~). 327 | 328 | Returns 329 | -------- 330 | List: List of similar records.
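        Illustrative calls mirroring the test suite (example values only; start_date
        and end_date are datetimes chosen by the caller):

            vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1"})
            vec.search([1.0, 2.0], limit=4, predicates=Predicates("key", "==", "val2"))
            vec.search([1.0, 2.0], limit=4, uuid_time_filter=UUIDTimeRange(start_date, end_date))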
331 | """ 332 | query_embedding_np = np.array(query_embedding) if query_embedding is not None else None 333 | 334 | (query, params) = self.builder.search_query(query_embedding_np, limit, filter, predicates, uuid_time_filter) 335 | translated_query, translated_params = self._translate_to_pyformat(query, params) 336 | 337 | if query_params is not None: 338 | prefix = "; ".join(query_params.get_statements()) 339 | translated_query = f"{prefix}; {translated_query}" 340 | 341 | with self.connect() as conn, conn.cursor() as cur: 342 | cur.execute(translated_query, translated_params) 343 | return cur.fetchall() 344 | -------------------------------------------------------------------------------- /timescale_vector/client/query_builder.py: -------------------------------------------------------------------------------- 1 | # pyright: reportPrivateUsage=false 2 | import json 3 | import uuid 4 | from collections.abc import Callable, Mapping 5 | from datetime import datetime, timedelta 6 | from typing import Any 7 | 8 | import numpy as np 9 | 10 | from timescale_vector.client.index import BaseIndex 11 | from timescale_vector.client.predicates import Predicates 12 | from timescale_vector.client.uuid_time_range import UUIDTimeRange 13 | 14 | 15 | class QueryBuilder: 16 | def __init__( 17 | self, 18 | table_name: str, 19 | num_dimensions: int, 20 | distance_type: str, 21 | id_type: str, 22 | time_partition_interval: timedelta | None, 23 | infer_filters: bool, 24 | schema_name: str | None, 25 | ) -> None: 26 | """ 27 | Initializes a base Vector object to generate queries for vector clients. 28 | 29 | Parameters 30 | ---------- 31 | table_name 32 | The name of the table. 33 | num_dimensions 34 | The number of dimensions for the embedding vector. 35 | distance_type 36 | The distance type for indexing. 37 | id_type 38 | The type of the id column. Can be either 'UUID' or 'TEXT'. 39 | time_partition_interval 40 | The time interval for partitioning the table (optional). 41 | infer_filters 42 | Whether to infer start and end times from the special __start_date and __end_date filters. 43 | schema_name 44 | The schema name for the table (optional, uses the database's default schema if not specified). 45 | """ 46 | self.table_name: str = table_name 47 | self.schema_name: str | None = schema_name 48 | self.num_dimensions: int = num_dimensions 49 | if distance_type == "cosine" or distance_type == "<=>": 50 | self.distance_type: str = "<=>" 51 | elif distance_type == "euclidean" or distance_type == "<->" or distance_type == "l2": 52 | self.distance_type = "<->" 53 | else: 54 | raise ValueError(f"unrecognized distance_type {distance_type}") 55 | 56 | if id_type.lower() != "uuid" and id_type.lower() != "text": 57 | raise ValueError(f"unrecognized id_type {id_type}") 58 | 59 | if time_partition_interval is not None and id_type.lower() != "uuid": 60 | raise ValueError("time partitioning is only supported for uuid id_type") 61 | 62 | self.id_type: str = id_type.lower() 63 | self.time_partition_interval: timedelta | None = time_partition_interval 64 | self.infer_filters: bool = infer_filters 65 | 66 | @staticmethod 67 | def _quote_ident(ident: str) -> str: 68 | """ 69 | Quotes an identifier to prevent SQL injection. 70 | 71 | Parameters 72 | ---------- 73 | ident 74 | The identifier to be quoted. 75 | 76 | Returns 77 | ------- 78 | str: The quoted identifier. 
79 | """ 80 | return '"{}"'.format(ident.replace('"', '""')) 81 | 82 | def _quoted_table_name(self) -> str: 83 | if self.schema_name is not None: 84 | return self._quote_ident(self.schema_name) + "." + self._quote_ident(self.table_name) 85 | else: 86 | return self._quote_ident(self.table_name) 87 | 88 | def get_row_exists_query(self) -> str: 89 | """ 90 | Generates a query to check if any rows exist in the table. 91 | 92 | Returns 93 | ------- 94 | str: The query to check for row existence. 95 | """ 96 | return f"SELECT 1 FROM {self._quoted_table_name()} LIMIT 1" 97 | 98 | def get_upsert_query(self) -> str: 99 | """ 100 | Generates an upsert query. 101 | 102 | Returns 103 | ------- 104 | str: The upsert query. 105 | """ 106 | return ( 107 | f"INSERT INTO {self._quoted_table_name()} (id, metadata, contents, embedding) " 108 | f"VALUES ($1, $2, $3, $4) ON CONFLICT DO NOTHING" 109 | ) 110 | 111 | def get_approx_count_query(self) -> str: 112 | """ 113 | Generate a query to find the approximate count of records in the table. 114 | 115 | Returns 116 | ------- 117 | str: the query. 118 | """ 119 | # todo optimize with approx 120 | return f"SELECT COUNT(*) as cnt FROM {self._quoted_table_name()}" 121 | 122 | def get_create_query(self) -> str: 123 | """ 124 | Generates a query to create the tables, indexes, and extensions needed to store the vector data. 125 | 126 | Returns 127 | ------- 128 | str: The create table query. 129 | """ 130 | hypertable_sql = "" 131 | if self.time_partition_interval is not None: 132 | hypertable_sql = f""" 133 | CREATE EXTENSION IF NOT EXISTS timescaledb; 134 | 135 | CREATE OR REPLACE FUNCTION public.uuid_timestamp(uuid UUID) RETURNS TIMESTAMPTZ AS $$ 136 | DECLARE 137 | bytes bytea; 138 | BEGIN 139 | bytes := uuid_send(uuid); 140 | if (get_byte(bytes, 6) >> 4)::int2 != 1 then 141 | RAISE EXCEPTION 'UUID version is not 1'; 142 | end if; 143 | RETURN to_timestamp( 144 | ( 145 | ( 146 | (get_byte(bytes, 0)::bigint << 24) | 147 | (get_byte(bytes, 1)::bigint << 16) | 148 | (get_byte(bytes, 2)::bigint << 8) | 149 | (get_byte(bytes, 3)::bigint << 0) 150 | ) + ( 151 | ((get_byte(bytes, 4)::bigint << 8 | 152 | get_byte(bytes, 5)::bigint)) << 32 153 | ) + ( 154 | (((get_byte(bytes, 6)::bigint & 15) << 8 | get_byte(bytes, 7)::bigint) & 4095) << 48 155 | ) - 122192928000000000 156 | ) / 10000 / 1000::double precision 157 | ); 158 | END 159 | $$ LANGUAGE plpgsql 160 | IMMUTABLE PARALLEL SAFE 161 | RETURNS NULL ON NULL INPUT; 162 | 163 | SELECT create_hypertable('{self._quoted_table_name()}', 164 | 'id', 165 | if_not_exists=> true, 166 | time_partitioning_func=>'public.uuid_timestamp', 167 | chunk_time_interval => '{str(self.time_partition_interval.total_seconds())} seconds'::interval); 168 | """ 169 | return f""" 170 | CREATE EXTENSION IF NOT EXISTS vector; 171 | CREATE EXTENSION IF NOT EXISTS vectorscale; 172 | 173 | 174 | CREATE TABLE IF NOT EXISTS {self._quoted_table_name()} ( 175 | id {self.id_type} PRIMARY KEY, 176 | metadata JSONB, 177 | contents TEXT, 178 | embedding VECTOR({self.num_dimensions}) 179 | ); 180 | 181 | CREATE INDEX IF NOT EXISTS {self._quote_ident(self.table_name + "_meta_idx")} ON {self._quoted_table_name()} 182 | USING GIN(metadata jsonb_path_ops); 183 | 184 | {hypertable_sql} 185 | """ 186 | 187 | def _get_embedding_index_name_quoted(self) -> str: 188 | return self._quote_ident(self.table_name + "_embedding_idx") 189 | 190 | def _get_schema_qualified_embedding_index_name_quoted(self) -> str: 191 | if self.schema_name is not None: 192 | return 
self._quote_ident(self.schema_name) + "." + self._get_embedding_index_name_quoted() 193 | else: 194 | return self._get_embedding_index_name_quoted() 195 | 196 | def drop_embedding_index_query(self) -> str: 197 | return f"DROP INDEX IF EXISTS {self._get_schema_qualified_embedding_index_name_quoted()};" 198 | 199 | def delete_all_query(self) -> str: 200 | return f"TRUNCATE {self._quoted_table_name()};" 201 | 202 | def delete_by_ids_query(self, ids: list[uuid.UUID] | list[str]) -> tuple[str, list[Any]]: 203 | query = f"DELETE FROM {self._quoted_table_name()} WHERE id = ANY($1::{self.id_type}[]);" 204 | return (query, [ids]) 205 | 206 | def delete_by_metadata_query( 207 | self, filter_conditions: dict[str, str] | list[dict[str, str]] 208 | ) -> tuple[str, list[Any]]: 209 | params: list[Any] = [] 210 | (where, params) = self._where_clause_for_filter(params, filter_conditions) 211 | query = f"DELETE FROM {self._quoted_table_name()} WHERE {where};" 212 | return (query, params) 213 | 214 | def drop_table_query(self) -> str: 215 | return f"DROP TABLE IF EXISTS {self._quoted_table_name()};" 216 | 217 | def default_max_db_connection_query(self) -> str: 218 | """ 219 | Generates a query to get the default max db connections. This uses a heuristic to determine the max connections 220 | based on the max_connections setting in postgres 221 | and the number of currently used connections. This heuristic leaves 4 connections in reserve. 222 | """ 223 | return ( 224 | "SELECT greatest(1, ((SELECT setting::int FROM pg_settings " 225 | "WHERE name='max_connections')-(SELECT count(*) FROM pg_stat_activity) - 4)::int)" 226 | ) 227 | 228 | def create_embedding_index_query(self, index: BaseIndex, num_records_callback: Callable[[], int]) -> str: 229 | """ 230 | Generates an embedding index creation query. 231 | 232 | Parameters 233 | ---------- 234 | index 235 | The index to create. 236 | num_records_callback 237 | A callback function to get the number of records in the table. 238 | 239 | Returns 240 | ------- 241 | str: The index creation query. 
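        In practice this query is generated through the client's create_embedding_index
        method; a rough sketch of the index objects it accepts (parameter values are
        arbitrary examples, and vec is an initialized Sync or Async client):

            vec.create_embedding_index(IvfflatIndex(100))
            vec.create_embedding_index(HNSWIndex(20, 125))
            vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5))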
242 | """ 243 | column_name = "embedding" 244 | index_name_quoted = self._get_embedding_index_name_quoted() 245 | query = index.create_index_query( 246 | self._quoted_table_name(), 247 | self._quote_ident(column_name), 248 | index_name_quoted, 249 | self.distance_type, 250 | num_records_callback, 251 | ) 252 | return query 253 | 254 | def _where_clause_for_filter( 255 | self, params: list[Any], filter: Mapping[str, datetime | str] | list[dict[str, str]] | None 256 | ) -> tuple[str, list[Any]]: 257 | if filter is None: 258 | return "TRUE", params 259 | 260 | if isinstance(filter, dict): 261 | where = f"metadata @> ${len(params)+1}" 262 | json_object = json.dumps(filter) 263 | params = params + [json_object] 264 | elif isinstance(filter, list): 265 | any_params: list[str] = [] 266 | for _idx, filter_dict in enumerate(filter, start=len(params) + 1): 267 | any_params.append(json.dumps(filter_dict)) 268 | where = f"metadata @> ANY(${len(params) + 1}::jsonb[])" 269 | params = params + [any_params] 270 | else: 271 | raise ValueError(f"Unknown filter type: {type(filter)}") 272 | 273 | return where, params 274 | 275 | def search_query( 276 | self, 277 | query_embedding: list[float] | np.ndarray[Any, Any] | None, 278 | limit: int = 10, 279 | filter: Mapping[str, datetime | str] | list[dict[str, str]] | None = None, 280 | predicates: Predicates | None = None, 281 | uuid_time_filter: UUIDTimeRange | None = None, 282 | ) -> tuple[str, list[Any]]: 283 | """ 284 | Generates a similarity query. 285 | 286 | Returns: 287 | Tuple[str, List]: A tuple containing the query and parameters. 288 | """ 289 | params: list[Any] = [] 290 | if query_embedding is not None: 291 | distance = f"embedding {self.distance_type} ${len(params)+1}" 292 | params = params + [query_embedding] 293 | order_by_clause = f"ORDER BY {distance} ASC" 294 | else: 295 | distance = "-1.0" 296 | order_by_clause = "" 297 | 298 | if ( 299 | self.infer_filters 300 | and uuid_time_filter is None 301 | and isinstance(filter, dict) 302 | and ("__start_date" in filter or "__end_date" in filter) 303 | ): 304 | start_date = UUIDTimeRange._parse_datetime(filter.get("__start_date")) 305 | end_date = UUIDTimeRange._parse_datetime(filter.get("__end_date")) 306 | 307 | uuid_time_filter = UUIDTimeRange(start_date, end_date) 308 | 309 | if start_date is not None: 310 | del filter["__start_date"] 311 | if end_date is not None: 312 | del filter["__end_date"] 313 | 314 | where_clauses: list[str] = [] 315 | if filter is not None: 316 | (where_filter, params) = self._where_clause_for_filter(params, filter) 317 | where_clauses.append(where_filter) 318 | 319 | if predicates is not None: 320 | (where_predicates, params) = predicates.build_query(params) 321 | where_clauses.append(where_predicates) 322 | 323 | if uuid_time_filter is not None: 324 | (where_time, params) = uuid_time_filter.build_query(params) 325 | where_clauses.append(where_time) 326 | 327 | where = " AND ".join(where_clauses) if len(where_clauses) > 0 else "TRUE" 328 | 329 | query = f""" 330 | SELECT 331 | id, metadata, contents, embedding, {distance} as distance 332 | FROM 333 | {self._quoted_table_name()} 334 | WHERE 335 | {where} 336 | {order_by_clause} 337 | LIMIT {limit} 338 | """ 339 | return query, params 340 | -------------------------------------------------------------------------------- /tests/async_client_test.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from datetime import datetime, timedelta 3 | 4 | import pytest 5 | 6 | from 
timescale_vector.client import ( 7 | SEARCH_RESULT_METADATA_IDX, 8 | Async, 9 | DiskAnnIndex, 10 | DiskAnnIndexParams, 11 | HNSWIndex, 12 | IvfflatIndex, 13 | Predicates, 14 | UUIDTimeRange, 15 | uuid_from_time, 16 | ) 17 | 18 | 19 | @pytest.mark.asyncio 20 | @pytest.mark.parametrize("schema", ["temp", None]) 21 | async def test_vector(service_url: str, schema: str) -> None: 22 | vec = Async(service_url, "data_table", 2, schema_name=schema) 23 | await vec.drop_table() 24 | await vec.create_tables() 25 | empty = await vec.table_is_empty() 26 | assert empty 27 | await vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 28 | empty = await vec.table_is_empty() 29 | assert not empty 30 | 31 | await vec.upsert( 32 | [ 33 | (uuid.uuid4(), """{"key":"val"}""", "the brown fox", [1.0, 1.3]), 34 | ( 35 | uuid.uuid4(), 36 | """{"key":"val2", "key_10": "10", "key_11": "11.3"}""", 37 | "the brown fox", 38 | [1.0, 1.4], 39 | ), 40 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.5]), 41 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 42 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.6]), 43 | (uuid.uuid4(), """{"key2":"val2"}""", "the brown fox", [1.0, 1.7]), 44 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 45 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.9]), 46 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 100.8]), 47 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 101.8]), 48 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 49 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.8]), 50 | ( 51 | uuid.uuid4(), 52 | """{"key_1":"val_1", "key_2":"val_2"}""", 53 | "the brown fox", 54 | [1.0, 1.8], 55 | ), 56 | (uuid.uuid4(), """{"key0": [1,2,3,4]}""", "the brown fox", [1.0, 1.8]), 57 | ( 58 | uuid.uuid4(), 59 | """{"key0": [8,9,"A"]}""", 60 | "the brown fox", 61 | [1.0, 1.8], 62 | ), # mixed types 63 | ( 64 | uuid.uuid4(), 65 | """{"key0": [5,6,7], "key3": 3}""", 66 | "the brown fox", 67 | [1.0, 1.8], 68 | ), 69 | (uuid.uuid4(), """{"key0": ["B", "C"]}""", "the brown fox", [1.0, 1.8]), 70 | ] 71 | ) 72 | 73 | await vec.create_embedding_index(IvfflatIndex()) 74 | await vec.drop_embedding_index() 75 | await vec.create_embedding_index(IvfflatIndex(100)) 76 | await vec.drop_embedding_index() 77 | await vec.create_embedding_index(HNSWIndex()) 78 | await vec.drop_embedding_index() 79 | await vec.create_embedding_index(HNSWIndex(20, 125)) 80 | await vec.drop_embedding_index() 81 | await vec.create_embedding_index(DiskAnnIndex()) 82 | await vec.drop_embedding_index() 83 | await vec.create_embedding_index(DiskAnnIndex(50, 50, 1.5, "memory_optimized", 2, 1)) 84 | 85 | rec = await vec.search([1.0, 2.0]) 86 | assert len(rec) == 10 87 | rec = await vec.search([1.0, 2.0], limit=4) 88 | assert len(rec) == 4 89 | rec = await vec.search(limit=4) 90 | assert len(rec) == 4 91 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key2": "val2"}) 92 | assert len(rec) == 1 93 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key2": "does not exist"}) 94 | assert len(rec) == 0 95 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1"}) 96 | assert len(rec) == 1 97 | rec = await vec.search([1.0, 2.0], filter={"key_1": "val_1", "key_2": "val_2"}) 98 | assert len(rec) == 1 99 | rec = await vec.search([1.0, 2.0], limit=4, filter={"key_1": "val_1", "key_2": "val_3"}) 100 | assert len(rec) == 0 101 | rec = await vec.search(limit=4, 
filter={"key_1": "val_1", "key_2": "val_3"}) 102 | assert len(rec) == 0 103 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 104 | assert len(rec) == 2 105 | rec = await vec.search(limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 106 | assert len(rec) == 2 107 | 108 | rec = await vec.search( 109 | [1.0, 2.0], 110 | limit=4, 111 | filter=[ 112 | {"key_1": "val_1"}, 113 | {"key2": "val2"}, 114 | {"no such key": "no such val"}, 115 | ], 116 | ) 117 | assert len(rec) == 2 118 | 119 | assert isinstance(rec[0][SEARCH_RESULT_METADATA_IDX], dict) 120 | assert isinstance(rec[0]["metadata"], dict) 121 | assert rec[0]["contents"] == "the brown fox" 122 | 123 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(("key", "val2"))) 124 | assert len(rec) == 1 125 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(("key", "==", "val2"))) 126 | assert len(rec) == 1 127 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key", "==", "val2")) 128 | assert len(rec) == 1 129 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<", 100)) 130 | assert len(rec) == 1 131 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<", 10)) 132 | assert len(rec) == 0 133 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<=", 10)) 134 | assert len(rec) == 1 135 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_10", "<=", 10.0)) 136 | assert len(rec) == 1 137 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_11", "<=", 11.3)) 138 | assert len(rec) == 1 139 | rec = await vec.search(limit=4, predicates=Predicates("key_11", ">=", 11.29999)) 140 | assert len(rec) == 1 141 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key_11", "<", 11.299999)) 142 | assert len(rec) == 0 143 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [1, 2])) 144 | assert len(rec) == 1 145 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [3, 7])) 146 | assert len(rec) == 0 147 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [42])) 148 | assert len(rec) == 0 149 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [4])) 150 | assert len(rec) == 1 151 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", [9, "A"])) 152 | assert len(rec) == 1 153 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", ["A"])) 154 | assert len(rec) == 1 155 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates("key0", "@>", ("C", "B"))) 156 | assert len(rec) == 1 157 | 158 | rec = await vec.search( 159 | [1.0, 2.0], 160 | limit=4, 161 | predicates=Predicates(*[("key", "val2"), ("key_10", "<", 100)]), 162 | ) 163 | assert len(rec) == 1 164 | rec = await vec.search( 165 | [1.0, 2.0], 166 | limit=4, 167 | predicates=Predicates(("key", "val2"), ("key_10", "<", 100), operator="AND"), 168 | ) 169 | assert len(rec) == 1 170 | rec = await vec.search( 171 | [1.0, 2.0], 172 | limit=4, 173 | predicates=Predicates(("key", "val2"), ("key_2", "val_2"), operator="OR"), 174 | ) 175 | assert len(rec) == 2 176 | rec = await vec.search( 177 | [1.0, 2.0], 178 | limit=4, 179 | predicates=Predicates("key_10", "<", 100) 180 | & ( 181 | Predicates( 182 | "key", 183 | "==", 184 | "val2", 185 | ) 186 | | Predicates("key_2", "==", "val_2") 187 | ), 188 | ) 189 | 
assert len(rec) == 1 190 | rec = await vec.search( 191 | [1.0, 2.0], 192 | limit=4, 193 | predicates=Predicates("key_10", "<", 100) 194 | and (Predicates("key", "==", "val2") or Predicates("key_2", "==", "val_2")), 195 | ) 196 | assert len(rec) == 1 197 | rec = await vec.search( 198 | [1.0, 2.0], 199 | limit=4, 200 | predicates=Predicates("key0", "@>", [6, 7]) and Predicates("key3", "==", 3), 201 | ) 202 | assert len(rec) == 1 203 | rec = await vec.search( 204 | [1.0, 2.0], 205 | limit=4, 206 | predicates=Predicates("key0", "@>", [6, 7]) and Predicates("key3", "==", 6), 207 | ) 208 | assert len(rec) == 0 209 | rec = await vec.search(limit=4, predicates=~Predicates(("key", "val2"), ("key_10", "<", 100))) 210 | assert len(rec) == 4 211 | 212 | raised = False 213 | try: 214 | # can't upsert using both keys and dictionaries 215 | await vec.upsert( 216 | [ 217 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 218 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 219 | ] 220 | ) 221 | except ValueError: 222 | raised = True 223 | assert raised 224 | 225 | raised = False 226 | try: 227 | # can't upsert using both keys and dictionaries opposite order 228 | await vec.upsert( 229 | [ 230 | (uuid.uuid4(), """{"key2":"val"}""", "the brown fox", [1.0, 1.2]), 231 | (uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 232 | ] 233 | ) 234 | except BaseException: 235 | raised = True 236 | assert raised 237 | 238 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 239 | assert len(rec) == 2 240 | await vec.delete_by_ids([rec[0]["id"]]) 241 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 242 | assert len(rec) == 1 243 | await vec.delete_by_metadata([{"key_1": "val_1"}, {"key2": "val2"}]) 244 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key_1": "val_1"}, {"key2": "val2"}]) 245 | assert len(rec) == 0 246 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 247 | assert len(rec) == 4 248 | await vec.delete_by_metadata([{"key2": "val"}]) 249 | rec = await vec.search([1.0, 2.0], limit=4, filter=[{"key2": "val"}]) 250 | assert len(rec) == 0 251 | 252 | assert not await vec.table_is_empty() 253 | await vec.delete_all() 254 | assert await vec.table_is_empty() 255 | 256 | await vec.drop_table() 257 | await vec.close() 258 | 259 | vec = Async(service_url, "data_table", 2, id_type="TEXT") 260 | await vec.create_tables() 261 | empty = await vec.table_is_empty() 262 | assert empty 263 | await vec.upsert([("Not a valid UUID", {"key": "val"}, "the brown fox", [1.0, 1.2])]) 264 | empty = await vec.table_is_empty() 265 | assert not empty 266 | await vec.delete_by_ids(["Not a valid UUID"]) 267 | empty = await vec.table_is_empty() 268 | assert empty 269 | await vec.drop_table() 270 | await vec.close() 271 | 272 | vec = Async(service_url, "data_table", 2, time_partition_interval=timedelta(seconds=60)) 273 | await vec.create_tables() 274 | empty = await vec.table_is_empty() 275 | assert empty 276 | id = uuid.uuid1() 277 | await vec.upsert([(id, {"key": "val"}, "the brown fox", [1.0, 1.2])]) 278 | empty = await vec.table_is_empty() 279 | assert not empty 280 | await vec.delete_by_ids([id]) 281 | empty = await vec.table_is_empty() 282 | assert empty 283 | 284 | raised = False 285 | try: 286 | # can't upsert with uuid type 4 in time partitioned table 287 | await vec.upsert([(uuid.uuid4(), {"key": "val"}, "the brown fox", [1.0, 1.2])]) 288 | except BaseException: 289 | raised = 
True 290 | assert raised 291 | 292 | specific_datetime = datetime(2018, 8, 10, 15, 30, 0) 293 | await vec.upsert( 294 | [ 295 | # current time 296 | (uuid.uuid1(), {"key": "val"}, "the brown fox", [1.0, 1.2]), 297 | # time in 2018 298 | ( 299 | uuid_from_time(specific_datetime), 300 | {"key": "val"}, 301 | "the brown fox", 302 | [1.0, 1.2], 303 | ), 304 | ] 305 | ) 306 | assert not await vec.table_is_empty() 307 | 308 | # check all the possible ways to specify a date range 309 | async def search_date(start_date: datetime | str | None, end_date: datetime | str | None, expected: int) -> None: 310 | # using uuid_time_filter 311 | rec = await vec.search( 312 | [1.0, 2.0], 313 | limit=4, 314 | uuid_time_filter=UUIDTimeRange(start_date, end_date), 315 | ) 316 | assert len(rec) == expected 317 | rec = await vec.search( 318 | [1.0, 2.0], 319 | limit=4, 320 | uuid_time_filter=UUIDTimeRange(str(start_date), str(end_date)), 321 | ) 322 | assert len(rec) == expected 323 | 324 | # using filters 325 | filter: dict[str, str | datetime] = {} 326 | if start_date is not None: 327 | filter["__start_date"] = start_date 328 | if end_date is not None: 329 | filter["__end_date"] = end_date 330 | rec = await vec.search([1.0, 2.0], limit=4, filter=filter) 331 | assert len(rec) == expected 332 | # using filters with string dates 333 | filter = {} 334 | if start_date is not None: 335 | filter["__start_date"] = str(start_date) 336 | if end_date is not None: 337 | filter["__end_date"] = str(end_date) 338 | rec = await vec.search([1.0, 2.0], limit=4, filter=filter) 339 | assert len(rec) == expected 340 | # using predicates 341 | predicates: list[tuple[str, str, str | datetime]] = [] 342 | if start_date is not None: 343 | predicates.append(("__uuid_timestamp", ">=", start_date)) 344 | if end_date is not None: 345 | predicates.append(("__uuid_timestamp", "<", end_date)) 346 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 347 | assert len(rec) == expected 348 | # using predicates with string dates 349 | predicates = [] 350 | if start_date is not None: 351 | predicates.append(("__uuid_timestamp", ">=", str(start_date))) 352 | if end_date is not None: 353 | predicates.append(("__uuid_timestamp", "<", str(end_date))) 354 | rec = await vec.search([1.0, 2.0], limit=4, predicates=Predicates(*predicates)) 355 | assert len(rec) == expected 356 | 357 | await search_date( 358 | specific_datetime - timedelta(days=7), 359 | specific_datetime + timedelta(days=7), 360 | 1, 361 | ) 362 | await search_date(specific_datetime - timedelta(days=7), None, 2) 363 | await search_date(None, specific_datetime + timedelta(days=7), 1) 364 | await search_date( 365 | specific_datetime - timedelta(days=7), 366 | specific_datetime - timedelta(days=2), 367 | 0, 368 | ) 369 | 370 | # check timedelta handling 371 | rec = await vec.search( 372 | [1.0, 2.0], 373 | limit=4, 374 | uuid_time_filter=UUIDTimeRange(start_date=specific_datetime, time_delta=timedelta(days=7)), 375 | ) 376 | assert len(rec) == 1 377 | # end is exclusive 378 | rec = await vec.search( 379 | [1.0, 2.0], 380 | limit=4, 381 | uuid_time_filter=UUIDTimeRange(end_date=specific_datetime, time_delta=timedelta(days=7)), 382 | ) 383 | assert len(rec) == 0 384 | rec = await vec.search( 385 | [1.0, 2.0], 386 | limit=4, 387 | uuid_time_filter=UUIDTimeRange( 388 | end_date=specific_datetime + timedelta(seconds=1), 389 | time_delta=timedelta(days=7), 390 | ), 391 | ) 392 | assert len(rec) == 1 393 | rec = await vec.search([1.0, 2.0], limit=4, 
query_params=DiskAnnIndexParams(10, 5)) 394 | assert len(rec) == 2 395 | rec = await vec.search([1.0, 2.0], limit=4, query_params=DiskAnnIndexParams(100)) 396 | assert len(rec) == 2 397 | await vec.drop_table() 398 | await vec.close() 399 | -------------------------------------------------------------------------------- /nbs/01_pgvectorizer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "# PgVectorizer" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "#| default_exp pgvectorizer" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "#| export\n", 27 | "import psycopg2.pool\n", 28 | "from contextlib import contextmanager\n", 29 | "import psycopg2.extras\n", 30 | "import pgvector.psycopg2\n", 31 | "import numpy as np\n", 32 | "import re\n", 33 | "\n", 34 | "from timescale_vector import client" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "#| export\n", 44 | "def _create_ident(base: str, suffix: str):\n", 45 | " if len(base) + len(suffix) > 62:\n", 46 | " base = base[:62 - len(suffix)]\n", 47 | " return re.sub(r'[^a-zA-Z0-9_]', '_', f\"{base}_{suffix}\")\n", 48 | "\n", 49 | "class Vectorize:\n", 50 | " def __init__(self,\n", 51 | " service_url: str, \n", 52 | " table_name: str,\n", 53 | " schema_name: str='public',\n", 54 | " id_column_name: str='id', \n", 55 | " work_queue_table_name: str=None, \n", 56 | " trigger_name: str='track_changes_for_embedding', \n", 57 | " trigger_name_fn: str=None) -> None:\n", 58 | " self.service_url = service_url\n", 59 | " self.table_name_unquoted = table_name\n", 60 | " self.schema_name_unquoted = schema_name\n", 61 | " self.table_name = client.QueryBuilder._quote_ident(table_name)\n", 62 | " self.schema_name = client.QueryBuilder._quote_ident(schema_name)\n", 63 | " self.id_column_name = client.QueryBuilder._quote_ident(id_column_name)\n", 64 | " if work_queue_table_name is None:\n", 65 | " work_queue_table_name = _create_ident(table_name, 'embedding_work_queue')\n", 66 | " self.work_queue_table_name = client.QueryBuilder._quote_ident(work_queue_table_name)\n", 67 | " \n", 68 | " self.trigger_name = client.QueryBuilder._quote_ident(trigger_name)\n", 69 | "\n", 70 | " if trigger_name_fn is None:\n", 71 | " trigger_name_fn = _create_ident(table_name, 'wq_for_embedding')\n", 72 | " self.trigger_name_fn = client.QueryBuilder._quote_ident(trigger_name_fn) \n", 73 | "\n", 74 | "\n", 75 | " def register(self): \n", 76 | " with psycopg2.connect(self.service_url) as conn:\n", 77 | " with conn.cursor() as cursor:\n", 78 | " cursor.execute(f\"\"\"\n", 79 | " SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}') is not null; \n", 80 | " \"\"\")\n", 81 | " table_exists = cursor.fetchone()[0]\n", 82 | " if table_exists:\n", 83 | " return\n", 84 | " \n", 85 | " cursor.execute(f\"\"\"\n", 86 | " CREATE TABLE {self.schema_name}.{self.work_queue_table_name} (\n", 87 | " id int\n", 88 | " );\n", 89 | "\n", 90 | " CREATE INDEX ON {self.schema_name}.{self.work_queue_table_name}(id);\n", 91 | "\n", 92 | " CREATE OR REPLACE FUNCTION {self.schema_name}.{self.trigger_name_fn}() RETURNS TRIGGER LANGUAGE PLPGSQL AS $$ \n", 93 | " BEGIN \n", 
94 | " IF (TG_OP = 'DELETE') THEN\n", 95 | " INSERT INTO {self.work_queue_table_name} \n", 96 | " VALUES (OLD.{self.id_column_name});\n", 97 | " ELSE\n", 98 | " INSERT INTO {self.work_queue_table_name} \n", 99 | " VALUES (NEW.{self.id_column_name});\n", 100 | " END IF;\n", 101 | " RETURN NULL;\n", 102 | " END; \n", 103 | " $$;\n", 104 | "\n", 105 | " CREATE TRIGGER {self.trigger_name} \n", 106 | " AFTER INSERT OR UPDATE OR DELETE\n", 107 | " ON {self.schema_name}.{self.table_name} \n", 108 | " FOR EACH ROW EXECUTE PROCEDURE {self.schema_name}.{self.trigger_name_fn}();\n", 109 | "\n", 110 | " INSERT INTO {self.schema_name}.{self.work_queue_table_name} SELECT {self.id_column_name} FROM {self.schema_name}.{self.table_name};\n", 111 | " \"\"\")\n", 112 | "\n", 113 | " def process(self, embed_and_write_cb, batch_size:int=10, autoregister=True):\n", 114 | " if autoregister:\n", 115 | " self.register()\n", 116 | " \n", 117 | " with psycopg2.connect(self.service_url) as conn:\n", 118 | " with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:\n", 119 | " cursor.execute(f\"\"\"\n", 120 | " SELECT to_regclass('{self.schema_name}.{self.work_queue_table_name}')::oid; \n", 121 | " \"\"\")\n", 122 | " table_oid = cursor.fetchone()[0]\n", 123 | " \n", 124 | " cursor.execute(f\"\"\"\n", 125 | " WITH selected_rows AS (\n", 126 | " SELECT id\n", 127 | " FROM {self.schema_name}.{self.work_queue_table_name}\n", 128 | " LIMIT {int(batch_size)}\n", 129 | " FOR UPDATE SKIP LOCKED\n", 130 | " ), \n", 131 | " locked_items AS (\n", 132 | " SELECT id, pg_try_advisory_xact_lock({int(table_oid)}, id) AS locked\n", 133 | " FROM (SELECT DISTINCT id FROM selected_rows ORDER BY id) as ids\n", 134 | " ),\n", 135 | " deleted_rows AS (\n", 136 | " DELETE FROM {self.schema_name}.{self.work_queue_table_name}\n", 137 | " WHERE id IN (SELECT id FROM locked_items WHERE locked = true ORDER BY id)\n", 138 | " )\n", 139 | " SELECT locked_items.id as locked_id, {self.table_name}.*\n", 140 | " FROM locked_items\n", 141 | " LEFT JOIN {self.schema_name}.{self.table_name} ON {self.table_name}.{self.id_column_name} = locked_items.id\n", 142 | " WHERE locked = true\n", 143 | " ORDER BY locked_items.id\n", 144 | " \"\"\")\n", 145 | " res = cursor.fetchall()\n", 146 | " if len(res) > 0:\n", 147 | " embed_and_write_cb(res, self)\n", 148 | " return len(res)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "#| hide\n", 158 | "from dotenv import load_dotenv, find_dotenv\n", 159 | "import os" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "_ = load_dotenv(find_dotenv(), override=True)\n", 169 | "service_url = os.environ['TIMESCALE_SERVICE_URL']" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "\n", 179 | "#| hide\n", 180 | "with psycopg2.connect(service_url) as conn:\n", 181 | " with conn.cursor() as cursor:\n", 182 | " for item in ['blog', 'blog_embedding_work_queue', 'blog_embedding']:\n", 183 | " cursor.execute(f\"DROP TABLE IF EXISTS {item};\")\n", 184 | " \n", 185 | " for item in ['public','test']:\n", 186 | " cursor.execute(f\"DROP SCHEMA IF EXISTS {item} CASCADE;\")\n", 187 | " cursor.execute(f\"CREATE SCHEMA {item};\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 
194 | "outputs": [], 195 | "source": [ 196 | "with psycopg2.connect(service_url) as conn:\n", 197 | " with conn.cursor() as cursor:\n", 198 | " cursor.execute('''\n", 199 | " CREATE TABLE IF NOT EXISTS blog (\n", 200 | " id SERIAL PRIMARY KEY NOT NULL,\n", 201 | " title TEXT NOT NULL,\n", 202 | " author TEXT NOT NULL,\n", 203 | " contents TEXT NOT NULL,\n", 204 | " category TEXT NOT NULL,\n", 205 | " published_time TIMESTAMPTZ NULL --NULL if not yet published\n", 206 | " );\n", 207 | " ''')\n", 208 | " cursor.execute('''\n", 209 | " insert into blog (title, author, contents, category, published_time) VALUES ('first', 'mat', 'first_post', 'personal', '2021-01-01');\n", 210 | " ''')\n", 211 | "\n", 212 | "\n", 213 | "vectorizer = Vectorize(service_url, 'blog')\n", 214 | "vectorizer.register()\n", 215 | "# should be idempotent\n", 216 | "vectorizer.register()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "from langchain.docstore.document import Document\n", 226 | "from langchain.text_splitter import CharacterTextSplitter\n", 227 | "from timescale_vector import client\n", 228 | "from langchain_openai import OpenAIEmbeddings\n", 229 | "from langchain_community.vectorstores.timescalevector import TimescaleVector\n", 230 | "from datetime import timedelta" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "def get_document(blog):\n", 240 | " text_splitter = CharacterTextSplitter(\n", 241 | " chunk_size=1000,\n", 242 | " chunk_overlap=200,\n", 243 | " )\n", 244 | " docs = []\n", 245 | " for chunk in text_splitter.split_text(blog['contents']):\n", 246 | " content = f\"Author {blog['author']}, title: {blog['title']}, contents:{chunk}\"\n", 247 | " metadata = {\n", 248 | " \"id\": str(client.uuid_from_time(blog['published_time'])),\n", 249 | " \"blog_id\": blog['id'], \n", 250 | " \"author\": blog['author'], \n", 251 | " \"category\": blog['category'],\n", 252 | " \"published_time\": blog['published_time'].isoformat(),\n", 253 | " }\n", 254 | " docs.append(Document(page_content=content, metadata=metadata))\n", 255 | " return docs\n", 256 | "\n", 257 | "def embed_and_write(blog_instances, vectorizer):\n", 258 | " TABLE_NAME = vectorizer.table_name_unquoted +\"_embedding\"\n", 259 | " embedding = OpenAIEmbeddings()\n", 260 | " vector_store = TimescaleVector(\n", 261 | " collection_name=TABLE_NAME,\n", 262 | " service_url=service_url,\n", 263 | " embedding=embedding,\n", 264 | " time_partition_interval=timedelta(days=30),\n", 265 | " )\n", 266 | "\n", 267 | " # delete old embeddings for all ids in the work queue\n", 268 | " metadata_for_delete = [{\"blog_id\": blog['locked_id']} for blog in blog_instances]\n", 269 | " vector_store.delete_by_metadata(metadata_for_delete)\n", 270 | "\n", 271 | " documents = []\n", 272 | " for blog in blog_instances:\n", 273 | " # skip blogs that are not published yet, or are deleted (will be None because of left join)\n", 274 | " if blog['published_time'] != None:\n", 275 | " documents.extend(get_document(blog))\n", 276 | "\n", 277 | " if len(documents) == 0:\n", 278 | " return\n", 279 | "\n", 280 | " texts = [d.page_content for d in documents]\n", 281 | " metadatas = [d.metadata for d in documents]\n", 282 | " ids = [d.metadata[\"id\"] for d in documents]\n", 283 | " vector_store.add_texts(texts, metadatas, ids)\n", 284 | "\n", 285 | "vectorizer = Vectorize(service_url, 
'blog')\n", 286 | "assert vectorizer.process(embed_and_write) == 1\n", 287 | "assert vectorizer.process(embed_and_write) == 0\n", 288 | "\n", 289 | "TABLE_NAME = \"blog_embedding\"\n", 290 | "embedding = OpenAIEmbeddings()\n", 291 | "vector_store = TimescaleVector(\n", 292 | " collection_name=TABLE_NAME,\n", 293 | " service_url=service_url,\n", 294 | " embedding=embedding,\n", 295 | " time_partition_interval=timedelta(days=30),\n", 296 | ")\n", 297 | "\n", 298 | "res = vector_store.similarity_search_with_score(\"first\", 10)\n", 299 | "assert len(res) == 1\n", 300 | "\n", 301 | "\n", 302 | "with psycopg2.connect(service_url) as conn:\n", 303 | " with conn.cursor() as cursor:\n", 304 | " cursor.execute('''\n", 305 | " insert into blog (title, author, contents, category, published_time) VALUES ('2', 'mat', 'second_post', 'personal', '2021-01-01');\n", 306 | " insert into blog (title, author, contents, category, published_time) VALUES ('3', 'mat', 'third_post', 'personal', '2021-01-01');\n", 307 | " ''')\n", 308 | "assert vectorizer.process(embed_and_write) == 2\n", 309 | "assert vectorizer.process(embed_and_write) == 0\n", 310 | "\n", 311 | "res = vector_store.similarity_search_with_score(\"first\", 10)\n", 312 | "assert len(res) == 3\n", 313 | "\n", 314 | "with psycopg2.connect(service_url) as conn:\n", 315 | " with conn.cursor() as cursor:\n", 316 | " cursor.execute('''\n", 317 | " DELETE FROM blog WHERE title = '3';\n", 318 | " ''')\n", 319 | "assert vectorizer.process(embed_and_write) == 1\n", 320 | "assert vectorizer.process(embed_and_write) == 0\n", 321 | "res = vector_store.similarity_search_with_score(\"first\", 10)\n", 322 | "assert len(res) == 2\n", 323 | "\n", 324 | "res = vector_store.similarity_search_with_score(\"second\", 10)\n", 325 | "assert len(res) == 2\n", 326 | "content = res[0][0].page_content\n", 327 | "assert \"new version\" not in content\n", 328 | "with psycopg2.connect(service_url) as conn:\n", 329 | " with conn.cursor() as cursor:\n", 330 | " cursor.execute('''\n", 331 | " update blog set contents = 'second post new version' WHERE title = '2';\n", 332 | " ''')\n", 333 | "assert vectorizer.process(embed_and_write) == 1\n", 334 | "assert vectorizer.process(embed_and_write) == 0\n", 335 | "res = vector_store.similarity_search_with_score(\"second\", 10)\n", 336 | "assert len(res) == 2\n", 337 | "content = res[0][0].page_content\n", 338 | "assert \"new version\" in content\n", 339 | "\n", 340 | "\n", 341 | "with psycopg2.connect(service_url) as conn:\n", 342 | " with conn.cursor() as cursor:\n", 343 | " cursor.execute('''\n", 344 | " CREATE TABLE IF NOT EXISTS test.blog_table_name_that_is_really_really_long_and_i_mean_long (\n", 345 | " id SERIAL PRIMARY KEY NOT NULL,\n", 346 | " title TEXT NOT NULL,\n", 347 | " author TEXT NOT NULL,\n", 348 | " contents TEXT NOT NULL,\n", 349 | " category TEXT NOT NULL,\n", 350 | " published_time TIMESTAMPTZ NULL --NULL if not yet published\n", 351 | " );\n", 352 | " ''')\n", 353 | " cursor.execute('''\n", 354 | " insert into test.blog_table_name_that_is_really_really_long_and_i_mean_long (title, author, contents, category, published_time) VALUES ('first', 'mat', 'first_post', 'personal', '2021-01-01');\n", 355 | " ''')\n", 356 | "\n", 357 | "vectorizer = Vectorize(service_url, 'blog_table_name_that_is_really_really_long_and_i_mean_long', schema_name='test')\n", 358 | "assert vectorizer.process(embed_and_write) == 1\n", 359 | "assert vectorizer.process(embed_and_write) == 0" 360 | ] 361 | } 362 | ], 363 | "metadata": { 364 | 
"kernelspec": { 365 | "display_name": "python3", 366 | "language": "python", 367 | "name": "python3" 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 4 372 | } 373 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Timescale Vector 2 | 3 | 4 | 5 | PostgreSQL++ for AI Applications. 6 | 7 | - [Signup for Timescale 8 | Vector](https://console.cloud.timescale.com/signup?utm_campaign=vectorlaunch&utm_source=github&utm_medium=direct): 9 | Get 90 days free to try Timescale Vector on the Timescale cloud data 10 | platform. There is no self-managed version at this time. 11 | - [Documentation](https://timescale.github.io/python-vector/): Learn the 12 | key features of Timescale Vector and how to use them. 13 | - [Getting Started 14 | Tutorial](https://timescale.github.io/python-vector/tsv_python_getting_started_tutorial.html): 15 | Learn how to use Timescale Vector for semantic search on a real-world 16 | dataset. 17 | - [Learn 18 | more](https://www.timescale.com/blog/how-we-made-postgresql-the-best-vector-database/?utm_campaign=vectorlaunch&utm_source=github&utm_medium=direct): 19 | Learn more about Timescale Vector, how it works and why we built it. 20 | 21 | If you prefer to use an LLM development or data framework, see Timescale 22 | Vector’s integrations with 23 | [LangChain](https://python.langchain.com/docs/integrations/vectorstores/timescalevector) 24 | and 25 | [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/vector_stores/Timescalevector.html) 26 | 27 | ## Install 28 | 29 | To install the main library use: 30 | 31 | ``` sh 32 | pip install timescale_vector 33 | ``` 34 | 35 | We also use `dotenv` in our examples for passing around secrets and 36 | keys. You can install that with: 37 | 38 | ``` sh 39 | pip install python-dotenv 40 | ``` 41 | 42 | If you run into installation errors related to the psycopg2 package, you 43 | will need to install some prerequisites. The timescale-vector package 44 | explicitly depends on psycopg2 (the non-binary version). This adheres to 45 | [the advice provided by 46 | psycopg2](https://www.psycopg.org/docs/install.html#psycopg-vs-psycopg-binary). 47 | Building psycopg from source [requires a few prerequisites to be 48 | installed](https://www.psycopg.org/docs/install.html#build-prerequisites). 49 | Make sure these are installed before trying to 50 | `pip install timescale_vector`. 51 | 52 | ## Basic usage 53 | 54 | First, import all the necessary libraries: 55 | 56 | ``` python 57 | from dotenv import load_dotenv, find_dotenv 58 | import os 59 | from timescale_vector import client 60 | import uuid 61 | from datetime import datetime, timedelta 62 | ``` 63 | 64 | Load up your PostgreSQL credentials. Safest way is with a .env file: 65 | 66 | ``` python 67 | _ = load_dotenv(find_dotenv(), override=True) 68 | service_url = os.environ['TIMESCALE_SERVICE_URL'] 69 | ``` 70 | 71 | Next, create the client. In this tutorial, we will use the sync client. 72 | But we have an async client as well (with an identical interface that 73 | uses async functions). 74 | 75 | The client constructor takes three required arguments: 76 | 77 | | name | description | 78 | |----------------|-------------------------------------------------------------------------------------------| 79 | | service_url | Timescale service URL / connection string | 80 | | table_name | Name of the table to use for storing the embeddings. 
Think of this as the collection name | 81 | | num_dimensions | Number of dimensions in the vector | 82 | 83 | You can also specify the schema name, distance type, primary key type, 84 | etc. as optional parameters. Please see the documentation for details. 85 | 86 | ``` python 87 | vec = client.Sync(service_url, "my_data", 2) 88 | ``` 89 | 90 | Next, create the tables for the collection: 91 | 92 | ``` python 93 | vec.create_tables() 94 | ``` 95 | 96 | Next, insert some data. The data record contains: 97 | 98 | - A UUID to uniquely identify the embedding 99 | - A JSON blob of metadata about the embedding 100 | - The text the embedding represents 101 | - The embedding itself 102 | 103 | Because this data includes UUIDs which become primary keys, we ingest 104 | with upserts. 105 | 106 | ``` python 107 | vec.upsert([\ 108 | (uuid.uuid1(), {"animal": "fox"}, "the brown fox", [1.0,1.3]),\ 109 | (uuid.uuid1(), {"animal": "fox", "action":"jump"}, "jumped over the", [1.0,10.8]),\ 110 | ]) 111 | ``` 112 | 113 | You can now create a vector index to speed up similarity search: 114 | 115 | ``` python 116 | vec.create_embedding_index(client.DiskAnnIndex()) 117 | ``` 118 | 119 | Now, you can query for similar items: 120 | 121 | ``` python 122 | vec.search([1.0, 9.0]) 123 | ``` 124 | 125 | [[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), 126 | {'action': 'jump', 'animal': 'fox'}, 127 | 'jumped over the', 128 | array([ 1. , 10.8], dtype=float32), 129 | 0.00016793422934946456], 130 | [UUID('4494c12c-4a0d-11ef-94a3-6ee10b77fd09'), 131 | {'animal': 'fox'}, 132 | 'the brown fox', 133 | array([1. , 1.3], dtype=float32), 134 | 0.14489260377438218]] 135 | 136 | There are many search options which we will cover below in the 137 | `Advanced search` section. 138 | 139 | As one example, we will return one item using a similarity search 140 | constrained by a metadata filter. 141 | 142 | ``` python 143 | vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) 144 | ``` 145 | 146 | [[UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), 147 | {'action': 'jump', 'animal': 'fox'}, 148 | 'jumped over the', 149 | array([ 1. , 10.8], dtype=float32), 150 | 0.00016793422934946456]] 151 | 152 | The returned records contain 5 fields: 153 | 154 | | name | description | 155 | |-----------|---------------------------------------------------------| 156 | | id | The UUID of the record | 157 | | metadata | The JSON metadata associated with the record | 158 | | contents | the text content that was embedded | 159 | | embedding | The vector embedding | 160 | | distance | The distance between the query embedding and the vector | 161 | 162 | You can access the fields by simply using the record as a dictionary 163 | keyed on the field name: 164 | 165 | ``` python 166 | records = vec.search([1.0, 9.0], limit=1, filter={"action": "jump"}) 167 | (records[0]["id"],records[0]["metadata"], records[0]["contents"], records[0]["embedding"], records[0]["distance"]) 168 | ``` 169 | 170 | (UUID('4494c186-4a0d-11ef-94a3-6ee10b77fd09'), 171 | {'action': 'jump', 'animal': 'fox'}, 172 | 'jumped over the', 173 | array([ 1. 
, 10.8], dtype=float32), 174 | 0.00016793422934946456) 175 | 176 | You can delete by ID: 177 | 178 | ``` python 179 | vec.delete_by_ids([records[0]["id"]]) 180 | ``` 181 | 182 | Or you can delete by metadata filters: 183 | 184 | ``` python 185 | vec.delete_by_metadata({"action": "jump"}) 186 | ``` 187 | 188 | To delete all records use: 189 | 190 | ``` python 191 | vec.delete_all() 192 | ``` 193 | 194 | ## Advanced usage 195 | 196 | In this section, we will go into more detail about our feature. We will 197 | cover: 198 | 199 | 1. Search filter options - how to narrow your search by additional 200 | constraints 201 | 2. Indexing - how to speed up your similarity queries 202 | 3. Time-based partitioning - how to optimize similarity queries that 203 | filter on time 204 | 4. Setting different distance types to use in distance calculations 205 | 206 | ### Search options 207 | 208 | The `search` function is very versatile and allows you to search for the 209 | right vector in a wide variety of ways. We’ll describe the search option 210 | in 3 parts: 211 | 212 | 1. We’ll cover basic similarity search. 213 | 2. Then, we’ll describe how to filter your search based on the 214 | associated metadata. 215 | 3. Finally, we’ll talk about filtering on time when time-partitioning 216 | is enabled. 217 | 218 | Let’s use the following data for our example: 219 | 220 | ``` python 221 | vec.upsert([\ 222 | (uuid.uuid1(), {"animal":"fox", "action": "sit", "times":1}, "the brown fox", [1.0,1.3]),\ 223 | (uuid.uuid1(), {"animal":"fox", "action": "jump", "times":100}, "jumped over the", [1.0,10.8]),\ 224 | ]) 225 | ``` 226 | 227 | The basic query looks like: 228 | 229 | ``` python 230 | vec.search([1.0, 9.0]) 231 | ``` 232 | 233 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 234 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 235 | 'jumped over the', 236 | array([ 1. , 10.8], dtype=float32), 237 | 0.00016793422934946456], 238 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 239 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 240 | 'the brown fox', 241 | array([1. , 1.3], dtype=float32), 242 | 0.14489260377438218]] 243 | 244 | You could provide a limit for the number of items returned: 245 | 246 | ``` python 247 | vec.search([1.0, 9.0], limit=1) 248 | ``` 249 | 250 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 251 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 252 | 'jumped over the', 253 | array([ 1. , 10.8], dtype=float32), 254 | 0.00016793422934946456]] 255 | 256 | #### Narrowing your search by metadata 257 | 258 | We have two main ways to filter results by metadata: - `filters` for 259 | equality matches on metadata. - `predicates` for complex conditions on 260 | metadata. 261 | 262 | Filters are more likely to be performant but are more limited in what 263 | they can express, so we suggest using those if your use case allows it. 264 | 265 | ##### Filters 266 | 267 | You could specify a match on the metadata as a dictionary where all keys 268 | have to match the provided values (keys not in the filter are 269 | unconstrained): 270 | 271 | ``` python 272 | vec.search([1.0, 9.0], limit=1, filter={"action": "sit"}) 273 | ``` 274 | 275 | [[UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 276 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 277 | 'the brown fox', 278 | array([1. 
, 1.3], dtype=float32), 279 | 0.14489260377438218]] 280 | 281 | You can also specify a list of filter dictionaries, where an item is 282 | returned if it matches any dict: 283 | 284 | ``` python 285 | vec.search([1.0, 9.0], limit=2, filter=[{"action": "jump"}, {"animal": "fox"}]) 286 | ``` 287 | 288 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 289 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 290 | 'jumped over the', 291 | array([ 1. , 10.8], dtype=float32), 292 | 0.00016793422934946456], 293 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 294 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 295 | 'the brown fox', 296 | array([1. , 1.3], dtype=float32), 297 | 0.14489260377438218]] 298 | 299 | ##### Predicates 300 | 301 | Predicates allow for more complex search conditions. For example, you 302 | could use greater than and less than conditions on numeric values. 303 | 304 | ``` python 305 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("times", ">", 1)) 306 | ``` 307 | 308 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 309 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 310 | 'jumped over the', 311 | array([ 1. , 10.8], dtype=float32), 312 | 0.00016793422934946456]] 313 | 314 | [`Predicates`](https://timescale.github.io/python-vector/vector.html#predicates) 315 | objects are defined by the name of the metadata key, an operator, and a 316 | value. 317 | 318 | The supported operators are: `==`, `!=`, `<`, `<=`, `>`, `>=` 319 | 320 | The type of the values determines the type of comparison to perform. For 321 | example, passing in `"Sam"` (a string) will do a string comparison while 322 | a `10` (an int) will perform an integer comparison while a `10.0` 323 | (float) will do a float comparison. It is important to note that using a 324 | value of `"10"` will do a string comparison as well so it’s important to 325 | use the right type. Supported Python types are: `str`, `int`, and 326 | `float`. One more example with a string comparison: 327 | 328 | ``` python 329 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump")) 330 | ``` 331 | 332 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 333 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 334 | 'jumped over the', 335 | array([ 1. , 10.8], dtype=float32), 336 | 0.00016793422934946456]] 337 | 338 | The real power of predicates is that they can also be combined using the 339 | `&` operator (for combining predicates with AND semantics) and `|`(for 340 | combining using OR semantic). So you can do: 341 | 342 | ``` python 343 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump") & client.Predicates("times", ">", 1)) 344 | ``` 345 | 346 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 347 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 348 | 'jumped over the', 349 | array([ 1. 
, 10.8], dtype=float32), 350 | 0.00016793422934946456]] 351 | 352 | Just for sanity, let’s show a case where no results are returned because 353 | of our predicates: 354 | 355 | ``` python 356 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates("action", "==", "jump") & client.Predicates("times", "==", 1)) 357 | ``` 358 | 359 | [] 360 | 361 | And one more example where we define the predicates as a variable and 362 | use grouping with parentheses: 363 | 364 | ``` python 365 | my_predicates = client.Predicates("action", "==", "jump") & (client.Predicates("times", "==", 1) | client.Predicates("times", ">", 1)) 366 | vec.search([1.0, 9.0], limit=2, predicates=my_predicates) 367 | ``` 368 | 369 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 370 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 371 | 'jumped over the', 372 | array([ 1. , 10.8], dtype=float32), 373 | 0.00016793422934946456]] 374 | 375 | We also have some syntactic sugar for combining many predicates with AND 376 | semantics. You can pass in multiple 3-tuples to 377 | [`Predicates`](https://timescale.github.io/python-vector/vector.html#predicates): 378 | 379 | ``` python 380 | vec.search([1.0, 9.0], limit=2, predicates=client.Predicates(("action", "==", "jump"), ("times", ">", 10))) 381 | ``` 382 | 383 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 384 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 385 | 'jumped over the', 386 | array([ 1. , 10.8], dtype=float32), 387 | 0.00016793422934946456]] 388 | 389 | #### Filter your search by time 390 | 391 | When using `time-partitioning` (see below), you can very efficiently 392 | filter your search by time. Time-partitioning embeds a timestamp in the 393 | UUID-based ID associated with an embedding. Let us first 394 | create a collection with time partitioning and insert some data (one 395 | item from January 2018 and another in January 2019): 396 | 397 | ``` python 398 | tpvec = client.Sync(service_url, "time_partitioned_table", 2, time_partition_interval=timedelta(hours=6)) 399 | tpvec.create_tables() 400 | 401 | specific_datetime = datetime(2018, 1, 1, 12, 0, 0) 402 | tpvec.upsert([\ 403 | (client.uuid_from_time(specific_datetime), {"animal":"fox", "action": "sit", "times":1}, "the brown fox", [1.0,1.3]),\ 404 | (client.uuid_from_time(specific_datetime+timedelta(days=365)), {"animal":"fox", "action": "jump", "times":100}, "jumped over the", [1.0,10.8]),\ 405 | ]) 406 | ``` 407 | 408 | Then, you can filter using the timestamps by specifying a 409 | `uuid_time_filter`: 410 | 411 | ``` python 412 | tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime, specific_datetime+timedelta(days=1))) 413 | ``` 414 | 415 | [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 416 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 417 | 'the brown fox', 418 | array([1. , 1.3], dtype=float32), 419 | 0.14489260377438218]] 420 | 421 | A 422 | [`UUIDTimeRange`](https://timescale.github.io/python-vector/vector.html#uuidtimerange) 423 | can specify a start_date or end_date or both (as in the example above). 424 | Specifying only the start_date or end_date leaves the other end 425 | unconstrained. 426 | 427 | ``` python 428 | tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime)) 429 | ``` 430 | 431 | [[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'), 432 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 433 | 'jumped over the', 434 | array([ 1. 
, 10.8], dtype=float32), 435 | 0.00016793422934946456], 436 | [UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 437 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 438 | 'the brown fox', 439 | array([1. , 1.3], dtype=float32), 440 | 0.14489260377438218]] 441 | 442 | You have the option to define the inclusivity of the start and end dates 443 | with the `start_inclusive` and `end_inclusive` parameters. Setting 444 | `start_inclusive` to true results in comparisons using the `>=` 445 | operator, whereas setting it to false applies the `>` operator. By 446 | default, the start date is inclusive, while the end date is exclusive. 447 | One example: 448 | 449 | ``` python 450 | tpvec.search([1.0, 9.0], limit=4, uuid_time_filter=client.UUIDTimeRange(start_date=specific_datetime, start_inclusive=False)) 451 | ``` 452 | 453 | [[UUID('ac8be800-0de6-11e9-a5fd-5a100e653c25'), 454 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 455 | 'jumped over the', 456 | array([ 1. , 10.8], dtype=float32), 457 | 0.00016793422934946456]] 458 | 459 | Notice how the results are different when we use the 460 | `start_inclusive=False` option because the first row has the exact 461 | timestamp specified by `start_date`. 462 | 463 | We’ve also made it easy to integrate time filters using the `filter` and 464 | `predicates` parameters described above using special reserved key names 465 | to make it appear that the timestamps are part of your metadata. We 466 | found this useful when integrating with other systems that just want to 467 | specify a set of filters (often these are “auto retriever” type 468 | systems). The reserved key names are `__start_date` and `__end_date` for 469 | filters and `__uuid_timestamp` for predicates. Some examples below: 470 | 471 | ``` python 472 | tpvec.search([1.0, 9.0], limit=4, filter={ "__start_date": specific_datetime, "__end_date": specific_datetime+timedelta(days=1)}) 473 | ``` 474 | 475 | [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 476 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 477 | 'the brown fox', 478 | array([1. , 1.3], dtype=float32), 479 | 0.14489260377438218]] 480 | 481 | ``` python 482 | tpvec.search([1.0, 9.0], limit=4, 483 | predicates=client.Predicates("__uuid_timestamp", ">=", specific_datetime) & client.Predicates("__uuid_timestamp", "<", specific_datetime+timedelta(days=1))) 484 | ``` 485 | 486 | [[UUID('33c52800-ef15-11e7-8a12-ea51d07b6447'), 487 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 488 | 'the brown fox', 489 | array([1. , 1.3], dtype=float32), 490 | 0.14489260377438218]] 491 | 492 | ### Indexing 493 | 494 | Indexing speeds up queries over your data. By default, we set up indexes 495 | to query your data by the UUID and the metadata. 496 | 497 | But to speed up similarity search based on the embeddings, you have to 498 | create additional indexes. 499 | 500 | Note that if performing a query without an index, you will always get an 501 | exact result, but the query will be slow (it has to read all of the data 502 | you store for every query). With an index, your queries will be 503 | order-of-magnitude faster, but the results are approximate (because 504 | there are no known indexing techniques that are exact). 505 | 506 | Nevertheless, there are excellent approximate algorithms. There are 3 507 | different indexing algorithms available on the Timescale platform: 508 | Timescale Vector index, pgvector HNSW, and pgvector ivfflat. 
Below are 509 | the trade-offs between these algorithms: 510 | 511 | | Algorithm | Build speed | Query speed | Need to rebuild after updates | 512 | |------------------|-------------|-------------|-------------------------------| 513 | | StreamingDiskANN | Fast | Fastest | No | 514 | | pgvector hnsw | Slowest | Faster | No | 515 | | pgvector ivfflat | Fastest | Slowest | Yes | 516 | 517 | You can see 518 | [benchmarks](https://www.timescale.com/blog/how-we-made-postgresql-the-best-vector-database/) 519 | on our blog. 520 | 521 | We recommend using the Timescale Vector index for most use cases. This 522 | can be created with: 523 | 524 | ``` python 525 | vec.create_embedding_index(client.DiskAnnIndex()) 526 | ``` 527 | 528 | Indexes are created for a particular distance metric type. So it is 529 | important that the same distance metric is set on the client during 530 | index creation as it is during queries. See the `distance type` section 531 | below. 532 | 533 | Each of these indexes has a set of build-time options for controlling 534 | the speed/accuracy trade-off when creating the index and an additional 535 | query-time option for controlling accuracy during a particular query. We 536 | have smart defaults for all of these options but will also describe the 537 | details below so that you can adjust these options manually. 538 | 539 | #### StreamingDiskANN index 540 | 541 | The StreamingDiskANN index from pgvectorscale is a graph-based algorithm 542 | that uses the [DiskANN](https://github.com/microsoft/DiskANN) algorithm. 543 | You can read more about it on our 544 | [blog](https://www.timescale.com/blog/how-we-made-postgresql-as-fast-as-pinecone-for-vector-data/) 545 | announcing its release. 546 | 547 | To create this index, run: 548 | 549 | ``` python 550 | vec.create_embedding_index(client.DiskAnnIndex()) 551 | ``` 552 | 553 | The above command will create the index using smart defaults. There are 554 | a number of parameters you could tune to adjust the accuracy/speed 555 | trade-off. 556 | 557 | The parameters you can set at index build time are: 558 | 559 | | Parameter name | Description | Default value | 560 | |--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------| 561 | | `storage_layout` | `memory_optimized` which uses SBQ to compress vector data or `plain` which stores data uncompressed | memory_optimized | 562 | | `num_neighbors` | Sets the maximum number of neighbors per node. Higher values increase accuracy but make the graph traversal slower. | 50 | 563 | | `search_list_size` | This is the S parameter used in the greedy search algorithm used during construction. Higher values improve graph quality at the cost of slower index builds. | 100 | 564 | | `max_alpha` | Is the alpha parameter in the algorithm. Higher values improve graph quality at the cost of slower index builds. | 1.2 | 565 | | `num_dimensions` | The number of dimensions to index. By default, all dimensions are indexed. 
But you can also index fewer dimensions to make use of [Matryoshka embeddings](https://huggingface.co/blog/matryoshka) | 0 (all dimensions) | 566 | | `num_bits_per_dimension` | Number of bits used to encode each dimension when using SBQ | 2 for less than 900 dimensions, 1 otherwise | 567 | 568 | To set these parameters, you could run: 569 | 570 | ``` python 571 | vec.create_embedding_index(client.DiskAnnIndex(num_neighbors=50, search_list_size=100, max_alpha=1.0, storage_layout="memory_optimized", num_dimensions=0, num_bits_per_dimension=1)) 572 | ``` 573 | 574 | You can also set a parameter to control the accuracy vs. query speed 575 | trade-off at query time. The parameter is set in the `search()` function 576 | using the `query_params` argument. 577 | 578 | | Parameter name | Description | Default value | 579 | |--------------------|-------------------------------------------------------------------------|---------------| 580 | | `search_list_size` | The number of additional candidates considered during the graph search. | 100 | 581 | | `rescore` | The number of elements rescored (0 to disable rescoring) | 50 | 582 | 583 | We suggest using the `rescore` parameter to fine-tune accuracy. 584 | 585 | ``` python 586 | vec.search([1.0, 9.0], limit=4, query_params=client.DiskAnnIndexParams(rescore=400, search_list_size=10)) 587 | ``` 588 | 589 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 590 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 591 | 'jumped over the', 592 | array([ 1. , 10.8], dtype=float32), 593 | 0.00016793422934946456], 594 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 595 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 596 | 'the brown fox', 597 | array([1. , 1.3], dtype=float32), 598 | 0.14489260377438218]] 599 | 600 | To drop the index, run: 601 | 602 | ``` python 603 | vec.drop_embedding_index() 604 | ``` 605 | 606 | #### pgvector HNSW index 607 | 608 | Pgvector provides a graph-based indexing algorithm based on the popular 609 | [HNSW algorithm](https://arxiv.org/abs/1603.09320). 610 | 611 | To create this index, run: 612 | 613 | ``` python 614 | vec.create_embedding_index(client.HNSWIndex()) 615 | ``` 616 | 617 | The above command will create the index using smart defaults. There are 618 | a number of parameters you could tune to adjust the accuracy/speed 619 | trade-off. 620 | 621 | The parameters you can set at index build time are: 622 | 623 | | Parameter name | Description | Default value | 624 | |-----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------| 625 | | m | Represents the maximum number of connections per layer. Think of these connections as edges created for each node during graph construction. Increasing m increases accuracy but also increases index build time and size. | 16 | 626 | | ef_construction | Represents the size of the dynamic candidate list for constructing the graph. It influences the trade-off between index quality and construction speed. Increasing ef_construction enables more accurate search results at the expense of lengthier index build times. | 64 | 627 | 628 | To set these parameters, you could run: 629 | 630 | ``` python 631 | vec.create_embedding_index(client.HNSWIndex(m=16, ef_construction=64)) 632 | ``` 633 | 634 | You can also set a parameter to control the accuracy vs. 
query speed 635 | trade-off at query time. The parameter is set in the `search()` function 636 | using the `query_params` argument. You can set the `ef_search` (default: 637 | 40). This parameter specifies the size of the dynamic candidate list 638 | used during search. Higher values improve query accuracy while making 639 | the query slower. 640 | 641 | You can specify this value during search as follows: 642 | 643 | ``` python 644 | vec.search([1.0, 9.0], limit=4, query_params=client.HNSWIndexParams(ef_search=10)) 645 | ``` 646 | 647 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 648 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 649 | 'jumped over the', 650 | array([ 1. , 10.8], dtype=float32), 651 | 0.00016793422934946456], 652 | [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'), 653 | {'times': 1, 'action': 'sit', 'animal': 'fox'}, 654 | 'the brown fox', 655 | array([1. , 1.3], dtype=float32), 656 | 0.14489260377438218]] 657 | 658 | To drop the index, run: 659 | 660 | ``` python 661 | vec.drop_embedding_index() 662 | ``` 663 | 664 | #### pgvector ivfflat index 665 | 666 | Pgvector provides a clustering-based indexing algorithm. Our [blog 667 | post](https://www.timescale.com/blog/nearest-neighbor-indexes-what-are-ivfflat-indexes-in-pgvector-and-how-do-they-work/) 668 | describes how it works in detail. It provides the fastest index-build 669 | speed but the slowest query speeds of any indexing algorithm. 670 | 671 | To create this index, run: 672 | 673 | ``` python 674 | vec.create_embedding_index(client.IvfflatIndex()) 675 | ``` 676 | 677 | Note: *ivfflat should never be created on empty tables* because it needs 678 | to cluster data, and that only happens when an index is first created, 679 | not when new rows are inserted or modified. Also, if your table 680 | undergoes a lot of modifications, you will need to rebuild this index 681 | occasionally to maintain good accuracy. See our [blog 682 | post](https://www.timescale.com/blog/nearest-neighbor-indexes-what-are-ivfflat-indexes-in-pgvector-and-how-do-they-work/) 683 | for details. 684 | 685 | Pgvector ivfflat has a `lists` index parameter that is automatically set 686 | with a smart default based on the number of rows in your table. If you 687 | know that you’ll have a different table size, you can specify the number 688 | of records to use for calculating the `lists` parameter as follows: 689 | 690 | ``` python 691 | vec.create_embedding_index(client.IvfflatIndex(num_records=1000000)) 692 | ``` 693 | 694 | You can also set the `lists` parameter directly: 695 | 696 | ``` python 697 | vec.create_embedding_index(client.IvfflatIndex(num_lists=100)) 698 | ``` 699 | 700 | You can also set a parameter to control the accuracy vs. query speed 701 | trade-off at query time. The parameter is set in the `search()` function 702 | using the `query_params` argument. You can set the `probes` parameter. This 703 | parameter specifies the number of clusters searched during a query. It 704 | is recommended to set this parameter to `sqrt(lists)` where lists is the 705 | `num_lists` parameter used above during index creation. Higher values 706 | improve query accuracy while making the query slower. 707 | 708 | You can specify this value during search as follows: 709 | 710 | ``` python 711 | vec.search([1.0, 9.0], limit=4, query_params=client.IvfflatIndexParams(probes=10)) 712 | ``` 713 | 714 | [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'), 715 | {'times': 100, 'action': 'jump', 'animal': 'fox'}, 716 | 'jumped over the', 717 | array([ 1. 
You can specify this value during search as follows:

``` python
vec.search([1.0, 9.0], limit=4, query_params=client.IvfflatIndexParams(probes=10))
```

    [[UUID('456dbbbc-4a0d-11ef-94a3-6ee10b77fd09'),
      {'times': 100, 'action': 'jump', 'animal': 'fox'},
      'jumped over the',
      array([ 1. , 10.8], dtype=float32),
      0.00016793422934946456],
     [UUID('456dbb6c-4a0d-11ef-94a3-6ee10b77fd09'),
      {'times': 1, 'action': 'sit', 'animal': 'fox'},
      'the brown fox',
      array([1. , 1.3], dtype=float32),
      0.14489260377438218]]

To drop the index, run:

``` python
vec.drop_embedding_index()
```

### Time partitioning

In many use cases where you have many embeddings, time is an important
component associated with the embeddings. For example, when embedding
news stories, you often search by time as well as similarity (e.g.,
stories related to Bitcoin in the past week or stories about Clinton in
November 2016).

Yet, traditionally, searching by the two components “similarity” and “time”
is challenging for Approximate Nearest Neighbor (ANN) indexes and makes
the similarity-search index less effective.

One approach to solving this is partitioning the data by time and
creating ANN indexes on each partition individually. Then, during
search, you can:

- Step 1: filter out partitions that don’t match the time predicate.
- Step 2: perform the similarity search on all matching partitions.
- Step 3: combine all the results from each partition in step 2, rerank,
  and filter out results by time.

Step 1 makes the search a lot more efficient by filtering out whole
swaths of data in one go.

Timescale-vector supports time partitioning using TimescaleDB’s
hypertables. To use this feature, simply indicate the length of time for
each partition when creating the client:

``` python
from datetime import timedelta
from datetime import datetime
```

``` python
vec = client.Async(service_url, "my_data_with_time_partition", 2, time_partition_interval=timedelta(hours=6))
await vec.create_tables()
```

Then, insert data where the IDs are version 1 UUIDs, so that the time
component of each UUID specifies the time of the embedding. For example,
to create an embedding for the current time, simply do:

``` python
id = uuid.uuid1()
await vec.upsert([(id, {"key": "val"}, "the brown fox", [1.0, 1.2])])
```

To insert data for a specific time in the past, create the UUID using
our
[`uuid_from_time`](https://timescale.github.io/python-vector/vector.html#uuid_from_time)
function:

``` python
specific_datetime = datetime(2018, 8, 10, 15, 30, 0)
await vec.upsert([(client.uuid_from_time(specific_datetime), {"key": "val"}, "the brown fox", [1.0, 1.2])])
```

You can then query the data by specifying a `uuid_time_filter` in the
search call:

``` python
rec = await vec.search([1.0, 2.0], limit=4, uuid_time_filter=client.UUIDTimeRange(specific_datetime-timedelta(days=7), specific_datetime+timedelta(days=7)))
```
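
For instance, to mirror the “stories from the past week” use case described
above, you can anchor the range to the current time. This is a sketch that
reuses the same `vec` client and `UUIDTimeRange` filter shown above:

``` python
# Only partitions overlapping the last 7 days need to be searched.
now = datetime.now()
recent = await vec.search(
    [1.0, 2.0],
    limit=4,
    uuid_time_filter=client.UUIDTimeRange(now - timedelta(days=7), now),
)
```
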
### Distance metrics

By default, we use cosine distance to measure how similar an embedding
is to a given query. In addition to cosine distance, we also support
Euclidean/L2 distance. The distance type is set when creating the client
using the `distance_type` parameter. For example, to use the Euclidean
distance metric, you can create the client with:

``` python
vec = client.Sync(service_url, "my_data", 2, distance_type="euclidean")
```

Valid values for `distance_type` are `cosine` and `euclidean`.

It is important to note that you should use consistent distance types on
the clients that create indexes and the clients that perform queries,
because an index is only valid for one particular type of distance
measure.

Please note that the Timescale Vector index only supports cosine distance
at this time.

# LangChain integration

[LangChain](https://www.langchain.com/) is a popular framework for
developing applications powered by LLMs. Timescale Vector has a native
LangChain integration, enabling you to use Timescale Vector as a
vectorstore and leverage all its capabilities in your applications built
with LangChain. A short usage sketch follows the list of resources below.

Here are resources about using Timescale Vector with LangChain:

- [Getting started with LangChain and Timescale
  Vector](https://python.langchain.com/docs/integrations/vectorstores/timescalevector):
  You’ll learn how to use Timescale Vector for (1) semantic search, (2)
  time-based vector search, (3) self-querying, and (4) how to create
  indexes to speed up queries.
- [PostgreSQL Self
  Querying](https://python.langchain.com/docs/integrations/retrievers/self_query/timescalevector_self_query):
  Learn how to use Timescale Vector with self-querying in LangChain.
- [LangChain template: RAG with conversational
  retrieval](https://github.com/langchain-ai/langchain/tree/master/templates/rag-timescale-conversation):
  This template is used for conversational retrieval, which is one of
  the most popular LLM use-cases. It passes both a conversation history
  and retrieved documents into an LLM for synthesis.
- [LangChain template: RAG with time-based search and self-query
  retrieval](https://github.com/langchain-ai/langchain/tree/master/templates/rag-timescale-hybrid-search-time):
  This template shows how to use timescale-vector with the self-query
  retriever to perform hybrid search on similarity and time. This is
  useful any time your data has a strong time-based component.
- [Learn more about Timescale Vector and
  LangChain](https://blog.langchain.dev/timescale-vector-x-langchain-making-postgresql-a-better-vector-database-for-ai-applications/)
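
As a rough sketch of basic usage (this assumes the `langchain-openai` and
`langchain-community` packages are installed, an OpenAI API key is
configured, `service_url` points at your database, and the collection name
is just a placeholder; see the getting-started guide above for the
authoritative walkthrough):

``` python
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.timescalevector import TimescaleVector

# Create (or connect to) a collection backed by Timescale Vector.
vector_store = TimescaleVector(
    collection_name="my_langchain_collection",  # placeholder name
    service_url=service_url,
    embedding=OpenAIEmbeddings(),
)

# Add a couple of documents and run a similarity search.
vector_store.add_texts(["the brown fox", "jumped over the"])
results = vector_store.similarity_search_with_score("quick fox", k=2)
```
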
# LlamaIndex integration

LlamaIndex is a popular data framework for connecting custom data
sources to large language models (LLMs). Timescale Vector has a native
LlamaIndex integration, enabling you to use Timescale Vector as a
vectorstore and leverage all its capabilities in your applications built
with LlamaIndex.

Here are resources about using Timescale Vector with LlamaIndex:

- [Getting started with LlamaIndex and Timescale
  Vector](https://docs.llamaindex.ai/en/stable/examples/vector_stores/Timescalevector.html):
  You’ll learn how to use Timescale Vector for (1) similarity
  search, (2) time-based vector search, (3) faster search with indexes,
  and (4) retrieval and query engine.
- [Time-based
  retrieval](https://youtu.be/EYMZVfKcRzM?si=I0H3uUPgzKbQw__W): Learn
  how to power RAG applications with time-based retrieval.
- [Llama Pack: Auto Retrieval with time-based
  search](https://github.com/run-llama/llama-hub/tree/main/llama_hub/llama_packs/timescale_vector_autoretrieval):
  This pack demonstrates performing auto-retrieval for hybrid search
  based on both similarity and time, using the timescale-vector
  (PostgreSQL) vectorstore.
- [Learn more about Timescale Vector and
  LlamaIndex](https://www.timescale.com/blog/timescale-vector-x-llamaindex-making-postgresql-a-better-vector-database-for-ai-applications/)

# PgVectorize

PgVectorize enables you to create vector embeddings from any data that
you already have stored in PostgreSQL. You can get more background
information in our [blog
post](https://www.timescale.com/blog/a-complete-guide-to-creating-and-storing-embeddings-for-postgresql-data/)
announcing this feature, as well as a [“how we built
it”](https://www.timescale.com/blog/how-we-designed-a-resilient-vector-embedding-creation-system-for-postgresql-data/)
post going into the details of the design.

To create vector embeddings, simply attach PgVectorize to any PostgreSQL
table, and it will automatically sync that table’s data with a set of
embeddings stored in Timescale Vector. For example, let’s say you have a
blog table defined in the following way:

``` python
import psycopg2
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from timescale_vector import client, pgvectorizer
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.timescalevector import TimescaleVector
from datetime import timedelta
```

``` python
with psycopg2.connect(service_url) as conn:
    with conn.cursor() as cursor:
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS blog (
                id              SERIAL PRIMARY KEY NOT NULL,
                title           TEXT NOT NULL,
                author          TEXT NOT NULL,
                contents        TEXT NOT NULL,
                category        TEXT NOT NULL,
                published_time  TIMESTAMPTZ NULL --NULL if not yet published
            );
        ''')
```

You can insert some data as follows:

``` python
with psycopg2.connect(service_url) as conn:
    with conn.cursor() as cursor:
        cursor.execute('''
            INSERT INTO blog (title, author, contents, category, published_time)
            VALUES ('First Post', 'Matvey Arye', 'some super interesting content about cats.', 'AI', '2021-01-01');
        ''')
```

Now, say you want to embed these blogs in Timescale Vector. First, you
need to define an `embed_and_write` function that takes a set of blog
posts, creates the embeddings, and writes them into TimescaleVector. For
example, if using LangChain, it could look something like the following.
``` python
def get_document(blog):
    text_splitter = CharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    docs = []
    for chunk in text_splitter.split_text(blog['contents']):
        content = f"Author {blog['author']}, title: {blog['title']}, contents:{chunk}"
        metadata = {
            "id": str(client.uuid_from_time(blog['published_time'])),
            "blog_id": blog['id'],
            "author": blog['author'],
            "category": blog['category'],
            "published_time": blog['published_time'].isoformat(),
        }
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

def embed_and_write(blog_instances, vectorizer):
    embedding = OpenAIEmbeddings()
    vector_store = TimescaleVector(
        collection_name="blog_embedding",
        service_url=service_url,
        embedding=embedding,
        time_partition_interval=timedelta(days=30),
    )

    # Delete old embeddings for all ids in the work queue. locked_id is a special
    # column that is set to the primary key of the table being embedded.
    # For items that are deleted, it is the only key that is set.
    metadata_for_delete = [{"blog_id": blog['locked_id']} for blog in blog_instances]
    vector_store.delete_by_metadata(metadata_for_delete)

    documents = []
    for blog in blog_instances:
        # Skip blogs that are not published yet, or are deleted (in which case published_time will be NULL).
        if blog['published_time'] is not None:
            documents.extend(get_document(blog))

    if len(documents) == 0:
        return

    texts = [d.page_content for d in documents]
    metadatas = [d.metadata for d in documents]
    ids = [d.metadata["id"] for d in documents]
    vector_store.add_texts(texts, metadatas, ids)
```

Then, all you have to do is run the following code in a scheduled job
(cron job, Lambda job, etc.):

``` python
# This job should be run on a schedule.
vectorizer = pgvectorizer.Vectorize(service_url, 'blog')
while vectorizer.process(embed_and_write) > 0:
    pass
```

Every time that job runs, it will sync the table with your embeddings.
It will sync all inserts, updates, and deletes to an embeddings table
called `blog_embedding`.

Now, you can simply search the embeddings as follows (again, using
LangChain in the example):

``` python
embedding = OpenAIEmbeddings()
vector_store = TimescaleVector(
    collection_name="blog_embedding",
    service_url=service_url,
    embedding=embedding,
    time_partition_interval=timedelta(days=30),
)

res = vector_store.similarity_search_with_score("Blogs about cats")
res
```

    [(Document(metadata={'id': '334e4800-4bee-11eb-a52a-57b3c4a96ccb', 'author': 'Matvey Arye', 'blog_id': 1, 'category': 'AI', 'published_time': '2021-01-01T00:00:00-05:00'}, page_content='Author Matvey Arye, title: First Post, contents:some super interesting content about cats.'),
      0.12680577303752072)]

## Development

This project is developed with [nbdev](https://nbdev.fast.ai/). Please
see that website for the development process.
--------------------------------------------------------------------------------