├── app ├── services │ ├── __init__.py │ ├── llm_factory.py │ └── synthesizer.py ├── example.env ├── insert_vectors.py ├── config │ └── settings.py ├── similarity_search.py └── database │ └── vector_store.py ├── requirements.txt ├── docker └── docker-compose.yml ├── LICENCE ├── .gitignore ├── data └── faq_dataset.csv └── README.md /app/services/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | openai 3 | psycopg 4 | python-dotenv 5 | timescale-vector 6 | instructor 7 | anthropic -------------------------------------------------------------------------------- /app/example.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | TIMESCALE_SERVICE_URL=postgres://postgres:password@localhost:5432/postgres 3 | -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | name: timescaledb 2 | 3 | services: 4 | timescaledb: 5 | image: timescale/timescaledb-ha:pg16 6 | container_name: timescaledb 7 | environment: 8 | - POSTGRES_DB=postgres 9 | - POSTGRES_PASSWORD=password 10 | ports: 11 | - "5432:5432" 12 | volumes: 13 | - timescaledb_data:/var/lib/postgresql/data 14 | restart: unless-stopped 15 | 16 | volumes: 17 | timescaledb_data: -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Datalumina B.V. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | *.code-workspace 72 | 73 | # Spyder 74 | .spyproject/ 75 | 76 | # Jupyter NB Checkpoints 77 | .ipynb_checkpoints/ 78 | 79 | # Mac OS-specific storage files 80 | .DS_Store 81 | 82 | # vim 83 | *.swp 84 | *.swo 85 | 86 | # Mypy cache 87 | .mypy_cache/ 88 | 89 | # Exclude virtual environment 90 | .venv/ 91 | 92 | # Exclude trained models 93 | /models/ 94 | 95 | # exclude data from source control by default 96 | # /data/ -------------------------------------------------------------------------------- /app/services/llm_factory.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Type 2 | 3 | import instructor 4 | from anthropic import Anthropic 5 | from openai import OpenAI 6 | from pydantic import BaseModel 7 | 8 | from config.settings import get_settings 9 | 10 | 11 | class LLMFactory: 12 | def __init__(self, provider: str): 13 | self.provider = provider 14 | self.settings = getattr(get_settings(), provider) 15 | self.client = self._initialize_client() 16 | 17 | def _initialize_client(self) -> Any: 18 | client_initializers = { 19 | "openai": lambda s: instructor.from_openai(OpenAI(api_key=s.api_key)), 20 | "anthropic": lambda s: instructor.from_anthropic( 21 | Anthropic(api_key=s.api_key) 22 | ), 23 | "llama": lambda s: instructor.from_openai( 24 | OpenAI(base_url=s.base_url, api_key=s.api_key), 25 | 
mode=instructor.Mode.JSON, 26 | ), 27 | } 28 | 29 | initializer = client_initializers.get(self.provider) 30 | if initializer: 31 | return initializer(self.settings) 32 | raise ValueError(f"Unsupported LLM provider: {self.provider}") 33 | 34 | def create_completion( 35 | self, response_model: Type[BaseModel], messages: List[Dict[str, str]], **kwargs 36 | ) -> Any: 37 | completion_params = { 38 | "model": kwargs.get("model", self.settings.default_model), 39 | "temperature": kwargs.get("temperature", self.settings.temperature), 40 | "max_retries": kwargs.get("max_retries", self.settings.max_retries), 41 | "max_tokens": kwargs.get("max_tokens", self.settings.max_tokens), 42 | "response_model": response_model, 43 | "messages": messages, 44 | } 45 | return self.client.chat.completions.create(**completion_params) 46 | -------------------------------------------------------------------------------- /app/insert_vectors.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | from database.vector_store import VectorStore 5 | from timescale_vector.client import uuid_from_time 6 | 7 | # Initialize VectorStore 8 | vec = VectorStore() 9 | 10 | # Read the CSV file 11 | df = pd.read_csv("../data/faq_dataset.csv", sep=";") 12 | 13 | 14 | # Prepare data for insertion 15 | def prepare_record(row): 16 | """Prepare a record for insertion into the vector store. 17 | 18 | This function creates a record with a UUID version 1 as the ID, which captures 19 | the current time or a specified time. 20 | 21 | Note: 22 | - By default, this function uses the current time for the UUID. 23 | - To use a specific time: 24 | 1. Import the datetime module. 25 | 2. Create a datetime object for your desired time. 26 | 3. Use uuid_from_time(your_datetime) instead of uuid_from_time(datetime.now()). 
27 | 28 | Example: 29 | from datetime import datetime 30 | specific_time = datetime(2023, 1, 1, 12, 0, 0) 31 | id = str(uuid_from_time(specific_time)) 32 | 33 | This is useful when your content already has an associated datetime. 34 | """ 35 | content = f"Question: {row['question']}\nAnswer: {row['answer']}" 36 | embedding = vec.get_embedding(content) 37 | return pd.Series( 38 | { 39 | "id": str(uuid_from_time(datetime.now())), 40 | "metadata": { 41 | "category": row["category"], 42 | "created_at": datetime.now().isoformat(), 43 | }, 44 | "contents": content, 45 | "embedding": embedding, 46 | } 47 | ) 48 | 49 | 50 | records_df = df.apply(prepare_record, axis=1) 51 | 52 | # Create tables and insert data 53 | vec.create_tables() 54 | vec.create_index() # DiskAnnIndex 55 | vec.upsert(records_df) 56 | -------------------------------------------------------------------------------- /app/config/settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import timedelta 4 | from functools import lru_cache 5 | from typing import Optional 6 | 7 | from dotenv import load_dotenv 8 | from pydantic import BaseModel, Field 9 | 10 | load_dotenv(dotenv_path="./.env") 11 | 12 | 13 | def setup_logging(): 14 | """Configure basic logging for the application.""" 15 | logging.basicConfig( 16 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 17 | ) 18 | 19 | 20 | class LLMSettings(BaseModel): 21 | """Base settings for Language Model configurations.""" 22 | 23 | temperature: float = 0.0 24 | max_tokens: Optional[int] = None 25 | max_retries: int = 3 26 | 27 | 28 | class OpenAISettings(LLMSettings): 29 | """OpenAI-specific settings extending LLMSettings.""" 30 | 31 | api_key: str = Field(default_factory=lambda: os.getenv("OPENAI_API_KEY")) 32 | default_model: str = Field(default="gpt-4o") 33 | embedding_model: str = Field(default="text-embedding-3-small") 34 | 35 | 36 | class 
DatabaseSettings(BaseModel): 37 | """Database connection settings.""" 38 | 39 | service_url: str = Field(default_factory=lambda: os.getenv("TIMESCALE_SERVICE_URL")) 40 | 41 | 42 | class VectorStoreSettings(BaseModel): 43 | """Settings for the VectorStore.""" 44 | 45 | table_name: str = "embeddings" 46 | embedding_dimensions: int = 1536 47 | time_partition_interval: timedelta = timedelta(days=7) 48 | 49 | 50 | class Settings(BaseModel): 51 | """Main settings class combining all sub-settings.""" 52 | 53 | openai: OpenAISettings = Field(default_factory=OpenAISettings) 54 | database: DatabaseSettings = Field(default_factory=DatabaseSettings) 55 | vector_store: VectorStoreSettings = Field(default_factory=VectorStoreSettings) 56 | 57 | 58 | @lru_cache() 59 | def get_settings() -> Settings: 60 | """Create and return a cached instance of the Settings.""" 61 | settings = Settings() 62 | setup_logging() 63 | return settings 64 | -------------------------------------------------------------------------------- /data/faq_dataset.csv: -------------------------------------------------------------------------------- 1 | question;answer;category 2 | What are your shipping options?;We offer standard (3-5 business days) and express (1-2 business days) shipping options.;Shipping 3 | How can I track my order?;You can track your order by logging into your account and viewing the order status or using the tracking number sent to your email.;Order Management 4 | What is your return policy?;We offer a 30-day return policy for most items. Please ensure the item is unused and in its original packaging.;Returns 5 | Do you offer international shipping?;Yes, we ship to most countries worldwide. 
Shipping costs and delivery times vary by location.;Shipping 6 | How do I cancel an order?;You can cancel an order within 24 hours of placement by contacting our customer service team.;Order Management 7 | What payment methods do you accept?;We accept major credit cards, PayPal, and Apple Pay.;Payment 8 | Are your products authentic?;Yes, all our products are 100% authentic and sourced directly from authorized distributors.;Product Information 9 | How do I contact customer service?;You can reach our customer service team via email, phone, or live chat on our website.;Customer Support 10 | Do you offer gift wrapping?;Yes, we offer gift wrapping for a small additional fee. You can select this option during checkout.;Services 11 | What is your price match policy?;We offer price matching for identical items from authorized retailers within 14 days of purchase.;Pricing 12 | How do I create an account?;You can create an account by clicking the "Sign Up" button on our homepage and following the prompts.;Account Management 13 | Do you have a loyalty program?;Yes, we have a rewards program where you earn points on purchases that can be redeemed for discounts.;Rewards 14 | What should I do if I receive a damaged item?;Please contact our customer service team immediately with photos of the damaged item and packaging.;Returns 15 | How can I check the status of my refund?;Refund status can be checked in your account under "Order History" or by contacting customer service.;Returns 16 | Do you offer discounts for bulk orders?;Yes, we offer discounts for bulk orders. Please contact our sales team for a custom quote.;Pricing 17 | How do I unsubscribe from your email newsletter?;You can unsubscribe by clicking the "Unsubscribe" link at the bottom of any of our email newsletters.;Communication 18 | What is your warranty policy?;Warranty policies vary by product. 
Please check the product page or contact us for specific warranty information.;Product Information 19 | Do you have physical store locations?;We are an online-only store and do not have physical retail locations at this time.;Company Information 20 | How do I apply a promotional code to my order?;You can enter your promotional code in the designated field during the checkout process.;Payment 21 | What is your privacy policy regarding customer data?;We take data privacy seriously. Our detailed privacy policy can be found on our website footer.;Privacy -------------------------------------------------------------------------------- /app/similarity_search.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from database.vector_store import VectorStore 3 | from services.synthesizer import Synthesizer 4 | from timescale_vector import client 5 | 6 | # Initialize VectorStore 7 | vec = VectorStore() 8 | 9 | # -------------------------------------------------------------- 10 | # Shipping question 11 | # -------------------------------------------------------------- 12 | 13 | relevant_question = "What are your shipping options?" 14 | results = vec.search(relevant_question, limit=3) 15 | 16 | response = Synthesizer.generate_response(question=relevant_question, context=results) 17 | 18 | print(f"\n{response.answer}") 19 | print("\nThought process:") 20 | for thought in response.thought_process: 21 | print(f"- {thought}") 22 | print(f"\nContext: {response.enough_context}") 23 | 24 | # -------------------------------------------------------------- 25 | # Irrelevant question 26 | # -------------------------------------------------------------- 27 | 28 | irrelevant_question = "What is the weather in Tokyo?" 
29 | 30 | results = vec.search(irrelevant_question, limit=3) 31 | 32 | response = Synthesizer.generate_response(question=irrelevant_question, context=results) 33 | 34 | print(f"\n{response.answer}") 35 | print("\nThought process:") 36 | for thought in response.thought_process: 37 | print(f"- {thought}") 38 | print(f"\nContext: {response.enough_context}") 39 | 40 | # -------------------------------------------------------------- 41 | # Metadata filtering 42 | # -------------------------------------------------------------- 43 | 44 | metadata_filter = {"category": "Shipping"} 45 | 46 | results = vec.search(relevant_question, limit=3, metadata_filter=metadata_filter) 47 | 48 | response = Synthesizer.generate_response(question=relevant_question, context=results) 49 | 50 | print(f"\n{response.answer}") 51 | print("\nThought process:") 52 | for thought in response.thought_process: 53 | print(f"- {thought}") 54 | print(f"\nContext: {response.enough_context}") 55 | 56 | # -------------------------------------------------------------- 57 | # Advanced filtering using Predicates 58 | # -------------------------------------------------------------- 59 | 60 | predicates = client.Predicates("category", "==", "Shipping") 61 | results = vec.search(relevant_question, limit=3, predicates=predicates) 62 | 63 | 64 | predicates = client.Predicates("category", "==", "Shipping") | client.Predicates( 65 | "category", "==", "Services" 66 | ) 67 | results = vec.search(relevant_question, limit=3, predicates=predicates) 68 | 69 | 70 | predicates = client.Predicates("category", "==", "Shipping") & client.Predicates( 71 | "created_at", ">", "2024-09-01" 72 | ) 73 | results = vec.search(relevant_question, limit=3, predicates=predicates) 74 | 75 | # -------------------------------------------------------------- 76 | # Time-based filtering 77 | # -------------------------------------------------------------- 78 | 79 | # September — Returning results 80 | time_range = (datetime(2024, 9, 1), 
datetime(2024, 9, 30)) 81 | results = vec.search(relevant_question, limit=3, time_range=time_range) 82 | 83 | # August — Not returning any results 84 | time_range = (datetime(2024, 8, 1), datetime(2024, 8, 30)) 85 | results = vec.search(relevant_question, limit=3, time_range=time_range) 86 | -------------------------------------------------------------------------------- /app/services/synthesizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import pandas as pd 3 | from pydantic import BaseModel, Field 4 | from services.llm_factory import LLMFactory 5 | 6 | 7 | class SynthesizedResponse(BaseModel): 8 | thought_process: List[str] = Field( 9 | description="List of thoughts that the AI assistant had while synthesizing the answer" 10 | ) 11 | answer: str = Field(description="The synthesized answer to the user's question") 12 | enough_context: bool = Field( 13 | description="Whether the assistant has enough context to answer the question" 14 | ) 15 | 16 | 17 | class Synthesizer: 18 | SYSTEM_PROMPT = """ 19 | # Role and Purpose 20 | You are an AI assistant for an e-commerce FAQ system. Your task is to synthesize a coherent and helpful answer 21 | based on the given question and relevant context retrieved from a knowledge database. 22 | 23 | # Guidelines: 24 | 1. Provide a clear and concise answer to the question. 25 | 2. Use only the information from the relevant context to support your answer. 26 | 3. The context is retrieved based on cosine similarity, so some information might be missing or irrelevant. 27 | 4. Be transparent when there is insufficient information to fully answer the question. 28 | 5. Do not make up or infer information not present in the provided context. 29 | 6. If you cannot answer the question based on the given context, clearly state that. 30 | 7. Maintain a helpful and professional tone appropriate for customer service. 31 | 8. 
Adhere strictly to company guidelines and policies by using only the provided knowledge base. 32 | 33 | Review the question from the user: 34 | """ 35 | 36 | @staticmethod 37 | def generate_response(question: str, context: pd.DataFrame) -> SynthesizedResponse: 38 | """Generates a synthesized response based on the question and context. 39 | 40 | Args: 41 | question: The user's question. 42 | context: The relevant context retrieved from the knowledge base. 43 | 44 | Returns: 45 | A SynthesizedResponse containing thought process and answer. 46 | """ 47 | context_str = Synthesizer.dataframe_to_json( 48 | context, columns_to_keep=["content", "category"] 49 | ) 50 | 51 | messages = [ 52 | {"role": "system", "content": Synthesizer.SYSTEM_PROMPT}, 53 | {"role": "user", "content": f"# User question:\n{question}"}, 54 | { 55 | "role": "assistant", 56 | "content": f"# Retrieved information:\n{context_str}", 57 | }, 58 | ] 59 | 60 | llm = LLMFactory("openai") 61 | return llm.create_completion( 62 | response_model=SynthesizedResponse, 63 | messages=messages, 64 | ) 65 | 66 | @staticmethod 67 | def dataframe_to_json( 68 | context: pd.DataFrame, 69 | columns_to_keep: List[str], 70 | ) -> str: 71 | """ 72 | Convert the context DataFrame to a JSON string. 73 | 74 | Args: 75 | context (pd.DataFrame): The context DataFrame. 76 | columns_to_keep (List[str]): The columns to include in the output. 77 | 78 | Returns: 79 | str: A JSON string representation of the selected columns. 
80 | """ 81 | return context[columns_to_keep].to_json(orient="records", indent=2) 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building a High-Performance RAG Solution with Pgvectorscale and Python 2 | 3 | This tutorial will guide you through setting up and using `pgvectorscale` with Docker and Python, leveraging OpenAI's powerful `text-embedding-3-small` model for embeddings. You'll learn to build a cutting-edge RAG (Retrieval-Augmented Generation) solution, combining advanced retrieval techniques (including hybrid search) with intelligent answer generation based on the retrieved context. Perfect for AI engineers looking to enhance their projects with state-of-the-art vector search and generation capabilities with the power of PostgreSQL. 4 | 5 | ## YouTube Tutorial 6 | You can watch the full tutorial here on [YouTube](https://youtu.be/hAdEuDBN57g). 7 | 8 | ## Pgvectorscale Documentation 9 | 10 | For more information about using PostgreSQL as a vector database in AI applications with Timescale, check out these resources: 11 | 12 | - [GitHub Repository: pgvectorscale](https://github.com/timescale/pgvectorscale) 13 | - [Blog Post: PostgreSQL and Pgvector: Now Faster Than Pinecone, 75% Cheaper, and 100% Open Source](https://www.timescale.com/blog/pgvector-is-now-as-fast-as-pinecone-at-75-less-cost/) 14 | - [Blog Post: RAG Is More Than Just Vector Search](https://www.timescale.com/blog/rag-is-more-than-just-vector-search/) 15 | - [Blog Post: A Python Library for Using PostgreSQL as a Vector Database in AI Applications](https://www.timescale.com/blog/a-python-library-for-using-postgresql-as-a-vector-database-in-ai-applications/) 16 | 17 | ## Why PostgreSQL? 
18 | 19 | Using PostgreSQL with pgvectorscale as your vector database offers several key advantages over dedicated vector databases: 20 | 21 | - PostgreSQL is a robust, open-source database with a rich ecosystem of tools, drivers, and connectors. This ensures transparency, community support, and continuous improvements. 22 | 23 | - By using PostgreSQL, you can manage both your relational and vector data within a single database. This reduces operational complexity, as there's no need to maintain and synchronize multiple databases. 24 | 25 | - Pgvectorscale enhances pgvector with faster search capabilities, higher recall, and efficient time-based filtering. It leverages advanced indexing techniques, such as the DiskANN-inspired index, to significantly speed up Approximate Nearest Neighbor (ANN) searches. 26 | 27 | Pgvectorscale builds on top of [pgvector](https://github.com/pgvector/pgvector), offering improved performance and additional features, making PostgreSQL a powerful and versatile choice for AI applications. 28 | 29 | ## Prerequisites 30 | 31 | - Docker 32 | - Python 3.7+ 33 | - OpenAI API key 34 | - PostgreSQL GUI client 35 | 36 | ## Steps 37 | 38 | 1. Set up Docker environment 39 | 2. Connect to the database using a PostgreSQL GUI client (I use TablePlus) 40 | 3. Create a Python script to insert document chunks as vectors using OpenAI embeddings 41 | 4. Create a Python function to perform similarity search 42 | 43 | ## Detailed Instructions 44 | 45 | ### 1. 
Set up Docker environment 46 | 47 | Create a `docker-compose.yml` file with the following content: 48 | 49 | ```yaml 50 | services: 51 | timescaledb: 52 | image: timescale/timescaledb-ha:pg16 53 | container_name: timescaledb 54 | environment: 55 | - POSTGRES_DB=postgres 56 | - POSTGRES_PASSWORD=password 57 | ports: 58 | - "5432:5432" 59 | volumes: 60 | - timescaledb_data:/var/lib/postgresql/data 61 | restart: unless-stopped 62 | 63 | volumes: 64 | timescaledb_data: 65 | ``` 66 | 67 | Run the Docker container: 68 | 69 | ```bash 70 | docker compose up -d 71 | ``` 72 | 73 | ### 2. Connect to the database using a PostgreSQL GUI client 74 | 75 | - Open client 76 | - Create a new connection with the following details: 77 | - Host: localhost 78 | - Port: 5432 79 | - User: postgres 80 | - Password: password 81 | - Database: postgres 82 | 83 | ### 3. Create a Python script to insert document chunks as vectors 84 | 85 | See `insert_vectors.py` for the implementation. This script uses OpenAI's `text-embedding-3-small` model to generate embeddings. 86 | 87 | ### 4. Create a Python function to perform similarity search 88 | 89 | See `similarity_search.py` for the implementation. This script also uses OpenAI's `text-embedding-3-small` model for query embedding. 90 | 91 | ## Usage 92 | 93 | 1. Create a copy of `example.env` and rename it to `.env` 94 | 2. Open `.env` and fill in your OpenAI API key. Leave the database settings as is 95 | 3. Run the Docker container 96 | 4. Install the required Python packages using `pip install -r requirements.txt` 97 | 5. Execute `insert_vectors.py` to populate the database 98 | 6. Play with `similarity_search.py` to perform similarity searches 99 | 100 | ## Using ANN search indexes to speed up queries 101 | 102 | Timescale Vector offers indexing options to accelerate similarity queries, particularly beneficial for large vector datasets (10k+ vectors): 103 | 104 | 1. 
Supported indexes: 105 | - timescale_vector_index (default): A DiskANN-inspired graph index 106 | - pgvector's HNSW: Hierarchical Navigable Small World graph index 107 | - pgvector's IVFFLAT: Inverted file index 108 | 109 | 2. The DiskANN-inspired index is Timescale's latest offering, providing improved performance. Refer to the [Timescale Vector explainer blog](https://www.timescale.com/blog/pgvector-is-now-as-fast-as-pinecone-at-75-less-cost/) for detailed information and benchmarks. 110 | 111 | For optimal query performance, creating an index on the embedding column is recommended, especially for large vector datasets. 112 | 113 | ## Cosine Similarity in Vector Search 114 | 115 | ### What is Cosine Similarity? 116 | 117 | Cosine similarity measures the cosine of the angle between two vectors in a multi-dimensional space. It's a measure of orientation rather than magnitude. 118 | 119 | - Range: -1 to 1 (for normalized vectors, which is typical in text embeddings) 120 | - 1: Vectors point in the same direction (most similar) 121 | - 0: Vectors are orthogonal (unrelated) 122 | - -1: Vectors point in opposite directions (most dissimilar) 123 | 124 | ### Cosine Distance 125 | 126 | In pgvector, the `<=>` operator computes cosine distance, which is 1 - cosine similarity. 127 | 128 | - Range: 0 to 2 129 | - 0: Identical vectors (most similar) 130 | - 1: Orthogonal vectors 131 | - 2: Opposite vectors (most dissimilar) 132 | 133 | ### Interpreting Results 134 | 135 | When you get results from similarity_search: 136 | 137 | - Lower distance values indicate higher similarity. 138 | - A distance of 0 would mean exact match (rarely happens with embeddings). 139 | - Distances closer to 0 indicate high similarity. 140 | - Distances around 1 suggest little to no similarity. 141 | - Distances approaching 2 indicate opposite meanings (rare in practice). 
# /app/database/vector_store.py
"""Vector store built on the Timescale Vector client and OpenAI embeddings."""

import logging
import time
from datetime import datetime
from typing import Any, List, Optional, Tuple, Union

import pandas as pd
from config.settings import get_settings
from openai import OpenAI
from timescale_vector import client


class VectorStore:
    """A class for managing vector operations and database interactions."""

    def __init__(self):
        """Initialize the VectorStore with settings, OpenAI client, and Timescale Vector client."""
        self.settings = get_settings()
        self.openai_client = OpenAI(api_key=self.settings.openai.api_key)
        self.embedding_model = self.settings.openai.embedding_model
        self.vector_settings = self.settings.vector_store
        self.vec_client = client.Sync(
            self.settings.database.service_url,
            self.vector_settings.table_name,
            self.vector_settings.embedding_dimensions,
            time_partition_interval=self.vector_settings.time_partition_interval,
        )

    def get_embedding(self, text: str) -> List[float]:
        """
        Generate embedding for the given text.

        Args:
            text: The input text to generate an embedding for.

        Returns:
            A list of floats representing the embedding.
        """
        # Newlines are flattened to spaces before the text is sent for embedding.
        text = text.replace("\n", " ")
        start_time = time.time()
        embedding = (
            self.openai_client.embeddings.create(
                input=[text],
                model=self.embedding_model,
            )
            .data[0]
            .embedding
        )
        elapsed_time = time.time() - start_time
        logging.info(f"Embedding generated in {elapsed_time:.3f} seconds")
        return embedding

    def create_tables(self) -> None:
        """Create the necessary tables in the database."""
        self.vec_client.create_tables()

    def create_index(self) -> None:
        """Create the DiskANN embedding index to speed up similarity search."""
        self.vec_client.create_embedding_index(client.DiskAnnIndex())

    def drop_index(self) -> None:
        """Drop the embedding index in the database."""
        self.vec_client.drop_embedding_index()

    def upsert(self, df: pd.DataFrame) -> None:
        """
        Insert or update records in the database from a pandas DataFrame.

        An empty DataFrame is a no-op: nothing is sent to the database.

        Args:
            df: A pandas DataFrame containing the data to insert or update.
                Expected columns: id, metadata, contents, embedding
        """
        if df.empty:
            # Nothing to write; avoid a pointless round trip to the database.
            logging.info(
                f"Inserted {len(df)} records into {self.vector_settings.table_name}"
            )
            return

        records = df.to_records(index=False)
        self.vec_client.upsert(list(records))
        logging.info(
            f"Inserted {len(df)} records into {self.vector_settings.table_name}"
        )

    def search(
        self,
        query_text: str,
        limit: int = 5,
        metadata_filter: Optional[Union[dict, List[dict]]] = None,
        predicates: Optional[client.Predicates] = None,
        time_range: Optional[Tuple[datetime, datetime]] = None,
        return_dataframe: bool = True,
    ) -> Union[List[Tuple[Any, ...]], pd.DataFrame]:
        """
        Query the vector database for similar embeddings based on input text.

        More info:
            https://github.com/timescale/docs/blob/latest/ai/python-interface-for-pgvector-and-timescale-vector.md

        Args:
            query_text: The input text to search for.
            limit: The maximum number of results to return.
            metadata_filter: A dictionary or list of dictionaries for equality-based metadata filtering.
            predicates: A Predicates object for complex metadata filtering.
                - Predicates objects are defined by the name of the metadata key, an operator, and a value.
                - Operators: ==, !=, >, >=, <, <=
                - & is used to combine multiple predicates with AND operator.
                - | is used to combine multiple predicates with OR operator.
            time_range: A tuple of (start_date, end_date) to filter results by time.
                It is forwarded to the client as a ``client.UUIDTimeRange``.
            return_dataframe: Whether to return results as a DataFrame (default: True).

        Returns:
            Either a list of tuples or a pandas DataFrame containing the search results.

        Basic Examples:
            Basic search:
                vector_store.search("What are your shipping options?")
            Search with metadata filter:
                vector_store.search("Shipping options", metadata_filter={"category": "Shipping"})

        Predicates Examples:
            Search with predicates:
                vector_store.search("Pricing", predicates=client.Predicates("price", ">", 100))
            Search with complex combined predicates:
                complex_pred = (
                    (client.Predicates("category", "==", "Electronics") & client.Predicates("price", "<", 1000))
                    | (client.Predicates("category", "==", "Books") & client.Predicates("rating", ">=", 4.5))
                )
                vector_store.search("High-quality products", predicates=complex_pred)

        Time-based filtering:
            Search with time range:
                vector_store.search("Recent updates", time_range=(datetime(2024, 1, 1), datetime(2024, 1, 31)))
        """
        query_embedding = self.get_embedding(query_text)

        # Only the database query is timed; embedding generation logs its own timing.
        start_time = time.time()

        search_args = {
            "limit": limit,
        }

        # Optional filters are forwarded only when supplied (truthy).
        if metadata_filter:
            search_args["filter"] = metadata_filter

        if predicates:
            search_args["predicates"] = predicates

        if time_range:
            start_date, end_date = time_range
            search_args["uuid_time_filter"] = client.UUIDTimeRange(start_date, end_date)

        results = self.vec_client.search(query_embedding, **search_args)
        elapsed_time = time.time() - start_time

        logging.info(f"Vector search completed in {elapsed_time:.3f} seconds")

        if return_dataframe:
            return self._create_dataframe_from_results(results)
        return results

    def _create_dataframe_from_results(
        self,
        results: List[Tuple[Any, ...]],
    ) -> pd.DataFrame:
        """
        Create a pandas DataFrame from the search results.

        Args:
            results: A list of tuples containing the search results.
                Assumes each tuple is (id, metadata, content, embedding, distance),
                matching the column names assigned below.

        Returns:
            A pandas DataFrame containing the formatted search results, with the
            metadata column expanded into one column per metadata key.
        """
        # Convert results to DataFrame
        df = pd.DataFrame(
            results, columns=["id", "metadata", "content", "embedding", "distance"]
        )

        # Expand metadata column
        df = pd.concat(
            [df.drop(["metadata"], axis=1), df["metadata"].apply(pd.Series)], axis=1
        )

        # Convert id to string for better readability
        df["id"] = df["id"].astype(str)

        return df

    def delete(
        self,
        ids: Optional[List[str]] = None,
        metadata_filter: Optional[dict] = None,
        delete_all: bool = False,
    ) -> None:
        """Delete records from the vector database.

        Args:
            ids (List[str], optional): A list of record IDs to delete.
            metadata_filter (dict, optional): A dictionary of metadata key-value pairs to filter records for deletion.
            delete_all (bool, optional): A boolean flag to delete all records.

        Raises:
            ValueError: If no deletion criteria are provided or if multiple criteria are provided.

        Examples:
            Delete by IDs:
                vector_store.delete(ids=["8ab544ae-766a-11ef-81cb-decf757b836d"])

            Delete by metadata filter:
                vector_store.delete(metadata_filter={"category": "Shipping"})

            Delete all records:
                vector_store.delete(delete_all=True)
        """
        # Exactly one criterion must be truthy; an empty list/dict counts as "not given".
        if sum(bool(x) for x in (ids, metadata_filter, delete_all)) != 1:
            raise ValueError(
                "Provide exactly one of: ids, metadata_filter, or delete_all"
            )

        if delete_all:
            self.vec_client.delete_all()
            logging.info(f"Deleted all records from {self.vector_settings.table_name}")
        elif ids:
            self.vec_client.delete_by_ids(ids)
            logging.info(
                f"Deleted {len(ids)} records from {self.vector_settings.table_name}"
            )
        elif metadata_filter:
            self.vec_client.delete_by_metadata(metadata_filter)
            logging.info(
                f"Deleted records matching metadata filter from {self.vector_settings.table_name}"
            )