├── app
    ├── services
    │   ├── __init__.py
    │   ├── llm_factory.py
    │   └── synthesizer.py
    ├── example.env
    ├── insert_vectors.py
    ├── config
    │   └── settings.py
    ├── similarity_search.py
    └── database
    │   └── vector_store.py
├── requirements.txt
├── docker
    └── docker-compose.yml
├── LICENCE
├── .gitignore
├── data
    └── faq_dataset.csv
└── README.md


/app/services/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | openai
3 | psycopg
4 | python-dotenv
5 | timescale-vector
6 | instructor
7 | anthropic


--------------------------------------------------------------------------------
/app/example.env:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=
2 | TIMESCALE_SERVICE_URL=postgres://postgres:password@localhost:5432/postgres
3 | 


--------------------------------------------------------------------------------
/docker/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | name: timescaledb
 2 | 
 3 | services:
 4 |   timescaledb:
 5 |     image: timescale/timescaledb-ha:pg16
 6 |     container_name: timescaledb
 7 |     environment:
 8 |       - POSTGRES_DB=postgres
 9 |       - POSTGRES_PASSWORD=password
10 |     ports:
11 |       - "5432:5432"
12 |     volumes:
13 |       - timescaledb_data:/var/lib/postgresql/data
14 |     restart: unless-stopped
15 | 
16 | volumes:
17 |   timescaledb_data:


--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Datalumina B.V.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 
59 | # DotEnv configuration
60 | .env
61 | 
62 | # Database
63 | *.db
64 | *.rdb
65 | 
66 | # Pycharm
67 | .idea
68 | 
69 | # VS Code
70 | .vscode/
71 | *.code-workspace
72 | 
73 | # Spyder
74 | .spyproject/
75 | 
76 | # Jupyter NB Checkpoints
77 | .ipynb_checkpoints/
78 | 
79 | # Mac OS-specific storage files
80 | .DS_Store
81 | 
82 | # vim
83 | *.swp
84 | *.swo
85 | 
86 | # Mypy cache
87 | .mypy_cache/
88 | 
89 | # Exclude virtual environment
90 | .venv/
91 | 
92 | # Exclude trained models
93 | /models/
94 | 
95 | # exclude data from source control by default
96 | # /data/


--------------------------------------------------------------------------------
/app/services/llm_factory.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict, List, Type
 2 | 
 3 | import instructor
 4 | from anthropic import Anthropic
 5 | from openai import OpenAI
 6 | from pydantic import BaseModel
 7 | 
 8 | from config.settings import get_settings
 9 | 
10 | 
11 | class LLMFactory:
12 |     def __init__(self, provider: str):
13 |         self.provider = provider
14 |         self.settings = getattr(get_settings(), provider)
15 |         self.client = self._initialize_client()
16 | 
17 |     def _initialize_client(self) -> Any:
18 |         client_initializers = {
19 |             "openai": lambda s: instructor.from_openai(OpenAI(api_key=s.api_key)),
20 |             "anthropic": lambda s: instructor.from_anthropic(
21 |                 Anthropic(api_key=s.api_key)
22 |             ),
23 |             "llama": lambda s: instructor.from_openai(
24 |                 OpenAI(base_url=s.base_url, api_key=s.api_key),
25 |                 mode=instructor.Mode.JSON,
26 |             ),
27 |         }
28 | 
29 |         initializer = client_initializers.get(self.provider)
30 |         if initializer:
31 |             return initializer(self.settings)
32 |         raise ValueError(f"Unsupported LLM provider: {self.provider}")
33 | 
34 |     def create_completion(
35 |         self, response_model: Type[BaseModel], messages: List[Dict[str, str]], **kwargs
36 |     ) -> Any:
37 |         completion_params = {
38 |             "model": kwargs.get("model", self.settings.default_model),
39 |             "temperature": kwargs.get("temperature", self.settings.temperature),
40 |             "max_retries": kwargs.get("max_retries", self.settings.max_retries),
41 |             "max_tokens": kwargs.get("max_tokens", self.settings.max_tokens),
42 |             "response_model": response_model,
43 |             "messages": messages,
44 |         }
45 |         return self.client.chat.completions.create(**completion_params)
46 | 


--------------------------------------------------------------------------------
/app/insert_vectors.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | import pandas as pd
 4 | from database.vector_store import VectorStore
 5 | from timescale_vector.client import uuid_from_time
 6 | 
 7 | # Initialize VectorStore
 8 | vec = VectorStore()
 9 | 
10 | # Read the CSV file
11 | df = pd.read_csv("../data/faq_dataset.csv", sep=";")
12 | 
13 | 
14 | # Prepare data for insertion
15 | def prepare_record(row):
16 |     """Prepare a record for insertion into the vector store.
17 | 
18 |     This function creates a record with a UUID version 1 as the ID, which captures
19 |     the current time or a specified time.
20 | 
21 |     Note:
22 |         - By default, this function uses the current time for the UUID.
23 |         - To use a specific time:
24 |           1. Import the datetime module.
25 |           2. Create a datetime object for your desired time.
26 |           3. Use uuid_from_time(your_datetime) instead of uuid_from_time(datetime.now()).
27 | 
28 |         Example:
29 |             from datetime import datetime
30 |             specific_time = datetime(2023, 1, 1, 12, 0, 0)
31 |             id = str(uuid_from_time(specific_time))
32 | 
33 |         This is useful when your content already has an associated datetime.
34 |     """
35 |     content = f"Question: {row['question']}\nAnswer: {row['answer']}"
36 |     embedding = vec.get_embedding(content)
37 |     return pd.Series(
38 |         {
39 |             "id": str(uuid_from_time(datetime.now())),
40 |             "metadata": {
41 |                 "category": row["category"],
42 |                 "created_at": datetime.now().isoformat(),
43 |             },
44 |             "contents": content,
45 |             "embedding": embedding,
46 |         }
47 |     )
48 | 
49 | 
50 | records_df = df.apply(prepare_record, axis=1)
51 | 
52 | # Create tables and insert data
53 | vec.create_tables()
54 | vec.create_index()  # DiskAnnIndex
55 | vec.upsert(records_df)
56 | 


--------------------------------------------------------------------------------
/app/config/settings.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | from datetime import timedelta
 4 | from functools import lru_cache
 5 | from typing import Optional
 6 | 
 7 | from dotenv import load_dotenv
 8 | from pydantic import BaseModel, Field
 9 | 
10 | load_dotenv(dotenv_path="./.env")
11 | 
12 | 
13 | def setup_logging():
14 |     """Configure basic logging for the application."""
15 |     logging.basicConfig(
16 |         level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
17 |     )
18 | 
19 | 
20 | class LLMSettings(BaseModel):
21 |     """Base settings for Language Model configurations."""
22 | 
23 |     temperature: float = 0.0
24 |     max_tokens: Optional[int] = None
25 |     max_retries: int = 3
26 | 
27 | 
28 | class OpenAISettings(LLMSettings):
29 |     """OpenAI-specific settings extending LLMSettings."""
30 | 
31 |     api_key: str = Field(default_factory=lambda: os.getenv("OPENAI_API_KEY"))
32 |     default_model: str = Field(default="gpt-4o")
33 |     embedding_model: str = Field(default="text-embedding-3-small")
34 | 
35 | 
36 | class DatabaseSettings(BaseModel):
37 |     """Database connection settings."""
38 | 
39 |     service_url: str = Field(default_factory=lambda: os.getenv("TIMESCALE_SERVICE_URL"))
40 | 
41 | 
42 | class VectorStoreSettings(BaseModel):
43 |     """Settings for the VectorStore."""
44 | 
45 |     table_name: str = "embeddings"
46 |     embedding_dimensions: int = 1536
47 |     time_partition_interval: timedelta = timedelta(days=7)
48 | 
49 | 
50 | class Settings(BaseModel):
51 |     """Main settings class combining all sub-settings."""
52 | 
53 |     openai: OpenAISettings = Field(default_factory=OpenAISettings)
54 |     database: DatabaseSettings = Field(default_factory=DatabaseSettings)
55 |     vector_store: VectorStoreSettings = Field(default_factory=VectorStoreSettings)
56 | 
57 | 
58 | @lru_cache()
59 | def get_settings() -> Settings:
60 |     """Create and return a cached instance of the Settings."""
61 |     settings = Settings()
62 |     setup_logging()
63 |     return settings
64 | 


--------------------------------------------------------------------------------
/data/faq_dataset.csv:
--------------------------------------------------------------------------------
 1 | question;answer;category
 2 | What are your shipping options?;We offer standard (3-5 business days) and express (1-2 business days) shipping options.;Shipping
 3 | How can I track my order?;You can track your order by logging into your account and viewing the order status or using the tracking number sent to your email.;Order Management
 4 | What is your return policy?;We offer a 30-day return policy for most items. Please ensure the item is unused and in its original packaging.;Returns
 5 | Do you offer international shipping?;Yes, we ship to most countries worldwide. Shipping costs and delivery times vary by location.;Shipping
 6 | How do I cancel an order?;You can cancel an order within 24 hours of placement by contacting our customer service team.;Order Management
 7 | What payment methods do you accept?;We accept major credit cards, PayPal, and Apple Pay.;Payment
 8 | Are your products authentic?;Yes, all our products are 100% authentic and sourced directly from authorized distributors.;Product Information
 9 | How do I contact customer service?;You can reach our customer service team via email, phone, or live chat on our website.;Customer Support
10 | Do you offer gift wrapping?;Yes, we offer gift wrapping for a small additional fee. You can select this option during checkout.;Services
11 | What is your price match policy?;We offer price matching for identical items from authorized retailers within 14 days of purchase.;Pricing
12 | How do I create an account?;You can create an account by clicking the "Sign Up" button on our homepage and following the prompts.;Account Management
13 | Do you have a loyalty program?;Yes, we have a rewards program where you earn points on purchases that can be redeemed for discounts.;Rewards
14 | What should I do if I receive a damaged item?;Please contact our customer service team immediately with photos of the damaged item and packaging.;Returns
15 | How can I check the status of my refund?;Refund status can be checked in your account under "Order History" or by contacting customer service.;Returns
16 | Do you offer discounts for bulk orders?;Yes, we offer discounts for bulk orders. Please contact our sales team for a custom quote.;Pricing
17 | How do I unsubscribe from your email newsletter?;You can unsubscribe by clicking the "Unsubscribe" link at the bottom of any of our email newsletters.;Communication
18 | What is your warranty policy?;Warranty policies vary by product. Please check the product page or contact us for specific warranty information.;Product Information
19 | Do you have physical store locations?;We are an online-only store and do not have physical retail locations at this time.;Company Information
20 | How do I apply a promotional code to my order?;You can enter your promotional code in the designated field during the checkout process.;Payment
21 | What is your privacy policy regarding customer data?;We take data privacy seriously. Our detailed privacy policy can be found on our website footer.;Privacy


--------------------------------------------------------------------------------
/app/similarity_search.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from database.vector_store import VectorStore
 3 | from services.synthesizer import Synthesizer
 4 | from timescale_vector import client
 5 | 
 6 | # Initialize VectorStore
 7 | vec = VectorStore()
 8 | 
 9 | # --------------------------------------------------------------
10 | # Shipping question
11 | # --------------------------------------------------------------
12 | 
13 | relevant_question = "What are your shipping options?"
14 | results = vec.search(relevant_question, limit=3)
15 | 
16 | response = Synthesizer.generate_response(question=relevant_question, context=results)
17 | 
18 | print(f"\n{response.answer}")
19 | print("\nThought process:")
20 | for thought in response.thought_process:
21 |     print(f"- {thought}")
22 | print(f"\nContext: {response.enough_context}")
23 | 
24 | # --------------------------------------------------------------
25 | # Irrelevant question
26 | # --------------------------------------------------------------
27 | 
28 | irrelevant_question = "What is the weather in Tokyo?"
29 | 
30 | results = vec.search(irrelevant_question, limit=3)
31 | 
32 | response = Synthesizer.generate_response(question=irrelevant_question, context=results)
33 | 
34 | print(f"\n{response.answer}")
35 | print("\nThought process:")
36 | for thought in response.thought_process:
37 |     print(f"- {thought}")
38 | print(f"\nContext: {response.enough_context}")
39 | 
40 | # --------------------------------------------------------------
41 | # Metadata filtering
42 | # --------------------------------------------------------------
43 | 
44 | metadata_filter = {"category": "Shipping"}
45 | 
46 | results = vec.search(relevant_question, limit=3, metadata_filter=metadata_filter)
47 | 
48 | response = Synthesizer.generate_response(question=relevant_question, context=results)
49 | 
50 | print(f"\n{response.answer}")
51 | print("\nThought process:")
52 | for thought in response.thought_process:
53 |     print(f"- {thought}")
54 | print(f"\nContext: {response.enough_context}")
55 | 
56 | # --------------------------------------------------------------
57 | # Advanced filtering using Predicates
58 | # --------------------------------------------------------------
59 | 
60 | predicates = client.Predicates("category", "==", "Shipping")
61 | results = vec.search(relevant_question, limit=3, predicates=predicates)
62 | 
63 | 
64 | predicates = client.Predicates("category", "==", "Shipping") | client.Predicates(
65 |     "category", "==", "Services"
66 | )
67 | results = vec.search(relevant_question, limit=3, predicates=predicates)
68 | 
69 | 
70 | predicates = client.Predicates("category", "==", "Shipping") & client.Predicates(
71 |     "created_at", ">", "2024-09-01"
72 | )
73 | results = vec.search(relevant_question, limit=3, predicates=predicates)
74 | 
75 | # --------------------------------------------------------------
76 | # Time-based filtering
77 | # --------------------------------------------------------------
78 | 
79 | # September — Returning results
80 | time_range = (datetime(2024, 9, 1), datetime(2024, 9, 30))
81 | results = vec.search(relevant_question, limit=3, time_range=time_range)
82 | 
83 | # August — Not returning any results
84 | time_range = (datetime(2024, 8, 1), datetime(2024, 8, 30))
85 | results = vec.search(relevant_question, limit=3, time_range=time_range)
86 | 


--------------------------------------------------------------------------------
/app/services/synthesizer.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import pandas as pd
 3 | from pydantic import BaseModel, Field
 4 | from services.llm_factory import LLMFactory
 5 | 
 6 | 
 7 | class SynthesizedResponse(BaseModel):
 8 |     thought_process: List[str] = Field(
 9 |         description="List of thoughts that the AI assistant had while synthesizing the answer"
10 |     )
11 |     answer: str = Field(description="The synthesized answer to the user's question")
12 |     enough_context: bool = Field(
13 |         description="Whether the assistant has enough context to answer the question"
14 |     )
15 | 
16 | 
17 | class Synthesizer:
18 |     SYSTEM_PROMPT = """
19 |     # Role and Purpose
20 |     You are an AI assistant for an e-commerce FAQ system. Your task is to synthesize a coherent and helpful answer 
21 |     based on the given question and relevant context retrieved from a knowledge database.
22 | 
23 |     # Guidelines:
24 |     1. Provide a clear and concise answer to the question.
25 |     2. Use only the information from the relevant context to support your answer.
26 |     3. The context is retrieved based on cosine similarity, so some information might be missing or irrelevant.
27 |     4. Be transparent when there is insufficient information to fully answer the question.
28 |     5. Do not make up or infer information not present in the provided context.
29 |     6. If you cannot answer the question based on the given context, clearly state that.
30 |     7. Maintain a helpful and professional tone appropriate for customer service.
31 |     8. Adhere strictly to company guidelines and policies by using only the provided knowledge base.
32 |     
33 |     Review the question from the user:
34 |     """
35 | 
36 |     @staticmethod
37 |     def generate_response(question: str, context: pd.DataFrame) -> SynthesizedResponse:
38 |         """Generates a synthesized response based on the question and context.
39 | 
40 |         Args:
41 |             question: The user's question.
42 |             context: The relevant context retrieved from the knowledge base.
43 | 
44 |         Returns:
45 |             A SynthesizedResponse containing the thought process, the answer, and the enough_context flag.
46 |         """
47 |         context_str = Synthesizer.dataframe_to_json(
48 |             context, columns_to_keep=["content", "category"]
49 |         )
50 | 
51 |         messages = [
52 |             {"role": "system", "content": Synthesizer.SYSTEM_PROMPT},
53 |             {"role": "user", "content": f"# User question:\n{question}"},
54 |             {
55 |                 "role": "assistant",
56 |                 "content": f"# Retrieved information:\n{context_str}",
57 |             },
58 |         ]
59 | 
60 |         llm = LLMFactory("openai")
61 |         return llm.create_completion(
62 |             response_model=SynthesizedResponse,
63 |             messages=messages,
64 |         )
65 | 
66 |     @staticmethod
67 |     def dataframe_to_json(
68 |         context: pd.DataFrame,
69 |         columns_to_keep: List[str],
70 |     ) -> str:
71 |         """
72 |         Convert the context DataFrame to a JSON string.
73 | 
74 |         Args:
75 |             context (pd.DataFrame): The context DataFrame.
76 |             columns_to_keep (List[str]): The columns to include in the output.
77 | 
78 |         Returns:
79 |             str: A JSON string representation of the selected columns.
80 |         """
81 |         return context[columns_to_keep].to_json(orient="records", indent=2)
82 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Building a High-Performance RAG Solution with Pgvectorscale and Python
  2 | 
  3 | This tutorial will guide you through setting up and using `pgvectorscale` with Docker and Python, leveraging OpenAI's powerful `text-embedding-3-small` model for embeddings. You'll learn to build a cutting-edge RAG (Retrieval-Augmented Generation) solution, combining advanced retrieval techniques (including hybrid search) with intelligent answer generation based on the retrieved context. Perfect for AI engineers looking to enhance their projects with state-of-the-art vector search and generation capabilities, all powered by PostgreSQL.
  4 | 
  5 | ## YouTube Tutorial
  6 | You can watch the full tutorial here on [YouTube](https://youtu.be/hAdEuDBN57g).
  7 | 
  8 | ## Pgvectorscale Documentation
  9 | 
 10 | For more information about using PostgreSQL as a vector database in AI applications with Timescale, check out these resources:
 11 | 
 12 | - [GitHub Repository: pgvectorscale](https://github.com/timescale/pgvectorscale)
 13 | - [Blog Post: PostgreSQL and Pgvector: Now Faster Than Pinecone, 75% Cheaper, and 100% Open Source](https://www.timescale.com/blog/pgvector-is-now-as-fast-as-pinecone-at-75-less-cost/)
 14 | - [Blog Post: RAG Is More Than Just Vector Search](https://www.timescale.com/blog/rag-is-more-than-just-vector-search/)
 15 | - [Blog Post: A Python Library for Using PostgreSQL as a Vector Database in AI Applications](https://www.timescale.com/blog/a-python-library-for-using-postgresql-as-a-vector-database-in-ai-applications/)
 16 | 
 17 | ## Why PostgreSQL?
 18 | 
 19 | Using PostgreSQL with pgvectorscale as your vector database offers several key advantages over dedicated vector databases:
 20 | 
 21 | - PostgreSQL is a robust, open-source database with a rich ecosystem of tools, drivers, and connectors. This ensures transparency, community support, and continuous improvements.
 22 | 
 23 | - By using PostgreSQL, you can manage both your relational and vector data within a single database. This reduces operational complexity, as there's no need to maintain and synchronize multiple databases.
 24 | 
 25 | - Pgvectorscale enhances pgvector with faster search capabilities, higher recall, and efficient time-based filtering. It leverages advanced indexing techniques, such as the DiskANN-inspired index, to significantly speed up Approximate Nearest Neighbor (ANN) searches.
 26 | 
 27 | Pgvectorscale builds on top of [pgvector](https://github.com/pgvector/pgvector), offering improved performance and additional features, making PostgreSQL a powerful and versatile choice for AI applications.
 28 | 
 29 | ## Prerequisites
 30 | 
 31 | - Docker
 32 | - Python 3.7+
 33 | - OpenAI API key
 34 | - PostgreSQL GUI client
 35 | 
 36 | ## Steps
 37 | 
 38 | 1. Set up Docker environment
 39 | 2. Connect to the database using a PostgreSQL GUI client (I use TablePlus)
 40 | 3. Create a Python script to insert document chunks as vectors using OpenAI embeddings
 41 | 4. Create a Python function to perform similarity search
 42 | 
 43 | ## Detailed Instructions
 44 | 
 45 | ### 1. Set up Docker environment
 46 | 
 47 | Create a `docker-compose.yml` file with the following content:
 48 | 
 49 | ```yaml
 50 | services:
 51 |   timescaledb:
 52 |     image: timescale/timescaledb-ha:pg16
 53 |     container_name: timescaledb
 54 |     environment:
 55 |       - POSTGRES_DB=postgres
 56 |       - POSTGRES_PASSWORD=password
 57 |     ports:
 58 |       - "5432:5432"
 59 |     volumes:
 60 |       - timescaledb_data:/var/lib/postgresql/data
 61 |     restart: unless-stopped
 62 | 
 63 | volumes:
 64 |   timescaledb_data:
 65 | ```
 66 | 
 67 | Run the Docker container:
 68 | 
 69 | ```bash
 70 | docker compose up -d
 71 | ```
 72 | 
 73 | ### 2. Connect to the database using a PostgreSQL GUI client
 74 | 
 75 | - Open your PostgreSQL GUI client
 76 | - Create a new connection with the following details:
 77 |   - Host: localhost
 78 |   - Port: 5432
 79 |   - User: postgres
 80 |   - Password: password
 81 |   - Database: postgres
 82 | 
 83 | ### 3. Create a Python script to insert document chunks as vectors
 84 | 
 85 | See `insert_vectors.py` for the implementation. This script uses OpenAI's `text-embedding-3-small` model to generate embeddings.
 86 | 
 87 | ### 4. Create a Python function to perform similarity search
 88 | 
 89 | See `similarity_search.py` for the implementation. This script also uses OpenAI's `text-embedding-3-small` model for query embedding.
 90 | 
 91 | ## Usage
 92 | 
 93 | 1. Create a copy of `example.env` and rename it to `.env`
 94 | 2. Open `.env` and fill in your OpenAI API key. Leave the database settings as is
 95 | 3. Run the Docker container
 96 | 4. Install the required Python packages using `pip install -r requirements.txt`
 97 | 5. Execute `insert_vectors.py` to populate the database
 98 | 6. Play with `similarity_search.py` to perform similarity searches
 99 | 
100 | ## Using ANN search indexes to speed up queries
101 | 
102 | Timescale Vector offers indexing options to accelerate similarity queries, particularly beneficial for large vector datasets (10k+ vectors):
103 | 
104 | 1. Supported indexes:
105 |    - timescale_vector_index (default): A DiskANN-inspired graph index
106 |    - pgvector's HNSW: Hierarchical Navigable Small World graph index
107 |    - pgvector's IVFFLAT: Inverted file index
108 | 
109 | 2. The DiskANN-inspired index is Timescale's latest offering, providing improved performance. Refer to the [Timescale Vector explainer blog](https://www.timescale.com/blog/pgvector-is-now-as-fast-as-pinecone-at-75-less-cost/) for detailed information and benchmarks.
110 | 
111 | For optimal query performance, creating an index on the embedding column is recommended, especially for large vector datasets.
112 | 
113 | ## Cosine Similarity in Vector Search
114 | 
115 | ### What is Cosine Similarity?
116 | 
117 | Cosine similarity measures the cosine of the angle between two vectors in a multi-dimensional space. It's a measure of orientation rather than magnitude.
118 | 
119 | - Range: -1 to 1 (this holds for any non-zero vectors; text embeddings are typically normalized as well)
120 | - 1: Vectors point in the same direction (most similar)
121 | - 0: Vectors are orthogonal (unrelated)
122 | - -1: Vectors point in opposite directions (most dissimilar)
123 | 
124 | ### Cosine Distance
125 | 
126 | In pgvector, the `<=>` operator computes cosine distance, which is 1 - cosine similarity.
127 | 
128 | - Range: 0 to 2
129 | - 0: Identical vectors (most similar)
130 | - 1: Orthogonal vectors
131 | - 2: Opposite vectors (most dissimilar)
132 | 
133 | ### Interpreting Results
134 | 
135 | When you get results from similarity_search:
136 | 
137 | - Lower distance values indicate higher similarity.
138 | - A distance of 0 means an exact match (this rarely happens with embeddings).
139 | - Distances closer to 0 indicate high similarity.
140 | - Distances around 1 suggest little to no similarity.
141 | - Distances approaching 2 indicate opposite meanings (rare in practice).
142 | 


--------------------------------------------------------------------------------
/app/database/vector_store.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import time
  3 | from typing import Any, List, Optional, Tuple, Union
  4 | from datetime import datetime
  5 | 
  6 | import pandas as pd
  7 | from config.settings import get_settings
  8 | from openai import OpenAI
  9 | from timescale_vector import client
 10 | 
 11 | 
 12 | class VectorStore:
 13 |     """A class for managing vector operations and database interactions."""
 14 | 
 15 |     def __init__(self):
 16 |         """Initialize the VectorStore with settings, OpenAI client, and Timescale Vector client."""
 17 |         self.settings = get_settings()
 18 |         self.openai_client = OpenAI(api_key=self.settings.openai.api_key)
 19 |         self.embedding_model = self.settings.openai.embedding_model
 20 |         self.vector_settings = self.settings.vector_store
 21 |         self.vec_client = client.Sync(
 22 |             self.settings.database.service_url,
 23 |             self.vector_settings.table_name,
 24 |             self.vector_settings.embedding_dimensions,
 25 |             time_partition_interval=self.vector_settings.time_partition_interval,
 26 |         )
 27 | 
 28 |     def get_embedding(self, text: str) -> List[float]:
 29 |         """
 30 |         Generate embedding for the given text.
 31 | 
 32 |         Args:
 33 |             text: The input text to generate an embedding for.
 34 | 
 35 |         Returns:
 36 |             A list of floats representing the embedding.
 37 |         """
 38 |         text = text.replace("\n", " ")
 39 |         start_time = time.time()
 40 |         embedding = (
 41 |             self.openai_client.embeddings.create(
 42 |                 input=[text],
 43 |                 model=self.embedding_model,
 44 |             )
 45 |             .data[0]
 46 |             .embedding
 47 |         )
 48 |         elapsed_time = time.time() - start_time
 49 |         logging.info(f"Embedding generated in {elapsed_time:.3f} seconds")
 50 |         return embedding
 51 | 
 52 |     def create_tables(self) -> None:
 53 |         """Create the necessary tables in the database"""
 54 |         self.vec_client.create_tables()
 55 | 
 56 |     def create_index(self) -> None:
 57 |         """Create the StreamingDiskANN index to speed up similarity search"""
 58 |         self.vec_client.create_embedding_index(client.DiskAnnIndex())
 59 | 
 60 |     def drop_index(self) -> None:
 61 |         """Drop the StreamingDiskANN index in the database"""
 62 |         self.vec_client.drop_embedding_index()
 63 | 
 64 |     def upsert(self, df: pd.DataFrame) -> None:
 65 |         """
 66 |         Insert or update records in the database from a pandas DataFrame.
 67 | 
 68 |         Args:
 69 |             df: A pandas DataFrame containing the data to insert or update.
 70 |                 Expected columns: id, metadata, contents, embedding
 71 |         """
 72 |         records = df.to_records(index=False)
 73 |         self.vec_client.upsert(list(records))
 74 |         logging.info(
 75 |             f"Inserted {len(df)} records into {self.vector_settings.table_name}"
 76 |         )
 77 | 
 78 |     def search(
 79 |         self,
 80 |         query_text: str,
 81 |         limit: int = 5,
 82 |         metadata_filter: Union[dict, List[dict]] = None,
 83 |         predicates: Optional[client.Predicates] = None,
 84 |         time_range: Optional[Tuple[datetime, datetime]] = None,
 85 |         return_dataframe: bool = True,
 86 |     ) -> Union[List[Tuple[Any, ...]], pd.DataFrame]:
 87 |         """
 88 |         Query the vector database for similar embeddings based on input text.
 89 | 
 90 |         More info:
 91 |             https://github.com/timescale/docs/blob/latest/ai/python-interface-for-pgvector-and-timescale-vector.md
 92 | 
 93 |         Args:
 94 |             query_text: The input text to search for.
 95 |             limit: The maximum number of results to return.
 96 |             metadata_filter: A dictionary or list of dictionaries for equality-based metadata filtering.
 97 |             predicates: A Predicates object for complex metadata filtering.
 98 |                 - Predicates objects are defined by the name of the metadata key, an operator, and a value.
 99 |                 - Operators: ==, !=, >, >=, <, <=
100 |                 - & is used to combine multiple predicates with AND operator.
101 |                 - | is used to combine multiple predicates with OR operator.
102 |             time_range: A tuple of (start_date, end_date) to filter results by time.
103 |             return_dataframe: Whether to return results as a DataFrame (default: True).
104 | 
105 |         Returns:
106 |             Either a list of tuples or a pandas DataFrame containing the search results.
107 | 
108 |         Basic Examples:
109 |             Basic search:
110 |                 vector_store.search("What are your shipping options?")
111 |             Search with metadata filter:
112 |                 vector_store.search("Shipping options", metadata_filter={"category": "Shipping"})
113 |         
114 |         Predicates Examples:
115 |             Search with predicates:
116 |                 vector_store.search("Pricing", predicates=client.Predicates("price", ">", 100))
117 |             Search with complex combined predicates:
118 |                 complex_pred = (client.Predicates("category", "==", "Electronics") & client.Predicates("price", "<", 1000)) | \
119 |                                (client.Predicates("category", "==", "Books") & client.Predicates("rating", ">=", 4.5))
120 |                 vector_store.search("High-quality products", predicates=complex_pred)
121 |         
122 |         Time-based filtering:
123 |             Search with time range:
124 |                 vector_store.search("Recent updates", time_range=(datetime(2024, 1, 1), datetime(2024, 1, 31)))
125 |         """
126 |         query_embedding = self.get_embedding(query_text)
127 | 
128 |         start_time = time.time()
129 | 
130 |         search_args = {
131 |             "limit": limit,
132 |         }
133 | 
134 |         if metadata_filter:
135 |             search_args["filter"] = metadata_filter
136 | 
137 |         if predicates:
138 |             search_args["predicates"] = predicates
139 | 
140 |         if time_range:
141 |             start_date, end_date = time_range
142 |             search_args["uuid_time_filter"] = client.UUIDTimeRange(start_date, end_date)
143 | 
144 |         results = self.vec_client.search(query_embedding, **search_args)
145 |         elapsed_time = time.time() - start_time
146 | 
147 |         logging.info(f"Vector search completed in {elapsed_time:.3f} seconds")
148 | 
149 |         if return_dataframe:
150 |             return self._create_dataframe_from_results(results)
151 |         else:
152 |             return results
153 | 
154 |     def _create_dataframe_from_results(
155 |         self,
156 |         results: List[Tuple[Any, ...]],
157 |     ) -> pd.DataFrame:
158 |         """
159 |         Create a pandas DataFrame from the search results.
160 | 
161 |         Args:
162 |             results: A list of tuples containing the search results.
163 | 
164 |         Returns:
165 |             A pandas DataFrame containing the formatted search results.
166 |         """
167 |         # Convert results to DataFrame
168 |         df = pd.DataFrame(
169 |             results, columns=["id", "metadata", "content", "embedding", "distance"]
170 |         )
171 | 
172 |         # Expand metadata column
173 |         df = pd.concat(
174 |             [df.drop(["metadata"], axis=1), df["metadata"].apply(pd.Series)], axis=1
175 |         )
176 | 
177 |         # Convert id to string for better readability
178 |         df["id"] = df["id"].astype(str)
179 | 
180 |         return df
181 | 
182 |     def delete(
183 |         self,
184 |         ids: List[str] = None,
185 |         metadata_filter: dict = None,
186 |         delete_all: bool = False,
187 |     ) -> None:
188 |         """Delete records from the vector database.
189 | 
190 |         Args:
191 |             ids (List[str], optional): A list of record IDs to delete.
192 |             metadata_filter (dict, optional): A dictionary of metadata key-value pairs to filter records for deletion.
193 |             delete_all (bool, optional): A boolean flag to delete all records.
194 | 
195 |         Raises:
196 |             ValueError: If no deletion criteria are provided or if multiple criteria are provided.
197 | 
198 |         Examples:
199 |             Delete by IDs:
200 |                 vector_store.delete(ids=["8ab544ae-766a-11ef-81cb-decf757b836d"])
201 | 
202 |             Delete by metadata filter:
203 |                 vector_store.delete(metadata_filter={"category": "Shipping"})
204 | 
205 |             Delete all records:
206 |                 vector_store.delete(delete_all=True)
207 |         """
208 |         if sum(bool(x) for x in (ids, metadata_filter, delete_all)) != 1:
209 |             raise ValueError(
210 |                 "Provide exactly one of: ids, metadata_filter, or delete_all"
211 |             )
212 | 
213 |         if delete_all:
214 |             self.vec_client.delete_all()
215 |             logging.info(f"Deleted all records from {self.vector_settings.table_name}")
216 |         elif ids:
217 |             self.vec_client.delete_by_ids(ids)
218 |             logging.info(
219 |                 f"Deleted {len(ids)} records from {self.vector_settings.table_name}"
220 |             )
221 |         elif metadata_filter:
222 |             self.vec_client.delete_by_metadata(metadata_filter)
223 |             logging.info(
224 |                 f"Deleted records matching metadata filter from {self.vector_settings.table_name}"
225 |             )
226 | 


--------------------------------------------------------------------------------