├── .gitignore ├── .env.example ├── postgres ├── init.sql └── Dockerfile ├── docker-compose.yaml ├── README.md ├── bella_vista.txt └── code.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=sk-... -------------------------------------------------------------------------------- /postgres/init.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION vector; -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | postgres: 4 | build: ./postgres 5 | ports: 6 | - "5433:5432" 7 | environment: 8 | POSTGRES_USER: admin 9 | POSTGRES_PASSWORD: admin 10 | POSTGRES_DB: vectordb 11 | -------------------------------------------------------------------------------- /postgres/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a specific version of the Postgres image 2 | FROM postgres:13 3 | 4 | # Install necessary packages and set up locale 5 | RUN apt-get update && apt-get upgrade -y && \ 6 | apt-get install -y wget build-essential postgresql-server-dev-13 locales && \ 7 | echo "LC_ALL=en_US.UTF-8" >> /etc/environment && \ 8 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen && \ 9 | echo "LANG=en_US.UTF-8" > /etc/locale.conf && \ 10 | locale-gen en_US.UTF-8 && \ 11 | rm -rf /var/lib/apt/lists/* 12 | 13 | # Set environment variables for locale 14 | ENV LANG en_US.UTF-8 15 | ENV LC_ALL en_US.UTF-8 16 | 17 | # Install pgvector 18 | RUN wget https://github.com/pgvector/pgvector/archive/v0.4.4.tar.gz && \ 19 | tar -xzvf v0.4.4.tar.gz && \ 20 | cd pgvector-0.4.4 && \ 21 | make && 
\ 22 | make install 23 | 24 | COPY ./init.sql /docker-entrypoint-initdb.d/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vector Search with LangChain Indexing API 2 | 3 | This project demonstrates the use of the LangChain indexing API for efficient vector searching. It focuses on providing an efficient workflow to index, search, and manage documents in a vectorized format. 4 | 5 | ## Overview: 6 | 7 | LangChain's indexing API offers a powerful yet simple method for handling large amounts of textual data, allowing users to extract meaningful insights with vector search capabilities. Specifically, this API provides: 8 | 9 | - **Efficient Indexing**: Avoid duplications and re-computations, saving on storage and computational resources. 10 | - **Synchronization**: Ensures your vector store remains updated, eliminating redundancies. 11 | - **Transformation Handling**: Seamlessly work with documents even after they undergo multiple transformation steps, such as text chunking. 12 | 13 | The aim is to make vector searches more streamlined and cost-effective, enhancing the overall search quality and results. 14 | -------------------------------------------------------------------------------- /bella_vista.txt: -------------------------------------------------------------------------------- 1 | Q: What are the hours of operation for Bella Vista? 2 | A: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome guests from 12 p.m. to 10 p.m. 3 | 4 | Q: What type of cuisine does Bella Vista serve? 5 | A: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally. 6 | 7 | Q: Do you offer vegetarian or vegan options at Bella Vista? 8 | A: Absolutely! 
Bella Vista boasts a diverse menu that includes a variety of vegetarian and vegan dishes. Our chefs are also happy to customize dishes based on dietary needs. 9 | 10 | Q: Is Bella Vista family-friendly? 11 | A: Yes, Bella Vista is a family-friendly establishment. We have a dedicated kids' menu and offer high chairs and booster seats for our younger guests. 12 | 13 | Q: Can I book private events at Bella Vista? 14 | A: Certainly! Bella Vista has a private dining area perfect for events, parties, or corporate gatherings. We also offer catering services for off-site events. 15 | 16 | Q: What's the ambiance like at Bella Vista? 17 | A: Bella Vista boasts a cozy and elegant setting, with ambient lighting, comfortable seating, and a stunning view of the city skyline. Whether you're looking for a romantic dinner or a casual meal with friends, Bella Vista provides the perfect atmosphere. 18 | 19 | Q: Do I need a reservation for Bella Vista? 20 | A: While walk-ins are always welcome, we recommend making a reservation, especially during weekends and holidays, to ensure a seamless dining experience. 
-------------------------------------------------------------------------------- /code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Indexing API" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "from langchain.embeddings import OpenAIEmbeddings\n", 17 | "from langchain.text_splitter import CharacterTextSplitter\n", 18 | "from langchain.vectorstores.pgvector import PGVector\n", 19 | "from langchain.document_loaders import TextLoader\n", 20 | "\n", 21 | "embeddings = OpenAIEmbeddings()\n", 22 | "\n", 23 | "CONNECTION_STRING = \"postgresql+psycopg2://admin:admin@127.0.0.1:5433/vectordb\"\n", 24 | "COLLECTION_NAME = \"vectordb\"\n", 25 | "\n", 26 | "\n", 27 | "vectorstore = PGVector.from_documents(\n", 28 | " [],\n", 29 | " embeddings,\n", 30 | " collection_name=COLLECTION_NAME,\n", 31 | " connection_string=CONNECTION_STRING\n", 32 | ")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Let's add documents and embeddings!" 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stderr", 49 | "output_type": "stream", 50 | "text": [ 51 | "Created a chunk of size 177, which is longer than the specified 150\n", 52 | "Created a chunk of size 229, which is longer than the specified 150\n", 53 | "Created a chunk of size 233, which is longer than the specified 150\n", 54 | "Created a chunk of size 185, which is longer than the specified 150\n", 55 | "Created a chunk of size 203, which is longer than the specified 150\n", 56 | "Created a chunk of size 299, which is longer than the specified 150\n" 57 | ] 58 | }, 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "7\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "loader = TextLoader(\"./bella_vista.txt\")\n", 69 | "documents = loader.load()\n", 70 | "text_splitter = CharacterTextSplitter(chunk_size=150, chunk_overlap=20)\n", 71 | "docs = text_splitter.split_documents(documents)\n", 72 | "print(len(docs))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from langchain.indexes import SQLRecordManager, index" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "namespace = f\"pgvector/{COLLECTION_NAME}\"\n", 91 | "record_manager = SQLRecordManager(\n", 92 | " namespace, db_url=CONNECTION_STRING\n", 93 | ")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "record_manager.create_schema()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Update the documents to see some changes" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": 
"stderr", 119 | "output_type": "stream", 120 | "text": [ 121 | "Created a chunk of size 177, which is longer than the specified 150\n", 122 | "Created a chunk of size 229, which is longer than the specified 150\n", 123 | "Created a chunk of size 233, which is longer than the specified 150\n", 124 | "Created a chunk of size 185, which is longer than the specified 150\n", 125 | "Created a chunk of size 203, which is longer than the specified 150\n", 126 | "Created a chunk of size 299, which is longer than the specified 150\n" 127 | ] 128 | }, 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "page_content='Q: What are the hours of operation for Bella Vista?\\nA: Bella Vista is open from 11 a.m. to 11 p.m. from Monday to Saturday. On Sundays, we welcome guests from 12 p.m. to 10 p.m.' metadata={'source': './bella_vista.txt'}\n", 134 | "page_content='Q: What type of cuisine does Bella Vista serve?\\nA: Bella Vista offers a delightful blend of Mediterranean and contemporary American cuisine. We pride ourselves on using the freshest ingredients, many of which are sourced locally.' metadata={'source': './bella_vista.txt'}\n", 135 | "page_content='Q: Do you offer vegetarian or vegan options at Bella Vista?\\nA: Absolutely! Bella Vista boasts a diverse menu that includes a variety of vegetarian and vegan dishes. Our chefs are also happy to customize dishes based on dietary needs.' metadata={'source': './bella_vista.txt'}\n", 136 | "page_content=\"Q: Is Bella Vista family-friendly?\\nA: Yes, Bella Vista is a family-friendly establishment. We have a dedicated kids' menu and offer high chairs and booster seats for our younger guests.\" metadata={'source': './bella_vista.txt'}\n", 137 | "page_content='Q: Can I book private events at Bella Vista?\\nA: Certainly! Bella Vista has a private dining area perfect for events, parties, or corporate gatherings. We also offer catering services for off-site events.' 
metadata={'source': './bella_vista.txt'}\n", 138 | "page_content=\"Q: What's the ambiance like at Bella Vista?\\nA: Bella Vista boasts a cozy and elegant setting, with ambient lighting, comfortable seating, and a stunning view of the city skyline. Whether you're looking for a romantic dinner or a casual meal with friends, Bella Vista provides the perfect atmosphere.\" metadata={'source': './bella_vista.txt'}\n", 139 | "page_content='Q: Do I need a reservation for Bella Vista?\\nA: While walk-ins are always welcome, we recommend making a reservation, especially during weekends and holidays, to ensure a seamless dining experience.' metadata={'source': './bella_vista.txt'}\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "loader = TextLoader(\"./bella_vista.txt\")\n", 145 | "documents = loader.load()\n", 146 | "text_splitter = CharacterTextSplitter(chunk_size=150, chunk_overlap=20)\n", 147 | "docs = text_splitter.split_documents(documents)\n", 148 | "for doc in docs:\n", 149 | " print(doc)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "{'num_added': 7, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}" 161 | ] 162 | }, 163 | "execution_count": 7, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "index(\n", 170 | " docs,\n", 171 | " record_manager,\n", 172 | " vectorstore,\n", 173 | " cleanup=None,\n", 174 | " source_id_key=\"source\",\n", 175 | ")" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "from langchain.schema import Document\n", 185 | "\n", 186 | "docs[1].page_content = \"updated\"\n", 187 | "del docs[0]\n", 188 | "docs.append(Document(page_content=\"new content\", metadata={\"source\": \"important\"}))" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | 
"execution_count": 9, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "{'num_added': 2, 'num_updated': 0, 'num_skipped': 5, 'num_deleted': 0}" 200 | ] 201 | }, 202 | "execution_count": 9, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "index(\n", 209 | " docs,\n", 210 | " record_manager,\n", 211 | " vectorstore,\n", 212 | " cleanup=None,\n", 213 | " source_id_key=\"source\",\n", 214 | ")" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 10, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "{'num_added': 0, 'num_updated': 0, 'num_skipped': 7, 'num_deleted': 2}" 226 | ] 227 | }, 228 | "execution_count": 10, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "index(\n", 235 | " docs,\n", 236 | " record_manager,\n", 237 | " vectorstore,\n", 238 | " cleanup=\"incremental\",\n", 239 | " source_id_key=\"source\",\n", 240 | ")" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "{'num_added': 0, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 7}" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "index([], record_manager, vectorstore, cleanup=\"full\", source_id_key=\"source\")" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 3", 267 | "language": "python", 268 | "name": "python3" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 3 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython3", 280 | "version": "3.11.0" 281 | }, 
282 | "orig_nbformat": 4 283 | }, 284 | "nbformat": 4, 285 | "nbformat_minor": 2 286 | } 287 | --------------------------------------------------------------------------------