├── .streamlit
│   ├── .gitkeep
│   └── secrets.example.toml
├── requirements.txt
├── .github
│   └── dependabot.yaml
├── LICENSE
├── .gitignore
├── chat_with_pdf.py
├── chat_with_pdf_query.py
└── README.md

/.streamlit/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain-couchbase==1.0.0
streamlit==1.51.0
langchain-community==0.4.1
pypdf==6.4.0
langchain-openai==1.1.0
--------------------------------------------------------------------------------
/.github/dependabot.yaml:
--------------------------------------------------------------------------------
version: 2
updates:
  - package-ecosystem: pip
    directory: .
    schedule:
      interval: monthly
--------------------------------------------------------------------------------
/.streamlit/secrets.example.toml:
--------------------------------------------------------------------------------
# Common configuration for both FTS and GSI approaches
OPENAI_API_KEY = ""
DB_CONN_STR = ""
DB_USERNAME = ""
DB_PASSWORD = ""
DB_BUCKET = ""
DB_SCOPE = ""
DB_COLLECTION = ""
CACHE_COLLECTION = ""

# Required ONLY for FTS approach (chat_with_pdf.py)
INDEX_NAME = ""

# Optional authentication for both approaches
AUTH_ENABLED = "False"
LOGIN_PASSWORD = ""
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Couchbase

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Streamlit secrets
secrets.toml
--------------------------------------------------------------------------------
/chat_with_pdf.py:
--------------------------------------------------------------------------------
import tempfile
from langchain_couchbase.vectorstores import CouchbaseSearchVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.globals import set_llm_cache
from langchain_couchbase.cache import CouchbaseCache
import time
from couchbase.cluster import Cluster
from couchbase.auth import PasswordAuthenticator
from couchbase.options import ClusterOptions
from datetime import timedelta


def parse_bool(value: str):
    """Parse boolean values from environment variables"""
    return value.lower() in ("yes", "true", "t", "1")


def check_environment_variable(variable_name):
    """Check if environment variable is set"""
    if variable_name not in os.environ:
        st.error(
            f"{variable_name} environment variable is not set. Please add it to the secrets.toml file"
        )
        st.stop()


def save_to_vector_store(uploaded_file, vector_store):
    """Chunk the PDF & store it in Couchbase Vector Store"""
    if uploaded_file is not None:
        temp_dir = tempfile.TemporaryDirectory()
        temp_file_path = os.path.join(temp_dir.name, uploaded_file.name)

        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        loader = PyPDFLoader(temp_file_path)
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500, chunk_overlap=150
        )

        doc_pages = text_splitter.split_documents(docs)

        vector_store.add_documents(doc_pages)
        st.info(f"PDF loaded into vector store in {len(doc_pages)} documents")


@st.cache_resource(show_spinner="Connecting to Vector Store")
def get_vector_store(
    _cluster,
    db_bucket,
    db_scope,
    db_collection,
    _embedding,
    index_name,
):
    """Return the Couchbase vector store"""
    vector_store = CouchbaseSearchVectorStore(
        cluster=_cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=db_collection,
        embedding=_embedding,
        index_name=index_name,
    )
    return vector_store


@st.cache_resource(show_spinner="Connecting to Cache")
def get_cache(_cluster, db_bucket, db_scope, cache_collection):
    """Return the Couchbase cache"""
    cache = CouchbaseCache(
        cluster=_cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=cache_collection,
    )
    return cache


@st.cache_resource(show_spinner="Connecting to Couchbase")
def connect_to_couchbase(connection_string, db_username, db_password):
    """Connect to couchbase"""
    auth = PasswordAuthenticator(db_username, db_password)
    options = ClusterOptions(auth)
    cluster = Cluster(connection_string, options)

    # Wait until the cluster is ready for use.
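    # wait_until_ready() blocks until the cluster's services are reachable and
    # raises a timeout error otherwise, so the vector store and cache created
    # later always start from a live connection.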
    cluster.wait_until_ready(timedelta(seconds=5))

    return cluster


def stream_string(s, chunk_size=10):
    """Stream a string with a delay to simulate streaming"""
    for i in range(0, len(s), chunk_size):
        yield s[i : i + chunk_size]
        time.sleep(0.02)


if __name__ == "__main__":
    st.set_page_config(
        page_title="Chat with your PDF using Langchain, Couchbase & OpenAI",
        page_icon="🤖",
        layout="centered",
        initial_sidebar_state="auto",
        menu_items=None,
    )

    AUTH_ENABLED = parse_bool(os.getenv("AUTH_ENABLED", "False"))

    if not AUTH_ENABLED:
        st.session_state.auth = True
    else:
        # Authorization
        if "auth" not in st.session_state:
            st.session_state.auth = False

        AUTH = os.getenv("LOGIN_PASSWORD")
        check_environment_variable("LOGIN_PASSWORD")

        # Authentication
        user_pwd = st.text_input("Enter password", type="password")
        pwd_submit = st.button("Submit")

        if pwd_submit and user_pwd == AUTH:
            st.session_state.auth = True
        elif pwd_submit and user_pwd != AUTH:
            st.error("Incorrect password")

    if st.session_state.auth:
        # Load environment variables
        DB_CONN_STR = os.getenv("DB_CONN_STR")
        DB_USERNAME = os.getenv("DB_USERNAME")
        DB_PASSWORD = os.getenv("DB_PASSWORD")
        DB_BUCKET = os.getenv("DB_BUCKET")
        DB_SCOPE = os.getenv("DB_SCOPE")
        DB_COLLECTION = os.getenv("DB_COLLECTION")
        INDEX_NAME = os.getenv("INDEX_NAME")
        CACHE_COLLECTION = os.getenv("CACHE_COLLECTION")

        # Ensure that all environment variables are set
        required_env_vars = [
            "OPENAI_API_KEY",
            "DB_CONN_STR",
            "DB_USERNAME",
            "DB_PASSWORD",
            "DB_BUCKET",
            "DB_SCOPE",
            "DB_COLLECTION",
            "INDEX_NAME",
            "CACHE_COLLECTION",
        ]
        for var in required_env_vars:
            check_environment_variable(var)

        # Use OpenAI Embeddings
        embedding = OpenAIEmbeddings()

        # Connect to Couchbase Vector Store
        cluster = connect_to_couchbase(DB_CONN_STR, DB_USERNAME, DB_PASSWORD)

        vector_store = get_vector_store(
            cluster,
            DB_BUCKET,
            DB_SCOPE,
            DB_COLLECTION,
            embedding,
            INDEX_NAME,
        )

        # Use couchbase vector store as a retriever for RAG
        retriever = vector_store.as_retriever()

        # Set the LLM cache
        cache = get_cache(cluster, DB_BUCKET, DB_SCOPE, CACHE_COLLECTION)
        set_llm_cache(cache)

        # Build the prompt for the RAG
        template = """You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:
        {context}

        Question: {question}"""

        prompt = ChatPromptTemplate.from_template(template)

        # Use OpenAI GPT 4 as the LLM for the RAG
        llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview", streaming=True)

        # RAG chain
        chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Pure OpenAI output without RAG
        template_without_rag = """You are a helpful bot. Answer the question as truthfully as possible.

        Question: {question}"""

        prompt_without_rag = ChatPromptTemplate.from_template(template_without_rag)

        llm_without_rag = ChatOpenAI(model="gpt-4-1106-preview", streaming=True)

        chain_without_rag = (
            {"question": RunnablePassthrough()}
            | prompt_without_rag
            | llm_without_rag
            | StrOutputParser()
        )

        # Frontend
        couchbase_logo = (
            "https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png"
        )

        st.title("Chat with PDF (Search Vector Store)")
        st.markdown(
            "Answers with [Couchbase logo](https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png) are generated using *RAG* while 🤖️ are generated by pure *LLM (ChatGPT)*"
        )

        with st.sidebar:
            st.header("Upload your PDF")
            with st.form("upload pdf"):
                uploaded_file = st.file_uploader(
                    "Choose a PDF.",
                    help="The document will be deleted after one hour of inactivity (TTL).",
                    type="pdf",
                )
                submitted = st.form_submit_button("Upload")
                if submitted:
                    # store the PDF in the vector store after chunking
                    save_to_vector_store(uploaded_file, vector_store)

            st.subheader("How does it work?")
            st.markdown(
                """
                For each question, you will get two answers:
                * one using RAG ([Couchbase logo](https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png))
                * one using pure LLM - OpenAI (🤖️).
                """
            )

            st.markdown(
                "For RAG, we are using [Langchain](https://langchain.com/), [Couchbase Vector Search using Search Service](https://couchbase.com/) & [OpenAI](https://openai.com/). We fetch parts of the PDF relevant to the question using Vector Search using the Search (FTS) Service and add it as the context to the LLM. The LLM is instructed to answer based on the context from the Vector Store."
            )

            # View Code
            if st.checkbox("View Code"):
                st.write(
                    "View the code here: [Github](https://github.com/couchbase-examples/rag-demo/blob/main/chat_with_pdf.py)"
                )

        if "messages" not in st.session_state:
            st.session_state.messages = []
            st.session_state.messages.append(
                {
                    "role": "assistant",
                    "content": "Hi, I'm a chatbot who can chat with the PDF. How can I help you?",
                    "avatar": "🤖️",
                }
            )

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"], avatar=message["avatar"]):
                st.markdown(message["content"])

        # React to user input
        if question := st.chat_input("Ask a question based on the PDF"):
            # Display user message in chat message container
            st.chat_message("user").markdown(question)

            # Add user message to chat history
            st.session_state.messages.append(
                {"role": "user", "content": question, "avatar": "👤"}
            )

            # Add placeholder for streaming the response
            with st.chat_message("assistant", avatar=couchbase_logo):
                # Get the response from the RAG & stream it
                # In order to cache the response, we need to invoke the chain and cache the response locally as OpenAI does not support it yet
                # Ref: https://github.com/langchain-ai/langchain/issues/9762

                rag_response = chain.invoke(question)

                st.write_stream(stream_string(rag_response))

            st.session_state.messages.append(
                {
                    "role": "assistant",
                    "content": rag_response,
                    "avatar": couchbase_logo,
                }
            )

            # Get the response from the pure LLM & stream it
            pure_llm_response = chain_without_rag.invoke(question)

            # Add placeholder for streaming the response
            with st.chat_message("ai", avatar="🤖️"):
                st.write_stream(stream_string(pure_llm_response))

            st.session_state.messages.append(
                {
                    "role": "assistant",
                    "content": pure_llm_response,
                    "avatar": "🤖️",
                }
            )
--------------------------------------------------------------------------------
/chat_with_pdf_query.py:
--------------------------------------------------------------------------------
import tempfile
from langchain_couchbase import CouchbaseQueryVectorStore
from langchain_couchbase.vectorstores import DistanceStrategy
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.globals import set_llm_cache
from langchain_couchbase.cache import CouchbaseCache
import time
from couchbase.cluster import Cluster
from couchbase.auth import PasswordAuthenticator
from couchbase.options import ClusterOptions
from datetime import timedelta


def parse_bool(value: str):
    """Parse boolean values from environment variables"""
    return value.lower() in ("yes", "true", "t", "1")


def check_environment_variable(variable_name):
    """Check if environment variable is set"""
    if variable_name not in os.environ:
        st.error(
            f"{variable_name} environment variable is not set. Please add it to the secrets.toml file"
        )
        st.stop()


def save_to_vector_store(uploaded_file, vector_store):
    """Chunk the PDF & store it in Couchbase Vector Store"""
    if uploaded_file is not None:
        temp_dir = tempfile.TemporaryDirectory()
        temp_file_path = os.path.join(temp_dir.name, uploaded_file.name)

        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        loader = PyPDFLoader(temp_file_path)
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500, chunk_overlap=150
        )

        doc_pages = text_splitter.split_documents(docs)

        vector_store.add_documents(doc_pages)
        st.info(f"PDF loaded into vector store in {len(doc_pages)} documents")


@st.cache_resource(show_spinner="Connecting to Vector Store")
def get_vector_store(
    _cluster,
    db_bucket,
    db_scope,
    db_collection,
    _embedding,
    distance_strategy: DistanceStrategy,
):
    """Return the Couchbase vector store"""
    vector_store = CouchbaseQueryVectorStore(
        cluster=_cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=db_collection,
        embedding=_embedding,
        distance_metric=distance_strategy,
    )
    return vector_store


@st.cache_resource(show_spinner="Connecting to Cache")
def get_cache(_cluster, db_bucket, db_scope, cache_collection):
    """Return the Couchbase cache"""
    cache = CouchbaseCache(
        cluster=_cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=cache_collection,
    )
    return cache


@st.cache_resource(show_spinner="Connecting to Couchbase")
def connect_to_couchbase(connection_string, db_username, db_password):
    """Connect to couchbase"""
    auth = PasswordAuthenticator(db_username, db_password)
    options = ClusterOptions(auth)
    cluster = Cluster(connection_string, options)

    # Wait until the cluster is ready for use.
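    # A freshly provisioned cluster (e.g., a new Capella instance) can take longer
    # than 5 seconds to become ready; widen the timeout below if this raises an error.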
    cluster.wait_until_ready(timedelta(seconds=5))

    return cluster


def stream_string(s, chunk_size=10):
    """Stream a string with a delay to simulate streaming"""
    for i in range(0, len(s), chunk_size):
        yield s[i : i + chunk_size]
        time.sleep(0.02)


if __name__ == "__main__":
    st.set_page_config(
        page_title="Chat with your PDF using Langchain, Couchbase & OpenAI",
        page_icon="🤖",
        layout="centered",
        initial_sidebar_state="auto",
        menu_items=None,
    )

    AUTH_ENABLED = parse_bool(os.getenv("AUTH_ENABLED", "False"))

    if not AUTH_ENABLED:
        st.session_state.auth = True
    else:
        # Authorization
        if "auth" not in st.session_state:
            st.session_state.auth = False

        AUTH = os.getenv("LOGIN_PASSWORD")
        check_environment_variable("LOGIN_PASSWORD")

        # Authentication
        user_pwd = st.text_input("Enter password", type="password")
        pwd_submit = st.button("Submit")

        if pwd_submit and user_pwd == AUTH:
            st.session_state.auth = True
        elif pwd_submit and user_pwd != AUTH:
            st.error("Incorrect password")

    if st.session_state.auth:
        # Load environment variables
        DB_CONN_STR = os.getenv("DB_CONN_STR")
        DB_USERNAME = os.getenv("DB_USERNAME")
        DB_PASSWORD = os.getenv("DB_PASSWORD")
        DB_BUCKET = os.getenv("DB_BUCKET")
        DB_SCOPE = os.getenv("DB_SCOPE")
        DB_COLLECTION = os.getenv("DB_COLLECTION")
        CACHE_COLLECTION = os.getenv("CACHE_COLLECTION")

        # Ensure that all environment variables are set
        required_env_vars = [
            "OPENAI_API_KEY",
            "DB_CONN_STR",
            "DB_USERNAME",
            "DB_PASSWORD",
            "DB_BUCKET",
            "DB_SCOPE",
            "DB_COLLECTION",
            "CACHE_COLLECTION",
        ]
        for var in required_env_vars:
            check_environment_variable(var)

        # Use OpenAI Embeddings
        embedding = OpenAIEmbeddings()

        # Connect to Couchbase Vector Store
        cluster = connect_to_couchbase(DB_CONN_STR, DB_USERNAME, DB_PASSWORD)

        vector_store = get_vector_store(
            cluster,
            DB_BUCKET,
            DB_SCOPE,
            DB_COLLECTION,
            embedding,
            distance_strategy=DistanceStrategy.COSINE,
        )

        # Use couchbase vector store as a retriever for RAG
        retriever = vector_store.as_retriever()

        # Set the LLM cache
        cache = get_cache(cluster, DB_BUCKET, DB_SCOPE, CACHE_COLLECTION)
        set_llm_cache(cache)

        # Build the prompt for the RAG
        template = """You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:
        {context}

        Question: {question}"""

        prompt = ChatPromptTemplate.from_template(template)

        # Use OpenAI GPT 4 as the LLM for the RAG
        llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview", streaming=True)

        # RAG chain
        chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Pure OpenAI output without RAG
        template_without_rag = """You are a helpful bot. Answer the question as truthfully as possible.

        Question: {question}"""

        prompt_without_rag = ChatPromptTemplate.from_template(template_without_rag)

        llm_without_rag = ChatOpenAI(model="gpt-4-1106-preview", streaming=True)

        chain_without_rag = (
            {"question": RunnablePassthrough()}
            | prompt_without_rag
            | llm_without_rag
            | StrOutputParser()
        )

        # Frontend
        couchbase_logo = (
            "https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png"
        )

        st.title("Chat with PDF (Query Vector Store)")
        st.markdown(
            "Answers with [Couchbase logo](https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png) are generated using *RAG* while 🤖️ are generated by pure *LLM (ChatGPT)*"
        )

        with st.sidebar:
            st.header("Upload your PDF")
            with st.form("upload pdf"):
                uploaded_file = st.file_uploader(
                    "Choose a PDF.",
                    help="The document will be deleted after one hour of inactivity (TTL).",
                    type="pdf",
                )
                submitted = st.form_submit_button("Upload")
                if submitted:
                    # store the PDF in the vector store after chunking
                    save_to_vector_store(uploaded_file, vector_store)

            st.subheader("How does it work?")
            st.markdown(
                """
                For each question, you will get two answers:
                * one using RAG ([Couchbase logo](https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png))
                * one using pure LLM - OpenAI (🤖️).
                """
            )

            st.markdown(
                "For RAG, we are using [Langchain](https://langchain.com/), [Couchbase Query Vector Search](https://couchbase.com/) & [OpenAI](https://openai.com/). We fetch parts of the PDF relevant to the question using Vector Search using the Query and Indexing Services and add it as the context to the LLM. The LLM is instructed to answer based on the context from the Vector Store."
            )

            # View Code
            if st.checkbox("View Code"):
                st.write(
                    "View the code here: [Github](https://github.com/couchbase-examples/rag-demo/blob/main/chat_with_pdf_query.py)"
                )

        if "messages" not in st.session_state:
            st.session_state.messages = []
            st.session_state.messages.append(
                {
                    "role": "assistant",
How can I help you?", 269 | "avatar": "🤖️", 270 | } 271 | ) 272 | 273 | # Display chat messages from history on app rerun 274 | for message in st.session_state.messages: 275 | with st.chat_message(message["role"], avatar=message["avatar"]): 276 | st.markdown(message["content"]) 277 | 278 | # React to user input 279 | if question := st.chat_input("Ask a question based on the PDF"): 280 | # Display user message in chat message container 281 | st.chat_message("user").markdown(question) 282 | 283 | # Add user message to chat history 284 | st.session_state.messages.append( 285 | {"role": "user", "content": question, "avatar": "👤"} 286 | ) 287 | 288 | # Add placeholder for streaming the response 289 | with st.chat_message("assistant", avatar=couchbase_logo): 290 | # Get the response from the RAG & stream it 291 | # In order to cache the response, we need to invoke the chain and cache the response locally as OpenAI does not support it yet 292 | # Ref: https://github.com/langchain-ai/langchain/issues/9762 293 | 294 | rag_response = chain.invoke(question) 295 | 296 | st.write_stream(stream_string(rag_response)) 297 | 298 | st.session_state.messages.append( 299 | { 300 | "role": "assistant", 301 | "content": rag_response, 302 | "avatar": couchbase_logo, 303 | } 304 | ) 305 | 306 | # Get the response from the pure LLM & stream it 307 | pure_llm_response = chain_without_rag.invoke(question) 308 | 309 | # Add placeholder for streaming the response 310 | with st.chat_message("ai", avatar="🤖️"): 311 | st.write_stream(stream_string(pure_llm_response)) 312 | 313 | st.session_state.messages.append( 314 | { 315 | "role": "assistant", 316 | "content": pure_llm_response, 317 | "avatar": "🤖️", 318 | } 319 | ) 320 | 321 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## RAG Demo using Couchbase, Streamlit, LangChain, and OpenAI 2 | 3 | This is a demo app built to chat with your custom PDFs using the vector search capabilities of Couchbase to augment the OpenAI results in a Retrieval-Augmented-Generation (RAG) model. 4 | 5 | The demo also caches the LLM responses using [CouchbaseCache](https://couchbase-ecosystem.github.io/langchain-couchbase/langchain_couchbase.html#module-langchain_couchbase.cache ) to avoid repeated calls to the LLMs saving time and cost. You need to specify just the collection (in the same scope and bucket for simplicity) to cache the LLM responses. 6 | 7 | ## Two Vector Search Implementations 8 | 9 | This demo provides two implementations showcasing different Couchbase vector search approaches: 10 | 11 | 1. **CouchbaseQueryVectorStore** (`chat_with_pdf_query.py`) - Using Couchbase Vector Search (Hyperscale/Composite Vector Indexes) with the Query and Indexing Services. 12 | 2. **CouchbaseSearchVectorStore** (`chat_with_pdf.py`) - Using Couchbase Search (formerly known as Full Text Search) Service. 13 | 14 | 15 | ### How does it work? 16 | 17 | You can upload your PDFs with custom data & ask questions about the data in the chat box. 18 | 19 | For each question, you will get two answers: 20 | 21 | - one using RAG (Couchbase logo) 22 | - one using pure LLM - OpenAI (🤖). 23 | 24 | For RAG, we are using LangChain, Couchbase Vector Search & OpenAI. We fetch parts of the PDF relevant to the question using Vector search & add it as the context to the LLM. The LLM is instructed to answer based on the context from the Vector Store. 

All LLM responses are cached in the specified collection. If the exact same question is asked again, the result is fetched from the cache instead of calling the LLM.

> Note: The streaming of cached responses is purely for the visual experience, as the OpenAI integration cannot stream responses from the cache due to a known [issue](https://github.com/langchain-ai/langchain/issues/9762).



## Setup Instructions

### Install dependencies

`pip install -r requirements.txt`

### Set the environment secrets

Copy the `secrets.example.toml` file in the `.streamlit` folder, rename it to `secrets.toml`, and replace the placeholders with the actual values for your environment.

**For Couchbase Vector Search - Hyperscale/Composite (`chat_with_pdf_query.py`):**
```toml
OPENAI_API_KEY = ""
DB_CONN_STR = ""
DB_USERNAME = ""
DB_PASSWORD = ""
DB_BUCKET = ""
DB_SCOPE = ""
DB_COLLECTION = ""
CACHE_COLLECTION = ""
AUTH_ENABLED = "False"
LOGIN_PASSWORD = ""
```

**For Couchbase Search (`chat_with_pdf.py`):**
```toml
OPENAI_API_KEY = ""
DB_CONN_STR = ""
DB_USERNAME = ""
DB_PASSWORD = ""
DB_BUCKET = ""
DB_SCOPE = ""
DB_COLLECTION = ""
CACHE_COLLECTION = ""
INDEX_NAME = ""
AUTH_ENABLED = "False"
LOGIN_PASSWORD = ""
```

> **Note:** The Couchbase Vector Search approach does not require the `INDEX_NAME` parameter.



## Approach 1: Couchbase Vector Search (Hyperscale/Composite)

For the full tutorial on the Couchbase Vector Search approach, please visit [Developer Portal - Couchbase Vector Search](https://developer.couchbase.com/tutorial-python-langchain-pdf-chat-query).

### Prerequisites
- Couchbase Server 8.0+ or Couchbase Capella

This approach uses `CouchbaseQueryVectorStore`, which leverages Couchbase's Hyperscale and Composite Vector Indexes (built on the Global Secondary Index infrastructure). The vector search is performed using SQL++ queries with the cosine similarity distance metric.
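
As an illustration of what that looks like outside Streamlit, here is a minimal sketch (the constructor arguments mirror `get_vector_store` in `chat_with_pdf_query.py`; the `cluster` object and the bucket/scope/collection names are assumptions, and `similarity_search_with_score` is the generic LangChain vector store API):

```python
from langchain_couchbase import CouchbaseQueryVectorStore
from langchain_couchbase.vectorstores import DistanceStrategy
from langchain_openai import OpenAIEmbeddings

vector_store = CouchbaseQueryVectorStore(
    cluster=cluster,  # an authenticated, ready couchbase.cluster.Cluster
    bucket_name="pdf-docs",
    scope_name="shared",
    collection_name="docs",
    embedding=OpenAIEmbeddings(),
    distance_metric=DistanceStrategy.COSINE,
)

# With a distance metric, lower scores mean closer matches.
for doc, distance in vector_store.similarity_search_with_score(
    "What does the PDF say about pricing?", k=4
):
    print(f"{distance:.4f}  {doc.page_content[:80]}")
```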

### Understanding Vector Index Types

Couchbase offers two types of vector indexes for Couchbase Vector Search:

**Hyperscale Vector Indexes (BHIVE)**
- Best for pure vector searches - content discovery, recommendations, semantic search
- High performance with a low memory footprint - designed to scale to billions of vectors
- Optimized for concurrent operations - supports simultaneous searches and inserts
- Use when: You primarily perform vector-only queries without complex scalar filtering
- Ideal for: Large-scale semantic search, recommendation systems, content discovery

**Composite Vector Indexes**
- Best for filtered vector searches - combines vector search with scalar value filtering
- Efficient pre-filtering - scalar attributes reduce the vector comparison scope
- Use when: Your queries combine vector similarity with scalar filters that eliminate large portions of data
- Ideal for: Compliance-based filtering, user-specific searches, time-bounded queries

**Choosing the Right Index Type**
- Start with a Hyperscale Vector Index for pure vector searches and large datasets
- Use a Composite Vector Index when scalar filters significantly reduce your search space
- Consider your dataset size: Hyperscale scales to billions, Composite works well for tens of millions to billions

For more details, see the [Couchbase Vector Index documentation](https://docs.couchbase.com/server/current/vector-index/use-vector-indexes.html).

### Index Configuration (Optional)

While the application works without creating indexes manually, you can optionally create a vector index for better performance.

> **Important:** The vector index should be created **after** ingesting the documents (uploading PDFs).

**Using LangChain:**

You can create the index programmatically after uploading your PDFs:

```python
from langchain_couchbase.vectorstores import IndexType

# Create a vector index on the collection
vector_store.create_index(
    index_name="idx_vector",
    dimension=1536,
    similarity="cosine",
    index_type=IndexType.BHIVE,  # or IndexType.COMPOSITE
    index_description="IVF,SQ8"
)
```

For more details on the `create_index()` method, see the [LangChain Couchbase API documentation](https://couchbase-ecosystem.github.io/langchain-couchbase/langchain_couchbase.html#langchain_couchbase.vectorstores.query_vector_store.CouchbaseQueryVectorStore.create_index).
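
One detail worth double-checking is the `dimension` argument: it must match the output size of the embedding model. A quick sanity check (assumes `OPENAI_API_KEY` is set; `OpenAIEmbeddings` defaults to a 1536-dimensional model):

```python
from langchain_openai import OpenAIEmbeddings

# The vector index dimension must equal the length of the vectors being stored.
embedding = OpenAIEmbeddings()
vector = embedding.embed_query("dimension check")
print(len(vector))  # expected: 1536, the value passed to create_index() above
```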

**Understanding Index Configuration Parameters:**

The `index_description` parameter controls how Couchbase optimizes vector storage and search performance:

**Format:** `'IVF<centroids>,{PQ|SQ}<settings>'` (the centroid count is optional)

**Centroids (IVF - Inverted File):**
- Controls how the dataset is subdivided for faster searches
- More centroids = faster search, slower training
- Fewer centroids = slower search, faster training
- If omitted (as in `IVF,SQ8`), Couchbase auto-selects based on dataset size

**Quantization Options:**
- **SQ (Scalar Quantization)**: `SQ4`, `SQ6`, `SQ8` (4, 6, or 8 bits per dimension)
- **PQ (Product Quantization)**: `PQ<subquantizers>x<bits>` (e.g., `PQ32x8`)
- Higher values = better accuracy, larger index size

**Common Examples:**
- `IVF,SQ8` - Auto centroids, 8-bit scalar quantization (good default)
- `IVF1000,SQ6` - 1000 centroids, 6-bit scalar quantization
- `IVF,PQ32x8` - Auto centroids, 32 subquantizers with 8 bits each

For detailed configuration options, see the [Quantization & Centroid Settings](https://docs.couchbase.com/server/current/vector-index/hyperscale-vector-index.html#algo_settings).

> **Note:** In Couchbase Vector Search, the score is the vector distance between the query and document embeddings: a lower distance indicates higher similarity, and a higher distance indicates lower similarity. This demo uses cosine similarity for measuring document relevance.

### Run the Couchbase Vector Search application

```bash
streamlit run chat_with_pdf_query.py
```


## Approach 2: Couchbase Search

For the full tutorial on the Couchbase Search approach, please visit [Developer Portal - Couchbase Search](https://developer.couchbase.com/tutorial-python-langchain-pdf-chat).

### Prerequisites
- Couchbase Server 7.6+ or Couchbase Capella

### Create the Search Index

We need to create the Search Index in Couchbase. For this demo, you can import the following index using these instructions:

- [Couchbase Capella](https://docs.couchbase.com/cloud/search/import-search-index.html)

  - Copy the index definition below to a new file, `index.json`.
  - Import the file in Capella using the instructions in the documentation.
  - Click on Create Index to create the index.

- [Couchbase Server](https://docs.couchbase.com/server/current/search/import-search-index.html)

  - Click on Search -> Add Index -> Import.
  - Copy the index definition below into the Import screen.
  - Click on Create Index to create the index.

#### Index Definition

Here, we are creating the index `pdf_search` on the documents in the `docs` collection within the `shared` scope in the bucket `pdf-docs`. The vector field is set to `embedding` with 1536 dimensions, and the text field is set to `text`. We are also indexing and storing all the fields under `metadata` in the document as a dynamic mapping to account for varying document structures. The similarity metric is set to `dot_product`. If you use different names for the bucket, scope, collection, or fields, adapt the index definition accordingly.

```json
{
  "name": "pdf_search",
  "type": "fulltext-index",
  "params": {
    "doc_config": {
      "docid_prefix_delim": "",
      "docid_regexp": "",
      "mode": "scope.collection.type_field",
      "type_field": "type"
    },
    "mapping": {
      "default_analyzer": "standard",
      "default_datetime_parser": "dateTimeOptional",
      "default_field": "_all",
      "default_mapping": {
        "dynamic": true,
        "enabled": false
      },
      "default_type": "_default",
      "docvalues_dynamic": false,
      "index_dynamic": true,
      "store_dynamic": false,
      "type_field": "_type",
      "types": {
        "shared.docs": {
          "dynamic": true,
          "enabled": true,
          "properties": {
            "embedding": {
              "enabled": true,
              "dynamic": false,
              "fields": [
                {
                  "dims": 1536,
                  "index": true,
                  "name": "embedding",
                  "similarity": "dot_product",
                  "type": "vector",
                  "vector_index_optimized_for": "recall"
                }
              ]
            },
            "text": {
              "enabled": true,
              "dynamic": false,
              "fields": [
                {
                  "index": true,
                  "name": "text",
                  "store": true,
                  "type": "text"
                }
              ]
            }
          }
        }
      }
    },
    "store": {
      "indexType": "scorch",
      "segmentVersion": 16
    }
  },
  "sourceType": "gocbcore",
  "sourceName": "pdf-docs",
  "sourceParams": {},
  "planParams": {
    "maxPartitionsPerPIndex": 64,
    "indexPartitions": 16,
    "numReplicas": 0
  }
}
```

### Run the Couchbase Search application

```bash
streamlit run chat_with_pdf.py
```
--------------------------------------------------------------------------------