├── .github
│   └── dependabot.yaml
├── .gitignore
├── .streamlit
│   ├── .gitkeep
│   └── secrets.example.toml
├── LICENSE
├── README.md
├── chat_with_pdf.py
└── requirements.txt

/.github/dependabot.yaml:
--------------------------------------------------------------------------------
version: 2
updates:
  - package-ecosystem: pip
    directory: .
    schedule:
      interval: monthly
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Streamlit secrets
secrets.toml
--------------------------------------------------------------------------------
/.streamlit/.gitkeep:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.streamlit/secrets.example.toml:
--------------------------------------------------------------------------------
OPENAI_API_KEY = ""
DB_CONN_STR = ""
DB_USERNAME = ""
DB_PASSWORD = ""
DB_BUCKET = ""
DB_SCOPE = ""
DB_COLLECTION = ""
CACHE_COLLECTION = ""
INDEX_NAME = ""
AUTH_ENABLED = "False" # enables authentication for the streamlit app using LOGIN_PASSWORD
LOGIN_PASSWORD = ""
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RAG Demo using Couchbase, Streamlit, LangChain & OpenAI

This is a demo app built to chat with your custom PDFs using the vector search capabilities of Couchbase to augment the OpenAI results in a Retrieval-Augmented Generation (RAG) model.

The demo can run against a self-managed Couchbase cluster or Couchbase Capella.

> Note that you need Couchbase Server 7.6 or higher for Vector Search.

### How does it work?

You can upload your PDFs with custom data & ask questions about the data in the chat box.

For each question, you will get two answers:

- one using RAG (Couchbase logo)
- one using the pure LLM, OpenAI (🤖).

For RAG, we are using LangChain, Couchbase Vector Search & OpenAI. We fetch the parts of the PDF relevant to the question using Vector search & add them as the context to the LLM. The LLM is instructed to answer based on the context from the Vector Store.

All LLM responses are cached in the specified cache collection. If the exact same question is asked again, the result is fetched from the cache instead of calling the LLM.

> Note: The streaming of cached responses is purely for the visual experience, as the OpenAI integration cannot stream responses from the cache due to a known [issue](https://github.com/langchain-ai/langchain/issues/9762).
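Under the hood, the cache is wired into LangChain's global LLM cache once at startup. A minimal sketch of that setup, using the same APIs as `chat_with_pdf.py` (the connection string, credentials, and bucket/scope/collection names below are placeholders for your own environment):

```python
from datetime import timedelta

from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions
from langchain_core.globals import set_llm_cache
from langchain_couchbase.cache import CouchbaseCache

# Connect to the cluster (placeholder credentials)
auth = PasswordAuthenticator("Administrator", "password")
cluster = Cluster("couchbase://localhost", ClusterOptions(auth))
cluster.wait_until_ready(timedelta(seconds=5))

# Every LLM call made through LangChain now checks this collection first;
# only cache misses reach OpenAI.
set_llm_cache(
    CouchbaseCache(
        cluster=cluster,
        bucket_name="pdf-docs",
        scope_name="shared",
        collection_name="cache",
    )
)
```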

### How to Run

- #### Install dependencies

  `pip install -r requirements.txt`

- #### Set the environment secrets

  Copy the `secrets.example.toml` file in the `.streamlit` folder to `secrets.toml` and replace the placeholders with the actual values for your environment.

  ```
  OPENAI_API_KEY = ""
  DB_CONN_STR = ""
  DB_USERNAME = ""
  DB_PASSWORD = ""
  DB_BUCKET = ""
  DB_SCOPE = ""
  DB_COLLECTION = ""
  CACHE_COLLECTION = ""
  INDEX_NAME = ""
  AUTH_ENABLED = "True/False" # enables authentication for the streamlit app using LOGIN_PASSWORD
  LOGIN_PASSWORD = ""
  ```

- #### Create the Search Index on the Full Text Service

  We need to create the Search Index on the Full Text Service in Couchbase. For this demo, you can import the index definition below using the following instructions.

  - [Couchbase Capella](https://docs.couchbase.com/cloud/search/import-search-index.html)

    - Copy the index definition below to a new file, `index.json`.
    - Import the file in Capella using the instructions in the documentation.
    - Click on Create Index to create the index.

  - [Couchbase Server](https://docs.couchbase.com/server/current/search/import-search-index.html)

    - Click on Search -> Add Index -> Import.
    - Paste the index definition below into the Import screen.
    - Click on Create Index to create the index.

  #### Index Definition

  Here, we are creating the index `pdf_search` on the documents in the `docs` collection within the `shared` scope of the bucket `pdf-docs`. The vector field is set to `embedding` with 1536 dimensions, and the text field is set to `text`. We also index and store all the fields under `metadata` in the document as a dynamic mapping to account for varying document structures. The similarity metric is set to `dot_product`. If you change any of these parameters, adapt the index definition below accordingly.
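  If you prefer to create the index programmatically instead of importing it through the UI, here is a sketch using the Python SDK's search index manager. It assumes the definition below has been saved as `index.json`; the connection details are placeholders, and the exact `SearchIndex` keyword arguments are worth verifying against your SDK version:

  ```python
  import json
  from datetime import timedelta

  from couchbase.auth import PasswordAuthenticator
  from couchbase.cluster import Cluster
  from couchbase.management.search import SearchIndex
  from couchbase.options import ClusterOptions

  # Placeholder connection details; use your own cluster and credentials
  auth = PasswordAuthenticator("Administrator", "password")
  cluster = Cluster("couchbase://localhost", ClusterOptions(auth))
  cluster.wait_until_ready(timedelta(seconds=5))

  # Load the index definition below, saved as index.json
  with open("index.json") as f:
      index_def = json.load(f)

  # Create (or update) the search index
  cluster.search_indexes().upsert_index(
      SearchIndex(
          name=index_def["name"],
          idx_type=index_def["type"],
          source_name=index_def["sourceName"],
          source_type=index_def["sourceType"],
          params=index_def["params"],
          plan_params=index_def["planParams"],
      )
  )
  ```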

  ```
  {
    "name": "pdf_search",
    "type": "fulltext-index",
    "params": {
      "doc_config": {
        "docid_prefix_delim": "",
        "docid_regexp": "",
        "mode": "scope.collection.type_field",
        "type_field": "type"
      },
      "mapping": {
        "default_analyzer": "standard",
        "default_datetime_parser": "dateTimeOptional",
        "default_field": "_all",
        "default_mapping": {
          "dynamic": true,
          "enabled": false
        },
        "default_type": "_default",
        "docvalues_dynamic": false,
        "index_dynamic": true,
        "store_dynamic": false,
        "type_field": "_type",
        "types": {
          "shared.docs": {
            "dynamic": true,
            "enabled": true,
            "properties": {
              "embedding": {
                "enabled": true,
                "dynamic": false,
                "fields": [
                  {
                    "dims": 1536,
                    "index": true,
                    "name": "embedding",
                    "similarity": "dot_product",
                    "type": "vector",
                    "vector_index_optimized_for": "recall"
                  }
                ]
              },
              "text": {
                "enabled": true,
                "dynamic": false,
                "fields": [
                  {
                    "index": true,
                    "name": "text",
                    "store": true,
                    "type": "text"
                  }
                ]
              }
            }
          }
        }
      },
      "store": {
        "indexType": "scorch",
        "segmentVersion": 16
      }
    },
    "sourceType": "gocbcore",
    "sourceName": "pdf-docs",
    "sourceParams": {},
    "planParams": {
      "maxPartitionsPerPIndex": 64,
      "indexPartitions": 16,
      "numReplicas": 0
    }
  }
  ```

- #### Run the application

  `streamlit run chat_with_pdf.py`
--------------------------------------------------------------------------------
/chat_with_pdf.py:
--------------------------------------------------------------------------------
import tempfile
from langchain_couchbase.vectorstores import CouchbaseSearchVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import streamlit as st
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.globals import set_llm_cache
from langchain_couchbase.cache import CouchbaseCache
import time


def parse_bool(value: str):
    """Parse boolean values from environment variables"""
    return value.lower() in ("yes", "true", "t", "1")


def check_environment_variable(variable_name):
    """Check if environment variable is set"""
    if variable_name not in os.environ:
        st.error(
            f"{variable_name} environment variable is not set. Please add it to the secrets.toml file"
        )
        st.stop()


def save_to_vector_store(uploaded_file, vector_store):
    """Chunk the PDF & store it in Couchbase Vector Store"""
    if uploaded_file is not None:
        temp_dir = tempfile.TemporaryDirectory()
        temp_file_path = os.path.join(temp_dir.name, uploaded_file.name)

        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getvalue())
        loader = PyPDFLoader(temp_file_path)
        docs = loader.load()

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500, chunk_overlap=150
        )

        doc_pages = text_splitter.split_documents(docs)

        vector_store.add_documents(doc_pages)
        st.info(f"PDF loaded into the vector store as {len(doc_pages)} documents")


@st.cache_resource(show_spinner="Connecting to Vector Store")
def get_vector_store(
    _cluster,
    db_bucket,
    db_scope,
    db_collection,
    _embedding,
    index_name,
):
    """Return the Couchbase vector store"""
    vector_store = CouchbaseSearchVectorStore(
        cluster=_cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=db_collection,
        embedding=_embedding,
        index_name=index_name,
    )
    return vector_store


@st.cache_resource(show_spinner="Connecting to Cache")
def get_cache(_cluster, db_bucket, db_scope, cache_collection):
    """Return the Couchbase cache"""
    cache = CouchbaseCache(
        cluster=_cluster,
        bucket_name=db_bucket,
        scope_name=db_scope,
        collection_name=cache_collection,
    )
    return cache


@st.cache_resource(show_spinner="Connecting to Couchbase")
def connect_to_couchbase(connection_string, db_username, db_password):
    """Connect to Couchbase"""
    from couchbase.cluster import Cluster
    from couchbase.auth import PasswordAuthenticator
    from couchbase.options import ClusterOptions
    from datetime import timedelta

    auth = PasswordAuthenticator(db_username, db_password)
    options = ClusterOptions(auth)
    connect_string = connection_string
    cluster = Cluster(connect_string, options)

    # Wait until the cluster is ready for use.
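    # If the cluster or its services are not reachable within the timeout,
    # wait_until_ready raises a timeout exception, so connection problems
    # surface at startup rather than at the first query.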
    cluster.wait_until_ready(timedelta(seconds=5))

    return cluster


def stream_string(s, chunk_size=10):
    """Stream a string with a delay to simulate streaming"""
    for i in range(0, len(s), chunk_size):
        yield s[i : i + chunk_size]
        time.sleep(0.02)


if __name__ == "__main__":
    st.set_page_config(
        page_title="Chat with your PDF using Langchain, Couchbase & OpenAI",
        page_icon="🤖",
        layout="centered",
        initial_sidebar_state="auto",
        menu_items=None,
    )

    AUTH_ENABLED = parse_bool(os.getenv("AUTH_ENABLED", "False"))

    if not AUTH_ENABLED:
        st.session_state.auth = True
    else:
        # Authorization
        if "auth" not in st.session_state:
            st.session_state.auth = False

        AUTH = os.getenv("LOGIN_PASSWORD")
        check_environment_variable("LOGIN_PASSWORD")

        # Authentication
        user_pwd = st.text_input("Enter password", type="password")
        pwd_submit = st.button("Submit")

        if pwd_submit and user_pwd == AUTH:
            st.session_state.auth = True
        elif pwd_submit and user_pwd != AUTH:
            st.error("Incorrect password")

    if st.session_state.auth:
        # Load environment variables
        DB_CONN_STR = os.getenv("DB_CONN_STR")
        DB_USERNAME = os.getenv("DB_USERNAME")
        DB_PASSWORD = os.getenv("DB_PASSWORD")
        DB_BUCKET = os.getenv("DB_BUCKET")
        DB_SCOPE = os.getenv("DB_SCOPE")
        DB_COLLECTION = os.getenv("DB_COLLECTION")
        INDEX_NAME = os.getenv("INDEX_NAME")
        CACHE_COLLECTION = os.getenv("CACHE_COLLECTION")

        # Ensure that all environment variables are set
        check_environment_variable("OPENAI_API_KEY")
        check_environment_variable("DB_CONN_STR")
        check_environment_variable("DB_USERNAME")
        check_environment_variable("DB_PASSWORD")
        check_environment_variable("DB_BUCKET")
        check_environment_variable("DB_SCOPE")
        check_environment_variable("DB_COLLECTION")
        check_environment_variable("INDEX_NAME")
        check_environment_variable("CACHE_COLLECTION")

        # Use OpenAI Embeddings
        embedding = OpenAIEmbeddings()

        # Connect to Couchbase Vector Store
        cluster = connect_to_couchbase(DB_CONN_STR, DB_USERNAME, DB_PASSWORD)

        vector_store = get_vector_store(
            cluster,
            DB_BUCKET,
            DB_SCOPE,
            DB_COLLECTION,
            embedding,
            INDEX_NAME,
        )

        # Use couchbase vector store as a retriever for RAG
        retriever = vector_store.as_retriever()

        # Set the LLM cache
        cache = get_cache(cluster, DB_BUCKET, DB_SCOPE, CACHE_COLLECTION)
        set_llm_cache(cache)

        # Build the prompt for the RAG
        template = """You are a helpful bot. If you cannot answer based on the context provided, respond with a generic answer. Answer the question as truthfully as possible using the context below:
        {context}

        Question: {question}"""

        prompt = ChatPromptTemplate.from_template(template)

        # Use OpenAI GPT-4 as the LLM for the RAG
        llm = ChatOpenAI(temperature=0, model="gpt-4-1106-preview", streaming=True)

        # RAG chain
        chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        # Pure OpenAI output without RAG
        template_without_rag = """You are a helpful bot. Answer the question as truthfully as possible.

        Question: {question}"""

        prompt_without_rag = ChatPromptTemplate.from_template(template_without_rag)

        llm_without_rag = ChatOpenAI(model="gpt-4-1106-preview", streaming=True)

        chain_without_rag = (
            {"question": RunnablePassthrough()}
            | prompt_without_rag
            | llm_without_rag
            | StrOutputParser()
        )

        # Frontend
        couchbase_logo = (
            "https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png"
        )

        st.title("Chat with PDF")
        st.markdown(
            "Answers with [Couchbase logo](https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png) are generated using *RAG*, while answers with 🤖 are generated by the pure *LLM (ChatGPT)*"
        )

        with st.sidebar:
            st.header("Upload your PDF")
            with st.form("upload pdf"):
                uploaded_file = st.file_uploader(
                    "Choose a PDF.",
                    help="The document will be deleted after one hour of inactivity (TTL).",
                    type="pdf",
                )
                submitted = st.form_submit_button("Upload")
                if submitted:
                    # store the PDF in the vector store after chunking
                    save_to_vector_store(uploaded_file, vector_store)

            st.subheader("How does it work?")
            st.markdown(
                """
                For each question, you will get two answers:
                * one using RAG ([Couchbase logo](https://emoji.slack-edge.com/T024FJS4M/couchbase/4a361e948b15ed91.png))
                * one using the pure LLM, OpenAI (🤖).
                """
            )

            st.markdown(
                "For RAG, we are using [Langchain](https://langchain.com/), [Couchbase Vector Search](https://couchbase.com/) & [OpenAI](https://openai.com/). We fetch parts of the PDF relevant to the question using Vector search & add it as the context to the LLM. The LLM is instructed to answer based on the context from the Vector Store."
            )

            # View Code
            if st.checkbox("View Code"):
                st.write(
                    "View the code here: [Github](https://github.com/couchbase-examples/rag-demo/blob/main/chat_with_pdf.py)"
                )

        if "messages" not in st.session_state:
            st.session_state.messages = []
            st.session_state.messages.append(
                {
                    "role": "assistant",
                    "content": "Hi, I'm a chatbot who can chat with the PDF. How can I help you?",
                    "avatar": "🤖",
                }
            )

        # Display chat messages from history on app rerun
        for message in st.session_state.messages:
            with st.chat_message(message["role"], avatar=message["avatar"]):
                st.markdown(message["content"])

        # React to user input
        if question := st.chat_input("Ask a question based on the PDF"):
            # Display user message in chat message container
            st.chat_message("user").markdown(question)

            # Add user message to chat history
            st.session_state.messages.append(
                {"role": "user", "content": question, "avatar": "👤"}
            )

            # Add placeholder for streaming the response
            with st.chat_message("assistant", avatar=couchbase_logo):
                # Get the response from the RAG & stream it.
                # To use the cache, we invoke the chain without streaming and then
                # stream the response locally, as the OpenAI integration cannot
                # stream responses from the cache yet.
                # Ref: https://github.com/langchain-ai/langchain/issues/9762

                rag_response = chain.invoke(question)

                st.write_stream(stream_string(rag_response))

            st.session_state.messages.append(
                {
                    "role": "assistant",
                    "content": rag_response,
                    "avatar": couchbase_logo,
                }
            )

            # Get the response from the pure LLM & stream it
            pure_llm_response = chain_without_rag.invoke(question)

            # Add placeholder for streaming the response
            with st.chat_message("ai", avatar="🤖"):
                st.write_stream(stream_string(pure_llm_response))

            st.session_state.messages.append(
                {
                    "role": "assistant",
                    "content": pure_llm_response,
                    "avatar": "🤖",
                }
            )
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain-couchbase==0.3.0
streamlit==1.45.1
langchain-community==0.3.24
pypdf==5.6.0
langchain-openai==0.3.18
--------------------------------------------------------------------------------