├── .gitignore
├── LICENSE
├── README.md
├── app.py
├── brain.py
├── compare medium.gif
├── requirements.txt
└── thumbnail.webp

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Avra

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Retrieval Augmented Generation (RAG) for chatbots

RAG-enabled chatbots built with [LangChain](https://www.langchain.com) and [Databutton](https://databutton.com/login?utm_source=github&utm_medium=avra&utm_article=rag)

![](https://github.com/avrabyt/RAG-Chatbot/blob/main/thumbnail.webp)

- Front-end: `app.py`
- PDF parsing and indexing: `brain.py`
- API keys are managed through Databutton's secrets management
- Indexes are stored in the Streamlit session state

Oversimplified explanation: (**Retrieval**) fetch the top N most similar contexts via similarity search over the indexed PDF files -> concatenate them into the prompt (**Prompt Augmentation**) -> pass it to the LLM -> which then generates the response (**Generation**), just like any LLM does.
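To make that flow concrete, here is a minimal sketch condensed from `app.py` and `brain.py` (the `answer` helper and its system-prompt wording are illustrative only, not part of this repo; `vectordb` is assumed to be the FAISS index built by `brain.get_index_for_pdf`):

```python
import openai  # assumes OPENAI_API_KEY is already configured, as in app.py


def answer(question: str, vectordb) -> str:
    # Retrieval: fetch the top-N most similar chunks from the indexed PDFs
    results = vectordb.similarity_search(question, k=3)
    pdf_extract = "\n".join(doc.page_content for doc in results)

    # Prompt augmentation: prepend the retrieved context as the system message
    messages = [
        {"role": "system", "content": f"Answer using this PDF content:\n{pdf_extract}"},
        {"role": "user", "content": question},
    ]

    # Generation: the LLM answers grounded in the retrieved context
    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    return completion.choices[0].message["content"]
```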
**More in the blog!**

**Blog Post - [Here](https://medium.com/databutton/why-your-next-ai-product-needs-rag-implemented-in-it-9ee22f9770c8)**

**YouTube video - [Here](https://youtu.be/Yh1GEWqgkt0)**

To get started quickly, you can use the “Chat with PDF” [template](https://databutton.com/new?templateId=pt-x2Rh7dEYwIuCxXaR) within Databutton 🚀

> Alternatively, you can use [Streamlit](https://streamlit.io) to build and deploy! In that case, a few changes are needed, such as reading the API key from `st.secrets` (see the snippet at the end of this README).

# Similar projects

#### [Building a Simple Chatbot using the ChatGPT API & Databutton with memory 🧠](https://github.com/avrabyt/MemoryBot)

> Memory could also be an interesting addition to this RAG-enabled chatbot.

Repo - [MemoryBot](https://github.com/avrabyt/MemoryBot)

The live demo app is hosted [here](https://next.databutton.com/v/lgzxq112/Memory_Bot)

Blog - [here](https://medium.com/@avra42/how-to-build-a-chatbot-with-chatgpt-api-and-a-conversational-memory-in-python-8d856cda4542)

Video - [here](https://youtu.be/cHjlperESbg)

#### [PDF Chatbot with Memory](https://github.com/avrabyt/PersonalMemoryBot)

> Similar to the Chat with PDF approach, but with memory enabled.

Demo App - [here](https://next.databutton.com/v/lgzxq112/Personalised_Memory_Bot)

Video - [here](https://youtu.be/daMNGGPJkEE)

Blog - [here](https://medium.com/@avra42/how-to-build-a-personalized-pdf-chat-bot-with-conversational-memory-965280c160f8)

![](https://github.com/avrabyt/RAG-Chatbot/blob/main/compare%20medium.gif)
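For the Streamlit-only route mentioned above, the main change is swapping Databutton's secret store for Streamlit's. A minimal sketch, assuming the key is stored as `OPENAI_API_KEY` in `.streamlit/secrets.toml` (the rest of `app.py` stays the same):

```python
import os

import openai
import streamlit as st

# Read the OpenAI key from .streamlit/secrets.toml instead of Databutton secrets
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
openai.api_key = st.secrets["OPENAI_API_KEY"]
```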
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# Import necessary libraries
import databutton as db
import streamlit as st
import openai
from brain import get_index_for_pdf
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import os

# Set the title for the Streamlit app
st.title("RAG enhanced Chatbot")

# Set up the OpenAI API key from Databutton secrets
os.environ["OPENAI_API_KEY"] = db.secrets.get("OPENAI_API_KEY")
openai.api_key = db.secrets.get("OPENAI_API_KEY")


# Cached function to create a vectordb for the provided PDF files
@st.cache_data
def create_vectordb(files, filenames):
    # Show a spinner while creating the vectordb
    with st.spinner("Creating vector database..."):
        vectordb = get_index_for_pdf(
            [file.getvalue() for file in files], filenames, openai.api_key
        )
    return vectordb


# Upload PDF files using Streamlit's file uploader
pdf_files = st.file_uploader("", type="pdf", accept_multiple_files=True)

# If PDF files are uploaded, create the vectordb and store it in the session state
if pdf_files:
    pdf_file_names = [file.name for file in pdf_files]
    st.session_state["vectordb"] = create_vectordb(pdf_files, pdf_file_names)

# Define the template for the chatbot's system prompt
prompt_template = """
You are a helpful Assistant who answers users' questions based on multiple contexts given to you.

Keep your answer short and to the point.

The evidence is the content of the PDF extract, with metadata.

Carefully focus on the metadata, especially 'filename' and 'page', whenever answering.

Make sure to add the filename and page number at the end of the sentence you are citing.

Reply "Not applicable" if the text is irrelevant.

The PDF content is:
{pdf_extract}
"""

# Get the current prompt from the session state or set a default value
prompt = st.session_state.get("prompt", [{"role": "system", "content": "none"}])

# Display previous chat messages
for message in prompt:
    if message["role"] != "system":
        with st.chat_message(message["role"]):
            st.write(message["content"])

# Get the user's question using Streamlit's chat input
question = st.chat_input("Ask anything")

# Handle the user's question
if question:
    vectordb = st.session_state.get("vectordb", None)
    if not vectordb:
        with st.chat_message("assistant"):
            st.write("You need to provide a PDF")
            st.stop()

    # Search the vectordb for content similar to the user's question
    search_results = vectordb.similarity_search(question, k=3)
    pdf_extract = "\n".join([result.page_content for result in search_results])

    # Update the system message with the PDF extract
    prompt[0] = {
        "role": "system",
        "content": prompt_template.format(pdf_extract=pdf_extract),
    }

    # Add the user's question to the prompt and display it
    prompt.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.write(question)

    # Display an empty assistant message while waiting for the response
    with st.chat_message("assistant"):
        botmsg = st.empty()

    # Call ChatGPT with streaming and display the response as it arrives
    response = []
    result = ""
    for chunk in openai.ChatCompletion.create(
        model="gpt-3.5-turbo", messages=prompt, stream=True
    ):
        text = chunk.choices[0].get("delta", {}).get("content")
        if text is not None:
            response.append(text)
            result = "".join(response).strip()
            botmsg.write(result)

    # Add the assistant's response to the prompt
    prompt.append({"role": "assistant", "content": result})

    # Store the updated prompt in the session state
    st.session_state["prompt"] = prompt

--------------------------------------------------------------------------------
/brain.py:
--------------------------------------------------------------------------------
import databutton as db
import re
from io import BytesIO
from typing import Tuple, List
import pickle

from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader
import faiss


def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        # Merge words that were hyphenated across line breaks
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Replace single newlines inside sentences with spaces
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Collapse runs of blank lines into paragraph breaks
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output, filename

def text_to_docs(text: List[str], filename: str) -> List[Document]:
    if isinstance(text, str):
        text = [text]
    page_docs = [Document(page_content=page) for page in text]
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    doc_chunks = []
    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
            )
            doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
            doc.metadata["filename"] = filename  # Add filename to metadata
            doc_chunks.append(doc)
    return doc_chunks


def docs_to_index(docs, openai_api_key):
    index = FAISS.from_documents(docs, OpenAIEmbeddings(openai_api_key=openai_api_key))
    return index


def get_index_for_pdf(pdf_files, pdf_names, openai_api_key):
    documents = []
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        text, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        documents = documents + text_to_docs(text, filename)
    index = docs_to_index(documents, openai_api_key)
    return index

--------------------------------------------------------------------------------
/compare medium.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avrabyt/RAG-Chatbot/c07bdcf04410840e432ef1347a2d2d9aa8b20bdf/compare medium.gif
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
openai
langchain
faiss-cpu
pypdf
tiktoken
--------------------------------------------------------------------------------
/thumbnail.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/avrabyt/RAG-Chatbot/c07bdcf04410840e432ef1347a2d2d9aa8b20bdf/thumbnail.webp
--------------------------------------------------------------------------------