├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── chatbot_app.py ├── constants.py ├── ingest.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python image 2 | FROM python:3.10 3 | 4 | # Set the working directory inside the container 5 | WORKDIR /app 6 | 7 | # Copy the requirements.txt file first to leverage Docker cache 8 | COPY requirements.txt . 9 | 10 | # Install required Python packages 11 | RUN pip install -r requirements.txt --default-timeout=100 future 12 | 13 | # Copy the rest of the application files to the container's working directory 14 | COPY . . 15 | 16 | # Expose the port that Streamlit will run on 17 | EXPOSE 8501 18 | 19 | # Command to run your Streamlit application 20 | CMD ["streamlit", "run", "chatbot_app.py"] 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 AI Anytime 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chat-with-PDF-Chatbot 2 | This chatbot is an interactive app that helps users chat with their PDF documents. It is built on an open-source stack. No OpenAI is required. 3 | 4 | ## Getting Started 5 | 6 | Follow these steps to set up and run the project on your local machine. 7 | 8 | 9 | ### Installation 10 | 11 | ```sh 12 | ## Clone the repository 13 | git clone <repository-url> 14 | 15 | ## Create the necessary folders 16 | mkdir db 17 | mkdir models 18 | ## Add your model files to the 'models' folder 19 | mkdir docs 20 | ``` 21 | ---- 22 | ### Usage 23 | 24 | ## Run the ingestion script to prepare the data 25 | 26 | `python ingest.py` 27 | 28 | ## Start the chatbot application using Streamlit 29 | 30 | `streamlit run chatbot_app.py` -------------------------------------------------------------------------------- /chatbot_app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import os 3 | import base64 4 | import time 5 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 6 | from transformers import pipeline 7 | import torch 8 | import textwrap 9 | from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader 10 | from langchain.text_splitter import RecursiveCharacterTextSplitter 11 | from
langchain.embeddings import SentenceTransformerEmbeddings 12 | from langchain.vectorstores import Chroma 13 | from langchain.llms import HuggingFacePipeline 14 | from langchain.chains import RetrievalQA 15 | from constants import CHROMA_SETTINGS 16 | from streamlit_chat import message 17 | 18 | st.set_page_config(layout="wide") 19 | 20 | device = torch.device('cpu') 21 | 22 | checkpoint = "MBZUAI/LaMini-T5-738M" 23 | print(f"Checkpoint path: {checkpoint}") # Add this line for debugging 24 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 25 | base_model = AutoModelForSeq2SeqLM.from_pretrained( 26 | checkpoint, 27 | device_map=device, 28 | torch_dtype=torch.float32 29 | ) 30 | 31 | persist_directory = "db" 32 | 33 | @st.cache_resource 34 | def data_ingestion(): 35 | for root, dirs, files in os.walk("docs"): 36 | for file in files: 37 | if file.endswith(".pdf"): 38 | print(file) 39 | loader = PDFMinerLoader(os.path.join(root, file)) 40 | documents = loader.load() 41 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=500) 42 | texts = text_splitter.split_documents(documents) 43 | #create embeddings here 44 | embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2") 45 | #create vector store here 46 | db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS) 47 | db.persist() 48 | db=None 49 | 50 | @st.cache_resource 51 | def llm_pipeline(): 52 | pipe = pipeline( 53 | 'text2text-generation', 54 | model = base_model, 55 | tokenizer = tokenizer, 56 | max_length = 256, 57 | do_sample = True, 58 | temperature = 0.3, 59 | top_p= 0.95, 60 | device=device 61 | ) 62 | local_llm = HuggingFacePipeline(pipeline=pipe) 63 | return local_llm 64 | 65 | @st.cache_resource 66 | def qa_llm(): 67 | llm = llm_pipeline() 68 | embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2") 69 | db = Chroma(persist_directory="db", embedding_function = embeddings, 
client_settings=CHROMA_SETTINGS) 70 | retriever = db.as_retriever() 71 | qa = RetrievalQA.from_chain_type( 72 | llm = llm, 73 | chain_type = "stuff", 74 | retriever = retriever, 75 | return_source_documents=True 76 | ) 77 | return qa 78 | 79 | def process_answer(instruction): 80 | response = '' 81 | instruction = instruction 82 | qa = qa_llm() 83 | generated_text = qa(instruction) 84 | answer = generated_text['result'] 85 | return answer 86 | 87 | def get_file_size(file): 88 | file.seek(0, os.SEEK_END) 89 | file_size = file.tell() 90 | file.seek(0) 91 | return file_size 92 | 93 | @st.cache_data 94 | #function to display the PDF of a given file 95 | def displayPDF(file): 96 | # Opening file from file path 97 | with open(file, "rb") as f: 98 | base64_pdf = base64.b64encode(f.read()).decode('utf-8') 99 | 100 | # Embedding PDF in HTML 101 | pdf_display = F'' 102 | 103 | # Displaying File 104 | st.markdown(pdf_display, unsafe_allow_html=True) 105 | 106 | # Display conversation history using Streamlit messages 107 | def display_conversation(history): 108 | for i in range(len(history["generated"])): 109 | message(history["past"][i], is_user=True, key=str(i) + "_user") 110 | message(history["generated"][i],key=str(i)) 111 | 112 | def main(): 113 | st.markdown("

Chat with your PDF 🦜📄

", unsafe_allow_html=True) 114 | st.markdown("

Built by AI Anytime with ❤️

", unsafe_allow_html=True) 115 | 116 | st.markdown("

Upload your PDF 👇

", unsafe_allow_html=True) 117 | 118 | uploaded_file = st.file_uploader("", type=["pdf"]) 119 | 120 | if uploaded_file is not None: 121 | file_details = { 122 | "Filename": uploaded_file.name, 123 | "File size": get_file_size(uploaded_file) 124 | } 125 | filepath = "docs/"+uploaded_file.name 126 | with open(filepath, "wb") as temp_file: 127 | temp_file.write(uploaded_file.read()) 128 | 129 | col1, col2= st.columns([1,2]) 130 | with col1: 131 | st.markdown("

File details

", unsafe_allow_html=True) 132 | st.json(file_details) 133 | st.markdown("

File preview

", unsafe_allow_html=True) 134 | pdf_view = displayPDF(filepath) 135 | 136 | with col2: 137 | with st.spinner('Embeddings are in process...'): 138 | ingested_data = data_ingestion() 139 | st.success('Embeddings are created successfully!') 140 | st.markdown("

Chat Here

", unsafe_allow_html=True) 141 | 142 | 143 | user_input = st.text_input("", key="input") 144 | 145 | # Initialize session state for generated responses and past messages 146 | if "generated" not in st.session_state: 147 | st.session_state["generated"] = ["I am ready to help you"] 148 | if "past" not in st.session_state: 149 | st.session_state["past"] = ["Hey there!"] 150 | 151 | # Search the database for a response based on user input and update session state 152 | if user_input: 153 | answer = process_answer({'query': user_input}) 154 | st.session_state["past"].append(user_input) 155 | response = answer 156 | st.session_state["generated"].append(response) 157 | 158 | # Display conversation history using Streamlit messages 159 | if st.session_state["generated"]: 160 | display_conversation(st.session_state) 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | 171 | 172 | -------------------------------------------------------------------------------- /constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | import chromadb 3 | from chromadb.config import Settings 4 | CHROMA_SETTINGS = Settings( 5 | chroma_db_impl='duckdb+parquet', 6 | persist_directory='db', 7 | anonymized_telemetry=False 8 | ) 9 | -------------------------------------------------------------------------------- /ingest.py: -------------------------------------------------------------------------------- 1 | from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader 2 | from langchain.text_splitter import RecursiveCharacterTextSplitter 3 | from langchain.embeddings import SentenceTransformerEmbeddings 4 | from langchain.vectorstores import Chroma 5 | import os 6 | from constants import CHROMA_SETTINGS 7 | 8 | persist_directory = "db" 9 | 10 | def main(): 11 | for root, dirs, files in os.walk("docs"): 12 | for file in files: 13 | if file.endswith(".pdf"): 14 | print(file) 15 | 
loader = PyPDFLoader(os.path.join(root, file)) 16 | documents = loader.load() 17 | print("splitting into chunks") 18 | text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100) 19 | texts = text_splitter.split_documents(documents) 20 | #create embeddings here 21 | print("Loading sentence transformers model") 22 | embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2") 23 | #create vector store here 24 | print(f"Creating embeddings. May take some minutes...") 25 | db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS) 26 | db.persist() 27 | db=None 28 | 29 | print(f"Ingestion complete! You can now run privateGPT.py to query your documents") 30 | 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.267 2 | streamlit==1.25.0 3 | transformers==4.31.0 4 | torch==2.0.1 5 | einops==0.6.1 6 | bitsandbytes==0.41.1 7 | accelerate==0.21.0 8 | pdfminer.six==20221105 9 | bs4==0.0.1 10 | sentence_transformers 11 | duckdb==0.7.1 12 | chromadb==0.3.26 13 | beautifulsoup4==4.12.2 14 | sentence-transformers==2.2.2 15 | sentencepiece==0.1.99 16 | six==1.16.0 17 | requests==2.31.0 18 | uvicorn==0.18.3 19 | torch==2.0.1 20 | torchvision==0.15.2 21 | streamlit-chat 22 | --------------------------------------------------------------------------------