├── main.py ├── LICENSE ├── README.md ├── pdfquery.py ├── requirements.txt ├── streamlitui.py └── Gemini_ChatPDF.ipynb /main.py: -------------------------------------------------------------------------------- 1 | from langchain.document_loaders import PyPDFLoader 2 | from langchain.vectorstores import Chroma 3 | from langchain.embeddings.openai import OpenAIEmbeddings 4 | from langchain.document_loaders import UnstructuredPDFLoader 5 | # from langchain.llms import OpenAI 6 | from langchain.chat_models import ChatOpenAI 7 | from langchain.chains.question_answering import load_qa_chain 8 | 9 | # Replace book.pdf with any pdf of your choice 10 | loader = UnstructuredPDFLoader("book.pdf") 11 | pages = loader.load_and_split() 12 | embeddings = OpenAIEmbeddings() 13 | docsearch = Chroma.from_documents(pages, embeddings).as_retriever() 14 | 15 | # Choose any query of your choice 16 | query = "Who is Rich Dad?" 17 | docs = docsearch.get_relevant_documents(query) 18 | # chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff") 19 | chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff") 20 | output = chain.run(input_documents=docs, question=query) 21 | print(output) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Anil Chandra Naidu Matcha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ChatPDF 2 | 3 | Chat with any PDF. 4 | 5 | Easily upload the PDF documents you'd like to chat with. Instant answers. Ask questions, extract information, and summarize documents with AI. Sources included. 6 | 7 | ### Tutorial -> https://www.youtube.com/watch?v=3aRc1ijrTVs 8 | 9 | Create app like [ChatPDF](https://www.thesamur.ai/chatpdf-alternative) or [PDF.ai](https://pdf.ai/) in less than 10 lines of code 10 | 11 | ### Getting Started 12 | 13 | Code is up now, ⭐ (Star) the repo to receive updates 14 | 15 | Replit and streamlit version coming soon 16 | 17 | Follow [Anil Chandra Naidu Matcha](https://twitter.com/matchaman11) on twitter for updates 18 | 19 | Subscribe to https://www.youtube.com/@AnilChandraNaiduMatcha for more such video tutorials 20 | 21 | ### How to run ? (Things might change based on OS) 22 | 23 | 1. Create a virtual environment in python https://docs.python.org/3/library/venv.html 24 | 25 | 2. Run "pip install -r requirements.txt" 26 | 27 | 3. Set OPENAI_API_KEY environment variable with your openai key 28 | 29 | 4. Run "python main.py" 30 | 31 | 5. Change pdf file and query in code if you want to try with any other content 32 | 33 | To run streamlit app, follow the steps run "streamlit run streamlitui.py" 34 | 35 | Parts of the streamlit code is inspired from [here](https://github.com/viniciusarruda/chatpdf) 36 | 37 | ### Demo link 38 | 39 | https://heybot.thesamur.ai/ 40 | 41 | ### Also check 42 | 43 | [Chat with Website code](https://github.com/Anil-matcha/Website-to-Chatbot) 44 | 45 | [Chat with CSV code](https://github.com/Anil-matcha/Chat-With-Excel) 46 | 47 | [Chat with Youtube code](https://github.com/Anil-matcha/Chat-Youtube) 48 | 49 | [ChatGPT in Discord code](https://github.com/Anil-matcha/DiscordGPT) 50 | -------------------------------------------------------------------------------- /pdfquery.py: -------------------------------------------------------------------------------- 1 | import os 2 | from langchain.embeddings.openai import OpenAIEmbeddings 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter 4 | from langchain.vectorstores import Chroma 5 | from langchain.document_loaders import PyPDFium2Loader 6 | from langchain.chains.question_answering import load_qa_chain 7 | # from langchain.llms import OpenAI 8 | from langchain.chat_models import ChatOpenAI 9 | 10 | 11 | class PDFQuery: 12 | def __init__(self, openai_api_key = None) -> None: 13 | self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 14 | os.environ["OPENAI_API_KEY"] = openai_api_key 15 | self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) 16 | # self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key) 17 | self.llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key) 18 | self.chain = None 19 | self.db = None 20 | 21 | def ask(self, question: str) -> str: 22 | if self.chain is None: 23 | response = "Please, add a document." 24 | else: 25 | docs = self.db.get_relevant_documents(question) 26 | response = self.chain.run(input_documents=docs, question=question) 27 | return response 28 | 29 | def ingest(self, file_path: os.PathLike) -> None: 30 | loader = PyPDFium2Loader(file_path) 31 | documents = loader.load() 32 | splitted_documents = self.text_splitter.split_documents(documents) 33 | self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever() 34 | # self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff") 35 | self.chain = load_qa_chain(ChatOpenAI(temperature=0), chain_type="stuff") 36 | 37 | def forget(self) -> None: 38 | self.db = None 39 | self.chain = None -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | anyio==3.6.2 4 | argilla==1.6.0 5 | async-timeout==4.0.2 6 | attrs==23.1.0 7 | backoff==2.2.1 8 | certifi==2023.5.7 9 | cffi==1.15.1 10 | charset-normalizer==3.1.0 11 | chromadb==0.3.22 12 | click==8.1.3 13 | clickhouse-connect==0.5.23 14 | colorama==0.4.6 15 | commonmark==0.9.1 16 | cryptography==40.0.2 17 | dataclasses-json==0.5.7 18 | Deprecated==1.2.13 19 | duckdb==0.7.1 20 | et-xmlfile==1.1.0 21 | faiss-cpu==1.7.4 22 | fastapi==0.95.1 23 | filelock==3.12.0 24 | frozenlist==1.3.3 25 | fsspec==2023.5.0 26 | greenlet==2.0.2 27 | h11==0.14.0 28 | hnswlib==0.7.0 29 | httpcore==0.16.3 30 | httptools==0.5.0 31 | httpx==0.23.3 32 | huggingface-hub==0.14.1 33 | idna==3.4 34 | importlib-metadata==6.6.0 35 | Jinja2==3.1.2 36 | joblib==1.2.0 37 | langchain==0.0.161 38 | lxml==4.9.2 39 | lz4==4.3.2 40 | Markdown==3.4.3 41 | MarkupSafe==2.1.2 42 | marshmallow==3.19.0 43 | marshmallow-enum==1.5.1 44 | monotonic==1.6 45 | mpmath==1.3.0 46 | msg-parser==1.2.0 47 | multidict==6.0.4 48 | mypy-extensions==1.0.0 49 | networkx==3.1 50 | nltk==3.8.1 51 | numexpr==2.8.4 52 | numpy==1.23.5 53 | olefile==0.46 54 | openai==0.27.6 55 | openapi-schema-pydantic==1.2.4 56 | openpyxl==3.1.2 57 | packaging==23.1 58 | pandas==1.5.3 59 | pdfminer.six==20221105 60 | Pillow==9.5.0 61 | posthog==3.0.1 62 | pycparser==2.21 63 | pydantic==1.10.7 64 | Pygments==2.15.1 65 | pypandoc==1.11 66 | pypdf==3.8.1 67 | python-dateutil==2.8.2 68 | python-docx==0.8.11 69 | python-dotenv==1.0.0 70 | python-magic==0.4.27 71 | python-pptx==0.6.21 72 | pytz==2023.3 73 | PyYAML==6.0 74 | regex==2023.5.5 75 | requests==2.30.0 76 | rfc3986==1.5.0 77 | rich==13.0.1 78 | scikit-learn==1.2.2 79 | scipy==1.10.1 80 | sentence-transformers==2.2.2 81 | sentencepiece==0.1.99 82 | six==1.16.0 83 | sniffio==1.3.0 84 | SQLAlchemy==2.0.12 85 | starlette==0.26.1 86 | sympy==1.11.1 87 | tenacity==8.2.2 88 | threadpoolctl==3.1.0 89 | tiktoken==0.4.0 90 | tokenizers==0.13.3 91 | torch==2.0.1 92 | torchvision==0.15.2 93 | tqdm==4.65.0 94 | transformers==4.28.1 95 | typing-inspect==0.8.0 96 | typing_extensions==4.5.0 97 | unstructured==0.6.3 98 | urllib3==2.0.2 99 | uvicorn==0.22.0 100 | watchfiles==0.19.0 101 | websockets==11.0.3 102 | wrapt==1.14.1 103 | XlsxWriter==3.1.0 104 | yarl==1.9.2 105 | zipp==3.15.0 106 | zstandard==0.21.0 107 | pypdfium2==4.15.0 108 | -------------------------------------------------------------------------------- /streamlitui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import streamlit as st 4 | from streamlit_chat import message 5 | from pdfquery import PDFQuery 6 | 7 | st.set_page_config(page_title="ChatPDF") 8 | 9 | 10 | def display_messages(): 11 | st.subheader("Chat") 12 | for i, (msg, is_user) in enumerate(st.session_state["messages"]): 13 | message(msg, is_user=is_user, key=str(i)) 14 | st.session_state["thinking_spinner"] = st.empty() 15 | 16 | 17 | def process_input(): 18 | if st.session_state["user_input"] and len(st.session_state["user_input"].strip()) > 0: 19 | user_text = st.session_state["user_input"].strip() 20 | with st.session_state["thinking_spinner"], st.spinner(f"Thinking"): 21 | query_text = st.session_state["pdfquery"].ask(user_text) 22 | 23 | st.session_state["messages"].append((user_text, True)) 24 | st.session_state["messages"].append((query_text, False)) 25 | 26 | 27 | def read_and_save_file(): 28 | st.session_state["pdfquery"].forget() # to reset the knowledge base 29 | st.session_state["messages"] = [] 30 | st.session_state["user_input"] = "" 31 | 32 | for file in st.session_state["file_uploader"]: 33 | with tempfile.NamedTemporaryFile(delete=False) as tf: 34 | tf.write(file.getbuffer()) 35 | file_path = tf.name 36 | 37 | with st.session_state["ingestion_spinner"], st.spinner(f"Ingesting {file.name}"): 38 | st.session_state["pdfquery"].ingest(file_path) 39 | os.remove(file_path) 40 | 41 | 42 | def is_openai_api_key_set() -> bool: 43 | return len(st.session_state["OPENAI_API_KEY"]) > 0 44 | 45 | 46 | def main(): 47 | if len(st.session_state) == 0: 48 | st.session_state["messages"] = [] 49 | st.session_state["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "") 50 | if is_openai_api_key_set(): 51 | st.session_state["pdfquery"] = PDFQuery(st.session_state["OPENAI_API_KEY"]) 52 | else: 53 | st.session_state["pdfquery"] = None 54 | 55 | st.header("ChatPDF") 56 | 57 | if st.text_input("OpenAI API Key", value=st.session_state["OPENAI_API_KEY"], key="input_OPENAI_API_KEY", type="password"): 58 | if ( 59 | len(st.session_state["input_OPENAI_API_KEY"]) > 0 60 | and st.session_state["input_OPENAI_API_KEY"] != st.session_state["OPENAI_API_KEY"] 61 | ): 62 | st.session_state["OPENAI_API_KEY"] = st.session_state["input_OPENAI_API_KEY"] 63 | if st.session_state["pdfquery"] is not None: 64 | st.warning("Please, upload the files again.") 65 | st.session_state["messages"] = [] 66 | st.session_state["user_input"] = "" 67 | st.session_state["pdfquery"] = PDFQuery(st.session_state["OPENAI_API_KEY"]) 68 | 69 | st.subheader("Upload a document") 70 | st.file_uploader( 71 | "Upload document", 72 | type=["pdf"], 73 | key="file_uploader", 74 | on_change=read_and_save_file, 75 | label_visibility="collapsed", 76 | accept_multiple_files=True, 77 | disabled=not is_openai_api_key_set(), 78 | ) 79 | 80 | st.session_state["ingestion_spinner"] = st.empty() 81 | 82 | display_messages() 83 | st.text_input("Message", key="user_input", disabled=not is_openai_api_key_set(), on_change=process_input) 84 | 85 | st.divider() 86 | st.markdown("Source code: [Github](https://github.com/Anil-matcha/ChatPDF)") 87 | 88 | 89 | if __name__ == "__main__": 90 | main() -------------------------------------------------------------------------------- /Gemini_ChatPDF.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyN5Se2cWEHsjMjNgFwBd1HT", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "ChatPDF flow\n", 33 | "\n", 34 | "![ChatPDF flow](https://miro.medium.com/v2/resize:fit:1400/1*leoW-Pn0ohWalrUBbzdidA.png)" 35 | ], 36 | "metadata": { 37 | "id": "0V6pj2_r8Pbx" 38 | } 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "source": [ 43 | "Setup Google api key" 44 | ], 45 | "metadata": { 46 | "id": "_H6RsKxj4Rxt" 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "source": [ 52 | "import os\n", 53 | "os.environ[\"GOOGLE_API_KEY\"] = \"gemini-api-key\"" 54 | ], 55 | "metadata": { 56 | "id": "5PpWJiu32_rI" 57 | }, 58 | "execution_count": null, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "source": [ 64 | "Install requried dependancies" 65 | ], 66 | "metadata": { 67 | "id": "cT2EMASV4aCm" 68 | } 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "colab": { 75 | "base_uri": "https://localhost:8080/" 76 | }, 77 | "id": "6O5CTvJzPZ6T", 78 | "outputId": "38b6eac2-828c-47ba-b9d9-606cefd5ec8b" 79 | }, 80 | "outputs": [ 81 | { 82 | "output_type": "stream", 83 | "name": "stdout", 84 | "text": [ 85 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m811.8/811.8 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 86 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.6/17.6 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 87 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m284.0/284.0 kB\u001b[0m \u001b[31m29.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 88 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m132.8/132.8 kB\u001b[0m \u001b[31m12.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 89 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.4/239.4 kB\u001b[0m \u001b[31m22.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 90 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m45.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 91 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.7/55.7 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 92 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 93 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 94 | "\u001b[?25h" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "%pip install -U --quiet langchain-google-genai langchain faiss-cpu pypdf sentence-transformers" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "source": [ 105 | "Add necessary imports" 106 | ], 107 | "metadata": { 108 | "id": "ba_-ophd4nwH" 109 | } 110 | }, 111 | { 112 | "cell_type": "code", 113 | "source": [ 114 | "from langchain.document_loaders import PyPDFLoader\n", 115 | "from langchain.embeddings import HuggingFaceEmbeddings\n", 116 | "from langchain_google_genai import ChatGoogleGenerativeAI\n", 117 | "from langchain import FAISS" 118 | ], 119 | "metadata": { 120 | "id": "NsKZYeSf4im_" 121 | }, 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "source": [ 128 | "Load the documents" 129 | ], 130 | "metadata": { 131 | "id": "e156mdeh4xZq" 132 | } 133 | }, 134 | { 135 | "cell_type": "code", 136 | "source": [ 137 | "loader = PyPDFLoader(\"./Apps.pdf\")\n", 138 | "pages = loader.load_and_split()" 139 | ], 140 | "metadata": { 141 | "id": "Bpef2RpP3N_o" 142 | }, 143 | "execution_count": null, 144 | "outputs": [] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "source": [ 149 | "Create a vector db index" 150 | ], 151 | "metadata": { 152 | "id": "lhNGhtHe4y5a" 153 | } 154 | }, 155 | { 156 | "cell_type": "code", 157 | "source": [ 158 | "embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", 159 | "db = FAISS.from_documents(pages, embeddings)" 160 | ], 161 | "metadata": { 162 | "id": "KZa9CsMY3Peo" 163 | }, 164 | "execution_count": null, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "source": [ 170 | "Search relevant docs" 171 | ], 172 | "metadata": { 173 | "id": "w4bkl40C45nl" 174 | } 175 | }, 176 | { 177 | "cell_type": "code", 178 | "source": [ 179 | "query = \"What is Gista?\"\n", 180 | "docs = db.similarity_search(query)" 181 | ], 182 | "metadata": { 183 | "id": "TQ8-IEnG3Rhd" 184 | }, 185 | "execution_count": null, 186 | "outputs": [] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "source": [ 191 | "Invoke Gemini api with the qa prompt" 192 | ], 193 | "metadata": { 194 | "id": "P8i6Xnes5Kg2" 195 | } 196 | }, 197 | { 198 | "cell_type": "code", 199 | "source": [ 200 | "content = \"\\n\".join([x.page_content for x in docs])\n", 201 | "qa_prompt = \"Use the following pieces of context to answer the user's question. If you don't know the answer, just say that you don't know, don't try to make up an answer.----------------\"\n", 202 | "input_text = qa_prompt+\"\\nContext:\"+content+\"\\nUser question:\\n\"+query\n", 203 | "llm = ChatGoogleGenerativeAI(model=\"gemini-pro\")\n", 204 | "result = llm.invoke(input_text)\n", 205 | "result.content" 206 | ], 207 | "metadata": { 208 | "colab": { 209 | "base_uri": "https://localhost:8080/", 210 | "height": 53 211 | }, 212 | "id": "qw6FRgUz3ZBc", 213 | "outputId": "af31e57c-c3ee-4fbb-81d6-0728ab05f764" 214 | }, 215 | "execution_count": null, 216 | "outputs": [ 217 | { 218 | "output_type": "execute_result", 219 | "data": { 220 | "text/plain": [ 221 | "'Gista is a professional AI tool that empowers businesses to build their own ChatGPT Plugin, an AI assistant, and deploy custom chatbots on their websites.'" 222 | ], 223 | "application/vnd.google.colaboratory.intrinsic+json": { 224 | "type": "string" 225 | } 226 | }, 227 | "metadata": {}, 228 | "execution_count": 12 229 | } 230 | ] 231 | } 232 | ] 233 | } --------------------------------------------------------------------------------