├── LICENSE ├── README.md ├── main.py ├── pdfquery.py └── streamlitui.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 ncodepro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ChatPDF 2 | Chat with any PDF. 3 | 4 | Easily upload the PDF documents you'd like to chat with. Instant answers. Ask questions, extract information, and summarize documents with AI. Sources included. 5 | 6 | Create app like ChatPDF or PDF.ai in less than 10 lines of code 7 | 8 | Getting Started 9 | Code is up now, ⭐ (Star) the repo to receive updates 10 | 11 | Replit and Colab version coming soon 12 | 13 | Follow [Ankur Singh](https://twitter.com/ankur_maker) on twitter for updates 14 | 15 | How to run ? (Things might change based on OS) 16 | 17 | Create a virtual environment in python https://docs.python.org/3/library/venv.html 18 | 19 | Run "pip install langchain pypdf chromadb openai tiktoken" 20 | 21 | Set OPENAI_API_KEY environment variable with your openai key 22 | 23 | Run "python main.py" 24 | 25 | Change pdf file path and ask other questions 26 | 27 | To run streamlit app, follow the steps run "streamlit run streamlitui.py" 28 | 29 | Demo link 30 | https://heybot.thesamur.ai/ 31 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Import necessary libraries for pdf loading, vectorization, embeddings, 2 | # LLM model, and question answering chain. 3 | from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader 4 | from langchain.vectorstores import Chroma 5 | from langchain.embeddings.openai import OpenAIEmbeddings 6 | from langchain.llms import OpenAI 7 | from langchain.chains.question_answering import load_qa_chain 8 | import os 9 | 10 | # Set OpenAI API key as an environment variable. 11 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_KEY" 12 | 13 | # Specify the path to the pdf file. 14 | pdf_path = "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf" 15 | 16 | # Create a PyPDFLoader object with the pdf path. 17 | loader = PyPDFLoader(pdf_path) 18 | 19 | # Load and split the pages of the pdf document. 20 | pages = loader.load_and_split() 21 | 22 | # Create an OpenAIEmbeddings object for embeddings. 23 | embeddings = OpenAIEmbeddings() 24 | 25 | # Create a Chroma object from documents for vectorization, and transform it into a document retriever. 26 | docsearch = Chroma.from_documents(pages, embeddings).as_retriever() 27 | 28 | # Define the query for the model. 29 | query = "What does Naval's thought about on how to be happy" 30 | 31 | # Retrieve the documents relevant to the query. 32 | docs = docsearch.get_relevant_documents(query) 33 | 34 | # Load a question answering chain with OpenAI's model (temperature of 0 means deterministic responses). 35 | # Chain_type is set to "stuff". 36 | chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff") 37 | 38 | # Run the question answering chain with the relevant documents and the query. 39 | output = chain.run(input_documents=docs, question=query) 40 | 41 | # Print the output from the question answering chain. 42 | print(output) 43 | -------------------------------------------------------------------------------- /pdfquery.py: -------------------------------------------------------------------------------- 1 | import os 2 | from langchain.embeddings.openai import OpenAIEmbeddings 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter 4 | from langchain.vectorstores import Chroma 5 | from langchain.document_loaders import PyPDFium2Loader 6 | from langchain.chains.question_answering import load_qa_chain 7 | from langchain.llms import OpenAI 8 | 9 | class PDFQuery: 10 | def __init__(self, openai_api_key=None): 11 | """ 12 | Initializes the PDFQuery object. 13 | 14 | Args: 15 | openai_api_key (str): The API key for OpenAI. 16 | """ 17 | self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key) 18 | os.environ["OPENAI_API_KEY"] = openai_api_key 19 | self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) 20 | self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key) 21 | self.chain = None 22 | self.db = None 23 | 24 | def ask(self, question): 25 | """ 26 | Asks a question from the ingested document. 27 | 28 | Args: 29 | question (str): The question to be asked. 30 | 31 | Returns: 32 | str: The response to the question. 33 | """ 34 | if self.chain is None: 35 | return "Please, add a document." 36 | 37 | docs = self.db.get_relevant_documents(question) 38 | return self.chain.run(input_documents=docs, question=question) 39 | 40 | def ingest(self, file_path): 41 | """ 42 | Ingests a document from a file path. 43 | 44 | Args: 45 | file_path (str): The path to the file. 46 | """ 47 | loader = PyPDFium2Loader(file_path) 48 | documents = loader.load() 49 | splitted_documents = self.text_splitter.split_documents(documents) 50 | self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever() 51 | self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff") 52 | 53 | def forget(self): 54 | """ 55 | Removes the ingested documents and resets the chain. 56 | """ 57 | self.db = None 58 | self.chain = None 59 | -------------------------------------------------------------------------------- /streamlitui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import streamlit as st 4 | from streamlit_chat import message 5 | from pdfquery import PDFQuery 6 | 7 | # Set page title 8 | st.set_page_config(page_title="ChatPDF") 9 | 10 | def display_messages(): 11 | """ 12 | Display chat messages in the Streamlit app. 13 | """ 14 | st.subheader("Chat") 15 | for i, (msg, is_user) in enumerate(st.session_state["messages"]): 16 | message(msg, is_user=is_user, key=str(i)) 17 | st.session_state["thinking_spinner"] = st.empty() 18 | 19 | def process_input(): 20 | """ 21 | Process user input in the chat interface. 22 | """ 23 | if st.session_state["user_input"] and len(st.session_state["user_input"].strip()) > 0: 24 | user_text = st.session_state["user_input"].strip() 25 | with st.session_state["thinking_spinner"], st.spinner(f"Thinking"): 26 | query_text = st.session_state["pdfquery"].ask(user_text) 27 | 28 | st.session_state["messages"].append((user_text, True)) 29 | st.session_state["messages"].append((query_text, False)) 30 | 31 | def read_and_save_file(): 32 | """ 33 | Read and save files uploaded by the user. 34 | """ 35 | st.session_state["pdfquery"].forget() # to reset the knowledge base 36 | st.session_state["messages"] = [] 37 | st.session_state["user_input"] = "" 38 | 39 | for file in st.session_state["file_uploader"]: 40 | with tempfile.NamedTemporaryFile(delete=False) as tf: 41 | tf.write(file.getbuffer()) 42 | file_path = tf.name 43 | 44 | with st.session_state["ingestion_spinner"], st.spinner(f"Ingesting {file.name}"): 45 | st.session_state["pdfquery"].ingest(file_path) 46 | os.remove(file_path) 47 | 48 | def is_openai_api_key_set() -> bool: 49 | """ 50 | Check if the OpenAI API key is set. 51 | 52 | Returns: 53 | bool: True if the OpenAI API key is set, False otherwise. 54 | """ 55 | return len(st.session_state["OPENAI_API_KEY"]) > 0 56 | 57 | def main(): 58 | """ 59 | Main function to run the Streamlit app. 60 | """ 61 | if len(st.session_state) == 0: 62 | st.session_state["messages"] = [] 63 | st.session_state["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "") 64 | if is_openai_api_key_set(): 65 | st.session_state["pdfquery"] = PDFQuery(st.session_state["OPENAI_API_KEY"]) 66 | else: 67 | st.session_state["pdfquery"] = None 68 | 69 | st.header("ChatPDF") 70 | 71 | if st.text_input("OpenAI API Key", value=st.session_state["OPENAI_API_KEY"], key="input_OPENAI_API_KEY", type="password"): 72 | if ( 73 | len(st.session_state["input_OPENAI_API_KEY"]) > 0 74 | and st.session_state["input_OPENAI_API_KEY"] != st.session_state["OPENAI_API_KEY"] 75 | ): 76 | st.session_state["OPENAI_API_KEY"] = st.session_state["input_OPENAI_API_KEY"] 77 | if st.session_state["pdfquery"] is not None: 78 | st.warning("Please, upload the files again.") 79 | st.session_state["messages"] = [] 80 | st.session_state["user_input"] = "" 81 | st.session_state["pdfquery"] = PDFQuery(st.session_state["OPENAI_API_KEY"]) 82 | 83 | st.subheader("Upload a document") 84 | st.file_uploader( 85 | "Upload document", 86 | type=["pdf"], 87 | key="file_uploader", 88 | on_change=read_and_save_file, 89 | label_visibility="collapsed", 90 | accept_multiple_files=True, 91 | disabled=not is_openai_api_key_set(), 92 | ) 93 | 94 | st.session_state["ingestion_spinner"] = st.empty() 95 | 96 | display_messages() 97 | st.text_input("Message", key="user_input", disabled=not is_openai_api_key_set(), on_change=process_input) 98 | 99 | st.divider() 100 | st.markdown("Source code: [Github](https://github.com/ncodepro/pdfchatbot)") 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | --------------------------------------------------------------------------------