├── LICENSE
├── README.md
├── main.py
├── pdfquery.py
└── streamlitui.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 ncodepro
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ChatPDF
 2 | Chat with any PDF.
 3 | 
 4 | Easily upload the PDF documents you'd like to chat with. Instant answers. Ask questions, extract information, and summarize documents with AI. Sources included.
 5 | 
 6 | Create an app like ChatPDF or PDF.ai in fewer than 10 lines of code
 7 | 
 8 | Getting Started
 9 | Code is up now, ⭐ (Star) the repo to receive updates
10 | 
11 | Replit and Colab version coming soon
12 | 
13 | Follow [Ankur Singh](https://twitter.com/ankur_maker) on twitter for updates
14 | 
15 | How to run? (Steps may vary depending on your OS)
16 | 
17 | Create a virtual environment in python https://docs.python.org/3/library/venv.html
18 | 
19 | Run "pip install langchain pypdf chromadb openai tiktoken"
20 | 
21 | Set the OPENAI_API_KEY environment variable to your OpenAI API key
22 | 
23 | Run "python main.py"
24 | 
25 | Change the PDF file path and the query in main.py to ask other questions
26 | 
27 | To run the Streamlit app, first run "pip install streamlit streamlit-chat pypdfium2", then run "streamlit run streamlitui.py"
28 | 
29 | Demo link
30 | https://heybot.thesamur.ai/
31 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | # Import necessary libraries for pdf loading, vectorization, embeddings,
 2 | # LLM model, and question answering chain.
 3 | from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader
 4 | from langchain.vectorstores import Chroma
 5 | from langchain.embeddings.openai import OpenAIEmbeddings
 6 | from langchain.llms import OpenAI
 7 | from langchain.chains.question_answering import load_qa_chain
 8 | import os
 9 | 
10 | # Set OpenAI API key as an environment variable.
11 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_KEY"
12 | 
13 | # Specify the path to the pdf file.
14 | pdf_path = "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf"
15 | 
16 | # Create a PyPDFLoader object with the pdf path.
17 | loader = PyPDFLoader(pdf_path)
18 | 
19 | # Load and split the pages of the pdf document.
20 | pages = loader.load_and_split()
21 | 
22 | # Create an OpenAIEmbeddings object for embeddings.
23 | embeddings = OpenAIEmbeddings()
24 | 
25 | # Create a Chroma object from documents for vectorization, and transform it into a document retriever.
26 | docsearch = Chroma.from_documents(pages, embeddings).as_retriever()
27 | 
28 | # Define the query for the model.
29 | query = "What does Naval's thought about on how to be happy"
30 | 
31 | # Retrieve the documents relevant to the query.
32 | docs = docsearch.get_relevant_documents(query)
33 | 
34 | # Load a question answering chain with OpenAI's model (temperature of 0 means deterministic responses).
35 | # Chain_type is set to "stuff".
36 | chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
37 | 
38 | # Run the question answering chain with the relevant documents and the query.
39 | output = chain.run(input_documents=docs, question=query)
40 | 
41 | # Print the output from the question answering chain.
42 | print(output)
43 | 


--------------------------------------------------------------------------------
/pdfquery.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from langchain.embeddings.openai import OpenAIEmbeddings
 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 4 | from langchain.vectorstores import Chroma
 5 | from langchain.document_loaders import PyPDFium2Loader
 6 | from langchain.chains.question_answering import load_qa_chain
 7 | from langchain.llms import OpenAI
 8 | 
 9 | class PDFQuery:
10 |     def __init__(self, openai_api_key=None):
11 |         """
12 |         Initializes the PDFQuery object.
13 | 
14 |         Args:
15 |         openai_api_key (str): The API key for OpenAI.
16 |         """
17 |         self.embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
18 |         os.environ["OPENAI_API_KEY"] = openai_api_key
19 |         self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
20 |         self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
21 |         self.chain = None
22 |         self.db = None
23 | 
24 |     def ask(self, question):
25 |         """
26 |         Asks a question from the ingested document.
27 | 
28 |         Args:
29 |         question (str): The question to be asked.
30 | 
31 |         Returns:
32 |         str: The response to the question.
33 |         """
34 |         if self.chain is None:
35 |             return "Please, add a document."
36 |         
37 |         docs = self.db.get_relevant_documents(question)
38 |         return self.chain.run(input_documents=docs, question=question)
39 | 
40 |     def ingest(self, file_path):
41 |         """
42 |         Ingests a document from a file path.
43 | 
44 |         Args:
45 |         file_path (str): The path to the file.
46 |         """
47 |         loader = PyPDFium2Loader(file_path)
48 |         documents = loader.load()
49 |         splitted_documents = self.text_splitter.split_documents(documents)
50 |         self.db = Chroma.from_documents(splitted_documents, self.embeddings).as_retriever()
51 |         self.chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")
52 | 
53 |     def forget(self):
54 |         """
55 |         Removes the ingested documents and resets the chain.
56 |         """
57 |         self.db = None
58 |         self.chain = None
59 | 


--------------------------------------------------------------------------------
/streamlitui.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | import streamlit as st
  4 | from streamlit_chat import message
  5 | from pdfquery import PDFQuery
  6 | 
  7 | # Set the page title of the app to "ChatPDF".
  8 | st.set_page_config(page_title="ChatPDF")
  9 | 
 10 | def display_messages():
 11 |     """
 12 |     Display chat messages in the Streamlit app.
 13 |     """
 14 |     st.subheader("Chat")
 15 |     for i, (msg, is_user) in enumerate(st.session_state["messages"]):
 16 |         message(msg, is_user=is_user, key=str(i))
 17 |     st.session_state["thinking_spinner"] = st.empty()
 18 | 
 19 | def process_input():
 20 |     """
 21 |     Process user input in the chat interface.
 22 |     """
 23 |     if st.session_state["user_input"] and len(st.session_state["user_input"].strip()) > 0:
 24 |         user_text = st.session_state["user_input"].strip()
 25 |         with st.session_state["thinking_spinner"], st.spinner(f"Thinking"):
 26 |             query_text = st.session_state["pdfquery"].ask(user_text)
 27 | 
 28 |         st.session_state["messages"].append((user_text, True))
 29 |         st.session_state["messages"].append((query_text, False))
 30 | 
 31 | def read_and_save_file():
 32 |     """
 33 |     Read and save files uploaded by the user.
 34 |     """
 35 |     st.session_state["pdfquery"].forget()  # to reset the knowledge base
 36 |     st.session_state["messages"] = []
 37 |     st.session_state["user_input"] = ""
 38 | 
 39 |     for file in st.session_state["file_uploader"]:
 40 |         with tempfile.NamedTemporaryFile(delete=False) as tf:
 41 |             tf.write(file.getbuffer())
 42 |             file_path = tf.name
 43 | 
 44 |         with st.session_state["ingestion_spinner"], st.spinner(f"Ingesting {file.name}"):
 45 |             st.session_state["pdfquery"].ingest(file_path)
 46 |         os.remove(file_path)
 47 | 
 48 | def is_openai_api_key_set() -> bool:
 49 |     """
 50 |     Check if the OpenAI API key is set.
 51 | 
 52 |     Returns:
 53 |     bool: True if the OpenAI API key is set, False otherwise.
 54 |     """
 55 |     return len(st.session_state["OPENAI_API_KEY"]) > 0
 56 | 
 57 | def main():
 58 |     """
 59 |     Main function to run the Streamlit app.
 60 |     """
 61 |     if len(st.session_state) == 0:
 62 |         st.session_state["messages"] = []
 63 |         st.session_state["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY", "")
 64 |         if is_openai_api_key_set():
 65 |             st.session_state["pdfquery"] = PDFQuery(st.session_state["OPENAI_API_KEY"])
 66 |         else:
 67 |             st.session_state["pdfquery"] = None
 68 | 
 69 |     st.header("ChatPDF")
 70 | 
 71 |     if st.text_input("OpenAI API Key", value=st.session_state["OPENAI_API_KEY"], key="input_OPENAI_API_KEY", type="password"):
 72 |         if (
 73 |             len(st.session_state["input_OPENAI_API_KEY"]) > 0
 74 |             and st.session_state["input_OPENAI_API_KEY"] != st.session_state["OPENAI_API_KEY"]
 75 |         ):
 76 |             st.session_state["OPENAI_API_KEY"] = st.session_state["input_OPENAI_API_KEY"]
 77 |             if st.session_state["pdfquery"] is not None:
 78 |                 st.warning("Please, upload the files again.")
 79 |             st.session_state["messages"] = []
 80 |             st.session_state["user_input"] = ""
 81 |             st.session_state["pdfquery"] = PDFQuery(st.session_state["OPENAI_API_KEY"])
 82 | 
 83 |     st.subheader("Upload a document")
 84 |     st.file_uploader(
 85 |         "Upload document",
 86 |         type=["pdf"],
 87 |         key="file_uploader",
 88 |         on_change=read_and_save_file,
 89 |         label_visibility="collapsed",
 90 |         accept_multiple_files=True,
 91 |         disabled=not is_openai_api_key_set(),
 92 |     )
 93 | 
 94 |     st.session_state["ingestion_spinner"] = st.empty()
 95 | 
 96 |     display_messages()
 97 |     st.text_input("Message", key="user_input", disabled=not is_openai_api_key_set(), on_change=process_input)
 98 | 
 99 |     st.divider()
100 |     st.markdown("Source code: [Github](https://github.com/ncodepro/pdfchatbot)")
101 | 
102 | 
103 | if __name__ == "__main__":  # build the page only when run as a script, not when imported
104 |     main()
105 | 


--------------------------------------------------------------------------------