├── .env
├── README.md
├── app_local.py
├── apppp.py
├── pic.jpg
└── requirements.txt

/.env:
--------------------------------------------------------------------------------
GROQ_API_KEY=
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Build a multi-PDF chat app with Llama 3 that offers high-speed inference (via Groq) and conversational memory.

![image](pic.jpg)
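How it works: uploaded PDFs are read with PyPDF2, split into 1200-character chunks (50-character overlap) with a `RecursiveCharacterTextSplitter`, embedded with Ollama's `nomic-embed-text` model, and indexed in a Chroma vector store. A `ConversationalRetrievalChain` with conversation memory then answers questions over the indexed chunks and lists the retrieved source passages under each answer.

Two entry points are included:

- `apppp.py`: generation with Groq-hosted `llama3-70b-8192` (requires a `GROQ_API_KEY` in `.env`).
- `app_local.py`: fully local generation with Ollama's `llama3`.

Both variants embed with Ollama locally, so the rough setup is: install the dependencies with `pip install -r requirements.txt`, make sure Ollama is running with `nomic-embed-text` pulled (plus `llama3` for the local variant), put your key in `.env`, and start the app with `chainlit run apppp.py` (or `chainlit run app_local.py`).

For reference, the core retrieval flow without the Chainlit UI boils down to roughly the sketch below. It is a simplified illustration, not a file in this repo; `example.pdf` and the sample question are placeholders.

```python
import PyPDF2
from dotenv import load_dotenv
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq

load_dotenv()  # expects GROQ_API_KEY in .env

# Extract and chunk the PDF text ("example.pdf" is a placeholder path)
pdf = PyPDF2.PdfReader("example.pdf")
pdf_text = "".join(page.extract_text() or "" for page in pdf.pages)
splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
chunks = splitter.split_text(pdf_text)

# Embed the chunks with local Ollama embeddings and index them in Chroma
docsearch = Chroma.from_texts(chunks, OllamaEmbeddings(model="nomic-embed-text"))

# Retrieval chain with conversational memory over the indexed chunks
memory = ConversationBufferMemory(
    memory_key="chat_history", output_key="answer", return_messages=True
)
chain = ConversationalRetrievalChain.from_llm(
    ChatGroq(model_name="llama3-70b-8192", temperature=0.2),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    memory=memory,
    return_source_documents=True,
)

res = chain.invoke({"question": "What is this document about?"})
print(res["answer"])
```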
--------------------------------------------------------------------------------
/app_local.py:
--------------------------------------------------------------------------------
"""Chainlit app for chatting with multiple PDFs, running fully locally:
Ollama's llama3 for generation, nomic-embed-text for embeddings, and a Chroma
vector store for retrieval, with conversational memory."""

import PyPDF2
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain_community.chat_models import ChatOllama
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
import chainlit as cl


@cl.on_chat_start
async def on_chat_start():
    files = None  # Initialize variable to store uploaded files

    # Wait for the user to upload files
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload one or more PDF files to begin!",
            accept=["application/pdf"],
            max_size_mb=500,  # Optionally limit the file size
            max_files=10,
            timeout=180,  # Set a timeout for the user response
        ).send()

    # Process each uploaded file
    texts = []
    metadatas = []
    for file in files:
        print(file)  # Print the file object for debugging

        # Read the PDF file
        pdf = PyPDF2.PdfReader(file.path)
        pdf_text = ""
        for page in pdf.pages:
            pdf_text += page.extract_text()

        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
        file_texts = text_splitter.split_text(pdf_text)
        texts.extend(file_texts)

        # Create metadata for each chunk
        file_metadatas = [{"source": f"{i}-{file.name}"} for i in range(len(file_texts))]
        metadatas.extend(file_metadatas)

    # Create a Chroma vector store
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    docsearch = await cl.make_async(Chroma.from_texts)(
        texts, embeddings, metadatas=metadatas
    )

    # Initialize message history for the conversation
    message_history = ChatMessageHistory()

    # Memory for conversational context
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        ChatOllama(model="llama3"),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Send an image along with the number of processed files
    elements = [
        cl.Image(name="image", display="inline", path="pic.jpg")
    ]
    # Inform the user that processing has finished and they can now chat
    msg = cl.Message(
        content=f"Processing {len(files)} files done. You can now ask questions!",
        elements=elements,
    )
    await msg.send()

    # Store the chain in the user session
    cl.user_session.set("chain", chain)


@cl.on_message
async def main(message: cl.Message):
    # Retrieve the chain from the user session
    chain = cl.user_session.get("chain")
    # Callbacks are handled asynchronously
    cb = cl.AsyncLangchainCallbackHandler()

    # Call the chain with the user's message content
    res = await chain.ainvoke(message.content, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]

    text_elements = []  # Initialize list to store text elements

    # Process source documents if available
    if source_documents:
        for source_idx, source_doc in enumerate(source_documents):
            source_name = f"source_{source_idx}"
            # Create the text element referenced in the message
            text_elements.append(
                cl.Text(content=source_doc.page_content, name=source_name)
            )
        source_names = [text_el.name for text_el in text_elements]

        # Add source references to the answer
        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"
    # Return the results
    await cl.Message(content=answer, elements=text_elements).send()
--------------------------------------------------------------------------------
/apppp.py:
--------------------------------------------------------------------------------
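"""Chainlit app for chatting with multiple PDFs: Groq-hosted Llama 3
(llama3-70b-8192) for generation, local Ollama nomic-embed-text embeddings,
and a Chroma vector store for retrieval, with conversational memory."""
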
import PyPDF2
from langchain_community.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
import chainlit as cl
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Read the Groq API key from the environment
groq_api_key = os.environ['GROQ_API_KEY']

# Initialize the Groq chat model with the API key, model name, and settings
llm_groq = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="llama3-70b-8192",
    temperature=0.2,
)


@cl.on_chat_start
async def on_chat_start():
    files = None  # Initialize variable to store uploaded files

    # Wait for the user to upload files
    while files is None:
        files = await cl.AskFileMessage(
            content="Please upload one or more PDF files to begin!",
            accept=["application/pdf"],
            max_size_mb=100,  # Optionally limit the file size
            max_files=10,
            timeout=180,  # Set a timeout for the user response
        ).send()

    # Process each uploaded file
    texts = []
    metadatas = []
    for file in files:
        print(file)  # Print the file object for debugging

        # Read the PDF file
        pdf = PyPDF2.PdfReader(file.path)
        pdf_text = ""
        for page in pdf.pages:
            pdf_text += page.extract_text()

        # Split the text into chunks
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=50)
        file_texts = text_splitter.split_text(pdf_text)
        texts.extend(file_texts)

        # Create metadata for each chunk
        file_metadatas = [{"source": f"{i}-{file.name}"} for i in range(len(file_texts))]
        metadatas.extend(file_metadatas)

    # Create a Chroma vector store
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    docsearch = await cl.make_async(Chroma.from_texts)(
        texts, embeddings, metadatas=metadatas
    )

    # Initialize message history for the conversation
    message_history = ChatMessageHistory()

    # Memory for conversational context
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Create a chain that uses the Chroma vector store
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm_groq,
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Send an image along with the number of processed files
    elements = [
        cl.Image(name="image", display="inline", path="pic.jpg")
    ]
    # Inform the user that processing has finished and they can now chat
    msg = cl.Message(
        content=f"Processing {len(files)} files done. You can now ask questions!",
        elements=elements,
    )
    await msg.send()

    # Store the chain in the user session
    cl.user_session.set("chain", chain)


@cl.on_message
async def main(message: cl.Message):
    # Retrieve the chain from the user session
    chain = cl.user_session.get("chain")
    # Callbacks are handled asynchronously
    cb = cl.AsyncLangchainCallbackHandler()

    # Call the chain with the user's message content
    res = await chain.ainvoke(message.content, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]

    text_elements = []  # Initialize list to store text elements

    # Process source documents if available
    if source_documents:
        for source_idx, source_doc in enumerate(source_documents):
            source_name = f"source_{source_idx}"
            # Create the text element referenced in the message
            text_elements.append(
                cl.Text(content=source_doc.page_content, name=source_name)
            )
        source_names = [text_el.name for text_el in text_elements]

        # Add source references to the answer
        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"
    # Return the results
    await cl.Message(content=answer, elements=text_elements).send()
--------------------------------------------------------------------------------
/pic.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsightEdge01/Multi-PDF-llama3Chat/46e57deb90bb13ff419f5b072af9a9148c6573c4/pic.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
#chainlit==1.0.200
chainlit
langchain
langchain_community
PyPDF2
chromadb
groq
langchain-groq
ollama
python-dotenv
--------------------------------------------------------------------------------