├── .env.example
├── diagram.png
├── src
    ├── __init__.py
    ├── pipeline.py
    ├── imageprocessing.py
    ├── docparser.py
    ├── doc_qa.py
    └── chunkers.py
├── requirements.txt
├── LICENSE
├── main.py
└── README.md


/.env.example:
--------------------------------------------------------------------------------
1 | LLAMA_CLOUD_API_KEY=...
2 | GOOGLE_API_KEY=...
3 | TAVILY_API_KEY=...


--------------------------------------------------------------------------------
/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/santiago68310/RAG-based-multimodal-agent/HEAD/diagram.png


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | from .docparser import DocParser
2 | from .chunkers import Chunker, SemanticChunker, AgenticChunker
3 | from .imageprocessing import ImageProcessor
4 | from .doc_qa import QA, AgenticQA, indexing


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | chromadb==0.5.23
 2 | img2table[easyocr]==1.4.0
 3 | langchain==0.3.13
 4 | langchain-chroma==0.1.4
 5 | langchain_core==0.3.28
 6 | langchain_experimental==0.3.4
 7 | langchain_google_genai==2.0.7
 8 | llama_parse==0.5.18
 9 | opencv_contrib_python_headless==4.10.0.84
10 | opencv_python_headless==4.10.0.84
11 | pymupdf4llm==0.0.17
12 | PyMuPDF==1.24.14
13 | python-dotenv==1.0.1
14 | uuid6==2024.7.10


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 MARK
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from src import pipeline
 3 | 
 4 | parser = argparse.ArgumentParser()
 5 | parser.add_argument(
 6 |     '--InputPath', 
 7 |     help= 'Directory path containing files to be processed, or a single file path')
 8 | 
 9 | parser.add_argument(
10 |     '--parser_name', 
11 |     help='Specify the name of the parser to use for document processing. Possible values: ["LlamaParse", "pymupdf4llm"]'
12 | )
13 | 
14 | parser.add_argument(
15 |     '--chunking_strategy', 
16 |     help='Define the chunking strategy to apply when processing documents. Possible values: ["semantic", "agentic"]'
17 | )
18 | 
19 | parser.add_argument(
20 |     '--retrieval_strategy', 
21 |     help='Specify the retrieval strategy for querying indexed documents. Possible values:["semantic", "agentic"]'
22 | )
23 | 
24 | def main():
25 |     args = parser.parse_args()
26 | 
27 |     pipeline.pipeline(args.InputPath, 
28 |                       parser_name=args.parser_name, 
29 |                       chunking_strategy=args.chunking_strategy, 
30 |                       retrieval_strategy=args.retrieval_strategy)
31 |     
32 | 
33 | if __name__ == '__main__':
34 |     main()


--------------------------------------------------------------------------------
/src/pipeline.py:
--------------------------------------------------------------------------------
 1 | from .chunkers import Chunker
 2 | from .docparser import DocParser
 3 | from .imageprocessing import ImageProcessor
 4 | from glob import glob
 5 | from pathlib import Path
 6 | from .doc_qa import QA, AgenticQA, indexing
 7 | 
 8 | 
 9 | def list_supported_files(inputPath, supported_extensions= [".pdf"]):
10 |     """
11 |     Lists all supported files in the given input path.
12 |     
13 |     Args:
14 |         inputPath (str): The path where files are located.
15 | 
16 |     Returns:
17 |         List[str]: A list of file paths with supported extensions.
18 |     """
19 |     # Retrieve all files matching the input path and filter by supported extensions
20 |     file_list = glob(f"{inputPath}/**/*", recursive=True)
21 |     return [f for f in file_list if Path(f).suffix in supported_extensions]
22 | 
23 | 
24 | def pipeline(inputPath, 
25 |              parser_name, 
26 |              chunking_strategy, 
27 |              retrieval_strategy):
28 | 
29 |     parser= DocParser(parser_name= parser_name)
30 |     chunker= Chunker(chunking_strategy)
31 |     image_processor= ImageProcessor()
32 | 
33 |     files_list= list_supported_files(inputPath)
34 |     chunks, image_documents= [], []
35 | 
36 |     for file_path in files_list:
37 |         print("processing started ...")
38 | 
39 |         text_docs= parser.parsing_function(file_path)
40 |         parser.extract_tables(file_path)
41 | 
42 |         chunks.extend(chunker.build_chunks(text_docs, source= file_path))
43 |         image_documents.extend(image_processor.get_image_documents())
44 | 
45 |     doc_indexing= indexing()
46 |     retriever= doc_indexing.index_documents(chunks + image_documents)
47 | 
48 |     if retrieval_strategy == "agentic":
49 |         agentic_qa= AgenticQA()
50 |         agentic_qa.run(retriever)
51 |         agentic_qa.query()
52 |     else:
53 |         qa = QA(retriever)
54 |         qa.query()
55 | 


--------------------------------------------------------------------------------
/src/imageprocessing.py:
--------------------------------------------------------------------------------
  1 | from langchain_core.prompts import ChatPromptTemplate
  2 | from langchain_core.messages import HumanMessage
  3 | from typing import List
  4 | import glob, time, base64, logging, uuid6
  5 | from pathlib import Path
  6 | from langchain_google_genai import ChatGoogleGenerativeAI
  7 | from langchain_core.documents import Document
  8 | from dotenv import find_dotenv, load_dotenv
  9 | 
 10 | load_dotenv(find_dotenv())
 11 | logging.basicConfig(level=logging.INFO)
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | class ImageProcessor:
 15 |     def __init__(self):
 16 |         self.image_dir= "./parsed_assets/"
 17 |         self.llm = ChatGoogleGenerativeAI(
 18 |             model="gemini-1.5-flash",
 19 |             temperature=0
 20 |         )
 21 | 
 22 |     @staticmethod
 23 |     def retry_with_delay(func, *args, delay=2, retries=30, **kwargs):
 24 |         """
 25 |         Helper method to retry a function call with a delay.
 26 |         """
 27 |         for attempt in range(retries):
 28 |             try:
 29 |                 return func(*args, **kwargs)
 30 |             except Exception as e:
 31 |                 logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying...")
 32 |                 time.sleep(delay)
 33 |         raise RuntimeError("Exceeded maximum retries.")
 34 | 
 35 |     def encode_image(self, image_path):
 36 |         """Getting the base64 string"""
 37 |         with open(image_path, "rb") as image_file:
 38 |             return base64.b64encode(image_file.read()).decode("utf-8")
 39 |     
 40 |     def image_summarize(self, img_base64):
 41 |         """Make image summary"""
 42 |         prompt = """You are an assistant tasked with summarizing images for retrieval. \
 43 |                     These summaries will be embedded and used to retrieve the raw image. \
 44 |                     Give a concise summary of the image that is well optimized for retrieval."""
 45 |         # chat = ChatGoogleGenerativeAI(model="gpt-4-vision-preview", max_tokens=1024)
 46 | 
 47 |         msg = self.llm.invoke(
 48 |             [HumanMessage(
 49 |                     content=[
 50 |                         {"type": "text", "text": prompt},
 51 |                         {"type": "image_url",
 52 |                          "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"},
 53 |                         },]
 54 |                 )])
 55 |         return msg.content
 56 | 
 57 |     def get_image_summaries(self):
 58 |                             # image_paths: List[str])->List[str]:
 59 |         """
 60 |         Generates summaries for a list of images using a generative AI model.
 61 | 
 62 |         Args:
 63 |             image_paths (List[str]): A list of file paths to images.
 64 | 
 65 |         Returns:
 66 |             List[str]: A list of textual summaries for each image.
 67 |         """
 68 |         image_summaries = []
 69 |         # for i, img_path in enumerate(image_paths):
 70 |         for img_path in sorted(glob.glob(f"{self.image_dir}*.png")):
 71 |             base64_image = self.encode_image(img_path)
 72 | 
 73 |             # img_base64_list.append(base64_image)
 74 |             # Append the AI-generated summary to the list
 75 |             image_summaries.append(
 76 |                 self.retry_with_delay(self.image_summarize, base64_image)
 77 |                 )
 78 |         return image_summaries
 79 | 
 80 |     def get_image_documents(self)->List[Document]:
 81 |         """
 82 |         Extracts images from files and generates corresponding text nodes with metadata.
 83 | 
 84 |         Args:
 85 |             files_to_process (List[str]): A list of file paths to extract images from.
 86 | 
 87 |         Returns:
 88 |             List[TextNode]: A list of nodes containing image summaries and metadata.
 89 |         """
 90 |         image_documents = []
 91 |         # Generate summaries for the extracted images
 92 |         image_summaries = self.get_image_summaries()
 93 |         image_paths= sorted(glob.glob(f"{self.image_dir}*.png"))
 94 | 
 95 |         for summary, image_path in zip(image_summaries, image_paths):
 96 |             # Append the created node to the list
 97 |             image_documents.append(
 98 |                 Document(
 99 |                 page_content=summary,
100 |                 metadata={"source": Path(image_path).name},
101 |                 id= str(uuid6.uuid6()),
102 |                 )
103 |             )
104 | 
105 |         return image_documents
106 | 


--------------------------------------------------------------------------------
/src/docparser.py:
--------------------------------------------------------------------------------
  1 | import pymupdf4llm, cv2, fitz, os, io
  2 | from pathlib import Path
  3 | from typing import List
  4 | from llama_parse import LlamaParse
  5 | from img2table.ocr import EasyOCR
  6 | from img2table.document import PDF, Image
  7 | 
  8 | from dotenv import find_dotenv, load_dotenv
  9 | load_dotenv(find_dotenv())
 10 | 
 11 | class DocParser:
 12 |     def __init__(self, parser_name):
 13 |         self.parser_name= parser_name
 14 |         self.assets_dir= "./parsed_assets/"
 15 |         self.parser_function_map= {
 16 |             "LlamaParse": self.with_LlamaParse,
 17 |             "pymupdf4llm": self.with_pymupdf4llm
 18 |         }
 19 |         self.parsing_function= self.parser_function_map[parser_name]
 20 | 
 21 |         # Instantiation of OCR
 22 |         self.ocr = EasyOCR(lang=["en"])
 23 |                 # Ensure the save directory exists
 24 |         os.makedirs(self.assets_dir, exist_ok=True)
 25 | 
 26 |     def parse(self, file_path):
 27 |         text_docs= self.parsing_function(file_path)
 28 |         if self.parser_name=="LlamaParse":
 29 |             self.extract_images(file_path)
 30 |         
 31 |         self.extract_tables(file_path)
 32 |         return text_docs
 33 |     
 34 |     def with_LlamaParse(self, file_path):
 35 |         print("LLamaParse is being used ...")
 36 |         parser = LlamaParse(result_type="markdown", verbose=False)
 37 |         data= parser.load_data(file_path=file_path)
 38 |         text_docs= [x.text for x in data]
 39 |         return text_docs
 40 |     
 41 |     def with_pymupdf4llm(self, file_path):
 42 |         #No need for standalone image extraction step, already done here
 43 |         output = pymupdf4llm.to_markdown(
 44 |             file_path, 
 45 |             write_images=True,
 46 |             image_path=self.assets_dir,
 47 |             extract_words= True,
 48 |             show_progress= False)
 49 | 
 50 |         text_docs= [x["text"].replace("-----", "") 
 51 |                     for x in output]
 52 |         return text_docs
 53 | 
 54 |     def extract_tables(self, file_path):
 55 |         # Instantiation of document, either an image or a PDF
 56 |         if Path(file_path).suffix==".pdf":
 57 |             doc = PDF(file_path)
 58 |         else:
 59 |             doc = Image(file_path)
 60 |         # Table extraction
 61 |         extracted_tables = doc.extract_tables(ocr=self.ocr,
 62 |                                             implicit_rows=True,
 63 |                                             implicit_columns=True,
 64 |                                             borderless_tables=True)
 65 |         
 66 |         margin= 20
 67 |         save_dir= Path(self.assets_dir)
 68 |         file_stem= Path(file_path).stem
 69 | 
 70 |         for p, (image, tables) in enumerate(
 71 |             zip(doc._images, 
 72 |                 extracted_tables.values())):
 73 |             for i, t in enumerate(tables):
 74 |                 table_image= image[t.bbox.y1-margin:t.bbox.y2+margin,
 75 |                                 t.bbox.x1-margin:t.bbox.x2+margin]
 76 |                 cv2.imwrite(save_dir.joinpath(f"{file_stem}_{p}_table{i}.png"), table_image)
 77 | 
 78 | 
 79 |     def extract_images(self, filepath):
 80 |         """
 81 |         Extracts images from the provided files and saves them to the specified directory.
 82 | 
 83 |         Args:
 84 |             files_to_process (List[str]): List of file paths to extract images from.
 85 |             save_dir (str): Directory to save the extracted images.
 86 | 
 87 |         Returns:
 88 |             None
 89 |         """
 90 |         # for filepath in files_to_process:
 91 |             # Open the document using PyMuPDF
 92 |         doc = fitz.open(filepath)
 93 |         save_dir= Path(self.assets_dir)
 94 | 
 95 |         for p in range(len(doc)):
 96 |             page = doc[p]
 97 | 
 98 |             # Iterate through images on the page
 99 |             for i, img in enumerate(page.get_images(), start=1):
100 |                 xref = img[0]  # Image reference ID
101 | 
102 |                 # Extract image bytes
103 |                 base_image = doc.extract_image(xref)
104 |                 image_bytes = base_image["image"]
105 | 
106 |                 # Create a PIL Image object from the bytes
107 |                 pil_image = Image.open(io.BytesIO(image_bytes))
108 | 
109 |                 # Save the image with a structured name
110 |                 image_name = f"{save_dir.joinpath(Path(filepath).stem)}_{p}_image{i}.png"
111 |                 pil_image.save(image_name)


--------------------------------------------------------------------------------
/src/doc_qa.py:
--------------------------------------------------------------------------------
  1 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
  2 | from langchain import hub
  3 | from langchain_google_genai import ChatGoogleGenerativeAI
  4 | from langchain.chains.combine_documents import create_stuff_documents_chain
  5 | from langchain.chains import create_history_aware_retriever, create_retrieval_chain
  6 | from langchain_core.tools import Tool
  7 | from langchain_community.tools.tavily_search import TavilySearchResults
  8 | from langchain.agents import AgentExecutor, create_react_agent
  9 | from langchain_core.messages import AIMessage, HumanMessage
 10 | 
 11 | from langchain_chroma import Chroma
 12 | from langchain_google_genai import GoogleGenerativeAIEmbeddings
 13 | import uuid6
 14 | 
 15 | class indexing:
 16 |    def __init__(self):
 17 |       self.embedding_function= GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
 18 |       pass
 19 |    
 20 |    def index_documents(self, documents, 
 21 |                        collection_name="Agentic_retrieval", 
 22 |                        top_k= 3):
 23 |       vector_store = Chroma(
 24 |          collection_name= collection_name,
 25 |          embedding_function=self.embedding_function)
 26 |       
 27 |       vector_store.add_documents(
 28 |          documents=documents, 
 29 |          ids=[str(uuid6.uuid6()) for _ in documents])
 30 | 
 31 |       retriever = vector_store.as_retriever(
 32 |         search_type="similarity",
 33 |         search_kwargs={"k": top_k},)
 34 |       
 35 |       return retriever
 36 | 
 37 | class QA:
 38 |   def __init__(self, retriever) -> None:
 39 |     self.system_template = """
 40 |       Answer the user's questions based on the below context. 
 41 |       If the context doesn't contain any relevant information to the question, don't make something up and just say "I don't know":
 42 | 
 43 |       <context>
 44 |       {context}
 45 |       </context>
 46 |       """
 47 | 
 48 |     self.question_answering_prompt = ChatPromptTemplate.from_messages(
 49 |     [("system", self.system_template),
 50 |      MessagesPlaceholder(variable_name="messages"),]
 51 |     )
 52 |     self.retriever= retriever
 53 |     self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
 54 | 
 55 |     self.qa_chain = create_stuff_documents_chain(self.llm, 
 56 |                                                  self.question_answering_prompt
 57 |                                                  )
 58 | 
 59 |   def query(self):
 60 |     while True:
 61 |       query = input("You: ")
 62 |       if query.lower() == "exit":
 63 |           break
 64 |       docs = self.retriever.invoke(query)
 65 | 
 66 |       response = self.qa_chain.invoke(
 67 |           {"context": docs,
 68 |            "messages": [HumanMessage(content=query)]
 69 |            }
 70 |           )
 71 |       print(f"AI: {response}")
 72 | 
 73 | 
 74 | class AgenticQA:
 75 |   def __init__(self) -> None:
 76 |     self.contextualize_q_system_prompt = (
 77 |       "Given a chat history and the latest user question "
 78 |       "which might reference context in the chat history, "
 79 |       "formulate a standalone question which can be understood "
 80 |       "without the chat history. Do NOT answer the question, just "
 81 |       "reformulate it if needed and otherwise return it as is."
 82 |     )
 83 |     self.chat_history = []
 84 | 
 85 |     self.contextualize_q_prompt = ChatPromptTemplate.from_messages(
 86 |       [
 87 |           ("system", self.contextualize_q_system_prompt),
 88 |           MessagesPlaceholder("chat_history"),
 89 |           ("human", "{input}"),
 90 |       ]
 91 |     )
 92 | 
 93 |     self.qa_system_prompt = (
 94 |       "You are an assistant for question-answering tasks. Use "
 95 |       "the following pieces of retrieved context to answer the "
 96 |       "question."
 97 |       "\n\n"
 98 |       "{context}"
 99 |     )
100 |     self.qa_prompt = ChatPromptTemplate.from_messages(
101 |       [
102 |           ("system", self.qa_system_prompt),
103 |           MessagesPlaceholder("chat_history"),
104 |           ("human", "{input}"),
105 |       ]
106 |     )
107 |     self.react_docstore_prompt = hub.pull("aallali/react_tool_priority")
108 |     self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
109 |     
110 |   def create_rag_chain(self, retriever):
111 |     history_aware_retriever = create_history_aware_retriever(
112 |         self.llm, retriever, self.contextualize_q_prompt
113 |     )
114 |     question_answer_chain = create_stuff_documents_chain(self.llm, 
115 |                                                          self.qa_prompt)
116 | 
117 |     self.rag_chain = create_retrieval_chain(
118 |         history_aware_retriever, question_answer_chain)
119 |   
120 |   def create_rag_agent(self):
121 |     self.agent = create_react_agent(
122 |         llm=self.llm,
123 |         tools=self.tools,
124 |         prompt=self.react_docstore_prompt)
125 | 
126 |   def execute_rag_agent(self):
127 |     self.agent_executor = AgentExecutor.from_agent_and_tools(
128 |         agent=self.agent, 
129 |         tools=self.tools, 
130 |         handle_parsing_errors=True, 
131 |         verbose=True,)
132 |     
133 |   def run(self, retriever):
134 |       self.create_rag_chain(retriever)
135 | 
136 |       self.tools = [
137 |           Tool(
138 |           name="Answer Question",
139 |           func=lambda query, **kwargs: self.rag_chain.invoke({
140 |               "input": query,
141 |               "chat_history": kwargs.get("chat_history", [])
142 |           }),
143 |           description=(
144 |               "A chat assistant tool designed to provide answers based on document knowledge. "
145 |               "Maintains the context of previous questions and answers for continuity."),
146 |           ),
147 |           TavilySearchResults(max_results=2)]
148 | 
149 |       self.create_rag_agent()
150 |       self.execute_rag_agent()
151 | 
152 |   def query(self):
153 |       while True:
154 |           query = input("You: ")
155 |           if query.lower() == "exit":
156 |               break
157 |           response = self.agent_executor.invoke(
158 |               {"input": query, 
159 |                "chat_history": self.chat_history})
160 |           print(f"AI: {response['output']}")
161 | 
162 |           # Update history
163 |           self.chat_history.append(HumanMessage(content=query))
164 |           self.chat_history.append(AIMessage(content=response["output"]))


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # RAG-based Multimodal Agent
  2 | 
  3 | A sophisticated Retrieval-Augmented Generation (RAG) system that combines advanced document processing with intelligent retrieval mechanisms to deliver accurate, context-aware responses from multimodal data sources.
  4 | 
  5 | ## Table of Contents
  6 | 
  7 | - [Overview](#overview)
  8 | - [Key Features](#key-features)
  9 | - [Installation](#installation)
 10 | - [Usage](#usage)
 11 | - [API Reference](#api-reference)
 12 | - [Contributing](#contributing)
 13 | - [License](#license)
 14 | 
 15 | ## Overview
 16 | 
 17 | This project implements an advanced RAG framework that enhances traditional retrieval-augmented generation by introducing **agentic approaches** to both document chunking and information retrieval. Unlike conventional RAG systems that rely on static processing methods, this solution dynamically adapts to complex, multimodal data through intelligent decision-making processes.
 18 | 
 19 | ### Key Innovations
 20 | 
 21 | - **Agentic Chunking**: Simulates human-like document segmentation for optimal content organization
 22 | - **Multimodal Processing**: Handles text, images, and tables with specialized extraction and summarization
 23 | - **Intelligent Retrieval**: Implements ReAct-based reasoning for contextually appropriate information retrieval
 24 | - **Dynamic Adaptation**: Automatically adjusts processing strategies based on content complexity
 25 | 
 26 | ## Key Features
 27 | 
 28 | ### 📄 Advanced Document Processing
 29 | 
 30 | - **Semantic Chunking**: Splits documents into semantically coherent, meaningful segments
 31 | - **Agentic Chunking**: Employs iterative, context-aware segmentation that mimics human judgment
 32 |   - Groups sentences based on topic and context
 33 |   - Processes documents iteratively from start to finish
 34 |   - Optimizes chunk boundaries for maximum semantic coherence
 35 |   - [Learn more about Agentic Chunking](https://gleen.ai/blog/agentic-chunking-enhancing-rag-answers-for-completeness-and-accuracy/)
 36 | 
 37 | ### 🖼️ Multimodal Content Handling
 38 | 
 39 | - **Image Detection & Analysis**: Automatically identifies and processes images using PyMuPDF
 40 | - **Table Extraction**: Detects and extracts tabular data using img2table
 41 | - **Content Summarization**: Generates text descriptions of images and tables using Gemini-1.5-Flash
 42 | - **Unified Embedding**: Creates embeddings for all content types using text-embedding-004
 43 | 
 44 | ### 🔍 Intelligent Retrieval Systems
 45 | 
 46 | #### Semantic Retrieval
 47 | - Embedding-based similarity search
 48 | - Vector database integration
 49 | - Context-aware matching
 50 | 
 51 | #### Agentic Retrieval (ReAct Process)
 52 | 1. **Query Rephrasing**: Enhances queries based on chat history context
 53 | 2. **Semantic Retrieval**: Performs initial information gathering
 54 | 3. **Relevance Assessment**: Evaluates retrieved content for query relevance and completeness
 55 | 4. **Dynamic Response**: Generates answers using retrieved documents or web search as needed
 56 | 
 57 | ## Installation
 58 | 
 59 | ### Prerequisites
 60 | 
 61 | - Python 3.8 or higher
 62 | - Git
 63 | 
 64 | ### Setup Instructions
 65 | 
 66 | 1. **Clone the repository**
 67 |    ```bash
 68 |    git clone https://github.com/santiago9631/RAG-based-multimodal-agent.git
 69 |    cd RAG-based-multimodal-agent
 70 |    ```
 71 | 
 72 | 2. **Install dependencies**
 73 |    ```bash
 74 |    pip install -r requirements.txt
 75 |    ```
 76 | 
 77 | 3. **Configure environment variables**
 78 |    
 79 |    Create a `.env` file in the project root with the following variables:
 80 |    ```bash
 81 |    LLAMA_CLOUD_API_KEY=your_llamacloud_api_key_here
 82 |    GOOGLE_API_KEY=your_google_api_key_here
 83 |    TAVILY_API_KEY=your_tavily_api_key_here
 84 |    ```
 85 | 
 86 | ## Usage
 87 | 
 88 | ### Document Processing
 89 | 
 90 | Process your documents using the following command:
 91 | 
 92 | ```bash
 93 | python main.py --InputPath <path> --parser_name <parser> --chunking_strategy <chunking> --retrieval_strategy <retrieval>
 94 | ```
 95 | 
 96 | #### Parameters
 97 | 
 98 | | Parameter | Description | Options |
 99 | |-----------|-------------|---------|
100 | | `--InputPath` | Path to directory containing files or single file path | Any valid file/directory path |
101 | | `--parser_name` | Document parser to use | `LlamaParse`, `pymupdf4llm` |
102 | | `--chunking_strategy` | Document chunking approach | `semantic`, `agentic` |
103 | | `--retrieval_strategy` | Information retrieval method | `semantic`, `agentic` |
104 | 
105 | #### Example Usage
106 | 
107 | ```bash
108 | # Process a single PDF with agentic chunking and retrieval
109 | python main.py --InputPath document.pdf --parser_name LlamaParse --chunking_strategy agentic --retrieval_strategy agentic
110 | 
111 | # Process multiple PDFs in a directory
112 | python main.py --InputPath ./documents/ --parser_name pymupdf4llm --chunking_strategy semantic --retrieval_strategy semantic
113 | ```
114 | 
115 | ### Interactive Querying
116 | 
117 | After processing documents, the system provides an interactive terminal interface for querying:
118 | 
119 | 1. Enter your questions in the terminal
120 | 2. Receive contextually relevant answers based on processed content
121 | 3. Ask follow-up questions for deeper exploration
122 | 
123 | **Note**: Currently supports PDF files only. All PDF files in the input directory will be processed automatically.
124 | 
125 | ## API Reference
126 | 
127 | ### Supported File Formats
128 | - **PDF**: Full support with text, image, and table extraction
129 | 
130 | ### Parser Options
131 | - **LlamaParse**: Advanced PDF parsing with enhanced structure recognition
132 | - **pymupdf4llm**: Lightweight PDF processing optimized for LLM integration
133 | 
134 | ### Chunking Strategies
135 | - **Semantic**: Traditional semantic boundary detection
136 | - **Agentic**: Human-like iterative segmentation process
137 | 
138 | ### Retrieval Strategies
139 | - **Semantic**: Vector similarity-based retrieval
140 | - **Agentic**: ReAct-based intelligent retrieval with dynamic adaptation
141 | 
142 | ## Contributing
143 | 
144 | We welcome contributions! Please feel free to submit issues, feature requests, or pull requests.
145 | 
146 | ## Workflow Diagram
147 | 
148 | The following diagram illustrates the complete workflow of the RAG-based multimodal agent:
149 | 
150 | ![RAG-based Multimodal Agent Workflow](./diagram.png)
151 | 
152 | ## License
153 | 
154 | This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details.
155 | 
156 | 


--------------------------------------------------------------------------------
/src/chunkers.py:
--------------------------------------------------------------------------------
  1 | 
  2 | from langchain_core.prompts import ChatPromptTemplate
  3 | from typing import List
  4 | from langchain_core.pydantic_v1 import BaseModel, Field
  5 | from langchain import hub
  6 | import time, logging, uuid6
  7 | from langchain_core.documents import Document
  8 | from dotenv import find_dotenv, load_dotenv
  9 | from langchain_google_genai import (
 10 |     GoogleGenerativeAIEmbeddings, 
 11 |     ChatGoogleGenerativeAI)
 12 | from langchain_experimental.text_splitter import SemanticChunker
 13 | 
 14 | 
 15 | load_dotenv(find_dotenv())
 16 | 
 17 | logging.basicConfig(level=logging.INFO)
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | class Chunker:
 21 |     def __init__(self, strategy):
 22 |         self.semantic_chunker= SemanticChunker_langchain()
 23 |         self.agentic_chunker= AgenticChunker()
 24 |         self.strategy_chunker_map= {
 25 |             "semantic": self.semantic_chunker,
 26 |             "agentic": self.agentic_chunker
 27 |         }
 28 |         self.chunker= self.strategy_chunker_map[strategy]
 29 | 
 30 |     def build_chunks(self, texts, source):
 31 |         return self.chunker.build_chunks(texts, source)
 32 | 
 33 | class SemanticChunker_langchain:
 34 |     #https://python.langchain.com/v0.2/docs/how_to/semantic-chunker/
 35 |     def __init__(self):
 36 |         self.embed_model_name= "models/text-embedding-004"
 37 | 
 38 |     def build_chunks(self, texts, source):
 39 |         text_splitter = SemanticChunker(
 40 |             GoogleGenerativeAIEmbeddings(
 41 |                 model=self.embed_model_name))
 42 | 
 43 |         chunks= text_splitter.create_documents(
 44 |             texts=texts,
 45 |             metadatas= [{"source": source}]*len(texts)
 46 |             )
 47 |         return chunks
 48 | 
 49 | class ChunkMeta(BaseModel):
 50 |     title: str = Field(description="The title of the chunk.")
 51 |     summary: str = Field(description="The summary of the chunk.")
 52 | 
 53 | class ChunkID(BaseModel):
 54 |     chunk_id: int = Field(description="The chunk id.")
 55 | 
 56 | class Sentences(BaseModel):
 57 |     sentences: List[str]
 58 | 
 59 | class AgenticChunker:
 60 |     def __init__(self):
 61 |         """
 62 |         Initializes the AgenticChunker with:
 63 |         - An empty dictionary for storing chunks.
 64 |         - A large language model (LLM) for processing and summarizing text.
 65 |         - A placeholder for raw text input.
 66 |         """
 67 |         self.chunks = {}
 68 |         self.llm = ChatGoogleGenerativeAI(
 69 |             model="gemini-1.5-flash",
 70 |             temperature=0
 71 |         )
 72 |         # self.raw_text = ""
 73 | 
 74 |     @staticmethod
 75 |     def retry_with_delay(func, *args, delay=2, retries=30, **kwargs):
 76 |         """
 77 |         Helper method to retry a function call with a delay.
 78 |         """
 79 |         for attempt in range(retries):
 80 |             try:
 81 |                 return func(*args, **kwargs)
 82 |             except Exception as e:
 83 |                 logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying...")
 84 |                 time.sleep(delay)
 85 |         raise RuntimeError("Exceeded maximum retries.")
 86 | 
 87 |     def extract_propositions_list(self, raw_text):
 88 |         """
 89 |         Extracts a list of propositions from the raw text using an LLM.
 90 |         """
 91 |         logger.info("Extracting propositions from raw text.")
 92 |         extraction_llm = self.llm.with_structured_output(Sentences)
 93 |         obj = hub.pull("wfh/proposal-indexing")
 94 |         extraction_chain = obj | extraction_llm
 95 |         self.propositions_list = self.retry_with_delay(extraction_chain.invoke, raw_text).sentences
 96 | 
 97 |     def build_chunks(self, raw_text, source=""):
 98 |         """
 99 |         Processes the list of propositions and organizes them into chunks.
100 |         """
101 |         chunks_as_documents=[]
102 |         logger.info("Building chunks from propositions.")
103 |         self.extract_propositions_list(raw_text)
104 |         for proposition in self.propositions_list:
105 |             self.find_chunk_and_push_proposition(proposition)
106 |         
107 |         for chunk_id in self.chunks:
108 |             chunk_content= " ".join(self.chunks[chunk_id]["propositions"])
109 |             chunks_as_documents.append(Document(
110 |                 page_content=chunk_content,
111 |                 metadata={"source": f"{source}_{chunk_id}"},
112 |                 id= str(uuid6.uuid6()),
113 |                 ))
114 | 
115 |         return chunks_as_documents
116 | 
117 |     def create_prompt_template(self, messages):
118 |         """
119 |         Helper method to create prompt templates.
120 |         """
121 |         return ChatPromptTemplate.from_messages(messages)
122 | 
123 |     def upsert_chunk(self, chunk_id, propositions):
124 |         """
125 |         Creates or updates a chunk with the given propositions.
126 |         """
127 |         summary_llm = self.llm.with_structured_output(ChunkMeta)
128 |         prompt = self.create_prompt_template([
129 |             ("system", "Generate a new or updated summary and title based on the propositions."),
130 |             ("user", "propositions:{propositions}")
131 |         ])
132 |         summary_chain = prompt | summary_llm
133 | 
134 |         chunk_meta = self.retry_with_delay(summary_chain.invoke, {"propositions": propositions})
135 |         self.chunks[chunk_id] = {
136 |             "summary": chunk_meta.summary,
137 |             "title": chunk_meta.title,
138 |             "propositions": propositions
139 |         }
140 | 
141 |     def find_chunk_and_push_proposition(self, proposition):
142 |         """
143 |         Finds the most relevant chunk for a proposition or creates a new one if none match.
144 |         """
145 |         logger.info(f"Finding chunk for proposition: {proposition}")
146 |         allocation_llm = self.llm.with_structured_output(ChunkID)
147 |         allocation_prompt = self.create_prompt_template([
148 |             ("system", "Using the chunk IDs and summaries, determine the best chunk for the proposition. "
149 |                       "If no chunk matches, generate a new chunk ID. Return only the chunk ID."),
150 |             ("user", "proposition:{proposition}\nchunks_summaries:{chunks_summaries}")
151 |         ])
152 |         allocation_chain = allocation_prompt | allocation_llm
153 | 
154 |         chunks_summaries = {
155 |             chunk_id: chunk["summary"] for chunk_id, chunk in self.chunks.items()
156 |         }
157 | 
158 |         best_chunk_id = self.retry_with_delay(
159 |             allocation_chain.invoke, {
160 |                 "proposition": proposition,
161 |                 "chunks_summaries": chunks_summaries
162 |             }
163 |         ).chunk_id
164 | 
165 |         if best_chunk_id not in self.chunks:
166 |             logger.info(f"Creating new chunk for proposition: {proposition}")
167 |             self.upsert_chunk(best_chunk_id, [proposition])
168 |         else:
169 |             logger.info(f"Adding proposition to existing chunk ID: {best_chunk_id}")
170 |             current_propositions = self.chunks[best_chunk_id]["propositions"]
171 |             self.upsert_chunk(best_chunk_id, current_propositions + [proposition])
172 | 


--------------------------------------------------------------------------------