├── .env.example ├── diagram.png ├── src ├── __init__.py ├── pipeline.py ├── imageprocessing.py ├── docparser.py ├── doc_qa.py └── chunkers.py ├── requirements.txt ├── LICENSE ├── main.py └── README.md /.env.example: -------------------------------------------------------------------------------- 1 | LLAMA_CLOUD_API_KEY=... 2 | GOOGLE_API_KEY=... 3 | TAVILY_API_KEY=... -------------------------------------------------------------------------------- /diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/santiago68310/RAG-based-multimodal-agent/HEAD/diagram.png -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | from .docparser import DocParser 2 | from .chunkers import Chunker, SemanticChunker, AgenticChunker 3 | from .imageprocessing import ImageProcessor 4 | from .doc_qa import QA, AgenticQA, indexing -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chromadb==0.5.23 2 | img2table[easyocr]==1.4.0 3 | langchain==0.3.13 4 | langchain-chroma==0.1.4 5 | langchain_core==0.3.28 6 | langchain_experimental==0.3.4 7 | langchain_google_genai==2.0.7 8 | llama_parse==0.5.18 9 | opencv_contrib_python_headless==4.10.0.84 10 | opencv_python_headless==4.10.0.84 11 | pymupdf4llm==0.0.17 12 | PyMuPDF==1.24.14 13 | python-dotenv==1.0.1 14 | uuid6==2024.7.10 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 MARK 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from src import pipeline 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument( 6 | '--InputPath', 7 | help= 'Directory path containing files to be processed, or a single file path') 8 | 9 | parser.add_argument( 10 | '--parser_name', 11 | help='Specify the name of the parser to use for document processing. Possible values: ["LlamaParse", "pymupdf4llm"]' 12 | ) 13 | 14 | parser.add_argument( 15 | '--chunking_strategy', 16 | help='Define the chunking strategy to apply when processing documents. Possible values: ["semantic", "agentic"]' 17 | ) 18 | 19 | parser.add_argument( 20 | '--retrieval_strategy', 21 | help='Specify the retrieval strategy for querying indexed documents. Possible values:["semantic", "agentic"]' 22 | ) 23 | 24 | def main(): 25 | args = parser.parse_args() 26 | 27 | pipeline.pipeline(args.InputPath, 28 | parser_name=args.parser_name, 29 | chunking_strategy=args.chunking_strategy, 30 | retrieval_strategy=args.retrieval_strategy) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() -------------------------------------------------------------------------------- /src/pipeline.py: -------------------------------------------------------------------------------- 1 | from .chunkers import Chunker 2 | from .docparser import DocParser 3 | from .imageprocessing import ImageProcessor 4 | from glob import glob 5 | from pathlib import Path 6 | from .doc_qa import QA, AgenticQA, indexing 7 | 8 | 9 | def list_supported_files(inputPath, supported_extensions= [".pdf"]): 10 | """ 11 | Lists all supported files in the given input path. 12 | 13 | Args: 14 | inputPath (str): The path where files are located. 15 | 16 | Returns: 17 | List[str]: A list of file paths with supported extensions. 18 | """ 19 | # Retrieve all files matching the input path and filter by supported extensions 20 | file_list = glob(f"{inputPath}/**/*", recursive=True) 21 | return [f for f in file_list if Path(f).suffix in supported_extensions] 22 | 23 | 24 | def pipeline(inputPath, 25 | parser_name, 26 | chunking_strategy, 27 | retrieval_strategy): 28 | 29 | parser= DocParser(parser_name= parser_name) 30 | chunker= Chunker(chunking_strategy) 31 | image_processor= ImageProcessor() 32 | 33 | files_list= list_supported_files(inputPath) 34 | chunks, image_documents= [], [] 35 | 36 | for file_path in files_list: 37 | print("processing started ...") 38 | 39 | text_docs= parser.parsing_function(file_path) 40 | parser.extract_tables(file_path) 41 | 42 | chunks.extend(chunker.build_chunks(text_docs, source= file_path)) 43 | image_documents.extend(image_processor.get_image_documents()) 44 | 45 | doc_indexing= indexing() 46 | retriever= doc_indexing.index_documents(chunks + image_documents) 47 | 48 | if retrieval_strategy == "agentic": 49 | agentic_qa= AgenticQA() 50 | agentic_qa.run(retriever) 51 | agentic_qa.query() 52 | else: 53 | qa = QA(retriever) 54 | qa.query() 55 | -------------------------------------------------------------------------------- /src/imageprocessing.py: -------------------------------------------------------------------------------- 1 | from langchain_core.prompts import ChatPromptTemplate 2 | from langchain_core.messages import HumanMessage 3 | from typing import List 4 | import glob, time, base64, logging, uuid6 5 | from pathlib import Path 6 | from langchain_google_genai import ChatGoogleGenerativeAI 7 | from langchain_core.documents import Document 8 | from dotenv import find_dotenv, load_dotenv 9 | 10 | load_dotenv(find_dotenv()) 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | class ImageProcessor: 15 | def __init__(self): 16 | self.image_dir= "./parsed_assets/" 17 | self.llm = ChatGoogleGenerativeAI( 18 | model="gemini-1.5-flash", 19 | temperature=0 20 | ) 21 | 22 | @staticmethod 23 | def retry_with_delay(func, *args, delay=2, retries=30, **kwargs): 24 | """ 25 | Helper method to retry a function call with a delay. 26 | """ 27 | for attempt in range(retries): 28 | try: 29 | return func(*args, **kwargs) 30 | except Exception as e: 31 | logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying...") 32 | time.sleep(delay) 33 | raise RuntimeError("Exceeded maximum retries.") 34 | 35 | def encode_image(self, image_path): 36 | """Getting the base64 string""" 37 | with open(image_path, "rb") as image_file: 38 | return base64.b64encode(image_file.read()).decode("utf-8") 39 | 40 | def image_summarize(self, img_base64): 41 | """Make image summary""" 42 | prompt = """You are an assistant tasked with summarizing images for retrieval. \ 43 | These summaries will be embedded and used to retrieve the raw image. \ 44 | Give a concise summary of the image that is well optimized for retrieval.""" 45 | # chat = ChatGoogleGenerativeAI(model="gpt-4-vision-preview", max_tokens=1024) 46 | 47 | msg = self.llm.invoke( 48 | [HumanMessage( 49 | content=[ 50 | {"type": "text", "text": prompt}, 51 | {"type": "image_url", 52 | "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}, 53 | },] 54 | )]) 55 | return msg.content 56 | 57 | def get_image_summaries(self): 58 | # image_paths: List[str])->List[str]: 59 | """ 60 | Generates summaries for a list of images using a generative AI model. 61 | 62 | Args: 63 | image_paths (List[str]): A list of file paths to images. 64 | 65 | Returns: 66 | List[str]: A list of textual summaries for each image. 67 | """ 68 | image_summaries = [] 69 | # for i, img_path in enumerate(image_paths): 70 | for img_path in sorted(glob.glob(f"{self.image_dir}*.png")): 71 | base64_image = self.encode_image(img_path) 72 | 73 | # img_base64_list.append(base64_image) 74 | # Append the AI-generated summary to the list 75 | image_summaries.append( 76 | self.retry_with_delay(self.image_summarize, base64_image) 77 | ) 78 | return image_summaries 79 | 80 | def get_image_documents(self)->List[Document]: 81 | """ 82 | Extracts images from files and generates corresponding text nodes with metadata. 83 | 84 | Args: 85 | files_to_process (List[str]): A list of file paths to extract images from. 86 | 87 | Returns: 88 | List[TextNode]: A list of nodes containing image summaries and metadata. 89 | """ 90 | image_documents = [] 91 | # Generate summaries for the extracted images 92 | image_summaries = self.get_image_summaries() 93 | image_paths= sorted(glob.glob(f"{self.image_dir}*.png")) 94 | 95 | for summary, image_path in zip(image_summaries, image_paths): 96 | # Append the created node to the list 97 | image_documents.append( 98 | Document( 99 | page_content=summary, 100 | metadata={"source": Path(image_path).name}, 101 | id= str(uuid6.uuid6()), 102 | ) 103 | ) 104 | 105 | return image_documents 106 | -------------------------------------------------------------------------------- /src/docparser.py: -------------------------------------------------------------------------------- 1 | import pymupdf4llm, cv2, fitz, os, io 2 | from pathlib import Path 3 | from typing import List 4 | from llama_parse import LlamaParse 5 | from img2table.ocr import EasyOCR 6 | from img2table.document import PDF, Image 7 | 8 | from dotenv import find_dotenv, load_dotenv 9 | load_dotenv(find_dotenv()) 10 | 11 | class DocParser: 12 | def __init__(self, parser_name): 13 | self.parser_name= parser_name 14 | self.assets_dir= "./parsed_assets/" 15 | self.parser_function_map= { 16 | "LlamaParse": self.with_LlamaParse, 17 | "pymupdf4llm": self.with_pymupdf4llm 18 | } 19 | self.parsing_function= self.parser_function_map[parser_name] 20 | 21 | # Instantiation of OCR 22 | self.ocr = EasyOCR(lang=["en"]) 23 | # Ensure the save directory exists 24 | os.makedirs(self.assets_dir, exist_ok=True) 25 | 26 | def parse(self, file_path): 27 | text_docs= self.parsing_function(file_path) 28 | if self.parser_name=="LlamaParse": 29 | self.extract_images(file_path) 30 | 31 | self.extract_tables(file_path) 32 | return text_docs 33 | 34 | def with_LlamaParse(self, file_path): 35 | print("LLamaParse is being used ...") 36 | parser = LlamaParse(result_type="markdown", verbose=False) 37 | data= parser.load_data(file_path=file_path) 38 | text_docs= [x.text for x in data] 39 | return text_docs 40 | 41 | def with_pymupdf4llm(self, file_path): 42 | #No need for standalone image extraction step, already done here 43 | output = pymupdf4llm.to_markdown( 44 | file_path, 45 | write_images=True, 46 | image_path=self.assets_dir, 47 | extract_words= True, 48 | show_progress= False) 49 | 50 | text_docs= [x["text"].replace("-----", "") 51 | for x in output] 52 | return text_docs 53 | 54 | def extract_tables(self, file_path): 55 | # Instantiation of document, either an image or a PDF 56 | if Path(file_path).suffix==".pdf": 57 | doc = PDF(file_path) 58 | else: 59 | doc = Image(file_path) 60 | # Table extraction 61 | extracted_tables = doc.extract_tables(ocr=self.ocr, 62 | implicit_rows=True, 63 | implicit_columns=True, 64 | borderless_tables=True) 65 | 66 | margin= 20 67 | save_dir= Path(self.assets_dir) 68 | file_stem= Path(file_path).stem 69 | 70 | for p, (image, tables) in enumerate( 71 | zip(doc._images, 72 | extracted_tables.values())): 73 | for i, t in enumerate(tables): 74 | table_image= image[t.bbox.y1-margin:t.bbox.y2+margin, 75 | t.bbox.x1-margin:t.bbox.x2+margin] 76 | cv2.imwrite(save_dir.joinpath(f"{file_stem}_{p}_table{i}.png"), table_image) 77 | 78 | 79 | def extract_images(self, filepath): 80 | """ 81 | Extracts images from the provided files and saves them to the specified directory. 82 | 83 | Args: 84 | files_to_process (List[str]): List of file paths to extract images from. 85 | save_dir (str): Directory to save the extracted images. 86 | 87 | Returns: 88 | None 89 | """ 90 | # for filepath in files_to_process: 91 | # Open the document using PyMuPDF 92 | doc = fitz.open(filepath) 93 | save_dir= Path(self.assets_dir) 94 | 95 | for p in range(len(doc)): 96 | page = doc[p] 97 | 98 | # Iterate through images on the page 99 | for i, img in enumerate(page.get_images(), start=1): 100 | xref = img[0] # Image reference ID 101 | 102 | # Extract image bytes 103 | base_image = doc.extract_image(xref) 104 | image_bytes = base_image["image"] 105 | 106 | # Create a PIL Image object from the bytes 107 | pil_image = Image.open(io.BytesIO(image_bytes)) 108 | 109 | # Save the image with a structured name 110 | image_name = f"{save_dir.joinpath(Path(filepath).stem)}_{p}_image{i}.png" 111 | pil_image.save(image_name) -------------------------------------------------------------------------------- /src/doc_qa.py: -------------------------------------------------------------------------------- 1 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder 2 | from langchain import hub 3 | from langchain_google_genai import ChatGoogleGenerativeAI 4 | from langchain.chains.combine_documents import create_stuff_documents_chain 5 | from langchain.chains import create_history_aware_retriever, create_retrieval_chain 6 | from langchain_core.tools import Tool 7 | from langchain_community.tools.tavily_search import TavilySearchResults 8 | from langchain.agents import AgentExecutor, create_react_agent 9 | from langchain_core.messages import AIMessage, HumanMessage 10 | 11 | from langchain_chroma import Chroma 12 | from langchain_google_genai import GoogleGenerativeAIEmbeddings 13 | import uuid6 14 | 15 | class indexing: 16 | def __init__(self): 17 | self.embedding_function= GoogleGenerativeAIEmbeddings(model="models/text-embedding-004") 18 | pass 19 | 20 | def index_documents(self, documents, 21 | collection_name="Agentic_retrieval", 22 | top_k= 3): 23 | vector_store = Chroma( 24 | collection_name= collection_name, 25 | embedding_function=self.embedding_function) 26 | 27 | vector_store.add_documents( 28 | documents=documents, 29 | ids=[str(uuid6.uuid6()) for _ in documents]) 30 | 31 | retriever = vector_store.as_retriever( 32 | search_type="similarity", 33 | search_kwargs={"k": top_k},) 34 | 35 | return retriever 36 | 37 | class QA: 38 | def __init__(self, retriever) -> None: 39 | self.system_template = """ 40 | Answer the user's questions based on the below context. 41 | If the context doesn't contain any relevant information to the question, don't make something up and just say "I don't know": 42 | 43 | 44 | {context} 45 | 46 | """ 47 | 48 | self.question_answering_prompt = ChatPromptTemplate.from_messages( 49 | [("system", self.system_template), 50 | MessagesPlaceholder(variable_name="messages"),] 51 | ) 52 | self.retriever= retriever 53 | self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") 54 | 55 | self.qa_chain = create_stuff_documents_chain(self.llm, 56 | self.question_answering_prompt 57 | ) 58 | 59 | def query(self): 60 | while True: 61 | query = input("You: ") 62 | if query.lower() == "exit": 63 | break 64 | docs = self.retriever.invoke(query) 65 | 66 | response = self.qa_chain.invoke( 67 | {"context": docs, 68 | "messages": [HumanMessage(content=query)] 69 | } 70 | ) 71 | print(f"AI: {response}") 72 | 73 | 74 | class AgenticQA: 75 | def __init__(self) -> None: 76 | self.contextualize_q_system_prompt = ( 77 | "Given a chat history and the latest user question " 78 | "which might reference context in the chat history, " 79 | "formulate a standalone question which can be understood " 80 | "without the chat history. Do NOT answer the question, just " 81 | "reformulate it if needed and otherwise return it as is." 82 | ) 83 | self.chat_history = [] 84 | 85 | self.contextualize_q_prompt = ChatPromptTemplate.from_messages( 86 | [ 87 | ("system", self.contextualize_q_system_prompt), 88 | MessagesPlaceholder("chat_history"), 89 | ("human", "{input}"), 90 | ] 91 | ) 92 | 93 | self.qa_system_prompt = ( 94 | "You are an assistant for question-answering tasks. Use " 95 | "the following pieces of retrieved context to answer the " 96 | "question." 97 | "\n\n" 98 | "{context}" 99 | ) 100 | self.qa_prompt = ChatPromptTemplate.from_messages( 101 | [ 102 | ("system", self.qa_system_prompt), 103 | MessagesPlaceholder("chat_history"), 104 | ("human", "{input}"), 105 | ] 106 | ) 107 | self.react_docstore_prompt = hub.pull("aallali/react_tool_priority") 108 | self.llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash") 109 | 110 | def create_rag_chain(self, retriever): 111 | history_aware_retriever = create_history_aware_retriever( 112 | self.llm, retriever, self.contextualize_q_prompt 113 | ) 114 | question_answer_chain = create_stuff_documents_chain(self.llm, 115 | self.qa_prompt) 116 | 117 | self.rag_chain = create_retrieval_chain( 118 | history_aware_retriever, question_answer_chain) 119 | 120 | def create_rag_agent(self): 121 | self.agent = create_react_agent( 122 | llm=self.llm, 123 | tools=self.tools, 124 | prompt=self.react_docstore_prompt) 125 | 126 | def execute_rag_agent(self): 127 | self.agent_executor = AgentExecutor.from_agent_and_tools( 128 | agent=self.agent, 129 | tools=self.tools, 130 | handle_parsing_errors=True, 131 | verbose=True,) 132 | 133 | def run(self, retriever): 134 | self.create_rag_chain(retriever) 135 | 136 | self.tools = [ 137 | Tool( 138 | name="Answer Question", 139 | func=lambda query, **kwargs: self.rag_chain.invoke({ 140 | "input": query, 141 | "chat_history": kwargs.get("chat_history", []) 142 | }), 143 | description=( 144 | "A chat assistant tool designed to provide answers based on document knowledge. " 145 | "Maintains the context of previous questions and answers for continuity."), 146 | ), 147 | TavilySearchResults(max_results=2)] 148 | 149 | self.create_rag_agent() 150 | self.execute_rag_agent() 151 | 152 | def query(self): 153 | while True: 154 | query = input("You: ") 155 | if query.lower() == "exit": 156 | break 157 | response = self.agent_executor.invoke( 158 | {"input": query, 159 | "chat_history": self.chat_history}) 160 | print(f"AI: {response['output']}") 161 | 162 | # Update history 163 | self.chat_history.append(HumanMessage(content=query)) 164 | self.chat_history.append(AIMessage(content=response["output"])) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG-based Multimodal Agent 2 | 3 | A sophisticated Retrieval-Augmented Generation (RAG) system that combines advanced document processing with intelligent retrieval mechanisms to deliver accurate, context-aware responses from multimodal data sources. 4 | 5 | ## Table of Contents 6 | 7 | - [Overview](#overview) 8 | - [Key Features](#key-features) 9 | - [Installation](#installation) 10 | - [Usage](#usage) 11 | - [API Reference](#api-reference) 12 | - [Contributing](#contributing) 13 | - [License](#license) 14 | 15 | ## Overview 16 | 17 | This project implements an advanced RAG framework that enhances traditional retrieval-augmented generation by introducing **agentic approaches** to both document chunking and information retrieval. Unlike conventional RAG systems that rely on static processing methods, this solution dynamically adapts to complex, multimodal data through intelligent decision-making processes. 18 | 19 | ### Key Innovations 20 | 21 | - **Agentic Chunking**: Simulates human-like document segmentation for optimal content organization 22 | - **Multimodal Processing**: Handles text, images, and tables with specialized extraction and summarization 23 | - **Intelligent Retrieval**: Implements ReAct-based reasoning for contextually appropriate information retrieval 24 | - **Dynamic Adaptation**: Automatically adjusts processing strategies based on content complexity 25 | 26 | ## Key Features 27 | 28 | ### 📄 Advanced Document Processing 29 | 30 | - **Semantic Chunking**: Splits documents into semantically coherent, meaningful segments 31 | - **Agentic Chunking**: Employs iterative, context-aware segmentation that mimics human judgment 32 | - Groups sentences based on topic and context 33 | - Processes documents iteratively from start to finish 34 | - Optimizes chunk boundaries for maximum semantic coherence 35 | - [Learn more about Agentic Chunking](https://gleen.ai/blog/agentic-chunking-enhancing-rag-answers-for-completeness-and-accuracy/) 36 | 37 | ### 🖼️ Multimodal Content Handling 38 | 39 | - **Image Detection & Analysis**: Automatically identifies and processes images using PyMuPDF 40 | - **Table Extraction**: Detects and extracts tabular data using img2table 41 | - **Content Summarization**: Generates text descriptions of images and tables using Gemini-1.5-Flash 42 | - **Unified Embedding**: Creates embeddings for all content types using text-embedding-004 43 | 44 | ### 🔍 Intelligent Retrieval Systems 45 | 46 | #### Semantic Retrieval 47 | - Embedding-based similarity search 48 | - Vector database integration 49 | - Context-aware matching 50 | 51 | #### Agentic Retrieval (ReAct Process) 52 | 1. **Query Rephrasing**: Enhances queries based on chat history context 53 | 2. **Semantic Retrieval**: Performs initial information gathering 54 | 3. **Relevance Assessment**: Evaluates retrieved content for query relevance and completeness 55 | 4. **Dynamic Response**: Generates answers using retrieved documents or web search as needed 56 | 57 | ## Installation 58 | 59 | ### Prerequisites 60 | 61 | - Python 3.8 or higher 62 | - Git 63 | 64 | ### Setup Instructions 65 | 66 | 1. **Clone the repository** 67 | ```bash 68 | git clone https://github.com/santiago9631/RAG-based-multimodal-agent.git 69 | cd RAG-based-multimodal-agent 70 | ``` 71 | 72 | 2. **Install dependencies** 73 | ```bash 74 | pip install -r requirements.txt 75 | ``` 76 | 77 | 3. **Configure environment variables** 78 | 79 | Create a `.env` file in the project root with the following variables: 80 | ```bash 81 | LLAMA_CLOUD_API_KEY=your_llamacloud_api_key_here 82 | GOOGLE_API_KEY=your_google_api_key_here 83 | TAVILY_API_KEY=your_tavily_api_key_here 84 | ``` 85 | 86 | ## Usage 87 | 88 | ### Document Processing 89 | 90 | Process your documents using the following command: 91 | 92 | ```bash 93 | python main.py --InputPath --parser_name --chunking_strategy --retrieval_strategy 94 | ``` 95 | 96 | #### Parameters 97 | 98 | | Parameter | Description | Options | 99 | |-----------|-------------|---------| 100 | | `--InputPath` | Path to directory containing files or single file path | Any valid file/directory path | 101 | | `--parser_name` | Document parser to use | `LlamaParse`, `pymupdf4llm` | 102 | | `--chunking_strategy` | Document chunking approach | `semantic`, `agentic` | 103 | | `--retrieval_strategy` | Information retrieval method | `semantic`, `agentic` | 104 | 105 | #### Example Usage 106 | 107 | ```bash 108 | # Process a single PDF with agentic chunking and retrieval 109 | python main.py --InputPath document.pdf --parser_name LlamaParse --chunking_strategy agentic --retrieval_strategy agentic 110 | 111 | # Process multiple PDFs in a directory 112 | python main.py --InputPath ./documents/ --parser_name pymupdf4llm --chunking_strategy semantic --retrieval_strategy semantic 113 | ``` 114 | 115 | ### Interactive Querying 116 | 117 | After processing documents, the system provides an interactive terminal interface for querying: 118 | 119 | 1. Enter your questions in the terminal 120 | 2. Receive contextually relevant answers based on processed content 121 | 3. Ask follow-up questions for deeper exploration 122 | 123 | **Note**: Currently supports PDF files only. All PDF files in the input directory will be processed automatically. 124 | 125 | ## API Reference 126 | 127 | ### Supported File Formats 128 | - **PDF**: Full support with text, image, and table extraction 129 | 130 | ### Parser Options 131 | - **LlamaParse**: Advanced PDF parsing with enhanced structure recognition 132 | - **pymupdf4llm**: Lightweight PDF processing optimized for LLM integration 133 | 134 | ### Chunking Strategies 135 | - **Semantic**: Traditional semantic boundary detection 136 | - **Agentic**: Human-like iterative segmentation process 137 | 138 | ### Retrieval Strategies 139 | - **Semantic**: Vector similarity-based retrieval 140 | - **Agentic**: ReAct-based intelligent retrieval with dynamic adaptation 141 | 142 | ## Contributing 143 | 144 | We welcome contributions! Please feel free to submit issues, feature requests, or pull requests. 145 | 146 | ## Workflow Diagram 147 | 148 | The following diagram illustrates the complete workflow of the RAG-based multimodal agent: 149 | 150 | ![RAG-based Multimodal Agent Workflow](./diagram.png) 151 | 152 | ## License 153 | 154 | This project is licensed under the MIT License - see the [LICENSE](./LICENSE) file for details. 155 | 156 | -------------------------------------------------------------------------------- /src/chunkers.py: -------------------------------------------------------------------------------- 1 | 2 | from langchain_core.prompts import ChatPromptTemplate 3 | from typing import List 4 | from langchain_core.pydantic_v1 import BaseModel, Field 5 | from langchain import hub 6 | import time, logging, uuid6 7 | from langchain_core.documents import Document 8 | from dotenv import find_dotenv, load_dotenv 9 | from langchain_google_genai import ( 10 | GoogleGenerativeAIEmbeddings, 11 | ChatGoogleGenerativeAI) 12 | from langchain_experimental.text_splitter import SemanticChunker 13 | 14 | 15 | load_dotenv(find_dotenv()) 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | class Chunker: 21 | def __init__(self, strategy): 22 | self.semantic_chunker= SemanticChunker_langchain() 23 | self.agentic_chunker= AgenticChunker() 24 | self.strategy_chunker_map= { 25 | "semantic": self.semantic_chunker, 26 | "agentic": self.agentic_chunker 27 | } 28 | self.chunker= self.strategy_chunker_map[strategy] 29 | 30 | def build_chunks(self, texts, source): 31 | return self.chunker.build_chunks(texts, source) 32 | 33 | class SemanticChunker_langchain: 34 | #https://python.langchain.com/v0.2/docs/how_to/semantic-chunker/ 35 | def __init__(self): 36 | self.embed_model_name= "models/text-embedding-004" 37 | 38 | def build_chunks(self, texts, source): 39 | text_splitter = SemanticChunker( 40 | GoogleGenerativeAIEmbeddings( 41 | model=self.embed_model_name)) 42 | 43 | chunks= text_splitter.create_documents( 44 | texts=texts, 45 | metadatas= [{"source": source}]*len(texts) 46 | ) 47 | return chunks 48 | 49 | class ChunkMeta(BaseModel): 50 | title: str = Field(description="The title of the chunk.") 51 | summary: str = Field(description="The summary of the chunk.") 52 | 53 | class ChunkID(BaseModel): 54 | chunk_id: int = Field(description="The chunk id.") 55 | 56 | class Sentences(BaseModel): 57 | sentences: List[str] 58 | 59 | class AgenticChunker: 60 | def __init__(self): 61 | """ 62 | Initializes the AgenticChunker with: 63 | - An empty dictionary for storing chunks. 64 | - A large language model (LLM) for processing and summarizing text. 65 | - A placeholder for raw text input. 66 | """ 67 | self.chunks = {} 68 | self.llm = ChatGoogleGenerativeAI( 69 | model="gemini-1.5-flash", 70 | temperature=0 71 | ) 72 | # self.raw_text = "" 73 | 74 | @staticmethod 75 | def retry_with_delay(func, *args, delay=2, retries=30, **kwargs): 76 | """ 77 | Helper method to retry a function call with a delay. 78 | """ 79 | for attempt in range(retries): 80 | try: 81 | return func(*args, **kwargs) 82 | except Exception as e: 83 | logger.warning(f"Attempt {attempt + 1} failed: {e}. Retrying...") 84 | time.sleep(delay) 85 | raise RuntimeError("Exceeded maximum retries.") 86 | 87 | def extract_propositions_list(self, raw_text): 88 | """ 89 | Extracts a list of propositions from the raw text using an LLM. 90 | """ 91 | logger.info("Extracting propositions from raw text.") 92 | extraction_llm = self.llm.with_structured_output(Sentences) 93 | obj = hub.pull("wfh/proposal-indexing") 94 | extraction_chain = obj | extraction_llm 95 | self.propositions_list = self.retry_with_delay(extraction_chain.invoke, raw_text).sentences 96 | 97 | def build_chunks(self, raw_text, source=""): 98 | """ 99 | Processes the list of propositions and organizes them into chunks. 100 | """ 101 | chunks_as_documents=[] 102 | logger.info("Building chunks from propositions.") 103 | self.extract_propositions_list(raw_text) 104 | for proposition in self.propositions_list: 105 | self.find_chunk_and_push_proposition(proposition) 106 | 107 | for chunk_id in self.chunks: 108 | chunk_content= " ".join(self.chunks[chunk_id]["propositions"]) 109 | chunks_as_documents.append(Document( 110 | page_content=chunk_content, 111 | metadata={"source": f"{source}_{chunk_id}"}, 112 | id= str(uuid6.uuid6()), 113 | )) 114 | 115 | return chunks_as_documents 116 | 117 | def create_prompt_template(self, messages): 118 | """ 119 | Helper method to create prompt templates. 120 | """ 121 | return ChatPromptTemplate.from_messages(messages) 122 | 123 | def upsert_chunk(self, chunk_id, propositions): 124 | """ 125 | Creates or updates a chunk with the given propositions. 126 | """ 127 | summary_llm = self.llm.with_structured_output(ChunkMeta) 128 | prompt = self.create_prompt_template([ 129 | ("system", "Generate a new or updated summary and title based on the propositions."), 130 | ("user", "propositions:{propositions}") 131 | ]) 132 | summary_chain = prompt | summary_llm 133 | 134 | chunk_meta = self.retry_with_delay(summary_chain.invoke, {"propositions": propositions}) 135 | self.chunks[chunk_id] = { 136 | "summary": chunk_meta.summary, 137 | "title": chunk_meta.title, 138 | "propositions": propositions 139 | } 140 | 141 | def find_chunk_and_push_proposition(self, proposition): 142 | """ 143 | Finds the most relevant chunk for a proposition or creates a new one if none match. 144 | """ 145 | logger.info(f"Finding chunk for proposition: {proposition}") 146 | allocation_llm = self.llm.with_structured_output(ChunkID) 147 | allocation_prompt = self.create_prompt_template([ 148 | ("system", "Using the chunk IDs and summaries, determine the best chunk for the proposition. " 149 | "If no chunk matches, generate a new chunk ID. Return only the chunk ID."), 150 | ("user", "proposition:{proposition}\nchunks_summaries:{chunks_summaries}") 151 | ]) 152 | allocation_chain = allocation_prompt | allocation_llm 153 | 154 | chunks_summaries = { 155 | chunk_id: chunk["summary"] for chunk_id, chunk in self.chunks.items() 156 | } 157 | 158 | best_chunk_id = self.retry_with_delay( 159 | allocation_chain.invoke, { 160 | "proposition": proposition, 161 | "chunks_summaries": chunks_summaries 162 | } 163 | ).chunk_id 164 | 165 | if best_chunk_id not in self.chunks: 166 | logger.info(f"Creating new chunk for proposition: {proposition}") 167 | self.upsert_chunk(best_chunk_id, [proposition]) 168 | else: 169 | logger.info(f"Adding proposition to existing chunk ID: {best_chunk_id}") 170 | current_propositions = self.chunks[best_chunk_id]["propositions"] 171 | self.upsert_chunk(best_chunk_id, current_propositions + [proposition]) 172 | --------------------------------------------------------------------------------