├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .idea └── .gitignore ├── LICENSE ├── README.md ├── examples ├── data │ ├── pymupdf_output.json │ ├── test.pdf │ ├── test_chunks.json │ ├── test_ocr_doc.pdf │ ├── test_ocr_doc.png │ └── test_ocr_doc_chunks.json ├── ocr_example.py └── rag_example.py ├── poetry.lock ├── pyproject.toml ├── smart_llm_loader ├── __init__.py ├── document_loader.py ├── llm.py ├── prompts.py ├── schema.py └── utils.py └── tests ├── conftest.py ├── test_document_loader.py ├── test_image_processor.py └── test_llm_processing.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.12' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install poetry 20 | - name: Build and publish 21 | env: 22 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }} 23 | run: | 24 | poetry build 25 | poetry publish -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | 30 | # IDE 31 | .idea/ 32 | .vscode/ 33 | *.swp 34 | *.swo 35 | 36 | # Distribution 37 | dist/ 38 | build/ 39 | 40 | # Misc 41 | .DS_Store 42 | .coverage 43 | htmlcov/ 44 | .pytest_cache/ -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 David Emmanuel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SmartLLMLoader
2 | 
3 | smart-llm-loader is a lightweight yet powerful Python package that transforms any document into LLM-ready chunks. It handles the entire document processing pipeline:
4 | 
5 | - 📄 Converts documents to clean markdown
6 | - 🔍 Built-in OCR for scanned documents and images
7 | - ✂️ Smart, context-aware text chunking
8 | - 🔌 Seamless integration with LangChain and LlamaIndex
9 | - 📦 Ready for vector stores and LLM ingestion
10 | 
11 | Spend less time on preprocessing headaches and more time building what matters. From RAG systems to chatbots to document Q&A,
12 | SmartLLMLoader handles the heavy lifting so you can focus on creating exceptional AI applications.
13 | 
14 | SmartLLMLoader's chunking approach has been benchmarked against traditional methods, showing superior performance, particularly when paired with Google's Gemini Flash model. This combination offers an efficient and cost-effective solution for document chunking in RAG systems. View the detailed performance comparison [here](https://www.sergey.fyi/articles/gemini-flash-2).
15 | 
16 | 
17 | ## Features
18 | 
19 | - Support for multiple LLM providers
20 | - Built-in OCR for scanned documents and images
21 | - Flexible document type support
22 | - Supports different chunking strategies, such as context-aware chunking and page-based chunking
23 | - Supports custom prompts and custom chunking
24 | 
25 | ## Installation
26 | 
27 | ### System Dependencies
28 | 
29 | First, install Poppler if you don't have it already (required for PDF processing):
30 | 
31 | **Ubuntu/Debian:**
32 | ```bash
33 | sudo apt-get install poppler-utils
34 | ```
35 | 
36 | **macOS:**
37 | ```bash
38 | brew install poppler
39 | ```
40 | 
41 | **Windows:**
42 | 1. Download the latest [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows/releases/)
43 | 2. Extract the downloaded file
44 | 3. Add the `bin` directory to your system PATH
45 | 
46 | ### Package Installation
47 | 
48 | You can install SmartLLMLoader using pip:
49 | 
50 | ```bash
51 | pip install smart-llm-loader
52 | ```
53 | 
54 | Or using Poetry:
55 | 
56 | ```bash
57 | poetry add smart-llm-loader
58 | ```
59 | 
60 | ## Quick Start
61 | The smart-llm-loader package uses litellm to call the LLM, so any arguments supported by litellm can be passed through. You can find the litellm documentation [here](https://docs.litellm.ai/docs/providers).
62 | You can use any multi-modal model supported by litellm.
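For instance, any completion argument that litellm accepts (such as `temperature` or `max_tokens`) can be forwarded through the loader's keyword arguments. Here is a minimal sketch based on the constructor's documented `**kwargs` pass-through (the file name is a placeholder, and `GEMINI_API_KEY` is assumed to be set in the environment):

```python
from smart_llm_loader import SmartLLMLoader

# Keyword arguments not consumed by the loader itself are passed
# straight through to litellm.completion.
loader = SmartLLMLoader(
    file_path="your_document.pdf",  # placeholder path
    model="gemini/gemini-2.0-flash",
    temperature=0,  # forwarded to litellm.completion
)
documents = loader.load_and_split()
```

The full Quick Start below walks through provider setup and chunking options: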
63 | 
64 | ```python
65 | import os
66 | from smart_llm_loader import SmartLLMLoader
67 | 
68 | # Using Gemini Flash model
69 | os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY"
70 | model = "gemini/gemini-1.5-flash"
71 | 
72 | # Using an OpenAI model
73 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
74 | model = "openai/gpt-4o"
75 | 
76 | # Using an Anthropic model
77 | os.environ["ANTHROPIC_API_KEY"] = "YOUR_ANTHROPIC_API_KEY"
78 | model = "anthropic/claude-3-5-sonnet"
79 | 
80 | 
81 | # Initialize the document loader
82 | loader = SmartLLMLoader(
83 |     file_path="your_document.pdf",
84 |     chunk_strategy="contextual",
85 |     model=model,
86 | )
87 | # Load and split the document into chunks
88 | documents = loader.load_and_split()
89 | ```
90 | 
91 | ## Parameters
92 | 
93 | ```python
94 | class SmartLLMLoader(BaseLoader):
95 |     """A flexible document loader that supports multiple input types."""
96 | 
97 |     def __init__(
98 |         self,
99 |         file_path: Optional[Union[str, Path]] = None,  # path to the document to load
100 |         url: Optional[str] = None,  # URL of the document to load
101 |         chunk_strategy: str = 'contextual',  # chunking strategy to use (page, contextual, custom)
102 |         custom_prompt: Optional[str] = None,  # custom prompt to use
103 |         model: str = "gemini/gemini-2.0-flash",  # LLM model to use
104 |         save_output: bool = False,  # whether to save the output to a file
105 |         output_dir: Optional[Union[str, Path]] = None,  # directory to save the output to
106 |         api_key: Optional[str] = None,  # API key to use
107 |         **kwargs,
108 |     ):
109 | ```
110 | 
111 | ## Comparison with Traditional Methods
112 | 
113 | Let's see SmartLLMLoader in action! We'll compare it with PyMuPDF (a popular traditional document loader) to demonstrate why SmartLLMLoader's intelligent chunking makes such a difference in real-world applications.
114 | 
115 | ### The Challenge: Processing an Invoice
116 | We'll process this sample invoice that includes headers, tables, and complex formatting:
117 | 
118 | ![Sample Invoice Document](https://raw.githubusercontent.com/AskYourPdf/llm-loader/refs/heads/master/examples/data/test_ocr_doc.png?height=200)
119 | 
120 | ### Head-to-Head Comparison
121 | 
122 | #### 1. SmartLLMLoader Output
123 | SmartLLMLoader intelligently breaks down the document into semantic chunks, preserving structure and meaning (note that the JSON output below has been formatted for readability):
124 | 
125 | ```json
126 | [
127 |   {
128 |     "content": "Invoice no: 27301261\nDate of issue: 10/09/2012",
129 |     "metadata": {
130 |       "page": 0,
131 |       "semantic_theme": "invoice_header",
132 |       "source": "data/test_ocr_doc.pdf"
133 |     }
134 |   },
135 |   {
136 |     "content": "Seller:\nWilliams LLC\n72074 Taylor Plains Suite 342\nWest Alexandria, AR 97978\nTax Id: 922-88-2832\nIBAN: GB70FTNR64199348221780",
137 |     "metadata": {
138 |       "page": 0,
139 |       "semantic_theme": "seller_information",
140 |       "source": "data/test_ocr_doc.pdf"
141 |     }
142 |   },
143 |   {
144 |     "content": "Client:\nHernandez-Anderson\n084 Carter Lane Apt. 846\nSouth Ronaldbury, AZ 91030\nTax Id: 959-74-5868",
145 |     "metadata": {
146 |       "page": 0,
147 |       "semantic_theme": "client_information",
148 |       "source": "data/test_ocr_doc.pdf"
149 |     }
150 |   },
151 |   {
152 |     "content":
153 |       "Item table:\n"
154 |       "| No. 
| Description | Qty | UM | Net price | Net worth | VAT [%] | Gross worth |\n" 155 | "|-----|-----------------------------------------------------------|------|------|-----------|-----------|---------|-------------|\n" 156 | "| 1 | Lilly Pulitzer dress Size 2 | 5.00 | each | 45.00 | 225.00 | 10% | 247.50 |\n" 157 | "| 2 | New ERIN Erin Fertherston Straight Dress White Sequence Lining Sleeveless SZ 10 | 1.00 | each | 59.99 | 59.99 | 10% | 65.99 |\n" 158 | "| 3 | Sequence dress Size Small | 3.00 | each | 35.00 | 105.00 | 10% | 115.50 |\n" 159 | "| 4 | fire los angeles dress Medium | 3.00 | each | 6.50 | 19.50 | 10% | 21.45 |\n" 160 | "| 5 | Eileen Fisher Women's Long Sleeve Fleece Lined Front Pockets Dress XS Gray | 3.00 | each | 15.99 | 47.97 | 10% | 52.77 |\n" 161 | "| 6 | Lularoe Nicole Dress Size Small Light Solid Grey/White Ringer Tee Trim | 2.00 | each | 3.75 | 7.50 | 10% | 8.25 |\n" 162 | "| 7 | J.Crew Collection Black & White sweater Dress sz S | 1.00 | each | 30.00 | 30.00 | 10% | 33.00 |", 163 | "metadata": { 164 | "page": 0, 165 | "semantic_theme": "items_table", 166 | "source": "data/test_ocr_doc.pdf" 167 | } 168 | }, 169 | { 170 | "content": "Summary table:\n" 171 | "| VAT [%] | Net worth | VAT | Gross worth |\n" 172 | "|---------|-----------|--------|-------------|\n" 173 | "| 10% | 494,96 | 49,50 | 544,46 |\n" 174 | "| Total | $ 494,96 | $ 49,50| $ 544,46 |", 175 | "metadata": { 176 | "page": 0, 177 | "semantic_theme": "summary_table", 178 | "source": "data/test_ocr_doc.pdf" 179 | } 180 | } 181 | ] 182 | ``` 183 | 184 | **Key Benefits:** 185 | - ✨ Clean, structured chunks 186 | - 🎯 Semantic understanding 187 | - 📊 Preserved table formatting 188 | - 🏷️ Intelligent metadata tagging 189 | 190 | #### 2. Traditional PyMuPDF Output 191 | PyMuPDF provides a basic text extraction without semantic understanding: 192 | 193 | ```json 194 | [ 195 | { 196 | "page": 0, 197 | "content": "Invoice no: 27301261 \nDate of issue: \nSeller: \nWilliams LLC \n72074 Taylor Plains Suite 342 \nWest 198 | Alexandria, AR 97978 \nTax Id: 922-88-2832 \nIBAN: GB70FTNR64199348221780 \nITEMS \nNo. \nDescription \n2l \nLilly 199 | Pulitzer dress Size 2 \n2. \nNew ERIN Erin Fertherston \nStraight Dress White Sequence \nLining Sleeveless SZ 10 200 | \n3. \n Sequence dress Size Small \n4. \nfire los angeles dress Medium \nL \nEileen Fisher Women's Long \nSleeve 201 | Fleece Lined Front \nPockets Dress XS Gray \n6. \nLularoe Nicole Dress Size Small \nLight Solid Grey/ White 202 | Ringer \nTee Trim \nT \nJ.Crew Collection Black & White \nsweater Dress sz S \nSUMMARY \nTotal \n2,00 \n1,00 203 | \nVAT [%] \n10% \n10/09/2012 \neach \neach \nClient: \nHernandez-Anderson \n084 Carter Lane Apt. 
846 \nSouth 204 | Ronaldbury, AZ 91030 \nTax Id: 959-74-5868 \nNet price \n Net worth \nVAT [%] \n45,00 \n225,00 \n10% \n59,99 205 | \n59,99 \n10% \n35,00 \n105,00 \n10% \n6,50 \n19,50 \n10% \n15,99 \n47,97 \n10% \n3,75 \n7.50 \n10% \n30,00 206 | \n30,00 \n10% \nNet worth \nVAT \n494,96 \n49,50 \n$ 494,96 \n$49,50 \nGross \nworth \n247,50 \n65,99 \n115,50 207 | \n21,45 \n52,77 \n8,25 \n33,00 \nGross worth \n544,46 \n$ 544,46 \n", 208 | "metadata": { 209 | "source": "./data/test_ocr_doc.pdf", 210 | "file_path": "./data/test_ocr_doc.pdf", 211 | "page": 0, 212 | "total_pages": 1, 213 | "format": "PDF 1.5", 214 | "title": "", 215 | "author": "", 216 | "subject": "", 217 | "keywords": "", 218 | "creator": "", 219 | "producer": "AskYourPDF.com", 220 | "creationDate": "", 221 | "modDate": "D:20250213152908Z", 222 | "trapped": "" 223 | } 224 | } 225 | ] 226 | ``` 227 | 228 | ### Real-World Impact: RAG Performance 229 | 230 | Let's see how this difference affects a real Question-Answering system: 231 | 232 | ```python 233 | question = "What is the total gross worth for item 1 and item 7?" 234 | 235 | # SmartLLMLoader Result ✅ 236 | "The total gross worth for item 1 (Lilly Pulitzer dress) is $247.50 and for item 7 237 | (J.Crew Collection sweater dress) is $33.00. 238 | Total: $280.50" 239 | 240 | # PyMuPDF Result ❌ 241 | "The total gross worth for item 1 is $45.00, and for item 7 it is $33.00. 242 | Total: $78.00" 243 | ``` 244 | 245 | **Why SmartLLMLoader Won:** 246 | - 🎯 Maintained table structure 247 | - 💡 Preserved relationships between data 248 | - 📊 Accurate calculations 249 | - 🤖 Better context for the LLM 250 | 251 | You can try it yourself by running the complete [RAG example](./examples/rag_example.py) to see the difference in action! 252 | 253 | ## License 254 | 255 | This project is licensed under the MIT License - see the LICENSE file for details. 256 | 257 | ## Contributing 258 | 259 | Contributions are welcome! Please feel free to submit a Pull Request. 260 | 261 | ## Authors 262 | 263 | - David Emmanuel ([@drmingler](https://github.com/drmingler)) -------------------------------------------------------------------------------- /examples/data/pymupdf_output.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "page": 0, 4 | "content": "Invoice no: 27301261 \nDate of issue: \nSeller: \nWilliams LLC \n72074 Taylor Plains Suite 342 \nWest Alexandria, AR 97978 \nTax Id: 922-88-2832 \nIBAN: GB70FTNR64199348221780 \nITEMS \nNo. \nDescription \n2l \nLilly Pulitzer dress Size 2 \n2. \nNew ERIN Erin Fertherston \nStraight Dress White Sequence \nLining Sleeveless SZ 10 \n3. \n Sequence dress Size Small \n4. \nfire los angeles dress Medium \nL \nEileen Fisher Women's Long \nSleeve Fleece Lined Front \nPockets Dress XS Gray \n6. \nLularoe Nicole Dress Size Small \nLight Solid Grey/ White Ringer \nTee Trim \nT \nJ.Crew Collection Black & White \nsweater Dress sz S \nSUMMARY \nTotal \n2,00 \n1,00 \nVAT [%] \n10% \n10/09/2012 \neach \neach \nClient: \nHernandez-Anderson \n084 Carter Lane Apt. 
846 \nSouth Ronaldbury, AZ 91030 \nTax Id: 959-74-5868 \nNet price \n Net worth \nVAT [%] \n45,00 \n225,00 \n10% \n59,99 \n59,99 \n10% \n35,00 \n105,00 \n10% \n6,50 \n19,50 \n10% \n15,99 \n47,97 \n10% \n3,75 \n7.50 \n10% \n30,00 \n30,00 \n10% \nNet worth \nVAT \n494,96 \n49,50 \n$ 494,96 \n$49,50 \nGross \nworth \n247,50 \n65,99 \n115,50 \n21,45 \n52,77 \n8,25 \n33,00 \nGross worth \n544,46 \n$ 544,46 \n", 5 | "metadata": { 6 | "source": "./data/test_ocr_doc.pdf", 7 | "file_path": "./data/test_ocr_doc.pdf", 8 | "page": 0, 9 | "total_pages": 1, 10 | "format": "PDF 1.5", 11 | "title": "", 12 | "author": "", 13 | "subject": "", 14 | "keywords": "", 15 | "creator": "", 16 | "producer": "AskYourPDF.com", 17 | "creationDate": "", 18 | "modDate": "D:20250213152908Z", 19 | "trapped": "" 20 | } 21 | } 22 | ] 23 | 24 | 25 | -------------------------------------------------------------------------------- /examples/data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/smart-llm-loader/8ccb7c66a7944180a201f03b8e5549156ae10d08/examples/data/test.pdf -------------------------------------------------------------------------------- /examples/data/test_chunks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "content": "Synchronizing Machine Learning Algorithms, Realtime Robotic Control and Simulated Environment with 080\nVincent Berenz, Felix Widmaier, Simon Guist, Bernhard Schölkopf and Dieter Büchler\nMax Planck Institute for Intelligent Systems, Tübingen, Germany\n(work presented at the Robot Software Architectures Workshop - RSA 2023, ICRA)", 4 | "metadata": { 5 | "page": 0, 6 | "semantic_theme": "title", 7 | "source": "data/test.pdf" 8 | } 9 | }, 10 | { 11 | "content": "Abstract—Robotic applications require the integration of various modalities, encompassing perception, control of real robots and possibly the control of simulated environments. While the state-of-the-art robotic software solutions such as ROS 2 provide most of the required features, flexible synchronization between algorithms, data streams and control loops can be tedious. 080 is a versatile C++ framework for robotics which provides a shared memory model and a command framework for real-time critical systems. It enables expert users to set up complex robotic systems and generate Python bindings for scientists. 080's unique feature is its flexible synchronization between processes, including the traditional blocking commands and the novel \"bursting mode\", which allows user code to control the execution of the lower process control loop. This makes it particularly useful for setups that mix real and simulated environments.", 12 | "metadata": { 13 | "page": 0, 14 | "semantic_theme": "abstract", 15 | "source": "data/test.pdf" 16 | } 17 | }, 18 | { 19 | "content": "I. INTRODUCTION\n080¹ is an open-source C++ toolbox that allows expert users to create custom Python API suitable for interacting with complex robotic setup [1]. 
080 provides functions for:\n• the spawning realtime processes (e.g., running on RT-Preempt, low latency kernel or Xenomai)\n• the synchronization of these processes\n• the asynchronous access to a shared memory hosting the history of all sensor data\n• the sending of custom commands, including blocking and not blocking commands (see section II-B)\n• the automated generation of customized Python bindings\nWhile 080 shares similarities with ROS (spawning of processes and C++/Python interoperability) and actionlib (management of commands) [2][3], it has differences to them: it relies on a shared memory model rather than a publish-subscribe model. But the core difference, and novelty of 080, is its flexibility regarding synchronization. When using 080, users may either synchronize their higher level code with the lower level control process via blocking commands (see section II-B). Alternatively, it is possible to synchronize the lower level control process to the higher level code via the new 'bursting mode'. In bursting mode, the low-level control process blocks until the user process sends a request to run one or more iterations. This unique feature is useful when interacting with a simulated robot or even, as shown in section III, an experimental setup involving a real robot and a simulated environment.\nHowever, as opposed to ROS, 080 does not support network communication, as it requires the processes it orchestrates to run on the same computer.\nThe 080 framework is a two-levels system. The first level involves the expert user, who is responsible for implementing the C++ driver classes that are specific to the hardware used in the experiment. During compilation, 080 utilizes these classes as templates to generate executables that create the real-time control processes. In addition to implementing the driver classes, the expert user is responsible for generating a Python API tailored to the needs of the users. 080 allows for automated generation of Python bindings. As a result, the users will have access to a simple Python interface. These users can focus on designing experiments without being burdened by the implementation details of the robotic setup.", 20 | "metadata": { 21 | "page": 0, 22 | "semantic_theme": "introduction", 23 | "source": "data/test.pdf" 24 | } 25 | }, 26 | { 27 | "content": "II. OVERVIEW\nA. Backend, frontend and shared memory\n080 is based on the interaction between:\n• a backend, i.e., a process which communicates in realtime with a hardware device. It is responsible for sending commands to the device and receiving observations from", 28 | "metadata": { 29 | "page": 0, 30 | "semantic_theme": "overview", 31 | "source": "data/test.pdf" 32 | } 33 | }, 34 | { 35 | "content": "An image of a robotic arm is shown in the top right of the page. The robotic arm is yellow and black with a gripper at the end. It is mounted on a black base and appears to be in a laboratory setting. A computer monitor is visible in the background, showing a graphical user interface that seems to be related to the robotic arm control.", 36 | "metadata": { 37 | "page": 0, 38 | "semantic_theme": "image_description", 39 | "source": "data/test.pdf" 40 | } 41 | }, 42 | { 43 | "content": "**A. Shared Memory**\n\nUpon spawning, a backend creates a dedicated shared memory. It uses this shared memory to 1) read user commands and 2) write sensor and/or state related data. 
A frontend provides methods for connecting to a related backend's shared memory for 1) writing user commands and 2) for reading data.", 44 | "metadata": { 45 | "page": 1, 46 | "semantic_theme": "System Architecture", 47 | "source": "data/test.pdf" 48 | } 49 | }, 50 | { 51 | "content": "**B. Realtime control and commands**\n\nStable control on realtime critical systems requires high-frequency software loops. 080 backend processes are developed in C++ to comply with real-time constraints, calculating the desired state for each actuator at each iteration. Frontends provide Python methods that send higher-level, non-realtime commands to the backend's shared memory. These commands specify implicit desired state trajectories that rely on interpolation (based on specified durations, speeds, or numbers of server iterations). For instance, if a user command specifies that a robot should reach a desired state over a specified duration, the backend will generate the higher-frequency low-level commands that interpolate from the robot's current state to the desired state. By translating the frontend's commands into low-level commands that operate the system at the required frequency, the backend ensures stable control of the real-time critical systems. The frontend's API is flexible and allows for queuing or interrupting commands, as well as issuing both blocking and non-blocking commands.", 52 | "metadata": { 53 | "page": 1, 54 | "semantic_theme": "System Architecture", 55 | "source": "data/test.pdf" 56 | } 57 | }, 58 | { 59 | "content": "**C. Reading observation**\n\nThe backend writes current actuator state and custom sensor information to shared memory at each control iteration, which can be retrieved using various methods provided by the frontend API. Users can request the latest information, information from past server iterations, or wait for future data using a blocking method (which can be used for synchronizing user processes with backend, see section II-E). Multiple instances of the frontend can read asynchronously from shared memory. For example, this enables users to run logging scripts for the robot's state in parallel with control scripts.", 60 | "metadata": { 61 | "page": 1, 62 | "semantic_theme": "System Architecture", 63 | "source": "data/test.pdf" 64 | } 65 | }, 66 | { 67 | "content": "**D. Embedding backends**\n\nIn addition to generating executables that spawn backend processes, 080's API also supports embedding instances of C++ or Python backends in other processes. This feature can be utilized to extend 080's functionality to simulations. Section III provides an example of o80 backends being used to control the movement of bodies in a Mujoco environment.", 68 | "metadata": { 69 | "page": 1, 70 | "semantic_theme": "System Architecture", 71 | "source": "data/test.pdf" 72 | } 73 | }, 74 | { 75 | "content": "**E. Synchronization and bursting mode**\n\n080 provides two synchronization modes: \"normal\" and \"bursting\". In normal mode, the backend process runs in real-time at its required frequency, while the user Python process can synchronize with it through the blocking waiting methods mentioned earlier. However, in bursting mode, the backend process blocks until the frontend requires it to run one or more iterations. Bursting mode is typically used when interacting with a simulator. 
The frontend allows users to create commands that require several backend iterations to execute, which can then be executed as fast as the simulator allows.", 76 | "metadata": { 77 | "page": 1, 78 | "semantic_theme": "System Architecture", 79 | "source": "data/test.pdf" 80 | } 81 | }, 82 | { 83 | "content": "**III. HYSR TRAINING OF TABLE TENNIS PLAYING ROBOT**\n\nA team of researchers from the Max Planck Institute for Intelligent Systems is exploring the potential of reinforcement learning for teaching a robotic arm, actuated by a pneumatic artificial muscle (PAM), to play table tennis [4]. The scientists are using a hybrid simulation and real training (HYSR) technique that involves mirroring the movements of a real robotic arm with a Mujoco simulated arm. This approach allows the real robot to interact with a single or multiple simulated balls that are being replayed from recorded ball trajectories, facilitating practical long-term learning of table tennis². Additionally, virtual environments can be adapted for data-efficient training by, for instance, playing with multiple virtual balls [5].", 84 | "metadata": { 85 | "page": 1, 86 | "semantic_theme": "Application", 87 | "source": "data/test.pdf" 88 | } 89 | }, 90 | { 91 | "content": "**To set up the experiment, the researchers required:**\n\n* A real-time control process that sends pressure commands to the PAM controllers of the real robot at a fixed frequency of 500Hz.\n* A Mujoco simulated robot that mirrors the movements of the real robot and replays recorded ball trajectories. Each iteration of the Mujoco simulator takes 0.02 seconds.\n* A GYM reinforcement learning environment with a step function running at 100Hz.\n* Control of other hardware for real ball experiments, including a Vicon system for localizing the robot and table, a ball launcher, and an RGB-based ball detection system.", 92 | "metadata": { 93 | "page": 1, 94 | "semantic_theme": "Application", 95 | "source": "data/test.pdf" 96 | } 97 | }, 98 | { 99 | "content": "080 allowed to solve all the synchronization issues related to this setup. A backend process runs at 500Hz and controls the real pneumatic muscles while reading related robot states. A backend instance, running in bursting mode, is embedded in the Mujoco simulated environment. Frontends, connected to both backends, are embedded in the learning environment, asynchronously sending pressure actions and reading states to/from the real robot, sending mirroring states to the simulated robot, and sending bursting commands to the Mujoco simulated environment.\n\nIn addition, 080 simplified the process of spawning new processes that create additional frontends, which can easily access the shared memory history to log data, visualize the robot state in real-time, and monitor both the simulated and real robot state.\n\nThe code and documentation of this project are available as open source online [6].", 100 | "metadata": { 101 | "page": 1, 102 | "semantic_theme": "Application", 103 | "source": "data/test.pdf" 104 | } 105 | }, 106 | { 107 | "content": "**IV. CONCLUSION**\n\n080 is a versatile middleware system that offers flexible control of robotic systems. It allows expert users to develop a user-friendly Python API that makes it easier for machine learning scientists to use complex robotic setups. 
Its", 108 | "metadata": { 109 | "page": 1, 110 | "semantic_theme": "Conclusion", 111 | "source": "data/test.pdf" 112 | } 113 | }, 114 | { 115 | "content": "shared memory model, different synchronization modes, and interpolation-based command framework distinguish it from ROS. For more information and code examples, we refer to 080's comprehensive documentation [7].", 116 | "metadata": { 117 | "page": 2, 118 | "semantic_theme": "system_description", 119 | "source": "data/test.pdf" 120 | } 121 | }, 122 | { 123 | "content": "REFERENCES\n[1] V. Berenz, M. Naveau, F. Widmaier, M. Wüthrich, J.-C. Passy, S. Guist, and D. Büchler, \"The 080 c++ templated toolbox: Designing customized python apis for synchronizing realtime processes,\" Journal of Open Source Software, vol. 6, no. 66, p. 2752, 2021. [Online]. Available: https://doi.org/10.21105/joss.02752\n[2] Stanford Artificial Intelligence Laboratory et al., \"Robotic operating system.\" [Online]. Available: https://www.ros.org\n[3] M. Carroll, J. Perron, E. Marder-Eppstein, V. Pradeep, and M. Arguedas, \"actionlib,\" 2009. [Online]. Available: http://wiki.ros.org/actionlib\n[4] D. Büchler, S. Guist, R. Calandra, V. Berenz, B. Schölkopf, and J. Peters, \"Learning to play table tennis from scratch using muscular robots,\" IEEE Transactions on Robotics (T-RO), vol. 38, no. 6, pp. 3850-3860, 2022.\n[5] S. Guist, J. Schneider, A. Dittrich, V. Berenz, B. Schölkopf, and D. Büchler, \"Hindsight states: Blending sim and real task elements for efficient reinforcement learning,\" arXiv preprint arXiv:2303.02234, 2023.\n[6] V. Berenz, F. Widmaier, S. Guist, and D. Büchler, \"PAM robot software documentation,\" 2020. [Online]. Available: https://intelligent-soft-robots.github.io/pam_documentation/\n[7] V. Berenz, S. Guist, and D. Büchler, \"080 robot software documentation,\" 2020. [Online]. Available: http://people.tuebingen.mpg.de/mpi-is-software/o80/docs/o80/index.html", 124 | "metadata": { 125 | "page": 2, 126 | "semantic_theme": "references", 127 | "source": "data/test.pdf" 128 | } 129 | } 130 | ] -------------------------------------------------------------------------------- /examples/data/test_ocr_doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/smart-llm-loader/8ccb7c66a7944180a201f03b8e5549156ae10d08/examples/data/test_ocr_doc.pdf -------------------------------------------------------------------------------- /examples/data/test_ocr_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/smart-llm-loader/8ccb7c66a7944180a201f03b8e5549156ae10d08/examples/data/test_ocr_doc.png -------------------------------------------------------------------------------- /examples/data/test_ocr_doc_chunks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "content": "Invoice no: 27301261\nDate of issue: 10/09/2012\nSeller: Williams LLC\n72074 Taylor Plains Suite 342\nWest Alexandria, AR 97978\nTax Id: 922-88-2832\nIBAN: GB70FTNR64199348221780", 4 | "metadata": { 5 | "page": 0, 6 | "semantic_theme": "invoice_header", 7 | "source": "data/test_ocr_doc.pdf" 8 | } 9 | }, 10 | { 11 | "content": "Client: Hernandez-Anderson\n084 Carter Lane Apt. 846\nSouth Ronaldbury, AZ 91030\nTax Id: 959-74-5868", 12 | "metadata": { 13 | "page": 0, 14 | "semantic_theme": "invoice_client", 15 | "source": "data/test_ocr_doc.pdf" 16 | } 17 | }, 18 | { 19 | "content": "| No. 
| Description | Qty | UM | Net price | Net worth | VAT [%] | Gross worth |\n|---|---|---|---|---|---|---|---|\n| 1. | Lilly Pulitzer dress Size 2 | 5.00 | each | 45.00 | 225,00 | 10% | 247,50 |\n| 2. | New ERIN Erin Fertherston Straight Dress White Sequence Lining Sleeveless SZ 10 | 1,00 | each | 59,99 | 59,99 | 10% | 65,99 |\n| 3. | Sequence dress Size Small | 3,00 | each | 35,00 | 105,00 | 10% | 115,50 |\n| 4. | fire los angeles dress Medium | 3.00 | each | 6,50 | 19,50 | 10% | 21,45 |\n| 5. | Eileen Fisher Women's Long Sleeve Fleece Lined Front Pockets Dress XS Gray | 3,00 | each | 15,99 | 47,97 | 10% | 52,77 |\n| 6. | Lularoe Nicole Dress Size Small Light Solid Grey/White Ringer Tee Trim | 2,00 | each | 3,75 | 7,50 | 10% | 8,25 |\n| 7. | J.Crew Collection Black & White sweater Dress sz S | 1,00 | each | 30,00 | 30,00 | 10% | 33,00 |", 20 | "metadata": { 21 | "page": 0, 22 | "semantic_theme": "invoice_items", 23 | "source": "data/test_ocr_doc.pdf" 24 | } 25 | }, 26 | { 27 | "content": "| VAT [%] | Net worth | VAT | Gross worth |\n|---|---|---|---| \n| 10% | 494,96 | 49,50 | 544,46 |\n| Total | $ 494,96 | $ 49,50 | $ 544,46 |", 28 | "metadata": { 29 | "page": 0, 30 | "semantic_theme": "invoice_summary", 31 | "source": "data/test_ocr_doc.pdf" 32 | } 33 | } 34 | ] -------------------------------------------------------------------------------- /examples/ocr_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of different document loaders (smart-llm-loader and PyMuPDF) for RAG applications. 3 | """ 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | from smart_llm_loader import SmartLLMLoader 8 | 9 | # Load environment variables 10 | load_dotenv() 11 | 12 | # OpenAI API key since we are using the gpt-4o-mini model for question-answering 13 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY" 14 | 15 | # Gemini API key since we are using the gemini flash model 16 | os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY" 17 | 18 | 19 | def process_with_llmloader(): 20 | """Process documents using SmartLLMLoader with Gemini Flash.""" 21 | 22 | # Initialize the loader from the smart-llm-loader package 23 | loader = SmartLLMLoader( 24 | file_path="./data/test_ocr_doc.pdf", 25 | chunk_strategy="contextual", 26 | model="gemini/gemini-1.5-flash", 27 | save_output=True, 28 | # output_dir="./data", 29 | ) 30 | 31 | docs = loader.load_and_split() 32 | return docs 33 | 34 | 35 | def process_with_pymupdf(): 36 | """Process documents using PyMuPDF loader.""" 37 | import json 38 | from langchain_community.document_loaders import PyMuPDFLoader 39 | 40 | # Initialize the PyMuPDF loader 41 | loader = PyMuPDFLoader("./data/test_ocr_doc.pdf") 42 | docs = loader.load() 43 | 44 | output_data = [] 45 | for doc in docs: 46 | output_data.append({ 47 | "page": doc.metadata["page"], 48 | "content": doc.page_content, 49 | "metadata": doc.metadata 50 | }) 51 | 52 | # Save as JSON 53 | output_path = "data/pymupdf_output.json" 54 | with open(output_path, "w", encoding="utf-8") as f: 55 | json.dump(output_data, f, indent=2, ensure_ascii=False) 56 | 57 | return docs 58 | 59 | 60 | def main(): 61 | results = process_with_llmloader() 62 | print(results) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /examples/rag_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of different document loaders 
(smart-llm-loader and PyMuPDF) for RAG applications. 3 | """ 4 | import os 5 | from dotenv import load_dotenv 6 | from langchain.text_splitter import RecursiveCharacterTextSplitter 7 | from langchain_community.document_loaders import PyMuPDFLoader 8 | from langchain_core.output_parsers import StrOutputParser 9 | from langchain_core.prompts import PromptTemplate 10 | from langchain_core.runnables import RunnablePassthrough 11 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings 12 | from langchain_community.vectorstores import FAISS 13 | 14 | from smart_llm_loader import SmartLLMLoader 15 | 16 | # Load environment variables 17 | load_dotenv() 18 | 19 | # OpenAI API key since we are using the gpt-4o-mini model for question-answering 20 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY" 21 | 22 | # Gemini API key since we are using the gemini flash model 23 | os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY" 24 | 25 | 26 | def create_rag_chain(retriever, llm): 27 | """Create a RAG chain with the given retriever and LLM.""" 28 | prompt_template = PromptTemplate.from_template( 29 | """ 30 | You are an assistant for question-answering tasks. 31 | Use the following pieces of retrieved context to answer the question. 32 | If you don't know the answer, just say that you don't know. 33 | Use three sentences maximum and keep the answer concise 34 | Question: {question} 35 | Context: {context} 36 | Answer:""" 37 | ) 38 | 39 | def format_docs(docs): 40 | return "\n\n".join(doc.page_content for doc in docs) 41 | 42 | return ( 43 | {"context": retriever | format_docs, "question": RunnablePassthrough()} 44 | | prompt_template 45 | | llm 46 | | StrOutputParser() 47 | ) 48 | 49 | 50 | def process_with_llmloader(): 51 | """Process documents using SmartLLMLoader with Gemini Flash.""" 52 | llm = ChatOpenAI(model="gpt-4o-mini") 53 | 54 | # Initialize the loader from the smart-llm-loader package 55 | loader = SmartLLMLoader( 56 | file_path="./data/test_ocr_doc.pdf", 57 | chunk_strategy="contextual", 58 | model="gemini/gemini-1.5-flash", 59 | ) 60 | 61 | docs = loader.load_and_split() 62 | vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings()) 63 | rag_chain = create_rag_chain(vectorstore.as_retriever(), llm) 64 | return rag_chain 65 | 66 | 67 | def process_with_pymupdf(): 68 | """Process documents using PyMuPDF with recursive chunking.""" 69 | llm = ChatOpenAI(model="gpt-4o-mini") 70 | 71 | # Load document with PyMuPDF 72 | loader = PyMuPDFLoader("./data/test_ocr_doc.pdf") 73 | documents = loader.load() 74 | 75 | # Create text splitter for recursive chunking 76 | text_splitter = RecursiveCharacterTextSplitter( 77 | chunk_size=1000, 78 | chunk_overlap=200, 79 | length_function=len, 80 | is_separator_regex=False, 81 | ) 82 | 83 | docs = text_splitter.split_documents(documents) 84 | vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings()) 85 | rag_chain = create_rag_chain(vectorstore.as_retriever(), llm) 86 | return rag_chain 87 | 88 | 89 | def main(): 90 | # Example using LLMLoader 91 | print("\n=== Using LLMLoader ===") 92 | llm_chain = process_with_llmloader() 93 | question = "What is the total gross worth for item 1 and item 7?" 
94 | answer = llm_chain.invoke(question) 95 | print(f"Question: {question}") 96 | print(f"Answer: {answer}") 97 | 98 | # Example using PyMuPDF 99 | print("\n=== Using PyMuPDF ===") 100 | pymupdf_chain = process_with_pymupdf() 101 | answer = pymupdf_chain.invoke(question) 102 | print(f"Question: {question}") 103 | print(f"Answer: {answer}") 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "smart-llm-loader" 3 | version = "0.1.1" 4 | description = "A powerful PDF processing toolkit that seamlessly integrates with LLMs for intelligent document chunking and RAG applications. Features smart context-aware segmentation, multi-LLM support, and optimized content extraction for enhanced RAG performance." 5 | authors = ["drmingler "] 6 | readme = "README.md" 7 | packages = [{include = "smart_llm_loader"}] 8 | license = "MIT" 9 | repository = "https://github.com/drmingler/smart-llm-loader" 10 | keywords = ["pdf", "llm", "rag", "document-processing", "ai"] 11 | classifiers = [ 12 | "Development Status :: 4 - Beta", 13 | "Intended Audience :: Developers", 14 | "License :: OSI Approved :: MIT License", 15 | "Programming Language :: Python :: 3", 16 | "Programming Language :: Python :: 3.12", 17 | "Topic :: Software Development :: Libraries :: Python Modules", 18 | "Topic :: Text Processing :: General" 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.9" 23 | langchain = "^0.1.0" 24 | langchain-community = "^0.0.10" 25 | langchain-core = "^0.1.10" 26 | requests = "^2.31.0" 27 | python-dotenv = "^1.0.0" 28 | pypdf = "^3.17.1" 29 | faiss-cpu = "^1.7.4" 30 | tiktoken = "^0.8.0" 31 | litellm = "^1.61.3" 32 | pdf2image = "^1.17.0" 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | pytest = "^7.4.0" 36 | pytest-asyncio = "^0.23.0" 37 | pytest-cov = "^4.1.0" 38 | pytest-mock = "^3.12.0" 39 | 40 | [build-system] 41 | requires = ["poetry-core"] 42 | build-backend = "poetry.core.masonry.api" 43 | 44 | [tool.pytest.ini_options] 45 | testpaths = ["tests"] 46 | python_files = ["test_*.py"] 47 | addopts = "-v --cov=smart_llm_loader --cov-report=term-missing" 48 | asyncio_mode = "auto" -------------------------------------------------------------------------------- /smart_llm_loader/__init__.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import platform 4 | from pathlib import Path 5 | 6 | def _check_poppler_installation(): 7 | """Check if poppler is installed and provide installation instructions if it's not.""" 8 | system = platform.system().lower() 9 | 10 | try: 11 | if system == "darwin": # macOS 12 | subprocess.run(["pdftoppm", "-v"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) 13 | elif system == "linux": 14 | subprocess.run(["pdftoppm", "-v"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) 15 | elif system == "windows": 16 | # On Windows, check if poppler is in PATH or in common installation locations 17 | poppler_path = None 18 | for path in sys.path: 19 | if Path(path).joinpath("poppler/bin").exists(): 20 | poppler_path = path 21 | break 22 | if not poppler_path: 23 | raise FileNotFoundError 24 | except (subprocess.SubprocessError, FileNotFoundError): 25 | instructions = { 26 | "darwin": "Install poppler using: brew install poppler", 27 | "linux": "Install poppler using: 
sudo apt-get install poppler-utils (Ubuntu/Debian) or sudo yum install poppler-utils (CentOS/RHEL)",
28 |         "windows": "Download and install poppler from: https://github.com/oschwartz10612/poppler-windows/releases/ and add it to your PATH"
29 |     }
30 | 
31 |     print("\n⚠️ WARNING: Required system dependency 'poppler' not found!")
32 |     print("This package requires poppler for PDF processing.")
33 |     print("\nTo install poppler on your system:")
34 |     print(instructions.get(system, 'Please install poppler for your operating system'))
35 |     print("\nContinuing without PDF processing capability...\n")
36 | 
37 | # Run the check when the package is imported
38 | _check_poppler_installation()
39 | 
40 | # Import main package components
41 | from .document_loader import SmartLLMLoader
42 | 
43 | __version__ = "0.1.1"
44 | 
--------------------------------------------------------------------------------
/smart_llm_loader/document_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | Document loader module for handling different types of inputs (files and URLs).
3 | """
4 | import os
5 | from pathlib import Path
6 | import tempfile
7 | from typing import AsyncIterator, List, Optional, Iterator, Tuple, Union
8 | from langchain_community.document_loaders.base import BaseLoader
9 | from langchain_core.documents import Document
10 | import requests
11 | 
12 | from smart_llm_loader.llm import ImageProcessor, LLMProcessing
13 | from smart_llm_loader.utils import copy_file, save_output_file, is_pdf
14 | 
15 | 
16 | class SmartLLMLoader(BaseLoader):
17 |     """A flexible document loader that supports multiple input types."""
18 | 
19 |     def __init__(
20 |         self,
21 |         file_path: Optional[Union[str, Path]] = None,
22 |         url: Optional[str] = None,
23 |         chunk_strategy: str = 'contextual',
24 |         custom_prompt: Optional[str] = None,
25 |         model: str = "gemini/gemini-2.0-flash",
26 |         save_output: bool = False,
27 |         output_dir: Optional[Union[str, Path]] = None,
28 |         **kwargs,
29 |     ):
30 |         """Initialize the SmartLLMLoader with a file path or URL.
31 | 
32 |         Args:
33 |             file_path: Path to the file to load
34 |             url: URL to load the document from
35 |             chunk_strategy: Strategy to use for chunking the document: page, contextual or custom
36 |             custom_prompt: Custom prompt to use for chunking the document, this will override the default prompt
37 |             model: LLM model to use (any multi-modal model supported by litellm)
38 |             save_output: Whether to save the output files
39 |             output_dir: Directory to save output files (if save_output is True)
40 |             **kwargs: Additional arguments that will be passed to the litellm.completion method. 
41 |                 Refer: https://docs.litellm.ai/docs/completion/input and https://docs.litellm.ai/docs/providers
42 |         """
43 |         self.chunk_strategy = chunk_strategy
44 |         self.custom_prompt = custom_prompt
45 |         self.llm_processor = LLMProcessing(model=model, **kwargs)
46 | 
47 |         if file_path and url:
48 |             raise ValueError("Only one of file_path or url should be provided.")
49 | 
50 |         if not file_path and not url:
51 |             raise ValueError("Either file_path or url must be provided.")
52 | 
53 |         self.file_path, self.output_dir = (
54 |             self._load_from_path(file_path, save_output, output_dir)
55 |             if file_path
56 |             else self._load_from_url(url, save_output, output_dir)
57 |         )
58 | 
59 |     @staticmethod
60 |     def _load_from_path(
61 |         file_path: Union[str, Path], save_output: bool = False, output_dir: Optional[Union[str, Path]] = None
62 |     ) -> Tuple[Path, Optional[Path]]:
63 |         """Load documents from a file path."""
64 |         file_path = Path(file_path)
65 |         if not file_path.exists():
66 |             raise FileNotFoundError(f"File not found: {file_path}")
67 | 
68 |         if save_output or output_dir:
69 |             output_dir = Path(output_dir) if output_dir else Path(f"{os.getcwd()}/{file_path.stem}")
70 |             output_dir.mkdir(parents=True, exist_ok=True)
71 |             output_file = output_dir / file_path.name
72 |             copy_file(file_path, output_file)
73 | 
74 |         return file_path, output_dir
75 | 
76 |     @staticmethod
77 |     def _load_from_url(
78 |         url: str, save_output: bool = False, output_dir: Optional[Union[str, Path]] = None
79 |     ) -> Tuple[Path, Optional[Path]]:
80 |         """Load documents from a URL."""
81 |         response = requests.get(url)
82 |         response.raise_for_status()
83 |         is_link_to_pdf = is_pdf(url, response)
84 | 
85 |         if is_link_to_pdf:
86 |             with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
87 |                 temp_path = Path(temp_file.name)
88 |                 temp_file.write(response.content)
89 | 
90 |             if save_output or output_dir:
91 |                 url_filename = url.split('/')[-1] or 'output'
92 |                 url_filename = url_filename if ".pdf" in url_filename else url_filename + ".pdf"
93 |                 output_dir = Path(output_dir) if output_dir else Path(f"{os.getcwd()}/{Path(url_filename).stem}")
94 |                 output_dir.mkdir(parents=True, exist_ok=True)
95 |                 output_file = output_dir / url_filename
96 |                 copy_file(temp_path, output_file)
97 | 
98 |             return temp_path, output_dir
99 |         else:
100 |             raise ValueError("The URL does not point to a PDF file.")
101 | 
102 |     async def aload(self) -> list[Document]:
103 |         """Asynchronously load documents using LLM-based OCR with page-level chunking."""
104 |         return await self.llm_processor.async_process_document_with_llm(
105 |             self.file_path, chunk_strategy="page", output_dir=self.output_dir
106 |         )
107 | 
108 |     def load(self) -> List[Document]:
109 |         """Load documents from either a file path or URL.
110 | 
111 |         Processes the document using LLM-based OCR with basic page-level chunking.
112 | 
113 |         Returns:
114 |             List[Document]: List of processed document chunks
115 |         """
116 |         documents = self.llm_processor.process_document_with_llm(
117 |             self.file_path, chunk_strategy="page", output_dir=self.output_dir
118 |         )
119 |         return documents
120 | 
121 |     def load_and_split(self, text_splitter=None) -> List[Document]:
122 |         """Load Documents and split into chunks using LLM-based OCR processing. 
123 | 124 | Args: 125 | text_splitter: Optional text splitter (not used in current implementation) 126 | 127 | Returns: 128 | List[Document]: List of processed and chunked documents based on specified strategy 129 | """ 130 | documents = self.llm_processor.process_document_with_llm( 131 | self.file_path, self.chunk_strategy, self.custom_prompt, output_dir=self.output_dir 132 | ) 133 | return documents 134 | 135 | def _create_document(self, chunk: dict, page_num: int) -> Document: 136 | """Helper method to create a Document object from a chunk.""" 137 | return Document( 138 | page_content=chunk['content'], 139 | metadata={ 140 | 'page': page_num, 141 | 'semantic_theme': chunk.get('theme'), 142 | 'source': self.file_path, 143 | }, 144 | ) 145 | 146 | def lazy_load(self) -> Iterator[Document]: 147 | """Load Documents lazily, processing and yielding one page at a time. 148 | 149 | Yields: 150 | Document: Processed document chunks one at a time to conserve memory 151 | """ 152 | images = ImageProcessor.pdf_to_images(self.file_path) 153 | prompt = self.llm_processor.get_chunk_prompt('page') 154 | 155 | documents = [] 156 | for page_num, image in enumerate(images): 157 | result = self.llm_processor.process_image_with_llm(image, prompt) 158 | for chunk in result['markdown_chunks']: 159 | if chunk.get('content') is None: 160 | continue 161 | doc = self._create_document(chunk, page_num) 162 | documents.append(doc) 163 | yield doc 164 | 165 | save_output_file(documents, self.output_dir) 166 | 167 | async def alazy_load(self) -> AsyncIterator[Document]: 168 | """Load Documents lazily and asynchronously, processing and yielding one page at a time. 169 | 170 | Yields: 171 | Document: Processed document chunks one at a time asynchronously 172 | """ 173 | images = ImageProcessor.pdf_to_images(self.file_path) 174 | prompt = self.llm_processor.get_chunk_prompt('page') 175 | 176 | documents = [] 177 | for page_num, image in enumerate(images): 178 | result = await self.llm_processor.async_process_image_with_llm(image, prompt) 179 | for chunk in result['markdown_chunks']: 180 | if chunk.get('content') is None: 181 | continue 182 | doc = self._create_document(chunk, page_num) 183 | documents.append(doc) 184 | yield doc 185 | 186 | save_output_file(documents, self.output_dir) 187 | -------------------------------------------------------------------------------- /smart_llm_loader/llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pathlib import Path 3 | from typing import List, Optional, Union 4 | from base64 import b64encode 5 | import io 6 | from multiprocessing import cpu_count 7 | 8 | from PIL.Image import Image 9 | from langchain_core.documents import Document 10 | from pdf2image import convert_from_path 11 | from litellm import completion, validate_environment, supports_vision, check_valid_key, acompletion 12 | 13 | from smart_llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT 14 | from smart_llm_loader.schema import OCRResponse 15 | from smart_llm_loader.utils import save_output_file 16 | 17 | 18 | class ImageProcessor: 19 | @staticmethod 20 | def pdf_to_images(file_path: Optional[Union[str, Path]] = None) -> list[Image]: 21 | """Convert PDF pages to images all at once for better performance. 
22 | 23 | Args: 24 | file_path: Path to the PDF file to convert 25 | 26 | Returns: 27 | list[Image]: List of PIL Image objects, one per PDF page 28 | """ 29 | images = convert_from_path( 30 | file_path, 31 | dpi=300, 32 | fmt='PNG', 33 | size=(None, 1056), 34 | thread_count=cpu_count(), 35 | use_pdftocairo=True, 36 | ) 37 | return images 38 | 39 | @staticmethod 40 | def image_to_base64(image: Image) -> str: 41 | """Convert an image to a base64 string. 42 | 43 | Args: 44 | image: PIL Image object to convert 45 | 46 | Returns: 47 | str: Base64 encoded string representation of the image 48 | """ 49 | img_byte_arr = io.BytesIO() 50 | image.save(img_byte_arr, format='PNG') 51 | img_bytes = img_byte_arr.getvalue() 52 | return b64encode(img_bytes).decode('utf-8') 53 | 54 | 55 | class LLMProcessing: 56 | def __init__(self, model: str = "gemini/gemini-2.0-flash", **kwargs): 57 | self._validate_model(model, **kwargs) 58 | self.model = model 59 | self.kwargs = kwargs 60 | 61 | @staticmethod 62 | def _validate_model(model: str, **kwargs) -> None: 63 | """Validate that the model is properly configured for vision tasks.""" 64 | environment = validate_environment(model=model) 65 | api_key = kwargs.get("api_key") 66 | 67 | if not environment["keys_in_environment"] and not api_key: 68 | raise ValueError(f"Missing environment variables for {model}: {environment}") 69 | 70 | if not supports_vision(model=model): 71 | raise ValueError(f"Model '{model}' is not a supported vision model.") 72 | 73 | if not check_valid_key(model=model, api_key=api_key): 74 | raise ValueError(f"Failed to access model '{model}'. Please check your API key and model availability.") 75 | 76 | @staticmethod 77 | def get_chunk_prompt(strategy: str, custom_prompt: Optional[str] = None) -> str: 78 | if strategy == 'custom' and not custom_prompt: 79 | raise ValueError("Custom prompt is not provided. 
A custom prompt is required for 'custom' strategy.") 80 | 81 | if custom_prompt: 82 | return custom_prompt 83 | 84 | elif strategy == 'page': 85 | return DEFAULT_PAGE_CHUNK_PROMPT 86 | 87 | elif strategy == 'contextual': 88 | return DEFAULT_CHUNK_PROMPT 89 | 90 | else: 91 | raise ValueError(f"Invalid chunk strategy: {strategy}, must be one of 'page', 'contextual' or 'custom'") 92 | 93 | @staticmethod 94 | def prepare_llm_messages(page_as_image: Image, prompt: str) -> List[dict]: 95 | base64_image = ImageProcessor.image_to_base64(page_as_image) 96 | messages = [ 97 | {"role": "system", "content": prompt}, 98 | { 99 | "role": "user", 100 | "content": [ 101 | {"type": "text", "text": "Process this image:"}, 102 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}, 103 | ], 104 | }, 105 | ] 106 | return messages 107 | 108 | @staticmethod 109 | def serialize_response(results: List[dict], file_path: Optional[Union[str, Path]] = None) -> List[Document]: 110 | documents = [] 111 | for page_num, result in enumerate(results): 112 | for chunk in result['markdown_chunks']: 113 | if chunk.get('theme') is None and chunk.get('content') is None: 114 | continue 115 | 116 | doc = Document( 117 | page_content=chunk['content'], 118 | metadata={ 119 | 'page': page_num, 120 | 'semantic_theme': chunk.get('theme'), 121 | 'source': file_path, 122 | }, 123 | ) 124 | documents.append(doc) 125 | 126 | return documents 127 | 128 | def process_document_with_llm( 129 | self, 130 | file_path: Optional[Union[str, Path]] = None, 131 | chunk_strategy: str = 'page', 132 | custom_prompt: Optional[str] = None, 133 | output_dir: Optional[Union[str, Path]] = None, 134 | ) -> List[Document]: 135 | """Process a document with LLM for OCR and chunking. 136 | 137 | Args: 138 | file_path: Path to the document to process 139 | chunk_strategy: Strategy for chunking ('page', 'contextual', or 'custom') 140 | custom_prompt: Custom prompt to use for chunking 141 | output_dir: Directory to save processed output 142 | 143 | Returns: 144 | List[Document]: List of processed document chunks with metadata 145 | """ 146 | 147 | async def process_pdf(): 148 | images = ImageProcessor.pdf_to_images(file_path) 149 | prompt = self.get_chunk_prompt(chunk_strategy, custom_prompt) 150 | return await asyncio.gather(*[self.async_process_image_with_llm(img, prompt) for img in images]) 151 | 152 | results = asyncio.run(process_pdf()) 153 | documents = self.serialize_response(list(results), file_path) 154 | save_output_file(documents, output_dir) 155 | return documents 156 | 157 | async def async_process_document_with_llm( 158 | self, 159 | file_path: Optional[Union[str, Path]] = None, 160 | chunk_strategy: str = 'page', 161 | custom_prompt: Optional[str] = None, 162 | output_dir: Optional[Union[str, Path]] = None, 163 | ) -> List[Document]: 164 | """Process a document with LLM for OCR and chunking asynchronously.""" 165 | images = ImageProcessor.pdf_to_images(file_path) 166 | prompt = self.get_chunk_prompt(chunk_strategy, custom_prompt) 167 | results = list(await asyncio.gather(*[self.async_process_image_with_llm(img, prompt) for img in images])) 168 | documents = self.serialize_response(list(results), file_path) 169 | save_output_file(documents, output_dir) 170 | return documents 171 | 172 | async def async_process_image_with_llm(self, page_as_image: Image, prompt: str) -> dict: 173 | """Convert image to base64 and chunk the image with LLM asynchronously. 
174 | 
175 |         Args:
176 |             page_as_image: PIL Image object to process
177 |             prompt: Prompt to use for LLM processing
178 | 
179 |         Returns:
180 |             dict: Processed chunks with content and metadata
181 |         """
182 |         messages = self.prepare_llm_messages(page_as_image, prompt)
183 |         try:
184 |             response = await acompletion(
185 |                 model=self.model,
186 |                 messages=messages,
187 |                 response_format=OCRResponse,
188 |                 **self.kwargs,
189 |             )
190 | 
191 |             result = response.choices[0].message.content
192 |             _response = OCRResponse.model_validate_json(result)
193 |             return _response.model_dump()
194 | 
195 |         except Exception as e:
196 |             print(f"Error in LLM processing: {e}")
197 |             return {"markdown_chunks": [{"content": None, "theme": None}]}  # fallback chunk; skipped at serialization
198 | 
199 |     def process_image_with_llm(self, page_as_image: Image, prompt: str) -> dict:
200 |         """Convert image to base64 and chunk the image with LLM."""
201 |         messages = self.prepare_llm_messages(page_as_image, prompt)
202 |         try:
203 |             response = completion(
204 |                 model=self.model,
205 |                 messages=messages,
206 |                 response_format=OCRResponse,
207 |                 **self.kwargs,
208 |             )
209 | 
210 |             result = response.choices[0].message.content
211 |             _response = OCRResponse.model_validate_json(result)
212 |             return _response.model_dump()
213 | 
214 |         except Exception as e:
215 |             print(f"Error in LLM processing: {e}")
216 |             return {"markdown_chunks": [{"content": None, "theme": None}]}  # fallback chunk; skipped at serialization
--------------------------------------------------------------------------------
/smart_llm_loader/prompts.py:
--------------------------------------------------------------------------------
1 | DEFAULT_CHUNK_PROMPT = """OCR the following page into Markdown format.
2 | - All tables, charts, and other visual elements must be formatted in Markdown.
3 | - Do not exclude any content from the page.
4 | - Chunk the page into sections with the same semantic theme.
5 | - Our goal is to identify parts of the page with the same semantic theme. These chunks will
6 | be embedded and used in a RAG pipeline.
7 | - All chunks must be in Markdown format.
8 | 
9 | Images in the document should be properly described in detail such that an LLM can understand the
10 | image and answer questions about the image without seeing the image.
11 | The image description should be returned as a chunk too.
12 | """
13 | 
14 | DEFAULT_PAGE_CHUNK_PROMPT = """OCR the following page into Markdown format.
15 | - All tables must be formatted in Markdown.
16 | - The contents of the page should be returned as a single chunk.
17 | - Do not exclude any content from the page.
18 | - Also return the semantic theme of the page.
19 | 
20 | Images in the document should be properly described in detail such that an LLM can understand the image and answer
21 | questions about the image without seeing the image.
22 | The description should be returned as a part of the page content.
23 | """ -------------------------------------------------------------------------------- /smart_llm_loader/schema.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from pydantic import BaseModel 3 | 4 | 5 | class Chunk(BaseModel): 6 | content: str 7 | theme: Optional[str] = None 8 | 9 | 10 | class OCRResponse(BaseModel): 11 | markdown_chunks: List[Chunk] 12 | -------------------------------------------------------------------------------- /smart_llm_loader/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | from typing import List 4 | 5 | import requests 6 | from langchain_core.documents import Document 7 | import json 8 | 9 | 10 | def is_pdf(url: str, response: requests.Response) -> bool: 11 | """Check if the URL points to a PDF file.""" 12 | return url.lower().endswith('.pdf') or response.headers.get('Content-Type', '').lower() in [ 13 | 'application/pdf', 14 | 'binary/octet-stream', 15 | ] 16 | 17 | 18 | def save_output_file(documents: List[Document], output_dir: Path) -> None: 19 | """Save the chunks and input file to a folder.""" 20 | if not output_dir or not documents: 21 | return 22 | 23 | output_dir.mkdir(exist_ok=True) 24 | chunks_data = [ 25 | { 26 | "content": doc.page_content, 27 | "metadata": {**doc.metadata, "source": str(doc.metadata["source"]) if "source" in doc.metadata else None}, 28 | } 29 | for doc in documents 30 | ] 31 | 32 | identifier = documents[0].metadata.get("source") or output_dir.stem 33 | identifier = Path(identifier).name.rsplit('.', 1)[0] 34 | 35 | chunks_file = output_dir / f"{identifier}_chunks.json" 36 | with open(chunks_file, "w", encoding="utf-8") as f: 37 | json.dump(chunks_data, f, indent=2, ensure_ascii=False) 38 | 39 | 40 | def copy_file(file_path: Path, output_file: Path) -> None: 41 | """Copy the file to the output directory.""" 42 | try: 43 | shutil.copy2(file_path, output_file) 44 | except shutil.SameFileError: 45 | pass 46 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from PIL import Image 3 | 4 | 5 | @pytest.fixture(autouse=True) 6 | def mock_env_vars(monkeypatch): 7 | """Mock environment variables needed for testing.""" 8 | monkeypatch.setenv("GEMINI_API_KEY", "test_api_key") 9 | 10 | 11 | @pytest.fixture 12 | def test_dir(tmp_path): 13 | """Create a temporary directory for test files.""" 14 | return tmp_path 15 | 16 | 17 | @pytest.fixture 18 | def sample_pdf_path(test_dir): 19 | """Create a sample PDF file for testing.""" 20 | pdf_path = test_dir / "test.pdf" 21 | pdf_path.write_bytes(b"%PDF-1.4\n%EOF") # Minimal valid PDF 22 | return pdf_path 23 | 24 | 25 | @pytest.fixture 26 | def sample_image(): 27 | """Create a sample image for testing.""" 28 | return Image.new('RGB', (100, 100), color='white') 29 | 30 | 31 | @pytest.fixture 32 | def output_dir(test_dir): 33 | """Create an output directory for test results.""" 34 | output_path = test_dir / "output" 35 | output_path.mkdir(exist_ok=True) 36 | return output_path 37 | -------------------------------------------------------------------------------- /tests/test_document_loader.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | import tempfile 4 | from unittest.mock import Mock 5 | from 
6 | 
7 | from smart_llm_loader.document_loader import SmartLLMLoader
8 | 
9 | 
10 | @pytest.fixture(autouse=True)
11 | def mock_llm_validation(mocker):
12 |     """Mock LLM validation for all tests."""
13 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
14 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
15 |     mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
16 | 
17 | 
18 | @pytest.fixture
19 | def sample_pdf_path(tmp_path):
20 |     pdf_path = tmp_path / "test.pdf"
21 |     pdf_path.write_bytes(b"%PDF-1.4\n%EOF")  # Minimal valid PDF
22 |     return pdf_path
23 | 
24 | 
25 | @pytest.fixture
26 | def mock_response():
27 |     mock = Mock()
28 |     mock.content = b"%PDF-1.4\n%EOF"  # Minimal valid PDF content
29 |     mock.headers = {"content-type": "application/pdf"}
30 |     return mock
31 | 
32 | 
33 | def test_init_with_file_path(sample_pdf_path):
34 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
35 |     assert str(loader.file_path) == str(sample_pdf_path)
36 |     assert loader.output_dir is None
37 | 
38 | 
39 | def test_init_with_url(mocker, mock_response):
40 |     url = "http://example.com/test.pdf"
41 |     mocker.patch('requests.get', return_value=mock_response)
42 | 
43 |     with tempfile.NamedTemporaryFile(suffix='.pdf') as temp_file:
44 |         mocker.patch('tempfile.NamedTemporaryFile', return_value=temp_file)
45 |         loader = SmartLLMLoader(url=url)
46 |         assert isinstance(loader.file_path, Path)
47 | 
48 | 
49 | def test_init_with_both_file_and_url(sample_pdf_path):
50 |     with pytest.raises(ValueError, match=r"Only one of file_path or url should be provided\."):
51 |         SmartLLMLoader(file_path=sample_pdf_path, url="http://example.com/test.pdf")
52 | 
53 | 
54 | def test_init_with_neither_file_nor_url():
55 |     with pytest.raises(ValueError, match=r"Either file_path or url must be provided\."):
56 |         SmartLLMLoader()
57 | 
58 | 
59 | def test_load_from_path_with_output_dir(sample_pdf_path, tmp_path):
60 |     output_dir = tmp_path / "output"
61 |     loader = SmartLLMLoader(file_path=sample_pdf_path, save_output=True, output_dir=output_dir)
62 | 
63 |     assert loader.output_dir == output_dir
64 |     assert (output_dir / sample_pdf_path.name).exists()
65 | 
66 | 
67 | def test_load_from_url_invalid_content(mocker):
68 |     url = "http://example.com/test.txt"
69 |     mock_resp = Mock()
70 |     mock_resp.content = b"Not a PDF"
71 |     mock_resp.headers = {"content-type": "text/plain"}
72 |     mocker.patch('requests.get', return_value=mock_resp)
73 | 
74 |     with pytest.raises(ValueError, match=r"The URL does not point to a PDF file\."):
75 |         SmartLLMLoader(url=url)
76 | 
77 | 
78 | def test_load_method(mocker, sample_pdf_path):
79 |     mock_documents = [Document(page_content="Test content")]
80 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)
81 | 
82 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
83 |     documents = loader.load()
84 | 
85 |     assert len(documents) == 1
86 |     assert documents[0].page_content == "Test content"
87 | 
88 | 
89 | @pytest.mark.asyncio
90 | async def test_aload_method(mocker, sample_pdf_path):
91 |     mock_documents = [Document(page_content="Test content")]
92 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.async_process_document_with_llm', return_value=mock_documents)
93 | 
94 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
95 |     documents = await loader.aload()
96 | 
97 |     assert len(documents) == 1
98 |     assert documents[0].page_content == "Test content"
99 | 
100 | 
101 | def test_load_and_split_method(mocker, sample_pdf_path):
102 |     mock_documents = [Document(page_content="Test content")]
103 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)
104 | 
105 |     loader = SmartLLMLoader(file_path=sample_pdf_path, chunk_strategy="contextual")
106 |     documents = loader.load_and_split()
107 | 
108 |     assert len(documents) == 1
109 |     assert documents[0].page_content == "Test content"
110 | 
111 | 
112 | def test_create_document(sample_pdf_path):
113 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
114 |     chunk = {"content": "Test content", "theme": "Test theme"}
115 |     page_num = 1
116 | 
117 |     doc = loader._create_document(chunk, page_num)
118 | 
119 |     assert isinstance(doc, Document)
120 |     assert doc.page_content == "Test content"
121 |     assert doc.metadata["page"] == page_num
122 |     assert doc.metadata["semantic_theme"] == "Test theme"
123 |     assert doc.metadata["source"] == loader.file_path
124 | 
125 | 
126 | def test_lazy_load(mocker, sample_pdf_path):
127 |     # Mock the necessary components
128 |     mock_images = [Mock()]
129 |     mock_result = {"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}
130 | 
131 |     mocker.patch('smart_llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
132 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.process_image_with_llm', return_value=mock_result)
133 | 
134 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
135 |     documents = list(loader.lazy_load())
136 | 
137 |     assert len(documents) == 1
138 |     assert documents[0].page_content == "Test content"
139 |     assert documents[0].metadata["semantic_theme"] == "Test theme"
140 | 
141 | 
142 | @pytest.mark.asyncio
143 | async def test_alazy_load(mocker, sample_pdf_path):
144 |     # Mock the necessary components
145 |     mock_images = [Mock()]
146 |     mock_result = {"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}
147 | 
148 |     mocker.patch('smart_llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
149 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.async_process_image_with_llm', return_value=mock_result)
150 | 
151 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
152 |     documents = [doc async for doc in loader.alazy_load()]
153 | 
154 |     assert len(documents) == 1
155 |     assert documents[0].page_content == "Test content"
156 |     assert documents[0].metadata["semantic_theme"] == "Test theme"
157 | 
--------------------------------------------------------------------------------
/tests/test_image_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from PIL import Image
3 | import io
4 | import base64
5 | 
6 | from smart_llm_loader.llm import ImageProcessor
7 | 
8 | 
9 | @pytest.fixture
10 | def sample_pdf_path(tmp_path):
11 |     # Create a dummy PDF file for testing
12 |     pdf_path = tmp_path / "test.pdf"
13 |     pdf_path.write_bytes(b"%PDF-1.4\n%EOF")  # Minimal valid PDF
14 |     return pdf_path
15 | 
16 | 
17 | @pytest.fixture
18 | def sample_image():
19 |     # Create a simple test image
20 |     img = Image.new('RGB', (100, 100), color='white')
21 |     return img
22 | 
23 | 
24 | def test_pdf_to_images(sample_pdf_path, mocker):
25 |     # Mock pdf2image.convert_from_path
26 |     mock_images = [Image.new('RGB', (100, 100)) for _ in range(2)]
27 |     mocker.patch('smart_llm_loader.llm.convert_from_path', return_value=mock_images)
28 | 
29 |     images = ImageProcessor.pdf_to_images(sample_pdf_path)
30 | 
31 |     assert len(images) == 2
32 |     assert all(isinstance(img, Image.Image) for img in images)
33 | 
34 | 
35 | def test_image_to_base64(sample_image):
36 |     base64_str = ImageProcessor.image_to_base64(sample_image)
37 | 
38 |     # Verify it's a valid base64 string
39 |     assert isinstance(base64_str, str)
40 | 
41 |     # Verify we can decode it back to an image
42 |     try:
43 |         decoded = base64.b64decode(base64_str)
44 |         Image.open(io.BytesIO(decoded))
45 |     except Exception as e:
46 |         pytest.fail(f"Failed to decode base64 string: {e}")
47 | 
48 | 
49 | def test_pdf_to_images_file_not_found():
50 |     with pytest.raises(Exception):
51 |         ImageProcessor.pdf_to_images("nonexistent.pdf")
52 | 
--------------------------------------------------------------------------------
/tests/test_llm_processing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from PIL import Image
3 | from langchain_core.documents import Document
4 | from unittest.mock import Mock
5 | 
6 | from smart_llm_loader.llm import LLMProcessing
7 | from smart_llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT
8 | 
9 | 
10 | @pytest.fixture
11 | def llm_processor(mocker):
12 |     # Mock all validation functions
13 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
14 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
15 |     mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
16 |     return LLMProcessing(model="gemini/gemini-2.0-flash")
17 | 
18 | 
19 | @pytest.fixture
20 | def sample_image():
21 |     return Image.new('RGB', (100, 100), color='white')
22 | 
23 | 
24 | def test_validate_model_valid(mocker):
25 |     # Mock the validation functions
26 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
27 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
28 |     mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
29 | 
30 |     # Should not raise any exceptions
31 |     LLMProcessing(model="gemini/gemini-2.0-flash")
32 | 
33 | 
34 | def test_validate_model_missing_env_vars(mocker):
35 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": False})
36 | 
37 |     with pytest.raises(ValueError, match="Missing environment variables"):
38 |         LLMProcessing(model="gemini/gemini-2.0-flash")
39 | 
40 | 
41 | def test_validate_model_unsupported_vision(mocker):
42 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
43 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=False)
44 | 
45 |     with pytest.raises(ValueError, match="not a supported vision model"):
46 |         LLMProcessing(model="unsupported-model")
47 | 
48 | 
49 | def test_get_chunk_prompt_default_page():
50 |     prompt = LLMProcessing.get_chunk_prompt('page')
51 |     assert prompt == DEFAULT_PAGE_CHUNK_PROMPT
52 | 
53 | 
54 | def test_get_chunk_prompt_default_contextual():
55 |     prompt = LLMProcessing.get_chunk_prompt('contextual')
56 |     assert prompt == DEFAULT_CHUNK_PROMPT
57 | 
58 | 
59 | def test_get_chunk_prompt_custom():
60 |     custom_prompt = "Custom test prompt"
61 |     prompt = LLMProcessing.get_chunk_prompt('custom', custom_prompt)
62 |     assert prompt == custom_prompt
63 | 
64 | 
65 | def test_get_chunk_prompt_custom_missing():
66 |     with pytest.raises(ValueError, match="Custom prompt is not provided"):
67 |         LLMProcessing.get_chunk_prompt('custom')
68 | 
69 | 
70 | def test_get_chunk_prompt_invalid_strategy():
71 |     with pytest.raises(ValueError, match="Invalid chunk strategy"):
72 |         LLMProcessing.get_chunk_prompt('invalid')
73 | 
74 | 
75 | def test_prepare_llm_messages(sample_image):
76 |     prompt = "Test prompt"
77 |     messages = LLMProcessing.prepare_llm_messages(sample_image, prompt)
78 | 
79 |     assert len(messages) == 2
80 |     assert messages[0]["role"] == "system"
81 |     assert messages[0]["content"] == prompt
82 |     assert messages[1]["role"] == "user"
83 |     assert len(messages[1]["content"]) == 2
84 |     assert messages[1]["content"][0]["type"] == "text"
85 |     assert messages[1]["content"][1]["type"] == "image_url"
86 | 
87 | 
88 | def test_serialize_response():
89 |     results = [
90 |         {
91 |             "markdown_chunks": [
92 |                 {"content": "Test content 1", "theme": "Theme 1"},
93 |                 {"content": "Test content 2", "theme": "Theme 2"}
94 |             ]
95 |         }
96 |     ]
97 |     file_path = "test.pdf"
98 | 
99 |     documents = LLMProcessing.serialize_response(results, file_path)
100 | 
101 |     assert len(documents) == 2
102 |     assert all(isinstance(doc, Document) for doc in documents)
103 |     assert documents[0].page_content == "Test content 1"
104 |     assert documents[0].metadata["semantic_theme"] == "Theme 1"
105 |     assert documents[0].metadata["source"] == file_path
106 | 
107 | 
108 | @pytest.mark.asyncio
109 | async def test_async_process_image_with_llm_success(llm_processor, sample_image, mocker):
110 |     # Create a mock response with the correct structure
111 |     mock_response = Mock()
112 |     mock_response.choices = [
113 |         Mock(
114 |             message=Mock(
115 |                 content='{"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}'
116 |             )
117 |         )
118 |     ]
119 |     mocker.patch('smart_llm_loader.llm.acompletion', return_value=mock_response)
120 | 
121 |     result = await llm_processor.async_process_image_with_llm(sample_image, "Test prompt")
122 | 
123 |     assert "markdown_chunks" in result
124 |     assert len(result["markdown_chunks"]) == 1
125 |     assert result["markdown_chunks"][0]["content"] == "Test content"
126 | 
127 | 
128 | @pytest.mark.asyncio
129 | async def test_async_process_image_with_llm_error(llm_processor, sample_image, mocker):
130 |     mocker.patch('smart_llm_loader.llm.acompletion', side_effect=Exception("Test error"))
131 | 
132 |     result = await llm_processor.async_process_image_with_llm(sample_image, "Test prompt")
133 | 
134 |     assert "markdown_chunks" in result
135 |     assert result["markdown_chunks"][0]["content"] is None
136 | 
--------------------------------------------------------------------------------
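
The pipeline the listing above implements is: rasterize the PDF to one image per page, send each page to the vision model with a chunking prompt, validate the JSON reply against `OCRResponse`, and wrap the chunks as LangChain `Document` objects. A minimal end-to-end sketch follows; it only uses constructor arguments exercised by the tests above, and the file path and environment setup (a real `GEMINI_API_KEY`) are illustrative assumptions, not fixtures from the repository:

```python
# Minimal usage sketch (not part of the repository). Assumes GEMINI_API_KEY is set
# and that examples/data/test.pdf exists locally.
from smart_llm_loader.document_loader import SmartLLMLoader

# 'contextual' asks the model to group page content by semantic theme
loader = SmartLLMLoader(file_path="examples/data/test.pdf", chunk_strategy="contextual")
documents = loader.load()

for doc in documents:
    # Metadata keys populated by LLMProcessing.serialize_response
    print(doc.metadata["page"], doc.metadata["semantic_theme"], doc.metadata["source"])
    print(doc.page_content[:200])
```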
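The `custom` strategy feeds a caller-supplied prompt through the same pipeline; whatever the prompt says, the model's reply must still parse as `OCRResponse`, since `response_format=OCRResponse` is passed on every completion call. A sketch against `LLMProcessing` directly, where the prompt text and file path are made-up examples:

```python
# Illustrative sketch; the prompt text and path are hypothetical.
from smart_llm_loader.llm import LLMProcessing

processor = LLMProcessing(model="gemini/gemini-2.0-flash")

# get_chunk_prompt raises ValueError if chunk_strategy='custom' comes without a prompt
documents = processor.process_document_with_llm(
    file_path="examples/data/test.pdf",
    chunk_strategy="custom",
    custom_prompt="OCR this page into Markdown and make one chunk per section heading.",
)

# Each page's reply is validated as JSON of the form:
# {"markdown_chunks": [{"content": "...", "theme": "..."}]}
```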
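For multi-page documents the async path is the natural entry point, since pages are dispatched concurrently with `asyncio.gather`. A sketch using the loader's `aload` (which, per the tests, delegates to `async_process_document_with_llm`), with the same hypothetical path:

```python
# Illustrative async sketch; path is hypothetical.
import asyncio

from smart_llm_loader.document_loader import SmartLLMLoader


async def main() -> list:
    loader = SmartLLMLoader(file_path="examples/data/test.pdf")
    # aload fans out one LLM call per page and gathers the results
    return await loader.aload()


documents = asyncio.run(main())
```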