├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .idea └── .gitignore ├── LICENSE ├── README.md ├── examples ├── data │ ├── pymupdf_output.json │ ├── test.pdf │ ├── test_chunks.json │ ├── test_ocr_doc.pdf │ ├── test_ocr_doc.png │ └── test_ocr_doc_chunks.json ├── ocr_example.py └── rag_example.py ├── poetry.lock ├── pyproject.toml ├── smart_llm_loader ├── __init__.py ├── document_loader.py ├── llm.py ├── prompts.py ├── schema.py └── utils.py └── tests ├── conftest.py ├── test_document_loader.py ├── test_image_processor.py └── test_llm_processing.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v2 12 | - name: Set up Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: '3.12' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install poetry 20 | - name: Build and publish 21 | env: 22 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.PYPI_TOKEN }} 23 | run: | 24 | poetry build 25 | poetry publish -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | 30 | # IDE 31 | .idea/ 32 | .vscode/ 33 | *.swp 34 | *.swo 35 | 36 | # Distribution 37 | dist/ 38 | build/ 39 | 40 | # Misc 41 | .DS_Store 42 | .coverage 43 | htmlcov/ 44 | .pytest_cache/ -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 David Emmanuel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SmartLLMLoader
2 | 
3 | smart-llm-loader is a lightweight yet powerful Python package that transforms any document into LLM-ready chunks. It handles the entire document processing pipeline:
4 | 
5 | - 📄 Converts documents to clean markdown
6 | - 🔍 Built-in OCR for scanned documents and images
7 | - ✂️ Smart, context-aware text chunking
8 | - 🔌 Seamless integration with LangChain and LlamaIndex
9 | - 📦 Ready for vector stores and LLM ingestion
10 | 
11 | Spend less time on preprocessing headaches and more time building what matters. From RAG systems to chatbots to document Q&A,
12 | SmartLLMLoader handles the heavy lifting so you can focus on creating exceptional AI applications.
13 | 
14 | SmartLLMLoader's chunking approach has been benchmarked against traditional methods, showing superior performance, particularly when paired with Google's Gemini Flash model. This combination offers an efficient and cost-effective solution for document chunking in RAG systems. View the detailed performance comparison [here](https://www.sergey.fyi/articles/gemini-flash-2).
15 | 
16 | 
17 | ## Features
18 | 
19 | - Support for multiple LLM providers
20 | - Built-in OCR for scanned documents and images
21 | - Flexible document type support
22 | - Supports different chunking strategies, such as context-aware chunking and page-based chunking
23 | - Supports custom prompts and custom chunking
24 | 
25 | ## Installation
26 | 
27 | ### System Dependencies
28 | 
29 | First, install Poppler if you don't have it already (required for PDF processing):
30 | 
31 | **Ubuntu/Debian:**
32 | ```bash
33 | sudo apt-get install poppler-utils
34 | ```
35 | 
36 | **macOS:**
37 | ```bash
38 | brew install poppler
39 | ```
40 | 
41 | **Windows:**
42 | 1. Download the latest [Poppler for Windows](https://github.com/oschwartz10612/poppler-windows/releases/)
43 | 2. Extract the downloaded file
44 | 3. Add the `bin` directory to your system PATH
45 | 
46 | ### Package Installation
47 | 
48 | You can install SmartLLMLoader using pip:
49 | 
50 | ```bash
51 | pip install smart-llm-loader
52 | ```
53 | 
54 | Or using Poetry:
55 | 
56 | ```bash
57 | poetry add smart-llm-loader
58 | ```
59 | 
60 | ## Quick Start
61 | The smart-llm-loader package uses litellm to call the LLM, so any arguments supported by litellm can be passed through. You can find the litellm documentation [here](https://docs.litellm.ai/docs/providers).
62 | You can use any multi-modal model supported by litellm.
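For instance, any completion argument that litellm accepts (such as `temperature` or `max_tokens`) can be forwarded through the loader's keyword arguments. Here is a minimal sketch based on the constructor's documented `**kwargs` pass-through (the file name is a placeholder, and `GEMINI_API_KEY` is assumed to be set in the environment):

```python
from smart_llm_loader import SmartLLMLoader

# Keyword arguments not consumed by the loader itself are passed
# straight through to litellm.completion.
loader = SmartLLMLoader(
    file_path="your_document.pdf",  # placeholder path
    model="gemini/gemini-2.0-flash",
    temperature=0,  # forwarded to litellm.completion
)
documents = loader.load_and_split()
```

The full Quick Start below walks through provider setup and chunking options: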
63 | 
64 | ```python
65 | import os
66 | from smart_llm_loader import SmartLLMLoader
67 | 
68 | # Using Gemini Flash model
69 | os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY"
70 | model = "gemini/gemini-1.5-flash"
71 | 
72 | # Using an OpenAI model
73 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
74 | model = "openai/gpt-4o"
75 | 
76 | # Using an Anthropic model
77 | os.environ["ANTHROPIC_API_KEY"] = "YOUR_ANTHROPIC_API_KEY"
78 | model = "anthropic/claude-3-5-sonnet"
79 | 
80 | 
81 | # Initialize the document loader
82 | loader = SmartLLMLoader(
83 |     file_path="your_document.pdf",
84 |     chunk_strategy="contextual",
85 |     model=model,
86 | )
87 | # Load and split the document into chunks
88 | documents = loader.load_and_split()
89 | ```
90 | 
91 | ## Parameters
92 | 
93 | ```python
94 | class SmartLLMLoader(BaseLoader):
95 |     """A flexible document loader that supports multiple input types."""
96 | 
97 |     def __init__(
98 |         self,
99 |         file_path: Optional[Union[str, Path]] = None,  # path to the document to load
100 |         url: Optional[str] = None,  # URL of the document to load
101 |         chunk_strategy: str = 'contextual',  # chunking strategy to use (page, contextual, custom)
102 |         custom_prompt: Optional[str] = None,  # custom prompt to use
103 |         model: str = "gemini/gemini-2.0-flash",  # LLM model to use
104 |         save_output: bool = False,  # whether to save the output to a file
105 |         output_dir: Optional[Union[str, Path]] = None,  # directory to save the output to
106 |         api_key: Optional[str] = None,  # API key to use
107 |         **kwargs,
108 |     ):
109 | ```
110 | 
111 | ## Comparison with Traditional Methods
112 | 
113 | Let's see SmartLLMLoader in action! We'll compare it with PyMuPDF (a popular traditional document loader) to demonstrate why SmartLLMLoader's intelligent chunking makes such a difference in real-world applications.
114 | 
115 | ### The Challenge: Processing an Invoice
116 | We'll process this sample invoice that includes headers, tables, and complex formatting:
117 | 
118 | ![Sample Invoice Document](https://raw.githubusercontent.com/AskYourPdf/llm-loader/refs/heads/master/examples/data/test_ocr_doc.png?height=200)
119 | 
120 | ### Head-to-Head Comparison
121 | 
122 | #### 1. SmartLLMLoader Output
123 | SmartLLMLoader intelligently breaks down the document into semantic chunks, preserving structure and meaning (note that the JSON output below has been formatted for readability):
124 | 
125 | ```json
126 | [
127 |   {
128 |     "content": "Invoice no: 27301261\nDate of issue: 10/09/2012",
129 |     "metadata": {
130 |       "page": 0,
131 |       "semantic_theme": "invoice_header",
132 |       "source": "data/test_ocr_doc.pdf"
133 |     }
134 |   },
135 |   {
136 |     "content": "Seller:\nWilliams LLC\n72074 Taylor Plains Suite 342\nWest Alexandria, AR 97978\nTax Id: 922-88-2832\nIBAN: GB70FTNR64199348221780",
137 |     "metadata": {
138 |       "page": 0,
139 |       "semantic_theme": "seller_information",
140 |       "source": "data/test_ocr_doc.pdf"
141 |     }
142 |   },
143 |   {
144 |     "content": "Client:\nHernandez-Anderson\n084 Carter Lane Apt. 846\nSouth Ronaldbury, AZ 91030\nTax Id: 959-74-5868",
145 |     "metadata": {
146 |       "page": 0,
147 |       "semantic_theme": "client_information",
148 |       "source": "data/test_ocr_doc.pdf"
149 |     }
150 |   },
151 |   {
152 |     "content":
153 |       "Item table:\n"
154 |       "| No. 
| Description | Qty | UM | Net price | Net worth | VAT [%] | Gross worth |\n" 155 | "|-----|-----------------------------------------------------------|------|------|-----------|-----------|---------|-------------|\n" 156 | "| 1 | Lilly Pulitzer dress Size 2 | 5.00 | each | 45.00 | 225.00 | 10% | 247.50 |\n" 157 | "| 2 | New ERIN Erin Fertherston Straight Dress White Sequence Lining Sleeveless SZ 10 | 1.00 | each | 59.99 | 59.99 | 10% | 65.99 |\n" 158 | "| 3 | Sequence dress Size Small | 3.00 | each | 35.00 | 105.00 | 10% | 115.50 |\n" 159 | "| 4 | fire los angeles dress Medium | 3.00 | each | 6.50 | 19.50 | 10% | 21.45 |\n" 160 | "| 5 | Eileen Fisher Women's Long Sleeve Fleece Lined Front Pockets Dress XS Gray | 3.00 | each | 15.99 | 47.97 | 10% | 52.77 |\n" 161 | "| 6 | Lularoe Nicole Dress Size Small Light Solid Grey/White Ringer Tee Trim | 2.00 | each | 3.75 | 7.50 | 10% | 8.25 |\n" 162 | "| 7 | J.Crew Collection Black & White sweater Dress sz S | 1.00 | each | 30.00 | 30.00 | 10% | 33.00 |", 163 | "metadata": { 164 | "page": 0, 165 | "semantic_theme": "items_table", 166 | "source": "data/test_ocr_doc.pdf" 167 | } 168 | }, 169 | { 170 | "content": "Summary table:\n" 171 | "| VAT [%] | Net worth | VAT | Gross worth |\n" 172 | "|---------|-----------|--------|-------------|\n" 173 | "| 10% | 494,96 | 49,50 | 544,46 |\n" 174 | "| Total | $ 494,96 | $ 49,50| $ 544,46 |", 175 | "metadata": { 176 | "page": 0, 177 | "semantic_theme": "summary_table", 178 | "source": "data/test_ocr_doc.pdf" 179 | } 180 | } 181 | ] 182 | ``` 183 | 184 | **Key Benefits:** 185 | - ✨ Clean, structured chunks 186 | - 🎯 Semantic understanding 187 | - 📊 Preserved table formatting 188 | - 🏷️ Intelligent metadata tagging 189 | 190 | #### 2. Traditional PyMuPDF Output 191 | PyMuPDF provides a basic text extraction without semantic understanding: 192 | 193 | ```json 194 | [ 195 | { 196 | "page": 0, 197 | "content": "Invoice no: 27301261 \nDate of issue: \nSeller: \nWilliams LLC \n72074 Taylor Plains Suite 342 \nWest 198 | Alexandria, AR 97978 \nTax Id: 922-88-2832 \nIBAN: GB70FTNR64199348221780 \nITEMS \nNo. \nDescription \n2l \nLilly 199 | Pulitzer dress Size 2 \n2. \nNew ERIN Erin Fertherston \nStraight Dress White Sequence \nLining Sleeveless SZ 10 200 | \n3. \n Sequence dress Size Small \n4. \nfire los angeles dress Medium \nL \nEileen Fisher Women's Long \nSleeve 201 | Fleece Lined Front \nPockets Dress XS Gray \n6. \nLularoe Nicole Dress Size Small \nLight Solid Grey/ White 202 | Ringer \nTee Trim \nT \nJ.Crew Collection Black & White \nsweater Dress sz S \nSUMMARY \nTotal \n2,00 \n1,00 203 | \nVAT [%] \n10% \n10/09/2012 \neach \neach \nClient: \nHernandez-Anderson \n084 Carter Lane Apt. 
846 \nSouth 204 | Ronaldbury, AZ 91030 \nTax Id: 959-74-5868 \nNet price \n Net worth \nVAT [%] \n45,00 \n225,00 \n10% \n59,99 205 | \n59,99 \n10% \n35,00 \n105,00 \n10% \n6,50 \n19,50 \n10% \n15,99 \n47,97 \n10% \n3,75 \n7.50 \n10% \n30,00 206 | \n30,00 \n10% \nNet worth \nVAT \n494,96 \n49,50 \n$ 494,96 \n$49,50 \nGross \nworth \n247,50 \n65,99 \n115,50 207 | \n21,45 \n52,77 \n8,25 \n33,00 \nGross worth \n544,46 \n$ 544,46 \n", 208 | "metadata": { 209 | "source": "./data/test_ocr_doc.pdf", 210 | "file_path": "./data/test_ocr_doc.pdf", 211 | "page": 0, 212 | "total_pages": 1, 213 | "format": "PDF 1.5", 214 | "title": "", 215 | "author": "", 216 | "subject": "", 217 | "keywords": "", 218 | "creator": "", 219 | "producer": "AskYourPDF.com", 220 | "creationDate": "", 221 | "modDate": "D:20250213152908Z", 222 | "trapped": "" 223 | } 224 | } 225 | ] 226 | ``` 227 | 228 | ### Real-World Impact: RAG Performance 229 | 230 | Let's see how this difference affects a real Question-Answering system: 231 | 232 | ```python 233 | question = "What is the total gross worth for item 1 and item 7?" 234 | 235 | # SmartLLMLoader Result ✅ 236 | "The total gross worth for item 1 (Lilly Pulitzer dress) is $247.50 and for item 7 237 | (J.Crew Collection sweater dress) is $33.00. 238 | Total: $280.50" 239 | 240 | # PyMuPDF Result ❌ 241 | "The total gross worth for item 1 is $45.00, and for item 7 it is $33.00. 242 | Total: $78.00" 243 | ``` 244 | 245 | **Why SmartLLMLoader Won:** 246 | - 🎯 Maintained table structure 247 | - 💡 Preserved relationships between data 248 | - 📊 Accurate calculations 249 | - 🤖 Better context for the LLM 250 | 251 | You can try it yourself by running the complete [RAG example](./examples/rag_example.py) to see the difference in action! 252 | 253 | ## License 254 | 255 | This project is licensed under the MIT License - see the LICENSE file for details. 256 | 257 | ## Contributing 258 | 259 | Contributions are welcome! Please feel free to submit a Pull Request. 260 | 261 | ## Authors 262 | 263 | - David Emmanuel ([@drmingler](https://github.com/drmingler)) -------------------------------------------------------------------------------- /examples/data/pymupdf_output.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "page": 0, 4 | "content": "Invoice no: 27301261 \nDate of issue: \nSeller: \nWilliams LLC \n72074 Taylor Plains Suite 342 \nWest Alexandria, AR 97978 \nTax Id: 922-88-2832 \nIBAN: GB70FTNR64199348221780 \nITEMS \nNo. \nDescription \n2l \nLilly Pulitzer dress Size 2 \n2. \nNew ERIN Erin Fertherston \nStraight Dress White Sequence \nLining Sleeveless SZ 10 \n3. \n Sequence dress Size Small \n4. \nfire los angeles dress Medium \nL \nEileen Fisher Women's Long \nSleeve Fleece Lined Front \nPockets Dress XS Gray \n6. \nLularoe Nicole Dress Size Small \nLight Solid Grey/ White Ringer \nTee Trim \nT \nJ.Crew Collection Black & White \nsweater Dress sz S \nSUMMARY \nTotal \n2,00 \n1,00 \nVAT [%] \n10% \n10/09/2012 \neach \neach \nClient: \nHernandez-Anderson \n084 Carter Lane Apt. 
846 \nSouth Ronaldbury, AZ 91030 \nTax Id: 959-74-5868 \nNet price \n Net worth \nVAT [%] \n45,00 \n225,00 \n10% \n59,99 \n59,99 \n10% \n35,00 \n105,00 \n10% \n6,50 \n19,50 \n10% \n15,99 \n47,97 \n10% \n3,75 \n7.50 \n10% \n30,00 \n30,00 \n10% \nNet worth \nVAT \n494,96 \n49,50 \n$ 494,96 \n$49,50 \nGross \nworth \n247,50 \n65,99 \n115,50 \n21,45 \n52,77 \n8,25 \n33,00 \nGross worth \n544,46 \n$ 544,46 \n", 5 | "metadata": { 6 | "source": "./data/test_ocr_doc.pdf", 7 | "file_path": "./data/test_ocr_doc.pdf", 8 | "page": 0, 9 | "total_pages": 1, 10 | "format": "PDF 1.5", 11 | "title": "", 12 | "author": "", 13 | "subject": "", 14 | "keywords": "", 15 | "creator": "", 16 | "producer": "AskYourPDF.com", 17 | "creationDate": "", 18 | "modDate": "D:20250213152908Z", 19 | "trapped": "" 20 | } 21 | } 22 | ] 23 | 24 | 25 | -------------------------------------------------------------------------------- /examples/data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/smart-llm-loader/8ccb7c66a7944180a201f03b8e5549156ae10d08/examples/data/test.pdf -------------------------------------------------------------------------------- /examples/data/test_chunks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "content": "Synchronizing Machine Learning Algorithms, Realtime Robotic Control and Simulated Environment with 080\nVincent Berenz, Felix Widmaier, Simon Guist, Bernhard Schölkopf and Dieter Büchler\nMax Planck Institute for Intelligent Systems, Tübingen, Germany\n(work presented at the Robot Software Architectures Workshop - RSA 2023, ICRA)", 4 | "metadata": { 5 | "page": 0, 6 | "semantic_theme": "title", 7 | "source": "data/test.pdf" 8 | } 9 | }, 10 | { 11 | "content": "Abstract—Robotic applications require the integration of various modalities, encompassing perception, control of real robots and possibly the control of simulated environments. While the state-of-the-art robotic software solutions such as ROS 2 provide most of the required features, flexible synchronization between algorithms, data streams and control loops can be tedious. 080 is a versatile C++ framework for robotics which provides a shared memory model and a command framework for real-time critical systems. It enables expert users to set up complex robotic systems and generate Python bindings for scientists. 080's unique feature is its flexible synchronization between processes, including the traditional blocking commands and the novel \"bursting mode\", which allows user code to control the execution of the lower process control loop. This makes it particularly useful for setups that mix real and simulated environments.", 12 | "metadata": { 13 | "page": 0, 14 | "semantic_theme": "abstract", 15 | "source": "data/test.pdf" 16 | } 17 | }, 18 | { 19 | "content": "I. INTRODUCTION\n080¹ is an open-source C++ toolbox that allows expert users to create custom Python API suitable for interacting with complex robotic setup [1]. 
080 provides functions for:\n• the spawning realtime processes (e.g., running on RT-Preempt, low latency kernel or Xenomai)\n• the synchronization of these processes\n• the asynchronous access to a shared memory hosting the history of all sensor data\n• the sending of custom commands, including blocking and not blocking commands (see section II-B)\n• the automated generation of customized Python bindings\nWhile 080 shares similarities with ROS (spawning of processes and C++/Python interoperability) and actionlib (management of commands) [2][3], it has differences to them: it relies on a shared memory model rather than a publish-subscribe model. But the core difference, and novelty of 080, is its flexibility regarding synchronization. When using 080, users may either synchronize their higher level code with the lower level control process via blocking commands (see section II-B). Alternatively, it is possible to synchronize the lower level control process to the higher level code via the new 'bursting mode'. In bursting mode, the low-level control process blocks until the user process sends a request to run one or more iterations. This unique feature is useful when interacting with a simulated robot or even, as shown in section III, an experimental setup involving a real robot and a simulated environment.\nHowever, as opposed to ROS, 080 does not support network communication, as it requires the processes it orchestrates to run on the same computer.\nThe 080 framework is a two-levels system. The first level involves the expert user, who is responsible for implementing the C++ driver classes that are specific to the hardware used in the experiment. During compilation, 080 utilizes these classes as templates to generate executables that create the real-time control processes. In addition to implementing the driver classes, the expert user is responsible for generating a Python API tailored to the needs of the users. 080 allows for automated generation of Python bindings. As a result, the users will have access to a simple Python interface. These users can focus on designing experiments without being burdened by the implementation details of the robotic setup.", 20 | "metadata": { 21 | "page": 0, 22 | "semantic_theme": "introduction", 23 | "source": "data/test.pdf" 24 | } 25 | }, 26 | { 27 | "content": "II. OVERVIEW\nA. Backend, frontend and shared memory\n080 is based on the interaction between:\n• a backend, i.e., a process which communicates in realtime with a hardware device. It is responsible for sending commands to the device and receiving observations from", 28 | "metadata": { 29 | "page": 0, 30 | "semantic_theme": "overview", 31 | "source": "data/test.pdf" 32 | } 33 | }, 34 | { 35 | "content": "An image of a robotic arm is shown in the top right of the page. The robotic arm is yellow and black with a gripper at the end. It is mounted on a black base and appears to be in a laboratory setting. A computer monitor is visible in the background, showing a graphical user interface that seems to be related to the robotic arm control.", 36 | "metadata": { 37 | "page": 0, 38 | "semantic_theme": "image_description", 39 | "source": "data/test.pdf" 40 | } 41 | }, 42 | { 43 | "content": "**A. Shared Memory**\n\nUpon spawning, a backend creates a dedicated shared memory. It uses this shared memory to 1) read user commands and 2) write sensor and/or state related data. 
A frontend provides methods for connecting to a related backend's shared memory for 1) writing user commands and 2) for reading data.", 44 | "metadata": { 45 | "page": 1, 46 | "semantic_theme": "System Architecture", 47 | "source": "data/test.pdf" 48 | } 49 | }, 50 | { 51 | "content": "**B. Realtime control and commands**\n\nStable control on realtime critical systems requires high-frequency software loops. 080 backend processes are developed in C++ to comply with real-time constraints, calculating the desired state for each actuator at each iteration. Frontends provide Python methods that send higher-level, non-realtime commands to the backend's shared memory. These commands specify implicit desired state trajectories that rely on interpolation (based on specified durations, speeds, or numbers of server iterations). For instance, if a user command specifies that a robot should reach a desired state over a specified duration, the backend will generate the higher-frequency low-level commands that interpolate from the robot's current state to the desired state. By translating the frontend's commands into low-level commands that operate the system at the required frequency, the backend ensures stable control of the real-time critical systems. The frontend's API is flexible and allows for queuing or interrupting commands, as well as issuing both blocking and non-blocking commands.", 52 | "metadata": { 53 | "page": 1, 54 | "semantic_theme": "System Architecture", 55 | "source": "data/test.pdf" 56 | } 57 | }, 58 | { 59 | "content": "**C. Reading observation**\n\nThe backend writes current actuator state and custom sensor information to shared memory at each control iteration, which can be retrieved using various methods provided by the frontend API. Users can request the latest information, information from past server iterations, or wait for future data using a blocking method (which can be used for synchronizing user processes with backend, see section II-E). Multiple instances of the frontend can read asynchronously from shared memory. For example, this enables users to run logging scripts for the robot's state in parallel with control scripts.", 60 | "metadata": { 61 | "page": 1, 62 | "semantic_theme": "System Architecture", 63 | "source": "data/test.pdf" 64 | } 65 | }, 66 | { 67 | "content": "**D. Embedding backends**\n\nIn addition to generating executables that spawn backend processes, 080's API also supports embedding instances of C++ or Python backends in other processes. This feature can be utilized to extend 080's functionality to simulations. Section III provides an example of o80 backends being used to control the movement of bodies in a Mujoco environment.", 68 | "metadata": { 69 | "page": 1, 70 | "semantic_theme": "System Architecture", 71 | "source": "data/test.pdf" 72 | } 73 | }, 74 | { 75 | "content": "**E. Synchronization and bursting mode**\n\n080 provides two synchronization modes: \"normal\" and \"bursting\". In normal mode, the backend process runs in real-time at its required frequency, while the user Python process can synchronize with it through the blocking waiting methods mentioned earlier. However, in bursting mode, the backend process blocks until the frontend requires it to run one or more iterations. Bursting mode is typically used when interacting with a simulator. 
The frontend allows users to create commands that require several backend iterations to execute, which can then be executed as fast as the simulator allows.", 76 | "metadata": { 77 | "page": 1, 78 | "semantic_theme": "System Architecture", 79 | "source": "data/test.pdf" 80 | } 81 | }, 82 | { 83 | "content": "**III. HYSR TRAINING OF TABLE TENNIS PLAYING ROBOT**\n\nA team of researchers from the Max Planck Institute for Intelligent Systems is exploring the potential of reinforcement learning for teaching a robotic arm, actuated by a pneumatic artificial muscle (PAM), to play table tennis [4]. The scientists are using a hybrid simulation and real training (HYSR) technique that involves mirroring the movements of a real robotic arm with a Mujoco simulated arm. This approach allows the real robot to interact with a single or multiple simulated balls that are being replayed from recorded ball trajectories, facilitating practical long-term learning of table tennis². Additionally, virtual environments can be adapted for data-efficient training by, for instance, playing with multiple virtual balls [5].", 84 | "metadata": { 85 | "page": 1, 86 | "semantic_theme": "Application", 87 | "source": "data/test.pdf" 88 | } 89 | }, 90 | { 91 | "content": "**To set up the experiment, the researchers required:**\n\n* A real-time control process that sends pressure commands to the PAM controllers of the real robot at a fixed frequency of 500Hz.\n* A Mujoco simulated robot that mirrors the movements of the real robot and replays recorded ball trajectories. Each iteration of the Mujoco simulator takes 0.02 seconds.\n* A GYM reinforcement learning environment with a step function running at 100Hz.\n* Control of other hardware for real ball experiments, including a Vicon system for localizing the robot and table, a ball launcher, and an RGB-based ball detection system.", 92 | "metadata": { 93 | "page": 1, 94 | "semantic_theme": "Application", 95 | "source": "data/test.pdf" 96 | } 97 | }, 98 | { 99 | "content": "080 allowed to solve all the synchronization issues related to this setup. A backend process runs at 500Hz and controls the real pneumatic muscles while reading related robot states. A backend instance, running in bursting mode, is embedded in the Mujoco simulated environment. Frontends, connected to both backends, are embedded in the learning environment, asynchronously sending pressure actions and reading states to/from the real robot, sending mirroring states to the simulated robot, and sending bursting commands to the Mujoco simulated environment.\n\nIn addition, 080 simplified the process of spawning new processes that create additional frontends, which can easily access the shared memory history to log data, visualize the robot state in real-time, and monitor both the simulated and real robot state.\n\nThe code and documentation of this project are available as open source online [6].", 100 | "metadata": { 101 | "page": 1, 102 | "semantic_theme": "Application", 103 | "source": "data/test.pdf" 104 | } 105 | }, 106 | { 107 | "content": "**IV. CONCLUSION**\n\n080 is a versatile middleware system that offers flexible control of robotic systems. It allows expert users to develop a user-friendly Python API that makes it easier for machine learning scientists to use complex robotic setups. 
Its", 108 | "metadata": { 109 | "page": 1, 110 | "semantic_theme": "Conclusion", 111 | "source": "data/test.pdf" 112 | } 113 | }, 114 | { 115 | "content": "shared memory model, different synchronization modes, and interpolation-based command framework distinguish it from ROS. For more information and code examples, we refer to 080's comprehensive documentation [7].", 116 | "metadata": { 117 | "page": 2, 118 | "semantic_theme": "system_description", 119 | "source": "data/test.pdf" 120 | } 121 | }, 122 | { 123 | "content": "REFERENCES\n[1] V. Berenz, M. Naveau, F. Widmaier, M. Wüthrich, J.-C. Passy, S. Guist, and D. Büchler, \"The 080 c++ templated toolbox: Designing customized python apis for synchronizing realtime processes,\" Journal of Open Source Software, vol. 6, no. 66, p. 2752, 2021. [Online]. Available: https://doi.org/10.21105/joss.02752\n[2] Stanford Artificial Intelligence Laboratory et al., \"Robotic operating system.\" [Online]. Available: https://www.ros.org\n[3] M. Carroll, J. Perron, E. Marder-Eppstein, V. Pradeep, and M. Arguedas, \"actionlib,\" 2009. [Online]. Available: http://wiki.ros.org/actionlib\n[4] D. Büchler, S. Guist, R. Calandra, V. Berenz, B. Schölkopf, and J. Peters, \"Learning to play table tennis from scratch using muscular robots,\" IEEE Transactions on Robotics (T-RO), vol. 38, no. 6, pp. 3850-3860, 2022.\n[5] S. Guist, J. Schneider, A. Dittrich, V. Berenz, B. Schölkopf, and D. Büchler, \"Hindsight states: Blending sim and real task elements for efficient reinforcement learning,\" arXiv preprint arXiv:2303.02234, 2023.\n[6] V. Berenz, F. Widmaier, S. Guist, and D. Büchler, \"PAM robot software documentation,\" 2020. [Online]. Available: https://intelligent-soft-robots.github.io/pam_documentation/\n[7] V. Berenz, S. Guist, and D. Büchler, \"080 robot software documentation,\" 2020. [Online]. Available: http://people.tuebingen.mpg.de/mpi-is-software/o80/docs/o80/index.html", 124 | "metadata": { 125 | "page": 2, 126 | "semantic_theme": "references", 127 | "source": "data/test.pdf" 128 | } 129 | } 130 | ] -------------------------------------------------------------------------------- /examples/data/test_ocr_doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/smart-llm-loader/8ccb7c66a7944180a201f03b8e5549156ae10d08/examples/data/test_ocr_doc.pdf -------------------------------------------------------------------------------- /examples/data/test_ocr_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/smart-llm-loader/8ccb7c66a7944180a201f03b8e5549156ae10d08/examples/data/test_ocr_doc.png -------------------------------------------------------------------------------- /examples/data/test_ocr_doc_chunks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "content": "Invoice no: 27301261\nDate of issue: 10/09/2012\nSeller: Williams LLC\n72074 Taylor Plains Suite 342\nWest Alexandria, AR 97978\nTax Id: 922-88-2832\nIBAN: GB70FTNR64199348221780", 4 | "metadata": { 5 | "page": 0, 6 | "semantic_theme": "invoice_header", 7 | "source": "data/test_ocr_doc.pdf" 8 | } 9 | }, 10 | { 11 | "content": "Client: Hernandez-Anderson\n084 Carter Lane Apt. 846\nSouth Ronaldbury, AZ 91030\nTax Id: 959-74-5868", 12 | "metadata": { 13 | "page": 0, 14 | "semantic_theme": "invoice_client", 15 | "source": "data/test_ocr_doc.pdf" 16 | } 17 | }, 18 | { 19 | "content": "| No. 
| Description | Qty | UM | Net price | Net worth | VAT [%] | Gross worth |\n|---|---|---|---|---|---|---|---|\n| 1. | Lilly Pulitzer dress Size 2 | 5.00 | each | 45.00 | 225,00 | 10% | 247,50 |\n| 2. | New ERIN Erin Fertherston Straight Dress White Sequence Lining Sleeveless SZ 10 | 1,00 | each | 59,99 | 59,99 | 10% | 65,99 |\n| 3. | Sequence dress Size Small | 3,00 | each | 35,00 | 105,00 | 10% | 115,50 |\n| 4. | fire los angeles dress Medium | 3.00 | each | 6,50 | 19,50 | 10% | 21,45 |\n| 5. | Eileen Fisher Women's Long Sleeve Fleece Lined Front Pockets Dress XS Gray | 3,00 | each | 15,99 | 47,97 | 10% | 52,77 |\n| 6. | Lularoe Nicole Dress Size Small Light Solid Grey/White Ringer Tee Trim | 2,00 | each | 3,75 | 7,50 | 10% | 8,25 |\n| 7. | J.Crew Collection Black & White sweater Dress sz S | 1,00 | each | 30,00 | 30,00 | 10% | 33,00 |", 20 | "metadata": { 21 | "page": 0, 22 | "semantic_theme": "invoice_items", 23 | "source": "data/test_ocr_doc.pdf" 24 | } 25 | }, 26 | { 27 | "content": "| VAT [%] | Net worth | VAT | Gross worth |\n|---|---|---|---| \n| 10% | 494,96 | 49,50 | 544,46 |\n| Total | $ 494,96 | $ 49,50 | $ 544,46 |", 28 | "metadata": { 29 | "page": 0, 30 | "semantic_theme": "invoice_summary", 31 | "source": "data/test_ocr_doc.pdf" 32 | } 33 | } 34 | ] -------------------------------------------------------------------------------- /examples/ocr_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of different document loaders (smart-llm-loader and PyMuPDF) for RAG applications. 3 | """ 4 | import os 5 | from dotenv import load_dotenv 6 | 7 | from smart_llm_loader import SmartLLMLoader 8 | 9 | # Load environment variables 10 | load_dotenv() 11 | 12 | # OpenAI API key since we are using the gpt-4o-mini model for question-answering 13 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY" 14 | 15 | # Gemini API key since we are using the gemini flash model 16 | os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY" 17 | 18 | 19 | def process_with_llmloader(): 20 | """Process documents using SmartLLMLoader with Gemini Flash.""" 21 | 22 | # Initialize the loader from the smart-llm-loader package 23 | loader = SmartLLMLoader( 24 | file_path="./data/test_ocr_doc.pdf", 25 | chunk_strategy="contextual", 26 | model="gemini/gemini-1.5-flash", 27 | save_output=True, 28 | # output_dir="./data", 29 | ) 30 | 31 | docs = loader.load_and_split() 32 | return docs 33 | 34 | 35 | def process_with_pymupdf(): 36 | """Process documents using PyMuPDF loader.""" 37 | import json 38 | from langchain_community.document_loaders import PyMuPDFLoader 39 | 40 | # Initialize the PyMuPDF loader 41 | loader = PyMuPDFLoader("./data/test_ocr_doc.pdf") 42 | docs = loader.load() 43 | 44 | output_data = [] 45 | for doc in docs: 46 | output_data.append({ 47 | "page": doc.metadata["page"], 48 | "content": doc.page_content, 49 | "metadata": doc.metadata 50 | }) 51 | 52 | # Save as JSON 53 | output_path = "data/pymupdf_output.json" 54 | with open(output_path, "w", encoding="utf-8") as f: 55 | json.dump(output_data, f, indent=2, ensure_ascii=False) 56 | 57 | return docs 58 | 59 | 60 | def main(): 61 | results = process_with_llmloader() 62 | print(results) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /examples/rag_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example usage of different document loaders 
(smart-llm-loader and PyMuPDF) for RAG applications. 3 | """ 4 | import os 5 | from dotenv import load_dotenv 6 | from langchain.text_splitter import RecursiveCharacterTextSplitter 7 | from langchain_community.document_loaders import PyMuPDFLoader 8 | from langchain_core.output_parsers import StrOutputParser 9 | from langchain_core.prompts import PromptTemplate 10 | from langchain_core.runnables import RunnablePassthrough 11 | from langchain_openai import ChatOpenAI, OpenAIEmbeddings 12 | from langchain_community.vectorstores import FAISS 13 | 14 | from smart_llm_loader import SmartLLMLoader 15 | 16 | # Load environment variables 17 | load_dotenv() 18 | 19 | # OpenAI API key since we are using the gpt-4o-mini model for question-answering 20 | os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY" 21 | 22 | # Gemini API key since we are using the gemini flash model 23 | os.environ["GEMINI_API_KEY"] = "YOUR_GEMINI_API_KEY" 24 | 25 | 26 | def create_rag_chain(retriever, llm): 27 | """Create a RAG chain with the given retriever and LLM.""" 28 | prompt_template = PromptTemplate.from_template( 29 | """ 30 | You are an assistant for question-answering tasks. 31 | Use the following pieces of retrieved context to answer the question. 32 | If you don't know the answer, just say that you don't know. 33 | Use three sentences maximum and keep the answer concise 34 | Question: {question} 35 | Context: {context} 36 | Answer:""" 37 | ) 38 | 39 | def format_docs(docs): 40 | return "\n\n".join(doc.page_content for doc in docs) 41 | 42 | return ( 43 | {"context": retriever | format_docs, "question": RunnablePassthrough()} 44 | | prompt_template 45 | | llm 46 | | StrOutputParser() 47 | ) 48 | 49 | 50 | def process_with_llmloader(): 51 | """Process documents using SmartLLMLoader with Gemini Flash.""" 52 | llm = ChatOpenAI(model="gpt-4o-mini") 53 | 54 | # Initialize the loader from the smart-llm-loader package 55 | loader = SmartLLMLoader( 56 | file_path="./data/test_ocr_doc.pdf", 57 | chunk_strategy="contextual", 58 | model="gemini/gemini-1.5-flash", 59 | ) 60 | 61 | docs = loader.load_and_split() 62 | vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings()) 63 | rag_chain = create_rag_chain(vectorstore.as_retriever(), llm) 64 | return rag_chain 65 | 66 | 67 | def process_with_pymupdf(): 68 | """Process documents using PyMuPDF with recursive chunking.""" 69 | llm = ChatOpenAI(model="gpt-4o-mini") 70 | 71 | # Load document with PyMuPDF 72 | loader = PyMuPDFLoader("./data/test_ocr_doc.pdf") 73 | documents = loader.load() 74 | 75 | # Create text splitter for recursive chunking 76 | text_splitter = RecursiveCharacterTextSplitter( 77 | chunk_size=1000, 78 | chunk_overlap=200, 79 | length_function=len, 80 | is_separator_regex=False, 81 | ) 82 | 83 | docs = text_splitter.split_documents(documents) 84 | vectorstore = FAISS.from_documents(documents=docs, embedding=OpenAIEmbeddings()) 85 | rag_chain = create_rag_chain(vectorstore.as_retriever(), llm) 86 | return rag_chain 87 | 88 | 89 | def main(): 90 | # Example using LLMLoader 91 | print("\n=== Using LLMLoader ===") 92 | llm_chain = process_with_llmloader() 93 | question = "What is the total gross worth for item 1 and item 7?" 
94 | answer = llm_chain.invoke(question) 95 | print(f"Question: {question}") 96 | print(f"Answer: {answer}") 97 | 98 | # Example using PyMuPDF 99 | print("\n=== Using PyMuPDF ===") 100 | pymupdf_chain = process_with_pymupdf() 101 | answer = pymupdf_chain.invoke(question) 102 | print(f"Question: {question}") 103 | print(f"Answer: {answer}") 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "smart-llm-loader" 3 | version = "0.1.1" 4 | description = "A powerful PDF processing toolkit that seamlessly integrates with LLMs for intelligent document chunking and RAG applications. Features smart context-aware segmentation, multi-LLM support, and optimized content extraction for enhanced RAG performance." 5 | authors = ["drmingler "] 6 | readme = "README.md" 7 | packages = [{include = "smart_llm_loader"}] 8 | license = "MIT" 9 | repository = "https://github.com/drmingler/smart-llm-loader" 10 | keywords = ["pdf", "llm", "rag", "document-processing", "ai"] 11 | classifiers = [ 12 | "Development Status :: 4 - Beta", 13 | "Intended Audience :: Developers", 14 | "License :: OSI Approved :: MIT License", 15 | "Programming Language :: Python :: 3", 16 | "Programming Language :: Python :: 3.12", 17 | "Topic :: Software Development :: Libraries :: Python Modules", 18 | "Topic :: Text Processing :: General" 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.9" 23 | langchain = "^0.1.0" 24 | langchain-community = "^0.0.10" 25 | langchain-core = "^0.1.10" 26 | requests = "^2.31.0" 27 | python-dotenv = "^1.0.0" 28 | pypdf = "^3.17.1" 29 | faiss-cpu = "^1.7.4" 30 | tiktoken = "^0.8.0" 31 | litellm = "^1.61.3" 32 | pdf2image = "^1.17.0" 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | pytest = "^7.4.0" 36 | pytest-asyncio = "^0.23.0" 37 | pytest-cov = "^4.1.0" 38 | pytest-mock = "^3.12.0" 39 | 40 | [build-system] 41 | requires = ["poetry-core"] 42 | build-backend = "poetry.core.masonry.api" 43 | 44 | [tool.pytest.ini_options] 45 | testpaths = ["tests"] 46 | python_files = ["test_*.py"] 47 | addopts = "-v --cov=smart_llm_loader --cov-report=term-missing" 48 | asyncio_mode = "auto" -------------------------------------------------------------------------------- /smart_llm_loader/__init__.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import platform 4 | from pathlib import Path 5 | 6 | def _check_poppler_installation(): 7 | """Check if poppler is installed and provide installation instructions if it's not.""" 8 | system = platform.system().lower() 9 | 10 | try: 11 | if system == "darwin": # macOS 12 | subprocess.run(["pdftoppm", "-v"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) 13 | elif system == "linux": 14 | subprocess.run(["pdftoppm", "-v"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) 15 | elif system == "windows": 16 | # On Windows, check if poppler is in PATH or in common installation locations 17 | poppler_path = None 18 | for path in sys.path: 19 | if Path(path).joinpath("poppler/bin").exists(): 20 | poppler_path = path 21 | break 22 | if not poppler_path: 23 | raise FileNotFoundError 24 | except (subprocess.SubprocessError, FileNotFoundError): 25 | instructions = { 26 | "darwin": "Install poppler using: brew install poppler", 27 | "linux": "Install poppler using: 
sudo apt-get install poppler-utils (Ubuntu/Debian) or sudo yum install poppler-utils (CentOS/RHEL)",
28 |         "windows": "Download and install poppler from: https://github.com/oschwartz10612/poppler-windows/releases/ and add it to your PATH"
29 |     }
30 | 
31 |     print("\n⚠️ WARNING: Required system dependency 'poppler' not found!")
32 |     print("This package requires poppler for PDF processing.")
33 |     print("\nTo install poppler on your system:")
34 |     print(instructions.get(system, 'Please install poppler for your operating system'))
35 |     print("\nContinuing without PDF processing capability...\n")
36 | 
37 | # Run the check when the package is imported
38 | _check_poppler_installation()
39 | 
40 | # Import main package components
41 | from .document_loader import SmartLLMLoader
42 | 
43 | __version__ = "0.1.1"
44 | 
--------------------------------------------------------------------------------
/smart_llm_loader/document_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | Document loader module for handling different types of inputs (files and URLs).
3 | """
4 | import os
5 | from pathlib import Path
6 | import tempfile
7 | from typing import AsyncIterator, List, Optional, Iterator, Tuple, Union
8 | from langchain_community.document_loaders.base import BaseLoader
9 | from langchain_core.documents import Document
10 | import requests
11 | 
12 | from smart_llm_loader.llm import ImageProcessor, LLMProcessing
13 | from smart_llm_loader.utils import copy_file, save_output_file, is_pdf
14 | 
15 | 
16 | class SmartLLMLoader(BaseLoader):
17 |     """A flexible document loader that supports multiple input types."""
18 | 
19 |     def __init__(
20 |         self,
21 |         file_path: Optional[Union[str, Path]] = None,
22 |         url: Optional[str] = None,
23 |         chunk_strategy: str = 'contextual',
24 |         custom_prompt: Optional[str] = None,
25 |         model: str = "gemini/gemini-2.0-flash",
26 |         save_output: bool = False,
27 |         output_dir: Optional[Union[str, Path]] = None,
28 |         **kwargs,
29 |     ):
30 |         """Initialize the SmartLLMLoader with a file path or URL.
31 | 
32 |         Args:
33 |             file_path: Path to the file to load
34 |             url: URL to load the document from
35 |             chunk_strategy: Strategy to use for chunking the document: page, contextual or custom
36 |             custom_prompt: Custom prompt to use for chunking the document, this will override the default prompt
37 |             model: LLM model to use (any multi-modal model supported by litellm)
38 |             save_output: Whether to save the output files
39 |             output_dir: Directory to save output files (if save_output is True)
40 |             **kwargs: Additional arguments that will be passed to the litellm.completion method. 
41 |                 Refer: https://docs.litellm.ai/docs/completion/input and https://docs.litellm.ai/docs/providers
42 |         """
43 |         self.chunk_strategy = chunk_strategy
44 |         self.custom_prompt = custom_prompt
45 |         self.llm_processor = LLMProcessing(model=model, **kwargs)
46 | 
47 |         if file_path and url:
48 |             raise ValueError("Only one of file_path or url should be provided.")
49 | 
50 |         if not file_path and not url:
51 |             raise ValueError("Either file_path or url must be provided.")
52 | 
53 |         self.file_path, self.output_dir = (
54 |             self._load_from_path(file_path, save_output, output_dir)
55 |             if file_path
56 |             else self._load_from_url(url, save_output, output_dir)
57 |         )
58 | 
59 |     @staticmethod
60 |     def _load_from_path(
61 |         file_path: Union[str, Path], save_output: bool = False, output_dir: Optional[Union[str, Path]] = None
62 |     ) -> Tuple[Path, Optional[Path]]:
63 |         """Load documents from a file path."""
64 |         file_path = Path(file_path)
65 |         if not file_path.exists():
66 |             raise FileNotFoundError(f"File not found: {file_path}")
67 | 
68 |         if save_output or output_dir:
69 |             output_dir = Path(output_dir) if output_dir else Path(f"{os.getcwd()}/{file_path.stem}")
70 |             output_dir.mkdir(parents=True, exist_ok=True)
71 |             output_file = output_dir / file_path.name
72 |             copy_file(file_path, output_file)
73 | 
74 |         return file_path, output_dir
75 | 
76 |     @staticmethod
77 |     def _load_from_url(
78 |         url: str, save_output: bool = False, output_dir: Optional[Union[str, Path]] = None
79 |     ) -> Tuple[Path, Optional[Path]]:
80 |         """Load documents from a URL."""
81 |         response = requests.get(url)
82 |         response.raise_for_status()
83 |         is_link_to_pdf = is_pdf(url, response)
84 | 
85 |         if is_link_to_pdf:
86 |             with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
87 |                 temp_path = Path(temp_file.name)
88 |                 temp_file.write(response.content)
89 | 
90 |             if save_output or output_dir:
91 |                 url_filename = url.split('/')[-1] or 'output'
92 |                 url_filename = url_filename if ".pdf" in url_filename else url_filename + ".pdf"
93 |                 output_dir = Path(output_dir) if output_dir else Path(f"{os.getcwd()}/{Path(url_filename).stem}")
94 |                 output_dir.mkdir(parents=True, exist_ok=True)
95 |                 output_file = output_dir / url_filename
96 |                 copy_file(temp_path, output_file)
97 | 
98 |             return temp_path, output_dir
99 |         else:
100 |             raise ValueError("The URL does not point to a PDF file.")
101 | 
102 |     async def aload(self) -> list[Document]:
103 |         """Asynchronously load documents using LLM-based OCR with page-level chunking."""
104 |         return await self.llm_processor.async_process_document_with_llm(
105 |             self.file_path, chunk_strategy="page", output_dir=self.output_dir
106 |         )
107 | 
108 |     def load(self) -> List[Document]:
109 |         """Load documents from either a file path or URL.
110 | 
111 |         Processes the document using LLM-based OCR with basic page-level chunking.
112 | 
113 |         Returns:
114 |             List[Document]: List of processed document chunks
115 |         """
116 |         documents = self.llm_processor.process_document_with_llm(
117 |             self.file_path, chunk_strategy="page", output_dir=self.output_dir
118 |         )
119 |         return documents
120 | 
121 |     def load_and_split(self, text_splitter=None) -> List[Document]:
122 |         """Load Documents and split into chunks using LLM-based OCR processing. 
123 | 124 | Args: 125 | text_splitter: Optional text splitter (not used in current implementation) 126 | 127 | Returns: 128 | List[Document]: List of processed and chunked documents based on specified strategy 129 | """ 130 | documents = self.llm_processor.process_document_with_llm( 131 | self.file_path, self.chunk_strategy, self.custom_prompt, output_dir=self.output_dir 132 | ) 133 | return documents 134 | 135 | def _create_document(self, chunk: dict, page_num: int) -> Document: 136 | """Helper method to create a Document object from a chunk.""" 137 | return Document( 138 | page_content=chunk['content'], 139 | metadata={ 140 | 'page': page_num, 141 | 'semantic_theme': chunk.get('theme'), 142 | 'source': self.file_path, 143 | }, 144 | ) 145 | 146 | def lazy_load(self) -> Iterator[Document]: 147 | """Load Documents lazily, processing and yielding one page at a time. 148 | 149 | Yields: 150 | Document: Processed document chunks one at a time to conserve memory 151 | """ 152 | images = ImageProcessor.pdf_to_images(self.file_path) 153 | prompt = self.llm_processor.get_chunk_prompt('page') 154 | 155 | documents = [] 156 | for page_num, image in enumerate(images): 157 | result = self.llm_processor.process_image_with_llm(image, prompt) 158 | for chunk in result['markdown_chunks']: 159 | if chunk.get('content') is None: 160 | continue 161 | doc = self._create_document(chunk, page_num) 162 | documents.append(doc) 163 | yield doc 164 | 165 | save_output_file(documents, self.output_dir) 166 | 167 | async def alazy_load(self) -> AsyncIterator[Document]: 168 | """Load Documents lazily and asynchronously, processing and yielding one page at a time. 169 | 170 | Yields: 171 | Document: Processed document chunks one at a time asynchronously 172 | """ 173 | images = ImageProcessor.pdf_to_images(self.file_path) 174 | prompt = self.llm_processor.get_chunk_prompt('page') 175 | 176 | documents = [] 177 | for page_num, image in enumerate(images): 178 | result = await self.llm_processor.async_process_image_with_llm(image, prompt) 179 | for chunk in result['markdown_chunks']: 180 | if chunk.get('content') is None: 181 | continue 182 | doc = self._create_document(chunk, page_num) 183 | documents.append(doc) 184 | yield doc 185 | 186 | save_output_file(documents, self.output_dir) 187 | -------------------------------------------------------------------------------- /smart_llm_loader/llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from pathlib import Path 3 | from typing import List, Optional, Union 4 | from base64 import b64encode 5 | import io 6 | from multiprocessing import cpu_count 7 | 8 | from PIL.Image import Image 9 | from langchain_core.documents import Document 10 | from pdf2image import convert_from_path 11 | from litellm import completion, validate_environment, supports_vision, check_valid_key, acompletion 12 | 13 | from smart_llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT 14 | from smart_llm_loader.schema import OCRResponse 15 | from smart_llm_loader.utils import save_output_file 16 | 17 | 18 | class ImageProcessor: 19 | @staticmethod 20 | def pdf_to_images(file_path: Optional[Union[str, Path]] = None) -> list[Image]: 21 | """Convert PDF pages to images all at once for better performance. 
22 | 23 | Args: 24 | file_path: Path to the PDF file to convert 25 | 26 | Returns: 27 | list[Image]: List of PIL Image objects, one per PDF page 28 | """ 29 | images = convert_from_path( 30 | file_path, 31 | dpi=300, 32 | fmt='PNG', 33 | size=(None, 1056), 34 | thread_count=cpu_count(), 35 | use_pdftocairo=True, 36 | ) 37 | return images 38 | 39 | @staticmethod 40 | def image_to_base64(image: Image) -> str: 41 | """Convert an image to a base64 string. 42 | 43 | Args: 44 | image: PIL Image object to convert 45 | 46 | Returns: 47 | str: Base64 encoded string representation of the image 48 | """ 49 | img_byte_arr = io.BytesIO() 50 | image.save(img_byte_arr, format='PNG') 51 | img_bytes = img_byte_arr.getvalue() 52 | return b64encode(img_bytes).decode('utf-8') 53 | 54 | 55 | class LLMProcessing: 56 | def __init__(self, model: str = "gemini/gemini-2.0-flash", **kwargs): 57 | self._validate_model(model, **kwargs) 58 | self.model = model 59 | self.kwargs = kwargs 60 | 61 | @staticmethod 62 | def _validate_model(model: str, **kwargs) -> None: 63 | """Validate that the model is properly configured for vision tasks.""" 64 | environment = validate_environment(model=model) 65 | api_key = kwargs.get("api_key") 66 | 67 | if not environment["keys_in_environment"] and not api_key: 68 | raise ValueError(f"Missing environment variables for {model}: {environment}") 69 | 70 | if not supports_vision(model=model): 71 | raise ValueError(f"Model '{model}' is not a supported vision model.") 72 | 73 | if not check_valid_key(model=model, api_key=api_key): 74 | raise ValueError(f"Failed to access model '{model}'. Please check your API key and model availability.") 75 | 76 | @staticmethod 77 | def get_chunk_prompt(strategy: str, custom_prompt: Optional[str] = None) -> str: 78 | if strategy == 'custom' and not custom_prompt: 79 | raise ValueError("Custom prompt is not provided. 
A custom prompt is required for 'custom' strategy.") 80 | 81 | if custom_prompt: 82 | return custom_prompt 83 | 84 | elif strategy == 'page': 85 | return DEFAULT_PAGE_CHUNK_PROMPT 86 | 87 | elif strategy == 'contextual': 88 | return DEFAULT_CHUNK_PROMPT 89 | 90 | else: 91 | raise ValueError(f"Invalid chunk strategy: {strategy}, must be one of 'page', 'contextual' or 'custom'") 92 | 93 | @staticmethod 94 | def prepare_llm_messages(page_as_image: Image, prompt: str) -> List[dict]: 95 | base64_image = ImageProcessor.image_to_base64(page_as_image) 96 | messages = [ 97 | {"role": "system", "content": prompt}, 98 | { 99 | "role": "user", 100 | "content": [ 101 | {"type": "text", "text": "Process this image:"}, 102 | {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}, 103 | ], 104 | }, 105 | ] 106 | return messages 107 | 108 | @staticmethod 109 | def serialize_response(results: List[dict], file_path: Optional[Union[str, Path]] = None) -> List[Document]: 110 | documents = [] 111 | for page_num, result in enumerate(results): 112 | for chunk in result['markdown_chunks']: 113 | if chunk.get('theme') is None and chunk.get('content') is None: 114 | continue 115 | 116 | doc = Document( 117 | page_content=chunk['content'], 118 | metadata={ 119 | 'page': page_num, 120 | 'semantic_theme': chunk.get('theme'), 121 | 'source': file_path, 122 | }, 123 | ) 124 | documents.append(doc) 125 | 126 | return documents 127 | 128 | def process_document_with_llm( 129 | self, 130 | file_path: Optional[Union[str, Path]] = None, 131 | chunk_strategy: str = 'page', 132 | custom_prompt: Optional[str] = None, 133 | output_dir: Optional[Union[str, Path]] = None, 134 | ) -> List[Document]: 135 | """Process a document with LLM for OCR and chunking. 136 | 137 | Args: 138 | file_path: Path to the document to process 139 | chunk_strategy: Strategy for chunking ('page', 'contextual', or 'custom') 140 | custom_prompt: Custom prompt to use for chunking 141 | output_dir: Directory to save processed output 142 | 143 | Returns: 144 | List[Document]: List of processed document chunks with metadata 145 | """ 146 | 147 | async def process_pdf(): 148 | images = ImageProcessor.pdf_to_images(file_path) 149 | prompt = self.get_chunk_prompt(chunk_strategy, custom_prompt) 150 | return await asyncio.gather(*[self.async_process_image_with_llm(img, prompt) for img in images]) 151 | 152 | results = asyncio.run(process_pdf()) 153 | documents = self.serialize_response(list(results), file_path) 154 | save_output_file(documents, output_dir) 155 | return documents 156 | 157 | async def async_process_document_with_llm( 158 | self, 159 | file_path: Optional[Union[str, Path]] = None, 160 | chunk_strategy: str = 'page', 161 | custom_prompt: Optional[str] = None, 162 | output_dir: Optional[Union[str, Path]] = None, 163 | ) -> List[Document]: 164 | """Process a document with LLM for OCR and chunking asynchronously.""" 165 | images = ImageProcessor.pdf_to_images(file_path) 166 | prompt = self.get_chunk_prompt(chunk_strategy, custom_prompt) 167 | results = list(await asyncio.gather(*[self.async_process_image_with_llm(img, prompt) for img in images])) 168 | documents = self.serialize_response(list(results), file_path) 169 | save_output_file(documents, output_dir) 170 | return documents 171 | 172 | async def async_process_image_with_llm(self, page_as_image: Image, prompt: str) -> dict: 173 | """Convert image to base64 and chunk the image with LLM asynchronously. 
174 | 
175 |         Args:
176 |             page_as_image: PIL Image object to process
177 |             prompt: Prompt to use for LLM processing
178 | 
179 |         Returns:
180 |             dict: Processed chunks with content and metadata
181 |         """
182 |         messages = self.prepare_llm_messages(page_as_image, prompt)
183 |         try:
184 |             response = await acompletion(
185 |                 model=self.model,
186 |                 messages=messages,
187 |                 response_format=OCRResponse,
188 |                 **self.kwargs,
189 |             )
190 | 
191 |             result = response.choices[0].message.content
192 |             _response = OCRResponse.model_validate_json(result)
193 |             return _response.model_dump()
194 | 
195 |         except Exception as e:
196 |             print(f"Error in LLM processing: {e}")
197 |             return {"markdown_chunks": [{"content": None, "theme": None}]}  # fallback chunk; skipped at serialization
198 | 
199 |     def process_image_with_llm(self, page_as_image: Image, prompt: str) -> dict:
200 |         """Convert image to base64 and chunk the image with LLM."""
201 |         messages = self.prepare_llm_messages(page_as_image, prompt)
202 |         try:
203 |             response = completion(
204 |                 model=self.model,
205 |                 messages=messages,
206 |                 response_format=OCRResponse,
207 |                 **self.kwargs,
208 |             )
209 | 
210 |             result = response.choices[0].message.content
211 |             _response = OCRResponse.model_validate_json(result)
212 |             return _response.model_dump()
213 | 
214 |         except Exception as e:
215 |             print(f"Error in LLM processing: {e}")
216 |             return {"markdown_chunks": [{"content": None, "theme": None}]}  # fallback chunk; skipped at serialization
--------------------------------------------------------------------------------
/smart_llm_loader/prompts.py:
--------------------------------------------------------------------------------
1 | DEFAULT_CHUNK_PROMPT = """OCR the following page into Markdown format.
2 | - All tables, charts, and other visual elements must be formatted in Markdown.
3 | - Do not exclude any content from the page.
4 | - Chunk the page into sections with the same semantic theme.
5 | - Our goal is to identify parts of the page with the same semantic theme. These chunks will
6 | be embedded and used in a RAG pipeline.
7 | - All chunks must be in Markdown format.
8 | 
9 | Images in the document should be properly described in detail such that an LLM can understand the
10 | image and answer questions about the image without seeing the image.
11 | The image description should be returned as a chunk too.
12 | """
13 | 
14 | DEFAULT_PAGE_CHUNK_PROMPT = """OCR the following page into Markdown format.
15 | - All tables must be formatted in Markdown.
16 | - The contents of the page should be returned as a single chunk.
17 | - Do not exclude any content from the page.
18 | - Also return the semantic theme of the page.
19 | 
20 | Images in the document should be properly described in detail such that an LLM can understand the image and answer
21 | questions about the image without seeing the image.
22 | The description should be returned as a part of the page content.
23 | """ -------------------------------------------------------------------------------- /smart_llm_loader/schema.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List 2 | from pydantic import BaseModel 3 | 4 | 5 | class Chunk(BaseModel): 6 | content: str 7 | theme: Optional[str] = None 8 | 9 | 10 | class OCRResponse(BaseModel): 11 | markdown_chunks: List[Chunk] 12 | -------------------------------------------------------------------------------- /smart_llm_loader/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import shutil 3 | from typing import List 4 | 5 | import requests 6 | from langchain_core.documents import Document 7 | import json 8 | 9 | 10 | def is_pdf(url: str, response: requests.Response) -> bool: 11 | """Check if the URL points to a PDF file.""" 12 | return url.lower().endswith('.pdf') or response.headers.get('Content-Type', '').lower() in [ 13 | 'application/pdf', 14 | 'binary/octet-stream', 15 | ] 16 | 17 | 18 | def save_output_file(documents: List[Document], output_dir: Path) -> None: 19 | """Save the chunks and input file to a folder.""" 20 | if not output_dir or not documents: 21 | return 22 | 23 | output_dir.mkdir(exist_ok=True) 24 | chunks_data = [ 25 | { 26 | "content": doc.page_content, 27 | "metadata": {**doc.metadata, "source": str(doc.metadata["source"]) if "source" in doc.metadata else None}, 28 | } 29 | for doc in documents 30 | ] 31 | 32 | identifier = documents[0].metadata.get("source") or output_dir.stem 33 | identifier = Path(identifier).name.rsplit('.', 1)[0] 34 | 35 | chunks_file = output_dir / f"{identifier}_chunks.json" 36 | with open(chunks_file, "w", encoding="utf-8") as f: 37 | json.dump(chunks_data, f, indent=2, ensure_ascii=False) 38 | 39 | 40 | def copy_file(file_path: Path, output_file: Path) -> None: 41 | """Copy the file to the output directory.""" 42 | try: 43 | shutil.copy2(file_path, output_file) 44 | except shutil.SameFileError: 45 | pass 46 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from PIL import Image 3 | 4 | 5 | @pytest.fixture(autouse=True) 6 | def mock_env_vars(monkeypatch): 7 | """Mock environment variables needed for testing.""" 8 | monkeypatch.setenv("GEMINI_API_KEY", "test_api_key") 9 | 10 | 11 | @pytest.fixture 12 | def test_dir(tmp_path): 13 | """Create a temporary directory for test files.""" 14 | return tmp_path 15 | 16 | 17 | @pytest.fixture 18 | def sample_pdf_path(test_dir): 19 | """Create a sample PDF file for testing.""" 20 | pdf_path = test_dir / "test.pdf" 21 | pdf_path.write_bytes(b"%PDF-1.4\n%EOF") # Minimal valid PDF 22 | return pdf_path 23 | 24 | 25 | @pytest.fixture 26 | def sample_image(): 27 | """Create a sample image for testing.""" 28 | return Image.new('RGB', (100, 100), color='white') 29 | 30 | 31 | @pytest.fixture 32 | def output_dir(test_dir): 33 | """Create an output directory for test results.""" 34 | output_path = test_dir / "output" 35 | output_path.mkdir(exist_ok=True) 36 | return output_path 37 | -------------------------------------------------------------------------------- /tests/test_document_loader.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pytest 3 | import tempfile 4 | from unittest.mock import Mock 5 | from 
6 | 
7 | from smart_llm_loader.document_loader import SmartLLMLoader
8 | 
9 | 
10 | @pytest.fixture(autouse=True)
11 | def mock_llm_validation(mocker):
12 |     """Mock LLM validation for all tests."""
13 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
14 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
15 |     mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
16 | 
17 | 
18 | @pytest.fixture
19 | def sample_pdf_path(tmp_path):
20 |     pdf_path = tmp_path / "test.pdf"
21 |     pdf_path.write_bytes(b"%PDF-1.4\n%EOF")  # Minimal valid PDF
22 |     return pdf_path
23 | 
24 | 
25 | @pytest.fixture
26 | def mock_response():
27 |     mock = Mock()
28 |     mock.content = b"%PDF-1.4\n%EOF"  # Minimal valid PDF content
29 |     mock.headers = {"content-type": "application/pdf"}
30 |     return mock
31 | 
32 | 
33 | def test_init_with_file_path(sample_pdf_path):
34 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
35 |     assert str(loader.file_path) == str(sample_pdf_path)
36 |     assert loader.output_dir is None
37 | 
38 | 
39 | def test_init_with_url(mocker, mock_response):
40 |     url = "http://example.com/test.pdf"
41 |     mocker.patch('requests.get', return_value=mock_response)
42 | 
43 |     with tempfile.NamedTemporaryFile(suffix='.pdf') as temp_file:
44 |         mocker.patch('tempfile.NamedTemporaryFile', return_value=temp_file)
45 |         loader = SmartLLMLoader(url=url)
46 |         assert isinstance(loader.file_path, Path)
47 | 
48 | 
49 | def test_init_with_both_file_and_url(sample_pdf_path):
50 |     with pytest.raises(ValueError, match=r"Only one of file_path or url should be provided\."):
51 |         SmartLLMLoader(file_path=sample_pdf_path, url="http://example.com/test.pdf")
52 | 
53 | 
54 | def test_init_with_neither_file_nor_url():
55 |     with pytest.raises(ValueError, match=r"Either file_path or url must be provided\."):
56 |         SmartLLMLoader()
57 | 
58 | 
59 | def test_load_from_path_with_output_dir(sample_pdf_path, tmp_path):
60 |     output_dir = tmp_path / "output"
61 |     loader = SmartLLMLoader(file_path=sample_pdf_path, save_output=True, output_dir=output_dir)
62 | 
63 |     assert loader.output_dir == output_dir
64 |     assert (output_dir / sample_pdf_path.name).exists()
65 | 
66 | 
67 | def test_load_from_url_invalid_content(mocker):
68 |     url = "http://example.com/test.txt"
69 |     mock_resp = Mock()
70 |     mock_resp.content = b"Not a PDF"
71 |     mock_resp.headers = {"content-type": "text/plain"}
72 |     mocker.patch('requests.get', return_value=mock_resp)
73 | 
74 |     with pytest.raises(ValueError, match=r"The URL does not point to a PDF file\."):
75 |         SmartLLMLoader(url=url)
76 | 
77 | 
78 | def test_load_method(mocker, sample_pdf_path):
79 |     mock_documents = [Document(page_content="Test content")]
80 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)
81 | 
82 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
83 |     documents = loader.load()
84 | 
85 |     assert len(documents) == 1
86 |     assert documents[0].page_content == "Test content"
87 | 
88 | 
89 | @pytest.mark.asyncio
90 | async def test_aload_method(mocker, sample_pdf_path):
91 |     mock_documents = [Document(page_content="Test content")]
92 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.async_process_document_with_llm', return_value=mock_documents)
93 | 
94 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
95 |     documents = await loader.aload()
96 | 
97 |     assert len(documents) == 1
98 |     assert documents[0].page_content == "Test content"
99 | 
100 | 
101 | def test_load_and_split_method(mocker, sample_pdf_path):
102 |     mock_documents = [Document(page_content="Test content")]
103 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.process_document_with_llm', return_value=mock_documents)
104 | 
105 |     loader = SmartLLMLoader(file_path=sample_pdf_path, chunk_strategy="contextual")
106 |     documents = loader.load_and_split()
107 | 
108 |     assert len(documents) == 1
109 |     assert documents[0].page_content == "Test content"
110 | 
111 | 
112 | def test_create_document(sample_pdf_path):
113 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
114 |     chunk = {"content": "Test content", "theme": "Test theme"}
115 |     page_num = 1
116 | 
117 |     doc = loader._create_document(chunk, page_num)
118 | 
119 |     assert isinstance(doc, Document)
120 |     assert doc.page_content == "Test content"
121 |     assert doc.metadata["page"] == page_num
122 |     assert doc.metadata["semantic_theme"] == "Test theme"
123 |     assert doc.metadata["source"] == loader.file_path
124 | 
125 | 
126 | def test_lazy_load(mocker, sample_pdf_path):
127 |     # Mock the necessary components
128 |     mock_images = [Mock()]
129 |     mock_result = {"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}
130 | 
131 |     mocker.patch('smart_llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
132 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.process_image_with_llm', return_value=mock_result)
133 | 
134 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
135 |     documents = list(loader.lazy_load())
136 | 
137 |     assert len(documents) == 1
138 |     assert documents[0].page_content == "Test content"
139 |     assert documents[0].metadata["semantic_theme"] == "Test theme"
140 | 
141 | 
142 | @pytest.mark.asyncio
143 | async def test_alazy_load(mocker, sample_pdf_path):
144 |     # Mock the necessary components
145 |     mock_images = [Mock()]
146 |     mock_result = {"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}
147 | 
148 |     mocker.patch('smart_llm_loader.llm.ImageProcessor.pdf_to_images', return_value=mock_images)
149 |     mocker.patch('smart_llm_loader.llm.LLMProcessing.async_process_image_with_llm', return_value=mock_result)
150 | 
151 |     loader = SmartLLMLoader(file_path=sample_pdf_path)
152 |     documents = [doc async for doc in loader.alazy_load()]
153 | 
154 |     assert len(documents) == 1
155 |     assert documents[0].page_content == "Test content"
156 |     assert documents[0].metadata["semantic_theme"] == "Test theme"
157 | 
--------------------------------------------------------------------------------
/tests/test_image_processor.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from PIL import Image
3 | import io
4 | import base64
5 | 
6 | from smart_llm_loader.llm import ImageProcessor
7 | 
8 | 
9 | @pytest.fixture
10 | def sample_pdf_path(tmp_path):
11 |     # Create a dummy PDF file for testing
12 |     pdf_path = tmp_path / "test.pdf"
13 |     pdf_path.write_bytes(b"%PDF-1.4\n%EOF")  # Minimal valid PDF
14 |     return pdf_path
15 | 
16 | 
17 | @pytest.fixture
18 | def sample_image():
19 |     # Create a simple test image
20 |     img = Image.new('RGB', (100, 100), color='white')
21 |     return img
22 | 
23 | 
24 | def test_pdf_to_images(sample_pdf_path, mocker):
25 |     # Mock pdf2image.convert_from_path
26 |     mock_images = [Image.new('RGB', (100, 100)) for _ in range(2)]
27 |     mocker.patch('smart_llm_loader.llm.convert_from_path', return_value=mock_images)
28 | 
29 |     images = ImageProcessor.pdf_to_images(sample_pdf_path)
30 | 
31 |     assert len(images) == 2
32 |     assert all(isinstance(img, Image.Image) for img in images)
33 | 
34 | 
35 | def test_image_to_base64(sample_image):
36 |     base64_str = ImageProcessor.image_to_base64(sample_image)
37 | 
38 |     # Verify it's a valid base64 string
39 |     assert isinstance(base64_str, str)
40 | 
41 |     # Verify we can decode it back to an image
42 |     try:
43 |         decoded = base64.b64decode(base64_str)
44 |         Image.open(io.BytesIO(decoded))
45 |     except Exception as e:
46 |         pytest.fail(f"Failed to decode base64 string: {e}")
47 | 
48 | 
49 | def test_pdf_to_images_file_not_found():
50 |     with pytest.raises(Exception):
51 |         ImageProcessor.pdf_to_images("nonexistent.pdf")
52 | 
--------------------------------------------------------------------------------
/tests/test_llm_processing.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from PIL import Image
3 | from langchain_core.documents import Document
4 | from unittest.mock import Mock
5 | 
6 | from smart_llm_loader.llm import LLMProcessing
7 | from smart_llm_loader.prompts import DEFAULT_PAGE_CHUNK_PROMPT, DEFAULT_CHUNK_PROMPT
8 | 
9 | 
10 | @pytest.fixture
11 | def llm_processor(mocker):
12 |     # Mock all validation functions
13 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
14 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
15 |     mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
16 |     return LLMProcessing(model="gemini/gemini-2.0-flash")
17 | 
18 | 
19 | @pytest.fixture
20 | def sample_image():
21 |     return Image.new('RGB', (100, 100), color='white')
22 | 
23 | 
24 | def test_validate_model_valid(mocker):
25 |     # Mock the validation functions
26 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
27 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=True)
28 |     mocker.patch('smart_llm_loader.llm.check_valid_key', return_value=True)
29 | 
30 |     # Should not raise any exceptions
31 |     LLMProcessing(model="gemini/gemini-2.0-flash")
32 | 
33 | 
34 | def test_validate_model_missing_env_vars(mocker):
35 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": False})
36 | 
37 |     with pytest.raises(ValueError, match="Missing environment variables"):
38 |         LLMProcessing(model="gemini/gemini-2.0-flash")
39 | 
40 | 
41 | def test_validate_model_unsupported_vision(mocker):
42 |     mocker.patch('smart_llm_loader.llm.validate_environment', return_value={"keys_in_environment": True})
43 |     mocker.patch('smart_llm_loader.llm.supports_vision', return_value=False)
44 | 
45 |     with pytest.raises(ValueError, match="not a supported vision model"):
46 |         LLMProcessing(model="unsupported-model")
47 | 
48 | 
49 | def test_get_chunk_prompt_default_page():
50 |     prompt = LLMProcessing.get_chunk_prompt('page')
51 |     assert prompt == DEFAULT_PAGE_CHUNK_PROMPT
52 | 
53 | 
54 | def test_get_chunk_prompt_default_contextual():
55 |     prompt = LLMProcessing.get_chunk_prompt('contextual')
56 |     assert prompt == DEFAULT_CHUNK_PROMPT
57 | 
58 | 
59 | def test_get_chunk_prompt_custom():
60 |     custom_prompt = "Custom test prompt"
61 |     prompt = LLMProcessing.get_chunk_prompt('custom', custom_prompt)
62 |     assert prompt == custom_prompt
63 | 
64 | 
65 | def test_get_chunk_prompt_custom_missing():
66 |     with pytest.raises(ValueError, match="Custom prompt is not provided"):
67 |         LLMProcessing.get_chunk_prompt('custom')
68 | 
69 | 
70 | def test_get_chunk_prompt_invalid_strategy():
71 |     with pytest.raises(ValueError, match="Invalid chunk strategy"):
72 |         LLMProcessing.get_chunk_prompt('invalid')
73 | 
74 | 
75 | def test_prepare_llm_messages(sample_image):
76 |     prompt = "Test prompt"
77 |     messages = LLMProcessing.prepare_llm_messages(sample_image, prompt)
78 | 
79 |     assert len(messages) == 2
80 |     assert messages[0]["role"] == "system"
81 |     assert messages[0]["content"] == prompt
82 |     assert messages[1]["role"] == "user"
83 |     assert len(messages[1]["content"]) == 2
84 |     assert messages[1]["content"][0]["type"] == "text"
85 |     assert messages[1]["content"][1]["type"] == "image_url"
86 | 
87 | 
88 | def test_serialize_response():
89 |     results = [
90 |         {
91 |             "markdown_chunks": [
92 |                 {"content": "Test content 1", "theme": "Theme 1"},
93 |                 {"content": "Test content 2", "theme": "Theme 2"}
94 |             ]
95 |         }
96 |     ]
97 |     file_path = "test.pdf"
98 | 
99 |     documents = LLMProcessing.serialize_response(results, file_path)
100 | 
101 |     assert len(documents) == 2
102 |     assert all(isinstance(doc, Document) for doc in documents)
103 |     assert documents[0].page_content == "Test content 1"
104 |     assert documents[0].metadata["semantic_theme"] == "Theme 1"
105 |     assert documents[0].metadata["source"] == file_path
106 | 
107 | 
108 | @pytest.mark.asyncio
109 | async def test_async_process_image_with_llm_success(llm_processor, sample_image, mocker):
110 |     # Create a mock response with the correct structure
111 |     mock_response = Mock()
112 |     mock_response.choices = [
113 |         Mock(
114 |             message=Mock(
115 |                 content='{"markdown_chunks": [{"content": "Test content", "theme": "Test theme"}]}'
116 |             )
117 |         )
118 |     ]
119 |     mocker.patch('smart_llm_loader.llm.acompletion', return_value=mock_response)
120 | 
121 |     result = await llm_processor.async_process_image_with_llm(sample_image, "Test prompt")
122 | 
123 |     assert "markdown_chunks" in result
124 |     assert len(result["markdown_chunks"]) == 1
125 |     assert result["markdown_chunks"][0]["content"] == "Test content"
126 | 
127 | 
128 | @pytest.mark.asyncio
129 | async def test_async_process_image_with_llm_error(llm_processor, sample_image, mocker):
130 |     mocker.patch('smart_llm_loader.llm.acompletion', side_effect=Exception("Test error"))
131 | 
132 |     result = await llm_processor.async_process_image_with_llm(sample_image, "Test prompt")
133 | 
134 |     assert "markdown_chunks" in result
135 |     assert result["markdown_chunks"][0]["content"] is None
136 | 
--------------------------------------------------------------------------------
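
The pipeline the listing above implements is: rasterize the PDF to one image per page, send each page to the vision model with a chunking prompt, validate the JSON reply against `OCRResponse`, and wrap the chunks as LangChain `Document` objects. A minimal end-to-end sketch follows; it only uses constructor arguments exercised by the tests above, and the file path and environment setup (a real `GEMINI_API_KEY`) are illustrative assumptions, not fixtures from the repository:

```python
# Minimal usage sketch (not part of the repository). Assumes GEMINI_API_KEY is set
# and that examples/data/test.pdf exists locally.
from smart_llm_loader.document_loader import SmartLLMLoader

# 'contextual' asks the model to group page content by semantic theme
loader = SmartLLMLoader(file_path="examples/data/test.pdf", chunk_strategy="contextual")
documents = loader.load()

for doc in documents:
    # Metadata keys populated by LLMProcessing.serialize_response
    print(doc.metadata["page"], doc.metadata["semantic_theme"], doc.metadata["source"])
    print(doc.page_content[:200])
```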
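The `custom` strategy feeds a caller-supplied prompt through the same pipeline; whatever the prompt says, the model's reply must still parse as `OCRResponse`, since `response_format=OCRResponse` is passed on every completion call. A sketch against `LLMProcessing` directly, where the prompt text and file path are made-up examples:

```python
# Illustrative sketch; the prompt text and path are hypothetical.
from smart_llm_loader.llm import LLMProcessing

processor = LLMProcessing(model="gemini/gemini-2.0-flash")

# get_chunk_prompt raises ValueError if chunk_strategy='custom' comes without a prompt
documents = processor.process_document_with_llm(
    file_path="examples/data/test.pdf",
    chunk_strategy="custom",
    custom_prompt="OCR this page into Markdown and make one chunk per section heading.",
)

# Each page's reply is validated as JSON of the form:
# {"markdown_chunks": [{"content": "...", "theme": "..."}]}
```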
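For multi-page documents the async path is the natural entry point, since pages are dispatched concurrently with `asyncio.gather`. A sketch using the loader's `aload` (which, per the tests, delegates to `async_process_document_with_llm`), with the same hypothetical path:

```python
# Illustrative async sketch; path is hypothetical.
import asyncio

from smart_llm_loader.document_loader import SmartLLMLoader


async def main() -> list:
    loader = SmartLLMLoader(file_path="examples/data/test.pdf")
    # aload fans out one LLM call per page and gathers the results
    return await loader.aload()


documents = asyncio.run(main())
```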