├── .gitattributes ├── .github └── workflows │ └── deploy.yml ├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── setup.cfg ├── src └── aipdf │ ├── __init__.py │ └── ocr.py └── tests ├── integration_tests ├── __init__.py ├── files │ ├── catalogue.pdf │ └── dictionary-1-5.pdf └── test_aipdf.py └── unit_tests ├── __init__.py └── test_aipdf.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish to PyPi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | # Push a new release to PyPI 9 | deploy_to_pypi: 10 | name: Publish to PyPI 11 | runs-on: ubuntu-latest 12 | if: github.actor != 'mindsdbadmin' 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python 16 | uses: actions/setup-python@v5.1.0 17 | with: 18 | python-version: ${{ vars.CI_PYTHON_VERSION }} 19 | - name: Install dependencies 20 | run: | 21 | pip install . 
22 | pip install setuptools wheel twine build 23 | - name: Clean previous builds 24 | run: rm -rf dist/ build/ *.egg-info 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: __token__ 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python -m build 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | *__pycache__* 3 | **/*.egg-info 4 | dist/* 5 | var 6 | etc 7 | .idea 8 | mindsdb/logs 9 | mindsdb/logs 10 | logs 11 | tests/temp 12 | /integration_tests/flows/config/* 13 | my_model.zip 14 | model.py 15 | tests/integration_tests/flows/model.py 16 | tests/integration_tests/flows/my_model.zip 17 | distributions/docker-compose/*/storage 18 | venv/* 19 | env/* 20 | mindsdb-venv/* 21 | root 22 | static 23 | tmp 24 | .pytest_cache 25 | tests/prediction_latency_test/*.csv 26 | storage_dir 27 | docker/dist/* 28 | .directory 29 | MindsDB.egg-info/* 30 | 31 | # Autokeras generated files 32 | auto_model 33 | autokeras 34 | structured_data_classifier 35 | structured_data_regressor 36 | mindsdb/integrations/handlers/autokeras_handler/structured_data_classifier 37 | 38 | # VisualStudioCode 39 | .vscode 40 | .vscode/* 41 | .history 42 | doc/lib/ 43 | run.csh 44 | 45 | # Virtualenv 46 | bin 47 | lib 48 | lib64 49 | pyvenv.cfg 50 | .python-version 51 | .DS_Store 52 | Scripts 53 | Include 54 | xgboost 55 | 56 | tests/__init__.py 57 | 58 | # Docs 59 | node_modules 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 MindsDB Inc 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation 
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AIPDF: Minimalistic PDF to Markdown (and others), with GPT-like Multimodal Models 2 | 3 | AIPDF is a stand-alone, minimalistic, yet powerful pure Python library that leverages multi-modal gen AI models (OpenAI, llama3 or compatible alternatives) to extract data from PDFs and convert it Markdown. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install aipdf 9 | ``` 10 | 11 | ## Quick Start 12 | 13 | ```python 14 | from aipdf import ocr 15 | 16 | # Your API key 17 | # This can also be via the environment variable AIPDF_API_KEY 18 | api_key = 'your_api_key' 19 | 20 | file = open('somepdf.pdf', 'rb') 21 | markdown_pages = ocr(file, api_key) 22 | ``` 23 | 24 | By default, AIPDF attempts to determine which pages to send to the LLM based on their content and whether they can be processed using traditional text parsing. 
This is done to improve performance, and the behavior can be overridden by setting the `use_llm_for_all` parameter to `True`: 25 | 26 | ```python 27 | markdown_pages = ocr(file, api_key, use_llm_for_all=True) 28 | ``` 29 | 30 | Every call to the LLM is made in parallel, so the processing time is significantly reduced. The above function will make these parallel calls using threading, however, it is also possible to make asynchronous calls instead by using the `ocr_async` function: 31 | 32 | ```python 33 | from aipdf import ocr_async 34 | import asyncio 35 | 36 | # Your API key 37 | # This can also be via the environment variable AIPDF_API_KEY 38 | api_key = 'your_api_key' 39 | 40 | file = open('somepdf.pdf', 'rb') 41 | 42 | async def main(): 43 | markdown_pages = await ocr_async(file, api_key) 44 | return markdown_pages 45 | 46 | markdown_pages = asyncio.run(main()) 47 | ``` 48 | 49 | The maximum number of concurrent requests made to the LLM can also be controlled via the `AIPDF_MAX_CONCURRENT_REQUESTS` environment variable. By default, there is no limit set. 50 | 51 | ## Ollama 52 | 53 | You can use with any ollama multi-modal models 54 | 55 | ```python 56 | ocr(pdf_file, api_key='ollama', model="llama3.2", base_url= 'http://localhost:11434/v1', prompt=...) 
57 | ``` 58 | ## Any file system 59 | 60 | We chose that you pass a file object, because that way it is flexible for you to use this with any type of file system, s3, localfiles, urls etc 61 | 62 | ### From url 63 | ```python 64 | 65 | pdf_file = io.BytesIO(requests.get('https://arxiv.org/pdf/2410.02467').content) 66 | 67 | # extract 68 | pages = ocr(pdf_file, api_key, prompt="extract tables, return each table in json") 69 | 70 | ``` 71 | ### From S3 72 | 73 | ```python 74 | 75 | s3 = boto3.client('s3', config=Config(signature_version='s3v4'), 76 | aws_access_key_id=access_token, 77 | aws_secret_access_key='', # Not needed for token-based auth 78 | aws_session_token=access_token) 79 | 80 | 81 | pdf_file = io.BytesIO(s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read()) 82 | # extract 83 | pages = ocr(pdf_file, api_key, prompt="extract charts data, turn it into tables that represent the variables in the chart") 84 | ``` 85 | 86 | 87 | ## Why AIPDF? 88 | 89 | 1. **Simplicity**: AIPDF provides a straightforward function, it requires minimal setup, dependencies and configuration. 90 | 2. **Power of AI**: Leverages state-of-the-art multi modal models (gpt, llama, ..). 91 | 3. **Customizable**: Tailor the extraction process to your specific needs with custom prompts. 92 | 4. **Efficient**: Utilizes parallel processing for faster extraction of multi-page PDFs. 93 | 94 | ## Requirements 95 | 96 | - Python 3.7+ 97 | 98 | We will keep this super clean, only 2 required libraries: 99 | 100 | - openai library to talk to completion endpoints 101 | - PyMuPDF library for traditional text parsing and image conversion 102 | 103 | ## License 104 | 105 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 106 | 107 | ## Contributing 108 | 109 | Contributions are welcome! Please feel free to submit a Pull Request. 
110 | 111 | ## Support 112 | 113 | If you encounter any problems or have any questions, please open an issue on the GitHub repository. 114 | 115 | --- 116 | 117 | AIPDF makes PDF data extraction simple, flexible, and powerful. Try it out and simplify your PDF processing workflow today! 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "aipdf" 7 | authors = [ 8 | {name = "Jorge Torres", email = "support@mindsdb.com"}, 9 | ] 10 | description = "A tool to extract PDF files to markdown, or any other format using AI" 11 | readme = "README.md" 12 | requires-python = ">=3.7" 13 | keywords = ["pdf", "markdown", "ai", "conversion", "openai"] 14 | license = {text = "MIT"} 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | dependencies = [ 21 | "PyMuPDF==1.25.2", 22 | "openai<2.0.0,>=1.58.1", 23 | ] 24 | dynamic = ["version"] 25 | 26 | [tool.setuptools.dynamic] 27 | version = {attr = "aipdf.__version__"} 28 | 29 | [project.urls] 30 | Homepage = "https://github.com/mindsdb/aipdf" 31 | Repository = "https://github.com/mindsdb/aipdf.git" 32 | 33 | [tool.setuptools_scm] -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = aipdf 3 | description = A tool to extract PDF files to markdown, or any other format using AI 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | 7 | [options] 8 | package_dir = 9 | = src 10 | packages = find: 11 | python_requires = >=3.7 12 | 13 | [options.packages.find] 14 | 
where = src -------------------------------------------------------------------------------- /src/aipdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .ocr import ocr, ocr_async 2 | 3 | __version__ = "0.0.6.2" 4 | 5 | __all__ = ["__version__", "ocr", "ocr_async"] 6 | -------------------------------------------------------------------------------- /src/aipdf/ocr.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import concurrent.futures 4 | import io 5 | import logging 6 | import os 7 | 8 | import fitz 9 | from openai import OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI 10 | 11 | 12 | DEFAULT_PROMPT = """ 13 | Extract the full markdown text from the given image, following these guidelines: 14 | - Respond only with markdown, no additional commentary. 15 | - Capture all the text, respecting titles, headers, subheaders, equations, etc. 16 | - If there are tables in this page, convert each one into markdown table format and include it in the response. 17 | - If there are images, provide a brief description of what is shown in each image, and include it in the response. 18 | - if there are charts, for each chart include a markdown table with the data represents the chart, a column for each of the variables of the cart and the relevant estimated values 19 | 20 | """ 21 | DEFAULT_DRAWING_AREA_THRESHOLD = 0.1 # 10% of the page area 22 | DEFAULT_GAP_THRESHOLD = 10 # 10 points 23 | 24 | 25 | def get_openai_client(api_key=None, base_url='https://api.openai.com/v1', is_async=False, **kwargs): 26 | """ 27 | Get an OpenAI client instance. 28 | 29 | Args: 30 | api_key (str): The OpenAI API key. 31 | base_url (str): The base URL for the OpenAI API. 32 | is_async (bool): Whether to create an asynchronous client. 33 | **kwargs: Additional keyword arguments. 34 | 35 | Returns: 36 | OpenAI or AsyncOpenAI: An instance of the OpenAI client. 
37 | """ 38 | if not api_key: 39 | api_key = os.getenv("AIPDF_API_KEY") 40 | 41 | if not api_key: 42 | raise ValueError("API key is required. Please provide it as an argument or set the AIPDF_API_KEY environment variable.") 43 | 44 | if base_url and "openai.azure.com" in base_url: 45 | if is_async: 46 | return AsyncAzureOpenAI(api_key=api_key, azure_endpoint=base_url, **kwargs) 47 | else: 48 | return AzureOpenAI(api_key=api_key, azure_endpoint=base_url, **kwargs) 49 | 50 | if is_async: 51 | return AsyncOpenAI(api_key=api_key, base_url=base_url, **kwargs) 52 | else: 53 | return OpenAI(api_key=api_key, base_url=base_url, **kwargs) 54 | 55 | 56 | def _prepare_image_messages(file_object, prompt): 57 | """ 58 | Helper function to prepare messages for OpenAI API call. 59 | 60 | Args: 61 | file_object (io.BytesIO): The image file object. 62 | prompt (str): The prompt to send to the API. 63 | 64 | Returns: 65 | list: The messages list for the API call. 66 | """ 67 | base64_image = base64.b64encode(file_object.read()).decode('utf-8') 68 | 69 | return [ 70 | { 71 | "role": "user", 72 | "content": [ 73 | { 74 | "type": "text", 75 | "text": prompt 76 | }, 77 | { 78 | "type": "image_url", 79 | "image_url": { 80 | "url": f"data:image/jpeg;base64,{base64_image}" 81 | } 82 | } 83 | ] 84 | } 85 | ] 86 | 87 | 88 | def _validate_and_extract_content(response): 89 | """ 90 | Helper function to validate OpenAI API response and extract content. 91 | 92 | Args: 93 | response: The response object from OpenAI API. 94 | 95 | Returns: 96 | str or None: The extracted content, or None if validation fails. 97 | """ 98 | # Validate the response structure before accessing choices 99 | if not response: 100 | logging.error(f"Received empty response from OpenAI API: {response}") 101 | return None 102 | 103 | if not hasattr(response, 'choices') or not response.choices: 104 | logging.error(f"Response does not contain choices or choices is empty. 
Response: {response}") 105 | return None 106 | 107 | if len(response.choices) == 0: 108 | logging.error(f"Response choices list is empty. Response: {response}") 109 | return None 110 | 111 | first_choice = response.choices[0] 112 | if not hasattr(first_choice, 'message') or not first_choice.message: 113 | logging.error(f"Response choice does not contain message. First choice: {first_choice}") 114 | return None 115 | 116 | if not hasattr(first_choice.message, 'content'): 117 | logging.error(f"Response message does not contain content. Message: {first_choice.message}") 118 | return None 119 | 120 | markdown_content = first_choice.message.content 121 | 122 | # Additional check for empty or None content 123 | if not markdown_content: 124 | logging.warning(f"Response content is empty or None. Content: {repr(markdown_content)}") 125 | return None 126 | 127 | return markdown_content 128 | 129 | 130 | def image_to_markdown(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMPT): 131 | """ 132 | Process a single image file and convert its content to markdown using OpenAI's API. 133 | 134 | Args: 135 | file_object (io.BytesIO): The image file object. 136 | client (OpenAI): The OpenAI client instance. 137 | model (str, optional): by default is gpt-4o 138 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 139 | 140 | Returns: 141 | str: The markdown representation of the image content, or None if an error occurs. 
142 | """ 143 | # Log that we're about to process a page 144 | logging.debug("About to process a page") 145 | 146 | messages = _prepare_image_messages(file_object, prompt) 147 | 148 | try: 149 | response = client.chat.completions.create( 150 | model=model, 151 | messages=messages 152 | ) 153 | 154 | markdown_content = _validate_and_extract_content(response) 155 | 156 | if markdown_content: 157 | logging.debug("Page processed successfully") 158 | return markdown_content 159 | else: 160 | logging.warning("Page is empty or contains no text.") 161 | return None 162 | 163 | except Exception as e: 164 | logging.error(f"An error occurred while processing the image: {e}") 165 | return None 166 | 167 | 168 | async def image_to_markdown_async(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMPT): 169 | """ 170 | Asynchronously process a single image file and convert its content to markdown using OpenAI's API. 171 | 172 | Args: 173 | file_object (io.BytesIO): The image file object. 174 | client (AsyncOpenAI): The AsyncOpenAI client instance. 175 | model (str, optional): by default is gpt-4o 176 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 177 | 178 | Returns: 179 | tuple: A tuple containing the page number and the markdown representation of the image content, or None if an error occurs. 
180 | """ 181 | # Log that we're about to process a page 182 | logging.debug("About to process a page") 183 | 184 | messages = _prepare_image_messages(file_object, prompt) 185 | 186 | try: 187 | response = await client.chat.completions.create( 188 | model=model, 189 | messages=messages 190 | ) 191 | 192 | markdown_content = _validate_and_extract_content(response) 193 | 194 | if markdown_content: 195 | logging.debug("Page processed successfully") 196 | return markdown_content 197 | else: 198 | logging.warning("Page is empty or contains no text.") 199 | return None 200 | 201 | except Exception as e: 202 | logging.error(f"An error occurred while processing the image: {e}") 203 | return None 204 | 205 | 206 | def is_visual_page(page, drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD): 207 | """ 208 | Determine if a page is visual based on presence of images or large drawings. 209 | 210 | Args: 211 | page (fitz.Page): The page object to analyze. 212 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 213 | 214 | Returns: 215 | bool: True if visual page, False otherwise. 
216 | """ 217 | page_area = page.rect.width * page.rect.height 218 | 219 | # Rule 1: If even one image is included, it is a visual page 220 | images = page.get_images(full=True) 221 | if len(images) > 0: 222 | return True 223 | 224 | # Rule 2: If large enough area is covered by real drawings, it is a visual page 225 | drawing_area = 0 226 | for d in page.get_drawings(): 227 | rect = d.get("rect") # Get the bounding box the contains the drawing 228 | if rect: 229 | area = rect.width * rect.height 230 | # Ignore tiny drawings 231 | if area > 5000: # minimum size in points² (~0.7% of page if full-page) 232 | drawing_area += area 233 | 234 | drawing_fraction = drawing_area / page_area 235 | 236 | if drawing_fraction > drawing_area_threshold: 237 | return True 238 | 239 | # Rule 3: If the page does not contain any text, it is a visual page 240 | # These could be scanned images or pages with other complex layouts 241 | if not page.get_text().strip(): 242 | return True 243 | 244 | # Otherwise, it's a text page 245 | return False 246 | 247 | 248 | def page_to_image(page): 249 | """ 250 | Convert a page of a PDF file to an image file. 251 | 252 | Args: 253 | page (fitz.Page): The page object to convert. 254 | 255 | Returns: 256 | bytes: The image file in bytes. 257 | """ 258 | zoom_x = 2.0 # Horizontal zoom 259 | zoom_y = 2.0 # Vertical zoom 260 | mat = fitz.Matrix(zoom_x, zoom_y) # Zoom factor 2 in each dimension 261 | 262 | pix = page.get_pixmap(matrix=mat) 263 | return pix.tobytes("png") 264 | 265 | 266 | def page_to_markdown(page, gap_threshold=DEFAULT_GAP_THRESHOLD): 267 | """ 268 | Convert a page of a PDF file to markdown format. 269 | 270 | Args: 271 | page (fitz.Page): The page object to convert. 272 | gap_threshold (int, optional): The threshold for vertical gaps between text blocks. Defaults to 10. 273 | 274 | Returns: 275 | str: The markdown representation of the page. 
276 | """ 277 | blocks = page.get_text("blocks") 278 | blocks.sort(key=lambda block: (block[1], block[0])) 279 | 280 | markdown_page = [] 281 | previous_block_bottom = 0 282 | 283 | for block in blocks: 284 | y0 = block[1] 285 | y1 = block[3] 286 | block_text = block[4] 287 | 288 | # Check if there's a large vertical gap between this block and the previous one 289 | if y0 - previous_block_bottom > gap_threshold: 290 | markdown_page.append("") 291 | 292 | markdown_page.append(block_text) 293 | previous_block_bottom = y1 294 | 295 | return "\n".join(markdown_page) 296 | 297 | 298 | def process_pages(pdf_file, pages_list=None, use_llm_for_all=False, drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD, gap_threshold=DEFAULT_GAP_THRESHOLD): 299 | """ 300 | Process the pages of a PDF file to determine which ones are visual and which ones are text-based. 301 | 302 | Args: 303 | pdf_file (io.BytesIO): The PDF file object. 304 | pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. 305 | use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False. 306 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 307 | gap_threshold (int): The threshold for vertical gaps between text blocks. 308 | 309 | Returns: 310 | tuple: A tuple containing a list of markdown-formatted pages and a dictionary of image files. 
311 | """ 312 | doc = fitz.open(stream=pdf_file.read(), filetype="pdf") 313 | 314 | pages_list = pages_list or list(range(1, doc.page_count + 1)) # Default to all pages if not provided 315 | 316 | # List to store markdown content for each page 317 | markdown_pages = [None] * len(pages_list) 318 | 319 | image_files = {} 320 | for page_num in pages_list: 321 | page = doc.load_page(page_num - 1) 322 | if not use_llm_for_all and not is_visual_page(page, drawing_area_threshold=drawing_area_threshold): 323 | logging.debug(f"The content of Page {page.number + 1} will be extracted using text parsing.") 324 | # Extract text using traditional OCR 325 | markdown_content = page_to_markdown(page, gap_threshold=gap_threshold) 326 | if markdown_content: 327 | markdown_pages[page_num - 1] = markdown_content 328 | else: 329 | logging.warning(f"Page {page.number + 1} is empty or contains no text.") 330 | markdown_pages[page_num - 1] = f"Page {page.number + 1} is empty or contains no text." 331 | 332 | else: 333 | logging.debug(f"The content of page {page.number + 1} will be extracted using the LLM.") 334 | # Convert page to image 335 | image_file = page_to_image(page) 336 | image_files[page_num - 1] = io.BytesIO(image_file) 337 | 338 | return markdown_pages, image_files 339 | 340 | 341 | def ocr( 342 | pdf_file, 343 | api_key = None, 344 | model="gpt-4o", 345 | base_url='https://api.openai.com/v1', 346 | prompt=DEFAULT_PROMPT, 347 | pages_list=None, 348 | use_llm_for_all=False, 349 | drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD, 350 | gap_threshold=DEFAULT_GAP_THRESHOLD, 351 | logging_level=logging.INFO, 352 | **kwargs 353 | ): 354 | """ 355 | Convert a PDF file to a list of markdown-formatted pages using text parsing and OpenAI's API. 356 | The OpenAI API is called in parallel using threading for each image file. 357 | This function is synchronous. 358 | 359 | Args: 360 | pdf_file (io.BytesIO): The PDF file object. 361 | api_key (str): The OpenAI API key. 
362 | model (str, optional): by default is gpt-4o 363 | base_url (str): You can use this one to point the client whereever you need it like Ollama 364 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 365 | pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. 366 | use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False. 367 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 368 | gap_threshold (int): The threshold for vertical gaps between text blocks. 369 | logging_level (int): The logging level. Defaults to logging.INFO. 370 | **kwargs: Additional keyword arguments. 371 | 372 | Returns: 373 | list: A list of strings, each containing the markdown representation of a PDF page. 374 | """ 375 | # Set up logging 376 | logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s') 377 | 378 | client = get_openai_client(api_key=api_key, base_url=base_url, **kwargs) 379 | 380 | # Identify the maximum number of workers for parallel processing 381 | max_workers = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None) 382 | if max_workers: 383 | logging.debug("The maximum number of concurrent requests is set to %s", max_workers) 384 | max_workers = int(max_workers) 385 | 386 | markdown_pages, image_files = process_pages( 387 | pdf_file, 388 | pages_list=pages_list, 389 | use_llm_for_all=use_llm_for_all, 390 | drawing_area_threshold=drawing_area_threshold, 391 | gap_threshold=gap_threshold 392 | ) 393 | 394 | if image_files: 395 | if max_workers: 396 | executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) 397 | else: 398 | executor = concurrent.futures.ThreadPoolExecutor() 399 | 400 | # Process each image file in parallel 401 | with executor: 402 | # Submit tasks 
for each image file 403 | future_to_page = {executor.submit(image_to_markdown, img_file, client, model, prompt): page_num 404 | for page_num, img_file in image_files.items()} 405 | 406 | # Collect results as they complete 407 | for future in concurrent.futures.as_completed(future_to_page): 408 | page_num = future_to_page[future] 409 | try: 410 | markdown_content = future.result() 411 | if markdown_content: 412 | markdown_pages[page_num] = markdown_content 413 | else: 414 | markdown_pages[page_num] = f"Error processing page {page_num + 1}." 415 | except Exception as e: 416 | logging.error(f"Error processing page {page_num + 1}: {e}") 417 | markdown_pages[page_num] = f"Error processing page {page_num + 1}: {str(e)}" 418 | 419 | return markdown_pages 420 | 421 | 422 | async def ocr_async( 423 | pdf_file, 424 | api_key = None, 425 | model="gpt-4o", 426 | base_url='https://api.openai.com/v1', 427 | prompt=DEFAULT_PROMPT, 428 | pages_list=None, 429 | use_llm_for_all=False, 430 | drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD, 431 | gap_threshold=DEFAULT_GAP_THRESHOLD, 432 | logging_level=logging.INFO, 433 | **kwargs 434 | ): 435 | """ 436 | Convert a PDF file to a list of markdown-formatted pages using text parsing and OpenAI's API. 437 | The OpenAI API is called asynchronously for each image file. 438 | This function is asynchronous. 439 | 440 | Args: 441 | pdf_file (io.BytesIO): The PDF file object. 442 | api_key (str): The OpenAI API key. 443 | model (str, optional): by default is gpt-4o 444 | base_url (str): You can use this one to point the client whereever you need it like Ollama 445 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 446 | pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. 447 | use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. 
Defaults to False. 448 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 449 | gap_threshold (int): The threshold for vertical gaps between text blocks. 450 | logging_level (int): The logging level. Defaults to logging.INFO. 451 | **kwargs: Additional keyword arguments. 452 | 453 | Returns: 454 | list: A list of strings, each containing the markdown representation of a PDF page. 455 | """ 456 | # Set up logging 457 | logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s') 458 | 459 | client = get_openai_client(api_key=api_key, base_url=base_url, is_async=True, **kwargs) 460 | 461 | # Set up a semaphore for limiting concurrent requests if specified 462 | semaphore = None 463 | max_concurrent_requests = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None) 464 | if max_concurrent_requests: 465 | logging.debug("The maximum number of concurrent requests is set to %s", max_concurrent_requests) 466 | max_concurrent_requests = int(max_concurrent_requests) 467 | semaphore = asyncio.Semaphore(max_concurrent_requests) 468 | 469 | markdown_pages, image_files = process_pages( 470 | pdf_file, 471 | pages_list=pages_list, 472 | use_llm_for_all=use_llm_for_all, 473 | drawing_area_threshold=drawing_area_threshold, 474 | gap_threshold=gap_threshold 475 | ) 476 | 477 | if image_files: 478 | # Process each image file in parallel 479 | tasks = [] 480 | 481 | async def task_wrapper(img_file, page_num): 482 | if semaphore: 483 | async with semaphore: 484 | markdown_content = await image_to_markdown_async(img_file, client, model, prompt) 485 | else: 486 | markdown_content = await image_to_markdown_async(img_file, client, model, prompt) 487 | return page_num, markdown_content 488 | 489 | tasks = [task_wrapper(img_file, page_num) for page_num, img_file in image_files.items()] 490 | 491 | # Collect results as they complete 492 | results = await asyncio.gather(*tasks) 493 | 494 | for page_num, 
markdown_content in results: 495 | if markdown_content: 496 | markdown_pages[page_num] = markdown_content 497 | else: 498 | markdown_pages[page_num] = f"Error processing page {page_num + 1}." 499 | 500 | return markdown_pages 501 | -------------------------------------------------------------------------------- /tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindsdb/aipdf/38650dc305218ee4ee37ee7eea61a5b88172b35b/tests/integration_tests/__init__.py -------------------------------------------------------------------------------- /tests/integration_tests/files/catalogue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindsdb/aipdf/38650dc305218ee4ee37ee7eea61a5b88172b35b/tests/integration_tests/files/catalogue.pdf -------------------------------------------------------------------------------- /tests/integration_tests/files/dictionary-1-5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindsdb/aipdf/38650dc305218ee4ee37ee7eea61a5b88172b35b/tests/integration_tests/files/dictionary-1-5.pdf -------------------------------------------------------------------------------- /tests/integration_tests/test_aipdf.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import os 4 | import time 5 | import unittest 6 | 7 | from src.aipdf.ocr import ocr, ocr_async 8 | 9 | 10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 11 | 12 | 13 | class TestOCRIntegration(unittest.TestCase): 14 | def setUp(self): 15 | # Path to the directory containing test PDF files 16 | self.files_dir = os.path.join(os.path.dirname(__file__), "files") 17 | 18 | def test_ocr_on_sample_pdfs(self): 19 | # Iterate through all PDF files in the files 
directory 20 | for file_name in os.listdir(self.files_dir): 21 | if file_name.endswith(".pdf"): 22 | file_path = os.path.join(self.files_dir, file_name) 23 | with open(file_path, "rb") as pdf_file: 24 | pdf_bytes = io.BytesIO(pdf_file.read()) 25 | 26 | start_time = time.time() 27 | result = ocr(pdf_bytes) 28 | elapsed_time = time.time() - start_time 29 | logging.info(f"Processed {file_name} in {elapsed_time:.2f} seconds") 30 | 31 | self.assertIsInstance(result, list) 32 | self.assertGreater(len(result), 0, f"Result is empty for file: {file_name}") 33 | for page_content in result: 34 | self.assertIsInstance(page_content, str) 35 | self.assertGreater(len(page_content.strip()), 0, f"Page content is empty for file: {file_name}") 36 | 37 | 38 | class TestOCRAsyncIntegration(unittest.IsolatedAsyncioTestCase): 39 | def setUp(self): 40 | # Path to the directory containing test PDF files 41 | self.files_dir = os.path.join(os.path.dirname(__file__), "files") 42 | 43 | async def test_ocr_async_on_sample_pdfs(self): 44 | # Iterate through all PDF files in the files directory 45 | for file_name in os.listdir(self.files_dir): 46 | if file_name.endswith(".pdf"): 47 | file_path = os.path.join(self.files_dir, file_name) 48 | with open(file_path, "rb") as pdf_file: 49 | pdf_bytes = io.BytesIO(pdf_file.read()) 50 | 51 | start_time = time.time() 52 | result = await ocr_async(pdf_bytes) 53 | elapsed_time = time.time() - start_time 54 | logging.info(f"Processed {file_name} in {elapsed_time:.2f} seconds") 55 | 56 | self.assertIsInstance(result, list) 57 | self.assertGreater(len(result), 0, f"Result is empty for file: {file_name}") 58 | for page_content in result: 59 | self.assertIsInstance(page_content, str) 60 | self.assertGreater(len(page_content.strip()), 0, f"Page content is empty for file: {file_name}") 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest.main() -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: 
import unittest
from unittest.mock import MagicMock, patch
import io
from aipdf.ocr import (
    image_to_markdown,
    is_visual_page,
    page_to_image,
    page_to_markdown,
    ocr,
)


class TestPageToImage(unittest.TestCase):
    """page_to_image() should return the raw bytes of the page's pixmap."""

    @patch("fitz.Page")
    def test_page_to_image(self, mock_page):
        # Mock page pixmap
        mock_pixmap = MagicMock()
        mock_pixmap.tobytes.return_value = b"fake image bytes"
        mock_page.get_pixmap.return_value = mock_pixmap

        result = page_to_image(mock_page)

        self.assertEqual(result, b"fake image bytes")
        mock_page.get_pixmap.assert_called_once()


class TestPageToMarkdown(unittest.TestCase):
    """page_to_markdown() should join text-block strings with newlines."""

    @patch("fitz.Page")
    def test_page_to_markdown(self, mock_page):
        # Mock page text blocks: (x0, y0, x1, y1, text) tuples as returned
        # by fitz's get_text("blocks").
        mock_page.get_text.return_value = [
            (0, 0, 100, 50, "Header"),
            (0, 60, 100, 100, "Body text"),
        ]

        result = page_to_markdown(mock_page)

        self.assertEqual(result, "Header\nBody text")
        mock_page.get_text.assert_called_once_with("blocks")


class TestIsVisualPage(unittest.TestCase):
    """is_visual_page() should flag pages containing images or drawings."""

    @patch("fitz.Page")
    def test_is_visual_page_with_images(self, mock_page):
        # Mock page with images
        mock_page.get_images.return_value = [("image1",)]

        result = is_visual_page(mock_page)

        self.assertTrue(result)

    @patch("fitz.Page")
    def test_is_visual_page_with_drawings(self, mock_page):
        # Mock page with drawings but no images
        mock_page.rect.width = 100
        mock_page.rect.height = 100
        mock_page.get_images.return_value = []

        # Drawing rect matching the page size, so the drawing is
        # considered significant visual content.
        mock_rect = MagicMock()
        mock_rect.width = 100
        mock_rect.height = 100
        mock_drawing = {"rect": mock_rect}
        # Fixed: the original set this return value twice in a row.
        mock_page.get_drawings.return_value = [mock_drawing]

        result = is_visual_page(mock_page)

        self.assertTrue(result)

    @patch("fitz.Page")
    def test_is_visual_page_with_no_visual_content(self, mock_page):
        # Mock page with no images or drawings
        mock_page.rect.width = 100
        mock_page.rect.height = 100
        mock_page.get_images.return_value = []
        mock_page.get_drawings.return_value = []
        mock_page.get_text.return_value = "Some text"

        result = is_visual_page(mock_page)

        self.assertFalse(result)


class TestImageToMarkdown(unittest.TestCase):
    """image_to_markdown() wraps the OpenAI chat completion call."""

    @patch("openai.OpenAI")
    def test_image_to_markdown_success(self, mock_openai):
        # Mock OpenAI client response
        mock_client = MagicMock()
        mock_openai.return_value = mock_client
        mock_client.chat.completions.create.return_value = MagicMock(
            choices=[MagicMock(message=MagicMock(content="Markdown content"))]
        )

        file_object = io.BytesIO(b"fake image data")
        result = image_to_markdown(file_object, mock_client)

        self.assertEqual(result, "Markdown content")
        mock_client.chat.completions.create.assert_called_once()

    @patch("openai.OpenAI")
    def test_image_to_markdown_failure(self, mock_openai):
        # Mock OpenAI client to raise an exception; the helper is expected
        # to swallow it and return None rather than propagate.
        mock_client = MagicMock()
        mock_openai.return_value = mock_client
        mock_client.chat.completions.create.side_effect = Exception("API error")

        file_object = io.BytesIO(b"fake image data")
        result = image_to_markdown(file_object, mock_client)

        self.assertIsNone(result)


class TestOCR(unittest.TestCase):
    """ocr() should extract text pages without calling the vision model."""

    @patch("fitz.open")
    @patch("openai.OpenAI")
    def test_ocr_with_text_pages(self, mock_openai, mock_fitz_open):
        # Mock the PDF document with a single text-only page
        mock_doc = MagicMock()
        mock_page = MagicMock()
        mock_page.rect.width = 100
        mock_page.rect.height = 100
        mock_doc.page_count = 1
        mock_doc.load_page.return_value = mock_page
        mock_fitz_open.return_value = mock_doc

        # get_text() is called both with no args ("text") and with "blocks";
        # return the matching shape for each.
        def mock_get_text(arg="text"):
            if arg == "text":
                return "Header\nBody text"
            elif arg == "blocks":
                return [
                    (0, 0, 100, 50, "Header"),
                    (0, 60, 100, 100, "Body text"),
                ]
            else:
                return ""
        mock_page.get_text.side_effect = mock_get_text

        # Mock OpenAI client
        mock_client = MagicMock()
        mock_openai.return_value = mock_client

        pdf_file = io.BytesIO(b"Header\nBody text")
        result = ocr(pdf_file, api_key="fake_api_key")

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], "Header\nBody text")
        mock_page.get_text.assert_any_call()
        mock_page.get_text.assert_any_call("blocks")


if __name__ == "__main__":
    unittest.main()