├── .gitattributes ├── .github └── workflows │ └── deploy.yml ├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── setup.cfg ├── src └── aipdf │ ├── __init__.py │ └── ocr.py └── tests ├── integration_tests ├── __init__.py ├── files │ ├── catalogue.pdf │ └── dictionary-1-5.pdf └── test_aipdf.py └── unit_tests ├── __init__.py └── test_aipdf.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish to PyPi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | jobs: 8 | # Push a new release to PyPI 9 | deploy_to_pypi: 10 | name: Publish to PyPI 11 | runs-on: ubuntu-latest 12 | if: github.actor != 'mindsdbadmin' 13 | steps: 14 | - uses: actions/checkout@v4 15 | - name: Set up Python 16 | uses: actions/setup-python@v5.1.0 17 | with: 18 | python-version: ${{ vars.CI_PYTHON_VERSION }} 19 | - name: Install dependencies 20 | run: | 21 | pip install . 
22 | pip install setuptools wheel twine build 23 | - name: Clean previous builds 24 | run: rm -rf dist/ build/ *.egg-info 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: __token__ 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python -m build 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | *__pycache__* 3 | **/*.egg-info 4 | dist/* 5 | var 6 | etc 7 | .idea 8 | mindsdb/logs 9 | mindsdb/logs 10 | logs 11 | tests/temp 12 | /integration_tests/flows/config/* 13 | my_model.zip 14 | model.py 15 | tests/integration_tests/flows/model.py 16 | tests/integration_tests/flows/my_model.zip 17 | distributions/docker-compose/*/storage 18 | venv/* 19 | env/* 20 | mindsdb-venv/* 21 | root 22 | static 23 | tmp 24 | .pytest_cache 25 | tests/prediction_latency_test/*.csv 26 | storage_dir 27 | docker/dist/* 28 | .directory 29 | MindsDB.egg-info/* 30 | 31 | # Autokeras generated files 32 | auto_model 33 | autokeras 34 | structured_data_classifier 35 | structured_data_regressor 36 | mindsdb/integrations/handlers/autokeras_handler/structured_data_classifier 37 | 38 | # VisualStudioCode 39 | .vscode 40 | .vscode/* 41 | .history 42 | doc/lib/ 43 | run.csh 44 | 45 | # Virtualenv 46 | bin 47 | lib 48 | lib64 49 | pyvenv.cfg 50 | .python-version 51 | .DS_Store 52 | Scripts 53 | Include 54 | xgboost 55 | 56 | tests/__init__.py 57 | 58 | # Docs 59 | node_modules 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 MindsDB Inc 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation 
the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AIPDF: Minimalistic PDF to Markdown (and others), with GPT-like Multimodal Models 2 | 3 | AIPDF is a stand-alone, minimalistic, yet powerful pure Python library that leverages multi-modal gen AI models (OpenAI, llama3 or compatible alternatives) to extract data from PDFs and convert it Markdown. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install aipdf 9 | ``` 10 | 11 | ## Quick Start 12 | 13 | ```python 14 | from aipdf import ocr 15 | 16 | # Your API key 17 | # This can also be via the environment variable AIPDF_API_KEY 18 | api_key = 'your_api_key' 19 | 20 | file = open('somepdf.pdf', 'rb') 21 | markdown_pages = ocr(file, api_key) 22 | ``` 23 | 24 | By default, AIPDF attempts to determine which pages to send to the LLM based on their content and whether they can be processed using traditional text parsing. 
This is done to improve performance, and the behavior can be overridden by setting the `use_llm_for_all` parameter to `True`: 25 | 26 | ```python 27 | markdown_pages = ocr(file, api_key, use_llm_for_all=True) 28 | ``` 29 | 30 | Every call to the LLM is made in parallel, so the processing time is significantly reduced. The above function will make these parallel calls using threading, however, it is also possible to make asynchronous calls instead by using the `ocr_async` function: 31 | 32 | ```python 33 | from aipdf import ocr_async 34 | import asyncio 35 | 36 | # Your API key 37 | # This can also be via the environment variable AIPDF_API_KEY 38 | api_key = 'your_api_key' 39 | 40 | file = open('somepdf.pdf', 'rb') 41 | 42 | async def main(): 43 | markdown_pages = await ocr_async(file, api_key) 44 | return markdown_pages 45 | 46 | markdown_pages = asyncio.run(main()) 47 | ``` 48 | 49 | The maximum number of concurrent requests made to the LLM can also be controlled via the `AIPDF_MAX_CONCURRENT_REQUESTS` environment variable. By default, there is no limit set. 50 | 51 | ## Ollama 52 | 53 | You can use with any ollama multi-modal models 54 | 55 | ```python 56 | ocr(pdf_file, api_key='ollama', model="llama3.2", base_url= 'http://localhost:11434/v1', prompt=...) 
57 | ``` 58 | ## Any file system 59 | 60 | We chose that you pass a file object, because that way it is flexible for you to use this with any type of file system, s3, localfiles, urls etc 61 | 62 | ### From url 63 | ```python 64 | 65 | pdf_file = io.BytesIO(requests.get('https://arxiv.org/pdf/2410.02467').content) 66 | 67 | # extract 68 | pages = ocr(pdf_file, api_key, prompt="extract tables, return each table in json") 69 | 70 | ``` 71 | ### From S3 72 | 73 | ```python 74 | 75 | s3 = boto3.client('s3', config=Config(signature_version='s3v4'), 76 | aws_access_key_id=access_token, 77 | aws_secret_access_key='', # Not needed for token-based auth 78 | aws_session_token=access_token) 79 | 80 | 81 | pdf_file = io.BytesIO(s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read()) 82 | # extract 83 | pages = ocr(pdf_file, api_key, prompt="extract charts data, turn it into tables that represent the variables in the chart") 84 | ``` 85 | 86 | 87 | ## Why AIPDF? 88 | 89 | 1. **Simplicity**: AIPDF provides a straightforward function, it requires minimal setup, dependencies and configuration. 90 | 2. **Power of AI**: Leverages state-of-the-art multi modal models (gpt, llama, ..). 91 | 3. **Customizable**: Tailor the extraction process to your specific needs with custom prompts. 92 | 4. **Efficient**: Utilizes parallel processing for faster extraction of multi-page PDFs. 93 | 94 | ## Requirements 95 | 96 | - Python 3.7+ 97 | 98 | We will keep this super clean, only 2 required libraries: 99 | 100 | - openai library to talk to completion endpoints 101 | - PyMuPDF library for traditional text parsing and image conversion 102 | 103 | ## License 104 | 105 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 106 | 107 | ## Contributing 108 | 109 | Contributions are welcome! Please feel free to submit a Pull Request. 
110 | 111 | ## Support 112 | 113 | If you encounter any problems or have any questions, please open an issue on the GitHub repository. 114 | 115 | --- 116 | 117 | AIPDF makes PDF data extraction simple, flexible, and powerful. Try it out and simplify your PDF processing workflow today! 118 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "aipdf" 7 | authors = [ 8 | {name = "Jorge Torres", email = "support@mindsdb.com"}, 9 | ] 10 | description = "A tool to extract PDF files to markdown, or any other format using AI" 11 | readme = "README.md" 12 | requires-python = ">=3.7" 13 | keywords = ["pdf", "markdown", "ai", "conversion", "openai"] 14 | license = {text = "MIT"} 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | dependencies = [ 21 | "PyMuPDF==1.25.2", 22 | "openai<2.0.0,>=1.58.1", 23 | ] 24 | dynamic = ["version"] 25 | 26 | [tool.setuptools.dynamic] 27 | version = {attr = "aipdf.__version__"} 28 | 29 | [project.urls] 30 | Homepage = "https://github.com/mindsdb/aipdf" 31 | Repository = "https://github.com/mindsdb/aipdf.git" 32 | 33 | [tool.setuptools_scm] -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = aipdf 3 | description = A tool to extract PDF files to markdown, or any other format using AI 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | 7 | [options] 8 | package_dir = 9 | = src 10 | packages = find: 11 | python_requires = >=3.7 12 | 13 | [options.packages.find] 14 | 
where = src -------------------------------------------------------------------------------- /src/aipdf/__init__.py: -------------------------------------------------------------------------------- 1 | from .ocr import ocr, ocr_async 2 | 3 | __version__ = "0.0.6.2" 4 | 5 | __all__ = ["__version__", "ocr", "ocr_async"] 6 | -------------------------------------------------------------------------------- /src/aipdf/ocr.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import concurrent.futures 4 | import io 5 | import logging 6 | import os 7 | 8 | import fitz 9 | from openai import OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI 10 | 11 | 12 | DEFAULT_PROMPT = """ 13 | Extract the full markdown text from the given image, following these guidelines: 14 | - Respond only with markdown, no additional commentary. 15 | - Capture all the text, respecting titles, headers, subheaders, equations, etc. 16 | - If there are tables in this page, convert each one into markdown table format and include it in the response. 17 | - If there are images, provide a brief description of what is shown in each image, and include it in the response. 18 | - if there are charts, for each chart include a markdown table with the data represents the chart, a column for each of the variables of the cart and the relevant estimated values 19 | 20 | """ 21 | DEFAULT_DRAWING_AREA_THRESHOLD = 0.1 # 10% of the page area 22 | DEFAULT_GAP_THRESHOLD = 10 # 10 points 23 | 24 | 25 | def get_openai_client(api_key=None, base_url='https://api.openai.com/v1', is_async=False, **kwargs): 26 | """ 27 | Get an OpenAI client instance. 28 | 29 | Args: 30 | api_key (str): The OpenAI API key. 31 | base_url (str): The base URL for the OpenAI API. 32 | is_async (bool): Whether to create an asynchronous client. 33 | **kwargs: Additional keyword arguments. 34 | 35 | Returns: 36 | OpenAI or AsyncOpenAI: An instance of the OpenAI client. 
37 | """ 38 | if not api_key: 39 | api_key = os.getenv("AIPDF_API_KEY") 40 | 41 | if not api_key: 42 | raise ValueError("API key is required. Please provide it as an argument or set the AIPDF_API_KEY environment variable.") 43 | 44 | if base_url and "openai.azure.com" in base_url: 45 | if is_async: 46 | return AsyncAzureOpenAI(api_key=api_key, azure_endpoint=base_url, **kwargs) 47 | else: 48 | return AzureOpenAI(api_key=api_key, azure_endpoint=base_url, **kwargs) 49 | 50 | if is_async: 51 | return AsyncOpenAI(api_key=api_key, base_url=base_url, **kwargs) 52 | else: 53 | return OpenAI(api_key=api_key, base_url=base_url, **kwargs) 54 | 55 | 56 | def _prepare_image_messages(file_object, prompt): 57 | """ 58 | Helper function to prepare messages for OpenAI API call. 59 | 60 | Args: 61 | file_object (io.BytesIO): The image file object. 62 | prompt (str): The prompt to send to the API. 63 | 64 | Returns: 65 | list: The messages list for the API call. 66 | """ 67 | base64_image = base64.b64encode(file_object.read()).decode('utf-8') 68 | 69 | return [ 70 | { 71 | "role": "user", 72 | "content": [ 73 | { 74 | "type": "text", 75 | "text": prompt 76 | }, 77 | { 78 | "type": "image_url", 79 | "image_url": { 80 | "url": f"data:image/jpeg;base64,{base64_image}" 81 | } 82 | } 83 | ] 84 | } 85 | ] 86 | 87 | 88 | def _validate_and_extract_content(response): 89 | """ 90 | Helper function to validate OpenAI API response and extract content. 91 | 92 | Args: 93 | response: The response object from OpenAI API. 94 | 95 | Returns: 96 | str or None: The extracted content, or None if validation fails. 97 | """ 98 | # Validate the response structure before accessing choices 99 | if not response: 100 | logging.error(f"Received empty response from OpenAI API: {response}") 101 | return None 102 | 103 | if not hasattr(response, 'choices') or not response.choices: 104 | logging.error(f"Response does not contain choices or choices is empty. 
Response: {response}") 105 | return None 106 | 107 | if len(response.choices) == 0: 108 | logging.error(f"Response choices list is empty. Response: {response}") 109 | return None 110 | 111 | first_choice = response.choices[0] 112 | if not hasattr(first_choice, 'message') or not first_choice.message: 113 | logging.error(f"Response choice does not contain message. First choice: {first_choice}") 114 | return None 115 | 116 | if not hasattr(first_choice.message, 'content'): 117 | logging.error(f"Response message does not contain content. Message: {first_choice.message}") 118 | return None 119 | 120 | markdown_content = first_choice.message.content 121 | 122 | # Additional check for empty or None content 123 | if not markdown_content: 124 | logging.warning(f"Response content is empty or None. Content: {repr(markdown_content)}") 125 | return None 126 | 127 | return markdown_content 128 | 129 | 130 | def image_to_markdown(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMPT): 131 | """ 132 | Process a single image file and convert its content to markdown using OpenAI's API. 133 | 134 | Args: 135 | file_object (io.BytesIO): The image file object. 136 | client (OpenAI): The OpenAI client instance. 137 | model (str, optional): by default is gpt-4o 138 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 139 | 140 | Returns: 141 | str: The markdown representation of the image content, or None if an error occurs. 
142 | """ 143 | # Log that we're about to process a page 144 | logging.debug("About to process a page") 145 | 146 | messages = _prepare_image_messages(file_object, prompt) 147 | 148 | try: 149 | response = client.chat.completions.create( 150 | model=model, 151 | messages=messages 152 | ) 153 | 154 | markdown_content = _validate_and_extract_content(response) 155 | 156 | if markdown_content: 157 | logging.debug("Page processed successfully") 158 | return markdown_content 159 | else: 160 | logging.warning("Page is empty or contains no text.") 161 | return None 162 | 163 | except Exception as e: 164 | logging.error(f"An error occurred while processing the image: {e}") 165 | return None 166 | 167 | 168 | async def image_to_markdown_async(file_object, client, model="gpt-4o", prompt=DEFAULT_PROMPT): 169 | """ 170 | Asynchronously process a single image file and convert its content to markdown using OpenAI's API. 171 | 172 | Args: 173 | file_object (io.BytesIO): The image file object. 174 | client (AsyncOpenAI): The AsyncOpenAI client instance. 175 | model (str, optional): by default is gpt-4o 176 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 177 | 178 | Returns: 179 | tuple: A tuple containing the page number and the markdown representation of the image content, or None if an error occurs. 
180 | """ 181 | # Log that we're about to process a page 182 | logging.debug("About to process a page") 183 | 184 | messages = _prepare_image_messages(file_object, prompt) 185 | 186 | try: 187 | response = await client.chat.completions.create( 188 | model=model, 189 | messages=messages 190 | ) 191 | 192 | markdown_content = _validate_and_extract_content(response) 193 | 194 | if markdown_content: 195 | logging.debug("Page processed successfully") 196 | return markdown_content 197 | else: 198 | logging.warning("Page is empty or contains no text.") 199 | return None 200 | 201 | except Exception as e: 202 | logging.error(f"An error occurred while processing the image: {e}") 203 | return None 204 | 205 | 206 | def is_visual_page(page, drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD): 207 | """ 208 | Determine if a page is visual based on presence of images or large drawings. 209 | 210 | Args: 211 | page (fitz.Page): The page object to analyze. 212 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 213 | 214 | Returns: 215 | bool: True if visual page, False otherwise. 
216 | """ 217 | page_area = page.rect.width * page.rect.height 218 | 219 | # Rule 1: If even one image is included, it is a visual page 220 | images = page.get_images(full=True) 221 | if len(images) > 0: 222 | return True 223 | 224 | # Rule 2: If large enough area is covered by real drawings, it is a visual page 225 | drawing_area = 0 226 | for d in page.get_drawings(): 227 | rect = d.get("rect") # Get the bounding box the contains the drawing 228 | if rect: 229 | area = rect.width * rect.height 230 | # Ignore tiny drawings 231 | if area > 5000: # minimum size in points² (~0.7% of page if full-page) 232 | drawing_area += area 233 | 234 | drawing_fraction = drawing_area / page_area 235 | 236 | if drawing_fraction > drawing_area_threshold: 237 | return True 238 | 239 | # Rule 3: If the page does not contain any text, it is a visual page 240 | # These could be scanned images or pages with other complex layouts 241 | if not page.get_text().strip(): 242 | return True 243 | 244 | # Otherwise, it's a text page 245 | return False 246 | 247 | 248 | def page_to_image(page): 249 | """ 250 | Convert a page of a PDF file to an image file. 251 | 252 | Args: 253 | page (fitz.Page): The page object to convert. 254 | 255 | Returns: 256 | bytes: The image file in bytes. 257 | """ 258 | zoom_x = 2.0 # Horizontal zoom 259 | zoom_y = 2.0 # Vertical zoom 260 | mat = fitz.Matrix(zoom_x, zoom_y) # Zoom factor 2 in each dimension 261 | 262 | pix = page.get_pixmap(matrix=mat) 263 | return pix.tobytes("png") 264 | 265 | 266 | def page_to_markdown(page, gap_threshold=DEFAULT_GAP_THRESHOLD): 267 | """ 268 | Convert a page of a PDF file to markdown format. 269 | 270 | Args: 271 | page (fitz.Page): The page object to convert. 272 | gap_threshold (int, optional): The threshold for vertical gaps between text blocks. Defaults to 10. 273 | 274 | Returns: 275 | str: The markdown representation of the page. 
276 | """ 277 | blocks = page.get_text("blocks") 278 | blocks.sort(key=lambda block: (block[1], block[0])) 279 | 280 | markdown_page = [] 281 | previous_block_bottom = 0 282 | 283 | for block in blocks: 284 | y0 = block[1] 285 | y1 = block[3] 286 | block_text = block[4] 287 | 288 | # Check if there's a large vertical gap between this block and the previous one 289 | if y0 - previous_block_bottom > gap_threshold: 290 | markdown_page.append("") 291 | 292 | markdown_page.append(block_text) 293 | previous_block_bottom = y1 294 | 295 | return "\n".join(markdown_page) 296 | 297 | 298 | def process_pages(pdf_file, pages_list=None, use_llm_for_all=False, drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD, gap_threshold=DEFAULT_GAP_THRESHOLD): 299 | """ 300 | Process the pages of a PDF file to determine which ones are visual and which ones are text-based. 301 | 302 | Args: 303 | pdf_file (io.BytesIO): The PDF file object. 304 | pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. 305 | use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False. 306 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 307 | gap_threshold (int): The threshold for vertical gaps between text blocks. 308 | 309 | Returns: 310 | tuple: A tuple containing a list of markdown-formatted pages and a dictionary of image files. 
311 | """ 312 | doc = fitz.open(stream=pdf_file.read(), filetype="pdf") 313 | 314 | pages_list = pages_list or list(range(1, doc.page_count + 1)) # Default to all pages if not provided 315 | 316 | # List to store markdown content for each page 317 | markdown_pages = [None] * len(pages_list) 318 | 319 | image_files = {} 320 | for page_num in pages_list: 321 | page = doc.load_page(page_num - 1) 322 | if not use_llm_for_all and not is_visual_page(page, drawing_area_threshold=drawing_area_threshold): 323 | logging.debug(f"The content of Page {page.number + 1} will be extracted using text parsing.") 324 | # Extract text using traditional OCR 325 | markdown_content = page_to_markdown(page, gap_threshold=gap_threshold) 326 | if markdown_content: 327 | markdown_pages[page_num - 1] = markdown_content 328 | else: 329 | logging.warning(f"Page {page.number + 1} is empty or contains no text.") 330 | markdown_pages[page_num - 1] = f"Page {page.number + 1} is empty or contains no text." 331 | 332 | else: 333 | logging.debug(f"The content of page {page.number + 1} will be extracted using the LLM.") 334 | # Convert page to image 335 | image_file = page_to_image(page) 336 | image_files[page_num - 1] = io.BytesIO(image_file) 337 | 338 | return markdown_pages, image_files 339 | 340 | 341 | def ocr( 342 | pdf_file, 343 | api_key = None, 344 | model="gpt-4o", 345 | base_url='https://api.openai.com/v1', 346 | prompt=DEFAULT_PROMPT, 347 | pages_list=None, 348 | use_llm_for_all=False, 349 | drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD, 350 | gap_threshold=DEFAULT_GAP_THRESHOLD, 351 | logging_level=logging.INFO, 352 | **kwargs 353 | ): 354 | """ 355 | Convert a PDF file to a list of markdown-formatted pages using text parsing and OpenAI's API. 356 | The OpenAI API is called in parallel using threading for each image file. 357 | This function is synchronous. 358 | 359 | Args: 360 | pdf_file (io.BytesIO): The PDF file object. 361 | api_key (str): The OpenAI API key. 
362 | model (str, optional): by default is gpt-4o 363 | base_url (str): You can use this one to point the client whereever you need it like Ollama 364 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 365 | pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. 366 | use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. Defaults to False. 367 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 368 | gap_threshold (int): The threshold for vertical gaps between text blocks. 369 | logging_level (int): The logging level. Defaults to logging.INFO. 370 | **kwargs: Additional keyword arguments. 371 | 372 | Returns: 373 | list: A list of strings, each containing the markdown representation of a PDF page. 374 | """ 375 | # Set up logging 376 | logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s') 377 | 378 | client = get_openai_client(api_key=api_key, base_url=base_url, **kwargs) 379 | 380 | # Identify the maximum number of workers for parallel processing 381 | max_workers = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None) 382 | if max_workers: 383 | logging.debug("The maximum number of concurrent requests is set to %s", max_workers) 384 | max_workers = int(max_workers) 385 | 386 | markdown_pages, image_files = process_pages( 387 | pdf_file, 388 | pages_list=pages_list, 389 | use_llm_for_all=use_llm_for_all, 390 | drawing_area_threshold=drawing_area_threshold, 391 | gap_threshold=gap_threshold 392 | ) 393 | 394 | if image_files: 395 | if max_workers: 396 | executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) 397 | else: 398 | executor = concurrent.futures.ThreadPoolExecutor() 399 | 400 | # Process each image file in parallel 401 | with executor: 402 | # Submit tasks 
for each image file 403 | future_to_page = {executor.submit(image_to_markdown, img_file, client, model, prompt): page_num 404 | for page_num, img_file in image_files.items()} 405 | 406 | # Collect results as they complete 407 | for future in concurrent.futures.as_completed(future_to_page): 408 | page_num = future_to_page[future] 409 | try: 410 | markdown_content = future.result() 411 | if markdown_content: 412 | markdown_pages[page_num] = markdown_content 413 | else: 414 | markdown_pages[page_num] = f"Error processing page {page_num + 1}." 415 | except Exception as e: 416 | logging.error(f"Error processing page {page_num + 1}: {e}") 417 | markdown_pages[page_num] = f"Error processing page {page_num + 1}: {str(e)}" 418 | 419 | return markdown_pages 420 | 421 | 422 | async def ocr_async( 423 | pdf_file, 424 | api_key = None, 425 | model="gpt-4o", 426 | base_url='https://api.openai.com/v1', 427 | prompt=DEFAULT_PROMPT, 428 | pages_list=None, 429 | use_llm_for_all=False, 430 | drawing_area_threshold=DEFAULT_DRAWING_AREA_THRESHOLD, 431 | gap_threshold=DEFAULT_GAP_THRESHOLD, 432 | logging_level=logging.INFO, 433 | **kwargs 434 | ): 435 | """ 436 | Convert a PDF file to a list of markdown-formatted pages using text parsing and OpenAI's API. 437 | The OpenAI API is called asynchronously for each image file. 438 | This function is asynchronous. 439 | 440 | Args: 441 | pdf_file (io.BytesIO): The PDF file object. 442 | api_key (str): The OpenAI API key. 443 | model (str, optional): by default is gpt-4o 444 | base_url (str): You can use this one to point the client whereever you need it like Ollama 445 | prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. 446 | pages_list (list, optional): A list of page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. 447 | use_llm_for_all (bool, optional): If True, all pages will be processed using the LLM, regardless of visual content. 
Defaults to False. 448 | drawing_area_threshold (float): Minimum fraction of page area that drawings must cover to be visual. 449 | gap_threshold (int): The threshold for vertical gaps between text blocks. 450 | logging_level (int): The logging level. Defaults to logging.INFO. 451 | **kwargs: Additional keyword arguments. 452 | 453 | Returns: 454 | list: A list of strings, each containing the markdown representation of a PDF page. 455 | """ 456 | # Set up logging 457 | logging.basicConfig(level=logging_level, format='%(asctime)s - %(levelname)s - %(message)s') 458 | 459 | client = get_openai_client(api_key=api_key, base_url=base_url, is_async=True, **kwargs) 460 | 461 | # Set up a semaphore for limiting concurrent requests if specified 462 | semaphore = None 463 | max_concurrent_requests = os.getenv("AIPDF_MAX_CONCURRENT_REQUESTS", None) 464 | if max_concurrent_requests: 465 | logging.debug("The maximum number of concurrent requests is set to %s", max_concurrent_requests) 466 | max_concurrent_requests = int(max_concurrent_requests) 467 | semaphore = asyncio.Semaphore(max_concurrent_requests) 468 | 469 | markdown_pages, image_files = process_pages( 470 | pdf_file, 471 | pages_list=pages_list, 472 | use_llm_for_all=use_llm_for_all, 473 | drawing_area_threshold=drawing_area_threshold, 474 | gap_threshold=gap_threshold 475 | ) 476 | 477 | if image_files: 478 | # Process each image file in parallel 479 | tasks = [] 480 | 481 | async def task_wrapper(img_file, page_num): 482 | if semaphore: 483 | async with semaphore: 484 | markdown_content = await image_to_markdown_async(img_file, client, model, prompt) 485 | else: 486 | markdown_content = await image_to_markdown_async(img_file, client, model, prompt) 487 | return page_num, markdown_content 488 | 489 | tasks = [task_wrapper(img_file, page_num) for page_num, img_file in image_files.items()] 490 | 491 | # Collect results as they complete 492 | results = await asyncio.gather(*tasks) 493 | 494 | for page_num, 
markdown_content in results: 495 | if markdown_content: 496 | markdown_pages[page_num] = markdown_content 497 | else: 498 | markdown_pages[page_num] = f"Error processing page {page_num + 1}." 499 | 500 | return markdown_pages 501 | -------------------------------------------------------------------------------- /tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindsdb/aipdf/38650dc305218ee4ee37ee7eea61a5b88172b35b/tests/integration_tests/__init__.py -------------------------------------------------------------------------------- /tests/integration_tests/files/catalogue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindsdb/aipdf/38650dc305218ee4ee37ee7eea61a5b88172b35b/tests/integration_tests/files/catalogue.pdf -------------------------------------------------------------------------------- /tests/integration_tests/files/dictionary-1-5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mindsdb/aipdf/38650dc305218ee4ee37ee7eea61a5b88172b35b/tests/integration_tests/files/dictionary-1-5.pdf -------------------------------------------------------------------------------- /tests/integration_tests/test_aipdf.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import os 4 | import time 5 | import unittest 6 | 7 | from src.aipdf.ocr import ocr, ocr_async 8 | 9 | 10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 11 | 12 | 13 | class TestOCRIntegration(unittest.TestCase): 14 | def setUp(self): 15 | # Path to the directory containing test PDF files 16 | self.files_dir = os.path.join(os.path.dirname(__file__), "files") 17 | 18 | def test_ocr_on_sample_pdfs(self): 19 | # Iterate through all PDF files in the files 
directory 20 | for file_name in os.listdir(self.files_dir): 21 | if file_name.endswith(".pdf"): 22 | file_path = os.path.join(self.files_dir, file_name) 23 | with open(file_path, "rb") as pdf_file: 24 | pdf_bytes = io.BytesIO(pdf_file.read()) 25 | 26 | start_time = time.time() 27 | result = ocr(pdf_bytes) 28 | elapsed_time = time.time() - start_time 29 | logging.info(f"Processed {file_name} in {elapsed_time:.2f} seconds") 30 | 31 | self.assertIsInstance(result, list) 32 | self.assertGreater(len(result), 0, f"Result is empty for file: {file_name}") 33 | for page_content in result: 34 | self.assertIsInstance(page_content, str) 35 | self.assertGreater(len(page_content.strip()), 0, f"Page content is empty for file: {file_name}") 36 | 37 | 38 | class TestOCRAsyncIntegration(unittest.IsolatedAsyncioTestCase): 39 | def setUp(self): 40 | # Path to the directory containing test PDF files 41 | self.files_dir = os.path.join(os.path.dirname(__file__), "files") 42 | 43 | async def test_ocr_async_on_sample_pdfs(self): 44 | # Iterate through all PDF files in the files directory 45 | for file_name in os.listdir(self.files_dir): 46 | if file_name.endswith(".pdf"): 47 | file_path = os.path.join(self.files_dir, file_name) 48 | with open(file_path, "rb") as pdf_file: 49 | pdf_bytes = io.BytesIO(pdf_file.read()) 50 | 51 | start_time = time.time() 52 | result = await ocr_async(pdf_bytes) 53 | elapsed_time = time.time() - start_time 54 | logging.info(f"Processed {file_name} in {elapsed_time:.2f} seconds") 55 | 56 | self.assertIsInstance(result, list) 57 | self.assertGreater(len(result), 0, f"Result is empty for file: {file_name}") 58 | for page_content in result: 59 | self.assertIsInstance(page_content, str) 60 | self.assertGreater(len(page_content.strip()), 0, f"Page content is empty for file: {file_name}") 61 | 62 | 63 | if __name__ == "__main__": 64 | unittest.main() -------------------------------------------------------------------------------- /tests/unit_tests/__init__.py: 
import unittest
from unittest.mock import MagicMock, patch
import io
from aipdf.ocr import (
    image_to_markdown,
    is_visual_page,
    page_to_image,
    page_to_markdown,
    ocr,
)


class TestPageToImage(unittest.TestCase):
    """page_to_image() should return the raw bytes of the page's pixmap."""

    @patch("fitz.Page")
    def test_page_to_image(self, mock_page):
        # Mock page pixmap
        mock_pixmap = MagicMock()
        mock_pixmap.tobytes.return_value = b"fake image bytes"
        mock_page.get_pixmap.return_value = mock_pixmap

        result = page_to_image(mock_page)

        self.assertEqual(result, b"fake image bytes")
        mock_page.get_pixmap.assert_called_once()


class TestPageToMarkdown(unittest.TestCase):
    """page_to_markdown() should join text-block strings with newlines."""

    @patch("fitz.Page")
    def test_page_to_markdown(self, mock_page):
        # Mock page text blocks: (x0, y0, x1, y1, text) tuples as returned
        # by fitz's get_text("blocks").
        mock_page.get_text.return_value = [
            (0, 0, 100, 50, "Header"),
            (0, 60, 100, 100, "Body text"),
        ]

        result = page_to_markdown(mock_page)

        self.assertEqual(result, "Header\nBody text")
        mock_page.get_text.assert_called_once_with("blocks")


class TestIsVisualPage(unittest.TestCase):
    """is_visual_page() should flag pages containing images or drawings."""

    @patch("fitz.Page")
    def test_is_visual_page_with_images(self, mock_page):
        # Mock page with images
        mock_page.get_images.return_value = [("image1",)]

        result = is_visual_page(mock_page)

        self.assertTrue(result)

    @patch("fitz.Page")
    def test_is_visual_page_with_drawings(self, mock_page):
        # Mock page with drawings but no images
        mock_page.rect.width = 100
        mock_page.rect.height = 100
        mock_page.get_images.return_value = []

        # Drawing rect matching the page size, so the drawing is
        # considered significant visual content.
        mock_rect = MagicMock()
        mock_rect.width = 100
        mock_rect.height = 100
        mock_drawing = {"rect": mock_rect}
        # Fixed: the original set this return value twice in a row.
        mock_page.get_drawings.return_value = [mock_drawing]

        result = is_visual_page(mock_page)

        self.assertTrue(result)

    @patch("fitz.Page")
    def test_is_visual_page_with_no_visual_content(self, mock_page):
        # Mock page with no images or drawings
        mock_page.rect.width = 100
        mock_page.rect.height = 100
        mock_page.get_images.return_value = []
        mock_page.get_drawings.return_value = []
        mock_page.get_text.return_value = "Some text"

        result = is_visual_page(mock_page)

        self.assertFalse(result)


class TestImageToMarkdown(unittest.TestCase):
    """image_to_markdown() wraps the OpenAI chat completion call."""

    @patch("openai.OpenAI")
    def test_image_to_markdown_success(self, mock_openai):
        # Mock OpenAI client response
        mock_client = MagicMock()
        mock_openai.return_value = mock_client
        mock_client.chat.completions.create.return_value = MagicMock(
            choices=[MagicMock(message=MagicMock(content="Markdown content"))]
        )

        file_object = io.BytesIO(b"fake image data")
        result = image_to_markdown(file_object, mock_client)

        self.assertEqual(result, "Markdown content")
        mock_client.chat.completions.create.assert_called_once()

    @patch("openai.OpenAI")
    def test_image_to_markdown_failure(self, mock_openai):
        # Mock OpenAI client to raise an exception; the helper is expected
        # to swallow it and return None rather than propagate.
        mock_client = MagicMock()
        mock_openai.return_value = mock_client
        mock_client.chat.completions.create.side_effect = Exception("API error")

        file_object = io.BytesIO(b"fake image data")
        result = image_to_markdown(file_object, mock_client)

        self.assertIsNone(result)


class TestOCR(unittest.TestCase):
    """ocr() should extract text pages without calling the vision model."""

    @patch("fitz.open")
    @patch("openai.OpenAI")
    def test_ocr_with_text_pages(self, mock_openai, mock_fitz_open):
        # Mock the PDF document with a single text-only page
        mock_doc = MagicMock()
        mock_page = MagicMock()
        mock_page.rect.width = 100
        mock_page.rect.height = 100
        mock_doc.page_count = 1
        mock_doc.load_page.return_value = mock_page
        mock_fitz_open.return_value = mock_doc

        # get_text() is called both with no args ("text") and with "blocks";
        # return the matching shape for each.
        def mock_get_text(arg="text"):
            if arg == "text":
                return "Header\nBody text"
            elif arg == "blocks":
                return [
                    (0, 0, 100, 50, "Header"),
                    (0, 60, 100, 100, "Body text"),
                ]
            else:
                return ""
        mock_page.get_text.side_effect = mock_get_text

        # Mock OpenAI client
        mock_client = MagicMock()
        mock_openai.return_value = mock_client

        pdf_file = io.BytesIO(b"Header\nBody text")
        result = ocr(pdf_file, api_key="fake_api_key")

        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], "Header\nBody text")
        mock_page.get_text.assert_any_call()
        mock_page.get_text.assert_any_call("blocks")


if __name__ == "__main__":
    unittest.main()