├── .env.example ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.cpu.yml ├── docker-compose.gpu.yml ├── document_converter ├── __init__.py ├── route.py ├── schema.py ├── service.py └── utils.py ├── images ├── docling.png ├── marker.png ├── original.png ├── pymupdf.png └── pypdf.png ├── main.py ├── poetry.lock ├── pyproject.toml └── worker ├── __init__.py ├── celery_config.py └── tasks.py /.env.example: -------------------------------------------------------------------------------- 1 | # Configuration for Redis connection settings 2 | 3 | # Use this if Redis is hosted externally or deployed separately outside of Docker Compose. 4 | # Replace HOST, PORT and DB with the host address or IP, the port, and the database index of your Redis instance. 5 | # REDIS_HOST="redis://HOST:PORT/DB" 6 | # Example for a hosted Redis: 7 | # REDIS_HOST="redis://redis.example.com:6379/0" 8 | 9 | # If using Docker Compose and Redis is one of the services defined within the docker-compose.yml file, 10 | # you can use the service name 'redis' as the host. This ensures the internal communication between services.
11 | REDIS_HOST=redis://redis:6379/0 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | 10 | # Additional IDE files 11 | *.iml 12 | *.iws 13 | /out/ 14 | /target/ 15 | /build/ 16 | .idea/libraries/ 17 | .idea/modules.xml 18 | .idea/misc.xml 19 | .env 20 | .DS_Store 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | # - repo: https://github.com/adrienverge/yamllint 8 | # rev: v1.32.0 9 | # hooks: 10 | # - id: yamllint 11 | # args: [--format, parsable, --strict] 12 | - repo: https://github.com/psf/black 13 | rev: 23.9.1 14 | hooks: 15 | - id: black 16 | args: ['--line-length', '120', '--target-version', 'py39', '--skip-string-normalization'] 17 | language_version: python3.9 18 | # - repo: https://github.com/PyCQA/flake8 19 | # rev: 6.1.0 20 | # hooks: 21 | # - id: flake8 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a slim Python base image (no CUDA runtime in the base; GPU support comes from the CUDA-enabled PyTorch wheels installed below) 2 | FROM python:3.12-slim-bookworm 3 | 4 | ARG CPU_ONLY=false 5 | WORKDIR /app 6 | 7 | RUN apt-get update \ 8 | && apt-get install -y redis-server libgl1 libglib2.0-0 curl wget git procps \ 9 | && apt-get clean 10 | 11 | # Install Poetry and configure it 12 | RUN pip install poetry \ 13 | && poetry config virtualenvs.create
false 14 | 15 | COPY pyproject.toml poetry.lock ./ 16 | 17 | # Install dependencies before torch 18 | RUN poetry install --no-interaction --no-root 19 | 20 | # Install PyTorch separately based on CPU_ONLY flag 21 | RUN if [ "$CPU_ONLY" = "true" ]; then \ 22 | pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu; \ 23 | else \ 24 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \ 25 | fi 26 | 27 | ENV HF_HOME=/tmp/ \ 28 | TORCH_HOME=/tmp/ \ 29 | OMP_NUM_THREADS=4 30 | 31 | RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);' 32 | 33 | # Pre-download EasyOCR models in compatible groups 34 | RUN python -c 'import easyocr; \ 35 | reader = easyocr.Reader(["fr", "de", "es", "en", "it", "pt"], gpu=True); \ 36 | print("EasyOCR models downloaded successfully")' 37 | 38 | COPY . . 39 | 40 | EXPOSE 8080 41 | 42 | CMD ["poetry", "run", "uvicorn", "--port", "8080", "--host", "0.0.0.0", "main:app"] 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 David Emmanuel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Documents to Markdown Converter Server 2 | 3 | > [!IMPORTANT] 4 | > This backend server is a robust, scalable solution for effortlessly converting a wide range of document formats—including PDF, DOCX, PPTX, CSV, HTML, JPG, PNG, TIFF, BMP, AsciiDoc, and Markdown—into Markdown. Powered by [Docling](https://github.com/DS4SD/docling) (IBM's advanced document parser), this service is built with FastAPI, Celery, and Redis, ensuring fast, efficient processing. Optimized for both CPU and GPU modes, with GPU highly recommended for production environments, this solution offers high performance and flexibility, making it ideal for handling complex document processing at scale. 
5 | 6 | ## Comparison to Other Parsing Libraries 7 | 8 | | Original PDF | 9 | |--------------| 10 | | | 11 | 12 | | Docling-API | Marker | 13 | |-------------|--------| 14 | | | | 15 | 16 | | PyPDF | PyMuPDF4LLM | 17 | |-------|-------------| 18 | | | | 19 | 20 | ## Features 21 | - **Multiple Format Support**: Converts various document types including: 22 | - PDF files 23 | - Microsoft Word documents (DOCX) 24 | - PowerPoint presentations (PPTX) 25 | - HTML files 26 | - Images (JPG, PNG, TIFF, BMP) 27 | - AsciiDoc files 28 | - Markdown files 29 | - CSV files 30 | 31 | - **Conversion Capabilities**: 32 | - Text extraction and formatting 33 | - Table detection, extraction and conversion 34 | - Image extraction and processing 35 | - Multi-language OCR support (French, German, Spanish, English, Italian, Portuguese etc) 36 | - Configurable image resolution scaling 37 | 38 | - **API Endpoints**: 39 | - Synchronous single document conversion 40 | - Synchronous batch document conversion 41 | - Asynchronous single document conversion with job tracking 42 | - Asynchronous batch conversion with job tracking 43 | 44 | - **Processing Modes**: 45 | - CPU-only processing for standard deployments 46 | - GPU-accelerated processing for improved performance 47 | - Distributed task processing using Celery 48 | - Task monitoring through Flower dashboard 49 | 50 | ## Environment Setup (Running Locally) 51 | 52 | ### Prerequisites 53 | - Python 3.8 or higher 54 | - Poetry (Python package manager) 55 | - Redis server (for task queue) 56 | 57 | ### 1. Install Poetry (if not already installed) 58 | ```bash 59 | curl -sSL https://install.python-poetry.org | python3 - 60 | ``` 61 | 62 | ### 2. Clone and Setup Project 63 | ```bash 64 | git clone https://github.com/drmingler/docling-api.git 65 | cd docling-api 66 | poetry install 67 | ``` 68 | 69 | ### 3. 
Configure Environment 70 | Create a `.env` file in the project root: 71 | ```bash 72 | REDIS_HOST=redis://localhost:6379/0 73 | ENV=development 74 | ``` 75 | 76 | ### 4. Start Redis Server 77 | Start Redis locally (install if not already installed): 78 | 79 | #### For macOS: 80 | ```bash 81 | brew install redis 82 | brew services start redis 83 | ``` 84 | 85 | #### For Ubuntu/Debian: 86 | ```bash 87 | sudo apt-get install redis-server 88 | sudo service redis-server start 89 | ``` 90 | 91 | ### 5. Start the Application Components 92 | 93 | 1. Start the FastAPI server: 94 | ```bash 95 | poetry run uvicorn main:app --reload --port 8080 96 | ``` 97 | 98 | 2. Start Celery worker (in a new terminal): 99 | ```bash 100 | poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info 101 | ``` 102 | 103 | 3. Start Flower dashboard for monitoring (optional, in a new terminal): 104 | ```bash 105 | poetry run celery -A worker.celery_config flower --port=5555 106 | ``` 107 | 108 | ### 6. Verify Installation 109 | 110 | 1. Check if the API server is running: 111 | ```bash 112 | curl http://localhost:8080/docs 113 | ``` 114 | 115 | 2. Test Celery worker (submits an asynchronous conversion job): 116 | ```bash 117 | curl -X POST "http://localhost:8080/conversion-jobs" \ 118 | -H "accept: application/json" \ 119 | -H "Content-Type: multipart/form-data" \ 120 | -F "document=@/path/to/test.pdf" 121 | ``` 122 | 123 | 3. Access monitoring dashboard: 124 | - Open http://localhost:5555 in your browser to view the Flower dashboard 125 | 126 | ### Development Notes 127 | 128 | - The API documentation is available at http://localhost:8080/docs 129 | - Redis is used as both message broker and result backend for Celery tasks 130 | - The service supports both synchronous and asynchronous document conversion 131 | - For development, the server runs with auto-reload enabled 132 | 133 | ## Environment Setup (Running in Docker) 134 | 135 | 1.
Clone the repository: 136 | ```bash 137 | git clone https://github.com/drmingler/docling-api.git 138 | cd docling-api 139 | ``` 140 | 141 | 2. Create a `.env` file: 142 | ```bash 143 | REDIS_HOST=redis://redis:6379/0 144 | ENV=production 145 | ``` 146 | 147 | ### CPU Mode 148 | To start the service using CPU-only processing, use the following command. You can adjust the number of Celery workers by specifying the --scale option. In this example, 1 worker will be created: 149 | ```bash 150 | docker-compose -f docker-compose.cpu.yml up --build --scale celery_worker=1 151 | ``` 152 | 153 | ### GPU Mode (Recommended for production) 154 | For production, it is recommended to enable GPU acceleration, as it significantly improves performance. Use the command below to start the service with GPU support. You can also scale the number of Celery workers using the --scale option; here, 3 workers will be launched: 155 | ```bash 156 | docker-compose -f docker-compose.gpu.yml up --build --scale celery_worker=3 157 | ``` 158 | 159 | ## Service Components 160 | 161 | The service will start the following components: 162 | 163 | - **API Server**: http://localhost:8080 164 | - **Redis**: redis://localhost:6379 165 | - **Flower Dashboard**: http://localhost:5556 166 | 167 | ## API Usage 168 | 169 | ### Synchronous Conversion 170 | 171 | Convert a single document immediately: 172 | 173 | ```bash 174 | curl -X POST "http://localhost:8080/documents/convert?extract_tables_as_images=true&image_resolution_scale=4" \ 175 | -H "accept: application/json" \ 176 | -H "Content-Type: multipart/form-data" \ 177 | -F "document=@/path/to/document.pdf" 178 | ``` 179 | 180 | Note: `extract_tables_as_images` and `image_resolution_scale` are query parameters, not multipart form fields, so they must be passed in the URL. 181 | 182 | ### Asynchronous Conversion 183 | 184 | 1. Submit a document for conversion: 185 | 186 | ```bash 187 | curl -X POST "http://localhost:8080/conversion-jobs" \ 188 | -H "accept: application/json" \ 189 | -H "Content-Type: multipart/form-data" \ 190 | -F "document=@/path/to/document.pdf" 191 | ``` 192 | 193 | 2.
Check conversion status: 194 | 195 | ```bash 196 | curl -X GET "http://localhost:8080/conversion-jobs/{job_id}" \ 197 | -H "accept: application/json" 198 | ``` 199 | 200 | ### Batch Processing 201 | 202 | Convert multiple documents asynchronously: 203 | 204 | ```bash 205 | curl -X POST "http://localhost:8080/batch-conversion-jobs" \ 206 | -H "accept: application/json" \ 207 | -H "Content-Type: multipart/form-data" \ 208 | -F "documents=@/path/to/document1.pdf" \ 209 | -F "documents=@/path/to/document2.pdf" 210 | ``` 211 | 212 | ## Configuration Options 213 | 214 | - `image_resolution_scale`: Control the resolution of extracted images (1-4) 215 | - `extract_tables_as_images`: Extract tables as images (true/false) 216 | - `CPU_ONLY`: Build argument to switch between CPU/GPU modes 217 | 218 | ## Monitoring 219 | 220 | - Access the Flower dashboard to monitor Celery tasks and workers 221 | - View task status, success/failure rates, and worker performance 222 | - Monitor resource usage and task queues 223 | 224 | ## Architecture 225 | 226 | The service uses a distributed architecture with the following components: 227 | 228 | 1. FastAPI application serving the REST API 229 | 2. Celery workers for distributed task processing 230 | 3. Redis as message broker and result backend 231 | 4. Flower for task monitoring and management 232 | 5. Docling for the file conversion 233 | 234 | ## Performance Considerations 235 | 236 | - GPU mode provides significantly faster processing for large documents 237 | - CPU mode is suitable for smaller deployments or when GPU is not available 238 | - Multiple workers can be scaled horizontally for increased throughput 239 | 240 | ## License 241 | The codebase is under MIT license. 
See LICENSE for more information 242 | 243 | ## Acknowledgements 244 | - [Docling](https://github.com/DS4SD/docling) the state-of-the-art document conversion library by IBM 245 | - [FastAPI](https://fastapi.tiangolo.com/) the web framework 246 | - [Celery](https://docs.celeryq.dev/en/stable/) for distributed task processing 247 | - [Flower](https://flower.readthedocs.io/en/latest/) for monitoring and management 248 | -------------------------------------------------------------------------------- /docker-compose.cpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | celery_worker: 5 | build: 6 | context: . 7 | args: 8 | CPU_ONLY: "true" 9 | image: converter-cpu-image 10 | command: poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info 11 | volumes: 12 | - .:/app 13 | - model_cache:/tmp 14 | environment: 15 | - REDIS_HOST=${REDIS_HOST} 16 | - ENV=production 17 | restart: on-failure 18 | depends_on: 19 | - redis 20 | 21 | app: 22 | container_name: marker-api-cpu 23 | build: 24 | context: . 25 | args: 26 | CPU_ONLY: "true" 27 | image: converter-cpu-image 28 | command: poetry run uvicorn --port 8080 --host 0.0.0.0 main:app 29 | environment: 30 | - REDIS_HOST=${REDIS_HOST} 31 | - ENV=production 32 | - MALLOC_ARENA_MAX=2 33 | - OMP_NUM_THREADS=2 34 | - PYTHONMALLOC=malloc 35 | ports: 36 | - "8080:8080" 37 | volumes: 38 | - .:/app 39 | - model_cache:/tmp 40 | restart: on-failure 41 | depends_on: 42 | - redis 43 | 44 | redis: 45 | container_name: redis 46 | image: redis:7.2.4-alpine 47 | ports: 48 | - "6379:6379" 49 | 50 | flower: 51 | container_name: flower_cpu 52 | build: 53 | context: . 
54 | args: 55 | CPU_ONLY: "true" 56 | image: converter-cpu-image 57 | command: poetry run celery -A worker.celery_config flower --port=5555 58 | ports: 59 | - "5556:5555" 60 | volumes: 61 | - .:/app 62 | - model_cache:/tmp 63 | environment: 64 | - REDIS_HOST=${REDIS_HOST} 65 | - ENV=production 66 | depends_on: 67 | - app 68 | - redis 69 | - celery_worker 70 | 71 | volumes: 72 | model_cache: 73 | -------------------------------------------------------------------------------- /docker-compose.gpu.yml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | services: 4 | celery_worker: 5 | build: 6 | context: . 7 | args: 8 | CPU_ONLY: "false" 9 | image: converter-gpu-image 10 | command: poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info 11 | volumes: 12 | - .:/app 13 | environment: 14 | - REDIS_HOST=${REDIS_HOST} 15 | - ENV=production 16 | deploy: 17 | resources: 18 | reservations: 19 | devices: 20 | - driver: nvidia 21 | count: 1 22 | capabilities: [gpu] 23 | depends_on: 24 | - redis 25 | 26 | app: 27 | container_name: marker-api-gpu 28 | build: 29 | context: . 30 | args: 31 | CPU_ONLY: "false" 32 | image: converter-gpu-image 33 | command: poetry run uvicorn --port 8080 --host 0.0.0.0 main:app 34 | environment: 35 | - REDIS_HOST=${REDIS_HOST} 36 | - ENV=production 37 | - NVIDIA_VISIBLE_DEVICES=all 38 | ports: 39 | - "8080:8080" 40 | volumes: 41 | - .:/app 42 | deploy: 43 | resources: 44 | reservations: 45 | devices: 46 | - driver: nvidia 47 | count: 1 48 | capabilities: [gpu] 49 | depends_on: 50 | - redis 51 | - celery_worker 52 | 53 | redis: 54 | container_name: redis 55 | image: redis:7.2.4-alpine 56 | ports: 57 | - "6379:6379" 58 | 59 | flower: 60 | container_name: flower_gpu 61 | build: 62 | context: . 
63 | args: 64 | CPU_ONLY: "false" 65 | image: converter-gpu-image 66 | command: poetry run celery -A worker.celery_config flower --port=5555 67 | ports: 68 | - "5556:5555" 69 | volumes: 70 | - .:/app 71 | environment: 72 | - REDIS_HOST=${REDIS_HOST} 73 | - ENV=production 74 | depends_on: 75 | - app 76 | - redis 77 | - celery_worker 78 | deploy: 79 | resources: 80 | reservations: 81 | devices: 82 | - driver: nvidia 83 | count: 1 84 | capabilities: [gpu] 85 | -------------------------------------------------------------------------------- /document_converter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/document_converter/__init__.py -------------------------------------------------------------------------------- /document_converter/route.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import List 3 | from fastapi import APIRouter, File, HTTPException, UploadFile, Query 4 | 5 | from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult 6 | from document_converter.service import DocumentConverterService, DoclingDocumentConversion 7 | from document_converter.utils import is_file_format_supported 8 | from worker.tasks import convert_document_task, convert_documents_task 9 | 10 | router = APIRouter() 11 | 12 | # Could be docling or another converter as long as it implements DocumentConversionBase 13 | converter = DoclingDocumentConversion() 14 | document_converter_service = DocumentConverterService(document_converter=converter) 15 | 16 | 17 | # Document direct conversion endpoints 18 | @router.post( 19 | '/documents/convert', 20 | response_model=ConversionResult, 21 | response_model_exclude_unset=True, 22 | description="Convert a single document synchronously", 23 | ) 24 | async def 
convert_single_document( 25 | document: UploadFile = File(...), 26 | extract_tables_as_images: bool = False, 27 | image_resolution_scale: int = Query(4, ge=1, le=4), 28 | ): 29 | file_bytes = await document.read() 30 | if not is_file_format_supported(file_bytes, document.filename): 31 | raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}") 32 | 33 | return document_converter_service.convert_document( 34 | (document.filename, BytesIO(file_bytes)), 35 | extract_tables=extract_tables_as_images, 36 | image_resolution_scale=image_resolution_scale, 37 | ) 38 | 39 | 40 | @router.post( 41 | '/documents/batch-convert', 42 | response_model=List[ConversionResult], 43 | response_model_exclude_unset=True, 44 | description="Convert multiple documents synchronously", 45 | ) 46 | async def convert_multiple_documents( 47 | documents: List[UploadFile] = File(...), 48 | extract_tables_as_images: bool = False, 49 | image_resolution_scale: int = Query(4, ge=1, le=4), 50 | ): 51 | doc_streams = [] 52 | for document in documents: 53 | file_bytes = await document.read() 54 | if not is_file_format_supported(file_bytes, document.filename): 55 | raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}") 56 | doc_streams.append((document.filename, BytesIO(file_bytes))) 57 | 58 | return document_converter_service.convert_documents( 59 | doc_streams, 60 | extract_tables=extract_tables_as_images, 61 | image_resolution_scale=image_resolution_scale, 62 | ) 63 | 64 | 65 | # Asynchronous conversion jobs endpoints 66 | @router.post( 67 | '/conversion-jobs', 68 | response_model=ConversationJobResult, 69 | description="Create a conversion job for a single document", 70 | ) 71 | async def create_single_document_conversion_job( 72 | document: UploadFile = File(...), 73 | extract_tables_as_images: bool = False, 74 | image_resolution_scale: int = Query(4, ge=1, le=4), 75 | ): 76 | file_bytes = await document.read() 77 | if not 
is_file_format_supported(file_bytes, document.filename): 78 | raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}") 79 | 80 | task = convert_document_task.delay( 81 | (document.filename, file_bytes), 82 | extract_tables=extract_tables_as_images, 83 | image_resolution_scale=image_resolution_scale, 84 | ) 85 | 86 | return ConversationJobResult(job_id=task.id, status="IN_PROGRESS") 87 | 88 | 89 | @router.get( 90 | '/conversion-jobs/{job_id}', 91 | response_model=ConversationJobResult, 92 | description="Get the status of a single document conversion job", 93 | response_model_exclude_unset=True, 94 | ) 95 | async def get_conversion_job_status(job_id: str): 96 | return document_converter_service.get_single_document_task_result(job_id) 97 | 98 | 99 | @router.post( 100 | '/batch-conversion-jobs', 101 | response_model=BatchConversionJobResult, 102 | response_model_exclude_unset=True, 103 | description="Create a conversion job for multiple documents", 104 | ) 105 | async def create_batch_conversion_job( 106 | documents: List[UploadFile] = File(...), 107 | extract_tables_as_images: bool = False, 108 | image_resolution_scale: int = Query(4, ge=1, le=4), 109 | ): 110 | """Create a batch conversion job for multiple documents.""" 111 | doc_data = [] 112 | for document in documents: 113 | file_bytes = await document.read() 114 | if not is_file_format_supported(file_bytes, document.filename): 115 | raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}") 116 | doc_data.append((document.filename, file_bytes)) 117 | 118 | task = convert_documents_task.delay( 119 | doc_data, 120 | extract_tables=extract_tables_as_images, 121 | image_resolution_scale=image_resolution_scale, 122 | ) 123 | 124 | return BatchConversionJobResult(job_id=task.id, status="IN_PROGRESS") 125 | 126 | 127 | @router.get( 128 | '/batch-conversion-jobs/{job_id}', 129 | response_model=BatchConversionJobResult, 130 | 
response_model_exclude_unset=True, 131 | description="Get the status of a batch conversion job", 132 | ) 133 | async def get_batch_conversion_job_status(job_id: str): 134 | """Get the status and results of a batch conversion job.""" 135 | return document_converter_service.get_batch_conversion_task_result(job_id) 136 | -------------------------------------------------------------------------------- /document_converter/schema.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Literal, Optional 3 | 4 | 5 | class ImageData(BaseModel): 6 | type: Optional[Literal["table", "picture"]] = Field(None, description="The type of the image") 7 | filename: Optional[str] = Field(None, description="The filename of the image") 8 | image: Optional[str] = Field(None, description="The image data") 9 | 10 | 11 | class ConversionResult(BaseModel): 12 | filename: str = Field(None, description="The filename of the document") 13 | markdown: str = Field(None, description="The markdown content of the document") 14 | images: List[ImageData] = Field(default_factory=list, description="The images in the document") 15 | error: Optional[str] = Field(None, description="The error that occurred during the conversion") 16 | 17 | 18 | class BatchConversionResult(BaseModel): 19 | conversion_results: List[ConversionResult] = Field( 20 | default_factory=list, description="The results of the conversions" 21 | ) 22 | 23 | 24 | class ConversationJobResult(BaseModel): 25 | job_id: Optional[str] = Field(None, description="The id of the conversion job") 26 | result: Optional[ConversionResult] = Field(None, description="The result of the conversion job") 27 | error: Optional[str] = Field(None, description="The error that occurred during the conversion job") 28 | status: Literal["IN_PROGRESS", "SUCCESS", "FAILURE"] = Field(None, description="The status of the conversion job") 29 | 30 | 31 | class 
BatchConversionJobResult(BaseModel): 32 | job_id: str = Field(..., description="The id of the conversion job") 33 | conversion_results: List[ConversationJobResult] = Field( 34 | default_factory=list, description="The results of the conversion job" 35 | ) 36 | status: Literal["IN_PROGRESS", "SUCCESS", "FAILURE"] = Field( 37 | None, description="The status of the entire conversion jobs in the batch" 38 | ) 39 | error: Optional[str] = Field(None, description="If the entire batch failed, this will be the error message") 40 | -------------------------------------------------------------------------------- /document_converter/service.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | from abc import ABC, abstractmethod 4 | from io import BytesIO 5 | from typing import List, Tuple 6 | 7 | from celery.result import AsyncResult 8 | from docling.datamodel.base_models import InputFormat, DocumentStream 9 | from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions 10 | from docling.document_converter import PdfFormatOption, DocumentConverter 11 | from docling_core.types.doc import ImageRefMode, TableItem, PictureItem 12 | from fastapi import HTTPException 13 | 14 | from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult, ImageData 15 | from document_converter.utils import handle_csv_file 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | IMAGE_RESOLUTION_SCALE = 4 19 | 20 | 21 | class DocumentConversionBase(ABC): 22 | @abstractmethod 23 | def convert(self, document: Tuple[str, BytesIO], **kwargs) -> ConversionResult: 24 | pass 25 | 26 | @abstractmethod 27 | def convert_batch(self, documents: List[Tuple[str, BytesIO]], **kwargs) -> List[ConversionResult]: 28 | pass 29 | 30 | 31 | class DoclingDocumentConversion(DocumentConversionBase): 32 | """Document conversion implementation using Docling. 
33 | 34 | You can initialize with default pipeline options or provide your own: 35 | 36 | Example: 37 | ```python 38 | # Using default options 39 | converter = DoclingDocumentConversion() 40 | 41 | # Or customize with your own pipeline options 42 | pipeline_options = PdfPipelineOptions() 43 | pipeline_options.do_ocr = True 44 | pipeline_options.ocr_options = RapidOcrOptions() # Use RapidOcrOptions instead of EasyOCR (note : you need to install the OCR package) 45 | pipeline_options.generate_page_images = True 46 | 47 | converter = DoclingDocumentConversion(pipeline_options=pipeline_options) 48 | ``` 49 | """ 50 | 51 | def __init__(self, pipeline_options: PdfPipelineOptions = None): 52 | self.pipeline_options = pipeline_options if pipeline_options else self._setup_default_pipeline_options() 53 | 54 | def _update_pipeline_options(self, extract_tables: bool, image_resolution_scale: int) -> PdfPipelineOptions: 55 | self.pipeline_options.images_scale = image_resolution_scale 56 | self.pipeline_options.generate_table_images = extract_tables 57 | return self.pipeline_options 58 | 59 | @staticmethod 60 | def _setup_default_pipeline_options() -> PdfPipelineOptions: 61 | pipeline_options = PdfPipelineOptions() 62 | pipeline_options.generate_page_images = False 63 | pipeline_options.generate_picture_images = True 64 | pipeline_options.ocr_options = EasyOcrOptions(lang=["fr", "de", "es", "en", "it", "pt"]) 65 | 66 | return pipeline_options 67 | 68 | @staticmethod 69 | def _process_document_images(conv_res) -> Tuple[str, List[ImageData]]: 70 | images = [] 71 | table_counter = 0 72 | picture_counter = 0 73 | content_md = conv_res.document.export_to_markdown(image_mode=ImageRefMode.PLACEHOLDER) 74 | 75 | for element, _level in conv_res.document.iterate_items(): 76 | if isinstance(element, (TableItem, PictureItem)) and element.image: 77 | img_buffer = BytesIO() 78 | element.image.pil_image.save(img_buffer, format="PNG") 79 | 80 | if isinstance(element, TableItem): 81 | 
table_counter += 1 82 | image_name = f"table-{table_counter}.png" 83 | image_type = "table" 84 | else: 85 | picture_counter += 1 86 | image_name = f"picture-{picture_counter}.png" 87 | image_type = "picture" 88 | content_md = content_md.replace("", image_name, 1) 89 | 90 | image_bytes = base64.b64encode(img_buffer.getvalue()).decode('utf-8') 91 | images.append(ImageData(type=image_type, filename=image_name, image=image_bytes)) 92 | 93 | return content_md, images 94 | 95 | def convert( 96 | self, 97 | document: Tuple[str, BytesIO], 98 | extract_tables: bool = False, 99 | image_resolution_scale: int = IMAGE_RESOLUTION_SCALE, 100 | ) -> ConversionResult: 101 | filename, file = document 102 | pipeline_options = self._update_pipeline_options(extract_tables, image_resolution_scale) 103 | doc_converter = DocumentConverter( 104 | format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)} 105 | ) 106 | 107 | if filename.lower().endswith('.csv'): 108 | file, error = handle_csv_file(file) 109 | if error: 110 | return ConversionResult(filename=filename, error=error) 111 | 112 | conv_res = doc_converter.convert(DocumentStream(name=filename, stream=file), raises_on_error=False) 113 | doc_filename = conv_res.input.file.stem 114 | 115 | if conv_res.errors: 116 | logging.error(f"Failed to convert {filename}: {conv_res.errors[0].error_message}") 117 | return ConversionResult(filename=doc_filename, error=conv_res.errors[0].error_message) 118 | 119 | content_md, images = self._process_document_images(conv_res) 120 | return ConversionResult(filename=doc_filename, markdown=content_md, images=images) 121 | 122 | def convert_batch( 123 | self, 124 | documents: List[Tuple[str, BytesIO]], 125 | extract_tables: bool = False, 126 | image_resolution_scale: int = IMAGE_RESOLUTION_SCALE, 127 | ) -> List[ConversionResult]: 128 | pipeline_options = self._update_pipeline_options(extract_tables, image_resolution_scale) 129 | doc_converter = DocumentConverter( 130 | 
class DocumentConverterService:
    """Service layer around a ``DocumentConversionBase`` implementation.

    Provides synchronous conversion, the byte-based entry points used by the
    worker tasks, and lookups of Celery job results by job id.
    """

    def __init__(self, document_converter: DocumentConversionBase):
        self.document_converter = document_converter

    def convert_document(self, document: Tuple[str, BytesIO], **kwargs) -> ConversionResult:
        """Convert one document synchronously.

        Raises:
            HTTPException: With status 500 when the conversion result carries an error.
        """
        result = self.document_converter.convert(document, **kwargs)
        if result.error:
            logging.error(f"Failed to convert {document[0]}: {result.error}")
            raise HTTPException(status_code=500, detail=result.error)
        return result

    def convert_documents(self, documents: List[Tuple[str, BytesIO]], **kwargs) -> List[ConversionResult]:
        """Convert several documents; per-document errors stay inside the returned results."""
        return self.document_converter.convert_batch(documents, **kwargs)

    def convert_document_task(
        self,
        document: Tuple[str, bytes],
        **kwargs,
    ) -> ConversionResult:
        """Wrap the raw bytes of one document in a ``BytesIO`` and convert it."""
        stream_document = (document[0], BytesIO(document[1]))
        return self.document_converter.convert(stream_document, **kwargs)

    def convert_documents_task(
        self,
        documents: List[Tuple[str, bytes]],
        **kwargs,
    ) -> List[ConversionResult]:
        """Wrap the raw bytes of every document in a ``BytesIO`` and convert the batch."""
        stream_documents = [(filename, BytesIO(file)) for filename, file in documents]
        return self.document_converter.convert_batch(stream_documents, **kwargs)

    def get_single_document_task_result(self, job_id: str) -> ConversationJobResult:
        """Get the status and result of a document conversion job.

        Returns:
            - IN_PROGRESS: When the task has not reached a final state yet
            - SUCCESS: When conversion completed successfully
            - FAILURE: When task failed or conversion had errors
        """
        task = AsyncResult(job_id)

        # ready() is False for every non-final state (pending, received, started,
        # waiting for retry, custom states). Comparing against 'PENDING' alone
        # reported those running jobs as FAILURE.
        if not task.ready():
            return ConversationJobResult(job_id=job_id, status="IN_PROGRESS")

        if task.successful():
            result = task.get()
            # The task itself succeeded, but the conversion may still have failed.
            if result.get('error'):
                return ConversationJobResult(job_id=job_id, status="FAILURE", error=result['error'])

            return ConversationJobResult(job_id=job_id, status="SUCCESS", result=ConversionResult(**result))

        return ConversationJobResult(job_id=job_id, status="FAILURE", error=str(task.result))

    def get_batch_conversion_task_result(self, job_id: str) -> BatchConversionJobResult:
        """Get the status and results of a batch conversion job.

        Returns:
            - IN_PROGRESS: When the task has not reached a final state yet
            - SUCCESS: A batch is successful as long as the task is successful
            - FAILURE: When the task fails for any reason
        """
        task = AsyncResult(job_id)

        # Same rule as the single-document lookup: any non-final state is in progress.
        if not task.ready():
            return BatchConversionJobResult(job_id=job_id, status="IN_PROGRESS")

        if task.successful():
            job_results = []
            # Each document gets its own status; one failed document does not fail the batch.
            for result in task.get():
                if result.get('error'):
                    job_result = ConversationJobResult(status="FAILURE", error=result['error'])
                else:
                    job_result = ConversationJobResult(
                        status="SUCCESS", result=ConversionResult(**result).model_dump(exclude_unset=True)
                    )
                job_results.append(job_result)

            return BatchConversionJobResult(job_id=job_id, status="SUCCESS", conversion_results=job_results)

        return BatchConversionJobResult(job_id=job_id, status="FAILURE", error=str(task.result))
def is_csv_file(filename: Optional[str]) -> bool:
    """Check if a file is a CSV based on its extension (case-insensitive).

    Always returns a real ``bool``: a missing or empty filename yields ``False``
    rather than leaking ``None`` or ``""`` through the ``and`` expression.
    """
    return bool(filename) and filename.lower().endswith('.csv')
def handle_csv_file(file: BytesIO) -> Tuple[BytesIO, Optional[str]]:
    """Normalize a CSV stream to UTF-8 by trying several source encodings.

    The stream is read once from the start and decoded with the first encoding
    that accepts it. Order matters:

    * ``utf-8-sig`` decodes plain UTF-8 and also drops a leading BOM, so the
      BOM does not end up inside the first header cell.
    * ``cp1252`` must come before ``latin1``: ``latin1`` accepts every byte
      sequence, so anything listed after it is never tried, and Windows-1252
      punctuation would be decoded as C1 control characters.
    * ``latin1`` is the last-resort fallback for bytes ``cp1252`` leaves undefined.

    Returns:
        Tuple[BytesIO, Optional[str]]: (processed file, error message if any)
    """
    SUPPORTED_CSV_ENCODINGS = ['utf-8-sig', 'cp1252', 'latin1']

    file.seek(0)
    raw = file.read()

    for encoding in SUPPORTED_CSV_ENCODINGS:
        try:
            content = raw.decode(encoding)
        except UnicodeDecodeError:
            continue
        return BytesIO(content.encode('utf-8')), None

    # Safety net only: latin1 decodes any input, so this is not expected to run.
    return file, f"Could not decode CSV file. Supported encodings: {', '.join(SUPPORTED_CSV_ENCODINGS)}"
"""FastAPI application wiring for the document converter routes."""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from document_converter.route import router as document_converter_router

# Fully permissive CORS policy.
# NOTE(review): wildcard origins together with allow_credentials=True is very
# open; confirm this is intended for deployed environments.
_cors_policy = dict(
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
    allow_credentials=True,
)

app = FastAPI()
app.add_middleware(CORSMiddleware, **_cors_policy)

# Routes are mounted at the root path (empty prefix).
app.include_router(document_converter_router, prefix="", tags=["document-converter"])
import os

from celery import Celery
from dotenv import load_dotenv

# Pull settings from a local .env file before the environment is read.
load_dotenv(".env")

# REDIS_HOST holds a full Redis URL (not just a host name); the same URL is
# used for both the broker and the result backend.
_redis_url = os.environ.get("REDIS_HOST", "redis://localhost:6379/0")

celery_app = Celery(
    "document_converter",
    broker=_redis_url,
    backend=_redis_url,
    include=["worker.tasks"],
)
@celery_app.task(bind=True, name="convert_documents")
def convert_documents_task(
    self,
    documents: List[Tuple[str, bytes]],
    extract_tables: bool = False,
    image_resolution_scale: int = IMAGE_RESOLUTION_SCALE,
) -> List[Dict[str, Any]]:
    """Celery task: convert a batch of ``(filename, raw bytes)`` documents.

    A fresh ``DocumentConverterService`` backed by ``DoclingDocumentConversion``
    is built on every invocation. Each conversion result is returned as a plain
    dict via ``model_dump(exclude_unset=True)``.
    """
    converter = DoclingDocumentConversion()
    service = DocumentConverterService(document_converter=converter)
    conversions = service.convert_documents_task(
        documents,
        extract_tables=extract_tables,
        image_resolution_scale=image_resolution_scale,
    )

    payload = []
    for conversion in conversions:
        payload.append(conversion.model_dump(exclude_unset=True))
    return payload