├── .env.example
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── README.md
├── docker-compose.cpu.yml
├── docker-compose.gpu.yml
├── document_converter
    ├── __init__.py
    ├── route.py
    ├── schema.py
    ├── service.py
    └── utils.py
├── images
    ├── docling.png
    ├── marker.png
    ├── original.png
    ├── pymupdf.png
    └── pypdf.png
├── main.py
├── poetry.lock
├── pyproject.toml
└── worker
    ├── __init__.py
    ├── celery_config.py
    └── tasks.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # Configuration for Redis connection settings
 2 | 
 3 | # Use this if Redis is hosted externally or deployed separately outside of Docker Compose.
 4 | # Replace the <redis_host>, <redis_port> and <database_number> placeholders with the values of your Redis instance.
 5 | # REDIS_HOST="redis://<redis_host>:<redis_port>/<database_number>"
 6 | # Example for a hosted Redis:
 7 | # REDIS_HOST="redis://redis.example.com:6379/0"
 8 | 
 9 | # If using Docker Compose and Redis is one of the services defined within the docker-compose.yml file,
10 | # you can use the service name 'redis' as the host. This ensures the internal communication between services.
11 | REDIS_HOST=redis://redis:6379/0
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Default ignored files
 2 | /shelf/
 3 | /workspace.xml
 4 | # Editor-based HTTP Client requests
 5 | /httpRequests/
 6 | # Datasource local storage ignored files
 7 | /dataSources/
 8 | /dataSources.local.xml
 9 | 
10 | # Additional IDE files
11 | *.iml
12 | *.iws
13 | /out/
14 | /target/
15 | /build/
16 | .idea/libraries/
17 | .idea/modules.xml
18 | .idea/misc.xml
19 | .env
20 | .DS_Store
21 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.4.0
 4 |     hooks:
 5 |       - id: end-of-file-fixer
 6 |       - id: trailing-whitespace
 7 | #  - repo: https://github.com/adrienverge/yamllint
 8 | #    rev: v1.32.0
 9 | #    hooks:
10 | #      - id: yamllint
11 | #        args: [--format, parsable, --strict]
12 |   - repo: https://github.com/psf/black
13 |     rev: 23.9.1
14 |     hooks:
15 |       - id: black
16 |         args: ['--line-length', '120', '--target-version', 'py39', '--skip-string-normalization']
17 |         language_version: python3.9
18 | #  - repo: https://github.com/PyCQA/flake8
19 | #    rev: 6.1.0
20 | #    hooks:
21 | #    - id: flake8
22 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Slim Python base image (no CUDA toolkit); PyTorch CPU or CUDA 12.1 wheels are installed below per CPU_ONLY
 2 | FROM python:3.12-slim-bookworm
 3 | 
 4 | ARG CPU_ONLY=false
 5 | WORKDIR /app
 6 | 
 7 | RUN apt-get update \
 8 |     && apt-get install -y redis-server libgl1 libglib2.0-0 curl wget git procps \
 9 |     && apt-get clean && rm -rf /var/lib/apt/lists/*
10 | 
11 | # Install Poetry and configure it to install into the system interpreter (no virtualenv)
12 | RUN pip install --no-cache-dir poetry \
13 |     && poetry config virtualenvs.create false
14 | 
15 | COPY pyproject.toml poetry.lock ./
16 | 
17 | # Install dependencies before torch
18 | RUN poetry install --no-interaction --no-root
19 | 
20 | # Install PyTorch separately based on CPU_ONLY flag (--no-cache-dir in both branches keeps the layer small)
21 | RUN if [ "$CPU_ONLY" = "true" ]; then \
22 |     pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cpu; \
23 |     else \
24 |     pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
25 |     fi
26 | 
27 | ENV HF_HOME=/tmp/ \
28 |     TORCH_HOME=/tmp/ \
29 |     OMP_NUM_THREADS=4
30 | 
31 | RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);'
32 | 
33 | # Pre-download the EasyOCR models for the languages configured in document_converter/service.py
34 | RUN python -c 'import easyocr; \
35 |     reader = easyocr.Reader(["fr", "de", "es", "en", "it", "pt"], gpu=True); \
36 |     print("EasyOCR models downloaded successfully")'
37 | 
38 | COPY . .
39 | 
40 | EXPOSE 8080
41 | 
42 | CMD ["poetry", "run", "uvicorn", "--port", "8080", "--host", "0.0.0.0", "main:app"]
43 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 David Emmanuel
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Documents to Markdown Converter Server
  2 | 
  3 | > [!IMPORTANT]
  4 | > This backend server is a robust, scalable solution for effortlessly converting a wide range of document formats—including PDF, DOCX, PPTX, CSV, HTML, JPG, PNG, TIFF, BMP, AsciiDoc, and Markdown—into Markdown. Powered by [Docling](https://github.com/DS4SD/docling) (IBM's advanced document parser), this service is built with FastAPI, Celery, and Redis, ensuring fast, efficient processing. Optimized for both CPU and GPU modes, with GPU highly recommended for production environments, this solution offers high performance and flexibility, making it ideal for handling complex document processing at scale.
  5 | 
  6 | ## Comparison to Other Parsing Libraries
  7 | 
  8 | | Original PDF |
  9 | |--------------|
 10 | | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/original.png" width="500"/> |
 11 | 
 12 | | Docling-API | Marker |
 13 | |-------------|--------|
 14 | | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/docling.png" width="500"/> | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/marker.png" width="500"/> |
 15 | 
 16 | | PyPDF | PyMuPDF4LLM |
 17 | |-------|-------------|
 18 | | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/pypdf.png" width="500"/> | <img src="https://raw.githubusercontent.com/drmingler/docling-api/refs/heads/main/images/pymupdf.png" width="500"/> |
 19 | 
 20 | ## Features
 21 | - **Multiple Format Support**: Converts various document types including:
 22 |   - PDF files
 23 |   - Microsoft Word documents (DOCX)
 24 |   - PowerPoint presentations (PPTX)
 25 |   - HTML files
 26 |   - Images (JPG, PNG, TIFF, BMP)
 27 |   - AsciiDoc files
 28 |   - Markdown files
 29 |   - CSV files
 30 | 
 31 | - **Conversion Capabilities**:
 32 |   - Text extraction and formatting
 33 |   - Table detection, extraction and conversion
 34 |   - Image extraction and processing
 35 |   - Multi-language OCR support (French, German, Spanish, English, Italian, Portuguese, etc.)
 36 |   - Configurable image resolution scaling
 37 | 
 38 | - **API Endpoints**:
 39 |   - Synchronous single document conversion
 40 |   - Synchronous batch document conversion
 41 |   - Asynchronous single document conversion with job tracking
 42 |   - Asynchronous batch conversion with job tracking
 43 | 
 44 | - **Processing Modes**:
 45 |   - CPU-only processing for standard deployments
 46 |   - GPU-accelerated processing for improved performance
 47 |   - Distributed task processing using Celery
 48 |   - Task monitoring through Flower dashboard
 49 | 
 50 | ## Environment Setup (Running Locally)
 51 | 
 52 | ### Prerequisites
 53 | - Python 3.8 or higher
 54 | - Poetry (Python package manager)
 55 | - Redis server (for task queue)
 56 | 
 57 | ### 1. Install Poetry (if not already installed)
 58 | ```bash
 59 | curl -sSL https://install.python-poetry.org | python3 -
 60 | ```
 61 | 
 62 | ### 2. Clone and Setup Project
 63 | ```bash
 64 | git clone https://github.com/drmingler/docling-api.git
 65 | cd docling-api
 66 | poetry install
 67 | ```
 68 | 
 69 | ### 3. Configure Environment
 70 | Create a `.env` file in the project root:
 71 | ```bash
 72 | REDIS_HOST=redis://localhost:6379/0
 73 | ENV=development
 74 | ```
 75 | 
 76 | ### 4. Start Redis Server
 77 | Start Redis locally (install if not already installed):
 78 | 
 79 | #### For MacOS:
 80 | ```bash
 81 | brew install redis
 82 | brew services start redis
 83 | ```
 84 | 
 85 | #### For Ubuntu/Debian:
 86 | ```bash
 87 | sudo apt-get install redis-server
 88 | sudo service redis-server start
 89 | ```
 90 | 
 91 | ### 5. Start the Application Components
 92 | 
 93 | 1. Start the FastAPI server:
 94 | ```bash
 95 | poetry run uvicorn main:app --reload --port 8080
 96 | ```
 97 | 
 98 | 2. Start Celery worker (in a new terminal):
 99 | ```bash
100 | poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
101 | ```
102 | 
103 | 3. Start Flower dashboard for monitoring (optional, in a new terminal):
104 | ```bash
105 | poetry run celery -A worker.celery_config flower --port=5555
106 | ```
107 | 
108 | ### 6. Verify Installation
109 | 
110 | 1. Check if the API server is running:
111 | ```bash
112 | curl http://localhost:8080/docs
113 | ```
114 | 
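115 | 2. Test a document conversion (this calls the synchronous endpoint; POST to `/conversion-jobs` instead to exercise the Celery worker):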
116 | ```bash
117 | curl -X POST "http://localhost:8080/documents/convert" \
118 |   -H "accept: application/json" \
119 |   -H "Content-Type: multipart/form-data" \
120 |   -F "document=@/path/to/test.pdf"
121 | ```
122 | 
123 | 3. Access monitoring dashboard:
124 | - Open http://localhost:5555 in your browser to view the Flower dashboard
125 | 
126 | ### Development Notes
127 | 
128 | - The API documentation is available at http://localhost:8080/docs
129 | - Redis is used as both message broker and result backend for Celery tasks
130 | - The service supports both synchronous and asynchronous document conversion
131 | - For development, the server runs with auto-reload enabled
132 | 
133 | ## Environment Setup (Running in Docker)
134 | 
135 | 1. Clone the repository:
136 | ```bash
137 | git clone https://github.com/drmingler/docling-api.git
138 | cd docling-api
139 | ```
140 | 
141 | 2. Create a `.env` file:
142 | ```bash
143 | REDIS_HOST=redis://redis:6379/0
144 | ENV=production
145 | ```
146 | 
147 | ### CPU Mode
148 | To start the service using CPU-only processing, use the following command. You can adjust the number of Celery workers by specifying the --scale option. In this example, 1 worker will be created:
149 | ```bash
150 | docker-compose -f docker-compose.cpu.yml up --build --scale celery_worker=1
151 | ```
152 | 
153 | ### GPU Mode (Recommended for production)
154 | For production, it is recommended to enable GPU acceleration, as it significantly improves performance. Use the command below to start the service with GPU support. You can also scale the number of Celery workers using the --scale option; here, 3 workers will be launched:
155 | ```bash
156 | docker-compose -f docker-compose.gpu.yml up --build --scale celery_worker=3
157 | ```
158 | 
159 | ## Service Components
160 | 
161 | The service will start the following components:
162 | 
163 | - **API Server**: http://localhost:8080
164 | - **Redis**: redis://localhost:6379
165 | - **Flower Dashboard**: http://localhost:5556
166 | 
167 | ## API Usage
168 | 
169 | ### Synchronous Conversion
170 | 
171 | Convert a single document immediately:
172 | 
173 | ```bash
174 | # extract_tables_as_images and image_resolution_scale are query parameters,
175 | # so they are passed in the URL rather than as -F form fields.
176 | curl -X POST "http://localhost:8080/documents/convert?extract_tables_as_images=true&image_resolution_scale=4" \
177 |   -H "accept: application/json" \
178 |   -H "Content-Type: multipart/form-data" \
179 |   -F "document=@/path/to/document.pdf"
180 | ```
181 | 
182 | ### Asynchronous Conversion
183 | 
184 | 1. Submit a document for conversion:
185 | 
186 | ```bash
187 | curl -X POST "http://localhost:8080/conversion-jobs" \
188 |   -H "accept: application/json" \
189 |   -H "Content-Type: multipart/form-data" \
190 |   -F "document=@/path/to/document.pdf"
191 | ```
192 | 
193 | 2. Check conversion status:
194 | 
195 | ```bash
196 | curl -X GET "http://localhost:8080/conversion-jobs/{job_id}" \
197 |   -H "accept: application/json"
198 | ```
199 | 
200 | ### Batch Processing
201 | 
202 | Convert multiple documents asynchronously:
203 | 
204 | ```bash
205 | curl -X POST "http://localhost:8080/batch-conversion-jobs" \
206 |   -H "accept: application/json" \
207 |   -H "Content-Type: multipart/form-data" \
208 |   -F "documents=@/path/to/document1.pdf" \
209 |   -F "documents=@/path/to/document2.pdf"
210 | ```
211 | 
212 | ## Configuration Options
213 | 
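214 | - `image_resolution_scale`: Query parameter controlling the resolution of extracted images (integer 1-4, default 4)
215 | - `extract_tables_as_images`: Query parameter; extract tables as images (true/false, default false)
216 | - `CPU_ONLY`: Docker build argument to switch between CPU/GPU modes (set by the docker-compose files)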
217 | 
218 | ## Monitoring
219 | 
220 | - Access the Flower dashboard to monitor Celery tasks and workers
221 | - View task status, success/failure rates, and worker performance
222 | - Monitor resource usage and task queues
223 | 
224 | ## Architecture
225 | 
226 | The service uses a distributed architecture with the following components:
227 | 
228 | 1. FastAPI application serving the REST API
229 | 2. Celery workers for distributed task processing
230 | 3. Redis as message broker and result backend
231 | 4. Flower for task monitoring and management
232 | 5. Docling for the file conversion
233 | 
234 | ## Performance Considerations
235 | 
236 | - GPU mode provides significantly faster processing for large documents
237 | - CPU mode is suitable for smaller deployments or when GPU is not available
238 | - Multiple workers can be scaled horizontally for increased throughput
239 | 
240 | ## License
241 | The codebase is under the MIT license. See LICENSE for more information.
242 | 
243 | ## Acknowledgements
244 | - [Docling](https://github.com/DS4SD/docling) the state-of-the-art document conversion library by IBM
245 | - [FastAPI](https://fastapi.tiangolo.com/) the web framework
246 | - [Celery](https://docs.celeryq.dev/en/stable/) for distributed task processing
247 | - [Flower](https://flower.readthedocs.io/en/latest/) for monitoring and management
248 | 


--------------------------------------------------------------------------------
/docker-compose.cpu.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | 
 3 | services:
 4 |   celery_worker:
 5 |     build:
 6 |       context: .
 7 |       args:
 8 |         CPU_ONLY: "true"
 9 |     image: converter-cpu-image
10 |     command: poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
11 |     volumes:
12 |       - .:/app
13 |       - model_cache:/tmp
14 |     environment:
15 |       - REDIS_HOST=${REDIS_HOST}
16 |       - ENV=production
17 |     restart: on-failure
18 |     depends_on:
19 |       - redis
20 | 
21 |   app:
22 |     container_name: marker-api-cpu
23 |     build:
24 |       context: .
25 |       args:
26 |         CPU_ONLY: "true"
27 |     image: converter-cpu-image
28 |     command: poetry run uvicorn --port 8080 --host 0.0.0.0 main:app
29 |     environment:
30 |       - REDIS_HOST=${REDIS_HOST}
31 |       - ENV=production
32 |       - MALLOC_ARENA_MAX=2
33 |       - OMP_NUM_THREADS=2
34 |       - PYTHONMALLOC=malloc
35 |     ports:
36 |       - "8080:8080"
37 |     volumes:
38 |       - .:/app
39 |       - model_cache:/tmp
40 |     restart: on-failure
41 |     depends_on:
42 |       - redis
43 | 
44 |   redis:
45 |     container_name: redis
46 |     image: redis:7.2.4-alpine
47 |     ports:
48 |       - "6379:6379"
49 | 
50 |   flower:
51 |     container_name: flower_cpu
52 |     build:
53 |       context: .
54 |       args:
55 |         CPU_ONLY: "true"
56 |     image: converter-cpu-image
57 |     command: poetry run celery -A worker.celery_config flower --port=5555
58 |     ports:
59 |       - "5556:5555"
60 |     volumes:
61 |       - .:/app
62 |       - model_cache:/tmp
63 |     environment:
64 |       - REDIS_HOST=${REDIS_HOST}
65 |       - ENV=production
66 |     depends_on:
67 |       - app
68 |       - redis
69 |       - celery_worker
70 | 
71 | volumes:
72 |   model_cache:
73 | 


--------------------------------------------------------------------------------
/docker-compose.gpu.yml:
--------------------------------------------------------------------------------
 1 | version: "3.8"
 2 | 
 3 | services:
 4 |   celery_worker:
 5 |     build:
 6 |       context: .
 7 |       args:
 8 |         CPU_ONLY: "false"
 9 |     image: converter-gpu-image
10 |     command: poetry run celery -A worker.celery_config worker --pool=solo -n worker_primary --loglevel=info
11 |     volumes:
12 |       - .:/app
13 |     environment:
14 |       - REDIS_HOST=${REDIS_HOST}
15 |       - ENV=production
16 |     deploy:
17 |       resources:
18 |         reservations:
19 |           devices:
20 |             - driver: nvidia
21 |               count: 1
22 |               capabilities: [gpu]
23 |     depends_on:
24 |       - redis
25 | 
26 |   app:
27 |     container_name: marker-api-gpu
28 |     build:
29 |       context: .
30 |       args:
31 |         CPU_ONLY: "false"
32 |     image: converter-gpu-image
33 |     command: poetry run uvicorn --port 8080 --host 0.0.0.0 main:app
34 |     environment:
35 |       - REDIS_HOST=${REDIS_HOST}
36 |       - ENV=production
37 |       - NVIDIA_VISIBLE_DEVICES=all
38 |     ports:
39 |       - "8080:8080"
40 |     volumes:
41 |       - .:/app
42 |     deploy:
43 |       resources:
44 |         reservations:
45 |           devices:
46 |             - driver: nvidia
47 |               count: 1
48 |               capabilities: [gpu]
49 |     depends_on:
50 |       - redis
51 |       - celery_worker
52 | 
53 |   redis:
54 |     container_name: redis
55 |     image: redis:7.2.4-alpine
56 |     ports:
57 |       - "6379:6379"
58 | 
59 |   flower:
60 |     container_name: flower_gpu
61 |     build:
62 |       context: .
63 |       args:
64 |         CPU_ONLY: "false"
65 |     image: converter-gpu-image
66 |     command: poetry run celery -A worker.celery_config flower --port=5555
67 |     ports:
68 |       - "5556:5555"
69 |     volumes:
70 |       - .:/app
71 |     environment:
72 |       - REDIS_HOST=${REDIS_HOST}
73 |       - ENV=production
74 |     depends_on:
75 |       - app
76 |       - redis
77 |       - celery_worker
78 |     deploy:
79 |       resources:
80 |         reservations:
81 |           devices:
82 |             - driver: nvidia
83 |               count: 1
84 |               capabilities: [gpu]
85 | 


--------------------------------------------------------------------------------
/document_converter/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/document_converter/__init__.py


--------------------------------------------------------------------------------
/document_converter/route.py:
--------------------------------------------------------------------------------
  1 | from io import BytesIO
  2 | from typing import List
  3 | from fastapi import APIRouter, File, HTTPException, UploadFile, Query
  4 | 
  5 | from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult
  6 | from document_converter.service import DocumentConverterService, DoclingDocumentConversion
  7 | from document_converter.utils import is_file_format_supported
  8 | from worker.tasks import convert_document_task, convert_documents_task
  9 | 
 10 | router = APIRouter()  # every endpoint defined below is registered on this router
 11 | 
 12 | # Shared converter backend: Docling here, but any DocumentConversionBase implementation can be swapped in
 13 | converter = DoclingDocumentConversion()  # built once at import time and reused by all requests
 14 | document_converter_service = DocumentConverterService(document_converter=converter)
 15 | 
 16 | 
 17 | # Document direct conversion endpoints
 18 | @router.post(
 19 |     '/documents/convert',
 20 |     response_model=ConversionResult,
 21 |     response_model_exclude_unset=True,
 22 |     description="Convert a single document synchronously",
 23 | )
 24 | async def convert_single_document(
 25 |     document: UploadFile = File(...),
 26 |     extract_tables_as_images: bool = False,
 27 |     image_resolution_scale: int = Query(4, ge=1, le=4),
 28 | ):
 29 |     file_bytes = await document.read()
 30 |     if not is_file_format_supported(file_bytes, document.filename):
 31 |         raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
 32 | 
 33 |     return document_converter_service.convert_document(
 34 |         (document.filename, BytesIO(file_bytes)),
 35 |         extract_tables=extract_tables_as_images,
 36 |         image_resolution_scale=image_resolution_scale,
 37 |     )
 38 | 
 39 | 
 40 | @router.post(
 41 |     '/documents/batch-convert',
 42 |     response_model=List[ConversionResult],
 43 |     response_model_exclude_unset=True,
 44 |     description="Convert multiple documents synchronously",
 45 | )
 46 | async def convert_multiple_documents(
 47 |     documents: List[UploadFile] = File(...),
 48 |     extract_tables_as_images: bool = False,
 49 |     image_resolution_scale: int = Query(4, ge=1, le=4),
 50 | ):
 51 |     doc_streams = []
 52 |     for document in documents:
 53 |         file_bytes = await document.read()
 54 |         if not is_file_format_supported(file_bytes, document.filename):
 55 |             raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
 56 |         doc_streams.append((document.filename, BytesIO(file_bytes)))
 57 | 
 58 |     return document_converter_service.convert_documents(
 59 |         doc_streams,
 60 |         extract_tables=extract_tables_as_images,
 61 |         image_resolution_scale=image_resolution_scale,
 62 |     )
 63 | 
 64 | 
 65 | # Asynchronous conversion jobs endpoints
 66 | @router.post(
 67 |     '/conversion-jobs',
 68 |     response_model=ConversationJobResult,
 69 |     description="Create a conversion job for a single document",
 70 | )
 71 | async def create_single_document_conversion_job(
 72 |     document: UploadFile = File(...),
 73 |     extract_tables_as_images: bool = False,
 74 |     image_resolution_scale: int = Query(4, ge=1, le=4),
 75 | ):
 76 |     file_bytes = await document.read()
 77 |     if not is_file_format_supported(file_bytes, document.filename):
 78 |         raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
 79 | 
 80 |     task = convert_document_task.delay(
 81 |         (document.filename, file_bytes),
 82 |         extract_tables=extract_tables_as_images,
 83 |         image_resolution_scale=image_resolution_scale,
 84 |     )
 85 | 
 86 |     return ConversationJobResult(job_id=task.id, status="IN_PROGRESS")
 87 | 
 88 | 
 89 | @router.get(
 90 |     '/conversion-jobs/{job_id}',
 91 |     response_model=ConversationJobResult,
 92 |     description="Get the status of a single document conversion job",
 93 |     response_model_exclude_unset=True,
 94 | )
 95 | async def get_conversion_job_status(job_id: str):  # job_id is the id returned by POST /conversion-jobs
 96 |     return document_converter_service.get_single_document_task_result(job_id)  # lookup is delegated to the service
 97 | 
 98 | 
 99 | @router.post(
100 |     '/batch-conversion-jobs',
101 |     response_model=BatchConversionJobResult,
102 |     response_model_exclude_unset=True,
103 |     description="Create a conversion job for multiple documents",
104 | )
105 | async def create_batch_conversion_job(
106 |     documents: List[UploadFile] = File(...),
107 |     extract_tables_as_images: bool = False,
108 |     image_resolution_scale: int = Query(4, ge=1, le=4),
109 | ):
110 |     """Create a batch conversion job for multiple documents."""
111 |     doc_data = []
112 |     for document in documents:
113 |         file_bytes = await document.read()
114 |         if not is_file_format_supported(file_bytes, document.filename):
115 |             raise HTTPException(status_code=400, detail=f"Unsupported file format: {document.filename}")
116 |         doc_data.append((document.filename, file_bytes))
117 | 
118 |     task = convert_documents_task.delay(
119 |         doc_data,
120 |         extract_tables=extract_tables_as_images,
121 |         image_resolution_scale=image_resolution_scale,
122 |     )
123 | 
124 |     return BatchConversionJobResult(job_id=task.id, status="IN_PROGRESS")
125 | 
126 | 
127 | @router.get(
128 |     '/batch-conversion-jobs/{job_id}',
129 |     response_model=BatchConversionJobResult,
130 |     response_model_exclude_unset=True,
131 |     description="Get the status of a batch conversion job",
132 | )
133 | async def get_batch_conversion_job_status(job_id: str):
134 |     """Return the status and results of the batch job with this id; the lookup is delegated to the service."""
135 |     return document_converter_service.get_batch_conversion_task_result(job_id)
136 | 


--------------------------------------------------------------------------------
/document_converter/schema.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel, Field
 2 | from typing import List, Literal, Optional
 3 | 
 4 | 
 5 | class ImageData(BaseModel):  # one image extracted from a converted document; every field is optional
 6 |     type: Optional[Literal["table", "picture"]] = Field(None, description="The type of the image")
 7 |     filename: Optional[str] = Field(None, description="The filename of the image")
 8 |     image: Optional[str] = Field(None, description="The image data")  # service.py stores a base64-encoded PNG here
 9 | 
10 | 
11 | class ConversionResult(BaseModel):
12 |     filename: str = Field(None, description="The filename of the document")
13 |     markdown: str = Field(None, description="The markdown content of the document")
14 |     images: List[ImageData] = Field(default_factory=list, description="The images in the document")
15 |     error: Optional[str] = Field(None, description="The error that occurred during the conversion")
16 | 
17 | 
18 | class BatchConversionResult(BaseModel):  # wrapper around a list of ConversionResult; defaults to an empty list
19 |     conversion_results: List[ConversionResult] = Field(
20 |         default_factory=list, description="The results of the conversions"
21 |     )
22 | 
23 | 
24 | class ConversationJobResult(BaseModel):
25 |     job_id: Optional[str] = Field(None, description="The id of the conversion job")
26 |     result: Optional[ConversionResult] = Field(None, description="The result of the conversion job")
27 |     error: Optional[str] = Field(None, description="The error that occurred during the conversion job")
28 |     status: Literal["IN_PROGRESS", "SUCCESS", "FAILURE"] = Field(None, description="The status of the conversion job")
29 | 
30 | 
31 | class BatchConversionJobResult(BaseModel):
32 |     job_id: str = Field(..., description="The id of the conversion job")
33 |     conversion_results: List[ConversationJobResult] = Field(
34 |         default_factory=list, description="The results of the conversion job"
35 |     )
36 |     status: Literal["IN_PROGRESS", "SUCCESS", "FAILURE"] = Field(
37 |         None, description="The status of the entire conversion jobs in the batch"
38 |     )
39 |     error: Optional[str] = Field(None, description="If the entire batch failed, this will be the error message")
40 | 


--------------------------------------------------------------------------------
/document_converter/service.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import logging
  3 | from abc import ABC, abstractmethod
  4 | from io import BytesIO
  5 | from typing import List, Tuple
  6 | 
  7 | from celery.result import AsyncResult
  8 | from docling.datamodel.base_models import InputFormat, DocumentStream
  9 | from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
 10 | from docling.document_converter import PdfFormatOption, DocumentConverter
 11 | from docling_core.types.doc import ImageRefMode, TableItem, PictureItem
 12 | from fastapi import HTTPException
 13 | 
 14 | from document_converter.schema import BatchConversionJobResult, ConversationJobResult, ConversionResult, ImageData
 15 | from document_converter.utils import handle_csv_file
 16 | 
 17 | logging.basicConfig(level=logging.INFO)
 18 | IMAGE_RESOLUTION_SCALE = 4
 19 | 
 20 | 
 21 | class DocumentConversionBase(ABC):
 22 |     @abstractmethod
 23 |     def convert(self, document: Tuple[str, BytesIO], **kwargs) -> ConversionResult:
 24 |         pass
 25 | 
 26 |     @abstractmethod
 27 |     def convert_batch(self, documents: List[Tuple[str, BytesIO]], **kwargs) -> List[ConversionResult]:
 28 |         pass
 29 | 
 30 | 
 31 | class DoclingDocumentConversion(DocumentConversionBase):
 32 |     """Document conversion implementation using Docling.
 33 | 
 34 |     You can initialize with default pipeline options or provide your own:
 35 | 
 36 |     Example:
 37 |         ```python
 38 |         # Using default options
 39 |         converter = DoclingDocumentConversion()
 40 | 
 41 |         # Or customize with your own pipeline options
 42 |         pipeline_options = PdfPipelineOptions()
 43 |         pipeline_options.do_ocr = True
 44 |         pipeline_options.ocr_options = RapidOcrOptions()  # Use RapidOcrOptions instead of EasyOCR (note : you need to install the OCR package)
 45 |         pipeline_options.generate_page_images = True
 46 | 
 47 |         converter = DoclingDocumentConversion(pipeline_options=pipeline_options)
 48 |         ```
 49 |     """
 50 | 
 51 |     def __init__(self, pipeline_options: PdfPipelineOptions = None):
 52 |         self.pipeline_options = pipeline_options if pipeline_options else self._setup_default_pipeline_options()
 53 | 
 54 |     def _update_pipeline_options(self, extract_tables: bool, image_resolution_scale: int) -> PdfPipelineOptions:
 55 |         self.pipeline_options.images_scale = image_resolution_scale
 56 |         self.pipeline_options.generate_table_images = extract_tables
 57 |         return self.pipeline_options
 58 | 
 59 |     @staticmethod
 60 |     def _setup_default_pipeline_options() -> PdfPipelineOptions:
 61 |         pipeline_options = PdfPipelineOptions()
 62 |         pipeline_options.generate_page_images = False
 63 |         pipeline_options.generate_picture_images = True
 64 |         pipeline_options.ocr_options = EasyOcrOptions(lang=["fr", "de", "es", "en", "it", "pt"])
 65 | 
 66 |         return pipeline_options
 67 | 
 68 |     @staticmethod
 69 |     def _process_document_images(conv_res) -> Tuple[str, List[ImageData]]:
 70 |         images = []
 71 |         table_counter = 0
 72 |         picture_counter = 0
 73 |         content_md = conv_res.document.export_to_markdown(image_mode=ImageRefMode.PLACEHOLDER)
 74 | 
 75 |         for element, _level in conv_res.document.iterate_items():
 76 |             if isinstance(element, (TableItem, PictureItem)) and element.image:
 77 |                 img_buffer = BytesIO()
 78 |                 element.image.pil_image.save(img_buffer, format="PNG")
 79 | 
 80 |                 if isinstance(element, TableItem):
 81 |                     table_counter += 1
 82 |                     image_name = f"table-{table_counter}.png"
 83 |                     image_type = "table"
 84 |                 else:
 85 |                     picture_counter += 1
 86 |                     image_name = f"picture-{picture_counter}.png"
 87 |                     image_type = "picture"
 88 |                     content_md = content_md.replace("<!-- image -->", image_name, 1)
 89 | 
 90 |                 image_bytes = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
 91 |                 images.append(ImageData(type=image_type, filename=image_name, image=image_bytes))
 92 | 
 93 |         return content_md, images
 94 | 
 95 |     def convert(
 96 |         self,
 97 |         document: Tuple[str, BytesIO],
 98 |         extract_tables: bool = False,
 99 |         image_resolution_scale: int = IMAGE_RESOLUTION_SCALE,
100 |     ) -> ConversionResult:
101 |         filename, file = document
102 |         pipeline_options = self._update_pipeline_options(extract_tables, image_resolution_scale)
103 |         doc_converter = DocumentConverter(
104 |             format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
105 |         )
106 | 
107 |         if filename.lower().endswith('.csv'):
108 |             file, error = handle_csv_file(file)
109 |             if error:
110 |                 return ConversionResult(filename=filename, error=error)
111 | 
112 |         conv_res = doc_converter.convert(DocumentStream(name=filename, stream=file), raises_on_error=False)
113 |         doc_filename = conv_res.input.file.stem
114 | 
115 |         if conv_res.errors:
116 |             logging.error(f"Failed to convert {filename}: {conv_res.errors[0].error_message}")
117 |             return ConversionResult(filename=doc_filename, error=conv_res.errors[0].error_message)
118 | 
119 |         content_md, images = self._process_document_images(conv_res)
120 |         return ConversionResult(filename=doc_filename, markdown=content_md, images=images)
121 | 
122 |     def convert_batch(
123 |         self,
124 |         documents: List[Tuple[str, BytesIO]],
125 |         extract_tables: bool = False,
126 |         image_resolution_scale: int = IMAGE_RESOLUTION_SCALE,
127 |     ) -> List[ConversionResult]:
128 |         pipeline_options = self._update_pipeline_options(extract_tables, image_resolution_scale)
129 |         doc_converter = DocumentConverter(
130 |             format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
131 |         )
132 | 
133 |         conv_results = doc_converter.convert_all(
134 |             [DocumentStream(name=filename, stream=file) for filename, file in documents],
135 |             raises_on_error=False,
136 |         )
137 | 
138 |         results = []
139 |         for conv_res in conv_results:
140 |             doc_filename = conv_res.input.file.stem
141 | 
142 |             if conv_res.errors:
143 |                 logging.error(f"Failed to convert {conv_res.input.name}: {conv_res.errors[0].error_message}")
144 |                 results.append(ConversionResult(filename=conv_res.input.name, error=conv_res.errors[0].error_message))
145 |                 continue
146 | 
147 |             content_md, images = self._process_document_images(conv_res)
148 |             results.append(ConversionResult(filename=doc_filename, markdown=content_md, images=images))
149 | 
150 |         return results
151 | 
152 | 
class DocumentConverterService:
    """Thin service layer over a document converter and its Celery job results.

    The synchronous methods delegate to the injected converter. The ``*_task``
    methods are the entry points used from worker tasks and accept raw bytes.
    The ``get_*_task_result`` methods translate Celery task state into the
    API's job-result schemas.
    """

    # Celery states that mean the task has not finished yet. Treating only
    # PENDING as "in progress" would report a running or retrying task as failed.
    _UNFINISHED_STATES = frozenset({'PENDING', 'RECEIVED', 'STARTED', 'RETRY'})

    def __init__(self, document_converter: DocumentConversionBase):
        self.document_converter = document_converter

    def convert_document(self, document: Tuple[str, BytesIO], **kwargs) -> ConversionResult:
        """Convert one document synchronously.

        Raises:
            HTTPException: With status 500 when the converter reports an error.
        """
        result = self.document_converter.convert(document, **kwargs)
        if result.error:
            logging.error(f"Failed to convert {document[0]}: {result.error}")
            raise HTTPException(status_code=500, detail=result.error)
        return result

    def convert_documents(self, documents: List[Tuple[str, BytesIO]], **kwargs) -> List[ConversionResult]:
        """Convert several documents synchronously; per-document errors stay in the results."""
        return self.document_converter.convert_batch(documents, **kwargs)

    def convert_document_task(
        self,
        document: Tuple[str, bytes],
        **kwargs,
    ) -> ConversionResult:
        """Convert one ``(filename, raw bytes)`` document; errors stay in the result."""
        document = (document[0], BytesIO(document[1]))
        return self.document_converter.convert(document, **kwargs)

    def convert_documents_task(
        self,
        documents: List[Tuple[str, bytes]],
        **kwargs,
    ) -> List[ConversionResult]:
        """Convert several ``(filename, raw bytes)`` documents in one batch."""
        documents = [(filename, BytesIO(file)) for filename, file in documents]
        return self.document_converter.convert_batch(documents, **kwargs)

    def get_single_document_task_result(self, job_id: str) -> ConversationJobResult:
        """Get the status and result of a document conversion job.

        Returns:
        - IN_PROGRESS: When task has not finished yet (pending, received, started or retrying)
        - SUCCESS: When conversion completed successfully
        - FAILURE: When task failed or conversion had errors
        """

        task = AsyncResult(job_id)
        # Read the state once so every branch below judges the same snapshot.
        state = task.state
        if state in self._UNFINISHED_STATES:
            return ConversationJobResult(job_id=job_id, status="IN_PROGRESS")

        if state == 'SUCCESS':
            result = task.get()
            # The task itself succeeded, but the conversion may still have failed.
            if result.get('error'):
                return ConversationJobResult(job_id=job_id, status="FAILURE", error=result['error'])

            return ConversationJobResult(job_id=job_id, status="SUCCESS", result=ConversionResult(**result))

        return ConversationJobResult(job_id=job_id, status="FAILURE", error=str(task.result))

    def get_batch_conversion_task_result(self, job_id: str) -> BatchConversionJobResult:
        """Get the status and results of a batch conversion job.

        Returns:
        - IN_PROGRESS: When task has not finished yet (pending, received, started or retrying)
        - SUCCESS: A batch is successful as long as the task is successful;
          individual documents carry their own SUCCESS/FAILURE status
        - FAILURE: When the task fails for any reason
        """

        task = AsyncResult(job_id)
        # Read the state once so every branch below judges the same snapshot.
        state = task.state
        if state in self._UNFINISHED_STATES:
            return BatchConversionJobResult(job_id=job_id, status="IN_PROGRESS")

        # Task completed successfully, but need to check individual conversion results
        if state == 'SUCCESS':
            conversion_results = task.get()
            job_results = []

            for result in conversion_results:
                if result.get('error'):
                    job_result = ConversationJobResult(status="FAILURE", error=result['error'])
                else:
                    job_result = ConversationJobResult(
                        status="SUCCESS", result=ConversionResult(**result).model_dump(exclude_unset=True)
                    )
                job_results.append(job_result)

            return BatchConversionJobResult(job_id=job_id, status="SUCCESS", conversion_results=job_results)

        return BatchConversionJobResult(job_id=job_id, status="FAILURE", error=str(task.result))
237 | 


--------------------------------------------------------------------------------
/document_converter/utils.py:
--------------------------------------------------------------------------------
  1 | from io import BytesIO
  2 | import re
  3 | from enum import Enum
  4 | from typing import Dict, List, Optional, Tuple
  5 | 
  6 | import filetype
  7 | 
  8 | 
  9 | class InputFormat(str, Enum):
 10 |     DOCX = "docx"
 11 |     PPTX = "pptx"
 12 |     HTML = "html"
 13 |     IMAGE = "image"
 14 |     PDF = "pdf"
 15 |     ASCIIDOC = "asciidoc"
 16 |     MD = "md"
 17 |     CSV = "csv"
 18 | 
 19 | 
 20 | class OutputFormat(str, Enum):
 21 |     MARKDOWN = "md"
 22 |     JSON = "json"
 23 |     TEXT = "text"
 24 |     DOCTAGS = "doctags"
 25 | 
 26 | 
 27 | FormatToExtensions: Dict[InputFormat, List[str]] = {
 28 |     InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
 29 |     InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
 30 |     InputFormat.PDF: ["pdf"],
 31 |     InputFormat.MD: ["md"],
 32 |     InputFormat.HTML: ["html", "htm", "xhtml"],
 33 |     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
 34 |     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
 35 |     InputFormat.CSV: ["csv"],
 36 | }
 37 | 
 38 | FormatToMimeType: Dict[InputFormat, List[str]] = {
 39 |     InputFormat.DOCX: [
 40 |         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 41 |         "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
 42 |     ],
 43 |     InputFormat.PPTX: [
 44 |         "application/vnd.openxmlformats-officedocument.presentationml.template",
 45 |         "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
 46 |         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
 47 |     ],
 48 |     InputFormat.HTML: ["text/html", "application/xhtml+xml"],
 49 |     InputFormat.IMAGE: [
 50 |         "image/png",
 51 |         "image/jpeg",
 52 |         "image/tiff",
 53 |         "image/gif",
 54 |         "image/bmp",
 55 |     ],
 56 |     InputFormat.PDF: ["application/pdf"],
 57 |     InputFormat.ASCIIDOC: ["text/asciidoc"],
 58 |     InputFormat.MD: ["text/markdown", "text/x-markdown"],
 59 |     InputFormat.CSV: ["text/csv"],
 60 | }
 61 | MimeTypeToFormat = {mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes}
 62 | 
 63 | 
 64 | def detect_html_xhtml(content):
 65 |     content_str = content.decode("ascii", errors="ignore").lower()
 66 |     # Remove XML comments
 67 |     content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
 68 |     content_str = content_str.lstrip()
 69 | 
 70 |     if re.match(r"<\?xml", content_str):
 71 |         if "xhtml" in content_str[:1000]:
 72 |             return "application/xhtml+xml"
 73 | 
 74 |     if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
 75 |         return "text/html"
 76 | 
 77 |     return None
 78 | 
 79 | 
 80 | def is_csv_file(filename: str) -> bool:
 81 |     """Check if a file is a CSV based on its extension."""
 82 |     return filename and filename.lower().endswith('.csv')
 83 | 
 84 | 
 85 | def guess_format(obj: bytes, filename: str = None):
 86 |     content = b""
 87 |     mime = None
 88 | 
 89 |     if isinstance(obj, bytes):
 90 |         content = obj
 91 |         # Special handling for CSV files
 92 |         if is_csv_file(filename):
 93 |             return InputFormat.CSV
 94 | 
 95 |         mime = filetype.guess_mime(content)
 96 |         if mime is None:
 97 |             ext = filename.rsplit(".", 1)[-1] if ("." in filename and not filename.startswith(".")) else ""
 98 |             mime = mime_from_extension(ext)
 99 | 
100 |     mime = mime or detect_html_xhtml(content)
101 |     mime = mime or "text/plain"
102 |     return MimeTypeToFormat.get(mime)
103 | 
104 | 
def handle_csv_file(file: BytesIO) -> Tuple[BytesIO, Optional[str]]:
    """Re-encode a CSV stream as UTF-8, trying several source encodings.

    Encodings are tried from strictest to most permissive:
      * ``utf-8-sig`` - UTF-8 with or without a byte-order mark (the BOM is dropped);
      * ``cp1252``    - Windows Western European, rejects its few undefined bytes;
      * ``latin1``    - accepts every byte, so it must come last; placed earlier
        it would make any encoding listed after it unreachable.

    Args:
        file: Stream holding the raw CSV bytes. It is read from the start.

    Returns:
        Tuple[BytesIO, Optional[str]]: (processed file, error message if any).
        On success the first item is a new UTF-8 stream and the message is
        ``None``; on failure the original stream is returned with a message.
    """
    SUPPORTED_CSV_ENCODINGS = ['utf-8-sig', 'cp1252', 'latin1']

    # Read once; every decoding attempt works on the same bytes.
    file.seek(0)
    raw = file.read()

    for encoding in SUPPORTED_CSV_ENCODINGS:
        try:
            content = raw.decode(encoding)
        except UnicodeDecodeError:
            continue
        return BytesIO(content.encode('utf-8')), None

    # Defensive: not reachable while the list ends with latin1.
    return file, f"Could not decode CSV file. Supported encodings: {', '.join(SUPPORTED_CSV_ENCODINGS)}"
120 | 
121 | 
def mime_from_extension(ext):
    """Map a file extension to the primary MIME type of a text-based format.

    Only AsciiDoc, HTML, Markdown and CSV are considered, in that order.

    Args:
        ext: Extension without the leading dot, compared as given.

    Returns:
        The first MIME type registered for the matching format, or ``None``
        when the extension belongs to none of the four formats.
    """
    text_formats = (InputFormat.ASCIIDOC, InputFormat.HTML, InputFormat.MD, InputFormat.CSV)
    for candidate in text_formats:
        if ext in FormatToExtensions[candidate]:
            return FormatToMimeType[candidate][0]
    return None
134 | 
135 | 
def is_file_format_supported(file_bytes: bytes, filename: str) -> bool:
    """Tell whether the guessed format of a file is one the converter accepts.

    Args:
        file_bytes: Raw file content.
        filename: Name of the file, used for extension-based detection.

    Returns:
        ``True`` when ``guess_format`` yields a format listed in
        ``FormatToExtensions``, ``False`` otherwise (including unknown formats).
    """
    detected_format = guess_format(file_bytes, filename)
    return detected_format in FormatToExtensions
138 | 


--------------------------------------------------------------------------------
/images/docling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/images/docling.png


--------------------------------------------------------------------------------
/images/marker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/images/marker.png


--------------------------------------------------------------------------------
/images/original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/images/original.png


--------------------------------------------------------------------------------
/images/pymupdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/images/pymupdf.png


--------------------------------------------------------------------------------
/images/pypdf.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/images/pypdf.png


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from fastapi import FastAPI
 2 | from fastapi.middleware.cors import CORSMiddleware
 3 | 
 4 | from document_converter.route import router as document_converter_router
 5 | 
 6 | app = FastAPI()
 7 | 
 8 | 
 9 | app.add_middleware(
10 |     CORSMiddleware,
11 |     allow_origins=["*"],
12 |     allow_methods=["*"],
13 |     allow_headers=["*"],
14 |     allow_credentials=True,
15 | )
16 | 
17 | 
18 | app.include_router(document_converter_router, prefix="", tags=["document-converter"])
19 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "document-to-markdown"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["drmingler <davidemmanuel75@gmail.com>"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = "^3.12"
10 | fastapi = "^0.115.4"
11 | uvicorn = "^0.32.0"
12 | docling = "^2.25.1"
13 | python-multipart = "^0.0.17"
14 | celery = "^5.4.0"
15 | flower = "^2.0.1"
16 | redis = "^5.2.0"
17 | gunicorn = "^23.0.0"
18 | 
19 | 
20 | [build-system]
21 | requires = ["poetry-core"]
22 | build-backend = "poetry.core.masonry.api"
23 | 


--------------------------------------------------------------------------------
/worker/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drmingler/docling-api/e10fd49ea6767d1d3b19f535b8d668f8a18ff953/worker/__init__.py


--------------------------------------------------------------------------------
/worker/celery_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from celery import Celery
 3 | from dotenv import load_dotenv
 4 | 
 5 | load_dotenv(".env")
 6 | celery_app = Celery(
 7 |     "document_converter",
 8 |     broker=os.environ.get("REDIS_HOST", "redis://localhost:6379/0"),
 9 |     backend=os.environ.get("REDIS_HOST", "redis://localhost:6379/0"),
10 |     include=["worker.tasks"],
11 | )
12 | 


--------------------------------------------------------------------------------
/worker/tasks.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict, List, Tuple
 2 | from document_converter.service import IMAGE_RESOLUTION_SCALE, DoclingDocumentConversion, DocumentConverterService
 3 | from worker.celery_config import celery_app
 4 | 
 5 | 
 6 | @celery_app.task(name="celery.ping")
 7 | def ping():
 8 |     print("Ping task received!")  # or use a logger
 9 |     return "pong"
10 | 
11 | 
@celery_app.task(bind=True, name="convert_document")
def convert_document_task(
    self,
    document: Tuple[str, bytes],
    extract_tables: bool = False,
    image_resolution_scale: int = IMAGE_RESOLUTION_SCALE,
) -> Dict[str, Any]:
    """Celery task: convert one document and return the result as a dict.

    Args:
        document: ``(filename, raw bytes)`` of the file to convert.
        extract_tables: Passed through to the converter.
        image_resolution_scale: Passed through to the converter.

    Returns:
        The conversion result dumped to a dict, limited to the fields that
        were explicitly set (``exclude_unset=True``).
    """
    conversion_options = {
        "extract_tables": extract_tables,
        "image_resolution_scale": image_resolution_scale,
    }
    # A fresh converter and service are built for every task invocation.
    service = DocumentConverterService(document_converter=DoclingDocumentConversion())
    conversion = service.convert_document_task(document, **conversion_options)
    return conversion.model_dump(exclude_unset=True)
24 | 
25 | 
@celery_app.task(bind=True, name="convert_documents")
def convert_documents_task(
    self,
    documents: List[Tuple[str, bytes]],
    extract_tables: bool = False,
    image_resolution_scale: int = IMAGE_RESOLUTION_SCALE,
) -> List[Dict[str, Any]]:
    """Celery task: convert a batch of documents and return the results as dicts.

    Args:
        documents: ``(filename, raw bytes)`` pairs to convert.
        extract_tables: Passed through to the converter.
        image_resolution_scale: Passed through to the converter.

    Returns:
        One dict per conversion result, each limited to the fields that were
        explicitly set (``exclude_unset=True``).
    """
    conversion_options = {
        "extract_tables": extract_tables,
        "image_resolution_scale": image_resolution_scale,
    }
    # A fresh converter and service are built for every task invocation.
    service = DocumentConverterService(document_converter=DoclingDocumentConversion())
    conversions = service.convert_documents_task(documents, **conversion_options)

    dumped = []
    for conversion in conversions:
        dumped.append(conversion.model_dump(exclude_unset=True))
    return dumped
38 | 


--------------------------------------------------------------------------------