├── swift_ocr ├── py.typed ├── api │ ├── routes │ │ ├── __init__.py │ │ ├── health.py │ │ └── ocr.py │ ├── __init__.py │ ├── router.py │ ├── deps.py │ └── exceptions.py ├── config │ ├── __init__.py │ └── settings.py ├── services │ ├── __init__.py │ ├── pdf.py │ └── ocr.py ├── schemas │ ├── __init__.py │ └── ocr.py ├── __init__.py ├── core │ ├── __init__.py │ ├── logging.py │ ├── exceptions.py │ └── retry.py ├── __main__.py └── app.py ├── main.py ├── requirements.txt ├── .gitignore ├── LICENSE.md ├── .env.example ├── pyproject.toml └── README.md /swift_ocr/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /swift_ocr/api/routes/__init__.py: -------------------------------------------------------------------------------- 1 | """API route modules.""" 2 | 3 | from swift_ocr.api.routes import health, ocr 4 | 5 | __all__ = ["health", "ocr"] 6 | -------------------------------------------------------------------------------- /swift_ocr/config/__init__.py: -------------------------------------------------------------------------------- 1 | """Configuration module for Swift OCR.""" 2 | 3 | from swift_ocr.config.settings import Settings, get_settings 4 | 5 | __all__ = ["Settings", "get_settings"] 6 | -------------------------------------------------------------------------------- /swift_ocr/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Services for Swift OCR.""" 2 | 3 | from swift_ocr.services.pdf import PDFService 4 | from swift_ocr.services.ocr import OCRService 5 | 6 | __all__ = ["PDFService", "OCRService"] 7 | -------------------------------------------------------------------------------- /swift_ocr/api/__init__.py: -------------------------------------------------------------------------------- 1 | """API module for Swift OCR.""" 2 | 3 | from swift_ocr.api.deps import 
get_ocr_service, get_pdf_service, get_settings 4 | from swift_ocr.api.router import api_router 5 | 6 | __all__ = [ 7 | "api_router", 8 | "get_ocr_service", 9 | "get_pdf_service", 10 | "get_settings", 11 | ] 12 | -------------------------------------------------------------------------------- /swift_ocr/api/router.py: -------------------------------------------------------------------------------- 1 | """ 2 | API router aggregating all route modules. 3 | """ 4 | 5 | from fastapi import APIRouter 6 | 7 | from swift_ocr.api.routes import health, ocr 8 | 9 | api_router = APIRouter() 10 | 11 | # Include route modules 12 | api_router.include_router(health.router, tags=["Health"]) 13 | api_router.include_router(ocr.router, tags=["OCR"]) 14 | -------------------------------------------------------------------------------- /swift_ocr/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | """Request and response schemas for Swift OCR.""" 2 | 3 | from swift_ocr.schemas.ocr import ( 4 | OCRRequest, 5 | OCRResponse, 6 | OCRStatus, 7 | HealthResponse, 8 | PageImage, 9 | ) 10 | 11 | __all__ = [ 12 | "OCRRequest", 13 | "OCRResponse", 14 | "OCRStatus", 15 | "HealthResponse", 16 | "PageImage", 17 | ] 18 | -------------------------------------------------------------------------------- /swift_ocr/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Swift OCR - LLM-powered PDF to Markdown converter. 3 | 4 | A high-performance OCR engine that uses GPT-4 Vision to convert PDFs 5 | into beautifully formatted Markdown. 
6 | """ 7 | 8 | __version__ = "2.0.0" 9 | __author__ = "Yiğit Konur" 10 | __license__ = "AGPL-3.0" 11 | 12 | from swift_ocr.app import create_app 13 | 14 | __all__ = ["create_app", "__version__"] 15 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Swift OCR - Entry point for backward compatibility. 3 | 4 | This file provides backward compatibility with the original API. 5 | The refactored code is in the swift_ocr package. 6 | 7 | Usage: 8 | uvicorn main:app --reload 9 | 10 | Or use the new package directly: 11 | uvicorn swift_ocr.app:app --reload 12 | python -m swift_ocr 13 | """ 14 | 15 | from swift_ocr.app import app 16 | 17 | __all__ = ["app"] 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Swift OCR Dependencies 2 | # Python 3.8+ required 3 | 4 | # Web Framework 5 | fastapi>=0.109.0,<1.0.0 6 | uvicorn[standard]>=0.27.0,<1.0.0 7 | python-multipart>=0.0.6,<1.0.0 8 | 9 | # Configuration 10 | pydantic>=2.5.0,<3.0.0 11 | pydantic-settings>=2.1.0,<3.0.0 12 | python-dotenv>=1.0.0,<2.0.0 13 | 14 | # HTTP Client 15 | requests>=2.31.0,<3.0.0 16 | 17 | # PDF Processing 18 | PyMuPDF>=1.23.0,<2.0.0 19 | 20 | # OpenAI 21 | openai>=1.12.0,<2.0.0 22 | -------------------------------------------------------------------------------- /swift_ocr/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Core utilities for Swift OCR.""" 2 | 3 | from swift_ocr.core.exceptions import ( 4 | SwiftOCRError, 5 | PDFDownloadError, 6 | PDFConversionError, 7 | OCRProcessingError, 8 | RateLimitError, 9 | ValidationError, 10 | ) 11 | from swift_ocr.core.logging import get_logger, setup_logging 12 | from swift_ocr.core.retry import retry_with_backoff 13 | 14 | __all__ = [ 
15 | "SwiftOCRError", 16 | "PDFDownloadError", 17 | "PDFConversionError", 18 | "OCRProcessingError", 19 | "RateLimitError", 20 | "ValidationError", 21 | "get_logger", 22 | "setup_logging", 23 | "retry_with_backoff", 24 | ] 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .nox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | *.py,cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Environments 54 | .env 55 | .venv 56 | env/ 57 | venv/ 58 | ENV/ 59 | env.bak/ 60 | venv.bak/ 61 | 62 | # IDE 63 | .idea/ 64 | .vscode/ 65 | *.swp 66 | *.swo 67 | *~ 68 | 69 | # macOS 70 | .DS_Store 71 | .AppleDouble 72 | .LSOverride 73 | ._* 74 | 75 | # Thumbnails 76 | Thumbs.db 77 | ehthumbs.db 78 | 79 | # Project specific 80 | *.pdf 81 | *.log 82 | temp/ 83 | tmp/ 84 | -------------------------------------------------------------------------------- /swift_ocr/api/deps.py: -------------------------------------------------------------------------------- 1 | """ 2 | FastAPI dependencies for dependency injection. 3 | 4 | Provides service instances and settings to route handlers. 
# Service singletons, keyed by the identity of the Settings object they were
# built from. ``functools.lru_cache`` cannot be used on these providers: it
# hashes its arguments, and a pydantic ``Settings`` instance is not hashable,
# so every request would fail with ``TypeError: unhashable type``.
# The Settings object is stored next to the service so its ``id()`` cannot be
# recycled for a different object while the entry is alive.
_pdf_services: dict[int, tuple[Settings, PDFService]] = {}
_ocr_services: dict[int, tuple[Settings, OCRService]] = {}


def get_pdf_service(
    settings: Annotated[Settings, Depends(get_settings)],
) -> PDFService:
    """
    Get PDF service instance.

    Returns the same ``PDFService`` for the same ``Settings`` object and
    builds a new one when a different ``Settings`` object is supplied.

    Args:
        settings: Application settings (injected by FastAPI).

    Returns:
        PDFService: Service built from ``settings``.
    """
    entry = _pdf_services.get(id(settings))
    if entry is None or entry[0] is not settings:
        entry = (settings, PDFService(settings))
        _pdf_services[id(settings)] = entry
    return entry[1]


def get_ocr_service(
    settings: Annotated[Settings, Depends(get_settings)],
) -> OCRService:
    """
    Get OCR service instance.

    Returns the same ``OCRService`` for the same ``Settings`` object and
    builds a new one when a different ``Settings`` object is supplied.

    Args:
        settings: Application settings (injected by FastAPI).

    Returns:
        OCRService: Service built from ``settings``.
    """
    entry = _ocr_services.get(id(settings))
    if entry is None or entry[0] is not settings:
        entry = (settings, OCRService(settings))
        _ocr_services[id(settings)] = entry
    return entry[1]


# Keep the ``cache_clear()`` hook that the previous ``lru_cache`` wrappers
# exposed, so existing callers (e.g. test teardown) keep working.
get_pdf_service.cache_clear = _pdf_services.clear
get_ocr_service.cache_clear = _ocr_services.clear
@router.get(
    "/health",
    response_model=HealthResponse,
    summary="Health Check",
    description="Check if the service is healthy and properly configured.",
)
async def health_check(settings: SettingsDep) -> HealthResponse:
    """
    Perform a health check.

    Args:
        settings: Application settings (injected by FastAPI).

    Returns:
        HealthResponse with current status and configuration info.
        ``openai_configured`` is true only when the API key, the Azure
        endpoint and the deployment ID are all non-empty.
    """
    # Local import: the module header only imports ``datetime``.
    from datetime import timezone

    return HealthResponse(
        status="healthy",
        version=__version__,
        # Timezone-aware UTC; ``datetime.utcnow()`` is deprecated and returns
        # a naive value that serializes without a UTC offset.
        timestamp=datetime.now(timezone.utc),
        openai_configured=bool(
            settings.openai_api_key
            and settings.azure_openai_endpoint
            and settings.openai_deployment_id
        ),
    )
10 | 11 | This program is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU Affero General Public License for more details. 15 | 16 | You should have received a copy of the GNU Affero General Public License 17 | along with this program. If not, see https://www.gnu.org/licenses/. 18 | 19 | Also add information on how to contact you by electronic and paper mail. 20 | 21 | If your software can interact with users remotely through a computer 22 | network, you should also make sure that it provides a way for users to 23 | get its source. For example, if your program is a web application, its 24 | interface could display a "Source" link that leads users to an archive 25 | of the code. There are many ways you could offer source, and different 26 | solutions will be better for different programs; see section 13 for the 27 | specific requirements. 28 | 29 | You should also get your employer (if you work as a programmer) or school, 30 | if any, to sign a "copyright disclaimer" for the program, if necessary. 31 | For more information on this, and how to apply and follow the GNU AGPL, see 32 | https://www.gnu.org/licenses/. 33 | -------------------------------------------------------------------------------- /swift_ocr/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Entry point for running Swift OCR as a module.
def main(argv: "list[str] | None" = None) -> int:
    """
    Main entry point for CLI.

    Args:
        argv: Command-line arguments, excluding the program name. When
            ``None`` (the default) argparse reads ``sys.argv[1:]``, which is
            the previous behaviour. Passing a list lets the CLI be driven
            programmatically.

    Returns:
        Process exit code: 0 on success or Ctrl-C, 1 if the server raised.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (code 0) or for
            invalid arguments (code 2).
    """
    parser = argparse.ArgumentParser(
        prog="swift-ocr",
        description="Swift OCR - LLM-powered PDF to Markdown converter",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to bind to (default: 8000)",
    )
    parser.add_argument(
        "--reload",
        action="store_true",
        help="Enable auto-reload for development",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=1,
        help="Number of worker processes (default: 1)",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Show version and exit",
    )

    args = parser.parse_args(argv)

    if args.version:
        # Imported lazily so argument parsing does not load the application.
        from swift_ocr import __version__

        print(f"Swift OCR v{__version__}")
        return 0

    try:
        import uvicorn

        uvicorn.run(
            "swift_ocr.app:app",
            host=args.host,
            port=args.port,
            reload=args.reload,
            # When --reload is set, force a single worker.
            workers=args.workers if not args.reload else 1,
            log_level="info",
        )
        return 0
    except KeyboardInterrupt:
        print("\nShutting down...")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1
=========================================== 7 | 8 | # Your Azure OpenAI API key 9 | OPENAI_API_KEY=your_api_key_here 10 | 11 | # Azure OpenAI endpoint URL 12 | # Format: https://your-resource-name.openai.azure.com/ 13 | AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ 14 | 15 | # Your GPT-4 Vision deployment name/ID 16 | OPENAI_DEPLOYMENT_ID=gpt-4-vision 17 | 18 | # =========================================== 19 | # OPTIONAL: API Configuration 20 | # =========================================== 21 | 22 | # OpenAI API version (default: 2024-02-15-preview) 23 | # OPENAI_API_VERSION=2024-02-15-preview 24 | 25 | # =========================================== 26 | # OPTIONAL: Performance Tuning 27 | # =========================================== 28 | 29 | # Number of pages to process per OCR request (1-10) 30 | # Lower = more accurate, Higher = faster 31 | # Default: 1 32 | # BATCH_SIZE=1 33 | 34 | # Maximum concurrent OCR API calls (1-50) 35 | # Higher values process faster but may hit rate limits 36 | # Default: 5 37 | # MAX_CONCURRENT_OCR_REQUESTS=5 38 | 39 | # Maximum concurrent PDF page conversions (1-16) 40 | # Match this to your CPU core count for best performance 41 | # Default: 4 42 | # MAX_CONCURRENT_PDF_CONVERSION=4 43 | 44 | # PDF rendering zoom factor (1-4) 45 | # Higher = better quality but larger images 46 | # Default: 2 47 | # PDF_ZOOM_FACTOR=2 48 | 49 | # =========================================== 50 | # OPTIONAL: OCR Model Parameters 51 | # =========================================== 52 | 53 | # Temperature for OCR model (0.0-2.0) 54 | # Lower = more deterministic output 55 | # Default: 0.1 56 | # OCR_TEMPERATURE=0.1 57 | 58 | # Maximum tokens in OCR response (100-128000) 59 | # Default: 4000 60 | # OCR_MAX_TOKENS=4000 61 | 62 | # =========================================== 63 | # OPTIONAL: Retry Configuration 64 | # =========================================== 65 | 66 | # Maximum retry attempts for failed requests 67 | # Default: 10 68 | # 
MAX_RETRIES=10 69 | 70 | # Initial retry delay in seconds 71 | # Default: 1.0 72 | # RETRY_BASE_DELAY=1.0 73 | 74 | # Maximum retry delay in seconds 75 | # Default: 120.0 76 | # RETRY_MAX_DELAY=120.0 77 | 78 | # =========================================== 79 | # OPTIONAL: Server Configuration 80 | # =========================================== 81 | 82 | # Server host 83 | # Default: 0.0.0.0 84 | # HOST=0.0.0.0 85 | 86 | # Server port 87 | # Default: 8000 88 | # PORT=8000 89 | 90 | # Enable debug mode (true/false) 91 | # Default: false 92 | # DEBUG=false 93 | -------------------------------------------------------------------------------- /swift_ocr/app.py: -------------------------------------------------------------------------------- 1 | """ 2 | FastAPI application factory. 3 | 4 | Creates and configures the FastAPI application instance. 5 | """ 6 | 7 | import logging 8 | from contextlib import asynccontextmanager 9 | from typing import AsyncGenerator 10 | 11 | from fastapi import FastAPI 12 | from fastapi.middleware.cors import CORSMiddleware 13 | 14 | from swift_ocr import __version__ 15 | from swift_ocr.api.exceptions import register_exception_handlers 16 | from swift_ocr.api.router import api_router 17 | from swift_ocr.config import get_settings 18 | from swift_ocr.core.logging import setup_logging, get_logger 19 | 20 | logger = get_logger(__name__) 21 | 22 | 23 | @asynccontextmanager 24 | async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: 25 | """ 26 | Application lifespan context manager. 27 | 28 | Handles startup and shutdown events. 
def create_app() -> FastAPI:
    """
    Create and configure the FastAPI application.

    Adds CORS middleware, registers the exception handlers and mounts the
    API router.

    Returns:
        Configured FastAPI application instance
    """
    # Load settings eagerly: get_settings() instantiates Settings, which
    # raises when required values are missing, so a misconfigured
    # environment fails at app creation rather than on the first request.
    get_settings()

    app = FastAPI(
        title="Swift OCR API",
        description=(
            "LLM-powered OCR API that converts PDFs to beautifully formatted Markdown. "
            "Uses GPT-4 Vision for human-level text extraction with table preservation, "
            "header detection, and image descriptions."
        ),
        version=__version__,
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
        lifespan=lifespan,
    )

    # Add CORS middleware.
    # Credentials must stay disabled while origins are wildcarded: browsers
    # reject "Access-Control-Allow-Origin: *" on credentialed requests, and
    # allowing credentials from every origin is unsafe. Before enabling
    # credentials, replace "*" with an explicit list of origins.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],  # Configure appropriately for production
        allow_credentials=False,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Register exception handlers
    register_exception_handlers(app)

    # Include API routes
    app.include_router(api_router)

    return app
class ColoredFormatter(logging.Formatter):
    """Custom formatter with colored output for terminals.

    The level name is wrapped in an ANSI color sequence only while the
    record is being formatted; the record's ``levelname`` is restored
    afterwards.
    """

    COLORS = {
        "DEBUG": "\033[36m",     # Cyan
        "INFO": "\033[32m",      # Green
        "WARNING": "\033[33m",   # Yellow
        "ERROR": "\033[31m",     # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """Return the formatted record with a colorized level name.

        Args:
            record: The log record to format.

        Returns:
            The formatted log line.
        """
        plain_levelname = record.levelname
        color = self.COLORS.get(plain_levelname, "")
        record.levelname = f"{color}{plain_levelname}{self.RESET}"
        try:
            return super().format(record)
        finally:
            # The same LogRecord is passed to every handler. Leaving the
            # colored name in place would leak escape codes into other
            # handlers and double-wrap the name on a second format.
            record.levelname = plain_levelname
def register_exception_handlers(app: FastAPI) -> None:
    """
    Register all exception handlers with the FastAPI app.

    Handlers are registered for ``SwiftOCRError``, FastAPI's
    ``HTTPException``, Pydantic's ``ValidationError`` and, as a fallback,
    any other ``Exception``. Each one logs the error and returns a JSON
    body with a ``detail`` key.

    Args:
        app: FastAPI application instance
    """

    @app.exception_handler(SwiftOCRError)
    async def swift_ocr_error_handler(
        request: Request,
        exc: SwiftOCRError,
    ) -> JSONResponse:
        """Handle Swift OCR custom exceptions."""
        logger.error(
            f"SwiftOCRError: {exc.message}",
            extra={"context": exc.context, "status_code": exc.status_code},
        )
        return JSONResponse(
            status_code=exc.status_code,
            content={
                "detail": exc.detail,
                "error": {
                    "message": exc.message,
                    "type": type(exc).__name__,
                    # An empty context is reported as null, not {}.
                    "context": exc.context if exc.context else None,
                },
            },
        )

    @app.exception_handler(HTTPException)
    async def http_exception_handler(
        request: Request,
        exc: HTTPException,
    ) -> JSONResponse:
        """Handle FastAPI HTTP exceptions."""
        logger.error(f"HTTPException: {exc.detail}", extra={"status_code": exc.status_code})
        return JSONResponse(
            status_code=exc.status_code,
            content={"detail": exc.detail},
            # Forward headers attached to the exception (for example
            # WWW-Authenticate or Retry-After); they were previously dropped.
            headers=getattr(exc, "headers", None),
        )

    @app.exception_handler(PydanticValidationError)
    async def pydantic_validation_handler(
        request: Request,
        exc: PydanticValidationError,
    ) -> JSONResponse:
        """Handle Pydantic validation errors."""
        logger.error(f"ValidationError: {exc.errors()}")
        return JSONResponse(
            status_code=422,
            content={
                "detail": "Validation error",
                "errors": exc.errors(),
            },
        )

    @app.exception_handler(Exception)
    async def unhandled_exception_handler(
        request: Request,
        exc: Exception,
    ) -> JSONResponse:
        """Handle any unhandled exceptions."""
        # logger.exception records the traceback; the client only sees a
        # generic message.
        logger.exception(f"Unhandled exception: {exc}")
        return JSONResponse(
            status_code=500,
            content={
                "detail": "An unexpected error occurred. Please try again later.",
            },
        )
page conversions" 57 | ) 58 | 59 | # OCR Model Parameters 60 | ocr_temperature: float = Field(default=0.1, ge=0.0, le=2.0) 61 | ocr_max_tokens: int = Field(default=4000, ge=100, le=128000) 62 | ocr_top_p: float = Field(default=0.95, ge=0.0, le=1.0) 63 | 64 | # Retry Configuration 65 | max_retries: int = Field(default=10, ge=1, le=50) 66 | retry_base_delay: float = Field(default=1.0, ge=0.1) 67 | retry_max_delay: float = Field(default=120.0, ge=1.0) 68 | 69 | # PDF Configuration 70 | pdf_zoom_factor: int = Field(default=2, ge=1, le=4, description="PDF rendering zoom factor") 71 | pdf_download_timeout: int = Field(default=30, ge=5, le=300, description="PDF download timeout in seconds") 72 | 73 | # Server Configuration 74 | host: str = Field(default="0.0.0.0", description="Server host") 75 | port: int = Field(default=8000, ge=1, le=65535, description="Server port") 76 | 77 | @field_validator("azure_openai_endpoint") 78 | @classmethod 79 | def validate_endpoint(cls, v: str) -> str: 80 | """Ensure endpoint has proper format.""" 81 | if not v.startswith(("http://", "https://")): 82 | raise ValueError("Azure OpenAI endpoint must start with http:// or https://") 83 | return v.rstrip("/") 84 | 85 | @property 86 | def is_production(self) -> bool: 87 | """Check if running in production mode.""" 88 | return not self.debug 89 | 90 | 91 | @lru_cache 92 | def get_settings() -> Settings: 93 | """ 94 | Get cached settings instance. 95 | 96 | Uses lru_cache to ensure settings are only loaded once. 97 | Call `get_settings.cache_clear()` to reload. 98 | 99 | Returns: 100 | Settings: The application settings. 101 | 102 | Raises: 103 | ValidationError: If required settings are missing or invalid. 104 | """ 105 | return Settings() 106 | -------------------------------------------------------------------------------- /swift_ocr/core/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom exceptions for Swift OCR. 
class SwiftOCRError(Exception):
    """Base exception for all Swift OCR errors.

    Attributes are set from the constructor arguments: ``message``,
    ``status_code``, ``detail`` (falls back to ``message``) and ``context``
    (always a dict, never ``None``).
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int = 500,
        detail: Optional[str] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        super().__init__(message)
        self.message = message
        self.status_code = status_code
        self.detail = detail or message
        # Copy so later changes to the caller's dict do not alter the error.
        self.context = dict(context) if context else {}

    def __str__(self) -> str:
        if self.context:
            return f"{self.message} (context: {self.context})"
        return self.message


class ValidationError(SwiftOCRError):
    """Raised when input validation fails (status 400)."""

    def __init__(
        self,
        message: str,
        *,
        field: Optional[str] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        # Work on a copy: the caller's dict must not gain a "field" key.
        ctx = dict(context or {})
        if field:
            ctx["field"] = field
        super().__init__(message, status_code=400, context=ctx)


class PDFDownloadError(SwiftOCRError):
    """Raised when PDF download fails (status 400 unless overridden)."""

    def __init__(
        self,
        message: str,
        *,
        url: Optional[str] = None,
        status_code: int = 400,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        ctx = dict(context or {})
        if url:
            ctx["url"] = url
        super().__init__(message, status_code=status_code, context=ctx)


class PDFConversionError(SwiftOCRError):
    """Raised when PDF to image conversion fails (status 500)."""

    def __init__(
        self,
        message: str,
        *,
        page_number: Optional[int] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        ctx = dict(context or {})
        # "is not None" so that page 0 is still recorded.
        if page_number is not None:
            ctx["page_number"] = page_number
        super().__init__(message, status_code=500, context=ctx)


class OCRProcessingError(SwiftOCRError):
    """Raised when OCR processing fails (status 500 unless overridden)."""

    def __init__(
        self,
        message: str,
        *,
        batch_info: Optional[str] = None,
        status_code: int = 500,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        ctx = dict(context or {})
        if batch_info:
            ctx["batch_info"] = batch_info
        super().__init__(message, status_code=status_code, context=ctx)


class RateLimitError(SwiftOCRError):
    """Raised when API rate limit is exceeded (status 429)."""

    def __init__(
        self,
        message: str = "Rate limit exceeded",
        *,
        retry_after: Optional[float] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        ctx = dict(context or {})
        if retry_after is not None:
            ctx["retry_after"] = retry_after
        super().__init__(message, status_code=429, context=ctx)


# NOTE(review): this name shadows the builtin ``TimeoutError`` in any module
# that imports it. It is kept for backward compatibility.
class TimeoutError(SwiftOCRError):
    """Raised when an operation times out (status 504)."""

    def __init__(
        self,
        message: str = "Operation timed out",
        *,
        timeout_seconds: Optional[float] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        ctx = dict(context or {})
        if timeout_seconds is not None:
            ctx["timeout_seconds"] = timeout_seconds
        super().__init__(message, status_code=504, context=ctx)
def _utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    return datetime.now(timezone.utc)


class OCRStatus(str, Enum):
    """OCR processing status."""

    SUCCESS = "success"
    ERROR = "error"
    PARTIAL = "partial"


class OCRRequest(BaseModel):
    """Request model for OCR endpoint."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "url": "https://example.com/document.pdf"
            }
        }
    )

    url: Optional[HttpUrl] = Field(
        default=None,
        description="URL of the PDF to process. Provide either this or upload a file."
    )


class PageImage(BaseModel):
    """Represents a single page image with metadata."""

    page_number: int = Field(..., ge=1, description="1-indexed page number")
    data_url: str = Field(..., description="Base64-encoded image data URL")

    @property
    def image_size(self) -> int:
        """Get approximate size of the decoded image in bytes."""
        # Only the part after the "data:<mime>;base64," header is image data;
        # if there is no header, treat the whole string as the payload.
        _, separator, payload = self.data_url.partition(",")
        encoded = payload if separator else self.data_url
        # Base64 encoding increases size by ~33%
        return int(len(encoded) * 0.75)


class OCRResponse(BaseModel):
    """Response model for successful OCR processing."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "# Document Title\n\nExtracted content...",
                "status": "success",
                "pages_processed": 5,
                "processing_time_ms": 1234
            }
        }
    )

    text: str = Field(..., description="Extracted text in Markdown format")
    status: OCRStatus = Field(
        default=OCRStatus.SUCCESS,
        description="Processing status"
    )
    pages_processed: Optional[int] = Field(
        default=None,
        ge=0,
        description="Number of pages processed"
    )
    processing_time_ms: Optional[int] = Field(
        default=None,
        ge=0,
        description="Processing time in milliseconds"
    )


class ErrorDetail(BaseModel):
    """Detailed error information."""

    message: str = Field(..., description="Human-readable error message")
    code: Optional[str] = Field(default=None, description="Error code")
    context: Optional[dict] = Field(default=None, description="Additional context")


class ErrorResponse(BaseModel):
    """Response model for errors."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "detail": "No PDF file or URL provided",
                "error": {
                    "message": "No PDF file or URL provided",
                    "code": "VALIDATION_ERROR"
                }
            }
        }
    )

    detail: str = Field(..., description="Error message")
    error: Optional[ErrorDetail] = Field(default=None, description="Detailed error info")


class HealthResponse(BaseModel):
    """Response model for health check endpoint."""

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "status": "healthy",
                "version": "2.0.0",
                "timestamp": "2024-01-01T00:00:00Z"
            }
        }
    )

    status: str = Field(..., description="Health status")
    version: str = Field(..., description="Application version")
    # Timezone-aware UTC default: ``datetime.utcnow`` is deprecated and yields
    # a naive value, unlike the "Z"-suffixed timestamp shown in the example.
    timestamp: datetime = Field(
        default_factory=_utc_now,
        description="Current server time"
    )
    openai_configured: bool = Field(
        default=True,
        description="Whether OpenAI is properly configured"
    )
11 | authors = [ 12 | {name = "Yiğit Konur", email = "yigit@konur.dev"} 13 | ] 14 | maintainers = [ 15 | {name = "Yiğit Konur", email = "yigit@konur.dev"} 16 | ] 17 | keywords = [ 18 | "ocr", 19 | "pdf", 20 | "markdown", 21 | "gpt-4", 22 | "openai", 23 | "vision", 24 | "document-processing", 25 | "text-extraction", 26 | ] 27 | classifiers = [ 28 | "Development Status :: 4 - Beta", 29 | "Environment :: Web Environment", 30 | "Framework :: FastAPI", 31 | "Intended Audience :: Developers", 32 | "License :: OSI Approved :: GNU Affero General Public License v3", 33 | "Operating System :: OS Independent", 34 | "Programming Language :: Python :: 3", 35 | "Programming Language :: Python :: 3.8", 36 | "Programming Language :: Python :: 3.9", 37 | "Programming Language :: Python :: 3.10", 38 | "Programming Language :: Python :: 3.11", 39 | "Programming Language :: Python :: 3.12", 40 | "Topic :: Scientific/Engineering :: Image Recognition", 41 | "Topic :: Text Processing :: General", 42 | "Typing :: Typed", 43 | ] 44 | requires-python = ">=3.8" 45 | dependencies = [ 46 | "fastapi>=0.109.0,<1.0.0", 47 | "uvicorn[standard]>=0.27.0,<1.0.0", 48 | "python-multipart>=0.0.6,<1.0.0", 49 | "pydantic>=2.5.0,<3.0.0", 50 | "pydantic-settings>=2.1.0,<3.0.0", 51 | "python-dotenv>=1.0.0,<2.0.0", 52 | "requests>=2.31.0,<3.0.0", 53 | "PyMuPDF>=1.23.0,<2.0.0", 54 | "openai>=1.12.0,<2.0.0", 55 | ] 56 | 57 | [project.optional-dependencies] 58 | dev = [ 59 | "pytest>=7.4.0", 60 | "pytest-asyncio>=0.21.0", 61 | "pytest-cov>=4.1.0", 62 | "httpx>=0.25.0", 63 | "ruff>=0.1.0", 64 | "mypy>=1.7.0", 65 | "pre-commit>=3.5.0", 66 | ] 67 | 68 | [project.urls] 69 | Homepage = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown" 70 | Documentation = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown#readme" 71 | Repository = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown.git" 72 | Issues = 
"https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown/issues" 73 | 74 | [project.scripts] 75 | swift-ocr = "swift_ocr.__main__:main" 76 | 77 | [tool.setuptools.packages.find] 78 | where = ["."] 79 | include = ["swift_ocr*"] 80 | 81 | [tool.setuptools.package-data] 82 | swift_ocr = ["py.typed"] 83 | 84 | # Ruff configuration (linting + formatting) 85 | [tool.ruff] 86 | target-version = "py38" 87 | line-length = 100 88 | select = [ 89 | "E", # pycodestyle errors 90 | "W", # pycodestyle warnings 91 | "F", # Pyflakes 92 | "I", # isort 93 | "B", # flake8-bugbear 94 | "C4", # flake8-comprehensions 95 | "UP", # pyupgrade 96 | "ARG", # flake8-unused-arguments 97 | "SIM", # flake8-simplify 98 | ] 99 | ignore = [ 100 | "E501", # line too long (handled by formatter) 101 | "B008", # do not perform function calls in argument defaults 102 | "B904", # raise without from inside except 103 | ] 104 | 105 | [tool.ruff.isort] 106 | known-first-party = ["swift_ocr"] 107 | 108 | [tool.ruff.per-file-ignores] 109 | "__init__.py" = ["F401"] 110 | 111 | # MyPy configuration 112 | [tool.mypy] 113 | python_version = "3.8" 114 | warn_return_any = true 115 | warn_unused_ignores = true 116 | disallow_untyped_defs = true 117 | ignore_missing_imports = true 118 | 119 | [[tool.mypy.overrides]] 120 | module = "fitz.*" 121 | ignore_missing_imports = true 122 | 123 | # Pytest configuration 124 | [tool.pytest.ini_options] 125 | asyncio_mode = "auto" 126 | testpaths = ["tests"] 127 | python_files = ["test_*.py"] 128 | python_functions = ["test_*"] 129 | addopts = "-v --tb=short" 130 | 131 | # Coverage configuration 132 | [tool.coverage.run] 133 | source = ["swift_ocr"] 134 | branch = true 135 | omit = ["*/tests/*", "*/__main__.py"] 136 | 137 | [tool.coverage.report] 138 | exclude_lines = [ 139 | "pragma: no cover", 140 | "def __repr__", 141 | "raise NotImplementedError", 142 | "if TYPE_CHECKING:", 143 | "if __name__ == .__main__.:", 144 | ] 145 | 
async def retry_with_backoff(
    func: Callable[..., Any],
    *args: Any,
    max_retries: int = 10,
    base_delay: float = 1.0,
    max_delay: float = 120.0,
    retryable_exceptions: Tuple[Type[Exception], ...] = (RateLimitError, asyncio.TimeoutError),
    **kwargs: Any,
) -> Any:
    """
    Retry an async function with exponential backoff.

    The delay before retry ``k`` is ``base_delay * 2 ** (k - 1)``, capped at
    ``max_delay``. A ``RateLimitError`` carrying a truthy
    ``context["retry_after"]`` overrides the computed delay with that value.

    Args:
        func: The async function to retry
        *args: Positional arguments for the function
        max_retries: Maximum number of attempts; values below 1 are treated
            as 1 so the function is always called at least once
        base_delay: Initial delay in seconds
        max_delay: Maximum delay in seconds
        retryable_exceptions: Tuple of exceptions that should trigger a retry
        **kwargs: Keyword arguments for the function

    Returns:
        The result of the function if successful

    Raises:
        The last retryable exception if all attempts fail, or any
        non-retryable exception immediately.
    """
    # functools.partial objects and callable instances have no __name__.
    func_name = getattr(func, "__name__", repr(func))
    # Guard against a misconfigured max_retries <= 0, which previously skipped
    # the call entirely and ended in a RuntimeError.
    attempts = max(1, max_retries)

    for attempt in range(1, attempts + 1):
        try:
            return await func(*args, **kwargs)
        except retryable_exceptions as e:
            if attempt == attempts:
                logger.error(
                    f"Max retries ({max_retries}) exceeded for {func_name}",
                    extra={"error": str(e)},
                )
                raise

            # Calculate delay with exponential backoff
            delay = min(base_delay * (2 ** (attempt - 1)), max_delay)

            # If it's a rate limit error with retry_after, use that instead
            if isinstance(e, RateLimitError) and e.context.get("retry_after"):
                delay = e.context["retry_after"]

            logger.warning(
                f"Attempt {attempt}/{max_retries} failed for {func_name}. "
                f"Retrying in {delay:.1f}s...",
                extra={"error": str(e), "delay": delay},
            )

            await asyncio.sleep(delay)
        except Exception as e:
            # Non-retryable exception, raise immediately
            logger.error(
                f"Non-retryable error in {func_name}: {e}",
                extra={"error_type": type(e).__name__},
            )
            raise

    # Unreachable: every iteration either returns or raises. Kept so the
    # function never falls through with an implicit None.
    raise RuntimeError(f"Retry loop completed without result for {func_name}")


def with_retry(
    max_retries: int = 10,
    base_delay: float = 1.0,
    max_delay: float = 120.0,
    retryable_exceptions: Tuple[Type[Exception], ...] = (RateLimitError, asyncio.TimeoutError),
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Decorator for adding retry logic to async functions.

    The decorated coroutine function is executed through
    ``retry_with_backoff`` with the settings given here.

    Args:
        max_retries: Maximum number of retry attempts
        base_delay: Initial delay in seconds
        max_delay: Maximum delay in seconds
        retryable_exceptions: Tuple of exceptions that should trigger a retry

    Returns:
        Decorated function with retry logic

    Example:
        @with_retry(max_retries=5, base_delay=2.0)
        async def call_api():
            ...
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> T:
            return await retry_with_backoff(
                func,
                *args,
                max_retries=max_retries,
                base_delay=base_delay,
                max_delay=max_delay,
                retryable_exceptions=retryable_exceptions,
                **kwargs,
            )
        return wrapper
    return decorator
@router.post(
    "/ocr",
    response_model=OCRResponse,
    summary="Extract Text from PDF",
    description="Convert a PDF document to Markdown text using OCR. "
                "Provide either a file upload or a URL to a PDF.",
    responses={
        200: {"description": "Successfully extracted text"},
        400: {"description": "Invalid input (no file/URL or invalid PDF)"},
        422: {"description": "Validation error"},
        429: {"description": "Rate limit exceeded"},
        500: {"description": "Internal processing error"},
        504: {"description": "Timeout during processing"},
    },
)
async def ocr_endpoint(
    settings: SettingsDep,
    pdf_service: PDFServiceDep,
    ocr_service: OCRServiceDep,
    file: Optional[UploadFile] = File(None, description="PDF file to process"),
    url: Optional[str] = Form(None, description="URL of PDF to process"),
) -> OCRResponse:
    """
    Perform OCR on a PDF document.

    Accepts either:
    - A PDF file upload via multipart/form-data
    - A URL pointing to a PDF file

    Returns the extracted text in Markdown format.

    Raises:
        HTTPException: With the status code and detail of any ``SwiftOCRError``
            raised while processing, or status 500 for unexpected errors.
    """
    start_time = time.perf_counter()

    try:
        # Validate input
        pdf_bytes = await _get_pdf_bytes(pdf_service, file, url)

        # Convert PDF to images off the event loop (rendering is blocking work)
        logger.info("Converting PDF to images...")
        loop = asyncio.get_running_loop()
        pages = await loop.run_in_executor(
            None,
            pdf_service.convert_to_images,
            pdf_bytes,
        )

        if not pages:
            raise ValidationError("PDF contains no pages")

        # Encode images to base64
        page_images = pdf_service.encode_pages_to_base64(pages)

        # Perform OCR
        logger.info(f"Starting OCR on {len(page_images)} pages...")
        extracted_text = await ocr_service.process_pages(page_images)

        if not extracted_text:
            raise OCRProcessingError("OCR completed but no text was extracted")

        # Calculate processing time
        processing_time_ms = int((time.perf_counter() - start_time) * 1000)

        logger.info(
            f"OCR complete: {len(extracted_text):,} chars from {len(pages)} pages "
            f"in {processing_time_ms}ms"
        )

        return OCRResponse(
            text=extracted_text,
            status=OCRStatus.SUCCESS,
            pages_processed=len(pages),
            processing_time_ms=processing_time_ms,
        )

    except SwiftOCRError as e:
        logger.error(f"OCR error: {e}")
        raise HTTPException(status_code=e.status_code, detail=e.detail) from e
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Unexpected error in OCR endpoint: {e}")
        raise HTTPException(
            status_code=500,
            detail="An unexpected error occurred during OCR processing",
        ) from e


async def _get_pdf_bytes(
    pdf_service: PDFServiceDep,
    file: Optional[UploadFile],
    url: Optional[str],
) -> bytes:
    """
    Get PDF bytes from either file upload or URL.

    Args:
        pdf_service: PDF service instance
        file: Uploaded file (optional)
        url: URL to download from (optional)

    Returns:
        PDF file content as bytes

    Raises:
        ValidationError: If input is invalid
        PDFDownloadError: If download fails
    """
    # Validate that exactly one input is provided
    if not file and not url:
        raise ValidationError(
            "No PDF provided. Please upload a file or provide a URL.",
            field="file/url",
        )

    if file and url:
        raise ValidationError(
            "Please provide either a file or a URL, not both.",
            field="file/url",
        )

    if file:
        return await _read_uploaded_file(file)

    # download_pdf performs a synchronous HTTP request; run it in the default
    # executor so a slow download does not stall the event loop.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, pdf_service.download_pdf, url)


async def _read_uploaded_file(file: UploadFile) -> bytes:
    """
    Read and validate an uploaded PDF file.

    The declared content type is deliberately not enforced because some
    clients send an incorrect one; the PDF magic bytes are checked instead.

    Args:
        file: Uploaded file

    Returns:
        PDF content as bytes

    Raises:
        ValidationError: If the file cannot be read, is empty, or does not
            start with the ``%PDF`` signature
    """
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        raise ValidationError(
            f"Failed to read uploaded file: {e}",
            field="file",
        ) from e

    if not pdf_bytes:
        raise ValidationError(
            "Uploaded file is empty",
            field="file",
        )

    # Check PDF magic bytes
    if not pdf_bytes.startswith(b"%PDF"):
        raise ValidationError(
            "Uploaded file is not a valid PDF",
            field="file",
        )

    logger.info(f"Read uploaded PDF: {len(pdf_bytes):,} bytes")
    return pdf_bytes
@dataclass
class PDFPage:
    """Represents a rendered PDF page."""

    page_number: int  # 1-indexed
    image_bytes: bytes

    @property
    def size_bytes(self) -> int:
        """Size of the rendered image in bytes."""
        return len(self.image_bytes)


def _convert_single_page(args: Tuple[str, int, int]) -> Tuple[int, bytes]:
    """
    Convert a single PDF page to PNG image bytes.

    This function runs in a separate process for parallelization.

    Args:
        args: Tuple of (pdf_path, page_index, zoom_factor)

    Returns:
        Tuple of (page_number, image_bytes) where page_number is 1-indexed

    Raises:
        PDFConversionError: If the document cannot be opened or rendered
    """
    pdf_path, page_index, zoom = args

    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_index)
            matrix = fitz.Matrix(zoom, zoom)
            pixmap = page.get_pixmap(matrix=matrix)
            image_bytes = pixmap.tobytes("png")
        finally:
            # Close the document even when rendering fails.
            doc.close()
        return (page_index + 1, image_bytes)  # Convert to 1-indexed
    except Exception as e:
        raise PDFConversionError(
            f"Failed to render page {page_index + 1}",
            page_number=page_index + 1,
            context={"error": str(e)},
        ) from e


class PDFService:
    """
    Service for PDF processing operations.

    Handles downloading, converting to images, and encoding PDFs.
    """

    def __init__(self, settings: Settings) -> None:
        """
        Initialize PDF service.

        Args:
            settings: Application settings
        """
        self.settings = settings
        # Paths of temp files created by this service and not yet removed
        self._temp_files: List[str] = []

    def download_pdf(self, url: str) -> bytes:
        """
        Download a PDF file from a URL.

        This is a blocking call (synchronous HTTP request).

        Args:
            url: URL of the PDF file

        Returns:
            PDF file content as bytes

        Raises:
            PDFDownloadError: If download fails or content is not a PDF
        """
        logger.info(f"Downloading PDF from: {url}")

        try:
            response = requests.get(
                str(url),
                timeout=self.settings.pdf_download_timeout,
                headers={"User-Agent": "SwiftOCR/2.0"},
            )
            response.raise_for_status()

            content_type = response.headers.get("Content-Type", "")
            if "application/pdf" not in content_type.lower():
                # Some servers don't set content-type correctly, check magic bytes
                if not response.content.startswith(b"%PDF"):
                    logger.warning(f"Invalid content type: {content_type}")
                    raise PDFDownloadError(
                        "URL does not point to a valid PDF file",
                        url=url,
                        context={"content_type": content_type},
                    )

            logger.info(f"Downloaded PDF: {len(response.content):,} bytes")
            return response.content

        except requests.exceptions.Timeout as e:
            raise PDFDownloadError(
                "Timeout while downloading PDF",
                url=url,
                status_code=504,
            ) from e
        except requests.exceptions.HTTPError as e:
            raise PDFDownloadError(
                f"HTTP error while downloading PDF: {e}",
                url=url,
                status_code=getattr(e.response, "status_code", 400),
            ) from e
        except requests.exceptions.RequestException as e:
            raise PDFDownloadError(
                f"Failed to download PDF: {e}",
                url=url,
            ) from e

    def convert_to_images(
        self,
        pdf_bytes: bytes,
        *,
        zoom: Optional[int] = None,
    ) -> List[PDFPage]:
        """
        Convert PDF bytes to a list of page images.

        Uses multiprocessing for parallel conversion.

        Args:
            pdf_bytes: PDF file content
            zoom: Zoom factor for rendering (default from settings)

        Returns:
            List of PDFPage objects with rendered images, ordered by page
            number. Empty when the document has no pages.

        Raises:
            PDFConversionError: If the PDF cannot be opened or a page fails
                to render
        """
        zoom = zoom or self.settings.pdf_zoom_factor

        # Save to temporary file for multiprocessing
        temp_path = self._save_to_temp_file(pdf_bytes)

        try:
            page_count = self._count_pages(temp_path)

            logger.info(f"Converting PDF with {page_count} pages (zoom={zoom}x)")

            if page_count == 0:
                # ProcessPoolExecutor rejects max_workers=0; return early so
                # the caller can report an empty document.
                return []

            # Prepare arguments for each page
            args_list = [(temp_path, i, zoom) for i in range(page_count)]
            pages: List[PDFPage] = []

            # Use multiprocessing for parallel conversion (at least one worker)
            max_workers = max(
                1,
                min(self.settings.max_concurrent_pdf_conversion, page_count),
            )

            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                future_to_page = {
                    executor.submit(_convert_single_page, args): args[1]
                    for args in args_list
                }

                for future in as_completed(future_to_page):
                    page_index = future_to_page[future]
                    try:
                        page_num, image_bytes = future.result()
                    except Exception as e:
                        logger.error(f"Failed to convert page {page_index + 1}: {e}")
                        raise PDFConversionError(
                            f"Failed to convert page {page_index + 1}",
                            page_number=page_index + 1,
                            context={"error": str(e)},
                        ) from e
                    pages.append(PDFPage(
                        page_number=page_num,
                        image_bytes=image_bytes,
                    ))

            # Futures complete in arbitrary order; restore page order
            pages.sort(key=lambda p: p.page_number)

            total_size = sum(p.size_bytes for p in pages)
            logger.info(f"Converted {len(pages)} pages, total size: {total_size:,} bytes")

            return pages

        finally:
            self._cleanup_temp_file(temp_path)

    def _count_pages(self, pdf_path: str) -> int:
        """Open the PDF at ``pdf_path`` and return its page count.

        Raises:
            PDFConversionError: If the file cannot be opened as a PDF
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                return doc.page_count
            finally:
                doc.close()
        except Exception as e:
            raise PDFConversionError(
                "Failed to open PDF",
                context={"error": str(e)},
            ) from e

    def encode_pages_to_base64(self, pages: List[PDFPage]) -> List[PageImage]:
        """
        Encode page images to base64 data URLs.

        Args:
            pages: List of PDF pages with image bytes

        Returns:
            List of PageImage objects with base64-encoded data URLs
        """
        encoded: List[PageImage] = []

        for page in pages:
            base64_str = base64.b64encode(page.image_bytes).decode("utf-8")
            data_url = f"data:image/png;base64,{base64_str}"

            encoded.append(PageImage(
                page_number=page.page_number,
                data_url=data_url,
            ))

        logger.debug(f"Encoded {len(encoded)} pages to base64")
        return encoded

    def _save_to_temp_file(self, pdf_bytes: bytes) -> str:
        """Save PDF bytes to a temporary file and return its path."""
        # delete=False: the file must outlive this handle so worker processes
        # can open it by path; it is removed by _cleanup_temp_file.
        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=".pdf",
            prefix="swift_ocr_",
        ) as f:
            f.write(pdf_bytes)
            temp_path = f.name

        self._temp_files.append(temp_path)
        logger.debug(f"Saved PDF to temp file: {temp_path}")
        return temp_path

    def _cleanup_temp_file(self, path: str) -> None:
        """Clean up a temporary file (best effort; failures are only logged)."""
        try:
            if os.path.exists(path):
                os.remove(path)
                logger.debug(f"Cleaned up temp file: {path}")
            if path in self._temp_files:
                self._temp_files.remove(path)
        except Exception as e:
            logger.warning(f"Failed to clean up temp file {path}: {e}")

    def cleanup_all(self) -> None:
        """Clean up all temporary files."""
        # Iterate over a copy: _cleanup_temp_file mutates the list
        for path in list(self._temp_files):
            self._cleanup_temp_file(path)
class OCRService:
    """
    Service for OCR processing using OpenAI Vision API.

    Handles batching, retry logic, and text extraction.
    """

    def __init__(self, settings: Settings) -> None:
        """
        Initialize OCR service.

        Args:
            settings: Application settings
        """
        self.settings = settings
        self._client: Optional[AsyncAzureOpenAI] = None

    @property
    def client(self) -> AsyncAzureOpenAI:
        """Get or create the OpenAI client (lazy initialization)."""
        if self._client is None:
            self._client = AsyncAzureOpenAI(
                azure_endpoint=self.settings.azure_openai_endpoint,
                api_version=self.settings.openai_api_version,
                api_key=self.settings.openai_api_key,
            )
        return self._client

    async def process_pages(
        self,
        pages: List[PageImage],
        *,
        batch_size: Optional[int] = None,
    ) -> str:
        """
        Process multiple pages and extract text.

        Pages are split into batches that run concurrently, limited by
        ``settings.max_concurrent_ocr_requests``. Batch outputs are joined
        in page order with blank lines.

        Args:
            pages: List of page images to process
            batch_size: Number of pages per OCR request (default from settings)

        Returns:
            Extracted text in Markdown format

        Raises:
            OCRProcessingError: If any batch fails. The status code of the
                underlying error (e.g. 429 or 502) is preserved when it has
                one; otherwise 500 is used.
        """
        batch_size = batch_size or self.settings.batch_size
        batches = self._create_batches(pages, batch_size)

        logger.info(f"Processing {len(pages)} pages in {len(batches)} batches")

        # Process batches with concurrency limit
        semaphore = asyncio.Semaphore(self.settings.max_concurrent_ocr_requests)

        async def process_with_semaphore(batch: List[PageImage]) -> str:
            async with semaphore:
                return await self._process_batch(batch)

        # return_exceptions=True: failures come back as result values, so every
        # batch finishes and the first failure (in page order) is reported.
        results = await asyncio.gather(
            *(process_with_semaphore(batch) for batch in batches),
            return_exceptions=True,
        )

        texts: List[str] = []
        for index, (batch, result) in enumerate(zip(batches, results), start=1):
            # BaseException also covers a cancelled batch (CancelledError).
            if isinstance(result, BaseException):
                logger.error(f"Batch {index} failed: {result}")
                raise OCRProcessingError(
                    f"Batch {index} failed",
                    batch_info=f"pages {batch[0].page_number}-{batch[-1].page_number}",
                    # Keep the original status instead of flattening to 500.
                    status_code=getattr(result, "status_code", 500),
                    context={"error": str(result)},
                ) from result
            texts.append(result)

        # Concatenate results
        final_text = "\n\n".join(texts)
        logger.info(f"OCR complete: {len(final_text):,} characters extracted")

        return final_text

    async def _process_batch(self, batch: List[PageImage]) -> str:
        """
        Process a single batch of pages with retry logic.

        Args:
            batch: List of page images in this batch

        Returns:
            Extracted text from the batch
        """
        page_range = f"{batch[0].page_number}-{batch[-1].page_number}"
        logger.debug(f"Processing batch: pages {page_range}")

        async def make_request() -> str:
            return await self._call_openai_api(batch)

        return await retry_with_backoff(
            make_request,
            max_retries=self.settings.max_retries,
            base_delay=self.settings.retry_base_delay,
            max_delay=self.settings.retry_max_delay,
            retryable_exceptions=(RateLimitError, asyncio.TimeoutError),
        )

    @staticmethod
    def _retry_after_seconds(error: Exception) -> Optional[float]:
        """Return a positive Retry-After delay from the error's HTTP response, if any."""
        # NOTE(review): assumes API status errors expose the HTTP response as
        # ``.response`` with a ``headers`` mapping; anything else yields None.
        headers = getattr(getattr(error, "response", None), "headers", None)
        if headers is None:
            return None
        try:
            seconds = float(headers.get("retry-after"))
        except (AttributeError, TypeError, ValueError):
            # Header missing, or given as an HTTP date rather than seconds
            return None
        return seconds if seconds > 0 else None

    async def _call_openai_api(self, batch: List[PageImage]) -> str:
        """
        Make the actual API call to OpenAI.

        Args:
            batch: List of page images to process

        Returns:
            Extracted text

        Raises:
            RateLimitError: If rate limited
            OCRProcessingError: If API call fails
        """
        messages = self._build_messages(batch)

        try:
            response = await self.client.chat.completions.create(
                model=self.settings.openai_deployment_id,
                messages=messages,
                temperature=self.settings.ocr_temperature,
                max_tokens=self.settings.ocr_max_tokens,
                top_p=self.settings.ocr_top_p,
                frequency_penalty=0,
                presence_penalty=0,
            )

            return self._extract_text_from_response(response)

        except OpenAIError as e:
            # Prefer the HTTP status when the error carries one; fall back to
            # matching the message text for errors without a status code.
            status = getattr(e, "status_code", None)
            if status is not None:
                rate_limited = status == 429
            else:
                error_str = str(e).lower()
                rate_limited = "rate limit" in error_str or "429" in error_str

            if rate_limited:
                raise RateLimitError(
                    "OpenAI rate limit exceeded",
                    retry_after=self._retry_after_seconds(e),
                    context={"error": str(e)},
                ) from e

            logger.error(f"OpenAI API error: {e}")
            raise OCRProcessingError(
                f"OCR API call failed: {e}",
                status_code=502,
            ) from e
        except asyncio.TimeoutError:
            raise  # Let retry logic handle this
        except Exception as e:
            logger.exception(f"Unexpected error during OCR: {e}")
            raise OCRProcessingError(f"Unexpected OCR error: {e}") from e

    def _build_messages(self, batch: List[PageImage]) -> List[dict]:
        """
        Build the message payload for the OpenAI API.

        Args:
            batch: List of page images

        Returns:
            List of message dictionaries
        """
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ]

        if len(batch) == 1:
            # Single page: simple format
            page = batch[0]
            messages.append({
                "role": "user",
                "content": f"Page {page.page_number}:",
            })
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": page.data_url}}
                ],
            })
        else:
            # Multiple pages: include page numbers in content
            messages.append({
                "role": "user",
                "content": "Please perform OCR on the following images. "
                           "Ensure that the extracted text includes the corresponding page numbers.",
            })

            content = []
            for page in batch:
                content.append({"type": "text", "text": f"Page {page.page_number}:"})
                content.append({"type": "image_url", "image_url": {"url": page.data_url}})

            messages.append({"role": "user", "content": content})

        return messages

    def _extract_text_from_response(self, response) -> str:
        """
        Extract text content from the API response.

        Args:
            response: OpenAI API response

        Returns:
            Extracted text, stripped of surrounding whitespace

        Raises:
            OCRProcessingError: If no text was extracted
        """
        if (
            not response.choices
            or not hasattr(response.choices[0].message, "content")
            or not response.choices[0].message.content
        ):
            raise OCRProcessingError("No text extracted from OCR response")

        text = response.choices[0].message.content.strip()
        logger.debug(f"Extracted {len(text):,} characters from response")
        return text

    def _create_batches(
        self,
        items: List[PageImage],
        batch_size: int,
    ) -> List[List[PageImage]]:
        """Split items into batches of specified size.

        Raises:
            ValueError: If ``batch_size`` is less than 1
        """
        if batch_size < 1:
            # range() gives an opaque error for step 0, and a negative step
            # would silently produce no batches at all.
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")
        return [
            items[i:i + batch_size]
            for i in range(0, len(items), batch_size)
        ]

    async def close(self) -> None:
        """Close the OpenAI client."""
        if self._client is not None:
            await self._client.close()
            self._client = None

⚡ Swift OCR ⚡

2 |

Stop squinting at PDFs. Start extracting clean markdown.

3 | 4 |

5 | 6 | The LLM-powered OCR engine that turns any PDF into beautifully formatted Markdown. It reads your documents like a human, handles messy layouts, and outputs text your AI can actually understand. 7 | 8 |

9 | 10 |

11 | 12 | python 13 | fastapi 14 |   •   15 | 16 | license 17 | platform 18 |

19 | 20 |

21 | gpt-4/5 vision 22 | markdown output 23 |

24 | 25 |
26 | 27 | ### 🧭 Quick Navigation 28 | 29 | [**⚡ Get Started**](#-get-started-in-60-seconds) • 30 | [**✨ Key Features**](#-feature-breakdown-the-secret-sauce) • 31 | [**🎮 Usage & Examples**](#-usage-fire-and-forget) • 32 | [**💰 Cost Breakdown**](#-cost-breakdown-stupidly-cheap) • 33 | [**⚙️ Configuration**](#️-configuration) • 34 | [**🏗️ Project Structure**](#️-project-structure) 35 | 36 |
37 | 38 | --- 39 | 40 | **Swift OCR** is the document processor your AI assistant wishes it had. Stop feeding your LLM screenshots and praying it reads them correctly. This tool acts like a professional transcriber, reading every page of your PDF, intelligently handling tables, headers, and mixed layouts, then packaging everything into perfectly structured Markdown so your AI can actually work with it. 41 | 42 |
43 | 44 | 45 | 50 | 55 | 60 | 61 |
46 |

🧠

47 | GPT-4 Vision
48 | Human-level reading accuracy 49 |
51 |

52 | Parallel Processing
53 | Multi-page PDFs in seconds 54 |
56 |

📝

57 | Clean Markdown
58 | Tables, headers, lists—all formatted 59 |
62 |
63 | 64 | How it slaps: 65 | - **You:** `curl -X POST "http://localhost:8000/ocr" -F "file=@messy_document.pdf"` 66 | - **Swift OCR:** Converts pages → Sends to GPT-4 Vision → Formats as Markdown 67 | - **You:** Get perfectly structured text with tables, headers, and lists intact. 68 | - **Result:** Your AI finally understands that 50-page contract. ☕ 69 | 70 | --- 71 | 72 | ## 📹 Demo 73 | 74 | https://github.com/user-attachments/assets/6b39f3ea-248e-4c29-ac2e-b57de64d5d65 75 | 76 | *Demo video showcasing the conversion of NASA's Apollo 17 flight documents—complete with unorganized, horizontally and vertically oriented pages—into well-structured Markdown format without breaking a sweat.* 77 | 78 | --- 79 | 80 | ## 💥 Why This Slaps Other Methods 81 | 82 | Manually extracting text from PDFs is a vibe-killer. Swift OCR makes traditional OCR look ancient. 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 99 | 108 | 109 |
❌ The Old Way (Pain)✅ The Swift OCR Way (Glory)
91 |
    92 |
  1. Run Tesseract. Get garbled text.
  2. 93 |
  3. Tables? What tables? Just random words now.
  4. 94 |
  5. Manually fix formatting for 2 hours.
  6. 95 |
  7. Feed broken context to your AI.
  8. 96 |
  9. Get a useless answer. Cry.
  10. 97 |
98 |
100 |
    101 |
  1. Upload PDF to Swift OCR.
  2. 102 |
  3. Get perfectly formatted Markdown.
  4. 103 |
  5. Tables intact. Headers preserved.
  6. 104 |
  7. Feed clean context to your AI.
  8. 105 |
  9. Get genius-level answers. Go grab a coffee. ☕
  10. 106 |
107 |
110 | 111 | We're not just running basic OCR. We're using **GPT-4 Vision** to actually *understand* your documents—handling rotated pages, complex tables, mixed layouts, and even describing images for accessibility. 112 | 113 | --- 114 | 115 | ## 💰 Cost Breakdown: Stupidly Cheap 116 | 117 | Our solution offers an optimal balance of affordability and accuracy that makes enterprise OCR solutions look like highway robbery. 118 | 119 |
120 | 121 | | Metric | Value | 122 | |:------:|:------| 123 | | **Avg tokens/page** | ~1,500 (including prompt) | 124 | | **GPT-4o input cost** | $5 per million tokens | 125 | | **GPT-4o output cost** | $15 per million tokens | 126 | | **Cost per 1,000 pages** | **~$15** | 127 | 128 |
129 | 130 | ### 💡 Want It Even Cheaper? 131 | 132 | | Optimization | Cost per 1,000 pages | 133 | |:------------:|:--------------------:| 134 | | **GPT-4o (default)** | ~$15 | 135 | | **GPT-4o mini** | ~$8 | 136 | | **Batch API** | ~$4 | 137 | 138 | ### 🆚 Market Comparison 139 | 140 |
141 | 142 | | Solution | Cost per 1,000 pages | Tables? | Markdown? | 143 | |:--------:|:-------------------:|:-------:|:---------:| 144 | | **Swift OCR** | **$15** | ✅ Perfect | ✅ Native | 145 | | CloudConvert (PDFTron) | ~$30 | ⚠️ Basic | ❌ No | 146 | | Adobe Acrobat API | ~$50+ | ✅ Good | ❌ No | 147 | | Tesseract (free) | $0 | ❌ Broken | ❌ No | 148 | 149 |
150 | 151 | > **Bottom line:** Half the cost of paid competitors, 10x the quality. It's not just about being cheaper—it's about getting output you can actually use. 152 | 153 | --- 154 | 155 | ## 🚀 Get Started in 60 Seconds 156 | 157 | ### Prerequisites 158 | 159 | - **Python 3.8+** 160 | - **Azure OpenAI** account (with GPT-4 Vision deployment) 161 | 162 | ### Installation 163 | 164 | ```bash 165 | # Clone the repo 166 | git clone https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown.git 167 | cd swift-ocr-llm-powered-pdf-to-markdown 168 | 169 | # Create virtual environment (recommended) 170 | python3 -m venv venv 171 | source venv/bin/activate # Windows: venv\Scripts\activate 172 | 173 | # Install dependencies 174 | pip install -r requirements.txt 175 | ``` 176 | 177 | ### Configure Environment 178 | 179 | Create a `.env` file in the root directory: 180 | 181 | ```env 182 | # Required 183 | OPENAI_API_KEY=your_openai_api_key 184 | AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/ 185 | OPENAI_DEPLOYMENT_ID=your_gpt4_vision_deployment 186 | 187 | # Optional (sensible defaults) 188 | OPENAI_API_VERSION=gpt-4o 189 | BATCH_SIZE=1 # Images per OCR request (1-10) 190 | MAX_CONCURRENT_OCR_REQUESTS=5 # Parallel OCR calls 191 | MAX_CONCURRENT_PDF_CONVERSION=4 # Parallel page rendering 192 | ``` 193 | 194 | ### Run It 195 | 196 | ```bash 197 | # Option 1: Classic uvicorn (backward compatible) 198 | uvicorn main:app --reload 199 | 200 | # Option 2: Using the new package 201 | uvicorn swift_ocr.app:app --reload 202 | 203 | # Option 3: As a Python module 204 | python -m swift_ocr 205 | 206 | # Option 4: With CLI arguments 207 | python -m swift_ocr --host 0.0.0.0 --port 8080 --workers 4 208 | ``` 209 | 210 | 🎉 **API is now live at `http://127.0.0.1:8000`** 211 | 212 | > **✨ Pro tip:** Check out the auto-generated docs at `http://127.0.0.1:8000/docs` 213 | 214 | --- 215 | 216 | ## 🎮 Usage: Fire and Forget 217 | 218 | ### API Endpoint 219 | 220 | **POST** 
`/ocr` 221 | 222 | Accept a PDF file upload OR a URL to a PDF. Returns beautifully formatted Markdown. 223 | 224 | ### Examples 225 | 226 | **Upload a PDF file:** 227 | 228 | ```bash 229 | curl -X POST "http://127.0.0.1:8000/ocr" \ 230 | -F "file=@/path/to/your/document.pdf" 231 | ``` 232 | 233 | **Process a PDF from URL:** 234 | 235 | ```bash 236 | curl -X POST "http://127.0.0.1:8000/ocr" \ 237 | -H "Content-Type: application/json" \ 238 | -d '{"url": "https://example.com/document.pdf"}' 239 | ``` 240 | 241 | ### Response 242 | 243 | ```json 244 | { 245 | "text": "# Document Title\n\n## Section 1\n\nExtracted text with **formatting** preserved...\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Data | Data |" 246 | } 247 | ``` 248 | 249 | ### Response (v2.0+) 250 | 251 | The new response includes additional metadata: 252 | 253 | ```json 254 | { 255 | "text": "# Document Title\n\n## Section 1\n\nExtracted text...", 256 | "status": "success", 257 | "pages_processed": 5, 258 | "processing_time_ms": 1234 259 | } 260 | ``` 261 | 262 | ### Health Check 263 | 264 | ```bash 265 | curl http://127.0.0.1:8000/health 266 | ``` 267 | 268 | ```json 269 | { 270 | "status": "healthy", 271 | "version": "2.0.0", 272 | "timestamp": "2024-01-01T00:00:00Z", 273 | "openai_configured": true 274 | } 275 | ``` 276 | 277 | ### Error Codes 278 | 279 | | Code | Meaning | 280 | |:----:|:--------| 281 | | `200` | Success—Markdown text returned | 282 | | `400` | Bad request (no file/URL, or both provided) | 283 | | `422` | Validation error | 284 | | `429` | Rate limited—retry with backoff | 285 | | `500` | Processing error | 286 | | `504` | Timeout downloading PDF | 287 | 288 | --- 289 | 290 | ## ✨ Feature Breakdown: The Secret Sauce 291 | 292 |
293 | 294 | | Feature | What It Does | Why You Care | 295 | | :---: | :--- | :--- | 296 | | **🧠 GPT-4 Vision**
`Human-level OCR` | Uses OpenAI's most capable vision model to read documents | Actually understands context, not just character shapes | 297 | | **⚡ Parallel Processing**
`Multiprocessing + async` | Converts PDF pages and calls OCR in parallel | 50-page PDF in seconds, not minutes | 298 | | **📊 Table Preservation**
`Markdown tables` | Detects and formats tables as proper Markdown | Your data stays structured, not flattened to gibberish | 299 | | **🔄 Smart Batching**
`Configurable batch size` | Groups pages to optimize API calls vs accuracy | Balance speed and cost for your use case | 300 | | **🛡️ Retry with Backoff**
`Exponential backoff` | Automatically retries on rate limits and timeouts | Handles API hiccups without crashing | 301 | | **📄 Flexible Input**
`File upload or URL` | Accept PDFs directly or fetch from any URL | Works with your existing workflow | 302 | | **🖼️ Image Descriptions**
`Accessibility-friendly` | Describes non-text elements: `[Image: description]` | Context your AI can actually use | 303 | 304 |
305 | 306 | --- 307 | 308 | ## ⚙️ Configuration 309 | 310 | All settings are managed via environment variables. Tune these for your workload: 311 | 312 |
313 | 314 | | Variable | Default | Description | 315 | |:---------|:-------:|:------------| 316 | | `OPENAI_API_KEY` | — | Your Azure OpenAI API key | 317 | | `AZURE_OPENAI_ENDPOINT` | — | Your Azure OpenAI endpoint URL | 318 | | `OPENAI_DEPLOYMENT_ID` | — | Your GPT-4 Vision deployment ID | 319 | | `OPENAI_API_VERSION` | `gpt-4o` | API version | 320 | | `BATCH_SIZE` | `1` | Pages per OCR request (1-10). Higher = faster but less accurate | 321 | | `MAX_CONCURRENT_OCR_REQUESTS` | `5` | Parallel OCR calls. Increase for throughput | 322 | | `MAX_CONCURRENT_PDF_CONVERSION` | `4` | Parallel page renders. Match your CPU cores | 323 | 324 |
325 | 326 | ### Performance Tuning Tips 327 | 328 | - **High accuracy, slower:** `BATCH_SIZE=1` 329 | - **Balanced:** `BATCH_SIZE=5`, `MAX_CONCURRENT_OCR_REQUESTS=10` 330 | - **Maximum throughput:** `BATCH_SIZE=10`, `MAX_CONCURRENT_OCR_REQUESTS=20` (watch rate limits!) 331 | 332 | --- 333 | 334 | ## 🏗️ Project Structure 335 | 336 | World-class Python engineering with atomic modules and clean separation of concerns: 337 | 338 | ``` 339 | swift_ocr/ 340 | ├── __init__.py # Package init with version 341 | ├── __main__.py # CLI entry point (python -m swift_ocr) 342 | ├── app.py # FastAPI app factory 343 | ├── config/ 344 | │ ├── __init__.py 345 | │ └── settings.py # Pydantic Settings (type-safe config) 346 | ├── core/ 347 | │ ├── __init__.py 348 | │ ├── exceptions.py # Custom exception hierarchy 349 | │ ├── logging.py # Structured logging setup 350 | │ └── retry.py # Exponential backoff utilities 351 | ├── schemas/ 352 | │ ├── __init__.py 353 | │ └── ocr.py # Pydantic request/response models 354 | ├── services/ 355 | │ ├── __init__.py 356 | │ ├── ocr.py # OpenAI Vision OCR service 357 | │ └── pdf.py # PDF conversion service 358 | └── api/ 359 | ├── __init__.py 360 | ├── deps.py # Dependency injection 361 | ├── exceptions.py # FastAPI exception handlers 362 | ├── router.py # Route aggregation 363 | └── routes/ 364 | ├── __init__.py 365 | ├── health.py # Health check endpoints 366 | └── ocr.py # OCR endpoints 367 | ``` 368 | 369 |
370 | Key architectural decisions 371 | 372 | | Pattern | Implementation | Benefit | 373 | | :--- | :--- | :--- | 374 | | **Pydantic Settings** | `config/settings.py` | Type-safe config with `.env` support and validation | 375 | | **Dependency Injection** | `api/deps.py` | Testable, swappable services | 376 | | **Custom Exceptions** | `core/exceptions.py` | Rich error context with proper HTTP status codes | 377 | | **Retry with Backoff** | `core/retry.py` | Handles rate limits and transient failures | 378 | | **App Factory** | `app.py` | Configurable app creation for testing | 379 | | **Typed Throughout** | `py.typed` marker | Full mypy compatibility | 380 | 381 |
382 | 383 | --- 384 | 385 | ## 🔥 Common Issues & Quick Fixes 386 | 387 |
388 | Expand for troubleshooting tips 389 | 390 | | Problem | Solution | 391 | | :--- | :--- | 392 | | **"Missing required environment variables"** | Check your `.env` file has all three required variables: `OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT`, `OPENAI_DEPLOYMENT_ID` | 393 | | **Rate limit errors (429)** | Reduce `MAX_CONCURRENT_OCR_REQUESTS` or `BATCH_SIZE`. The retry logic will handle temporary limits automatically. | 394 | | **Timeout errors** | Large PDFs take time. The system has exponential backoff built in—give it a moment. | 395 | | **Garbled output** | Make sure your PDF isn't password-protected or corrupted. Try opening it locally first. | 396 | | **Tables not formatting correctly** | Some extremely complex tables may need `BATCH_SIZE=1` for best accuracy. | 397 | | **"Failed to initialize OpenAI client"** | Verify your Azure endpoint URL format: `https://your-resource.openai.azure.com/` | 398 | 399 |
400 | 401 | --- 402 | 403 | ## 📜 License 404 | 405 | This project uses **PyMuPDF** for PDF processing. PyMuPDF is licensed under the **GNU AGPL v3.0**, so this project is distributed under the same license. 406 | 407 | > **Want MIT instead?** Fork this project and swap PyMuPDF for `pdf2image` + Poppler. The rest of the code is yours to use freely. 408 | 409 | ``` 410 | GNU AFFERO GENERAL PUBLIC LICENSE 411 | Version 3, 19 November 2007 412 | 413 | Copyright (C) 2024 Yiğit Konur 414 | ``` 415 | 416 | See [LICENSE.md](LICENSE.md) for the full license text. 417 | 418 | --- 419 | 420 |
421 | 422 | **Built with 🔥 because manually transcribing PDFs is a soul-crushing waste of time.** 423 | 424 | [Report Bug](https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown/issues) • 425 | [Request Feature](https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown/issues) 426 | 427 |
428 | --------------------------------------------------------------------------------