├── swift_ocr
├── py.typed
├── api
│ ├── routes
│ │ ├── __init__.py
│ │ ├── health.py
│ │ └── ocr.py
│ ├── __init__.py
│ ├── router.py
│ ├── deps.py
│ └── exceptions.py
├── config
│ ├── __init__.py
│ └── settings.py
├── services
│ ├── __init__.py
│ ├── pdf.py
│ └── ocr.py
├── schemas
│ ├── __init__.py
│ └── ocr.py
├── __init__.py
├── core
│ ├── __init__.py
│ ├── logging.py
│ ├── exceptions.py
│ └── retry.py
├── __main__.py
└── app.py
├── main.py
├── requirements.txt
├── .gitignore
├── LICENSE.md
├── .env.example
├── pyproject.toml
└── README.md
/swift_ocr/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/swift_ocr/api/routes/__init__.py:
--------------------------------------------------------------------------------
1 | """API route modules."""
2 |
3 | from swift_ocr.api.routes import health, ocr
4 |
5 | __all__ = ["health", "ocr"]
6 |
--------------------------------------------------------------------------------
/swift_ocr/config/__init__.py:
--------------------------------------------------------------------------------
1 | """Configuration module for Swift OCR."""
2 |
3 | from swift_ocr.config.settings import Settings, get_settings
4 |
5 | __all__ = ["Settings", "get_settings"]
6 |
--------------------------------------------------------------------------------
/swift_ocr/services/__init__.py:
--------------------------------------------------------------------------------
1 | """Services for Swift OCR."""
2 |
3 | from swift_ocr.services.pdf import PDFService
4 | from swift_ocr.services.ocr import OCRService
5 |
6 | __all__ = ["PDFService", "OCRService"]
7 |
--------------------------------------------------------------------------------
/swift_ocr/api/__init__.py:
--------------------------------------------------------------------------------
1 | """API module for Swift OCR."""
2 |
3 | from swift_ocr.api.deps import get_ocr_service, get_pdf_service, get_settings
4 | from swift_ocr.api.router import api_router
5 |
6 | __all__ = [
7 | "api_router",
8 | "get_ocr_service",
9 | "get_pdf_service",
10 | "get_settings",
11 | ]
12 |
--------------------------------------------------------------------------------
/swift_ocr/api/router.py:
--------------------------------------------------------------------------------
1 | """
2 | API router aggregating all route modules.
3 | """
4 |
5 | from fastapi import APIRouter
6 |
7 | from swift_ocr.api.routes import health, ocr
8 |
# Aggregate router: every route module is attached to this single object,
# which the application factory then includes on the FastAPI app.
api_router = APIRouter()

# Include route modules, tagging each module's routes with a group name.
api_router.include_router(health.router, tags=["Health"])
api_router.include_router(ocr.router, tags=["OCR"])
14 |
--------------------------------------------------------------------------------
/swift_ocr/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | """Request and response schemas for Swift OCR."""
2 |
3 | from swift_ocr.schemas.ocr import (
4 | OCRRequest,
5 | OCRResponse,
6 | OCRStatus,
7 | HealthResponse,
8 | PageImage,
9 | )
10 |
11 | __all__ = [
12 | "OCRRequest",
13 | "OCRResponse",
14 | "OCRStatus",
15 | "HealthResponse",
16 | "PageImage",
17 | ]
18 |
--------------------------------------------------------------------------------
/swift_ocr/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Swift OCR - LLM-powered PDF to Markdown converter.
3 |
4 | A high-performance OCR engine that uses GPT-4 Vision to convert PDFs
5 | into beautifully formatted Markdown.
6 | """
7 |
8 | __version__ = "2.0.0"
9 | __author__ = "Yiğit Konur"
10 | __license__ = "AGPL-3.0"
11 |
12 | from swift_ocr.app import create_app
13 |
14 | __all__ = ["create_app", "__version__"]
15 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Swift OCR - Entry point for backward compatibility.
3 |
4 | This file provides backward compatibility with the original API.
5 | The refactored code is in the swift_ocr package.
6 |
7 | Usage:
8 | uvicorn main:app --reload
9 |
10 | Or use the new package directly:
11 | uvicorn swift_ocr.app:app --reload
12 | python -m swift_ocr
13 | """
14 |
15 | from swift_ocr.app import app
16 |
17 | __all__ = ["app"]
18 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # Swift OCR Dependencies
2 | # Python 3.9+ required (the code uses typing.Annotated and built-in generics such as dict[str, Any])
3 |
4 | # Web Framework
5 | fastapi>=0.109.0,<1.0.0
6 | uvicorn[standard]>=0.27.0,<1.0.0
7 | python-multipart>=0.0.6,<1.0.0
8 |
9 | # Configuration
10 | pydantic>=2.5.0,<3.0.0
11 | pydantic-settings>=2.1.0,<3.0.0
12 | python-dotenv>=1.0.0,<2.0.0
13 |
14 | # HTTP Client
15 | requests>=2.31.0,<3.0.0
16 |
17 | # PDF Processing
18 | PyMuPDF>=1.23.0,<2.0.0
19 |
20 | # OpenAI
21 | openai>=1.12.0,<2.0.0
22 |
--------------------------------------------------------------------------------
/swift_ocr/core/__init__.py:
--------------------------------------------------------------------------------
1 | """Core utilities for Swift OCR."""
2 |
3 | from swift_ocr.core.exceptions import (
4 | SwiftOCRError,
5 | PDFDownloadError,
6 | PDFConversionError,
7 | OCRProcessingError,
8 | RateLimitError,
9 | ValidationError,
10 | )
11 | from swift_ocr.core.logging import get_logger, setup_logging
12 | from swift_ocr.core.retry import retry_with_backoff
13 |
14 | __all__ = [
15 | "SwiftOCRError",
16 | "PDFDownloadError",
17 | "PDFConversionError",
18 | "OCRProcessingError",
19 | "RateLimitError",
20 | "ValidationError",
21 | "get_logger",
22 | "setup_logging",
23 | "retry_with_backoff",
24 | ]
25 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | *.manifest
29 | *.spec
30 |
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 |
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .nox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 | *.py,cover
46 | .hypothesis/
47 | .pytest_cache/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Environments
54 | .env
55 | .venv
56 | env/
57 | venv/
58 | ENV/
59 | env.bak/
60 | venv.bak/
61 |
62 | # IDE
63 | .idea/
64 | .vscode/
65 | *.swp
66 | *.swo
67 | *~
68 |
69 | # macOS
70 | .DS_Store
71 | .AppleDouble
72 | .LSOverride
73 | ._*
74 |
75 | # Thumbnails
76 | Thumbs.db
77 | ehthumbs.db
78 |
79 | # Project specific
80 | *.pdf
81 | *.log
82 | temp/
83 | tmp/
84 |
--------------------------------------------------------------------------------
/swift_ocr/api/deps.py:
--------------------------------------------------------------------------------
1 | """
2 | FastAPI dependencies for dependency injection.
3 |
4 | Provides service instances and settings to route handlers.
5 | """
6 |
7 | from functools import lru_cache
8 | from typing import Annotated, Generator
9 |
10 | from fastapi import Depends
11 |
12 | from swift_ocr.config import Settings
13 | from swift_ocr.config.settings import get_settings as _get_settings
14 | from swift_ocr.services.ocr import OCRService
15 | from swift_ocr.services.pdf import PDFService
16 |
17 |
def get_settings() -> Settings:
    """
    Provide the application settings to route handlers.

    Thin dependency wrapper that delegates to the settings accessor imported
    from ``swift_ocr.config.settings`` under the alias ``_get_settings``.

    Returns:
        Settings: The settings object returned by the underlying accessor.
    """
    current_settings = _get_settings()
    return current_settings
21 |
22 |
# Memoised PDF services keyed by ``id(settings)``. Each entry stores the
# settings object next to the service so the id cannot be recycled by another
# object while the entry is alive.
_pdf_service_cache = {}


def get_pdf_service(
    settings: Annotated[Settings, Depends(get_settings)],
) -> PDFService:
    """
    Get the PDF service instance for the given settings.

    One service is created per settings object and reused on later calls.
    The cache is keyed by object identity rather than by hash: ``Settings``
    is a Pydantic model that is not declared frozen, so it is unhashable and
    a ``functools.lru_cache`` wrapper raises ``TypeError`` as soon as the
    dependency is resolved.

    Args:
        settings: Application settings injected by FastAPI.

    Returns:
        PDFService: The service bound to ``settings``.
    """
    cached = _pdf_service_cache.get(id(settings))
    # The identity check guards against a stale entry whose id was reused.
    if cached is None or cached[0] is not settings:
        cached = (settings, PDFService(settings))
        _pdf_service_cache[id(settings)] = cached
    return cached[1]


# Preserve the ``cache_clear`` hook the previous ``lru_cache`` wrapper exposed.
get_pdf_service.cache_clear = _pdf_service_cache.clear  # type: ignore[attr-defined]
33 |
34 |
# Memoised OCR services keyed by ``id(settings)``. Each entry stores the
# settings object next to the service so the id cannot be recycled by another
# object while the entry is alive.
_ocr_service_cache = {}


def get_ocr_service(
    settings: Annotated[Settings, Depends(get_settings)],
) -> OCRService:
    """
    Get the OCR service instance for the given settings.

    One service is created per settings object and reused on later calls.
    The cache is keyed by object identity rather than by hash: ``Settings``
    is a Pydantic model that is not declared frozen, so it is unhashable and
    a ``functools.lru_cache`` wrapper raises ``TypeError`` as soon as the
    dependency is resolved.

    Args:
        settings: Application settings injected by FastAPI.

    Returns:
        OCRService: The service bound to ``settings``.
    """
    cached = _ocr_service_cache.get(id(settings))
    # The identity check guards against a stale entry whose id was reused.
    if cached is None or cached[0] is not settings:
        cached = (settings, OCRService(settings))
        _ocr_service_cache[id(settings)] = cached
    return cached[1]


# Preserve the ``cache_clear`` hook the previous ``lru_cache`` wrapper exposed.
get_ocr_service.cache_clear = _ocr_service_cache.clear  # type: ignore[attr-defined]
45 |
46 |
# Type aliases for cleaner dependency injection.
# A route handler annotates a parameter with one of these (for example
# ``settings: SettingsDep``) instead of repeating ``Annotated[..., Depends(...)]``.
SettingsDep = Annotated[Settings, Depends(get_settings)]
PDFServiceDep = Annotated[PDFService, Depends(get_pdf_service)]
OCRServiceDep = Annotated[OCRService, Depends(get_ocr_service)]
51 |
--------------------------------------------------------------------------------
/swift_ocr/api/routes/health.py:
--------------------------------------------------------------------------------
1 | """
2 | Health check endpoints.
3 |
4 | Provides endpoints for monitoring application health.
5 | """
6 |
7 | from datetime import datetime
8 |
9 | from fastapi import APIRouter
10 |
11 | from swift_ocr import __version__
12 | from swift_ocr.api.deps import SettingsDep
13 | from swift_ocr.schemas import HealthResponse
14 |
15 | router = APIRouter()
16 |
17 |
@router.get(
    "/health",
    response_model=HealthResponse,
    summary="Health Check",
    description="Check if the service is healthy and properly configured.",
)
async def health_check(settings: SettingsDep) -> HealthResponse:
    """
    Perform a health check.

    Args:
        settings: Application settings injected by FastAPI.

    Returns:
        HealthResponse with the package version, a timezone-aware UTC
        timestamp, and a flag telling whether the API key, endpoint and
        deployment id are all non-empty.
    """
    # Local import keeps this block independent of the module-level imports.
    from datetime import timezone

    is_openai_configured = bool(
        settings.openai_api_key
        and settings.azure_openai_endpoint
        and settings.openai_deployment_id
    )
    return HealthResponse(
        status="healthy",
        version=__version__,
        # datetime.utcnow() is deprecated and returns a naive value; an aware
        # timestamp makes the UTC offset explicit.
        timestamp=datetime.now(timezone.utc),
        openai_configured=is_openai_configured,
    )
41 |
42 |
@router.get(
    "/",
    response_model=HealthResponse,
    summary="Root Health Check",
    description="Root endpoint returning health status.",
    include_in_schema=False,
)
async def root_health(settings: SettingsDep) -> HealthResponse:
    """
    Serve the health status at the root path.

    Delegates to ``health_check`` and returns its response as-is; no HTTP
    redirect is issued.

    Args:
        settings: Application settings injected by FastAPI.

    Returns:
        HealthResponse produced by ``health_check``.
    """
    health_response = await health_check(settings)
    return health_response
53 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 |
4 | Copyright (C) 2024 Yiğit Konur
5 |
6 | This program is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU Affero General Public License as published
8 | by the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | This program is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU Affero General Public License for more details.
15 |
16 | You should have received a copy of the GNU Affero General Public License
17 | along with this program. If not, see <https://www.gnu.org/licenses/>.
18 |
19 | Also add information on how to contact you by electronic and paper mail.
20 |
21 | If your software can interact with users remotely through a computer
22 | network, you should also make sure that it provides a way for users to
23 | get its source. For example, if your program is a web application, its
24 | interface could display a "Source" link that leads users to an archive
25 | of the code. There are many ways you could offer source, and different
26 | solutions will be better for different programs; see section 13 for the
27 | specific requirements.
28 |
29 | You should also get your employer (if you work as a programmer) or school,
30 | if any, to sign a "copyright disclaimer" for the program, if necessary.
31 | For more information on this, and how to apply and follow the GNU AGPL, see
32 | <https://www.gnu.org/licenses/>.
33 |
--------------------------------------------------------------------------------
/swift_ocr/__main__.py:
--------------------------------------------------------------------------------
1 | """
2 | Entry point for running Swift OCR as a module.
3 |
4 | Usage:
5 | python -m swift_ocr
6 | python -m swift_ocr --host 0.0.0.0 --port 8080
7 | """
8 |
9 | import argparse
10 | import sys
11 |
12 |
def main(argv: "list[str] | None" = None) -> int:
    """
    Main entry point for CLI.

    Args:
        argv: Command-line arguments to parse, excluding the program name.
            When ``None`` (the default) argparse reads ``sys.argv[1:]``, so
            existing ``main()`` callers behave as before.

    Returns:
        Process exit status: 0 on success or keyboard interrupt, 1 when the
        server could not be started or stopped with an error.

    Raises:
        SystemExit: Raised by argparse for ``--help`` and for invalid
            arguments.
    """
    parser = argparse.ArgumentParser(
        prog="swift-ocr",
        description="Swift OCR - LLM-powered PDF to Markdown converter",
    )
    parser.add_argument(
        "--host",
        type=str,
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to bind to (default: 8000)",
    )
    parser.add_argument(
        "--reload",
        action="store_true",
        help="Enable auto-reload for development",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=1,
        help="Number of worker processes (default: 1)",
    )
    parser.add_argument(
        "--version",
        action="store_true",
        help="Show version and exit",
    )

    args = parser.parse_args(argv)

    if args.version:
        # Imported lazily so that argument parsing does not load the package.
        from swift_ocr import __version__

        print(f"Swift OCR v{__version__}")
        return 0

    # Auto-reload always runs a single worker; say so instead of silently
    # discarding the requested worker count.
    worker_count = args.workers
    if args.reload:
        if worker_count != 1:
            print(
                "Warning: --workers is ignored when --reload is enabled",
                file=sys.stderr,
            )
        worker_count = 1

    try:
        import uvicorn

        uvicorn.run(
            "swift_ocr.app:app",
            host=args.host,
            port=args.port,
            reload=args.reload,
            workers=worker_count,
            log_level="info",
        )
        return 0
    except KeyboardInterrupt:
        print("\nShutting down...")
        return 0
    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        print(f"Error: {e}", file=sys.stderr)
        return 1
73 |
74 |
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
77 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Swift OCR Configuration
2 | # Copy this file to .env and fill in your values
3 |
4 | # ===========================================
5 | # REQUIRED: Azure OpenAI Configuration
6 | # ===========================================
7 |
8 | # Your Azure OpenAI API key
9 | OPENAI_API_KEY=your_api_key_here
10 |
11 | # Azure OpenAI endpoint URL
12 | # Format: https://your-resource-name.openai.azure.com/
13 | AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
14 |
15 | # Your GPT-4 Vision deployment name/ID
16 | OPENAI_DEPLOYMENT_ID=gpt-4-vision
17 |
18 | # ===========================================
19 | # OPTIONAL: API Configuration
20 | # ===========================================
21 |
22 | # OpenAI API version (default: 2024-02-15-preview)
23 | # OPENAI_API_VERSION=2024-02-15-preview
24 |
25 | # ===========================================
26 | # OPTIONAL: Performance Tuning
27 | # ===========================================
28 |
29 | # Number of pages to process per OCR request (1-10)
30 | # Lower = more accurate, Higher = faster
31 | # Default: 1
32 | # BATCH_SIZE=1
33 |
34 | # Maximum concurrent OCR API calls (1-50)
35 | # Higher values process faster but may hit rate limits
36 | # Default: 5
37 | # MAX_CONCURRENT_OCR_REQUESTS=5
38 |
39 | # Maximum concurrent PDF page conversions (1-16)
40 | # Match this to your CPU core count for best performance
41 | # Default: 4
42 | # MAX_CONCURRENT_PDF_CONVERSION=4
43 |
44 | # PDF rendering zoom factor (1-4)
45 | # Higher = better quality but larger images
46 | # Default: 2
47 | # PDF_ZOOM_FACTOR=2
48 |
49 | # ===========================================
50 | # OPTIONAL: OCR Model Parameters
51 | # ===========================================
52 |
53 | # Temperature for OCR model (0.0-2.0)
54 | # Lower = more deterministic output
55 | # Default: 0.1
56 | # OCR_TEMPERATURE=0.1
57 |
58 | # Maximum tokens in OCR response (100-128000)
59 | # Default: 4000
60 | # OCR_MAX_TOKENS=4000
61 |
62 | # ===========================================
63 | # OPTIONAL: Retry Configuration
64 | # ===========================================
65 |
66 | # Maximum retry attempts for failed requests
67 | # Default: 10
68 | # MAX_RETRIES=10
69 |
70 | # Initial retry delay in seconds
71 | # Default: 1.0
72 | # RETRY_BASE_DELAY=1.0
73 |
74 | # Maximum retry delay in seconds
75 | # Default: 120.0
76 | # RETRY_MAX_DELAY=120.0
77 |
78 | # ===========================================
79 | # OPTIONAL: Server Configuration
80 | # ===========================================
81 |
82 | # Server host
83 | # Default: 0.0.0.0
84 | # HOST=0.0.0.0
85 |
86 | # Server port
87 | # Default: 8000
88 | # PORT=8000
89 |
90 | # Enable debug mode (true/false)
91 | # Default: false
92 | # DEBUG=false
93 |
--------------------------------------------------------------------------------
/swift_ocr/app.py:
--------------------------------------------------------------------------------
1 | """
2 | FastAPI application factory.
3 |
4 | Creates and configures the FastAPI application instance.
5 | """
6 |
7 | import logging
8 | from contextlib import asynccontextmanager
9 | from typing import AsyncGenerator
10 |
11 | from fastapi import FastAPI
12 | from fastapi.middleware.cors import CORSMiddleware
13 |
14 | from swift_ocr import __version__
15 | from swift_ocr.api.exceptions import register_exception_handlers
16 | from swift_ocr.api.router import api_router
17 | from swift_ocr.config import get_settings
18 | from swift_ocr.core.logging import setup_logging, get_logger
19 |
20 | logger = get_logger(__name__)
21 |
22 |
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    Run startup and shutdown steps around the application's lifetime.

    Before yielding, logging is configured (DEBUG when ``settings.debug`` is
    set, INFO otherwise) and a summary of the active configuration is logged.
    After the yield a shutdown message is logged.

    Args:
        app: The FastAPI application; not used by this function.

    Yields:
        None. Control returns to the framework while the app is serving.
    """
    settings = get_settings()

    # Startup: configure logging first so the messages below use the new handler.
    if settings.debug:
        setup_logging(level=logging.DEBUG)
    else:
        setup_logging(level=logging.INFO)

    startup_messages = (
        f"Starting Swift OCR v{__version__}",
        f"Debug mode: {settings.debug}",
        f"OpenAI endpoint: {settings.azure_openai_endpoint}",
        f"Batch size: {settings.batch_size}",
        f"Max concurrent OCR requests: {settings.max_concurrent_ocr_requests}",
    )
    for message in startup_messages:
        logger.info(message)

    yield

    # Shutdown
    logger.info("Shutting down Swift OCR")
45 |
46 |
def create_app() -> FastAPI:
    """
    Build the FastAPI application.

    Loads the settings, creates the app with its metadata and lifespan
    handler, then attaches CORS middleware, the exception handlers and the
    API router.

    Returns:
        Configured FastAPI application instance
    """
    # The returned settings are not used here. The call is kept so settings
    # are loaded (and cached) when the app is created; per its docstring,
    # get_settings() raises if required values are missing or invalid.
    get_settings()

    api_description = (
        "LLM-powered OCR API that converts PDFs to beautifully formatted Markdown. "
        "Uses GPT-4 Vision for human-level text extraction with table preservation, "
        "header detection, and image descriptions."
    )
    application = FastAPI(
        title="Swift OCR API",
        description=api_description,
        version=__version__,
        docs_url="/docs",
        redoc_url="/redoc",
        openapi_url="/openapi.json",
        lifespan=lifespan,
    )

    # CORS: configure appropriately for production.
    # NOTE(review): wildcard origins together with allow_credentials=True is
    # a combination the FastAPI CORS documentation advises against; list
    # explicit origins or disable credentials before deploying.
    cors_options = {
        "allow_origins": ["*"],
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    application.add_middleware(CORSMiddleware, **cors_options)

    # Error handlers first, then the routes.
    register_exception_handlers(application)
    application.include_router(api_router)

    return application
86 |
87 |
# Create default app instance for uvicorn.
# This runs at import time; it is the object referenced by the import string
# "swift_ocr.app:app".
app = create_app()
90 |
--------------------------------------------------------------------------------
/swift_ocr/core/logging.py:
--------------------------------------------------------------------------------
1 | """
2 | Logging configuration for Swift OCR.
3 |
4 | Provides structured logging with configurable levels and formats.
5 | """
6 |
7 | import logging
8 | import sys
9 | from functools import lru_cache
10 | from typing import Optional
11 |
12 |
class ColoredFormatter(logging.Formatter):
    """
    Custom formatter with colored output for terminals.

    The level name is wrapped in an ANSI color sequence for the duration of
    one ``format`` call only. The record is restored afterwards, so other
    handlers that receive the same record see the plain level name and
    formatting the record twice does not stack escape codes.

    NOTE: the escape codes count toward any width in the format string
    (for example ``%(levelname)-8s``), so padding is not applied to the
    visible text when colors are active.
    """

    # ANSI color prefix per standard level name.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    # Sequence that switches the terminal back to its default style.
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        """
        Format ``record`` with a colored level name.

        Args:
            record: The log record to render.

        Returns:
            The formatted log line. Level names without an entry in
            ``COLORS`` get no color prefix but still get the reset suffix.
        """
        plain_levelname = record.levelname
        color = self.COLORS.get(plain_levelname, "")
        record.levelname = f"{color}{plain_levelname}{self.RESET}"
        try:
            return super().format(record)
        finally:
            # The record object is shared between handlers; undo the change.
            record.levelname = plain_levelname
30 |
31 |
def setup_logging(
    level: int = logging.INFO,
    *,
    use_colors: bool = True,
    log_format: Optional[str] = None,
) -> None:
    """
    Install a single stdout handler on the root logger.

    Existing root handlers are removed first, so calling this repeatedly does
    not duplicate output. The loggers of a few third-party libraries are
    raised to WARNING to cut noise.

    Args:
        level: Logging level for the root logger (default: INFO)
        use_colors: Use the colored formatter when stdout is a TTY
        log_format: Custom log format string; a default is used when falsy
    """
    date_format = "%Y-%m-%d %H:%M:%S"
    default_format = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
    pattern = log_format or default_format

    stream_handler = logging.StreamHandler(sys.stdout)

    # Colors only make sense on an interactive terminal.
    wants_color = use_colors and sys.stdout.isatty()
    formatter_cls = ColoredFormatter if wants_color else logging.Formatter
    stream_handler.setFormatter(formatter_cls(pattern, datefmt=date_format))

    root = logging.getLogger()
    root.setLevel(level)
    # Drop whatever was installed before so each record is emitted once.
    root.handlers.clear()
    root.addHandler(stream_handler)

    # Reduce noise from third-party libraries.
    for noisy_name in ("httpx", "httpcore", "openai", "urllib3"):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)
72 |
73 |
@lru_cache
def get_logger(name: str) -> logging.Logger:
    """
    Look up the logger registered under ``name``.

    The result of ``logging.getLogger`` is memoised with ``lru_cache``, so
    repeated requests for the same name return the cached object.

    Args:
        name: Logger name (typically __name__)

    Returns:
        logging.Logger: The logger for ``name``
    """
    named_logger = logging.getLogger(name)
    return named_logger
88 |
--------------------------------------------------------------------------------
/swift_ocr/api/exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | Exception handlers for FastAPI.
3 |
4 | Provides consistent error responses across the application.
5 | """
6 |
7 | from fastapi import FastAPI, HTTPException, Request
8 | from fastapi.responses import JSONResponse
9 | from pydantic import ValidationError as PydanticValidationError
10 |
11 | from swift_ocr.core.exceptions import SwiftOCRError
12 | from swift_ocr.core.logging import get_logger
13 |
14 | logger = get_logger(__name__)
15 |
16 |
def register_exception_handlers(app: FastAPI) -> None:
    """
    Register all exception handlers with the FastAPI app.

    Four handlers are installed: one for ``SwiftOCRError``, one for
    ``HTTPException``, one for Pydantic ``ValidationError`` and a catch-all
    for any other ``Exception``. Each logs the error and returns a JSON body
    with a ``detail`` key.

    Args:
        app: FastAPI application instance
    """
    # Local import keeps this block independent of the module-level imports.
    from fastapi.encoders import jsonable_encoder

    @app.exception_handler(SwiftOCRError)
    async def swift_ocr_error_handler(
        request: Request,
        exc: SwiftOCRError,
    ) -> JSONResponse:
        """Handle Swift OCR custom exceptions."""
        logger.error(
            f"SwiftOCRError: {exc.message}",
            extra={"context": exc.context, "status_code": exc.status_code},
        )
        return JSONResponse(
            status_code=exc.status_code,
            content={
                "detail": exc.detail,
                "error": {
                    "message": exc.message,
                    "type": type(exc).__name__,
                    # An empty context is reported as null rather than {}.
                    "context": exc.context if exc.context else None,
                },
            },
        )

    @app.exception_handler(HTTPException)
    async def http_exception_handler(
        request: Request,
        exc: HTTPException,
    ) -> JSONResponse:
        """Handle FastAPI HTTP exceptions."""
        logger.error(f"HTTPException: {exc.detail}", extra={"status_code": exc.status_code})
        return JSONResponse(
            status_code=exc.status_code,
            content={"detail": exc.detail},
            # Forward headers attached to the exception (for example
            # WWW-Authenticate or Retry-After) instead of dropping them.
            headers=exc.headers,
        )

    @app.exception_handler(PydanticValidationError)
    async def pydantic_validation_handler(
        request: Request,
        exc: PydanticValidationError,
    ) -> JSONResponse:
        """Handle Pydantic validation errors."""
        errors = exc.errors()
        logger.error(f"ValidationError: {errors}")
        return JSONResponse(
            status_code=422,
            content={
                "detail": "Validation error",
                # Error entries may carry values the JSON encoder cannot
                # serialise directly (such as exception objects in "ctx").
                "errors": jsonable_encoder(errors),
            },
        )

    @app.exception_handler(Exception)
    async def unhandled_exception_handler(
        request: Request,
        exc: Exception,
    ) -> JSONResponse:
        """Handle any unhandled exceptions."""
        # The traceback goes to the log; the client gets a generic message.
        logger.exception(f"Unhandled exception: {exc}")
        return JSONResponse(
            status_code=500,
            content={
                "detail": "An unexpected error occurred. Please try again later.",
            },
        )
87 |
--------------------------------------------------------------------------------
/swift_ocr/config/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Application settings using Pydantic Settings.
3 |
4 | Supports loading from environment variables and .env files.
5 | """
6 |
7 | from functools import lru_cache
8 | from typing import Optional
9 |
10 | from pydantic import Field, field_validator
11 | from pydantic_settings import BaseSettings, SettingsConfigDict
12 |
13 |
class Settings(BaseSettings):
    """
    Validated application configuration.

    Values are read from environment variables and from a ``.env`` file.
    Variable names are matched case-insensitively and unknown variables are
    ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    # --- API configuration -------------------------------------------------
    app_name: str = Field("Swift OCR", description="Application name")
    app_version: str = Field("2.0.0", description="Application version")
    debug: bool = Field(False, description="Enable debug mode")

    # --- OpenAI / Azure OpenAI configuration (first three are required) ----
    openai_api_key: str = Field(..., description="OpenAI API key")
    azure_openai_endpoint: str = Field(..., description="Azure OpenAI endpoint URL")
    openai_deployment_id: str = Field(..., description="OpenAI deployment/model ID")
    openai_api_version: str = Field(
        "2024-02-15-preview",
        description="OpenAI API version",
    )

    # --- OCR configuration ---------------------------------------------------
    batch_size: int = Field(
        1,
        ge=1,
        le=10,
        description="Number of pages to process per OCR request (1-10)",
    )
    max_concurrent_ocr_requests: int = Field(
        5,
        ge=1,
        le=50,
        description="Maximum concurrent OCR API calls",
    )
    max_concurrent_pdf_conversion: int = Field(
        4,
        ge=1,
        le=16,
        description="Maximum concurrent PDF page conversions",
    )

    # --- OCR model parameters ----------------------------------------------
    ocr_temperature: float = Field(0.1, ge=0.0, le=2.0)
    ocr_max_tokens: int = Field(4000, ge=100, le=128000)
    ocr_top_p: float = Field(0.95, ge=0.0, le=1.0)

    # --- Retry configuration -------------------------------------------------
    max_retries: int = Field(10, ge=1, le=50)
    retry_base_delay: float = Field(1.0, ge=0.1)
    retry_max_delay: float = Field(120.0, ge=1.0)

    # --- PDF configuration ---------------------------------------------------
    pdf_zoom_factor: int = Field(
        2,
        ge=1,
        le=4,
        description="PDF rendering zoom factor",
    )
    pdf_download_timeout: int = Field(
        30,
        ge=5,
        le=300,
        description="PDF download timeout in seconds",
    )

    # --- Server configuration ----------------------------------------------
    host: str = Field("0.0.0.0", description="Server host")
    port: int = Field(8000, ge=1, le=65535, description="Server port")

    @field_validator("azure_openai_endpoint")
    @classmethod
    def validate_endpoint(cls, v: str) -> str:
        """Require an http(s) scheme and strip trailing slashes."""
        has_scheme = v.startswith("http://") or v.startswith("https://")
        if has_scheme:
            return v.rstrip("/")
        raise ValueError("Azure OpenAI endpoint must start with http:// or https://")

    @property
    def is_production(self) -> bool:
        """Report production mode, defined here as debug being off."""
        debug_enabled = self.debug
        return not debug_enabled
89 |
90 |
@lru_cache
def get_settings() -> Settings:
    """
    Return the shared settings instance.

    The first call builds a ``Settings`` object; ``lru_cache`` then hands the
    same object back on every later call. Use ``get_settings.cache_clear()``
    to force a reload.

    Returns:
        Settings: The application settings.

    Raises:
        ValidationError: If required settings are missing or invalid.
    """
    loaded_settings = Settings()
    return loaded_settings
106 |
--------------------------------------------------------------------------------
/swift_ocr/core/exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | Custom exceptions for Swift OCR.
3 |
4 | Provides a hierarchy of exceptions for better error handling and reporting.
5 | """
6 |
7 | from typing import Any, Optional
8 |
9 |
class SwiftOCRError(Exception):
    """
    Base exception for all Swift OCR errors.

    Attributes are set in ``__init__``: ``message``, ``status_code``,
    ``detail`` (falls back to ``message``) and ``context`` (a dict, empty
    when none was given).
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int = 500,
        detail: Optional[str] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            status_code: HTTP status code associated with the error.
            detail: Optional detail text; defaults to ``message``.
            context: Optional extra data about the error. It is copied, so
                the caller's dict is never shared with the exception.
        """
        super().__init__(message)
        self.message = message
        self.status_code = status_code
        self.detail = detail or message
        self.context = dict(context) if context else {}

    def __str__(self) -> str:
        """Return the message, followed by the context when it is non-empty."""
        if self.context:
            return f"{self.message} (context: {self.context})"
        return self.message


class ValidationError(SwiftOCRError):
    """Raised when input validation fails (status code 400)."""

    def __init__(
        self,
        message: str,
        *,
        field: Optional[str] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            field: Name of the offending field; stored under ``"field"``
                when truthy.
            context: Optional extra data; not modified.
        """
        # Copy before adding keys so the caller's dict is left untouched.
        ctx = dict(context or {})
        if field:
            ctx["field"] = field
        super().__init__(message, status_code=400, context=ctx)


class PDFDownloadError(SwiftOCRError):
    """Raised when PDF download fails (status code 400 by default)."""

    def __init__(
        self,
        message: str,
        *,
        url: Optional[str] = None,
        status_code: int = 400,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            url: URL that failed; stored under ``"url"`` when truthy.
            status_code: HTTP status code associated with the error.
            context: Optional extra data; not modified.
        """
        # Copy before adding keys so the caller's dict is left untouched.
        ctx = dict(context or {})
        if url:
            ctx["url"] = url
        super().__init__(message, status_code=status_code, context=ctx)


class PDFConversionError(SwiftOCRError):
    """Raised when PDF to image conversion fails (status code 500)."""

    def __init__(
        self,
        message: str,
        *,
        page_number: Optional[int] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            page_number: Page that failed; stored under ``"page_number"``
                unless it is ``None`` (so 0 is kept).
            context: Optional extra data; not modified.
        """
        # Copy before adding keys so the caller's dict is left untouched.
        ctx = dict(context or {})
        if page_number is not None:
            ctx["page_number"] = page_number
        super().__init__(message, status_code=500, context=ctx)


class OCRProcessingError(SwiftOCRError):
    """Raised when OCR processing fails (status code 500 by default)."""

    def __init__(
        self,
        message: str,
        *,
        batch_info: Optional[str] = None,
        status_code: int = 500,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            batch_info: Description of the failing batch; stored under
                ``"batch_info"`` when truthy.
            status_code: HTTP status code associated with the error.
            context: Optional extra data; not modified.
        """
        # Copy before adding keys so the caller's dict is left untouched.
        ctx = dict(context or {})
        if batch_info:
            ctx["batch_info"] = batch_info
        super().__init__(message, status_code=status_code, context=ctx)


class RateLimitError(SwiftOCRError):
    """Raised when API rate limit is exceeded (status code 429)."""

    def __init__(
        self,
        message: str = "Rate limit exceeded",
        *,
        retry_after: Optional[float] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            retry_after: Suggested wait; stored under ``"retry_after"``
                unless it is ``None`` (so 0 is kept).
            context: Optional extra data; not modified.
        """
        # Copy before adding keys so the caller's dict is left untouched.
        ctx = dict(context or {})
        if retry_after is not None:
            ctx["retry_after"] = retry_after
        super().__init__(message, status_code=429, context=ctx)


class TimeoutError(SwiftOCRError):
    """
    Raised when an operation times out (status code 504).

    NOTE: this name shadows the built-in ``TimeoutError`` in any module that
    imports it directly; it is kept for compatibility with existing callers.
    """

    def __init__(
        self,
        message: str = "Operation timed out",
        *,
        timeout_seconds: Optional[float] = None,
        context: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            message: Human-readable error message.
            timeout_seconds: Timeout that elapsed; stored under
                ``"timeout_seconds"`` unless it is ``None``.
            context: Optional extra data; not modified.
        """
        # Copy before adding keys so the caller's dict is left untouched.
        ctx = dict(context or {})
        if timeout_seconds is not None:
            ctx["timeout_seconds"] = timeout_seconds
        super().__init__(message, status_code=504, context=ctx)
129 |
--------------------------------------------------------------------------------
/swift_ocr/schemas/ocr.py:
--------------------------------------------------------------------------------
1 | """
2 | Pydantic schemas for OCR requests and responses.
3 |
4 | Provides validated data models for the API.
5 | """
6 |
from datetime import datetime, timezone
from enum import Enum
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, HttpUrl
12 |
13 |
class OCRStatus(str, Enum):
    """Outcome of an OCR run.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    SUCCESS = "success"  # processing finished without error
    ERROR = "error"  # processing failed
    PARTIAL = "partial"  # only part of the input was processed
20 |
21 |
class OCRRequest(BaseModel):
    """Body of an OCR request that references a PDF by URL.

    The ``url`` field is optional and validated as an HTTP(S) URL.
    """

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {"url": "https://example.com/document.pdf"},
        },
    )

    url: Optional[HttpUrl] = Field(
        default=None,
        description="URL of the PDF to process. Provide either this or upload a file.",
    )
37 |
38 |
class PageImage(BaseModel):
    """One rendered page, carried as a base64 data URL plus its page number."""

    page_number: int = Field(
        ...,
        ge=1,
        description="1-indexed page number",
    )
    data_url: str = Field(
        ...,
        description="Base64-encoded image data URL",
    )

    @property
    def image_size(self) -> int:
        """Estimate the decoded image size in bytes.

        The estimate is taken over the whole ``data_url`` string, so the
        ``data:...;base64,`` prefix is counted as well.
        """
        # Base64 inflates data by roughly a third, so scale back by 3/4.
        encoded_length = len(self.data_url)
        return int(encoded_length * 0.75)
50 |
51 |
class OCRResponse(BaseModel):
    """Payload returned when OCR processing succeeds.

    Carries the extracted Markdown together with optional statistics about
    the run (page count and elapsed time).
    """

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "text": "# Document Title\n\nExtracted content...",
                "status": "success",
                "pages_processed": 5,
                "processing_time_ms": 1234,
            },
        },
    )

    text: str = Field(
        ...,
        description="Extracted text in Markdown format",
    )
    status: OCRStatus = Field(
        default=OCRStatus.SUCCESS,
        description="Processing status",
    )
    pages_processed: Optional[int] = Field(
        default=None,
        ge=0,
        description="Number of pages processed",
    )
    processing_time_ms: Optional[int] = Field(
        default=None,
        ge=0,
        description="Processing time in milliseconds",
    )
81 |
82 |
class ErrorDetail(BaseModel):
    """Structured description of a single error.

    Only ``message`` is required; ``code`` and ``context`` default to None.
    """

    message: str = Field(
        ...,
        description="Human-readable error message",
    )
    code: Optional[str] = Field(
        default=None,
        description="Error code",
    )
    context: Optional[dict] = Field(
        default=None,
        description="Additional context",
    )
89 |
90 |
class ErrorResponse(BaseModel):
    """Payload returned when a request fails.

    ``detail`` holds the error message; ``error`` optionally carries the
    structured ``ErrorDetail``.
    """

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "detail": "No PDF file or URL provided",
                "error": {
                    "message": "No PDF file or URL provided",
                    "code": "VALIDATION_ERROR",
                },
            },
        },
    )

    detail: str = Field(
        ...,
        description="Error message",
    )
    error: Optional[ErrorDetail] = Field(
        default=None,
        description="Detailed error info",
    )
108 |
109 |
class HealthResponse(BaseModel):
    """Response model for health check endpoint.

    ``timestamp`` defaults to the current time as a timezone-aware UTC
    datetime, so it serializes with an explicit UTC offset as in the
    schema example.
    """

    # Example payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "status": "healthy",
                "version": "2.0.0",
                "timestamp": "2024-01-01T00:00:00Z"
            }
        }
    )

    status: str = Field(..., description="Health status")
    version: str = Field(..., description="Application version")
    timestamp: datetime = Field(
        # datetime.utcnow() is deprecated and yields a naive value; build an
        # aware UTC timestamp instead.
        default_factory=lambda: datetime.now(timezone.utc),
        description="Current server time"
    )
    openai_configured: bool = Field(
        default=True,
        description="Whether OpenAI is properly configured"
    )
133 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=68.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "swift-ocr"
7 | version = "2.0.0"
8 | description = "LLM-powered OCR engine that converts PDFs to beautifully formatted Markdown"
9 | readme = "README.md"
10 | license = {text = "AGPL-3.0"}
11 | authors = [
12 | {name = "Yiğit Konur", email = "yigit@konur.dev"}
13 | ]
14 | maintainers = [
15 | {name = "Yiğit Konur", email = "yigit@konur.dev"}
16 | ]
17 | keywords = [
18 | "ocr",
19 | "pdf",
20 | "markdown",
21 | "gpt-4",
22 | "openai",
23 | "vision",
24 | "document-processing",
25 | "text-extraction",
26 | ]
27 | classifiers = [
28 | "Development Status :: 4 - Beta",
29 | "Environment :: Web Environment",
30 | "Framework :: FastAPI",
31 | "Intended Audience :: Developers",
32 | "License :: OSI Approved :: GNU Affero General Public License v3",
33 | "Operating System :: OS Independent",
34 | "Programming Language :: Python :: 3",
35 | "Programming Language :: Python :: 3.8",
36 | "Programming Language :: Python :: 3.9",
37 | "Programming Language :: Python :: 3.10",
38 | "Programming Language :: Python :: 3.11",
39 | "Programming Language :: Python :: 3.12",
40 | "Topic :: Scientific/Engineering :: Image Recognition",
41 | "Topic :: Text Processing :: General",
42 | "Typing :: Typed",
43 | ]
44 | requires-python = ">=3.8"
45 | dependencies = [
46 | "fastapi>=0.109.0,<1.0.0",
47 | "uvicorn[standard]>=0.27.0,<1.0.0",
48 | "python-multipart>=0.0.6,<1.0.0",
49 | "pydantic>=2.5.0,<3.0.0",
50 | "pydantic-settings>=2.1.0,<3.0.0",
51 | "python-dotenv>=1.0.0,<2.0.0",
52 | "requests>=2.31.0,<3.0.0",
53 | "PyMuPDF>=1.23.0,<2.0.0",
54 | "openai>=1.12.0,<2.0.0",
55 | ]
56 |
57 | [project.optional-dependencies]
58 | dev = [
59 | "pytest>=7.4.0",
60 | "pytest-asyncio>=0.21.0",
61 | "pytest-cov>=4.1.0",
62 | "httpx>=0.25.0",
63 | "ruff>=0.1.0",
64 | "mypy>=1.7.0",
65 | "pre-commit>=3.5.0",
66 | ]
67 |
68 | [project.urls]
69 | Homepage = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown"
70 | Documentation = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown#readme"
71 | Repository = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown.git"
72 | Issues = "https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown/issues"
73 |
74 | [project.scripts]
75 | swift-ocr = "swift_ocr.__main__:main"
76 |
77 | [tool.setuptools.packages.find]
78 | where = ["."]
79 | include = ["swift_ocr*"]
80 |
81 | [tool.setuptools.package-data]
82 | swift_ocr = ["py.typed"]
83 |
84 | # Ruff configuration (linting + formatting)
85 | [tool.ruff]
86 | target-version = "py38"
87 | line-length = 100
88 | select = [
89 | "E", # pycodestyle errors
90 | "W", # pycodestyle warnings
91 | "F", # Pyflakes
92 | "I", # isort
93 | "B", # flake8-bugbear
94 | "C4", # flake8-comprehensions
95 | "UP", # pyupgrade
96 | "ARG", # flake8-unused-arguments
97 | "SIM", # flake8-simplify
98 | ]
99 | ignore = [
100 | "E501", # line too long (handled by formatter)
101 | "B008", # do not perform function calls in argument defaults
102 | "B904", # raise without from inside except
103 | ]
104 |
105 | [tool.ruff.isort]
106 | known-first-party = ["swift_ocr"]
107 |
108 | [tool.ruff.per-file-ignores]
109 | "__init__.py" = ["F401"]
110 |
111 | # MyPy configuration
112 | [tool.mypy]
113 | python_version = "3.8"
114 | warn_return_any = true
115 | warn_unused_ignores = true
116 | disallow_untyped_defs = true
117 | ignore_missing_imports = true
118 |
119 | [[tool.mypy.overrides]]
120 | module = "fitz.*"
121 | ignore_missing_imports = true
122 |
123 | # Pytest configuration
124 | [tool.pytest.ini_options]
125 | asyncio_mode = "auto"
126 | testpaths = ["tests"]
127 | python_files = ["test_*.py"]
128 | python_functions = ["test_*"]
129 | addopts = "-v --tb=short"
130 |
131 | # Coverage configuration
132 | [tool.coverage.run]
133 | source = ["swift_ocr"]
134 | branch = true
135 | omit = ["*/tests/*", "*/__main__.py"]
136 |
137 | [tool.coverage.report]
138 | exclude_lines = [
139 | "pragma: no cover",
140 | "def __repr__",
141 | "raise NotImplementedError",
142 | "if TYPE_CHECKING:",
143 | "if __name__ == .__main__.:",
144 | ]
145 |
--------------------------------------------------------------------------------
/swift_ocr/core/retry.py:
--------------------------------------------------------------------------------
1 | """
2 | Retry utilities with exponential backoff.
3 |
4 | Provides decorators and functions for retrying failed operations.
5 | """
6 |
7 | import asyncio
8 | from functools import wraps
9 | from typing import Any, Callable, Optional, Tuple, Type, TypeVar
10 |
11 | from swift_ocr.core.exceptions import RateLimitError, SwiftOCRError
12 | from swift_ocr.core.logging import get_logger
13 |
# Module-level logger, requested from the project's logging helper under this
# module's name.
logger = get_logger(__name__)

# Type variable used in the signature of the with_retry decorator below.
T = TypeVar("T")
17 |
18 |
async def retry_with_backoff(
    func: Callable[..., Any],
    *args: Any,
    max_retries: int = 10,
    base_delay: float = 1.0,
    max_delay: float = 120.0,
    retryable_exceptions: Tuple[Type[Exception], ...] = (RateLimitError, asyncio.TimeoutError),
    **kwargs: Any,
) -> Any:
    """
    Retry an async function with exponential backoff.

    The function is always attempted at least once, even when
    ``max_retries`` is zero or negative.

    Args:
        func: The async function to retry
        *args: Positional arguments for the function
        max_retries: Maximum number of attempts (values below 1 are treated as 1)
        base_delay: Initial delay in seconds
        max_delay: Maximum delay in seconds for the computed backoff
        retryable_exceptions: Tuple of exceptions that should trigger a retry
        **kwargs: Keyword arguments for the function

    Returns:
        The result of the function if successful

    Raises:
        The last retryable exception if all attempts fail, or the first
        non-retryable exception immediately.
    """
    # Callables such as functools.partial objects have no __name__; fall back
    # to repr() so that logging can never mask the real error.
    func_name = getattr(func, "__name__", repr(func))

    # Guarantee one attempt so a non-positive max_retries cannot skip the call.
    attempts = max(1, max_retries)

    for attempt in range(1, attempts + 1):
        try:
            return await func(*args, **kwargs)
        except retryable_exceptions as e:
            if attempt == attempts:
                logger.error(
                    f"Max retries ({attempts}) exceeded for {func_name}",
                    extra={"error": str(e)},
                )
                raise

            # Calculate delay with exponential backoff
            delay = min(base_delay * (2 ** (attempt - 1)), max_delay)

            # A rate limit error carrying a truthy retry_after overrides the
            # computed backoff (and is not capped by max_delay).
            if isinstance(e, RateLimitError) and e.context.get("retry_after"):
                delay = e.context["retry_after"]

            logger.warning(
                f"Attempt {attempt}/{attempts} failed for {func_name}. "
                f"Retrying in {delay:.1f}s...",
                extra={"error": str(e), "delay": delay},
            )

            await asyncio.sleep(delay)
        except Exception as e:
            # Non-retryable exception, raise immediately
            logger.error(
                f"Non-retryable error in {func_name}: {e}",
                extra={"error_type": type(e).__name__},
            )
            raise

    # Defensive: every loop iteration either returns or raises.
    raise RuntimeError(f"Retry loop completed without result for {func_name}")
87 |
88 |
def with_retry(
    max_retries: int = 10,
    base_delay: float = 1.0,
    max_delay: float = 120.0,
    retryable_exceptions: Tuple[Type[Exception], ...] = (RateLimitError, asyncio.TimeoutError),
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Build a decorator that runs an async function through retry_with_backoff.

    Args:
        max_retries: Maximum number of retry attempts
        base_delay: Initial delay in seconds
        max_delay: Maximum delay in seconds
        retryable_exceptions: Tuple of exceptions that should trigger a retry

    Returns:
        A decorator; the decorated coroutine function forwards its arguments
        to retry_with_backoff together with the options given here.

    Example:
        @with_retry(max_retries=5, base_delay=2.0)
        async def call_api():
            ...
    """

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @wraps(func)
        async def wrapper(*call_args: Any, **call_kwargs: Any) -> T:
            # Delegate the call; the retry options come from the enclosing
            # with_retry() invocation.
            outcome = await retry_with_backoff(
                func,
                *call_args,
                max_retries=max_retries,
                base_delay=base_delay,
                max_delay=max_delay,
                retryable_exceptions=retryable_exceptions,
                **call_kwargs,
            )
            return outcome

        return wrapper

    return decorator
126 |
--------------------------------------------------------------------------------
/swift_ocr/api/routes/ocr.py:
--------------------------------------------------------------------------------
1 | """
2 | OCR API endpoints.
3 |
4 | Provides endpoints for PDF to Markdown conversion.
5 | """
6 |
7 | import asyncio
8 | import time
9 | from typing import Optional
10 |
11 | from fastapi import APIRouter, File, Form, HTTPException, UploadFile
12 |
13 | from swift_ocr.api.deps import OCRServiceDep, PDFServiceDep, SettingsDep
14 | from swift_ocr.core.exceptions import (
15 | OCRProcessingError,
16 | PDFConversionError,
17 | PDFDownloadError,
18 | SwiftOCRError,
19 | ValidationError,
20 | )
21 | from swift_ocr.core.logging import get_logger
22 | from swift_ocr.schemas import OCRRequest, OCRResponse, OCRStatus
23 |
# Module-level logger, requested from the project's logging helper under this
# module's name.
logger = get_logger(__name__)

# Router on which this module's OCR endpoint is registered.
router = APIRouter()
27 |
28 |
@router.post(
    "/ocr",
    response_model=OCRResponse,
    summary="Extract Text from PDF",
    description="Convert a PDF document to Markdown text using OCR. "
    "Provide either a file upload or a URL to a PDF.",
    responses={
        200: {"description": "Successfully extracted text"},
        400: {"description": "Invalid input (no file/URL or invalid PDF)"},
        422: {"description": "Validation error"},
        429: {"description": "Rate limit exceeded"},
        500: {"description": "Internal processing error"},
        504: {"description": "Timeout during processing"},
    },
)
async def ocr_endpoint(
    settings: SettingsDep,
    pdf_service: PDFServiceDep,
    ocr_service: OCRServiceDep,
    file: Optional[UploadFile] = File(None, description="PDF file to process"),
    url: Optional[str] = Form(None, description="URL of PDF to process"),
) -> OCRResponse:
    """
    Perform OCR on a PDF document.

    Accepts either:
    - A PDF file upload via multipart/form-data
    - A URL pointing to a PDF file

    Returns the extracted text in Markdown format.

    Raises:
        HTTPException: With the status code and detail of any SwiftOCRError
            raised while processing, or 500 for unexpected failures.
    """
    start_time = time.perf_counter()

    try:
        # Validate input
        pdf_bytes = await _get_pdf_bytes(pdf_service, file, url)

        # Convert PDF to images in the default executor so the blocking
        # conversion does not stall the event loop. get_running_loop() is the
        # recommended way to obtain the loop from inside a coroutine.
        logger.info("Converting PDF to images...")
        loop = asyncio.get_running_loop()
        pages = await loop.run_in_executor(
            None,
            pdf_service.convert_to_images,
            pdf_bytes,
        )

        if not pages:
            raise ValidationError("PDF contains no pages")

        # Encode images to base64
        page_images = pdf_service.encode_pages_to_base64(pages)

        # Perform OCR
        logger.info(f"Starting OCR on {len(page_images)} pages...")
        extracted_text = await ocr_service.process_pages(page_images)

        if not extracted_text:
            raise OCRProcessingError("OCR completed but no text was extracted")

        # Calculate processing time
        processing_time_ms = int((time.perf_counter() - start_time) * 1000)

        logger.info(
            f"OCR complete: {len(extracted_text):,} chars from {len(pages)} pages "
            f"in {processing_time_ms}ms"
        )

        return OCRResponse(
            text=extracted_text,
            status=OCRStatus.SUCCESS,
            pages_processed=len(pages),
            processing_time_ms=processing_time_ms,
        )

    except SwiftOCRError as e:
        # Domain errors carry their own HTTP status and detail.
        logger.error(f"OCR error: {e}")
        raise HTTPException(status_code=e.status_code, detail=e.detail) from e
    except HTTPException:
        raise
    except Exception as e:
        # Anything else is reported with a generic message; the traceback is
        # logged here rather than sent to the client.
        logger.exception(f"Unexpected error in OCR endpoint: {e}")
        raise HTTPException(
            status_code=500,
            detail="An unexpected error occurred during OCR processing",
        ) from e
114 |
115 |
async def _get_pdf_bytes(
    pdf_service: PDFServiceDep,
    file: Optional[UploadFile],
    url: Optional[str],
) -> bytes:
    """
    Get PDF bytes from either file upload or URL.

    Args:
        pdf_service: PDF service instance
        file: Uploaded file (optional)
        url: URL to download from (optional)

    Returns:
        PDF file content as bytes

    Raises:
        ValidationError: If neither or both inputs are provided, or the
            uploaded file is invalid
        PDFDownloadError: If download fails

    NOTE(review): ``url`` is caller-supplied and fetched server-side; consider
    restricting allowed hosts to avoid requests to internal addresses.
    """
    # Validate that exactly one input is provided
    if not file and not url:
        raise ValidationError(
            "No PDF provided. Please upload a file or provide a URL.",
            field="file/url",
        )

    if file and url:
        raise ValidationError(
            "Please provide either a file or a URL, not both.",
            field="file/url",
        )

    if file:
        return await _read_uploaded_file(file)

    # download_pdf performs a blocking HTTP request; run it in the default
    # executor so other requests keep being served while it waits.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, pdf_service.download_pdf, url)
153 |
154 |
async def _read_uploaded_file(file: UploadFile) -> bytes:
    """
    Read and validate an uploaded PDF file.

    The declared content type is deliberately not checked, because some
    clients send an incorrect one; the PDF magic bytes decide validity.

    Args:
        file: Uploaded file

    Returns:
        PDF content as bytes

    Raises:
        ValidationError: If the file cannot be read, is empty, or does not
            start with the PDF magic bytes
    """
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        raise ValidationError(
            f"Failed to read uploaded file: {e}",
            field="file",
        ) from e

    if not pdf_bytes:
        raise ValidationError(
            "Uploaded file is empty",
            field="file",
        )

    # Check PDF magic bytes
    if not pdf_bytes.startswith(b"%PDF"):
        raise ValidationError(
            "Uploaded file is not a valid PDF",
            field="file",
        )

    logger.info(f"Read uploaded PDF: {len(pdf_bytes):,} bytes")
    return pdf_bytes
197 |
--------------------------------------------------------------------------------
/swift_ocr/services/pdf.py:
--------------------------------------------------------------------------------
1 | """
2 | PDF processing service.
3 |
4 | Handles PDF download, conversion to images, and encoding.
5 | """
6 |
7 | import base64
8 | import os
9 | import tempfile
10 | from concurrent.futures import ProcessPoolExecutor, as_completed
11 | from dataclasses import dataclass
12 | from typing import List, Optional, Tuple
13 |
14 | import fitz # PyMuPDF
15 | import requests
16 |
17 | from swift_ocr.config import Settings
18 | from swift_ocr.core.exceptions import PDFConversionError, PDFDownloadError
19 | from swift_ocr.core.logging import get_logger
20 | from swift_ocr.schemas import PageImage
21 |
# Module-level logger, requested from the project's logging helper under this
# module's name.
logger = get_logger(__name__)
23 |
24 |
@dataclass
class PDFPage:
    """A single PDF page after rendering to image bytes."""

    # 1-indexed position of the page within its document.
    page_number: int
    # Raw bytes of the rendered image.
    image_bytes: bytes

    @property
    def size_bytes(self) -> int:
        """Length of the rendered image in bytes."""
        rendered = self.image_bytes
        return len(rendered)
35 |
36 |
def _convert_single_page(args: Tuple[str, int, int]) -> Tuple[int, bytes]:
    """
    Convert a single PDF page to PNG image bytes.

    This function runs in a separate process for parallelization, which is
    why it is a module-level function taking a single tuple argument.

    Args:
        args: Tuple of (pdf_path, page_index, zoom_factor)

    Returns:
        Tuple of (page_number, image_bytes) where page_number is 1-indexed

    Raises:
        PDFConversionError: If the document cannot be opened or the page
            cannot be rendered
    """
    pdf_path, page_index, zoom = args

    try:
        doc = fitz.open(pdf_path)
        try:
            page = doc.load_page(page_index)
            matrix = fitz.Matrix(zoom, zoom)
            pixmap = page.get_pixmap(matrix=matrix)
            image_bytes = pixmap.tobytes("png")
        finally:
            # Release the document handle even when rendering fails.
            doc.close()
        return (page_index + 1, image_bytes)  # Convert to 1-indexed
    except Exception as e:
        raise PDFConversionError(
            f"Failed to render page {page_index + 1}",
            page_number=page_index + 1,
            context={"error": str(e)},
        ) from e
65 |
66 |
class PDFService:
    """
    Service for PDF processing operations.

    Handles downloading, converting to images, and encoding PDFs. Temporary
    files created during conversion are tracked so they can be removed.
    """

    def __init__(self, settings: Settings) -> None:
        """
        Initialize PDF service.

        Args:
            settings: Application settings
        """
        self.settings = settings
        # Paths of temp files created by this instance and not yet removed.
        self._temp_files: List[str] = []

    def download_pdf(self, url: str) -> bytes:
        """
        Download a PDF file from a URL.

        This call blocks until the download finishes or times out.

        Args:
            url: URL of the PDF file

        Returns:
            PDF file content as bytes

        Raises:
            PDFDownloadError: If download fails or content is not a PDF
        """
        logger.info(f"Downloading PDF from: {url}")

        try:
            response = requests.get(
                str(url),
                timeout=self.settings.pdf_download_timeout,
                headers={"User-Agent": "SwiftOCR/2.0"},
            )
            response.raise_for_status()

            content_type = response.headers.get("Content-Type", "")
            if "application/pdf" not in content_type.lower():
                # Some servers don't set content-type correctly, check magic bytes
                if not response.content.startswith(b"%PDF"):
                    logger.warning(f"Invalid content type: {content_type}")
                    raise PDFDownloadError(
                        "URL does not point to a valid PDF file",
                        url=url,
                        context={"content_type": content_type},
                    )

            logger.info(f"Downloaded PDF: {len(response.content):,} bytes")
            return response.content

        except requests.exceptions.Timeout as e:
            raise PDFDownloadError(
                "Timeout while downloading PDF",
                url=url,
                status_code=504,
            ) from e
        except requests.exceptions.HTTPError as e:
            # Reuse the upstream status code when a response is attached.
            raise PDFDownloadError(
                f"HTTP error while downloading PDF: {e}",
                url=url,
                status_code=getattr(e.response, "status_code", 400),
            ) from e
        except requests.exceptions.RequestException as e:
            raise PDFDownloadError(
                f"Failed to download PDF: {e}",
                url=url,
            ) from e

    def convert_to_images(
        self,
        pdf_bytes: bytes,
        *,
        zoom: Optional[int] = None,
    ) -> List[PDFPage]:
        """
        Convert PDF bytes to a list of page images.

        Uses multiprocessing for parallel conversion. The PDF is written to a
        temporary file so worker processes can open it by path; the file is
        removed before returning.

        Args:
            pdf_bytes: PDF file content
            zoom: Zoom factor for rendering (default from settings)

        Returns:
            List of PDFPage objects with rendered images, ordered by page
            number. Empty when the document has no pages.

        Raises:
            PDFConversionError: If the document cannot be opened or a page
                fails to convert
        """
        zoom = zoom or self.settings.pdf_zoom_factor

        # Save to temporary file for multiprocessing
        temp_path = self._save_to_temp_file(pdf_bytes)

        try:
            # Get page count, closing the handle even if reading it fails.
            try:
                doc = fitz.open(temp_path)
                try:
                    page_count = doc.page_count
                finally:
                    doc.close()
            except Exception as e:
                logger.error(f"Failed to open PDF: {e}")
                raise PDFConversionError(
                    "Failed to open PDF document",
                    context={"error": str(e)},
                ) from e

            logger.info(f"Converting PDF with {page_count} pages (zoom={zoom}x)")

            # ProcessPoolExecutor rejects max_workers=0, so a document
            # without pages must short-circuit before a pool is created.
            if page_count <= 0:
                return []

            # Prepare arguments for each page
            args_list = [(temp_path, i, zoom) for i in range(page_count)]
            pages: List[PDFPage] = []

            # Use multiprocessing for parallel conversion; never ask for
            # fewer than one worker.
            max_workers = max(
                1,
                min(self.settings.max_concurrent_pdf_conversion, page_count),
            )

            with ProcessPoolExecutor(max_workers=max_workers) as executor:
                future_to_page = {
                    executor.submit(_convert_single_page, args): args[1]
                    for args in args_list
                }

                for future in as_completed(future_to_page):
                    page_index = future_to_page[future]
                    try:
                        page_num, image_bytes = future.result()
                    except Exception as e:
                        logger.error(f"Failed to convert page {page_index + 1}: {e}")
                        raise PDFConversionError(
                            f"Failed to convert page {page_index + 1}",
                            page_number=page_index + 1,
                            context={"error": str(e)},
                        ) from e
                    pages.append(PDFPage(
                        page_number=page_num,
                        image_bytes=image_bytes,
                    ))

            # Futures complete in arbitrary order; sort to restore page order.
            pages.sort(key=lambda p: p.page_number)

            total_size = sum(p.size_bytes for p in pages)
            logger.info(f"Converted {len(pages)} pages, total size: {total_size:,} bytes")

            return pages

        finally:
            self._cleanup_temp_file(temp_path)

    def encode_pages_to_base64(self, pages: List[PDFPage]) -> List[PageImage]:
        """
        Encode page images to base64 data URLs.

        Args:
            pages: List of PDF pages with image bytes

        Returns:
            List of PageImage objects with base64-encoded data URLs
        """
        encoded: List[PageImage] = []

        for page in pages:
            base64_str = base64.b64encode(page.image_bytes).decode("utf-8")
            data_url = f"data:image/png;base64,{base64_str}"

            encoded.append(PageImage(
                page_number=page.page_number,
                data_url=data_url,
            ))

        logger.debug(f"Encoded {len(encoded)} pages to base64")
        return encoded

    def _save_to_temp_file(self, pdf_bytes: bytes) -> str:
        """Save PDF bytes to a temporary file and return its path.

        The file is created with ``delete=False`` and recorded in
        ``_temp_files``; callers remove it via ``_cleanup_temp_file``.
        """
        with tempfile.NamedTemporaryFile(
            delete=False,
            suffix=".pdf",
            prefix="swift_ocr_",
        ) as f:
            f.write(pdf_bytes)
            temp_path = f.name

        self._temp_files.append(temp_path)
        logger.debug(f"Saved PDF to temp file: {temp_path}")
        return temp_path

    def _cleanup_temp_file(self, path: str) -> None:
        """Clean up a temporary file.

        Best effort: failures are logged as warnings and never raised.
        """
        try:
            if os.path.exists(path):
                os.remove(path)
                logger.debug(f"Cleaned up temp file: {path}")
            if path in self._temp_files:
                self._temp_files.remove(path)
        except Exception as e:
            logger.warning(f"Failed to clean up temp file {path}: {e}")

    def cleanup_all(self) -> None:
        """Clean up all temporary files."""
        # Iterate over a copy because _cleanup_temp_file mutates the list.
        for path in list(self._temp_files):
            self._cleanup_temp_file(path)
269 |
--------------------------------------------------------------------------------
/swift_ocr/services/ocr.py:
--------------------------------------------------------------------------------
1 | """
2 | OCR service using OpenAI Vision API.
3 |
4 | Handles text extraction from images using GPT-4 Vision.
5 | """
6 |
7 | import asyncio
8 | from typing import List, Optional
9 |
10 | from openai import AsyncAzureOpenAI, OpenAIError
11 |
12 | from swift_ocr.config import Settings
13 | from swift_ocr.core.exceptions import OCRProcessingError, RateLimitError
14 | from swift_ocr.core.logging import get_logger
15 | from swift_ocr.core.retry import retry_with_backoff
16 | from swift_ocr.schemas import PageImage
17 |
# Module-level logger, requested from the project's logging helper under this
# module's name.
logger = get_logger(__name__)
19 |
20 |
# System prompt for OCR. Sent as the "system" message of every request built
# by OCRService._build_messages. The text is part of runtime behavior: edits
# to the wording change what the model is asked to do.
SYSTEM_PROMPT = """You are an OCR assistant. Extract all text from the provided images (Describe images as if you're explaining them to a blind person eg: `[Image: In this picture, 8 people are posed hugging each other]`), which are attached to the document. Use markdown formatting for:

- Headings (# for main, ## for sub)
- Lists (- for unordered, 1. for ordered)
- Emphasis (* for italics, ** for bold)
- Links ([text](URL))
- Tables (use markdown table format)

For non-text elements, describe them: [Image: Brief description]

Maintain logical flow and use horizontal rules (---) to separate sections if needed. Adjust formatting to preserve readability.

Note any issues or ambiguities at the end of your output.

Be thorough and accurate in transcribing all text content."""

# User prompt for OCR. Sent as the first "user" message, directly after the
# system prompt, by OCRService._build_messages.
USER_PROMPT = """Never skip any context! Convert document as is be creative to use markdown effectively to reproduce the same document by using markdown. Translate image text to markdown sequentially. Preserve order and completeness. Separate images with `---`. No skips or comments. Start with first image immediately."""
39 |
40 |
41 | class OCRService:
42 | """
43 | Service for OCR processing using OpenAI Vision API.
44 |
45 | Handles batching, retry logic, and text extraction.
46 | """
47 |
def __init__(self, settings: Settings) -> None:
    """
    Store the settings and defer client creation.

    Args:
        settings: Application settings
    """
    # The API client is built on first access of the ``client`` property.
    self._client: Optional[AsyncAzureOpenAI] = None
    self.settings = settings
57 |
@property
def client(self) -> AsyncAzureOpenAI:
    """Return the cached OpenAI client, building it on first access."""
    if self._client is not None:
        return self._client

    cfg = self.settings
    self._client = AsyncAzureOpenAI(
        azure_endpoint=cfg.azure_openai_endpoint,
        api_version=cfg.openai_api_version,
        api_key=cfg.openai_api_key,
    )
    return self._client
68 |
async def process_pages(
    self,
    pages: List[PageImage],
    *,
    batch_size: Optional[int] = None,
) -> str:
    """
    Process multiple pages and extract text.

    Pages are split into batches that are processed concurrently, limited
    by ``settings.max_concurrent_ocr_requests``. Batch texts are joined in
    batch order with blank lines between them.

    Args:
        pages: List of page images to process
        batch_size: Number of pages per OCR request (default from settings)

    Returns:
        Extracted text in Markdown format

    Raises:
        OCRProcessingError: If any batch fails. The error keeps the status
            code of the underlying failure when it has one (for example
            429 for rate limiting), uses 504 for timeouts, and 500
            otherwise.
    """
    batch_size = batch_size or self.settings.batch_size
    batches = self._create_batches(pages, batch_size)

    logger.info(f"Processing {len(pages)} pages in {len(batches)} batches")

    # Process batches with concurrency limit
    semaphore = asyncio.Semaphore(self.settings.max_concurrent_ocr_requests)

    async def process_with_semaphore(batch: List[PageImage]) -> str:
        async with semaphore:
            return await self._process_batch(batch)

    tasks = [
        asyncio.create_task(process_with_semaphore(batch))
        for batch in batches
    ]

    try:
        # return_exceptions=True: failures come back as result items so
        # every batch runs to completion before errors are inspected.
        results = await asyncio.gather(*tasks, return_exceptions=True)
    except Exception as e:
        logger.error(f"Error processing batches: {e}")
        raise OCRProcessingError(f"Batch processing failed: {e}") from e

    # Check for exceptions in results
    texts: List[str] = []
    for i, result in enumerate(results):
        # BaseException also covers a cancelled task, which would otherwise
        # be appended to the text list and break the join below.
        if isinstance(result, BaseException):
            logger.error(f"Batch {i + 1} failed: {result}")
            if isinstance(result, asyncio.TimeoutError):
                status_code = 504
            else:
                # Keep the status of the original error so callers can
                # still tell rate limiting or upstream failures apart.
                status_code = getattr(result, "status_code", 500)
            raise OCRProcessingError(
                f"Batch {i + 1} failed",
                batch_info=f"pages {batches[i][0].page_number}-{batches[i][-1].page_number}",
                status_code=status_code,
                context={"error": str(result)},
            ) from result
        texts.append(result)

    # Concatenate results
    final_text = "\n\n".join(texts)
    logger.info(f"OCR complete: {len(final_text):,} characters extracted")

    return final_text
128 |
async def _process_batch(self, batch: List[PageImage]) -> str:
    """
    Run one batch through the OpenAI call, retrying via retry_with_backoff.

    Args:
        batch: List of page images in this batch

    Returns:
        Extracted text from the batch
    """
    first_page = batch[0].page_number
    last_page = batch[-1].page_number
    page_range = f"{first_page}-{last_page}"
    logger.debug(f"Processing batch: pages {page_range}")

    async def make_request() -> str:
        # Zero-argument wrapper so the retry helper can re-invoke the call.
        text = await self._call_openai_api(batch)
        return text

    cfg = self.settings
    retry_options = {
        "max_retries": cfg.max_retries,
        "base_delay": cfg.retry_base_delay,
        "max_delay": cfg.retry_max_delay,
        "retryable_exceptions": (RateLimitError, asyncio.TimeoutError),
    }
    return await retry_with_backoff(make_request, **retry_options)
152 |
async def _call_openai_api(self, batch: List[PageImage]) -> str:
    """
    Make the actual API call to OpenAI.

    Args:
        batch: List of page images to process

    Returns:
        Extracted text

    Raises:
        RateLimitError: If rate limited
        asyncio.TimeoutError: If the request timed out (retryable)
        OCRProcessingError: If API call fails
    """
    # Imported here so the SDK's RateLimitError does not clash with the
    # project's RateLimitError imported at module level.
    from openai import APITimeoutError
    from openai import RateLimitError as OpenAIRateLimitError

    messages = self._build_messages(batch)

    try:
        response = await self.client.chat.completions.create(
            model=self.settings.openai_deployment_id,
            messages=messages,
            temperature=self.settings.ocr_temperature,
            max_tokens=self.settings.ocr_max_tokens,
            top_p=self.settings.ocr_top_p,
            frequency_penalty=0,
            presence_penalty=0,
        )

        return self._extract_text_from_response(response)

    except OpenAIError as e:
        # The SDK reports timeouts as its own exception type; convert it so
        # the retry logic, which watches asyncio.TimeoutError, handles it.
        if isinstance(e, APITimeoutError):
            raise asyncio.TimeoutError(str(e)) from e

        # Prefer the exception type and HTTP status over searching the
        # message for "429", which can match unrelated text such as IDs.
        status = getattr(e, "status_code", None)
        if (
            isinstance(e, OpenAIRateLimitError)
            or status == 429
            or "rate limit" in str(e).lower()
        ):
            raise RateLimitError(
                "OpenAI rate limit exceeded",
                context={"error": str(e)},
            ) from e

        logger.error(f"OpenAI API error: {e}")
        raise OCRProcessingError(
            f"OCR API call failed: {e}",
            status_code=502,
        ) from e
    except asyncio.TimeoutError:
        raise  # Let retry logic handle this
    except Exception as e:
        logger.exception(f"Unexpected error during OCR: {e}")
        raise OCRProcessingError(f"Unexpected OCR error: {e}") from e
200 |
def _build_messages(self, batch: List[PageImage]) -> List[dict]:
    """
    Assemble the chat messages sent to the OpenAI API for one batch.

    The payload always opens with the system prompt and the user prompt.
    A one-page batch then adds a "Page N:" text message followed by a
    message carrying the page image. Any other batch adds an instruction
    message followed by a single message that interleaves a "Page N:" text
    part with each page image.

    Args:
        batch: Page images to include in the request

    Returns:
        Message dictionaries in sending order
    """
    payload = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ]

    if len(batch) != 1:
        # Multi-page request: one instruction, then label/image pairs in page order.
        payload.append({
            "role": "user",
            "content": "Please perform OCR on the following images. "
            "Ensure that the extracted text includes the corresponding page numbers.",
        })
        parts = [
            part
            for page in batch
            for part in (
                {"type": "text", "text": f"Page {page.page_number}:"},
                {"type": "image_url", "image_url": {"url": page.data_url}},
            )
        ]
        payload.append({"role": "user", "content": parts})
        return payload

    # Single page: the label and the image travel as two separate user messages.
    only_page = batch[0]
    payload.extend([
        {
            "role": "user",
            "content": f"Page {only_page.page_number}:",
        },
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": only_page.data_url}}
            ],
        },
    ])
    return payload
245 |
def _extract_text_from_response(self, response) -> str:
    """
    Pull the text of the first choice out of an API response.

    Args:
        response: OpenAI API response; its ``choices[0].message.content``
            is read

    Returns:
        The message content with surrounding whitespace stripped

    Raises:
        OCRProcessingError: If there are no choices, the first message has
            no ``content`` attribute, or the content is empty
    """
    choices = response.choices
    # A missing attribute and an empty value are treated the same way.
    content = getattr(choices[0].message, "content", None) if choices else None
    if not content:
        raise OCRProcessingError("No text extracted from OCR response")

    stripped = content.strip()
    logger.debug(f"Extracted {len(stripped):,} characters from response")
    return stripped
269 |
270 | def _create_batches(
271 | self,
272 | items: List[PageImage],
273 | batch_size: int,
274 | ) -> List[List[PageImage]]:
275 | """Split items into batches of specified size."""
276 | batches = [
277 | items[i:i + batch_size]
278 | for i in range(0, len(items), batch_size)
279 | ]
280 | return batches
281 |
async def close(self) -> None:
    """Close the OpenAI client if one is held, then drop the reference to it."""
    client = self._client
    if client is None:
        # Nothing to close.
        return

    await client.close()
    self._client = None
287 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ⚡ Swift OCR ⚡
2 | ### Stop squinting at PDFs. Start extracting clean markdown.
3 |
4 |
5 |
6 | The LLM-powered OCR engine that turns any PDF into beautifully formatted Markdown. It reads your documents like a human, handles messy layouts, and outputs text your AI can actually understand.
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 | •
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 | ### 🧭 Quick Navigation
28 |
29 | [**⚡ Get Started**](#-get-started-in-60-seconds) •
30 | [**✨ Key Features**](#-feature-breakdown-the-secret-sauce) •
31 | [**🎮 Usage & Examples**](#-usage-fire-and-forget) •
32 | [**💰 Cost Breakdown**](#-cost-breakdown-stupidly-cheap) •
33 | [**⚙️ Configuration**](#️-configuration) •
34 | [**🏗️ Project Structure**](#️-project-structure)
35 |
36 |
37 |
38 | ---
39 |
40 | **Swift OCR** is the document processor your AI assistant wishes it had. Stop feeding your LLM screenshots and praying it reads them correctly. This tool acts like a professional transcriber, reading every page of your PDF, intelligently handling tables, headers, and mixed layouts, then packaging everything into perfectly structured Markdown so your AI can actually work with it.
41 |
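42 |
43 | <table>
44 | <tr>
45 | <td align="center">
46 | <h3>🧠</h3>
47 | <b>GPT-4 Vision</b><br/>
48 | <sub>Human-level reading accuracy</sub>
49 | </td>
50 | <td align="center">
51 | <h3>⚡</h3>
52 | <b>Parallel Processing</b><br/>
53 | <sub>Multi-page PDFs in seconds</sub>
54 | </td>
55 | <td align="center">
56 | <h3>📝</h3>
57 | <b>Clean Markdown</b><br/>
58 | <sub>Tables, headers, lists—all formatted</sub>
59 | </td>
60 | </tr>
61 | </table>
62 |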
63 |
64 | How it slaps:
65 | - **You:** `curl -X POST "http://localhost:8000/ocr" -F "file=@messy_document.pdf"`
66 | - **Swift OCR:** Converts pages → Sends to GPT-4 Vision → Formats as Markdown
67 | - **You:** Get perfectly structured text with tables, headers, and lists intact.
68 | - **Result:** Your AI finally understands that 50-page contract. ☕
69 |
70 | ---
71 |
72 | ## 📹 Demo
73 |
74 | https://github.com/user-attachments/assets/6b39f3ea-248e-4c29-ac2e-b57de64d5d65
75 |
76 | *Demo video showcasing the conversion of NASA's Apollo 17 flight documents—complete with unorganized, horizontally and vertically oriented pages—into well-structured Markdown format without breaking a sweat.*
77 |
78 | ---
79 |
80 | ## 💥 Why This Slaps Other Methods
81 |
82 | Manually extracting text from PDFs is a vibe-killer. Swift OCR makes traditional OCR look ancient.
83 |
84 | <table>
85 | <tr>
86 | <th>❌ The Old Way (Pain)</th>
87 | <th>✅ The Swift OCR Way (Glory)</th>
88 | </tr>
89 | <tr>
90 | <td>
91 |
92 | - Run Tesseract. Get garbled text.
93 | - Tables? What tables? Just random words now.
94 | - Manually fix formatting for 2 hours.
95 | - Feed broken context to your AI.
96 | - Get a useless answer. Cry.
97 |
98 | </td>
99 | <td>
100 |
101 | - Upload PDF to Swift OCR.
102 | - Get perfectly formatted Markdown.
103 | - Tables intact. Headers preserved.
104 | - Feed clean context to your AI.
105 | - Get genius-level answers. Go grab a coffee. ☕
106 |
107 | </td>
108 | </tr>
109 | </table>
110 |
111 | We're not just running basic OCR. We're using **GPT-4 Vision** to actually *understand* your documents—handling rotated pages, complex tables, mixed layouts, and even describing images for accessibility.
112 |
113 | ---
114 |
115 | ## 💰 Cost Breakdown: Stupidly Cheap
116 |
117 | Our solution offers an optimal balance of affordability and accuracy that makes enterprise OCR solutions look like highway robbery.
118 |
119 |
120 |
121 | | Metric | Value |
122 | |:------:|:------|
123 | | **Avg tokens/page** | ~1,500 (including prompt) |
124 | | **GPT-4o input cost** | $5 per million tokens |
125 | | **GPT-4o output cost** | $15 per million tokens |
126 | | **Cost per 1,000 pages** | **~$15** |
127 |
128 |
129 |
130 | ### 💡 Want It Even Cheaper?
131 |
132 | | Optimization | Cost per 1,000 pages |
133 | |:------------:|:--------------------:|
134 | | **GPT-4o (default)** | ~$15 |
135 | | **GPT-4o mini** | ~$8 |
136 | | **Batch API** | ~$4 |
137 |
138 | ### 🆚 Market Comparison
139 |
140 |
141 |
142 | | Solution | Cost per 1,000 pages | Tables? | Markdown? |
143 | |:--------:|:-------------------:|:-------:|:---------:|
144 | | **Swift OCR** | **$15** | ✅ Perfect | ✅ Native |
145 | | CloudConvert (PDFTron) | ~$30 | ⚠️ Basic | ❌ No |
146 | | Adobe Acrobat API | ~$50+ | ✅ Good | ❌ No |
147 | | Tesseract (free) | $0 | ❌ Broken | ❌ No |
148 |
149 |
150 |
151 | > **Bottom line:** Half the cost of competitors, 10x the quality. It's not just about being cheaper—it's about getting output you can actually use.
152 |
153 | ---
154 |
155 | ## 🚀 Get Started in 60 Seconds
156 |
157 | ### Prerequisites
158 |
159 | - **Python 3.8+**
160 | - **Azure OpenAI** account (with GPT-4 Vision deployment)
161 |
162 | ### Installation
163 |
164 | ```bash
165 | # Clone the repo
166 | git clone https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown.git
167 | cd swift-ocr-llm-powered-pdf-to-markdown
168 |
169 | # Create virtual environment (recommended)
170 | python3 -m venv venv
171 | source venv/bin/activate # Windows: venv\Scripts\activate
172 |
173 | # Install dependencies
174 | pip install -r requirements.txt
175 | ```
176 |
177 | ### Configure Environment
178 |
179 | Create a `.env` file in the root directory:
180 |
181 | ```env
182 | # Required
183 | OPENAI_API_KEY=your_openai_api_key
184 | AZURE_OPENAI_ENDPOINT=https://your-resource.openai.azure.com/
185 | OPENAI_DEPLOYMENT_ID=your_gpt4_vision_deployment
186 |
187 | # Optional (sensible defaults)
188 | OPENAI_API_VERSION=gpt-4o
189 | BATCH_SIZE=1 # Images per OCR request (1-10)
190 | MAX_CONCURRENT_OCR_REQUESTS=5 # Parallel OCR calls
191 | MAX_CONCURRENT_PDF_CONVERSION=4 # Parallel page rendering
192 | ```
193 |
194 | ### Run It
195 |
196 | ```bash
197 | # Option 1: Classic uvicorn (backward compatible)
198 | uvicorn main:app --reload
199 |
200 | # Option 2: Using the new package
201 | uvicorn swift_ocr.app:app --reload
202 |
203 | # Option 3: As a Python module
204 | python -m swift_ocr
205 |
206 | # Option 4: With CLI arguments
207 | python -m swift_ocr --host 0.0.0.0 --port 8080 --workers 4
208 | ```
209 |
210 | 🎉 **API is now live at `http://127.0.0.1:8000`**
211 |
212 | > **✨ Pro tip:** Check out the auto-generated docs at `http://127.0.0.1:8000/docs`
213 |
214 | ---
215 |
216 | ## 🎮 Usage: Fire and Forget
217 |
218 | ### API Endpoint
219 |
220 | **POST** `/ocr`
221 |
222 | Accepts a PDF file upload or a URL to a PDF (exactly one of the two) and returns beautifully formatted Markdown.
223 |
224 | ### Examples
225 |
226 | **Upload a PDF file:**
227 |
228 | ```bash
229 | curl -X POST "http://127.0.0.1:8000/ocr" \
230 | -F "file=@/path/to/your/document.pdf"
231 | ```
232 |
233 | **Process a PDF from URL:**
234 |
235 | ```bash
236 | curl -X POST "http://127.0.0.1:8000/ocr" \
237 | -H "Content-Type: application/json" \
238 | -d '{"url": "https://example.com/document.pdf"}'
239 | ```
240 |
241 | ### Response
242 |
243 | ```json
244 | {
245 | "text": "# Document Title\n\n## Section 1\n\nExtracted text with **formatting** preserved...\n\n| Column 1 | Column 2 |\n|----------|----------|\n| Data | Data |"
246 | }
247 | ```
248 |
249 | ### Response (v2.0+)
250 |
251 | The new response includes additional metadata:
252 |
253 | ```json
254 | {
255 | "text": "# Document Title\n\n## Section 1\n\nExtracted text...",
256 | "status": "success",
257 | "pages_processed": 5,
258 | "processing_time_ms": 1234
259 | }
260 | ```
261 |
262 | ### Health Check
263 |
264 | ```bash
265 | curl http://127.0.0.1:8000/health
266 | ```
267 |
268 | ```json
269 | {
270 | "status": "healthy",
271 | "version": "2.0.0",
272 | "timestamp": "2024-01-01T00:00:00Z",
273 | "openai_configured": true
274 | }
275 | ```
276 |
277 | ### Error Codes
278 |
279 | | Code | Meaning |
280 | |:----:|:--------|
281 | | `200` | Success—Markdown text returned |
282 | | `400` | Bad request (no file/URL, or both provided) |
283 | | `422` | Validation error |
284 | | `429` | Rate limited—retry with backoff |
285 | | `500` | Processing error |
286 | | `504` | Timeout downloading PDF |
287 |
288 | ---
289 |
290 | ## ✨ Feature Breakdown: The Secret Sauce
291 |
292 |
293 |
294 | | Feature | What It Does | Why You Care |
295 | | :---: | :--- | :--- |
296 | | **🧠 GPT-4 Vision**<br/>`Human-level OCR` | Uses OpenAI's most capable vision model to read documents | Actually understands context, not just character shapes |
297 | | **⚡ Parallel Processing**<br/>`Multiprocessing + async` | Converts PDF pages and calls OCR in parallel | 50-page PDF in seconds, not minutes |
298 | | **📊 Table Preservation**<br/>`Markdown tables` | Detects and formats tables as proper Markdown | Your data stays structured, not flattened to gibberish |
299 | | **🔄 Smart Batching**<br/>`Configurable batch size` | Groups pages to optimize API calls vs accuracy | Balance speed and cost for your use case |
300 | | **🛡️ Retry with Backoff**<br/>`Exponential backoff` | Automatically retries on rate limits and timeouts | Handles API hiccups without crashing |
301 | | **📄 Flexible Input**<br/>`File upload or URL` | Accept PDFs directly or fetch from any URL | Works with your existing workflow |
302 | | **🖼️ Image Descriptions**<br/>`Accessibility-friendly` | Describes non-text elements: `[Image: description]` | Context your AI can actually use |
303 |
304 |
305 |
306 | ---
307 |
308 | ## ⚙️ Configuration
309 |
310 | All settings are managed via environment variables. Tune these for your workload:
311 |
312 |
313 |
314 | | Variable | Default | Description |
315 | |:---------|:-------:|:------------|
316 | | `OPENAI_API_KEY` | — | Your Azure OpenAI API key |
317 | | `AZURE_OPENAI_ENDPOINT` | — | Your Azure OpenAI endpoint URL |
318 | | `OPENAI_DEPLOYMENT_ID` | — | Your GPT-4 Vision deployment ID |
319 | | `OPENAI_API_VERSION` | `gpt-4o` | API version |
320 | | `BATCH_SIZE` | `1` | Pages per OCR request (1-10). Higher = faster but less accurate |
321 | | `MAX_CONCURRENT_OCR_REQUESTS` | `5` | Parallel OCR calls. Increase for throughput |
322 | | `MAX_CONCURRENT_PDF_CONVERSION` | `4` | Parallel page renders. Match your CPU cores |
323 |
324 |
325 |
326 | ### Performance Tuning Tips
327 |
328 | - **High accuracy, slower:** `BATCH_SIZE=1`
329 | - **Balanced:** `BATCH_SIZE=5`, `MAX_CONCURRENT_OCR_REQUESTS=10`
330 | - **Maximum throughput:** `BATCH_SIZE=10`, `MAX_CONCURRENT_OCR_REQUESTS=20` (watch rate limits!)
331 |
332 | ---
333 |
334 | ## 🏗️ Project Structure
335 |
336 | World-class Python engineering with atomic modules and clean separation of concerns:
337 |
338 | ```
339 | swift_ocr/
340 | ├── __init__.py # Package init with version
341 | ├── __main__.py # CLI entry point (python -m swift_ocr)
342 | ├── app.py # FastAPI app factory
343 | ├── config/
344 | │ ├── __init__.py
345 | │ └── settings.py # Pydantic Settings (type-safe config)
346 | ├── core/
347 | │ ├── __init__.py
348 | │ ├── exceptions.py # Custom exception hierarchy
349 | │ ├── logging.py # Structured logging setup
350 | │ └── retry.py # Exponential backoff utilities
351 | ├── schemas/
352 | │ ├── __init__.py
353 | │ └── ocr.py # Pydantic request/response models
354 | ├── services/
355 | │ ├── __init__.py
356 | │ ├── ocr.py # OpenAI Vision OCR service
357 | │ └── pdf.py # PDF conversion service
358 | └── api/
359 | ├── __init__.py
360 | ├── deps.py # Dependency injection
361 | ├── exceptions.py # FastAPI exception handlers
362 | ├── router.py # Route aggregation
363 | └── routes/
364 | ├── __init__.py
365 | ├── health.py # Health check endpoints
366 | └── ocr.py # OCR endpoints
367 | ```
368 |
369 |
370 | Key architectural decisions
371 |
372 | | Pattern | Implementation | Benefit |
373 | | :--- | :--- | :--- |
374 | | **Pydantic Settings** | `config/settings.py` | Type-safe config with `.env` support and validation |
375 | | **Dependency Injection** | `api/deps.py` | Testable, swappable services |
376 | | **Custom Exceptions** | `core/exceptions.py` | Rich error context with proper HTTP status codes |
377 | | **Retry with Backoff** | `core/retry.py` | Handles rate limits and transient failures |
378 | | **App Factory** | `app.py` | Configurable app creation for testing |
379 | | **Typed Throughout** | `py.typed` marker | Full mypy compatibility |
380 |
381 |
382 |
383 | ---
384 |
385 | ## 🔥 Common Issues & Quick Fixes
386 |
387 |
388 | Expand for troubleshooting tips
389 |
390 | | Problem | Solution |
391 | | :--- | :--- |
392 | | **"Missing required environment variables"** | Check your `.env` file has all three required variables: `OPENAI_API_KEY`, `AZURE_OPENAI_ENDPOINT`, `OPENAI_DEPLOYMENT_ID` |
393 | | **Rate limit errors (429)** | Reduce `MAX_CONCURRENT_OCR_REQUESTS` or `BATCH_SIZE`. The retry logic will handle temporary limits automatically. |
394 | | **Timeout errors** | Large PDFs take time. The system has exponential backoff built in—give it a moment. |
395 | | **Garbled output** | Make sure your PDF isn't password-protected or corrupted. Try opening it locally first. |
396 | | **Tables not formatting correctly** | Some extremely complex tables may need `BATCH_SIZE=1` for best accuracy. |
397 | | **"Failed to initialize OpenAI client"** | Verify your Azure endpoint URL format: `https://your-resource.openai.azure.com/` |
398 |
399 |
400 |
401 | ---
402 |
403 | ## 📜 License
404 |
405 | This project uses **PyMuPDF** for PDF processing. PyMuPDF is licensed under the **GNU AGPL v3.0**, so this project is distributed under the same license.
406 |
407 | > **Want MIT instead?** Fork this project and swap PyMuPDF for `pdf2image` + Poppler. The rest of the code is yours to use freely.
408 |
409 | ```
410 | GNU AFFERO GENERAL PUBLIC LICENSE
411 | Version 3, 19 November 2007
412 |
413 | Copyright (C) 2024 Yiğit Konur
414 | ```
415 |
416 | See [LICENSE.md](LICENSE.md) for the full license text.
417 |
418 | ---
419 |
420 |
421 |
422 | **Built with 🔥 because manually transcribing PDFs is a soul-crushing waste of time.**
423 |
424 | [Report Bug](https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown/issues) •
425 | [Request Feature](https://github.com/yigitkonur/swift-ocr-llm-powered-pdf-to-markdown/issues)
426 |
427 |
428 |
--------------------------------------------------------------------------------