├── docs
    ├── README.md
    ├── logo.png
    └── logo-german-ocr.png
├── .gitignore
├── german_ocr
    ├── __init__.py
    ├── utils.py
    ├── README.md
    ├── ollama_backend.py
    ├── hf_backend.py
    ├── cli.py
    └── ocr.py
├── .github
    └── workflows
    │   └── publish.yml
├── pyproject.toml
├── README.md
└── LICENSE


/docs/README.md:
--------------------------------------------------------------------------------
1 | # German-OCR Documentation
2 | 


--------------------------------------------------------------------------------
/docs/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Keyvanhardani/german-ocr/HEAD/docs/logo.png


--------------------------------------------------------------------------------
/docs/logo-german-ocr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Keyvanhardani/german-ocr/HEAD/docs/logo-german-ocr.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # Distribution / packaging
 7 | .Python
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | wheels/
20 | *.egg-info/
21 | .installed.cfg
22 | *.egg
23 | 
24 | # Virtual environments
25 | venv/
26 | ENV/
27 | env/
28 | 
29 | # IDE
30 | .idea/
31 | .vscode/
32 | *.swp
33 | *.swo
34 | 
35 | # Testing
36 | .pytest_cache/
37 | .coverage
38 | htmlcov/
39 | 
40 | # OS
41 | .DS_Store
42 | Thumbs.db
43 | 


--------------------------------------------------------------------------------
/german_ocr/__init__.py:
--------------------------------------------------------------------------------
 1 | """German OCR Package - Production-ready OCR for German documents.
 2 | 
 3 | This package provides a unified interface for German OCR using multiple backends:
 4 | - Ollama (preferred for local inference) - ollama.com/Keyvan/german-ocr
 5 | - HuggingFace Transformers (GPU) - huggingface.co/Keyven/german-ocr
 6 | 
 7 | Based on fine-tuned Qwen2-VL vision-language models.
 8 | 
 9 | Example:
10 |     >>> from german_ocr import GermanOCR
11 |     >>> ocr = GermanOCR()
12 |     >>> text = ocr.extract("invoice.png")
13 |     >>> print(text)
14 | """
15 | 
16 | from german_ocr.ocr import GermanOCR
17 | 
# Keep in sync with the `version` field in pyproject.toml (currently 0.2.1).
__version__ = "0.2.1"
# Public API re-exported by the package.
__all__ = ["GermanOCR"]
20 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [published]
 6 |   workflow_dispatch:
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 | 
14 |       - name: Set up Python
15 |         uses: actions/setup-python@v5
16 |         with:
17 |           python-version: '3.11'
18 | 
19 |       - name: Install build dependencies
20 |         run: |
21 |           python -m pip install --upgrade pip
22 |           pip install build twine
23 | 
24 |       - name: Build package
25 |         run: python -m build
26 | 
27 |       - name: Check package
28 |         run: twine check dist/*
29 | 
30 |       - name: Upload artifacts
31 |         uses: actions/upload-artifact@v4
32 |         with:
33 |           name: dist
34 |           path: dist/
35 | 
36 |   publish:
37 |     needs: build
38 |     runs-on: ubuntu-latest
39 |     environment: pypi
40 |     permissions:
41 |       id-token: write
42 |     steps:
43 |       - name: Download artifacts
44 |         uses: actions/download-artifact@v4
45 |         with:
46 |           name: dist
47 |           path: dist/
48 | 
49 |       - name: Publish to PyPI
50 |         uses: pypa/gh-action-pypi-publish@release/v1
51 | 


--------------------------------------------------------------------------------
/german_ocr/utils.py:
--------------------------------------------------------------------------------
 1 | """Utility functions for the German OCR package."""
 2 | 
 3 | import logging
 4 | from pathlib import Path
 5 | from typing import Union
 6 | 
 7 | from PIL import Image
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
def load_image(image_input: Union[str, Path, Image.Image]) -> Image.Image:
    """Load an image from a file path or return the PIL Image as-is.

    A path is opened twice on purpose: once to run Pillow's integrity check
    and once more to obtain a usable image, because an image object must not
    be used after ``verify()`` has been called on it.

    Args:
        image_input: Path to image file or PIL Image object

    Returns:
        PIL Image object. A PIL Image argument is returned unchanged (the
        very same object); a path yields a freshly opened image.

    Raises:
        FileNotFoundError: If the image file does not exist
        ValueError: If the image cannot be loaded or is invalid
    """
    if isinstance(image_input, Image.Image):
        return image_input

    image_path = Path(image_input)
    if not image_path.exists():
        raise FileNotFoundError(f"Image file not found: {image_path}")

    try:
        # The context manager guarantees the probe handle is released even
        # when verify() raises on a corrupt file.
        with Image.open(image_path) as probe:
            probe.verify()
        # Reopen: the verified object is no longer usable for pixel access.
        return Image.open(image_path)
    except Exception as e:
        # Deliberately broad: any failure to open/verify is reported to the
        # caller as an invalid image.
        raise ValueError(f"Failed to load image from {image_path}: {e}") from e
41 | 
42 | 
def setup_logging(level: str = "INFO") -> None:
    """Configure logging for the package.

    Delegates to ``logging.basicConfig`` with a timestamped format.

    Args:
        level: Logging level name, case-insensitive
            (DEBUG, INFO, WARNING, ERROR, CRITICAL)

    Raises:
        ValueError: If ``level`` does not name a numeric logging level
    """
    # Look the name up on the logging module, but only accept real numeric
    # levels: other upper-case attributes (e.g. BASIC_FORMAT) are not levels.
    numeric_level = getattr(logging, level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid logging level: {level!r}")

    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
54 | 
55 | 
def validate_backend(backend: str) -> str:
    """Validate and normalize backend name.

    Matching is case-insensitive. ``"hf"`` is accepted as an alias and is
    normalized to ``"huggingface"``.

    Args:
        backend: Backend name to validate

    Returns:
        Normalized backend name: ``"ollama"``, ``"huggingface"`` or ``"auto"``

    Raises:
        ValueError: If backend is not supported
    """
    valid_backends = {"ollama", "huggingface", "hf", "auto"}
    backend_lower = backend.lower()

    if backend_lower not in valid_backends:
        # Sort the options so the message is identical on every run
        # (set iteration order is not stable across interpreter sessions).
        options = ", ".join(sorted(valid_backends))
        raise ValueError(f"Unsupported backend: {backend}. Valid options: {options}")

    # Normalize HuggingFace variants
    if backend_lower in {"huggingface", "hf"}:
        return "huggingface"

    return backend_lower
81 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0", "wheel"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "german-ocr"
 7 | version = "0.2.1"
 8 | description = "High-performance German document OCR using fine-tuned Qwen2-VL and Qwen3-VL"
 9 | readme = "README.md"
10 | license = {text = "Apache-2.0"}
11 | authors = [
12 |     {name = "Keyvan Hardani", email = "hello@keyvan.ai"}
13 | ]
14 | maintainers = [
15 |     {name = "Keyvan Hardani", email = "hello@keyvan.ai"}
16 | ]
17 | keywords = [
18 |     "ocr",
19 |     "german",
20 |     "document",
21 |     "invoice",
22 |     "text-extraction",
23 |     "vision-language-model",
24 |     "qwen2-vl",
25 |     "ollama"
26 | ]
27 | classifiers = [
28 |     "Development Status :: 4 - Beta",
29 |     "Intended Audience :: Developers",
30 |     "Intended Audience :: Science/Research",
31 |     "License :: OSI Approved :: Apache Software License",
32 |     "Operating System :: OS Independent",
33 |     "Programming Language :: Python :: 3",
34 |     "Programming Language :: Python :: 3.9",
35 |     "Programming Language :: Python :: 3.10",
36 |     "Programming Language :: Python :: 3.11",
37 |     "Programming Language :: Python :: 3.12",
38 |     "Topic :: Scientific/Engineering :: Artificial Intelligence",
39 |     "Topic :: Scientific/Engineering :: Image Recognition",
40 |     "Topic :: Text Processing :: Linguistic",
41 | ]
42 | requires-python = ">=3.9"
43 | dependencies = [
44 |     "Pillow>=9.0.0",
45 |     "httpx>=0.24.0",
46 |     "requests>=2.28.0",
47 | ]
48 | 
49 | [project.optional-dependencies]
50 | hf = [
51 |     "torch>=2.0.0",
52 |     "transformers>=4.40.0",
53 |     "qwen-vl-utils>=0.0.10",
54 | ]
55 | all = [
56 |     "german-ocr[hf]",
57 | ]
58 | dev = [
59 |     "pytest>=7.0.0",
60 |     "pytest-cov>=4.0.0",
61 |     "black>=23.0.0",
62 |     "isort>=5.12.0",
63 |     "mypy>=1.0.0",
64 | ]
65 | 
66 | [project.scripts]
67 | german-ocr = "german_ocr.cli:main"
68 | 
69 | [project.urls]
70 | Homepage = "https://github.com/Keyvanhardani/german-ocr"
71 | Documentation = "https://github.com/Keyvanhardani/german-ocr#readme"
72 | Repository = "https://github.com/Keyvanhardani/german-ocr"
73 | Issues = "https://github.com/Keyvanhardani/german-ocr/issues"
74 | Changelog = "https://github.com/Keyvanhardani/german-ocr/releases"
75 | 
76 | [tool.setuptools.packages.find]
77 | where = ["."]
78 | include = ["german_ocr*"]
79 | 
80 | [tool.black]
81 | line-length = 100
82 | target-version = ["py39", "py310", "py311", "py312"]
83 | 
84 | [tool.isort]
85 | profile = "black"
86 | line_length = 100
87 | 
88 | [tool.mypy]
89 | python_version = "3.9"
90 | warn_return_any = true
91 | warn_unused_configs = true
92 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="docs/logo-german-ocr.png" alt="German-OCR Logo" width="450"/>
  3 | </p>
  4 | 
  5 | <p align="center">
  6 |   <strong>High-performance German document OCR using fine-tuned Qwen2-VL and Qwen3-VL vision-language models</strong>
  7 | </p>
  8 | 
  9 | <p align="center">
 10 |   <a href="https://pypi.org/project/german-ocr/"><img src="https://badge.fury.io/py/german-ocr.svg" alt="PyPI version"></a>
 11 |   <a href="https://opensource.org/licenses/Apache-2.0"><img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg" alt="License"></a>
 12 |   <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.9+-blue.svg" alt="Python 3.9+"></a>
 13 |   <a href="https://huggingface.co/Keyven/german-ocr"><img src="https://img.shields.io/badge/%F0%9F%A4%97-HuggingFace-yellow" alt="HuggingFace"></a>
 14 |   <a href="https://ollama.com/Keyvan/german-ocr-turbo"><img src="https://img.shields.io/badge/Ollama-Turbo-green" alt="Ollama"></a>
 15 | </p>
 16 | 
 17 | <p align="center">
 18 |   <img src="https://raw.githubusercontent.com/Keyvanhardani/german-ocr/main/docs/demo.gif" alt="Demo" width="600"/>
 19 | </p>
 20 | 
 21 | ---
 22 | 
 23 | ## Features
 24 | 
 25 | | Feature | Description |
 26 | |---------|-------------|
 27 | | **High Accuracy** | 100% accuracy on German invoice test data |
 28 | | **Multiple Backends** | Ollama (fast, local) or HuggingFace Transformers |
 29 | | **Multiple Output Formats** | Markdown, JSON, HTML, or plain text |
 30 | | **Easy to Use** | Simple Python API and CLI |
 31 | | **Batch Processing** | Process multiple documents efficiently |
 32 | | **Structured Output** | Get results as plain text or JSON with metadata |
 33 | | **Privacy-First** | Runs completely locally - your documents never leave your machine |
 34 | 
 35 | ## Model Variants
 36 | 
 37 | | Model | Size | Base | Speed | Accuracy | Best For |
 38 | |-------|------|------|-------|----------|----------|
 39 | | [Keyvan/german-ocr-turbo](https://ollama.com/Keyvan/german-ocr-turbo) | 1.9 GB | Qwen3-VL-2B | ~5s | 100% | Fastest, recommended |
 40 | | [Keyvan/german-ocr](https://ollama.com/Keyvan/german-ocr) | 3.2 GB | Qwen2.5-VL-3B | ~5-7s | 75% | Standard model |
 41 | | [Keyven/german-ocr](https://huggingface.co/Keyven/german-ocr) | 4.4 GB | Qwen2-VL-2B | ~1-3s | 100% | GPU acceleration |
 42 | 
 43 | ## Installation
 44 | 
 45 | ```bash
 46 | pip install german-ocr
 47 | ```
 48 | 
 49 | ## Quick Start
 50 | 
 51 | ```bash
 52 | # Install Turbo model (fastest, recommended)
 53 | ollama pull Keyvan/german-ocr-turbo
 54 | ```
 55 | 
 56 | ### Python API
 57 | 
 58 | ```python
 59 | from german_ocr import GermanOCR
 60 | 
 61 | # Initialize with Turbo model (default)
 62 | ocr = GermanOCR()
 63 | 
 64 | # Extract text from image
 65 | text = ocr.extract("invoice.png")
 66 | print(text)
 67 | 
 68 | # Different output formats
 69 | text_md = ocr.extract("invoice.png", output_format="markdown")
 70 | text_json = ocr.extract("invoice.png", output_format="json")
 71 | 
 72 | # List available models
 73 | models = GermanOCR.list_models()
 74 | ```
 75 | 
 76 | ### Command Line
 77 | 
 78 | ```bash
 79 | # Single image (uses Turbo by default)
 80 | german-ocr invoice.png
 81 | 
 82 | # Use specific model
 83 | german-ocr --model german-ocr-turbo invoice.png
 84 | 
 85 | # Different output formats
 86 | german-ocr --format json invoice.png
 87 | 
 88 | # List available models
 89 | german-ocr --list-models
 90 | ```
 91 | 
 92 | ## Performance Benchmarks
 93 | 
 94 | Tested on RTX 4060 8GB with 5x warm runs:
 95 | 
 96 | | Model | Size | Time | Accuracy |
 97 | |-------|------|------|----------|
 98 | | **German-OCR Turbo** | 1.9GB | 5.0s | 100% |
 99 | | German-OCR v1 | 3.2GB | 5.5s | 75% |
100 | | DeepSeek-OCR | 6.7GB | 15.8s | 70% |
101 | | MiniCPM-V | 5.5GB | 8.9s | 67% |
102 | | LLaVA 7B | 4.7GB  | 12.9s | 45%     |
103 | 
104 | **German-OCR Turbo is 3x faster than DeepSeek-OCR!**
105 | 
106 | [View full benchmark results](https://german-ocr.github.io)
107 | 
108 | ## License
109 | 
110 | Apache 2.0 - See [LICENSE](LICENSE) for details.
111 | 
112 | ## Author
113 | 
114 | **Keyvan Hardani**
115 | 
116 | <p>
117 |   <a href="https://keyvan.ai"><img src="https://img.shields.io/badge/Website-keyvan.ai-blue?style=flat-square" alt="Website"></a>
118 |   <a href="https://www.linkedin.com/in/keyvanhardani/"><img src="https://img.shields.io/badge/LinkedIn-keyvanhardani-blue?style=flat-square&logo=linkedin" alt="LinkedIn"></a>
119 |   <a href="https://github.com/Keyvanhardani"><img src="https://img.shields.io/badge/GitHub-Keyvanhardani-black?style=flat-square&logo=github" alt="GitHub"></a>
120 | </p>
121 | 
122 | ## Links
123 | 
124 | | Resource | Link |
125 | |----------|------|
126 | | PyPI Package | [pypi.org/project/german-ocr](https://pypi.org/project/german-ocr/) |
127 | | Benchmark Results | [german-ocr.github.io](https://german-ocr.github.io) |
128 | | Ollama Turbo | [ollama.com/Keyvan/german-ocr-turbo](https://ollama.com/Keyvan/german-ocr-turbo) |
129 | | Ollama Standard | [ollama.com/Keyvan/german-ocr](https://ollama.com/Keyvan/german-ocr) |
130 | | HuggingFace Model | [huggingface.co/Keyven/german-ocr](https://huggingface.co/Keyven/german-ocr) |
131 | 
132 | ---
133 | 
134 | <p align="center">
135 |   Made with love in Germany
136 | </p>
137 | 


--------------------------------------------------------------------------------
/german_ocr/README.md:
--------------------------------------------------------------------------------
  1 | # German OCR
  2 | 
  3 | Production-ready Python package for German OCR with automatic backend selection.
  4 | 
  5 | ## Features
  6 | 
  7 | - **Multiple Backends**: Automatic selection between Ollama and HuggingFace Transformers
  8 | - **Simple API**: Extract text from images with just a few lines of code
  9 | - **Batch Processing**: Process multiple images efficiently
 10 | - **CLI Tool**: Command-line interface for quick OCR tasks
 11 | - **Type-Safe**: Full type hints for better IDE support
 12 | - **Well-Tested**: Comprehensive test coverage with pytest
 13 | 
 14 | ## Installation
 15 | 
 16 | ### Basic Installation
 17 | 
 18 | ```bash
 19 | pip install -e .
 20 | ```
 21 | 
 22 | ### With HuggingFace Backend
 23 | 
 24 | ```bash
 25 | pip install -e ".[hf]"
 26 | ```
 27 | 
 28 | ### Full Installation (All Backends)
 29 | 
 30 | ```bash
 31 | pip install -e ".[all]"
 32 | ```
 33 | 
 34 | ## Quick Start
 35 | 
 36 | ### Python API
 37 | 
 38 | ```python
 39 | from german_ocr import GermanOCR
 40 | 
 41 | # Initialize with auto-detection
 42 | ocr = GermanOCR()
 43 | 
 44 | # Extract text from an image
 45 | text = ocr.extract("invoice.png")
 46 | print(text)
 47 | 
 48 | # Get structured output
 49 | result = ocr.extract("document.jpg", structured=True)
 50 | print(f"Text: {result['text']}")
 51 | print(f"Backend: {result['backend']}")
 52 | 
 53 | # Batch processing
 54 | images = ["img1.png", "img2.png", "img3.png"]
 55 | results = ocr.extract_batch(images)
 56 | for i, text in enumerate(results):
 57 |     print(f"Image {i+1}: {text}")
 58 | ```
 59 | 
 60 | ### Command Line Interface
 61 | 
 62 | ```bash
 63 | # Extract text from a single image
 64 | german-ocr invoice.png
 65 | 
 66 | # Process all images in a directory
 67 | german-ocr --batch images/
 68 | 
 69 | # Use specific backend
 70 | german-ocr --backend ollama document.jpg
 71 | 
 72 | # Get structured JSON output
 73 | german-ocr --structured invoice.png
 74 | 
 75 | # Save results to file
 76 | german-ocr invoice.png --output result.txt
 77 | 
 78 | # List available backends
 79 | german-ocr --list-backends
 80 | ```
 81 | 
 82 | ## Backend Configuration
 83 | 
 84 | ### Ollama (Recommended)
 85 | 
 86 | Install Ollama and pull the German-OCR Turbo model:
 87 | 
 88 | ```bash
 89 | # Install Ollama from https://ollama.ai
 90 | # Pull the German-OCR Turbo model
 91 | ollama pull Keyvan/german-ocr-turbo
 92 | ```
 93 | 
 94 | Then use the Ollama backend:
 95 | 
 96 | ```python
 97 | ocr = GermanOCR(backend="ollama", model_name="german-ocr-turbo")
 98 | ```
 99 | 
100 | ### HuggingFace Transformers
101 | 
102 | Install the HuggingFace dependencies:
103 | 
104 | ```bash
105 | pip install -e ".[hf]"
106 | ```
107 | 
108 | Use the HuggingFace backend:
109 | 
110 | ```python
111 | ocr = GermanOCR(
112 |     backend="huggingface",
113 |     model_name="Keyven/german-ocr",
114 |     device="cuda",  # or "cpu", "mps"
115 |     quantization="4bit"  # optional: "4bit", "8bit"
116 | )
117 | ```
118 | 
119 | ## Advanced Usage
120 | 
121 | ### Custom Prompts
122 | 
123 | ```python
124 | custom_prompt = "Extract the invoice number and total amount from this image."
125 | result = ocr.extract("invoice.png", prompt=custom_prompt)
126 | ```
127 | 
128 | ### Backend Information
129 | 
130 | ```python
131 | # Check which backends are available
132 | backends = GermanOCR.list_available_backends()
133 | print(f"Ollama available: {backends['ollama']}")
134 | print(f"HuggingFace available: {backends['huggingface']}")
135 | 
136 | # Get current backend info
137 | ocr = GermanOCR()
138 | info = ocr.get_backend_info()
139 | print(f"Using backend: {info['backend']}")
140 | print(f"Model: {info['model']}")
141 | ```
142 | 
143 | ### Error Handling
144 | 
145 | ```python
146 | from german_ocr import GermanOCR
147 | 
148 | try:
149 |     ocr = GermanOCR()
150 |     text = ocr.extract("document.png")
151 | except FileNotFoundError as e:
152 |     print(f"Image not found: {e}")
153 | except RuntimeError as e:
154 |     print(f"OCR failed: {e}")
155 | ```
156 | 
157 | ## Development
158 | 
159 | ### Running Tests
160 | 
161 | ```bash
162 | # Install development dependencies
163 | pip install -e ".[dev]"
164 | 
165 | # Run tests
166 | pytest tests/german_ocr/
167 | 
168 | # Run with coverage
169 | pytest --cov=german_ocr tests/german_ocr/
170 | ```
171 | 
172 | ### Code Quality
173 | 
174 | ```bash
175 | # Format code
176 | black german_ocr/
177 | 
178 | # Lint code
179 | ruff check german_ocr/
180 | 
181 | # Type checking
182 | mypy german_ocr/
183 | ```
184 | 
185 | ## API Reference
186 | 
187 | ### GermanOCR Class
188 | 
189 | #### `__init__(backend, model_name, device, quantization, log_level)`
190 | 
191 | Initialize the OCR instance.
192 | 
193 | **Parameters:**
194 | - `backend` (str): Backend to use ('auto', 'ollama', 'huggingface')
195 | - `model_name` (str, optional): Model name for the backend
196 | - `device` (str): Device for HF backend ('auto', 'cuda', 'cpu', 'mps')
197 | - `quantization` (str, optional): Quantization mode ('4bit', '8bit')
198 | - `log_level` (str): Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR')
199 | 
200 | #### `extract(image, prompt, structured, **kwargs)`
201 | 
202 | Extract text from a single image.
203 | 
204 | **Parameters:**
205 | - `image` (str | Path | PIL.Image): Image to process
206 | - `prompt` (str, optional): Custom prompt for OCR
207 | - `structured` (bool): Return structured dict instead of string
208 | - `**kwargs`: Backend-specific parameters
209 | 
210 | **Returns:**
211 | - `str` or `dict`: Extracted text or structured result
212 | 
213 | #### `extract_batch(images, prompt, structured, **kwargs)`
214 | 
215 | Extract text from multiple images.
216 | 
217 | **Parameters:**
218 | - `images` (list): List of images to process
219 | - `prompt` (str, optional): Custom prompt for OCR
220 | - `structured` (bool): Return structured dicts
221 | - `**kwargs`: Backend-specific parameters
222 | 
223 | **Returns:**
224 | - `list`: List of extracted texts or structured results
225 | 
226 | #### `get_backend_info()`
227 | 
228 | Get information about the current backend.
229 | 
230 | **Returns:**
231 | - `dict`: Backend information
232 | 
233 | #### `list_available_backends()` (static)
234 | 
235 | List all available backends and their status.
236 | 
237 | **Returns:**
238 | - `dict`: Mapping of backend names to availability
239 | 
240 | ## License
241 | 
242 | Apache-2.0
243 | 
244 | ## Contributing
245 | 
246 | Contributions are welcome! Please feel free to submit a Pull Request.
247 | 


--------------------------------------------------------------------------------
/german_ocr/ollama_backend.py:
--------------------------------------------------------------------------------
  1 | """Ollama backend for German OCR using Qwen2-VL and Qwen3-VL models."""
  2 | 
  3 | import base64
  4 | import logging
  5 | from io import BytesIO
  6 | from pathlib import Path
  7 | from typing import Any, Dict, List, Optional, Union
  8 | 
  9 | import requests
 10 | from PIL import Image
 11 | 
 12 | from german_ocr.utils import load_image
 13 | 
 14 | logger = logging.getLogger(__name__)
 15 | 
# Available German-OCR models on Ollama.
# Maps a short model key to its metadata; every value is a dict of strings:
#   name        - full Ollama model name (what get_model_name() returns)
#   display     - human-readable label
#   size, base, speed, accuracy, description - descriptive strings only
AVAILABLE_MODELS = {
    "german-ocr-turbo": {
        "name": "Keyvan/german-ocr-turbo",
        "display": "German-OCR Turbo",
        "size": "1.9GB",
        "base": "Qwen3-VL-2B",
        "speed": "~5s",
        "accuracy": "100%",
        "description": "Fastest model, optimized for speed and accuracy",
    },
    "german-ocr": {
        "name": "Keyvan/german-ocr",
        "display": "German-OCR v1",
        "size": "3.2GB",
        "base": "Qwen2.5-VL-3B",
        "speed": "~5-7s",
        "accuracy": "75%",
        "description": "Standard model with high accuracy",
    },
}

# Short key (into AVAILABLE_MODELS) of the default model.
DEFAULT_MODEL = "german-ocr-turbo"
 39 | 
 40 | 
 41 | def list_available_models() -> Dict[str, Dict[str, str]]:
 42 |     """List all available German-OCR models."""
 43 |     return AVAILABLE_MODELS.copy()
 44 | 
 45 | 
 46 | def get_model_name(model_key: str) -> str:
 47 |     """Get the full Ollama model name from a short key."""
 48 |     if model_key in AVAILABLE_MODELS:
 49 |         return AVAILABLE_MODELS[model_key]["name"]
 50 |     return model_key
 51 | 
 52 | 
 53 | class OllamaBackend:
 54 |     """Ollama backend for OCR inference."""
 55 | 
 56 |     def __init__(
 57 |         self,
 58 |         model_name: str = "german-ocr-turbo",
 59 |         base_url: str = "http://localhost:11434",
 60 |         timeout: int = 120,
 61 |     ) -> None:
 62 |         self.model_name = get_model_name(model_name)
 63 |         self.base_url = base_url.rstrip("/")
 64 |         self.timeout = timeout
 65 |         self._verify_connection()
 66 | 
 67 |     def _verify_connection(self) -> None:
 68 |         try:
 69 |             response = requests.get(f"{self.base_url}/api/tags", timeout=5)
 70 |             response.raise_for_status()
 71 |             logger.info(f"Successfully connected to Ollama at {self.base_url}")
 72 |         except requests.exceptions.RequestException as e:
 73 |             raise ConnectionError(
 74 |                 f"Failed to connect to Ollama server at {self.base_url}. "
 75 |                 f"Make sure Ollama is running. Error: {e}"
 76 |             ) from e
 77 | 
 78 |     def _verify_model(self) -> bool:
 79 |         try:
 80 |             response = requests.get(f"{self.base_url}/api/tags", timeout=5)
 81 |             response.raise_for_status()
 82 |             data = response.json()
 83 |             available_models = [model["name"] for model in data.get("models", [])]
 84 |             return self.model_name in available_models
 85 |         except Exception as e:
 86 |             logger.warning(f"Failed to verify model availability: {e}")
 87 |             return False
 88 | 
 89 |     def _image_to_base64(self, image: Image.Image) -> str:
 90 |         buffer = BytesIO()
 91 |         image.save(buffer, format="PNG")
 92 |         return base64.b64encode(buffer.getvalue()).decode("utf-8")
 93 | 
 94 |     def extract(
 95 |         self,
 96 |         image: Union[str, Path, Image.Image],
 97 |         prompt: Optional[str] = None,
 98 |         structured: bool = False,
 99 |         output_format: str = "markdown",
100 |     ) -> Union[str, Dict[str, Any]]:
101 |         pil_image = load_image(image)
102 |         image_b64 = self._image_to_base64(pil_image)
103 | 
104 |         if prompt is None:
105 |             format_prompts = {
106 |                 "markdown": "Extrahiere den gesamten Text aus diesem Dokument im Markdown-Format.",
107 |                 "json": "Extrahiere den gesamten Text aus diesem Dokument als JSON.",
108 |                 "text": "Extrahiere den gesamten Text aus diesem Dokument als reinen Text.",
109 |                 "html": "Extrahiere den gesamten Text aus diesem Dokument als HTML.",
110 |             }
111 |             prompt = format_prompts.get(output_format, format_prompts["markdown"])
112 | 
113 |         payload = {
114 |             "model": self.model_name,
115 |             "prompt": prompt,
116 |             "images": [image_b64],
117 |             "stream": False,
118 |         }
119 | 
120 |         try:
121 |             response = requests.post(
122 |                 f"{self.base_url}/api/generate",
123 |                 json=payload,
124 |                 timeout=self.timeout,
125 |             )
126 |             response.raise_for_status()
127 |             result = response.json()
128 |             extracted_text = result.get("response", "").strip()
129 | 
130 |             if structured:
131 |                 return {
132 |                     "text": extracted_text,
133 |                     "model": self.model_name,
134 |                     "backend": "ollama",
135 |                     "format": output_format,
136 |                     "confidence": 1.0,
137 |                 }
138 |             return extracted_text
139 | 
140 |         except requests.exceptions.RequestException as e:
141 |             raise RuntimeError(f"OCR extraction failed: {e}") from e
142 | 
143 |     def extract_batch(
144 |         self,
145 |         images: List[Union[str, Path, Image.Image]],
146 |         prompt: Optional[str] = None,
147 |         structured: bool = False,
148 |         output_format: str = "markdown",
149 |     ) -> List[Union[str, Dict[str, Any]]]:
150 |         results = []
151 |         for i, image in enumerate(images):
152 |             try:
153 |                 result = self.extract(
154 |                     image, prompt=prompt, structured=structured, output_format=output_format
155 |                 )
156 |                 results.append(result)
157 |                 logger.info(f"Processed image {i+1}/{len(images)}")
158 |             except Exception as e:
159 |                 logger.error(f"Failed to process image {i+1}: {e}")
160 |                 if structured:
161 |                     results.append({"text": "", "error": str(e), "backend": "ollama"})
162 |                 else:
163 |                     results.append("")
164 |         return results
165 | 
166 |     @staticmethod
167 |     def is_available() -> bool:
168 |         try:
169 |             response = requests.get("http://localhost:11434/api/tags", timeout=2)
170 |             response.raise_for_status()
171 |             return True
172 |         except Exception:
173 |             return False
174 | 
175 |     @staticmethod
176 |     def list_models() -> Dict[str, Dict[str, str]]:
177 |         return list_available_models()
178 | 


--------------------------------------------------------------------------------
/german_ocr/hf_backend.py:
--------------------------------------------------------------------------------
  1 | """HuggingFace Transformers backend for German OCR using Qwen2-VL."""
  2 | 
  3 | import logging
  4 | from pathlib import Path
  5 | from typing import Any, Dict, List, Optional, Union
  6 | 
  7 | import torch
  8 | from PIL import Image
  9 | 
 10 | from german_ocr.utils import load_image
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class HuggingFaceBackend:
 16 |     """HuggingFace Transformers backend for OCR inference.
 17 | 
 18 |     This backend uses HuggingFace Transformers library to perform OCR
 19 |     using Qwen2-VL vision-language models fine-tuned for German documents.
 20 | 
 21 |     Args:
 22 |         model_name: HuggingFace model identifier
 23 |         device: Device to run inference on (auto, cuda, cpu, mps)
 24 |         quantization: Quantization mode (none, 4bit, 8bit)
 25 |     """
 26 | 
 27 |     def __init__(
 28 |         self,
 29 |         model_name: str = "Keyven/german-ocr",
 30 |         device: str = "auto",
 31 |         quantization: Optional[str] = None,
 32 |     ) -> None:
 33 |         """Initialize the HuggingFace backend."""
 34 |         self.model_name = model_name
 35 |         self.quantization = quantization
 36 |         self.device = self._get_device(device)
 37 | 
 38 |         logger.info(f"Loading model {model_name} on device {self.device}...")
 39 |         self._load_model()
 40 | 
 41 |     def _get_device(self, device: str) -> str:
 42 |         """Determine the device to use for inference.
 43 | 
 44 |         Args:
 45 |             device: Requested device (auto, cuda, cpu, mps)
 46 | 
 47 |         Returns:
 48 |             Device string
 49 |         """
 50 |         if device == "auto":
 51 |             if torch.cuda.is_available():
 52 |                 return "cuda"
 53 |             elif torch.backends.mps.is_available():
 54 |                 return "mps"
 55 |             else:
 56 |                 return "cpu"
 57 |         return device
 58 | 
 59 |     def _load_model(self) -> None:
 60 |         """Load the model and processor.
 61 | 
 62 |         Raises:
 63 |             RuntimeError: If model loading fails
 64 |         """
 65 |         try:
 66 |             from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 67 | 
 68 |             # Load processor
 69 |             self.processor = AutoProcessor.from_pretrained(self.model_name)
 70 | 
 71 |             # Configure quantization if requested
 72 |             model_kwargs: Dict[str, Any] = {"device_map": "auto"}
 73 | 
 74 |             if self.quantization == "4bit":
 75 |                 from transformers import BitsAndBytesConfig
 76 | 
 77 |                 model_kwargs["quantization_config"] = BitsAndBytesConfig(
 78 |                     load_in_4bit=True,
 79 |                     bnb_4bit_compute_dtype=torch.float16,
 80 |                 )
 81 |             elif self.quantization == "8bit":
 82 |                 model_kwargs["load_in_8bit"] = True
 83 |             elif self.device != "cpu":
 84 |                 model_kwargs["torch_dtype"] = torch.float16
 85 | 
 86 |             # Load Qwen2-VL model
 87 |             self.model = Qwen2VLForConditionalGeneration.from_pretrained(
 88 |                 self.model_name, **model_kwargs
 89 |             )
 90 | 
 91 |             self.model.eval()
 92 |             logger.info("Model loaded successfully")
 93 | 
 94 |         except Exception as e:
 95 |             raise RuntimeError(f"Failed to load model {self.model_name}: {e}") from e
 96 | 
 97 |     def extract(
 98 |         self,
 99 |         image: Union[str, Path, Image.Image],
100 |         prompt: Optional[str] = None,
101 |         structured: bool = False,
102 |         max_new_tokens: int = 512,
103 |     ) -> Union[str, Dict[str, Any]]:
104 |         """Extract text from an image using HuggingFace model.
105 | 
106 |         Args:
107 |             image: Path to image file or PIL Image object
108 |             prompt: Custom prompt for OCR (optional)
109 |             structured: Whether to return structured output (dict)
110 |             max_new_tokens: Maximum tokens to generate
111 | 
112 |         Returns:
113 |             Extracted text as string or structured dict
114 | 
115 |         Raises:
116 |             ValueError: If image is invalid
117 |             RuntimeError: If OCR extraction fails
118 |         """
119 |         # Load image
120 |         pil_image = load_image(image)
121 | 
122 |         # Prepare prompt (German for better results)
123 |         if prompt is None:
124 |             prompt = "Extrahiere den gesamten Text aus diesem Dokument im Markdown-Format."
125 | 
126 |         # Prepare inputs using Qwen2-VL chat format
127 |         try:
128 |             from qwen_vl_utils import process_vision_info
129 | 
130 |             messages = [{
131 |                 "role": "user",
132 |                 "content": [
133 |                     {"type": "image", "image": pil_image},
134 |                     {"type": "text", "text": prompt}
135 |                 ]
136 |             }]
137 | 
138 |             text = self.processor.apply_chat_template(
139 |                 messages, tokenize=False, add_generation_prompt=True
140 |             )
141 |             image_inputs, video_inputs = process_vision_info(messages)
142 |             inputs = self.processor(
143 |                 text=[text],
144 |                 images=image_inputs,
145 |                 videos=video_inputs,
146 |                 padding=True,
147 |                 return_tensors="pt"
148 |             ).to(self.model.device)
149 | 
150 |             # Generate
151 |             with torch.no_grad():
152 |                 outputs = self.model.generate(
153 |                     **inputs,
154 |                     max_new_tokens=max_new_tokens,
155 |                     do_sample=False,
156 |                 )
157 | 
158 |             # Decode output
159 |             generated_text = self.processor.batch_decode(
160 |                 outputs[:, inputs.input_ids.shape[1]:],
161 |                 skip_special_tokens=True
162 |             )[0]
163 | 
164 |             if structured:
165 |                 return {
166 |                     "text": generated_text,
167 |                     "model": self.model_name,
168 |                     "backend": "huggingface",
169 |                     "confidence": 1.0,
170 |                 }
171 |             return generated_text
172 | 
173 |         except Exception as e:
174 |             raise RuntimeError(f"OCR extraction failed: {e}") from e
175 | 
176 |     def extract_batch(
177 |         self,
178 |         images: List[Union[str, Path, Image.Image]],
179 |         prompt: Optional[str] = None,
180 |         structured: bool = False,
181 |         max_new_tokens: int = 512,
182 |         batch_size: int = 1,
183 |     ) -> List[Union[str, Dict[str, Any]]]:
184 |         """Extract text from multiple images.
185 | 
186 |         Args:
187 |             images: List of image paths or PIL Image objects
188 |             prompt: Custom prompt for OCR (optional)
189 |             structured: Whether to return structured output
190 |             max_new_tokens: Maximum tokens to generate
191 |             batch_size: Number of images to process at once
192 | 
193 |         Returns:
194 |             List of extracted texts or structured dicts
195 |         """
196 |         results = []
197 | 
198 |         # Process in batches
199 |         for i in range(0, len(images), batch_size):
200 |             batch = images[i : i + batch_size]
201 | 
202 |             for j, image in enumerate(batch):
203 |                 try:
204 |                     result = self.extract(
205 |                         image,
206 |                         prompt=prompt,
207 |                         structured=structured,
208 |                         max_new_tokens=max_new_tokens,
209 |                     )
210 |                     results.append(result)
211 |                     logger.info(f"Processed image {i+j+1}/{len(images)}")
212 |                 except Exception as e:
213 |                     logger.error(f"Failed to process image {i+j+1}: {e}")
214 |                     if structured:
215 |                         results.append(
216 |                             {"text": "", "error": str(e), "backend": "huggingface"}
217 |                         )
218 |                     else:
219 |                         results.append("")
220 | 
221 |         return results
222 | 
223 |     @staticmethod
224 |     def is_available() -> bool:
225 |         """Check if HuggingFace backend is available.
226 | 
227 |         Returns:
228 |             True if transformers library is installed
229 |         """
230 |         try:
231 |             import transformers  # noqa: F401
232 | 
233 |             return True
234 |         except ImportError:
235 |             return False
236 | 


--------------------------------------------------------------------------------
/german_ocr/cli.py:
--------------------------------------------------------------------------------
  1 | """Command Line Interface for German OCR."""
  2 | 
  3 | import argparse
  4 | import json
  5 | import sys
  6 | from pathlib import Path
  7 | from typing import List, Optional
  8 | 
  9 | from german_ocr import GermanOCR
 10 | 
 11 | 
 12 | def find_images_in_directory(directory: Path) -> List[Path]:
 13 |     """Find all image files in a directory.
 14 | 
 15 |     Args:
 16 |         directory: Directory to search
 17 | 
 18 |     Returns:
 19 |         List of image file paths
 20 |     """
 21 |     image_extensions = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".webp"}
 22 |     images = []
 23 | 
 24 |     for ext in image_extensions:
 25 |         images.extend(directory.glob(f"*{ext}"))
 26 |         images.extend(directory.glob(f"*{ext.upper()}"))
 27 | 
 28 |     return sorted(images)
 29 | 
 30 | 
 31 | def process_single_image(
 32 |     ocr: GermanOCR,
 33 |     image_path: Path,
 34 |     structured: bool,
 35 |     output_file: Optional[Path] = None,
 36 | ) -> None:
 37 |     """Process a single image and print or save results.
 38 | 
 39 |     Args:
 40 |         ocr: GermanOCR instance
 41 |         image_path: Path to image file
 42 |         structured: Whether to output structured JSON
 43 |         output_file: Optional output file path
 44 |     """
 45 |     try:
 46 |         result = ocr.extract(image_path, structured=structured)
 47 | 
 48 |         if structured:
 49 |             output = json.dumps(result, indent=2, ensure_ascii=False)
 50 |         else:
 51 |             output = result
 52 | 
 53 |         if output_file:
 54 |             output_file.write_text(output, encoding="utf-8")
 55 |             print(f"Results saved to {output_file}")
 56 |         else:
 57 |             print(output)
 58 | 
 59 |     except Exception as e:
 60 |         print(f"Error processing {image_path}: {e}", file=sys.stderr)
 61 |         sys.exit(1)
 62 | 
 63 | 
 64 | def process_batch(
 65 |     ocr: GermanOCR,
 66 |     images: List[Path],
 67 |     structured: bool,
 68 |     output_file: Optional[Path] = None,
 69 | ) -> None:
 70 |     """Process multiple images and print or save results.
 71 | 
 72 |     Args:
 73 |         ocr: GermanOCR instance
 74 |         images: List of image paths
 75 |         structured: Whether to output structured JSON
 76 |         output_file: Optional output file path
 77 |     """
 78 |     try:
 79 |         print(f"Processing {len(images)} images...")
 80 |         results = ocr.extract_batch(images, structured=structured)
 81 | 
 82 |         if structured:
 83 |             output_data = [
 84 |                 {"image": str(img), "result": result}
 85 |                 for img, result in zip(images, results)
 86 |             ]
 87 |             output = json.dumps(output_data, indent=2, ensure_ascii=False)
 88 |         else:
 89 |             output_data = [
 90 |                 f"=== {img.name} ===\n{result}\n" for img, result in zip(images, results)
 91 |             ]
 92 |             output = "\n".join(output_data)
 93 | 
 94 |         if output_file:
 95 |             output_file.write_text(output, encoding="utf-8")
 96 |             print(f"Results saved to {output_file}")
 97 |         else:
 98 |             print(output)
 99 | 
100 |     except Exception as e:
101 |         print(f"Error processing batch: {e}", file=sys.stderr)
102 |         sys.exit(1)
103 | 
104 | 
def _build_parser() -> argparse.ArgumentParser:
    """Create the argument parser that defines the ``german-ocr`` command line."""
    parser = argparse.ArgumentParser(
        description="German OCR - Extract text from images using DeepSeek models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract text from a single image
  german-ocr invoice.png

  # Process all images in a directory
  german-ocr --batch images/

  # Use specific backend
  german-ocr --backend ollama document.jpg

  # Get structured JSON output
  german-ocr --structured invoice.png

  # Save results to file
  german-ocr invoice.png --output result.txt

  # Process batch with structured output
  german-ocr --batch folder/ --structured --output results.json
        """,
    )

    # Optional at the parser level so that --list-backends / --list-models
    # work without an input path; main() enforces it otherwise.
    parser.add_argument(
        "input",
        type=str,
        nargs="?",
        help="Path to image file or directory (with --batch)",
    )

    parser.add_argument(
        "--batch",
        action="store_true",
        help="Process all images in the specified directory",
    )

    parser.add_argument(
        "--backend",
        type=str,
        choices=["auto", "ollama", "huggingface", "hf"],
        default="auto",
        help="OCR backend to use (default: auto)",
    )

    # No default here: forwarding an Ollama model name unconditionally would
    # hand the HuggingFace backend an invalid model id. With None, GermanOCR
    # falls back to the default model of whichever backend is selected.
    parser.add_argument(
        "--model",
        type=str,
        choices=["german-ocr-turbo", "german-ocr"],
        default=None,
        help=(
            "Model to use: german-ocr-turbo (fastest, 1.9GB) or german-ocr (3.2GB) "
            "(default: the selected backend's default model)"
        ),
    )

    # NOTE(review): --format is parsed but not forwarded to GermanOCR anywhere
    # in this module — confirm whether it should be wired to the backend.
    parser.add_argument(
        "--format",
        type=str,
        choices=["markdown", "json", "text", "html"],
        default="markdown",
        help="Output format for OCR extraction (default: markdown)",
    )

    parser.add_argument(
        "--list-models",
        action="store_true",
        help="List available German-OCR models and exit",
    )

    parser.add_argument(
        "--device",
        type=str,
        choices=["auto", "cuda", "cpu", "mps"],
        default="auto",
        help="Device for HuggingFace backend (default: auto)",
    )

    parser.add_argument(
        "--quantization",
        type=str,
        choices=["none", "4bit", "8bit"],
        help="Quantization mode for HuggingFace backend",
    )

    parser.add_argument(
        "--structured",
        action="store_true",
        help="Output structured JSON with metadata",
    )

    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: stdout)",
    )

    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable verbose logging",
    )

    parser.add_argument(
        "--list-backends",
        action="store_true",
        help="List available backends and exit",
    )

    return parser


def main() -> None:
    """Main entry point for the CLI.

    Parses ``sys.argv``, handles the informational ``--list-backends`` and
    ``--list-models`` flags, validates the input path, initializes
    ``GermanOCR`` and processes either a single image or, with ``--batch``,
    every image in a directory. Terminates via ``sys.exit`` with status 0
    after a listing, 1 on runtime errors and 2 on usage errors.
    """
    parser = _build_parser()
    args = parser.parse_args()

    # List backends if requested
    if args.list_backends:
        backends = GermanOCR.list_available_backends()
        print("Available backends:")
        for backend, available in backends.items():
            status = "[OK]" if available else "[--]"
            print(f"  {status} {backend}")
        sys.exit(0)

    # List models if requested
    if args.list_models:
        from german_ocr.ollama_backend import list_available_models

        models = list_available_models()
        print("Available German-OCR models:")
        print()
        for key, info in models.items():
            print(f"  {key}:")
            print(f"    Name: {info['name']}")
            print(f"    Size: {info['size']}")
            print(f"    Base: {info['base']}")
            print(f"    Speed: {info['speed']}")
            print(f"    Accuracy: {info['accuracy']}")
            print()
        sys.exit(0)

    # Validate input was provided
    if args.input is None:
        parser.error("the following arguments are required: input")

    # Validate input path
    input_path = Path(args.input)
    if not input_path.exists():
        print(f"Error: {input_path} does not exist", file=sys.stderr)
        sys.exit(1)

    # Prepare output file
    output_file = Path(args.output) if args.output else None

    # Initialize OCR
    try:
        log_level = "DEBUG" if args.verbose else "INFO"
        ocr = GermanOCR(
            backend=args.backend,
            model_name=args.model,
            device=args.device,
            quantization=args.quantization,
            log_level=log_level,
        )

        # Print backend info if verbose (stderr keeps stdout clean for results)
        if args.verbose:
            info = ocr.get_backend_info()
            print(f"Backend: {info['backend']}", file=sys.stderr)
            if "model" in info:
                print(f"Model: {info['model']}", file=sys.stderr)
            if "device" in info:
                print(f"Device: {info['device']}", file=sys.stderr)
            print("", file=sys.stderr)

    except Exception as e:
        print(f"Error initializing OCR: {e}", file=sys.stderr)
        sys.exit(1)

    # Process input
    if args.batch:
        if not input_path.is_dir():
            print(f"Error: {input_path} is not a directory", file=sys.stderr)
            sys.exit(1)

        images = find_images_in_directory(input_path)
        if not images:
            print(f"No images found in {input_path}", file=sys.stderr)
            sys.exit(1)

        process_batch(ocr, images, args.structured, output_file)
    else:
        if not input_path.is_file():
            print(f"Error: {input_path} is not a file", file=sys.stderr)
            sys.exit(1)

        process_single_image(ocr, input_path, args.structured, output_file)
300 | 
# Run the CLI only when this module is executed as a script, not when imported.
if __name__ == "__main__":
    main()
303 | 


--------------------------------------------------------------------------------
/german_ocr/ocr.py:
--------------------------------------------------------------------------------
  1 | """Main GermanOCR class with automatic backend selection."""
  2 | 
  3 | import logging
  4 | from pathlib import Path
  5 | from typing import Any, Dict, List, Optional, Union
  6 | 
  7 | from PIL import Image
  8 | 
  9 | from german_ocr.utils import setup_logging, validate_backend
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | class GermanOCR:
 15 |     """Production-ready German OCR with automatic backend selection.
 16 | 
 17 |     This class provides a unified interface for German OCR with support for
 18 |     multiple backends (Ollama, HuggingFace). It automatically selects the
 19 |     best available backend or allows manual selection.
 20 | 
 21 |     Args:
 22 |         backend: Backend to use ('auto', 'ollama', 'huggingface', 'hf')
 23 |         model_name: Model name for the selected backend
 24 |         device: Device for HuggingFace backend ('auto', 'cuda', 'cpu', 'mps')
 25 |         quantization: Quantization mode for HF backend ('none', '4bit', '8bit')
 26 |         log_level: Logging level ('DEBUG', 'INFO', 'WARNING', 'ERROR')
 27 | 
 28 |     Example:
 29 |         >>> ocr = GermanOCR()  # Auto-detect best backend
 30 |         >>> text = ocr.extract("invoice.png")
 31 |         >>> print(text)
 32 | 
 33 |         >>> # Use specific backend
 34 |         >>> ocr = GermanOCR(backend="ollama", model_name="Keyvan/german-ocr")
 35 |         >>> results = ocr.extract_batch(["img1.png", "img2.png"])
 36 |     """
 37 | 
 38 |     def __init__(
 39 |         self,
 40 |         backend: str = "auto",
 41 |         model_name: Optional[str] = None,
 42 |         device: str = "auto",
 43 |         quantization: Optional[str] = None,
 44 |         log_level: str = "INFO",
 45 |     ) -> None:
 46 |         """Initialize GermanOCR with the specified backend."""
 47 |         setup_logging(log_level)
 48 | 
 49 |         # Validate and normalize backend
 50 |         backend = validate_backend(backend)
 51 | 
 52 |         # Auto-detect backend if needed
 53 |         if backend == "auto":
 54 |             backend = self._detect_backend()
 55 |             logger.info(f"Auto-detected backend: {backend}")
 56 | 
 57 |         self.backend_name = backend
 58 |         self._backend: Optional[Any] = None
 59 | 
 60 |         # Initialize the selected backend
 61 |         if backend == "ollama":
 62 |             self._init_ollama(model_name)
 63 |         elif backend == "huggingface":
 64 |             self._init_huggingface(model_name, device, quantization)
 65 |         else:
 66 |             raise ValueError(f"Unsupported backend: {backend}")
 67 | 
 68 |         logger.info(f"GermanOCR initialized with {backend} backend")
 69 | 
 70 |     def _detect_backend(self) -> str:
 71 |         """Auto-detect the best available backend.
 72 | 
 73 |         Priority order:
 74 |         1. Ollama (fastest for local inference)
 75 |         2. HuggingFace (fallback)
 76 | 
 77 |         Returns:
 78 |             Backend name
 79 | 
 80 |         Raises:
 81 |             RuntimeError: If no backend is available
 82 |         """
 83 |         from german_ocr.hf_backend import HuggingFaceBackend
 84 |         from german_ocr.ollama_backend import OllamaBackend
 85 | 
 86 |         if OllamaBackend.is_available():
 87 |             logger.info("Ollama backend detected and available")
 88 |             return "ollama"
 89 | 
 90 |         if HuggingFaceBackend.is_available():
 91 |             logger.info("HuggingFace backend detected and available")
 92 |             return "huggingface"
 93 | 
 94 |         raise RuntimeError(
 95 |             "No OCR backend available. Please install either:\n"
 96 |             "  - Ollama: https://ollama.ai\n"
 97 |             "  - HuggingFace Transformers: pip install transformers torch"
 98 |         )
 99 | 
100 |     def _init_ollama(self, model_name: Optional[str]) -> None:
101 |         """Initialize Ollama backend.
102 | 
103 |         Args:
104 |             model_name: Ollama model name (optional)
105 |         """
106 |         from german_ocr.ollama_backend import OllamaBackend
107 | 
108 |         default_model = "german-ocr-turbo"
109 |         model = model_name if model_name else default_model
110 | 
111 |         try:
112 |             self._backend = OllamaBackend(model_name=model)
113 |             logger.info(f"Initialized Ollama backend with model: {model}")
114 |         except Exception as e:
115 |             raise RuntimeError(f"Failed to initialize Ollama backend: {e}") from e
116 | 
117 |     def _init_huggingface(
118 |         self,
119 |         model_name: Optional[str],
120 |         device: str,
121 |         quantization: Optional[str],
122 |     ) -> None:
123 |         """Initialize HuggingFace backend.
124 | 
125 |         Args:
126 |             model_name: HuggingFace model identifier (optional)
127 |             device: Device to use for inference
128 |             quantization: Quantization mode
129 |         """
130 |         from german_ocr.hf_backend import HuggingFaceBackend
131 | 
132 |         default_model = "Keyven/german-ocr"
133 |         model = model_name if model_name else default_model
134 | 
135 |         try:
136 |             self._backend = HuggingFaceBackend(
137 |                 model_name=model, device=device, quantization=quantization
138 |             )
139 |             logger.info(f"Initialized HuggingFace backend with model: {model}")
140 |         except Exception as e:
141 |             raise RuntimeError(f"Failed to initialize HuggingFace backend: {e}") from e
142 | 
143 |     def extract(
144 |         self,
145 |         image: Union[str, Path, Image.Image],
146 |         prompt: Optional[str] = None,
147 |         structured: bool = False,
148 |         **kwargs: Any,
149 |     ) -> Union[str, Dict[str, Any]]:
150 |         """Extract text from an image.
151 | 
152 |         Args:
153 |             image: Path to image file or PIL Image object
154 |             prompt: Custom prompt for OCR (optional)
155 |             structured: Whether to return structured output with metadata
156 |             **kwargs: Additional backend-specific parameters
157 | 
158 |         Returns:
159 |             Extracted text as string, or dict if structured=True
160 | 
161 |         Raises:
162 |             ValueError: If image is invalid
163 |             RuntimeError: If OCR extraction fails
164 | 
165 |         Example:
166 |             >>> text = ocr.extract("invoice.png")
167 |             >>> result = ocr.extract("invoice.png", structured=True)
168 |             >>> print(result['text'], result['confidence'])
169 |         """
170 |         if self._backend is None:
171 |             raise RuntimeError("Backend not initialized")
172 | 
173 |         try:
174 |             return self._backend.extract(
175 |                 image=image, prompt=prompt, structured=structured, **kwargs
176 |             )
177 |         except Exception as e:
178 |             logger.error(f"OCR extraction failed: {e}")
179 |             raise
180 | 
181 |     def extract_batch(
182 |         self,
183 |         images: List[Union[str, Path, Image.Image]],
184 |         prompt: Optional[str] = None,
185 |         structured: bool = False,
186 |         **kwargs: Any,
187 |     ) -> List[Union[str, Dict[str, Any]]]:
188 |         """Extract text from multiple images.
189 | 
190 |         Args:
191 |             images: List of image paths or PIL Image objects
192 |             prompt: Custom prompt for OCR (optional)
193 |             structured: Whether to return structured output
194 |             **kwargs: Additional backend-specific parameters
195 | 
196 |         Returns:
197 |             List of extracted texts or structured dicts
198 | 
199 |         Example:
200 |             >>> images = ["img1.png", "img2.png", "img3.png"]
201 |             >>> results = ocr.extract_batch(images)
202 |             >>> for i, text in enumerate(results):
203 |             ...     print(f"Image {i+1}: {text[:50]}...")
204 |         """
205 |         if self._backend is None:
206 |             raise RuntimeError("Backend not initialized")
207 | 
208 |         try:
209 |             return self._backend.extract_batch(
210 |                 images=images, prompt=prompt, structured=structured, **kwargs
211 |             )
212 |         except Exception as e:
213 |             logger.error(f"Batch OCR extraction failed: {e}")
214 |             raise
215 | 
216 |     def get_backend_info(self) -> Dict[str, Any]:
217 |         """Get information about the current backend.
218 | 
219 |         Returns:
220 |             Dictionary with backend details
221 | 
222 |         Example:
223 |             >>> info = ocr.get_backend_info()
224 |             >>> print(f"Using {info['backend']} with {info['model']}")
225 |         """
226 |         info = {"backend": self.backend_name}
227 | 
228 |         if hasattr(self._backend, "model_name"):
229 |             info["model"] = self._backend.model_name
230 | 
231 |         if hasattr(self._backend, "device"):
232 |             info["device"] = self._backend.device
233 | 
234 |         return info
235 | 
236 |     @staticmethod
237 |     def list_available_backends() -> Dict[str, bool]:
238 |         """List all available backends and their status.
239 | 
240 |         Returns:
241 |             Dictionary mapping backend names to availability status
242 | 
243 |         Example:
244 |             >>> backends = GermanOCR.list_available_backends()
245 |             >>> print(f"Ollama available: {backends['ollama']}")
246 |             >>> print(f"HuggingFace available: {backends['huggingface']}")
247 |         """
248 |         from german_ocr.hf_backend import HuggingFaceBackend
249 |         from german_ocr.ollama_backend import OllamaBackend
250 | 
251 |         return {
252 |             "ollama": OllamaBackend.is_available(),
253 |             "huggingface": HuggingFaceBackend.is_available(),
254 |         }
255 | 
256 |     @staticmethod
257 |     def list_models() -> Dict[str, Dict[str, str]]:
258 |         """List all available German-OCR models for Ollama backend.
259 | 
260 |         Returns:
261 |             Dictionary of available models with their details
262 |         """
263 |         from german_ocr.ollama_backend import list_available_models
264 |         return list_available_models()
265 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to the Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    Copyright 2025 Keyvan Hardani
179 | 
180 |    Licensed under the Apache License, Version 2.0 (the "License");
181 |    you may not use this file except in compliance with the License.
182 |    You may obtain a copy of the License at
183 | 
184 |        http://www.apache.org/licenses/LICENSE-2.0
185 | 
186 |    Unless required by applicable law or agreed to in writing, software
187 |    distributed under the License is distributed on an "AS IS" BASIS,
188 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 |    See the License for the specific language governing permissions and
190 |    limitations under the License.
191 | 


--------------------------------------------------------------------------------