├── .env.example ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── build.bat ├── mcp-image-recognition.code-workspace ├── requirements-dev.txt ├── requirements.txt ├── run.bat ├── setup.py ├── src └── image_recognition_server │ ├── __init__.py │ ├── server.py │ ├── utils │ ├── __init__.py │ ├── image.py │ └── ocr.py │ └── vision │ ├── __init__.py │ ├── anthropic.py │ └── openai.py └── tests ├── test_ocr.py └── test_server.py /.env.example: -------------------------------------------------------------------------------- 1 | # Vision Provider (anthropic or openai) 2 | VISION_PROVIDER=openai 3 | 4 | # Fallback Provider (optional, if primary provider fails) 5 | FALLBACK_PROVIDER= 6 | 7 | # Logging Level (DEBUG, INFO, WARNING, ERROR) 8 | LOG_LEVEL=ERROR 9 | 10 | # Anthropic Settings 11 | # ANTHROPIC_API_KEY=aaaaaaa 12 | # ANTHROPIC_MODEL=claude-3.5-lates 13 | 14 | # OpenAI Settings 15 | # OPENAI_TIMEOUT=60 16 | 17 | OPENAI_API_KEY=gggggggggg 18 | OPENAI_BASE_URL= 19 | OPENAI_MODEL=gpt-4o-mini 20 | 21 | # Optional: Set a custom base URL/Model for the OpenAI API 22 | # - openrouter 23 | # OPENAI_API_KEY=ooooooo 24 | # OPENAI_BASE_URL=https://openrouter.ai/api/v1 25 | # OPENAI_MODEL=anthropic/claude-3.5-sonnet:beta 26 | # - grok 27 | # OPENAI_API_KEY=xxxxxxx 28 | # OPENAI_BASE_URL=https://api.x.ai/v1 29 | # OPENAI_MODEL=grok-2-vision-latest 30 | 31 | # Tesseract OCR Settings 32 | # Set to 'true' to enable Tesseract OCR text extraction 33 | # ENABLE_OCR=false 34 | # Path to Tesseract executable 35 | # TESSERACT_CMD= 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 
| parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # escape=` 2 | 3 | # Use Windows Server Core as base image 4 | FROM mcr.microsoft.com/windows/servercore:ltsc2019 5 | 6 | # Set shell to PowerShell 7 | SHELL ["powershell", "-Command", "$ErrorActionPreference = 'Stop'; $ProgressPreference = 'SilentlyContinue';"] 8 | 9 | # Install Python 3.10 and Tesseract OCR 10 | RUN Invoke-WebRequest -Uri 'https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe' -OutFile 'python-3.10.0-amd64.exe'; ` 11 | Start-Process python-3.10.0-amd64.exe -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1' -Wait; ` 12 | Remove-Item python-3.10.0-amd64.exe; ` 13 | Invoke-WebRequest -Uri 'https://github.com/UB-Mannheim/tesseract/releases/download/v5.5.0/tesseract-ocr-w64-setup-5.5.0.20241111.exe' -OutFile 'tesseract-installer.exe'; ` 14 | Start-Process tesseract-installer.exe -ArgumentList '/S /D=C:\Program Files\Tesseract-OCR' -Wait; ` 15 | Remove-Item tesseract-installer.exe 16 | 17 | # Set working directory 18 | WORKDIR /app 19 | 20 | # Copy project files 21 | COPY requirements.txt . 
22 | COPY src/ ./src/ 23 | COPY .env.example ./.env 24 | 25 | # Install dependencies 26 | RUN pip install --no-cache-dir -r requirements.txt 27 | 28 | # Set environment variables 29 | ENV PYTHONPATH=/app/src 30 | ENV TESSERACT_CMD="C:\Program Files\Tesseract-OCR\tesseract.exe" 31 | 32 | # Run the server 33 | CMD ["python", "-m", "src.image_recognition_server.server"] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 mario-andreschak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MCP Image Recognition Server 2 | 3 | An MCP server that provides image recognition capabilities using Anthropic and OpenAI vision APIs. Version 0.1.2. 4 | 5 | ## Features 6 | 7 | - Image description using Anthropic Claude Vision or OpenAI GPT-4 Vision 8 | - Support for multiple image formats (JPEG, PNG, GIF, WebP) 9 | - Configurable primary and fallback providers 10 | - Base64 and file-based image input support 11 | - Optional text extraction using Tesseract OCR 12 | 13 | ## Requirements 14 | 15 | - Python 3.8 or higher 16 | - Tesseract OCR (optional) - Required for text extraction feature 17 | - Windows: Download and install from [UB-Mannheim/tesseract](https://github.com/UB-Mannheim/tesseract/wiki) 18 | - Linux: `sudo apt-get install tesseract-ocr` 19 | - macOS: `brew install tesseract` 20 | 21 | ## Installation 22 | 23 | 1. Clone the repository: 24 | ```bash 25 | git clone https://github.com/mario-andreschak/mcp-image-recognition.git 26 | cd mcp-image-recognition 27 | ``` 28 | 29 | 2. Create and configure your environment file: 30 | ```bash 31 | cp .env.example .env 32 | # Edit .env with your API keys and preferences 33 | ``` 34 | 35 | 3. Build the project: 36 | ```bash 37 | build.bat 38 | ``` 39 | 40 | ## Usage 41 | 42 | ### Running the Server 43 | Spawn the server using python: 44 | ```bash 45 | python -m image_recognition_server.server 46 | ``` 47 | 48 | Start the server using batch instead: 49 | ```bash 50 | run.bat server 51 | ``` 52 | 53 | Start the server in development mode with the MCP Inspector: 54 | ```bash 55 | run.bat debug 56 | ``` 57 | 58 | ### Available Tools 59 | 60 | 1. `describe_image` 61 | - Input: Base64-encoded image data and MIME type 62 | - Output: Detailed description of the image 63 | 64 | 2. 
`describe_image_from_file` 65 | - Input: Path to an image file 66 | - Output: Detailed description of the image 67 | 68 | ### Environment Configuration 69 | 70 | - `ANTHROPIC_API_KEY`: Your Anthropic API key. 71 | - `OPENAI_API_KEY`: Your OpenAI API key. 72 | - `VISION_PROVIDER`: Primary vision provider (`anthropic` or `openai`). 73 | - `FALLBACK_PROVIDER`: Optional fallback provider. 74 | - `LOG_LEVEL`: Logging level (DEBUG, INFO, WARNING, ERROR). 75 | - `ENABLE_OCR`: Enable Tesseract OCR text extraction (`true` or `false`). 76 | - `TESSERACT_CMD`: Optional custom path to Tesseract executable. 77 | - `OPENAI_MODEL`: OpenAI Model (default: `gpt-4o-mini`). Can use OpenRouter format for other models (e.g., `anthropic/claude-3.5-sonnet:beta`). 78 | - `OPENAI_BASE_URL`: Optional custom base URL for the OpenAI API. Set to `https://openrouter.ai/api/v1` for OpenRouter. 79 | - `OPENAI_TIMEOUT`: Optional custom timeout (in seconds) for the OpenAI API. 80 | 81 | ### Using OpenRouter 82 | 83 | OpenRouter allows you to access various models using the OpenAI API format. To use OpenRouter, follow these steps: 84 | 85 | 1. Obtain an OpenAI API key from OpenRouter. 86 | 2. Set `OPENAI_API_KEY` in your `.env` file to your OpenRouter API key. 87 | 3. Set `OPENAI_BASE_URL` to `https://openrouter.ai/api/v1`. 88 | 4. Set `OPENAI_MODEL` to the desired model using the OpenRouter format (e.g., `anthropic/claude-3.5-sonnet:beta`). 89 | 5. Set `VISION_PROVIDER` to `openai`. 90 | 91 | ### Default Models 92 | 93 | - Anthropic: `claude-3.5-sonnet-beta` 94 | - OpenAI: `gpt-4o-mini` 95 | - OpenRouter: Use the `anthropic/claude-3.5-sonnet:beta` format in `OPENAI_MODEL`. 
96 | 97 | ## Development 98 | 99 | ### Running Tests 100 | 101 | Run all tests: 102 | ```bash 103 | run.bat test 104 | ``` 105 | 106 | Run specific test suite: 107 | ```bash 108 | run.bat test server 109 | run.bat test anthropic 110 | run.bat test openai 111 | ``` 112 | 113 | ### Docker Support 114 | 115 | Build the Docker image: 116 | ```bash 117 | docker build -t mcp-image-recognition . 118 | ``` 119 | 120 | Run the container: 121 | ```bash 122 | docker run -it --env-file .env mcp-image-recognition 123 | ``` 124 | 125 | ## License 126 | 127 | MIT License - see LICENSE file for details. 128 | 129 | ## Release History 130 | 131 | - **0.1.2** (2025-02-20): Improved OCR error handling and added comprehensive test coverage for OCR functionality 132 | - **0.1.1** (2025-02-19): Added Tesseract OCR support for text extraction from images (optional feature) 133 | - **0.1.0** (2025-02-19): Initial release with Anthropic and OpenAI vision support 134 | -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | REM Build script for MCP Image Recognition Server 3 | 4 | REM Install dependencies 5 | pip install -r requirements.txt 6 | pip install -r requirements-dev.txt 7 | pip install -e . 
8 | 9 | REM Run code formatting 10 | black src/ 11 | isort src/ 12 | 13 | REM Run linting 14 | ruff check src/ 15 | mypy src/ 16 | 17 | REM Build package 18 | python setup.py build 19 | 20 | 21 | 22 | 23 | 24 | 25 | REM Run code formatting 26 | @REM black tests/ 27 | @REM isort tests/ 28 | 29 | REM Run linting 30 | @REM ruff check tests/ 31 | @REM mypy tests/ 32 | 33 | REM Run tests 34 | @REM pytest tests/ -v --cov=src 35 | -------------------------------------------------------------------------------- /mcp-image-recognition.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "." 5 | } 6 | ], 7 | "settings": {} 8 | } -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest>=7.0.0 3 | pytest-asyncio>=0.23.0 4 | pytest-cov>=4.1.0 5 | black>=23.0.0 6 | isort>=5.12.0 7 | mypy>=1.0.0 8 | ruff>=0.1.0 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mcp>=1.2.0 2 | anthropic>=0.8.0 3 | openai>=1.0.0 4 | python-dotenv>=1.0.0 5 | Pillow>=10.0.0 6 | numpy>=1.26.0 7 | pandas>=2.1.0 8 | pytesseract>=0.3.13 9 | -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | IF "%1"=="test" ( 3 | IF "%2"=="server" ( 4 | cls 5 | python -m pytest tests/test_server.py -v 6 | ) ELSE IF "%2"=="anthropic" ( 7 | cls 8 | python -m pytest tests/test_anthropic.py -v 9 | ) ELSE IF "%2"=="openai" ( 10 | cls 11 | python -m pytest tests/test_openai.py -v 12 | ) ELSE ( 13 | cls 14 | python -m pytest tests/ -v 15 | ) 16 | ) ELSE IF "%1"=="server" ( 17 | cls 18 | cd ./build/lib 19 | set 
PYTHONIOENCODING=utf-8 20 | python -m image_recognition_server.server 21 | cd ../.. 22 | ) ELSE IF "%1"=="debug" ( 23 | cls 24 | cd ./build/lib 25 | npx @modelcontextprotocol/inspector python -m image_recognition_server.server 26 | cd ../.. 27 | ) ELSE IF "%1"=="full" ( 28 | build.bat 29 | run.bat debug 30 | ) ELSE ( 31 | echo Invalid command. 32 | echo Usage: 33 | echo run.bat test [server ^| anthropic ^| openai] 34 | echo run.bat server 35 | echo run.bat debug 36 | echo run.bat full 37 | ) 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="mcp-image-recognition", 5 | version="0.1.1", 6 | description="MCP server for image recognition using Anthropic and OpenAI vision APIs", 7 | author="Mario", 8 | packages=find_packages(where="src"), 9 | package_dir={"": "src"}, 10 | python_requires=">=3.10", 11 | install_requires=[ 12 | "mcp>=1.2.0", 13 | "anthropic>=0.8.0", 14 | "openai>=1.0.0", 15 | "python-dotenv>=1.0.0", 16 | "Pillow>=10.0.0", 17 | "numpy>=1.26.0", 18 | "pandas>=2.1.0", 19 | "pytesseract>=0.3.13", 20 | ], 21 | extras_require={ 22 | "dev": [ 23 | "pytest>=7.0.0", 24 | "pytest-asyncio>=0.23.0", 25 | "pytest-cov>=4.1.0", 26 | "black>=23.0.0", 27 | "isort>=5.12.0", 28 | "mypy>=1.0.0", 29 | "ruff>=0.1.0", 30 | ] 31 | }, 32 | ) 33 | -------------------------------------------------------------------------------- /src/image_recognition_server/__init__.py: -------------------------------------------------------------------------------- 1 | """MCP server for image recognition using Anthropic and OpenAI vision APIs.""" 2 | 3 | __version__ = "0.1.0" 4 | -------------------------------------------------------------------------------- /src/image_recognition_server/server.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 
import base64
import io
import logging
import os
from typing import Union

from dotenv import load_dotenv
from mcp.server.fastmcp import FastMCP
from PIL import Image

from .utils.image import image_to_base64, validate_base64_image
from .utils.ocr import OCRError, extract_text_from_image
from .vision.anthropic import AnthropicVision
from .vision.openai import OpenAIVision

# Load environment variables before any os.getenv() calls below.
load_dotenv()

# Configure encoding, defaulting to UTF-8
DEFAULT_ENCODING = "utf-8"
ENCODING = os.getenv("MCP_OUTPUT_ENCODING", DEFAULT_ENCODING)

# Log to a file beside this module; stdout must stay clean for the MCP
# stdio transport.
log_file_path = os.path.join(os.path.dirname(__file__), "mcp_server.log")
logging.basicConfig(
    level=os.getenv("LOG_LEVEL", "INFO"),
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    filename=log_file_path,
    filemode="a",  # Append to log file
)
logger = logging.getLogger(__name__)

logger.info(f"Using encoding: {ENCODING}")


def sanitize_output(text: str) -> str:
    """Sanitize output string to replace problematic characters.

    Args:
        text: Text to sanitize. ``None`` is tolerated and mapped to "".

    Returns:
        str: Text round-tripped through ENCODING with unencodable
        characters replaced; the original text if re-encoding fails.
    """
    if text is None:
        return ""  # Return empty string for None
    try:
        return text.encode(ENCODING, "replace").decode(ENCODING)
    except Exception as e:
        logger.error(f"Error during sanitization: {str(e)}", exc_info=True)
        return text  # Return original text if sanitization fails


# Create MCP server
mcp = FastMCP(
    "mcp-image-recognition",
    description="MCP server for image recognition using Anthropic and OpenAI vision APIs",
)


# Initialize vision clients
def get_vision_client() -> Union[AnthropicVision, OpenAIVision]:
    """Get the configured vision client based on environment settings.

    Returns:
        The client for VISION_PROVIDER, or the FALLBACK_PROVIDER client
        when the primary client fails to initialize.

    Raises:
        ValueError: If VISION_PROVIDER is not ``anthropic`` or ``openai``.
        Exception: Re-raised constructor failure when no usable fallback
            is configured.
    """
    provider = os.getenv("VISION_PROVIDER", "anthropic").lower()

    try:
        if provider == "anthropic":
            return AnthropicVision()
        elif provider == "openai":
            return OpenAIVision()
        else:
            raise ValueError(f"Invalid vision provider: {provider}")
    except Exception as e:
        # Try fallback provider if configured and different from the primary.
        fallback = os.getenv("FALLBACK_PROVIDER")
        if fallback and fallback.lower() != provider:
            logger.warning(
                f"Primary provider failed: {str(e)}. Trying fallback: {fallback}"
            )
            if fallback.lower() == "anthropic":
                return AnthropicVision()
            elif fallback.lower() == "openai":
                return OpenAIVision()
        raise


async def process_image_with_ocr(
    image_data: str, prompt: str, mime_type: str = "image/png"
) -> str:
    """Process image with both vision AI and OCR.

    Args:
        image_data: Base64 encoded image data
        prompt: Prompt for vision AI
        mime_type: MIME type of the image. Forwarded to the vision API so
            non-PNG images are not mislabeled (previously the detected
            type was silently dropped).

    Returns:
        str: Combined description from vision AI and OCR

    Raises:
        ValueError: If the vision API returns an empty/default response,
            or OCR fails while ENABLE_OCR is true.
    """
    # Get vision AI description
    client = get_vision_client()

    # Handle both sync (Anthropic) and async (OpenAI) clients
    if isinstance(client, OpenAIVision):
        description = await client.describe_image(image_data, prompt, mime_type)
    else:
        description = client.describe_image(image_data, prompt, mime_type)

    # Check for empty or default response
    if not description or description == "No description available.":
        raise ValueError("Vision API returned empty or default response")

    # Handle OCR if enabled
    ocr_enabled = os.getenv("ENABLE_OCR", "false").lower() == "true"
    if ocr_enabled:
        try:
            # Convert base64 to PIL Image
            image_bytes = base64.b64decode(image_data)
            image = Image.open(io.BytesIO(image_bytes))

            # Extract text with OCR required flag
            if ocr_text := extract_text_from_image(image, ocr_required=True):
                description += (
                    f"\n\nAdditionally, this is the output of tesseract-ocr: {ocr_text}"
                )
        except OCRError as e:
            # Propagate OCR errors when OCR is enabled, keeping the cause chained.
            logger.error(f"OCR processing failed: {str(e)}")
            raise ValueError(f"OCR Error: {str(e)}") from e
        except Exception as e:
            logger.error(f"Unexpected error during OCR: {str(e)}")
            raise

    return sanitize_output(description)


@mcp.tool()
async def describe_image(
    image: str,
    prompt: str = "Please describe this image in detail.",
    mime_type: str = "image/png",
) -> str:
    """Describe the contents of an image using vision AI.

    Args:
        image: Base64 encoded image data
        prompt: Optional prompt to use for the description.
        mime_type: Optional MIME type of the image. Defaults to PNG for
            backward compatibility with existing callers.

    Returns:
        str: Detailed description of the image

    Raises:
        ValueError: If the image data is not valid base64 or processing
            returns an empty result.
    """
    try:
        logger.info(f"Processing image description request with prompt: {prompt}")
        logger.debug(f"Image data length: {len(image)}")

        # Validate image data
        if not validate_base64_image(image):
            raise ValueError("Invalid base64 image data")

        result = await process_image_with_ocr(image, prompt, mime_type)
        if not result:
            raise ValueError("Received empty response from processing")

        logger.info("Successfully processed image")
        return sanitize_output(result)
    except ValueError as e:
        logger.error(f"Input error: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Error describing image: {str(e)}", exc_info=True)
        raise


@mcp.tool()
async def describe_image_from_file(
    filepath: str, prompt: str = "Please describe this image in detail."
) -> str:
    """Describe the contents of an image file using vision AI.

    Args:
        filepath: Path to the image file
        prompt: Optional prompt to use for the description.

    Returns:
        str: Detailed description of the image

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is not a valid image or processing fails.
    """
    try:
        logger.info(f"Processing image file: {filepath}")

        # Convert image to base64 and detect its real MIME type.
        image_data, mime_type = image_to_base64(filepath)
        logger.info(f"Successfully converted image to base64. MIME type: {mime_type}")
        logger.debug(f"Base64 data length: {len(image_data)}")

        # Use describe_image tool, forwarding the detected MIME type so
        # JPEG/GIF/WebP files are not labeled image/png.
        result = await describe_image(
            image=image_data, prompt=prompt, mime_type=mime_type
        )

        if not result:
            raise ValueError("Received empty response from processing")

        return sanitize_output(result)
    except FileNotFoundError:
        logger.error(f"Image file not found: {filepath}")
        raise
    except ValueError as e:
        logger.error(f"Input error: {str(e)}")
        raise
    except Exception as e:
        logger.error(f"Error processing image file: {str(e)}", exc_info=True)
        raise


if __name__ == "__main__":
    mcp.run()

# -----------------------------------------------------------------------------
# /src/image_recognition_server/utils/__init__.py:
# -----------------------------------------------------------------------------
"""Utility functions for image handling and processing."""

from .image import image_to_base64, validate_base64_image

__all__ = ["image_to_base64", "validate_base64_image"]
import base64
import io
import logging
from pathlib import Path
from typing import Tuple

from PIL import Image, UnidentifiedImageError

logger = logging.getLogger(__name__)


def image_to_base64(image_path: str) -> Tuple[str, str]:
    """Convert an image file to base64 string and detect its MIME type.

    Args:
        image_path: Path to the image file

    Returns:
        Tuple of (base64_string, mime_type)

    Raises:
        FileNotFoundError: If image file doesn't exist
        ValueError: If file is not a valid image
    """
    path = Path(image_path)
    if not path.exists():
        logger.error(f"Image file not found: {image_path}")
        raise FileNotFoundError(f"Image file not found: {image_path}")

    try:
        # Open the file once to validate it and determine its format.
        with Image.open(path) as img:
            # Map Pillow format names to MIME types; anything unknown is
            # treated as opaque binary.
            format_to_mime = {
                "JPEG": "image/jpeg",
                "PNG": "image/png",
                "GIF": "image/gif",
                "WEBP": "image/webp",
            }
            mime_type = format_to_mime.get(img.format, "application/octet-stream")
            logger.info(
                f"Processing image: {image_path}, format: {img.format}, size: {img.size}"
            )

        # Encode the raw bytes so the payload is the original file, not a
        # Pillow re-encode.
        with path.open("rb") as f:
            base64_data = base64.b64encode(f.read()).decode("utf-8")
            logger.debug(f"Base64 data length: {len(base64_data)}")

        return base64_data, mime_type

    except UnidentifiedImageError as e:
        logger.error(f"Invalid image format: {str(e)}")
        raise ValueError(f"Invalid image format: {str(e)}") from e
    except OSError as e:
        logger.error(f"Failed to read image file: {str(e)}")
        raise ValueError(f"Failed to read image file: {str(e)}") from e
    except Exception as e:
        logger.error(f"Unexpected error processing image: {str(e)}", exc_info=True)
        raise ValueError(f"Failed to process image: {str(e)}") from e


def validate_base64_image(base64_string: str) -> bool:
    """Validate if a string is a valid base64-encoded image.

    Args:
        base64_string: The base64 string to validate

    Returns:
        bool: True if valid, False otherwise
    """
    try:
        # Try to decode base64
        image_data = base64.b64decode(base64_string)

        # Try to open as image; Image.open raises if the bytes are not a
        # recognizable image format.
        with Image.open(io.BytesIO(image_data)) as img:
            logger.debug(
                f"Validated base64 image, format: {img.format}, size: {img.size}"
            )
            return True

    except Exception as e:
        logger.warning(f"Invalid base64 image: {str(e)}")
        return False
import logging
import os
from typing import Optional

import pytesseract  # type: ignore
from PIL import Image

logger = logging.getLogger(__name__)


class OCRError(Exception):
    """Exception raised for OCR-related errors."""

    pass


def extract_text_from_image(
    image: Image.Image, ocr_required: bool = False
) -> Optional[str]:
    """Extract text from an image using Tesseract OCR.

    Args:
        image: PIL Image object to process
        ocr_required: If True, raise error when OCR fails. If False, return None.

    Returns:
        Optional[str]: Extracted text if successful, None if Tesseract is not available
        and ocr_required is False

    Raises:
        OCRError: If OCR fails and ocr_required is True
    """
    try:
        # Check if custom tesseract path is set in environment and not empty
        if tesseract_cmd := os.getenv("TESSERACT_CMD"):
            if tesseract_cmd.strip():  # Only set if path is non-empty
                pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

        # Extract text from image
        text = pytesseract.image_to_string(image)

        # Clean and validate result
        text = text.strip()
        if text:
            logger.info("Successfully extracted text from image using Tesseract")
            logger.debug(f"Extracted text length: {len(text)}")
            return text
        else:
            logger.info("No text found in image")
            return None

    except Exception as e:
        error_msg = f"Failed to extract text using Tesseract: {str(e)}"
        if "not installed" in str(e) or "not in your PATH" in str(e):
            error_msg = (
                "Tesseract OCR is not installed or not in PATH. "
                "Please install Tesseract and ensure it's in your system PATH, "
                "or set TESSERACT_CMD environment variable to the executable path."
            )

        logger.warning(error_msg)
        if ocr_required:
            # Chain the original exception so the root cause is preserved
            # for callers and log handlers.
            raise OCRError(error_msg) from e
        return None

# -----------------------------------------------------------------------------
# /src/image_recognition_server/vision/__init__.py:
# -----------------------------------------------------------------------------
"""Vision API integrations for image recognition."""

from .anthropic import AnthropicVision
from .openai import OpenAIVision

__all__ = ["AnthropicVision", "OpenAIVision"]
import logging
import os
from typing import Optional

from anthropic import Anthropic, APIConnectionError, APIError, APITimeoutError
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam

logger = logging.getLogger(__name__)


class AnthropicVision:
    def __init__(self, api_key: Optional[str] = None):
        """Initialize Anthropic Vision client.

        Args:
            api_key: Optional API key. If not provided, will try to get from environment.

        Raises:
            ValueError: If no API key is available.
        """
        self.api_key = api_key or os.getenv("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Anthropic API key not provided and not found in environment"
            )

        self.client = Anthropic(api_key=self.api_key)

    def describe_image(
        self,
        image: str,
        prompt: str = "Please describe this image in detail.",
        mime_type: str = "image/png",
    ) -> str:
        """Describe an image using Anthropic's Claude Vision.

        Args:
            image: string containing the base64 encoded image.
            prompt: Optional string containing the prompt.
            mime_type: MIME type of the image (must match the encoded data,
                Anthropic validates it server-side).

        Returns:
            str: Description of the image

        Raises:
            Exception: If API call fails
        """
        try:
            image_block = ImageBlockParam(
                type="image",
                source={"type": "base64", "media_type": mime_type, "data": image},
            )

            text_block = TextBlockParam(type="text", text=prompt)

            messages: list[MessageParam] = [
                {
                    "role": "user",
                    "content": [image_block, text_block],
                }
            ]

            # Get model from environment. The previous default
            # "claude-3.5-sonnet-beta" is not a valid Anthropic model ID;
            # use the documented "-latest" alias instead.
            model = os.getenv("ANTHROPIC_MODEL", "claude-3-5-sonnet-latest")

            # Make API call
            response = self.client.messages.create(
                model=model, max_tokens=1024, messages=messages
            )

            # Extract text from content blocks (skip tool-use or other
            # non-text block types).
            description = []
            for block in response.content:
                if hasattr(block, "text"):
                    description.append(block.text)

            # Return combined description or default message
            if description:
                return " ".join(description)
            return "No description available."

        except APITimeoutError as e:
            logger.error(f"Anthropic API timeout: {str(e)}")
            raise Exception(f"Request timed out: {str(e)}") from e
        except APIConnectionError as e:
            logger.error(f"Anthropic API connection error: {str(e)}")
            raise Exception(f"Connection error: {str(e)}") from e
        except APIError as e:
            logger.error(f"Anthropic API error: {str(e)}")
            raise Exception(f"API error: {str(e)}") from e
        except Exception as e:
            logger.error(
                f"Unexpected error in Anthropic Vision: {str(e)}", exc_info=True
            )
            raise Exception(f"Unexpected error: {str(e)}") from e
39 | prompt: String containing the prompt. 40 | 41 | Returns: 42 | str: Description of the image 43 | 44 | Raises: 45 | Exception: If API call fails 46 | """ 47 | try: 48 | # Get model from environment, default to gpt-4o-mini 49 | model = os.getenv("OPENAI_MODEL", "gpt-4o-mini") 50 | 51 | # Create message content 52 | response = await self.client.chat.completions.create( 53 | model=model, 54 | messages=[ 55 | { 56 | "role": "user", 57 | "content": [ 58 | { 59 | "type": "image_url", 60 | "image_url": { 61 | "url": f"data:{mime_type};base64,{image}" 62 | }, 63 | }, 64 | {"type": "text", "text": prompt}, 65 | ], 66 | } 67 | ], 68 | max_tokens=1024, 69 | ) 70 | 71 | # Extract and return description 72 | return response.choices[0].message.content or "No description available." 73 | 74 | except APITimeoutError as e: 75 | logger.error(f"OpenAI API timeout: {str(e)}") 76 | raise Exception(f"Request timed out: {str(e)}") 77 | except APIConnectionError as e: 78 | logger.error(f"OpenAI API connection error: {str(e)}") 79 | raise Exception(f"Connection error: {str(e)}") 80 | except RateLimitError as e: 81 | logger.error(f"OpenAI API rate limit exceeded: {str(e)}") 82 | raise Exception(f"Rate limit exceeded: {str(e)}") 83 | except APIError as e: 84 | logger.error(f"OpenAI API error: {str(e)}") 85 | raise Exception(f"API error: {str(e)}") 86 | except Exception as e: 87 | logger.error(f"Unexpected error in OpenAI Vision: {str(e)}", exc_info=True) 88 | raise Exception(f"Unexpected error: {str(e)}") 89 | -------------------------------------------------------------------------------- /tests/test_ocr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | from PIL import Image, ImageDraw, ImageFont 4 | from src.image_recognition_server.utils.ocr import extract_text_from_image, OCRError 5 | 6 | @pytest.fixture 7 | def text_image(): 8 | """Create a test image with text.""" 9 | # Create a larger image with high contrast 
10 | img = Image.new('RGB', (800, 200), color='white') 11 | d = ImageDraw.Draw(img) 12 | 13 | # Create a simple test string that's easier for OCR 14 | test_string = "TEST" 15 | 16 | # Draw text in large, clear font 17 | d.text((100, 50), test_string, fill='black', font=None) 18 | return img, test_string 19 | 20 | 21 | @pytest.fixture 22 | def empty_image(): 23 | """Create a blank test image.""" 24 | return Image.new('RGB', (100, 100), color='white') 25 | 26 | def test_basic_text_extraction(text_image): 27 | """Test extracting text from an image with clear text.""" 28 | img, expected_text = text_image 29 | result = extract_text_from_image(img) 30 | assert result is not None 31 | assert expected_text in result.upper() # Convert to uppercase for comparison 32 | 33 | def test_empty_image(empty_image): 34 | """Test handling of image with no text.""" 35 | result = extract_text_from_image(empty_image) 36 | assert result is None 37 | 38 | def test_tesseract_not_available(monkeypatch): 39 | """Test error handling when Tesseract isn't accessible.""" 40 | # Create a simple test image 41 | img = Image.new('RGB', (100, 100), color='white') 42 | 43 | # Mock pytesseract to raise an error 44 | def mock_image_to_string(*args, **kwargs): 45 | raise Exception("tesseract is not installed or it's not in your PATH") 46 | 47 | monkeypatch.setattr("pytesseract.image_to_string", mock_image_to_string) 48 | 49 | # Test with ocr_required=False 50 | result = extract_text_from_image(img, ocr_required=False) 51 | assert result is None 52 | 53 | # Test with ocr_required=True 54 | with pytest.raises(OCRError) as exc_info: 55 | extract_text_from_image(img, ocr_required=True) 56 | assert "Tesseract OCR is not installed" in str(exc_info.value) 57 | 58 | def test_custom_tesseract_path(monkeypatch): 59 | """Test using custom Tesseract path via env var.""" 60 | custom_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe" 61 | 62 | # Mock environment variable 63 | monkeypatch.setenv("TESSERACT_CMD", 
custom_path) 64 | 65 | # Mock pytesseract to verify the custom path was set 66 | def mock_image_to_string(*args, **kwargs): 67 | import pytesseract 68 | assert pytesseract.pytesseract.tesseract_cmd == custom_path 69 | return "Hello World" 70 | 71 | monkeypatch.setattr("pytesseract.image_to_string", mock_image_to_string) 72 | 73 | # Create a simple test image 74 | img = Image.new('RGB', (100, 100), color='white') 75 | result = extract_text_from_image(img) 76 | assert result == "Hello World" 77 | 78 | def test_ocr_required_flag(monkeypatch): 79 | """Test both True/False behaviors of ocr_required flag.""" 80 | img = Image.new('RGB', (100, 100), color='white') 81 | 82 | def mock_image_to_string(*args, **kwargs): 83 | return "" # Simulate no text found 84 | 85 | monkeypatch.setattr("pytesseract.image_to_string", mock_image_to_string) 86 | 87 | # Test with ocr_required=False (default) 88 | result = extract_text_from_image(img) 89 | assert result is None 90 | 91 | # Test with ocr_required=True 92 | result = extract_text_from_image(img, ocr_required=True) 93 | assert result is None # Should still be None since empty string is converted to None 94 | -------------------------------------------------------------------------------- /tests/test_server.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | from pathlib import Path 4 | from typing import AsyncGenerator 5 | 6 | import pytest 7 | import pytest_asyncio 8 | from mcp import ClientSession, StdioServerParameters, Tool, stdio_client 9 | 10 | # Test image (a simple 1x1 pixel PNG) 11 | TEST_IMAGE_DATA = base64.b64encode( 12 | bytes.fromhex( 13 | "89504e470d0a1a0a0000000d494844520000000100000001080600000001f15c" 14 | "4a00000009704859730000000ec400000ec401952b0e1b0000001c4944415478" 15 | "9c636460606062626060606060600000000000ffff030000060001f5f7e3c000" 16 | "00000049454e44ae426082" 17 | ) 18 | ).decode() 19 | 20 | 21 | @pytest_asyncio.fixture 22 | async def 
client() -> AsyncGenerator[ClientSession, None]: 23 | """Create a test client connected to the server.""" 24 | server_params = StdioServerParameters( 25 | command="python", 26 | args=["-m", "src.image_recognition_server.server"], 27 | env={ 28 | "ANTHROPIC_API_KEY": os.getenv("ANTHROPIC_API_KEY", "test_key"), 29 | "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "test_key"), 30 | "VISION_PROVIDER": "anthropic", 31 | "LOG_LEVEL": "DEBUG", 32 | }, 33 | ) 34 | 35 | async with stdio_client(server_params) as (read, write): 36 | async with ClientSession(read, write) as session: 37 | await session.initialize() 38 | yield session 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_list_tools(client: ClientSession): 43 | """Test that the server exposes the expected tools.""" 44 | tools: list[Tool] = await client.list_tools() 45 | tool_names = {tool.name for tool in tools} 46 | assert "describe_image" in tool_names 47 | assert "describe_image_from_file" in tool_names 48 | 49 | 50 | @pytest.mark.asyncio 51 | async def test_describe_image(client: ClientSession) -> None: 52 | """Test the describe_image tool with a test image.""" 53 | result = await client.call_tool( 54 | "describe_image", 55 | arguments={"image": {"data": TEST_IMAGE_DATA, "mime_type": "image/png"}}, 56 | ) 57 | assert isinstance(result, str) 58 | assert len(result) > 0 59 | 60 | 61 | @pytest.mark.asyncio 62 | async def test_describe_image_from_file(client: ClientSession, tmp_path: Path) -> None: 63 | """Test the describe_image_from_file tool with a test image file.""" 64 | # Create a test image file 65 | image_path = tmp_path / "test.png" 66 | image_data = base64.b64decode(TEST_IMAGE_DATA) 67 | image_path.write_bytes(image_data) 68 | 69 | result = await client.call_tool( 70 | "describe_image_from_file", arguments={"filepath": str(image_path)} 71 | ) 72 | assert isinstance(result, str) 73 | assert len(result) > 0 74 | 75 | 76 | @pytest.mark.asyncio 77 | async def test_invalid_image_data(client: ClientSession) 
-> None: 78 | """Test that the server handles invalid image data appropriately.""" 79 | with pytest.raises(Exception): 80 | await client.call_tool( 81 | "describe_image", 82 | arguments={"image": {"data": "invalid_base64", "mime_type": "image/png"}}, 83 | ) 84 | 85 | 86 | @pytest.mark.asyncio 87 | async def test_invalid_file_path(client: ClientSession) -> None: 88 | """Test that the server handles invalid file paths appropriately.""" 89 | with pytest.raises(Exception): 90 | await client.call_tool( 91 | "describe_image_from_file", arguments={"filepath": "/nonexistent/path.png"} 92 | ) 93 | --------------------------------------------------------------------------------