├── CLAUDE.md ├── src ├── providers │ ├── base │ │ ├── index.ts │ │ └── VisionProvider.ts │ ├── gemini │ │ └── index.ts │ ├── vertexai │ │ └── index.ts │ └── factory │ │ └── ProviderFactory.ts ├── utils │ ├── index.ts │ ├── credentialsParser.ts │ ├── imageAnnotator.ts │ ├── retry.ts │ └── validation.ts ├── storage │ ├── gcs │ │ ├── index.ts │ │ └── GCSStorage.ts │ └── index.ts ├── index.ts ├── tools │ ├── index.ts │ ├── analyze_video.ts │ ├── analyze_image.ts │ └── compare_images.ts ├── constants │ └── FunctionNames.ts ├── types │ ├── index.ts │ ├── Storage.ts │ ├── ObjectDetection.ts │ ├── Analysis.ts │ ├── Config.ts │ ├── Errors.ts │ └── Providers.ts ├── file-upload │ ├── vertexai │ │ └── VertexAIStorageStrategy.ts │ ├── gemini │ │ └── GeminiFilesAPI.ts │ └── factory │ │ └── FileUploadFactory.ts └── services │ └── FileService.ts ├── .prettierrc ├── .eslintrc.json ├── .claude └── settings.local.json ├── .gitignore ├── docs ├── TASK.md ├── PLAN.md ├── llm_logs │ ├── dynamic-schema-validation-fix-plan.md │ ├── vertex-ai-auth-fix-plan.md │ ├── percentage-scale-text-summary-plan.md │ └── web-context-detection-plan.md └── environment-variable-guide.md ├── LICENSE ├── tsconfig.json ├── package.json ├── .env.example ├── AGENTS.md └── README.md /CLAUDE.md: -------------------------------------------------------------------------------- 1 | AGENTS.md -------------------------------------------------------------------------------- /src/providers/base/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Base provider exports 3 | */ 4 | 5 | export * from './VisionProvider.js'; 6 | -------------------------------------------------------------------------------- /src/providers/gemini/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Gemini provider exports 3 | */ 4 | 5 | export * from './GeminiProvider.js'; 6 | 
-------------------------------------------------------------------------------- /src/providers/vertexai/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Vertex AI provider exports 3 | */ 4 | 5 | export * from './VertexAIProvider.js'; 6 | -------------------------------------------------------------------------------- /src/utils/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Utility exports 3 | */ 4 | 5 | export * from './validation.js'; 6 | export * from './retry.js'; 7 | -------------------------------------------------------------------------------- /src/storage/gcs/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Google Cloud Storage exports 3 | */ 4 | 5 | export { GCSStorageProvider } from './GCSStorage.js'; 6 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Main entry point for the AI Vision MCP server 5 | */ 6 | 7 | // Import and start the server 8 | import './server.js'; 9 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "trailingComma": "es5", 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2, 7 | "useTabs": false, 8 | "bracketSpacing": true, 9 | "arrowParens": "avoid", 10 | "endOfLine": "lf" 11 | } -------------------------------------------------------------------------------- /src/storage/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Storage providers exports 3 | */ 4 | 5 | export { 6 | StorageProvider, 7 | StorageFile, 8 | StorageConfig, 9 | } from '../types/Storage.js'; 10 | export { 
GCSStorageProvider } from './gcs/GCSStorage.js'; 11 | export type { GCSConfig } from '../types/Config.js'; 12 | -------------------------------------------------------------------------------- /src/tools/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Tools exports 3 | */ 4 | 5 | export { analyze_image, type AnalyzeImageArgs } from './analyze_image.js'; 6 | export { analyze_video, type AnalyzeVideoArgs } from './analyze_video.js'; 7 | export { 8 | detect_objects_in_image, 9 | type ObjectDetectionArgs, 10 | } from './detect_objects_in_image.js'; 11 | export { compare_images, type CompareImagesArgs } from './compare_images.js'; 12 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "es2022": true, 4 | "node": true, 5 | "jest": true 6 | }, 7 | "extends": [ 8 | "eslint:recommended" 9 | ], 10 | "parser": "@typescript-eslint/parser", 11 | "parserOptions": { 12 | "ecmaVersion": "latest", 13 | "sourceType": "module" 14 | }, 15 | "plugins": [ 16 | "@typescript-eslint" 17 | ], 18 | "rules": { 19 | "no-unused-vars": "off", 20 | "@typescript-eslint/no-unused-vars": "warn", 21 | "@typescript-eslint/no-explicit-any": "off", 22 | "no-console": "off", 23 | "prefer-const": "error", 24 | "no-case-declarations": "off", 25 | "no-useless-escape": "off" 26 | }, 27 | "ignorePatterns": ["dist/", "node_modules/", "*.js", "**/__tests__/**/*"] 28 | } -------------------------------------------------------------------------------- /.claude/settings.local.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "WebSearch", 5 | "WebFetch", 6 | "mcp__context7__resolve-library-id", 7 | "mcp__context7__get-library-docs", 8 | "Bash(npm run build:*)", 9 | "Bash(npm install:*)", 10 | "Bash(npm run lint:*)", 11 | 
"Bash(npm run format:*)", 12 | "Bash(npm run prepublishOnly:*)", 13 | "Bash(npx:*)", 14 | "Bash(mcp-inspector:*)", 15 | "Bash(node:*)", 16 | "mcp__ai-vision-mcp__analyze_image", 17 | "Bash(cat .env)", 18 | "mcp__ai-vision-mcp__analyze_video", 19 | "mcp__ai-vision-mcp__compare_images", 20 | "mcp__claude-context__index_codebase", 21 | "mcp__claude-context__search_code" 22 | ], 23 | "deny": [], 24 | "ask": [] 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Dependencies 4 | node_modules/ 5 | npm-debug.log* 6 | yarn-debug.log* 7 | yarn-error.log* 8 | 9 | # Build outputs 10 | dist/ 11 | build/ 12 | *.tsbuildinfo 13 | 14 | # Environment variables 15 | .env 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | # IDE 22 | .vscode/ 23 | .idea/ 24 | *.swp 25 | *.swo 26 | *~ 27 | 28 | # OS 29 | .DS_Store 30 | .DS_Store? 31 | ._* 32 | .Spotlight-V100 33 | .Trashes 34 | ehthumbs.db 35 | Thumbs.db 36 | 37 | # Logs 38 | logs 39 | *.log 40 | 41 | # Coverage 42 | coverage/ 43 | coverage-integration/ 44 | 45 | # Jest 46 | .nyc_output 47 | 48 | # Temporary files 49 | *.tmp 50 | *.temp 51 | 52 | # mcp-related configuration 53 | mcp-publisher.exe 54 | .mcpregistry_github_token 55 | .mcpregistry_registry_token -------------------------------------------------------------------------------- /docs/TASK.md: -------------------------------------------------------------------------------- 1 | ## Tasks to do: 2 | 3 | 1. what this image is about: https://images.pexels.com/photos/1391498/pexels-photo-1391498.jpeg 4 | 5 | 2. what this image is about: "C:\Users\tys\Downloads\rqo6ns_1024x1024.jpg" 6 | 7 | 3. compare the images: "C:\Users\tys\Downloads\rqo6ns_1024x1024.jpg" and https://images.pexels.com/photos/1391498/pexels-photo-1391498.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500 8 | 9 | 4. 
compare the images: https://img.freepik.com/free-photo/beautiful-girl-stands-park_8353-5084.jpg?semt=ais_hybrid&w=740&q=80 and https://images.pexels.com/photos/1391498/pexels-photo-1391498.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500 10 | 11 | 5. Detect the dress wear: https://ichef.bbci.co.uk/images/ic/480xn/p0529h01.jpg 12 | 13 | 6. Detect all buttons in this webpage: "C:\Users\tys\Downloads\export\static\input\tanyongsheng_screenshot.png" 14 | 15 | 7. what this video is about: https://www.youtube.com/watch?v=9hE5-98ZeCg 16 | 17 | 8. what this video is about: "C:\Users\tys\Downloads\test.mp4" 18 | -------------------------------------------------------------------------------- /src/constants/FunctionNames.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Centralized function name constants to avoid hard-coding 3 | */ 4 | 5 | export const FUNCTION_NAMES = { 6 | ANALYZE_IMAGE: 'analyze_image', 7 | COMPARE_IMAGES: 'compare_images', 8 | DETECT_OBJECTS_IN_IMAGE: 'detect_objects_in_image', 9 | ANALYZE_VIDEO: 'analyze_video', 10 | } as const; 11 | 12 | export type FunctionName = (typeof FUNCTION_NAMES)[keyof typeof FUNCTION_NAMES]; 13 | 14 | // Union types for different function groups 15 | export const IMAGE_FUNCTIONS = [ 16 | FUNCTION_NAMES.ANALYZE_IMAGE, 17 | FUNCTION_NAMES.COMPARE_IMAGES, 18 | FUNCTION_NAMES.DETECT_OBJECTS_IN_IMAGE, 19 | ] as const; 20 | 21 | export const VIDEO_FUNCTIONS = [FUNCTION_NAMES.ANALYZE_VIDEO] as const; 22 | 23 | export const ALL_FUNCTIONS = [...IMAGE_FUNCTIONS, ...VIDEO_FUNCTIONS] as const; 24 | 25 | // Type helpers 26 | export type ImageFunctionName = (typeof IMAGE_FUNCTIONS)[number]; 27 | export type VideoFunctionName = (typeof VIDEO_FUNCTIONS)[number]; 28 | export type AllFunctionName = (typeof ALL_FUNCTIONS)[number]; 29 | -------------------------------------------------------------------------------- /src/types/index.ts: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Central type exports for the Vision MCP Server 3 | */ 4 | 5 | // Export Config types with aliases to avoid conflicts 6 | export type { 7 | Config, 8 | GeminiConfig as GeminiProviderConfig, 9 | VertexAIConfig as VertexAIProviderConfig, 10 | FileUploadConfig, 11 | ApiConfig, 12 | FileProcessingConfig, 13 | LoggingConfig, 14 | DevelopmentConfig, 15 | } from './Config.js'; 16 | 17 | // Export all other types normally 18 | export * from './Analysis.js'; 19 | export type { 20 | StorageProvider, 21 | StorageFile, 22 | StorageConfig, 23 | UploadOptions, 24 | ListOptions, 25 | ListResult, 26 | SignedUrlOptions, 27 | } from './Storage.js'; 28 | export * from './Providers.js'; 29 | export type { 30 | VisionError, 31 | ConfigurationError, 32 | ProviderError, 33 | FileUploadError, 34 | FileNotFoundError, 35 | UnsupportedFileTypeError, 36 | FileSizeExceededError, 37 | RateLimitExceededError, 38 | AuthenticationError, 39 | AuthorizationError, 40 | NetworkError, 41 | ValidationError, 42 | StorageError, 43 | ErrorType, 44 | ErrorDetails, 45 | } from './Errors.js'; 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 AI Vision MCP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "ESNext", 5 | "moduleResolution": "node", 6 | "lib": ["ES2022"], 7 | "outDir": "./dist", 8 | "rootDir": "./src", 9 | "strict": true, 10 | "esModuleInterop": true, 11 | "skipLibCheck": true, 12 | "forceConsistentCasingInFileNames": true, 13 | "declaration": true, 14 | "declarationMap": true, 15 | "sourceMap": true, 16 | "removeComments": true, 17 | "noImplicitAny": true, 18 | "noImplicitReturns": true, 19 | "noImplicitThis": true, 20 | // "noUnusedLocals": true, 21 | // "noUnusedParameters": true, 22 | // "exactOptionalPropertyTypes": true, 23 | "noImplicitOverride": true, 24 | // "noPropertyAccessFromIndexSignature": true, 25 | // "noUncheckedIndexedAccess": true, 26 | "resolveJsonModule": true, 27 | "allowSyntheticDefaultImports": true, 28 | "experimentalDecorators": true, 29 | "emitDecoratorMetadata": true, 30 | "baseUrl": ".", 31 | "paths": { 32 | "@/*": ["src/*"] 33 | } 34 | }, 35 | "include": ["src/**/*"], 36 | "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"] 37 | } -------------------------------------------------------------------------------- /src/types/Storage.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Storage types for cloud storage providers 3 | 
*/ 4 | 5 | export interface StorageProvider { 6 | uploadFile( 7 | buffer: Buffer, 8 | filename: string, 9 | mimeType: string 10 | ): Promise; 11 | downloadFile(fileId: string): Promise; 12 | deleteFile(fileId: string): Promise; 13 | getPublicUrl(fileId: string): Promise; 14 | getSignedUrl(fileId: string, expiresIn: number): Promise; 15 | listFiles(prefix?: string): Promise; 16 | } 17 | 18 | export interface StorageFile { 19 | id: string; 20 | filename: string; 21 | mimeType: string; 22 | size: number; 23 | url: string; 24 | lastModified: string; 25 | etag?: string; 26 | metadata?: Record; 27 | } 28 | 29 | export interface StorageConfig { 30 | accessKey: string; 31 | secretKey: string; 32 | region: string; 33 | bucket: string; 34 | endpoint: string; 35 | cdnUrl?: string; 36 | forcePathStyle?: boolean; 37 | signatureVersion?: string; 38 | } 39 | 40 | export interface UploadOptions { 41 | metadata?: Record; 42 | contentType?: string; 43 | cacheControl?: string; 44 | expires?: Date; 45 | tags?: Record; 46 | } 47 | 48 | export interface ListOptions { 49 | prefix?: string; 50 | maxKeys?: number; 51 | continuationToken?: string; 52 | } 53 | 54 | export interface ListResult { 55 | files: StorageFile[]; 56 | isTruncated: boolean; 57 | nextContinuationToken?: string; 58 | count: number; 59 | } 60 | 61 | export interface SignedUrlOptions { 62 | expiresIn: number; 63 | method?: 'GET' | 'PUT' | 'DELETE'; 64 | contentType?: string; 65 | checksum?: string; 66 | } 67 | 68 | export interface StorageError extends Error { 69 | code: string; 70 | statusCode?: number; 71 | region?: string; 72 | time: Date; 73 | request_id?: string; 74 | } 75 | -------------------------------------------------------------------------------- /src/file-upload/vertexai/VertexAIStorageStrategy.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Vertex AI storage upload strategy using external storage 3 | */ 4 | 5 | import type { 6 | FileUploadStrategy, 7 | 
UploadedFile, 8 | FileReference, 9 | } from '../../types/Providers.js'; 10 | import type { StorageProvider } from '../../types/Storage.js'; 11 | import { FileUploadError } from '../../types/Errors.js'; 12 | 13 | export class VertexAIStorageStrategy implements FileUploadStrategy { 14 | constructor(private storageProvider: StorageProvider) {} 15 | 16 | async uploadFile( 17 | buffer: Buffer, 18 | filename: string, 19 | mimeType: string 20 | ): Promise { 21 | try { 22 | return await this.storageProvider.uploadFile(buffer, filename, mimeType); 23 | } catch (error) { 24 | throw new FileUploadError( 25 | `Failed to upload file to external storage: ${error instanceof Error ? error.message : String(error)}`, 26 | 'vertex_ai', 27 | error instanceof Error ? error : new Error(String(error)) 28 | ); 29 | } 30 | } 31 | 32 | async getFileForAnalysis(uploadedFile: UploadedFile): Promise { 33 | if (!uploadedFile.id) { 34 | throw new FileUploadError( 35 | 'Uploaded file does not have an ID for analysis', 36 | 'vertex_ai' 37 | ); 38 | } 39 | 40 | // For Vertex AI with native GCS, the URL is already in gs:// format 41 | const gcsUri = await this.storageProvider.getPublicUrl(uploadedFile.id); 42 | 43 | return { 44 | type: 'file_uri', 45 | uri: gcsUri, 46 | mimeType: uploadedFile.mimeType, 47 | }; 48 | } 49 | 50 | async cleanup(fileId: string): Promise { 51 | try { 52 | await this.storageProvider.deleteFile(fileId); 53 | } catch (error) { 54 | // Log error but don't throw - cleanup failures shouldn't block the main flow 55 | console.warn(`Failed to cleanup storage file ${fileId}:`, error); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/file-upload/gemini/GeminiFilesAPI.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Gemini Files API upload strategy 3 | */ 4 | 5 | import type { 6 | FileUploadStrategy, 7 | UploadedFile, 8 | FileReference, 9 | } from 
'../../types/Providers.js'; 10 | import { GeminiProvider } from '../../providers/gemini/GeminiProvider.js'; 11 | import { FileUploadError } from '../../types/Errors.js'; 12 | 13 | export class GeminiFilesAPI implements FileUploadStrategy { 14 | constructor(private geminiProvider: GeminiProvider) {} 15 | 16 | async uploadFile( 17 | buffer: Buffer, 18 | filename: string, 19 | mimeType: string 20 | ): Promise { 21 | try { 22 | return await this.geminiProvider.uploadFile(buffer, filename, mimeType); 23 | } catch (error) { 24 | throw new FileUploadError( 25 | `Failed to upload file to Gemini Files API: ${error instanceof Error ? error.message : String(error)}`, 26 | 'gemini', 27 | error instanceof Error ? error : new Error(String(error)) 28 | ); 29 | } 30 | } 31 | 32 | async getFileForAnalysis(uploadedFile: UploadedFile): Promise { 33 | if (!uploadedFile.uri) { 34 | throw new FileUploadError( 35 | 'Uploaded file does not have a URI for analysis', 36 | 'gemini' 37 | ); 38 | } 39 | 40 | // Wait for the file to become ACTIVE before returning it for analysis 41 | await this.waitForFileProcessing(uploadedFile.id); 42 | 43 | return { 44 | type: 'file_uri', 45 | uri: uploadedFile.uri, 46 | mimeType: uploadedFile.mimeType, 47 | }; 48 | } 49 | 50 | private async waitForFileProcessing(fileId: string): Promise { 51 | await this.geminiProvider.waitForFileProcessing(fileId); 52 | } 53 | 54 | async cleanup(fileId: string): Promise { 55 | try { 56 | await this.geminiProvider.deleteFile(fileId); 57 | } catch (error) { 58 | // Log error but don't throw - cleanup failures shouldn't block the main flow 59 | console.warn(`Failed to cleanup Gemini file ${fileId}:`, error); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/file-upload/factory/FileUploadFactory.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * File upload strategy factory 3 | */ 4 | 5 | import type { 
FileUploadStrategy } from '../../types/Providers.js'; 6 | import type { Config } from '../../types/Config.js'; 7 | import { GeminiFilesAPI } from '../gemini/GeminiFilesAPI.js'; 8 | import { VertexAIStorageStrategy } from '../vertexai/VertexAIStorageStrategy.js'; 9 | import { GCSStorageProvider } from '../../storage/gcs/GCSStorage.js'; 10 | import { GeminiProvider } from '../../providers/gemini/GeminiProvider.js'; 11 | import { ConfigurationError } from '../../types/Errors.js'; 12 | import { ConfigService } from '../../services/ConfigService.js'; 13 | 14 | export class FileUploadFactory { 15 | static createStrategy( 16 | config: Config, 17 | type: 'image' | 'video', 18 | visionProvider: GeminiProvider 19 | ): FileUploadStrategy { 20 | const providerName = 21 | type === 'image' ? config.IMAGE_PROVIDER : config.VIDEO_PROVIDER; 22 | 23 | switch (providerName) { 24 | case 'google': 25 | return new GeminiFilesAPI(visionProvider); 26 | 27 | case 'vertex_ai': 28 | // For Vertex AI, we need Google Cloud Storage with native GCS SDK 29 | const gcsConfig = ConfigService.getInstance().getGCSConfig(); 30 | const storageProvider = new GCSStorageProvider(gcsConfig); 31 | return new VertexAIStorageStrategy(storageProvider); 32 | 33 | default: 34 | throw new ConfigurationError( 35 | `Unsupported provider for file upload: ${providerName}` 36 | ); 37 | } 38 | } 39 | 40 | static getThreshold(config: Config, type: 'image' | 'video'): number { 41 | const providerName = 42 | type === 'image' ? 
config.IMAGE_PROVIDER : config.VIDEO_PROVIDER; 43 | 44 | if (providerName === 'google') { 45 | return config.GEMINI_FILES_API_THRESHOLD || 10 * 1024 * 1024; // 10MB default 46 | } 47 | 48 | if (providerName === 'vertex_ai') { 49 | // Vertex AI requires external storage for all files 50 | return config.VERTEX_AI_FILES_API_THRESHOLD || 0; 51 | } 52 | 53 | return 0; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /docs/PLAN.md: -------------------------------------------------------------------------------- 1 | ## DONE 2 | [x] should we split MAX_TOKENS environment variable into MAX_TOKENS_FOR_IMAGE and MAX_TOKENS_FOR_VIDEO? Justify and don't write the code first. 3 | [x] add detect_objects_in_image mcp function 4 | [x] add native support to change GEMINI_BASE_URL to be switched to another proxy provider 5 | 6 | ## TODO 7 | 8 | **IMPLEMENTED BUT TO BE TESTED** 9 | - add TEMPERATURE_FOR_IMAGE, TOP_P_FOR_IMAGE, TOP_K_FOR_IMAGE, TEMPERATURE_FOR_VIDEO, TOP_P_FOR_VIDEO, TOP_K_FOR_VIDEO (prepare for future....) 10 | - add MAX_TOKENS, TOP_P, TOP_K, TEMPERATURE configurable as environment variable for each function 11 | - add MAX_TOKENS, since above set the standard for TOP_P, TOP_K, TEMPERATURE... 12 | - add ANALYZE_IMAGE_MODEL, COMPARE_IMAGES_MODEL , ANALYZE_VIDEO_MODEL environment variable... 13 | 14 | **URGENT** 15 | - .. 16 | 17 | **ICEBOX** 18 | - optional dependencies to download when define, for example, ai-vision-mcp[google] (Reason: a bit hard to manage, currently only two providers so the dependencies not that large yet ...) 19 | 20 | **DISPOSAL** 21 | 22 | - remove MAX_VIDEO_DURATION environment variable... 23 | 24 | - let user to add their custom SYSTEM_INSTRUCTIONS_FOR_IMAGE_MODEL and SYSTEM_INSTRUCTIONS_FOR_VIDEO_MODEL ... (Reason: hard to control the behaviour, for example, somebody may inject harmful prompt?) 25 | 26 | - add analyze_image description for prompt params: "Detailed text prompt. 
If the task is **front-end code replication**, the prompt you provide must be: "Describe in detail the layout structure, color style, main components, and interactive elements of the website in this image to facilitate subsequent code generation by the model." + your additional requirements. \ For **other tasks**, the prompt you provide must clearly describe what to analyze, extract, or understand from the image." (Reason: wait too long for such task to complete, but can try to add `timeout` params to mcp client in future) 27 | 28 | - add metadata params per model level like supportsThinking, supportsNoThinking - set thinkingbudget=0 for all models (except gemini 2.5 pro) - (Reason: hard to add another layer `thinking` as I think thinking_budget is not that useful for image analysis - unsure about this...) 29 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ai-vision-mcp", 3 | "version": "0.0.5", 4 | "description": "Vision MCP server that provides AI-powered image and video analysis using Google Gemini and Vertex AI", 5 | "main": "dist/index.js", 6 | "type": "module", 7 | "scripts": { 8 | "build": "tsc", 9 | "dev": "tsc --watch", 10 | "start": "node dist/index.js", 11 | "lint": "eslint src/**/*.ts", 12 | "lint:fix": "eslint src/**/*.ts --fix", 13 | "format": "prettier --write src/**/*.ts", 14 | "prepare": "npm run build", 15 | "prepublishOnly": "npm run lint", 16 | "preversion": "npm run lint", 17 | "version": "npm run format && git add -A src", 18 | "postversion": "git push && git push --tags", 19 | "publish-registry": "mcp-publisher publish", 20 | "publish-registry:dry-run": "mcp-publisher publish --dry-run" 21 | }, 22 | "keywords": [ 23 | "mcp", 24 | "vision", 25 | "ai", 26 | "image-analysis", 27 | "video-analysis", 28 | "gemini", 29 | "vertex-ai", 30 | "google-ai" 31 | ], 32 | "author": "Tan Yong Sheng ", 33 | 
"license": "MIT", 34 | "bin": { 35 | "ai-vision-mcp": "dist/index.js" 36 | }, 37 | "files": [ 38 | "dist", 39 | "README.md", 40 | "LICENSE" 41 | ], 42 | "repository": { 43 | "type": "git", 44 | "url": "https://github.com/tan-yong-sheng/ai-vision-mcp.git" 45 | }, 46 | "bugs": { 47 | "url": "https://github.com/tan-yong-sheng/ai-vision-mcp/issues" 48 | }, 49 | "homepage": "https://github.com/tan-yong-sheng/ai-vision-mcp#readme", 50 | "dependencies": { 51 | "@google-cloud/storage": "^7.17.1", 52 | "@google/genai": "^1.24.0", 53 | "@modelcontextprotocol/sdk": "^1.0.0", 54 | "dotenv": "^16.4.5", 55 | "mime-types": "^2.1.35", 56 | "node-fetch": "^3.3.2", 57 | "sharp": "^0.33.5", 58 | "zod": "^3.23.8" 59 | }, 60 | "devDependencies": { 61 | "@types/glob": "^8.1.0", 62 | "@types/html-to-text": "^9.0.4", 63 | "@types/http-cache-semantics": "^4.0.4", 64 | "@types/long": "^4.0.2", 65 | "@types/mime-types": "^2.1.4", 66 | "@types/node": "^20.14.0", 67 | "@types/phoenix": "^1.6.6", 68 | "@types/ws": "^8.18.1", 69 | "@types/yauzl": "^2.10.3", 70 | "@typescript-eslint/eslint-plugin": "^7.18.0", 71 | "@typescript-eslint/parser": "^7.18.0", 72 | "eslint": "^8.57.0", 73 | "eslint-config-prettier": "^9.1.2", 74 | "eslint-plugin-prettier": "^5.5.4", 75 | "prettier": "^3.3.2", 76 | "typescript": "^5.5.0" 77 | }, 78 | "engines": { 79 | "node": ">=18.0.0" 80 | }, 81 | "publishConfig": { 82 | "access": "public" 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/tools/analyze_video.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * MCP Tool: analyze_video 3 | * Analyzes a video using AI vision models. Supports URLs and local file paths. 
4 | */ 5 | 6 | import type { AnalysisOptions, AnalysisResult } from '../types/Providers.js'; 7 | import type { VisionProvider } from '../types/Providers.js'; 8 | import { FileService } from '../services/FileService.js'; 9 | import type { Config } from '../types/Config.js'; 10 | import { VisionError } from '../types/Errors.js'; 11 | import { FUNCTION_NAMES } from '../constants/FunctionNames.js'; 12 | 13 | export interface AnalyzeVideoArgs { 14 | videoSource: string; // Can be URL or local file path 15 | prompt: string; 16 | options?: AnalysisOptions; 17 | } 18 | 19 | export async function analyze_video( 20 | args: AnalyzeVideoArgs, 21 | config: Config, 22 | videoProvider: VisionProvider, 23 | videoFileService: FileService 24 | ): Promise { 25 | try { 26 | // Validate arguments 27 | if (!args.videoSource) { 28 | throw new VisionError('videoSource is required', 'MISSING_ARGUMENT'); 29 | } 30 | if (!args.prompt) { 31 | throw new VisionError('prompt is required', 'MISSING_ARGUMENT'); 32 | } 33 | 34 | // Handle video source (URL vs local file) 35 | const processedVideoSource = await videoFileService.handleVideoSource( 36 | args.videoSource 37 | ); 38 | 39 | // Merge default options with provided options 40 | const options: AnalysisOptions = { 41 | temperature: 42 | config.TEMPERATURE_FOR_ANALYZE_VIDEO ?? 43 | config.TEMPERATURE_FOR_VIDEO ?? 44 | config.TEMPERATURE, 45 | topP: 46 | config.TOP_P_FOR_ANALYZE_VIDEO ?? 47 | config.TOP_P_FOR_VIDEO ?? 48 | config.TOP_P, 49 | topK: 50 | config.TOP_K_FOR_ANALYZE_VIDEO ?? 51 | config.TOP_K_FOR_VIDEO ?? 52 | config.TOP_K, 53 | maxTokens: 54 | config.MAX_TOKENS_FOR_ANALYZE_VIDEO ?? 55 | config.MAX_TOKENS_FOR_VIDEO ?? 
56 | config.MAX_TOKENS, 57 | taskType: 'video', 58 | functionName: FUNCTION_NAMES.ANALYZE_VIDEO, 59 | ...args.options, // User options override defaults 60 | }; 61 | 62 | // Analyze the video 63 | const result = await videoProvider.analyzeVideo( 64 | processedVideoSource, 65 | args.prompt, 66 | options 67 | ); 68 | 69 | return result; 70 | } catch (error) { 71 | console.error('Error in analyze_video tool:', error); 72 | 73 | if (error instanceof VisionError) { 74 | throw error; 75 | } 76 | 77 | throw new VisionError( 78 | `Failed to analyze video: ${error instanceof Error ? error.message : String(error)}`, 79 | 'ANALYSIS_ERROR', 80 | 'gemini', 81 | error instanceof Error ? error : undefined 82 | ); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /docs/llm_logs/dynamic-schema-validation-fix-plan.md: -------------------------------------------------------------------------------- 1 | # Plan: Fix Dynamic MAX_IMAGES_FOR_COMPARISON Schema Validation 2 | 3 | ## Problem Description 4 | 5 | The `compare_images` MCP tool has inconsistent validation: 6 | - **Schema validation** (server.ts): Hardcoded to max 4 images 7 | - **Tool implementation** (compare_images.ts): Respects `MAX_IMAGES_FOR_COMPARISON` environment variable 8 | 9 | This means users cannot use more than 4 images even if they configure `MAX_IMAGES_FOR_COMPARISON=6`. 10 | 11 | ## Root Cause 12 | 13 | The MCP tool registration in `server.ts` uses a static Zod schema that's defined at module load time, before configuration is available: 14 | 15 | ```typescript 16 | // Current - HARDCODED 17 | imageSources: z 18 | .array(z.string()) 19 | .min(2) 20 | .max(4) // ← Static value, ignores config 21 | ``` 22 | 23 | ## Solution Strategy 24 | 25 | ### Option 1: Lazy Schema Generation (Recommended) 26 | Move the schema creation inside the tool handler where config is available. 
27 | 28 | ### Option 2: Dynamic Schema Factory 29 | Create a schema factory function that accepts max images parameter. 30 | 31 | ### Option 3: Configuration-based Registration 32 | Register tools after configuration is loaded. 33 | 34 | **Selected: Option 1** - Most straightforward and maintains existing patterns. 35 | 36 | ## Implementation Plan 37 | 38 | 1. **Modify server.ts**: 39 | - Move schema validation from registration to handler 40 | - Use manual validation with config values 41 | - Keep Zod for type safety but make limits dynamic 42 | 43 | 2. **Update validation logic**: 44 | - Read `MAX_IMAGES_FOR_COMPARISON` from config 45 | - Apply dynamic validation in handler 46 | - Maintain backward compatibility 47 | 48 | 3. **Preserve error consistency**: 49 | - Same error format as current Zod validation 50 | - Clear error messages for users 51 | 52 | ## Implementation Details 53 | 54 | ```typescript 55 | // Before (hardcoded) 56 | inputSchema: { 57 | imageSources: z.array(z.string()).min(2).max(4) 58 | } 59 | 60 | // After (dynamic) 61 | inputSchema: { 62 | imageSources: z.array(z.string()).min(2) // Remove max, validate in handler 63 | } 64 | 65 | // Handler validation: 66 | const { config } = getServices(); 67 | const maxImages = config.MAX_IMAGES_FOR_COMPARISON || 4; 68 | if (imageSources.length > maxImages) { 69 | throw new Error(`Maximum ${maxImages} images allowed`); 70 | } 71 | ``` 72 | 73 | ## Benefits 74 | 75 | - ✅ Respects user configuration 76 | - ✅ Consistent behavior across schema and implementation 77 | - ✅ No breaking changes to existing API 78 | - ✅ Maintains type safety 79 | 80 | ## Risk Assessment 81 | 82 | - **Risk Level**: Low 83 | - **Breaking Changes**: None 84 | - **Backward Compatibility**: Full 85 | - **Testing**: Can validate with different MAX_IMAGES_FOR_COMPARISON values 86 | 87 | ## Expected Outcome 88 | 89 | After fix: 90 | - `MAX_IMAGES_FOR_COMPARISON=6` → Users can compare up to 6 images 91 | - `MAX_IMAGES_FOR_COMPARISON=2` → 
Users can compare up to 2 images 92 | - Default behavior unchanged (max 4 images) -------------------------------------------------------------------------------- /src/types/ObjectDetection.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Object detection types for AI Vision MCP 3 | */ 4 | 5 | import type { AnalysisOptions } from './Providers.js'; 6 | 7 | export interface DetectedObject { 8 | object: string; // Generic category for detected object 9 | label: string; // Descriptive label or instance-specific detail 10 | normalized_box_2d: [number, number, number, number]; // [ymin, xmin, ymax, xmax] normalized to 0-1000 11 | } 12 | 13 | export interface ObjectDetectionResult { 14 | detections: DetectedObject[]; 15 | image_metadata: { 16 | width: number; 17 | height: number; 18 | size_bytes: number; 19 | format: string; 20 | }; 21 | processing_time?: number; 22 | model: string; 23 | provider: string; 24 | } 25 | 26 | export interface ObjectDetectionArgs { 27 | imageSource: string; // URL, base64, or local file path 28 | prompt?: string; // Optional custom detection prompt 29 | outputFilePath?: string; // Optional explicit output path 30 | options?: AnalysisOptions; // Optional API configuration parameters 31 | } 32 | 33 | // Enhanced metadata interface for object detection responses 34 | export interface ObjectDetectionMetadata { 35 | model: string; // "gemini-2.5-flash-lite" 36 | provider: string; // "google" | "vertex_ai" 37 | usage?: { 38 | promptTokenCount: number; 39 | candidatesTokenCount: number; 40 | totalTokenCount: number; 41 | }; 42 | processingTime: number; // milliseconds 43 | fileType?: string; // "image/png" 44 | fileSize?: number; // bytes 45 | modelVersion?: string; // "gemini-2.5-flash-lite" 46 | responseId?: string; // "abc123..." 
47 | fileSaveStatus?: 'saved' | 'skipped_due_to_permissions'; // File save status 48 | } 49 | 50 | // MCP response types for different output scenarios 51 | export interface DetectionWithFile { 52 | detections: DetectedObject[]; 53 | file: { 54 | path: string; 55 | size_bytes: number; 56 | format: string; 57 | }; 58 | image_metadata: { 59 | width: number; 60 | height: number; 61 | original_size: number; 62 | }; 63 | summary: string; // Human-readable summary with percentage coordinates 64 | metadata: ObjectDetectionMetadata; // Enhanced metadata 65 | } 66 | 67 | export interface DetectionWithTempFile { 68 | detections: DetectedObject[]; 69 | tempFile: { 70 | path: string; 71 | size_bytes: number; 72 | format: string; 73 | }; 74 | image_metadata: { 75 | width: number; 76 | height: number; 77 | original_size: number; 78 | }; 79 | summary: string; // Human-readable summary with percentage coordinates 80 | metadata: ObjectDetectionMetadata; // Enhanced metadata 81 | } 82 | 83 | export interface DetectionOnly { 84 | detections: DetectedObject[]; 85 | image_metadata: { 86 | width: number; 87 | height: number; 88 | original_size: number; 89 | }; 90 | summary: string; // Human-readable summary with percentage coordinates 91 | metadata: ObjectDetectionMetadata; // Enhanced metadata 92 | } 93 | 94 | // Union type for all possible response types 95 | export type ObjectDetectionResponse = DetectionWithFile | DetectionWithTempFile | DetectionOnly; 96 | -------------------------------------------------------------------------------- /src/types/Analysis.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Analysis types for vision providers 3 | */ 4 | 5 | import { type FunctionName } from '../constants/FunctionNames.js'; 6 | 7 | export type TaskType = 'image' | 'video'; 8 | 9 | export interface AnalysisOptions { 10 | temperature?: number | undefined; 11 | topP?: number | undefined; 12 | topK?: number | undefined; 13 | maxTokens?: 
number | undefined; 14 | maxTokensForImage?: number | undefined; 15 | maxTokensForVideo?: number | undefined; 16 | stopSequences?: string[] | undefined; 17 | taskType?: TaskType; 18 | functionName?: FunctionName; 19 | responseSchema?: any; // Structured output schema for object detection 20 | systemInstruction?: string | undefined; // System instruction to guide model behavior 21 | } 22 | 23 | export interface AnalysisResult { 24 | text: string; 25 | metadata: AnalysisMetadata; 26 | } 27 | 28 | export interface AnalysisMetadata { 29 | model: string; 30 | provider: string; 31 | usage?: UsageMetadata; 32 | processingTime?: number; 33 | fileType?: string; 34 | fileSize?: number; 35 | modelVersion?: string; // "gemini-2.5-flash-lite" 36 | responseId?: string; // "abc123..." 37 | } 38 | 39 | export interface UsageMetadata { 40 | promptTokenCount: number; 41 | candidatesTokenCount: number; 42 | totalTokenCount: number; 43 | } 44 | 45 | export interface UploadedFile { 46 | id: string; 47 | filename: string; 48 | mimeType: string; 49 | size: number; 50 | url?: string; 51 | uri?: string; 52 | displayName?: string; 53 | state?: 'PROCESSING' | 'ACTIVE' | 'FAILED'; 54 | createTime?: string; 55 | updateTime?: string; 56 | expirationTime?: string; 57 | sha256Hash?: string; 58 | } 59 | 60 | export interface FileReference { 61 | type: 'file_uri' | 'public_url' | 'base64'; 62 | uri?: string; 63 | url?: string; 64 | data?: string; 65 | mimeType: string; 66 | } 67 | 68 | export interface HealthStatus { 69 | status: 'healthy' | 'unhealthy' | 'degraded'; 70 | message?: string; 71 | lastCheck: string; 72 | responseTime?: number; 73 | } 74 | 75 | export interface RateLimitInfo { 76 | requestsPerMinute?: number; 77 | requestsPerDay?: number; 78 | currentUsage?: { 79 | requestsPerMinute: number; 80 | requestsPerDay: number; 81 | }; 82 | resetTime?: string; 83 | } 84 | 85 | export interface ProviderCapabilities { 86 | supportedImageFormats: string[]; 87 | supportedVideoFormats: string[]; 88 
| maxImageSize: number; 89 | maxVideoSize: number; 90 | maxVideoDuration: number; 91 | supportsVideo: boolean; 92 | supportsFileUpload: boolean; 93 | } 94 | 95 | export interface ModelCapabilities { 96 | imageAnalysis: boolean; 97 | videoAnalysis: boolean; 98 | maxTokensForImage: number; 99 | maxTokensForVideo: number; 100 | supportedFormats: string[]; 101 | } 102 | 103 | export interface ProviderInfo { 104 | name: string; 105 | version: string; 106 | description: string; 107 | capabilities: ProviderCapabilities; 108 | modelCapabilities: ModelCapabilities; 109 | rateLimit?: RateLimitInfo; // Optional - rate limits vary by user tier/project 110 | } 111 | -------------------------------------------------------------------------------- /src/tools/analyze_image.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * MCP Tool: analyze_image 3 | * Analyzes an image using AI vision models. Supports URLs, base64 data, and local file paths. 4 | */ 5 | 6 | import type { AnalysisOptions, AnalysisResult } from '../types/Providers.js'; 7 | import type { VisionProvider } from '../types/Providers.js'; 8 | import { FileService } from '../services/FileService.js'; 9 | import type { Config } from '../types/Config.js'; 10 | import { VisionError } from '../types/Errors.js'; 11 | import { FUNCTION_NAMES } from '../constants/FunctionNames.js'; 12 | 13 | export interface AnalyzeImageArgs { 14 | imageSource: string; // Can be URL, base64 data, or local file path 15 | prompt: string; 16 | options?: AnalysisOptions; 17 | } 18 | 19 | export async function analyze_image( 20 | args: AnalyzeImageArgs, 21 | config: Config, 22 | imageProvider: VisionProvider, 23 | imageFileService: FileService 24 | ): Promise { 25 | try { 26 | // Validate arguments 27 | if (!args.imageSource) { 28 | throw new VisionError('imageSource is required', 'MISSING_ARGUMENT'); 29 | } 30 | if (!args.prompt) { 31 | throw new VisionError('prompt is required', 'MISSING_ARGUMENT'); 32 
| } 33 | 34 | // Handle image source (URL vs local file vs base64) 35 | const processedImageSource = await imageFileService.handleImageSource( 36 | args.imageSource 37 | ); 38 | console.error( 39 | `[analyze_image] Processed image source: ${processedImageSource.substring(0, 100)}${processedImageSource.length > 100 ? '...' : ''}` 40 | ); 41 | console.error(`[analyze_image] Original source: ${args.imageSource}`); 42 | console.error( 43 | `[analyze_image] Processed source starts with data:image: ${processedImageSource.startsWith('data:image/')}` 44 | ); 45 | 46 | // Merge default options with provided options 47 | const options: AnalysisOptions = { 48 | temperature: 49 | config.TEMPERATURE_FOR_ANALYZE_IMAGE ?? 50 | config.TEMPERATURE_FOR_IMAGE ?? 51 | config.TEMPERATURE, 52 | topP: 53 | config.TOP_P_FOR_ANALYZE_IMAGE ?? 54 | config.TOP_P_FOR_IMAGE ?? 55 | config.TOP_P, 56 | topK: 57 | config.TOP_K_FOR_ANALYZE_IMAGE ?? 58 | config.TOP_K_FOR_IMAGE ?? 59 | config.TOP_K, 60 | maxTokens: 61 | config.MAX_TOKENS_FOR_ANALYZE_IMAGE ?? 62 | config.MAX_TOKENS_FOR_IMAGE ?? 63 | config.MAX_TOKENS, 64 | taskType: 'image', 65 | functionName: FUNCTION_NAMES.ANALYZE_IMAGE, 66 | ...args.options, // User options override defaults 67 | }; 68 | 69 | // Analyze the image 70 | const result = await imageProvider.analyzeImage( 71 | processedImageSource, 72 | args.prompt, 73 | options 74 | ); 75 | 76 | return result; 77 | } catch (error) { 78 | console.error('Error in analyze_image tool:', error); 79 | 80 | if (error instanceof VisionError) { 81 | throw error; 82 | } 83 | 84 | throw new VisionError( 85 | `Failed to analyze image: ${error instanceof Error ? error.message : String(error)}`, 86 | 'ANALYSIS_ERROR', 87 | 'gemini', 88 | error instanceof Error ?
error : undefined 89 | ); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/utils/credentialsParser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Utility functions for parsing Google Cloud service account credentials 3 | */ 4 | 5 | import fs from 'fs'; 6 | import { ConfigurationError } from '../types/Errors.js'; 7 | 8 | export interface ServiceAccountCredentials { 9 | type: string; 10 | project_id: string; 11 | private_key_id: string; 12 | private_key: string; 13 | client_email: string; 14 | client_id: string; 15 | auth_uri: string; 16 | token_uri: string; 17 | auth_provider_x509_cert_url: string; 18 | client_x509_cert_url: string; 19 | universe_domain?: string; 20 | } 21 | 22 | /** 23 | * Parse Google Cloud service account credentials from a file path 24 | * @param credentialsPath - Path to the service account JSON file 25 | * @returns Parsed service account credentials 26 | */ 27 | export function parseServiceAccountCredentials( 28 | credentialsPath: string 29 | ): ServiceAccountCredentials { 30 | try { 31 | // Check if file exists 32 | if (!fs.existsSync(credentialsPath)) { 33 | throw new ConfigurationError( 34 | `Service account credentials file not found: ${credentialsPath}`, 35 | 'VERTEX_CREDENTIALS' 36 | ); 37 | } 38 | 39 | // Read and parse the JSON file 40 | const fileContent = fs.readFileSync(credentialsPath, 'utf-8'); 41 | const credentials = JSON.parse(fileContent) as ServiceAccountCredentials; 42 | 43 | // Validate required fields 44 | if (!credentials.project_id) { 45 | throw new ConfigurationError( 46 | 'Service account credentials file is missing "project_id" field', 47 | 'VERTEX_CREDENTIALS' 48 | ); 49 | } 50 | 51 | if (!credentials.private_key) { 52 | throw new ConfigurationError( 53 | 'Service account credentials file is missing "private_key" field', 54 | 'VERTEX_CREDENTIALS' 55 | ); 56 | } 57 | 58 | if (!credentials.client_email) { 59 | 
throw new ConfigurationError( 60 | 'Service account credentials file is missing "client_email" field', 61 | 'VERTEX_CREDENTIALS' 62 | ); 63 | } 64 | 65 | return credentials; 66 | } catch (error) { 67 | if (error instanceof ConfigurationError) { 68 | throw error; 69 | } 70 | 71 | if (error instanceof SyntaxError) { 72 | throw new ConfigurationError( 73 | `Invalid JSON in service account credentials file: ${credentialsPath}`, 74 | 'VERTEX_CREDENTIALS' 75 | ); 76 | } 77 | 78 | throw new ConfigurationError( 79 | `Failed to parse service account credentials: ${error instanceof Error ? error.message : String(error)}`, 80 | 'VERTEX_CREDENTIALS' 81 | ); 82 | } 83 | } 84 | 85 | /** 86 | * Extract project ID from service account credentials file 87 | * @param credentialsPath - Path to the service account JSON file 88 | * @returns Project ID from the credentials 89 | */ 90 | export function extractProjectIdFromCredentials( 91 | credentialsPath: string 92 | ): string { 93 | const credentials = parseServiceAccountCredentials(credentialsPath); 94 | return credentials.project_id; 95 | } 96 | 97 | /** 98 | * Validate service account credentials file format 99 | * @param credentialsPath - Path to the service account JSON file 100 | * @returns true if valid, throws error otherwise 101 | */ 102 | export function validateServiceAccountCredentials( 103 | credentialsPath: string 104 | ): boolean { 105 | parseServiceAccountCredentials(credentialsPath); 106 | return true; 107 | } 108 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | #=============================================== 2 | # PROVIDER SELECTION 3 | #=============================================== 4 | IMAGE_PROVIDER=google # or vertex_ai 5 | VIDEO_PROVIDER=google # or vertex_ai 6 | 7 | #=============================================== 8 | # MODEL SELECTION 9 | 
#=============================================== 10 | # IMAGE_MODEL=gemini-2.5-flash-lite 11 | # VIDEO_MODEL=gemini-2.5-flash-pro 12 | 13 | #=============================================== 14 | # FUNCTION-SPECIFIC MODEL SELECTION 15 | #=============================================== 16 | # ANALYZE_IMAGE_MODEL= 17 | # COMPARE_IMAGES_MODEL= 18 | # DETECT_OBJECTS_IN_IMAGE_MODEL= 19 | # ANALYZE_VIDEO_MODEL= 20 | 21 | #=============================================== 22 | # GEMINI API CONFIGURATION (AI Studio) 23 | #=============================================== 24 | GEMINI_API_KEY=your_gemini_api_key_here 25 | # GEMINI_BASE_URL=https://generativelanguage.googleapis.com 26 | 27 | #=============================================== 28 | # VERTEX AI CONFIGURATION 29 | #=============================================== 30 | VERTEX_CREDENTIALS=path/to/service-account.json 31 | # The following are optional and auto-derived from VERTEX_CREDENTIALS: 32 | # VERTEX_PROJECT_ID=your-gcp-project-id 33 | # VERTEX_LOCATION=us-central1 34 | # VERTEX_ENDPOINT=https://aiplatform.googleapis.com 35 | 36 | #=============================================== 37 | # GOOGLE CLOUD STORAGE CONFIGURATION (Required for Vertex AI) 38 | #=============================================== 39 | GCS_BUCKET_NAME=your-vision-files-bucket 40 | # The following are optional and auto-derived from VERTEX_CREDENTIALS: 41 | # GCS_PROJECT_ID - Auto-derived from VERTEX_CREDENTIALS 42 | # GCS_CREDENTIALS - Defaults to VERTEX_CREDENTIALS 43 | # GCS_REGION - Defaults to VERTEX_LOCATION 44 | 45 | #=============================================== 46 | # UNIVERSAL API PARAMETERS 47 | #=============================================== 48 | TEMPERATURE=0.8 49 | TOP_P=0.95 50 | TOP_K=30 51 | MAX_TOKENS=16384 52 | 53 | #=============================================== 54 | # TASK-SPECIFIC API PARAMETERS 55 | #=============================================== 56 | # TEMPERATURE_FOR_IMAGE= 57 | # TOP_P_FOR_IMAGE= 58 | # TOP_K_FOR_IMAGE= 59 
| # MAX_TOKENS_FOR_IMAGE= 60 | # TEMPERATURE_FOR_VIDEO= 61 | # TOP_P_FOR_VIDEO= 62 | # TOP_K_FOR_VIDEO= 63 | # MAX_TOKENS_FOR_VIDEO= 64 | 65 | #=============================================== 66 | # FUNCTION-SPECIFIC API PARAMETERS 67 | #=============================================== 68 | # TEMPERATURE_FOR_ANALYZE_IMAGE= 69 | # TOP_P_FOR_ANALYZE_IMAGE= 70 | # TOP_K_FOR_ANALYZE_IMAGE= 71 | # MAX_TOKENS_FOR_ANALYZE_IMAGE= 72 | # TEMPERATURE_FOR_COMPARE_IMAGES= 73 | # TOP_P_FOR_COMPARE_IMAGES= 74 | # TOP_K_FOR_COMPARE_IMAGES= 75 | # MAX_TOKENS_FOR_COMPARE_IMAGES= 76 | # TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE= 77 | # TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE= 78 | # TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE= 79 | # MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE= 80 | # TEMPERATURE_FOR_ANALYZE_VIDEO= 81 | # TOP_P_FOR_ANALYZE_VIDEO= 82 | # TOP_K_FOR_ANALYZE_VIDEO= 83 | # MAX_TOKENS_FOR_ANALYZE_VIDEO= 84 | 85 | #=============================================== 86 | # FILE PROCESSING CONFIGURATION 87 | #=============================================== 88 | MAX_IMAGE_SIZE=20971520 89 | MAX_VIDEO_SIZE=2147483648 90 | ALLOWED_IMAGE_FORMATS=png,jpg,jpeg,webp,gif,bmp,tiff 91 | ALLOWED_VIDEO_FORMATS=mp4,mov,avi,mkv,webm,flv,wmv,3gp 92 | MAX_VIDEO_DURATION=3600 93 | 94 | #=============================================== 95 | # FILE UPLOAD CONFIGURATION 96 | #=============================================== 97 | GEMINI_FILES_API_THRESHOLD=10485760 98 | VERTEX_AI_FILES_API_THRESHOLD=0 99 | 100 | #=============================================== 101 | # LOGGING CONFIGURATION 102 | #=============================================== 103 | LOG_LEVEL=info 104 | 105 | #=============================================== 106 | # DEVELOPMENT CONFIGURATION 107 | #=============================================== 108 | NODE_ENV=production # or 'development' -------------------------------------------------------------------------------- /src/tools/compare_images.ts: 
-------------------------------------------------------------------------------- 1 | /** 2 | * MCP Tool: compare_images 3 | * Compares multiple images using AI vision models. Supports URLs, base64 data, and local file paths. 4 | */ 5 | 6 | import type { AnalysisOptions, AnalysisResult } from '../types/Providers.js'; 7 | import type { VisionProvider } from '../types/Providers.js'; 8 | import { FileService } from '../services/FileService.js'; 9 | import type { Config } from '../types/Config.js'; 10 | import { VisionError } from '../types/Errors.js'; 11 | import { FUNCTION_NAMES } from '../constants/FunctionNames.js'; 12 | 13 | export interface CompareImagesArgs { 14 | imageSources: string[]; // Array of image sources (URLs/base64/file paths) 15 | prompt: string; 16 | options?: AnalysisOptions; 17 | } 18 | 19 | export async function compare_images( 20 | args: CompareImagesArgs, 21 | config: Config, 22 | imageProvider: VisionProvider, 23 | imageFileService: FileService 24 | ): Promise { 25 | try { 26 | // Validate arguments 27 | if (!args.imageSources || !Array.isArray(args.imageSources)) { 28 | throw new VisionError( 29 | 'imageSources must be an array', 30 | 'MISSING_ARGUMENT' 31 | ); 32 | } 33 | if (!args.prompt) { 34 | throw new VisionError('prompt is required', 'MISSING_ARGUMENT'); 35 | } 36 | 37 | // Validate image count 38 | const maxImages = config.MAX_IMAGES_FOR_COMPARISON || 4; 39 | if (args.imageSources.length < 2) { 40 | throw new VisionError( 41 | 'At least 2 images are required for comparison', 42 | 'INVALID_ARGUMENT' 43 | ); 44 | } 45 | if (args.imageSources.length > maxImages) { 46 | throw new VisionError( 47 | `Maximum ${maxImages} images allowed for comparison, received ${args.imageSources.length}`, 48 | 'INVALID_ARGUMENT' 49 | ); 50 | } 51 | 52 | // Validate each image source 53 | for (let i = 0; i < args.imageSources.length; i++) { 54 | if (!args.imageSources[i] || typeof args.imageSources[i] !== 'string') { 55 | throw new VisionError( 56 | `Image 
source at index ${i} is invalid`, 57 | 'INVALID_ARGUMENT' 58 | ); 59 | } 60 | } 61 | 62 | console.error( 63 | `[compare_images] Processing ${args.imageSources.length} images for comparison` 64 | ); 65 | 66 | // Process all image sources 67 | const processedImageSources = await Promise.all( 68 | args.imageSources.map(async (imageSource, index) => { 69 | console.error( 70 | `[compare_images] Processing image ${index + 1}: ${imageSource.substring(0, 100)}${imageSource.length > 100 ? '...' : ''}` 71 | ); 72 | return await imageFileService.handleImageSource(imageSource); 73 | }) 74 | ); 75 | 76 | console.error( 77 | `[compare_images] All ${processedImageSources.length} images processed successfully` 78 | ); 79 | 80 | // Merge default options with provided options 81 | const options: AnalysisOptions = { 82 | temperature: 83 | config.TEMPERATURE_FOR_COMPARE_IMAGES ?? 84 | config.TEMPERATURE_FOR_IMAGE ?? 85 | config.TEMPERATURE, 86 | topP: 87 | config.TOP_P_FOR_COMPARE_IMAGES ?? 88 | config.TOP_P_FOR_IMAGE ?? 89 | config.TOP_P, 90 | topK: 91 | config.TOP_K_FOR_COMPARE_IMAGES ?? 92 | config.TOP_K_FOR_IMAGE ?? 93 | config.TOP_K, 94 | maxTokens: 95 | config.MAX_TOKENS_FOR_COMPARE_IMAGES ?? 96 | config.MAX_TOKENS_FOR_IMAGE ?? 97 | config.MAX_TOKENS, 98 | taskType: 'image', 99 | functionName: FUNCTION_NAMES.COMPARE_IMAGES, 100 | ...args.options, // User options override defaults 101 | }; 102 | 103 | // Call the provider's comparison method 104 | const result = await imageProvider.compareImages( 105 | processedImageSources, 106 | args.prompt, 107 | options 108 | ); 109 | 110 | return result; 111 | } catch (error) { 112 | console.error('Error in compare_images tool:', error); 113 | 114 | if (error instanceof VisionError) { 115 | throw error; 116 | } 117 | 118 | throw new VisionError( 119 | `Failed to compare images: ${error instanceof Error ? error.message : String(error)}`, 120 | 'ANALYSIS_ERROR', 121 | 'gemini', 122 | error instanceof Error ?
error : undefined 123 | ); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /docs/llm_logs/vertex-ai-auth-fix-plan.md: -------------------------------------------------------------------------------- 1 | # Plan: Fix Vertex AI Authentication Issue 2 | 3 | ## Problem Description 4 | 5 | The VertexAI provider is failing with `aiplatform.endpoints.predict` permission denied errors because the GoogleGenAI client is not being initialized with proper authentication credentials. 6 | 7 | ## Root Cause Analysis 8 | 9 | 1. **Missing Authentication**: The `VertexAIProvider.ts` constructor initializes the GoogleGenAI client without any authentication configuration 10 | 2. **Credentials Available but Unused**: The `VertexAIConfig` includes a `credentials` field from `VERTEX_CREDENTIALS` environment variable, but it's not passed to the GoogleGenAI client 11 | 3. **No GoogleAuthOptions**: The client config lacks the required `googleAuthOptions` parameter 12 | 13 | ## Current Error Flow 14 | 15 | ``` 16 | Environment Variables → ConfigService → VertexAIConfig → VertexAIProvider Constructor 17 | ↓ 18 | VERTEX_CREDENTIALS ✓ → credentials: "path/to/file.json" → NOT USED ✗ 19 | ↓ 20 | GoogleGenAI client with NO AUTH → 403 Permission Denied 21 | ``` 22 | 23 | ## Solution Options Analysis 24 | 25 | ### Option 1: Environment Variable (Simple) 26 | - ✅ Quick implementation 27 | - ❌ Modifies global process environment 28 | - ❌ Not ideal for multiple concurrent instances 29 | 30 | ### Option 2: GoogleAuthOptions (Recommended) ⭐ 31 | - ✅ Explicit authentication configuration 32 | - ✅ Clean separation of concerns 33 | - ✅ Supports both file paths and credential objects 34 | - ✅ No global environment modification 35 | 36 | ### Option 3: Credential Object Parsing (Complex) 37 | - ✅ Most flexible 38 | - ❌ Higher complexity 39 | - ❌ Requires file system operations 40 | 41 | ## Selected Solution: Option 2 - GoogleAuthOptions 42 | 43 | ### 
Implementation Steps 44 | 45 | 1. **Modify VertexAI Provider Constructor**: 46 | - Add authentication logic before GoogleGenAI client initialization 47 | - Check if `config.credentials` is provided 48 | - Add `googleAuthOptions` to client configuration 49 | 50 | 2. **Support Multiple Credential Types**: 51 | - File path (most common from README examples) 52 | - JSON string (future flexibility) 53 | 54 | 3. **Maintain Backward Compatibility**: 55 | - Keep existing environment variable support 56 | - No breaking changes to public API 57 | 58 | ### Implementation Details 59 | 60 | ```typescript 61 | // Before (current - broken) 62 | const clientConfig: any = { 63 | vertexai: true, 64 | project: config.projectId, 65 | location: config.location, 66 | }; 67 | 68 | // After (fixed) 69 | const clientConfig: any = { 70 | vertexai: true, 71 | project: config.projectId, 72 | location: config.location, 73 | }; 74 | 75 | // Add authentication if credentials are provided 76 | if (config.credentials) { 77 | clientConfig.googleAuthOptions = { 78 | keyFile: config.credentials 79 | }; 80 | } 81 | ``` 82 | 83 | ### Environment Variables Respected 84 | 85 | From README.md requirements: 86 | - ✅ `VERTEX_CREDENTIALS` - Path to service account JSON file 87 | - ✅ `VERTEX_PROJECT_ID` - Auto-derived from credentials or explicit 88 | - ✅ `VERTEX_LOCATION` - Defaults to 'us-central1' 89 | - ✅ `VERTEX_ENDPOINT` - Defaults to 'https://aiplatform.googleapis.com' 90 | 91 | ### Testing Strategy 92 | 93 | 1. **Verify Current Error**: Confirm 403 permission denied 94 | 2. **Apply Fix**: Implement GoogleAuthOptions 95 | 3. **Test Authentication**: Verify successful API calls 96 | 4. 
**Test Edge Cases**: Missing credentials, invalid paths 97 | 98 | ### Risk Assessment 99 | 100 | - **Risk Level**: Low 101 | - **Rollback**: Simple revert if issues arise 102 | - **Breaking Changes**: None 103 | - **Dependencies**: No new dependencies required 104 | 105 | ### Expected Outcome 106 | 107 | After implementation: 108 | ``` 109 | Environment Variables → ConfigService → VertexAIConfig → VertexAIProvider Constructor 110 | ↓ 111 | VERTEX_CREDENTIALS ✓ → credentials: "path/to/file.json" → googleAuthOptions ✓ 112 | ↓ 113 | GoogleGenAI client with AUTH → ✅ Success 114 | ``` 115 | 116 | ## Recommendation: Option 2 117 | 118 | **Rationale**: Provides the best balance of implementation simplicity, maintainability, and explicit configuration while respecting all environment variables outlined in the README. -------------------------------------------------------------------------------- /src/types/Config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Configuration types for the Vision MCP Server 3 | */ 4 | 5 | export interface Config { 6 | // Provider selection 7 | IMAGE_PROVIDER: 'google' | 'vertex_ai'; 8 | VIDEO_PROVIDER: 'google' | 'vertex_ai'; 9 | 10 | // Gemini API configuration 11 | GEMINI_API_KEY?: string | undefined; 12 | GEMINI_BASE_URL?: string | undefined; 13 | 14 | // Vertex AI configuration 15 | VERTEX_CREDENTIALS?: string; 16 | VERTEX_PROJECT_ID?: string; 17 | VERTEX_LOCATION?: string; 18 | VERTEX_ENDPOINT?: string; 19 | 20 | // Model configuration 21 | IMAGE_MODEL?: string; 22 | VIDEO_MODEL?: string; 23 | 24 | // Function-specific model configuration 25 | ANALYZE_IMAGE_MODEL?: string; 26 | COMPARE_IMAGES_MODEL?: string; 27 | DETECT_OBJECTS_IN_IMAGE_MODEL?: string; 28 | ANALYZE_VIDEO_MODEL?: string; 29 | 30 | // Google Cloud Storage configuration (for Vertex AI file storage) 31 | GCS_BUCKET_NAME?: string; 32 | GCS_PROJECT_ID?: string; // Auto-derived from VERTEX_CREDENTIALS if not provided 33 | 
GCS_CREDENTIALS?: string; // Optional: defaults to VERTEX_CREDENTIALS 34 | GCS_REGION?: string; // Optional: defaults to VERTEX_LOCATION 35 | 36 | // Universal API parameters 37 | TEMPERATURE?: number; 38 | TOP_P?: number; 39 | TOP_K?: number; 40 | MAX_TOKENS?: number; 41 | 42 | // Task-specific API parameters 43 | TEMPERATURE_FOR_IMAGE?: number; 44 | TOP_P_FOR_IMAGE?: number; 45 | TOP_K_FOR_IMAGE?: number; 46 | MAX_TOKENS_FOR_IMAGE?: number; 47 | TEMPERATURE_FOR_VIDEO?: number; 48 | TOP_P_FOR_VIDEO?: number; 49 | TOP_K_FOR_VIDEO?: number; 50 | MAX_TOKENS_FOR_VIDEO?: number; 51 | 52 | // Function-specific API parameters 53 | TEMPERATURE_FOR_ANALYZE_IMAGE?: number; 54 | TOP_P_FOR_ANALYZE_IMAGE?: number; 55 | TOP_K_FOR_ANALYZE_IMAGE?: number; 56 | MAX_TOKENS_FOR_ANALYZE_IMAGE?: number; 57 | TEMPERATURE_FOR_COMPARE_IMAGES?: number; 58 | TOP_P_FOR_COMPARE_IMAGES?: number; 59 | TOP_K_FOR_COMPARE_IMAGES?: number; 60 | MAX_TOKENS_FOR_COMPARE_IMAGES?: number; 61 | TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 62 | TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 63 | TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 64 | MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 65 | TEMPERATURE_FOR_ANALYZE_VIDEO?: number; 66 | TOP_P_FOR_ANALYZE_VIDEO?: number; 67 | TOP_K_FOR_ANALYZE_VIDEO?: number; 68 | MAX_TOKENS_FOR_ANALYZE_VIDEO?: number; 69 | 70 | // File processing configuration 71 | MAX_IMAGE_SIZE?: number; 72 | MAX_VIDEO_SIZE?: number; 73 | ALLOWED_IMAGE_FORMATS?: string[]; 74 | ALLOWED_VIDEO_FORMATS?: string[]; 75 | MAX_VIDEO_DURATION?: number; 76 | MAX_IMAGES_FOR_COMPARISON?: number; 77 | 78 | // File upload configuration 79 | GEMINI_FILES_API_THRESHOLD?: number; 80 | VERTEX_AI_FILES_API_THRESHOLD?: number; 81 | 82 | // Logging configuration 83 | LOG_LEVEL?: 'info' | 'debug' | 'warn' | 'error'; 84 | 85 | // Development configuration 86 | NODE_ENV?: 'development' | 'production'; 87 | } 88 | 89 | export interface GeminiConfig { 90 | apiKey: string; 91 | baseUrl: string; 92 | 
imageModel: string; 93 | videoModel: string; 94 | } 95 | 96 | export interface VertexAIConfig { 97 | projectId: string; 98 | location: string; 99 | endpoint: string; 100 | credentials?: string; 101 | imageModel: string; 102 | videoModel: string; 103 | } 104 | 105 | export interface GCSConfig { 106 | bucketName: string; 107 | projectId: string; 108 | credentials: string; 109 | region: string; 110 | } 111 | 112 | export interface FileUploadConfig { 113 | useProviderFilesApi: boolean; 114 | geminiFilesApiThreshold: number; 115 | vertexAIFilesApiThreshold: number; 116 | } 117 | 118 | export interface ApiConfig { 119 | temperature: number; 120 | topP: number; 121 | topK: number; 122 | maxTokens: number; 123 | maxTokensForImage: number; 124 | maxTokensForVideo: number; 125 | temperatureForImage?: number; 126 | topPForImage?: number; 127 | topKForImage?: number; 128 | temperatureForVideo?: number; 129 | topPForVideo?: number; 130 | topKForVideo?: number; 131 | temperatureForAnalyzeImage?: number; 132 | topPForAnalyzeImage?: number; 133 | topKForAnalyzeImage?: number; 134 | maxTokensForAnalyzeImage?: number; 135 | temperatureForCompareImages?: number; 136 | topPForCompareImages?: number; 137 | topKForCompareImages?: number; 138 | maxTokensForCompareImages?: number; 139 | temperatureForDetectObjectsInImage?: number; 140 | topPForDetectObjectsInImage?: number; 141 | topKForDetectObjectsInImage?: number; 142 | maxTokensForDetectObjectsInImage?: number; 143 | temperatureForAnalyzeVideo?: number; 144 | topPForAnalyzeVideo?: number; 145 | topKForAnalyzeVideo?: number; 146 | maxTokensForAnalyzeVideo?: number; 147 | // Model configuration 148 | analyzeImageModel?: string; 149 | compareImagesModel?: string; 150 | detectObjectsInImageModel?: string; 151 | analyzeVideoModel?: string; 152 | } 153 | 154 | export interface FileProcessingConfig { 155 | maxImageSize: number; 156 | maxVideoSize: number; 157 | allowedImageFormats: string[]; 158 | allowedVideoFormats: string[]; 159 | 
maxVideoDuration: number; 160 | maxImagesForComparison: number; 161 | } 162 | 163 | export interface LoggingConfig { 164 | logLevel: 'info' | 'debug' | 'warn' | 'error'; 165 | } 166 | 167 | export interface DevelopmentConfig { 168 | nodeEnv: 'development' | 'production'; 169 | } 170 | -------------------------------------------------------------------------------- /src/types/Errors.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Error types for the Vision MCP Server 3 | */ 4 | 5 | export class VisionError extends Error { 6 | constructor( 7 | message: string, 8 | public code: string, 9 | public provider?: string, 10 | public originalError?: Error, 11 | public statusCode?: number 12 | ) { 13 | super(message); 14 | this.name = 'VisionError'; 15 | 16 | // Maintains proper stack trace for where our error was thrown (only available on V8) 17 | if (Error.captureStackTrace) { 18 | Error.captureStackTrace(this, VisionError); 19 | } 20 | } 21 | } 22 | 23 | export class ConfigurationError extends VisionError { 24 | constructor(message: string, variable?: string) { 25 | super(message, 'CONFIG_ERROR', undefined, undefined, 400); 26 | this.name = 'ConfigurationError'; 27 | this.variable = variable; 28 | } 29 | 30 | public variable?: string; 31 | } 32 | 33 | export class ProviderError extends VisionError { 34 | constructor( 35 | message: string, 36 | provider: string, 37 | originalError?: Error, 38 | statusCode?: number 39 | ) { 40 | super(message, 'PROVIDER_ERROR', provider, originalError, statusCode); 41 | this.name = 'ProviderError'; 42 | } 43 | } 44 | 45 | export class FileUploadError extends VisionError { 46 | constructor( 47 | message: string, 48 | provider?: string, 49 | originalError?: Error, 50 | statusCode?: number 51 | ) { 52 | super(message, 'FILE_UPLOAD_ERROR', provider, originalError, statusCode); 53 | this.name = 'FileUploadError'; 54 | } 55 | } 56 | 57 | export class FileNotFoundError extends VisionError { 58 | 
constructor(fileId: string, provider?: string) { 59 | super( 60 | `File not found: ${fileId}`, 61 | 'FILE_NOT_FOUND', 62 | provider, 63 | undefined, 64 | 404 65 | ); 66 | this.name = 'FileNotFoundError'; 67 | this.fileId = fileId; 68 | } 69 | 70 | public fileId: string; 71 | } 72 | 73 | export class UnsupportedFileTypeError extends VisionError { 74 | constructor(mimeType: string, supportedTypes?: string[]) { 75 | const message = supportedTypes 76 | ? `Unsupported file type: ${mimeType}. Supported types: ${supportedTypes.join(', ')}` 77 | : `Unsupported file type: ${mimeType}`; 78 | super(message, 'UNSUPPORTED_FILE_TYPE', undefined, undefined, 400); 79 | this.name = 'UnsupportedFileTypeError'; 80 | this.mimeType = mimeType; 81 | this.supportedTypes = supportedTypes; 82 | } 83 | 84 | public mimeType: string; 85 | public supportedTypes?: string[]; 86 | } 87 | 88 | export class FileSizeExceededError extends VisionError { 89 | constructor(fileSize: number, maxSize: number) { 90 | const message = `File size ${fileSize} bytes exceeds maximum allowed size ${maxSize} bytes`; 91 | super(message, 'FILE_SIZE_EXCEEDED', undefined, undefined, 400); 92 | this.name = 'FileSizeExceededError'; 93 | this.fileSize = fileSize; 94 | this.maxSize = maxSize; 95 | } 96 | 97 | public fileSize: number; 98 | public maxSize: number; 99 | } 100 | 101 | export class RateLimitExceededError extends VisionError { 102 | constructor(message: string, provider?: string, retryAfter?: number) { 103 | super(message, 'RATE_LIMIT_EXCEEDED', provider, undefined, 429); 104 | this.name = 'RateLimitExceededError'; 105 | this.retryAfter = retryAfter; 106 | } 107 | 108 | public retryAfter?: number; 109 | } 110 | 111 | export class AuthenticationError extends VisionError { 112 | constructor(message: string, provider?: string) { 113 | super(message, 'AUTHENTICATION_ERROR', provider, undefined, 401); 114 | this.name = 'AuthenticationError'; 115 | } 116 | } 117 | 118 | export class AuthorizationError extends 
VisionError { 119 | constructor(message: string, provider?: string) { 120 | super(message, 'AUTHORIZATION_ERROR', provider, undefined, 403); 121 | this.name = 'AuthorizationError'; 122 | } 123 | } 124 | 125 | export class NetworkError extends VisionError { 126 | constructor(message: string, originalError?: Error) { 127 | super(message, 'NETWORK_ERROR', undefined, originalError); 128 | this.name = 'NetworkError'; 129 | } 130 | } 131 | 132 | export class ValidationError extends VisionError { 133 | constructor(message: string, field?: string) { 134 | super(message, 'VALIDATION_ERROR', undefined, undefined, 400); 135 | this.name = 'ValidationError'; 136 | this.field = field; 137 | } 138 | 139 | public field?: string; 140 | } 141 | 142 | export class StorageError extends VisionError { 143 | constructor( 144 | message: string, 145 | storageType?: string, 146 | originalError?: Error, 147 | statusCode?: number 148 | ) { 149 | super(message, 'STORAGE_ERROR', storageType, originalError, statusCode); 150 | this.name = 'StorageError'; 151 | } 152 | } 153 | 154 | export type ErrorType = 155 | | 'CONFIG_ERROR' 156 | | 'PROVIDER_ERROR' 157 | | 'FILE_UPLOAD_ERROR' 158 | | 'FILE_NOT_FOUND' 159 | | 'UNSUPPORTED_FILE_TYPE' 160 | | 'FILE_SIZE_EXCEEDED' 161 | | 'RATE_LIMIT_EXCEEDED' 162 | | 'AUTHENTICATION_ERROR' 163 | | 'AUTHORIZATION_ERROR' 164 | | 'NETWORK_ERROR' 165 | | 'VALIDATION_ERROR' 166 | | 'STORAGE_ERROR'; 167 | 168 | export interface ErrorDetails { 169 | code: ErrorType; 170 | message: string; 171 | provider?: string; 172 | statusCode?: number; 173 | originalError?: string; 174 | timestamp: string; 175 | requestId?: string; 176 | } 177 | -------------------------------------------------------------------------------- /src/providers/factory/ProviderFactory.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Provider factory for creating and managing vision providers 3 | */ 4 | 5 | import type { VisionProvider } from 
'../../types/Providers.js'; 6 | import type { Config } from '../../types/Config.js'; 7 | import { GeminiProvider } from '../gemini/GeminiProvider.js'; 8 | import { VertexAIProvider } from '../vertexai/VertexAIProvider.js'; 9 | import { ConfigurationError, ProviderError } from '../../types/Errors.js'; 10 | import { ConfigService } from '../../services/ConfigService.js'; 11 | 12 | export class VisionProviderFactory { 13 | private static providers = new Map VisionProvider>(); 14 | 15 | /** 16 | * Register a new provider with the factory 17 | */ 18 | static registerProvider(name: string, factory: () => VisionProvider): void { 19 | this.providers.set(name, factory); 20 | } 21 | 22 | /** 23 | * Create a provider instance based on configuration 24 | */ 25 | static createProvider( 26 | config: Config, 27 | type: 'image' | 'video' 28 | ): VisionProvider { 29 | const providerName = 30 | (config as any)[`${type.toUpperCase()}_PROVIDER`] || 'google'; 31 | const factory = this.providers.get(providerName); 32 | 33 | if (!factory) { 34 | throw new ConfigurationError(`Unsupported provider: ${providerName}`); 35 | } 36 | 37 | try { 38 | const provider = factory(); 39 | 40 | // Set default models if not configured 41 | const defaultModels = this.getDefaultModels(providerName); 42 | provider.setModel( 43 | config.IMAGE_MODEL || defaultModels.image, 44 | config.VIDEO_MODEL || defaultModels.video 45 | ); 46 | 47 | return provider; 48 | } catch (error) { 49 | throw new ProviderError( 50 | `Failed to create ${providerName} provider: ${error instanceof Error ? error.message : String(error)}`, 51 | providerName, 52 | error instanceof Error ? 
error : new Error(String(error)) 53 | ); 54 | } 55 | } 56 | 57 | /** 58 | * Get list of supported providers 59 | */ 60 | static getSupportedProviders(): string[] { 61 | return Array.from(this.providers.keys()); 62 | } 63 | 64 | /** 65 | * Check if a provider is supported 66 | */ 67 | static isProviderSupported(providerName: string): boolean { 68 | return this.providers.has(providerName); 69 | } 70 | 71 | /** 72 | * Get provider-specific configuration validation rules 73 | */ 74 | static getProviderConfigRequirements(providerName: string): string[] { 75 | switch (providerName) { 76 | case 'google': 77 | return ['GEMINI_API_KEY']; 78 | 79 | case 'vertex_ai': 80 | return ['VERTEX_CREDENTIALS', 'VERTEX_PROJECT_ID', 'VERTEX_LOCATION']; 81 | 82 | default: 83 | return []; 84 | } 85 | } 86 | 87 | /** 88 | * Validate provider configuration 89 | */ 90 | static validateProviderConfig(config: Config, providerName: string): void { 91 | const requirements = this.getProviderConfigRequirements(providerName); 92 | const missing = requirements.filter(req => { 93 | const value = config[req as keyof Config]; 94 | return !value || (typeof value === 'string' && value.trim() === ''); 95 | }); 96 | 97 | if (missing.length > 0) { 98 | throw new ConfigurationError( 99 | `Missing required configuration for ${providerName}: ${missing.join(', ')}` 100 | ); 101 | } 102 | } 103 | 104 | /** 105 | * Get default models for each provider 106 | */ 107 | private static getDefaultModels(providerName: string): { 108 | image: string; 109 | video: string; 110 | } { 111 | const config = ConfigService.getInstance().getConfig(); 112 | 113 | // Resolution priority: 114 | // 1. IMAGE_MODEL/VIDEO_MODEL (if set) - User's explicit choice 115 | // 2. 
Hardcoded defaults - Last resort 116 | return { 117 | image: config.IMAGE_MODEL || 'gemini-2.5-flash-lite', 118 | video: config.VIDEO_MODEL || 'gemini-2.5-flash', 119 | }; 120 | } 121 | 122 | /** 123 | * Initialize default providers 124 | */ 125 | static initializeDefaultProviders(): void { 126 | // Register Gemini API provider 127 | this.registerProvider('google', () => { 128 | const geminiConfig = ConfigService.getInstance().getGeminiConfig(); 129 | return new GeminiProvider(geminiConfig); 130 | }); 131 | 132 | // Register Vertex AI provider 133 | this.registerProvider('vertex_ai', () => { 134 | const vertexConfig = ConfigService.getInstance().getVertexAIConfig(); 135 | return new VertexAIProvider(vertexConfig); 136 | }); 137 | } 138 | 139 | /** 140 | * Create provider with configuration validation 141 | */ 142 | static createProviderWithValidation( 143 | config: Config, 144 | type: 'image' | 'video' 145 | ): VisionProvider { 146 | const providerName = 147 | (config as any)[`${type.toUpperCase()}_PROVIDER`] || 'google'; 148 | 149 | // Validate configuration before creating provider 150 | this.validateProviderConfig(config, providerName); 151 | 152 | // Create the provider through factory (which now properly initializes with config) 153 | const factory = this.providers.get(providerName); 154 | if (!factory) { 155 | throw new ConfigurationError(`Unsupported provider: ${providerName}`); 156 | } 157 | 158 | try { 159 | const provider = factory(); 160 | 161 | // Set default models if not configured 162 | const defaultModels = this.getDefaultModels(providerName); 163 | provider.setModel( 164 | config.IMAGE_MODEL || defaultModels.image, 165 | config.VIDEO_MODEL || defaultModels.video 166 | ); 167 | 168 | return provider; 169 | } catch (error) { 170 | throw new ProviderError( 171 | `Failed to create ${providerName} provider: ${error instanceof Error ? error.message : String(error)}`, 172 | providerName, 173 | error instanceof Error ? 
error : new Error(String(error)) 174 | ); 175 | } 176 | } 177 | } 178 | 179 | // Initialize default providers when module is loaded 180 | VisionProviderFactory.initializeDefaultProviders(); 181 | -------------------------------------------------------------------------------- /src/types/Providers.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Provider interface and types 3 | */ 4 | 5 | import type { 6 | AnalysisOptions, 7 | AnalysisResult, 8 | UploadedFile, 9 | FileReference, 10 | HealthStatus, 11 | ProviderCapabilities, 12 | ModelCapabilities, 13 | ProviderInfo, 14 | } from './Analysis.js'; 15 | 16 | export type { 17 | AnalysisOptions, 18 | AnalysisResult, 19 | UploadedFile, 20 | FileReference, 21 | HealthStatus, 22 | ProviderCapabilities, 23 | ModelCapabilities, 24 | ProviderInfo, 25 | } from './Analysis.js'; 26 | 27 | export interface VisionProvider { 28 | // Core capabilities 29 | analyzeImage( 30 | imageSource: string, 31 | prompt: string, 32 | options?: AnalysisOptions 33 | ): Promise; 34 | analyzeVideo( 35 | videoSource: string, 36 | prompt: string, 37 | options?: AnalysisOptions 38 | ): Promise; 39 | compareImages( 40 | imageSources: string[], 41 | prompt: string, 42 | options?: AnalysisOptions 43 | ): Promise; 44 | 45 | // File operations 46 | uploadFile( 47 | buffer: Buffer, 48 | filename: string, 49 | mimeType: string 50 | ): Promise; 51 | downloadFile(fileId: string): Promise; 52 | deleteFile(fileId: string): Promise; 53 | 54 | // Model configuration 55 | setModel(imageModel: string, videoModel: string): void; 56 | getImageModel(): string; 57 | getVideoModel(): string; 58 | 59 | // Provider information 60 | getSupportedFormats(): ProviderCapabilities; 61 | getModelCapabilities(): ModelCapabilities; 62 | getProviderInfo(): ProviderInfo; 63 | 64 | // Health and status 65 | healthCheck(): Promise; 66 | supportsVideo(): boolean; 67 | } 68 | 69 | export interface FileUploadStrategy { 70 | uploadFile( 71 | 
buffer: Buffer, 72 | filename: string, 73 | mimeType: string 74 | ): Promise; 75 | getFileForAnalysis(uploadedFile: UploadedFile): Promise; 76 | cleanup?(fileId: string): Promise; 77 | } 78 | 79 | export interface ProviderConfig { 80 | name: string; 81 | type: 'image' | 'video'; 82 | models: { 83 | image: string; 84 | video: string; 85 | }; 86 | credentials: Record; 87 | options: Record; 88 | } 89 | 90 | export interface ProviderFactory { 91 | createProvider(config: ProviderConfig): VisionProvider; 92 | getSupportedProviders(): string[]; 93 | registerProvider(name: string, factory: () => VisionProvider): void; 94 | } 95 | 96 | // Gemini-specific types 97 | export interface GeminiConfig { 98 | apiKey: string; 99 | baseUrl: string; 100 | imageModel: string; 101 | videoModel: string; 102 | } 103 | 104 | export interface GeminiFileMetadata { 105 | name: string; 106 | displayName: string; 107 | mimeType: string; 108 | sizeBytes: string; 109 | createTime: string; 110 | updateTime: string; 111 | expirationTime: string; 112 | sha256Hash: string; 113 | uri: string; 114 | state: 'PROCESSING' | 'ACTIVE' | 'FAILED'; 115 | } 116 | 117 | export interface GeminiGenerateContentRequest { 118 | contents: GeminiContent[]; 119 | generationConfig?: GeminiGenerationConfig; 120 | safetySettings?: GeminiSafetySetting[]; 121 | } 122 | 123 | export interface GeminiContent { 124 | role: 'user' | 'model'; 125 | parts: GeminiPart[]; 126 | } 127 | 128 | export type GeminiPart = 129 | | { text: string } 130 | | { inlineData: { mimeType: string; data: string } } 131 | | { fileData: { mimeType: string; fileUri: string } }; 132 | 133 | export interface GeminiGenerationConfig { 134 | temperature?: number; 135 | topP?: number; 136 | topK?: number; 137 | maxOutputTokens?: number; 138 | candidateCount?: number; 139 | stopSequences?: string[]; 140 | } 141 | 142 | export interface GeminiSafetySetting { 143 | category: string; 144 | threshold: string; 145 | } 146 | 147 | export interface 
GeminiGenerateContentResponse { 148 | candidates: GeminiCandidate[]; 149 | usageMetadata: { 150 | promptTokenCount: number; 151 | candidatesTokenCount: number; 152 | totalTokenCount: number; 153 | }; 154 | } 155 | 156 | export interface GeminiCandidate { 157 | content: GeminiContent; 158 | finishReason: string; 159 | index: number; 160 | safetyRatings?: GeminiSafetyRating[]; 161 | } 162 | 163 | export interface GeminiSafetyRating { 164 | category: string; 165 | probability: string; 166 | blocked: boolean; 167 | } 168 | 169 | // Vertex AI-specific types 170 | export interface VertexAIConfig { 171 | projectId: string; 172 | location: string; 173 | endpoint: string; 174 | credentials?: string; 175 | imageModel: string; 176 | videoModel: string; 177 | } 178 | 179 | export interface VertexAIGenerateContentRequest { 180 | contents: VertexAIContent[]; 181 | generationConfig?: VertexAIGenerationConfig; 182 | safetySettings?: VertexAISafetySetting[]; 183 | } 184 | 185 | export interface VertexAIContent { 186 | role: 'user' | 'model'; 187 | parts: VertexAIPart[]; 188 | } 189 | 190 | export type VertexAIPart = 191 | | { text: string } 192 | | { inlineData: { mimeType: string; data: string } } 193 | | { fileData: { mimeType: string; fileUri: string } }; 194 | 195 | export interface VertexAIGenerationConfig { 196 | temperature?: number; 197 | topP?: number; 198 | topK?: number; 199 | maxOutputTokens?: number; 200 | candidateCount?: number; 201 | stopSequences?: string[]; 202 | } 203 | 204 | export interface VertexAISafetySetting { 205 | category: string; 206 | threshold: string; 207 | } 208 | 209 | export interface VertexAIGenerateContentResponse { 210 | candidates: VertexAICandidate[]; 211 | usageMetadata: { 212 | promptTokenCount: number; 213 | candidatesTokenCount: number; 214 | totalTokenCount: number; 215 | }; 216 | modelVersion?: string; 217 | } 218 | 219 | export interface VertexAICandidate { 220 | content: VertexAIContent; 221 | finishReason: string; 222 | index: 
number; 223 | safetyRatings?: VertexAISafetyRating[]; 224 | } 225 | 226 | export interface VertexAISafetyRating { 227 | category: string; 228 | probability: string; 229 | blocked: boolean; 230 | } 231 | -------------------------------------------------------------------------------- /src/storage/gcs/GCSStorage.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Google Cloud Storage provider implementation using native GCS SDK 3 | */ 4 | 5 | import { Storage, Bucket, File } from '@google-cloud/storage'; 6 | import { StorageProvider, StorageFile } from '../../types/Storage.js'; 7 | import { StorageError } from '../../types/Errors.js'; 8 | import type { GCSConfig } from '../../types/Config.js'; 9 | 10 | export class GCSStorageProvider implements StorageProvider { 11 | private storage: Storage; 12 | private bucket: Bucket; 13 | private config: GCSConfig; 14 | 15 | constructor(config: GCSConfig) { 16 | this.config = config; 17 | 18 | // Initialize native GCS Storage client 19 | this.storage = new Storage({ 20 | projectId: config.projectId, 21 | keyFilename: config.credentials, 22 | }); 23 | 24 | this.bucket = this.storage.bucket(config.bucketName); 25 | } 26 | 27 | async uploadFile( 28 | buffer: Buffer, 29 | filename: string, 30 | mimeType: string 31 | ): Promise { 32 | try { 33 | const key = this.generateKey(filename); 34 | const file: File = this.bucket.file(key); 35 | 36 | await file.save(buffer, { 37 | contentType: mimeType, 38 | metadata: { 39 | cacheControl: 'public, max-age=31536000', // 1 year 40 | }, 41 | }); 42 | 43 | // Get the file metadata 44 | const [metadata] = await file.getMetadata(); 45 | 46 | return { 47 | id: key, 48 | filename, 49 | mimeType, 50 | size: buffer.length, 51 | url: `gs://${this.config.bucketName}/${key}`, 52 | lastModified: metadata.updated || new Date().toISOString(), 53 | etag: metadata.etag || this.generateETag(buffer), 54 | }; 55 | } catch (error) { 56 | throw new StorageError( 57 | 
`Failed to upload file to GCS: ${error instanceof Error ? error.message : String(error)}`, 58 | 'gcs', 59 | error instanceof Error ? error : new Error(String(error)) 60 | ); 61 | } 62 | } 63 | 64 | async downloadFile(fileId: string): Promise { 65 | try { 66 | const file: File = this.bucket.file(fileId); 67 | const [buffer] = await file.download(); 68 | return buffer; 69 | } catch (error) { 70 | throw new StorageError( 71 | `Failed to download file from GCS: ${error instanceof Error ? error.message : String(error)}`, 72 | 'gcs', 73 | error instanceof Error ? error : new Error(String(error)) 74 | ); 75 | } 76 | } 77 | 78 | async deleteFile(fileId: string): Promise { 79 | try { 80 | const file: File = this.bucket.file(fileId); 81 | await file.delete(); 82 | } catch (error) { 83 | // Don't throw error if file doesn't exist (404) 84 | if (error instanceof Error && error.message.includes('No such object')) { 85 | return; 86 | } 87 | throw new StorageError( 88 | `Failed to delete file from GCS: ${error instanceof Error ? error.message : String(error)}`, 89 | 'gcs', 90 | error instanceof Error ? error : new Error(String(error)) 91 | ); 92 | } 93 | } 94 | 95 | async getPublicUrl(fileId: string): Promise { 96 | // Return GCS URI format (gs://bucket/path) 97 | return `gs://${this.config.bucketName}/${fileId}`; 98 | } 99 | 100 | async getSignedUrl(fileId: string, expiresIn: number): Promise { 101 | try { 102 | const file: File = this.bucket.file(fileId); 103 | const [signedUrl] = await file.getSignedUrl({ 104 | version: 'v4', 105 | action: 'read', 106 | expires: Date.now() + expiresIn * 1000, // Convert seconds to milliseconds 107 | }); 108 | return signedUrl; 109 | } catch (error) { 110 | throw new StorageError( 111 | `Failed to generate signed URL: ${error instanceof Error ? error.message : String(error)}`, 112 | 'gcs', 113 | error instanceof Error ? 
error : new Error(String(error)) 114 | ); 115 | } 116 | } 117 | 118 | async listFiles(prefix?: string): Promise { 119 | try { 120 | const [files] = await this.bucket.getFiles({ prefix }); 121 | const storageFiles: StorageFile[] = []; 122 | 123 | for (const file of files) { 124 | const [metadata] = await file.getMetadata(); 125 | const filename = file.name.split('/').pop() || file.name; 126 | 127 | storageFiles.push({ 128 | id: file.name, 129 | filename, 130 | mimeType: metadata.contentType || 'application/octet-stream', 131 | size: parseInt(String(metadata.size || '0'), 10), 132 | url: `gs://${this.config.bucketName}/${file.name}`, 133 | lastModified: metadata.updated || new Date().toISOString(), 134 | etag: metadata.etag || '', 135 | }); 136 | } 137 | 138 | return storageFiles; 139 | } catch (error) { 140 | throw new StorageError( 141 | `Failed to list files from GCS: ${error instanceof Error ? error.message : String(error)}`, 142 | 'gcs', 143 | error instanceof Error ? error : new Error(String(error)) 144 | ); 145 | } 146 | } 147 | 148 | // Private helper methods 149 | 150 | private generateKey(filename: string): string { 151 | // Generate a unique key with timestamp and random UUID 152 | const timestamp = new Date().toISOString().split('T')[0]; // YYYY-MM-DD 153 | const randomId = Math.random().toString(36).substring(2, 15); 154 | const extension = filename.includes('.') 155 | ? 
`.${filename.split('.').pop()}` 156 | : ''; 157 | 158 | // Organize files by date and type 159 | const type = this.getFileType(filename); 160 | return `${type}/${timestamp}/${randomId}${extension}`; 161 | } 162 | 163 | private getFileType(filename: string): string { 164 | const extension = filename.split('.').pop()?.toLowerCase(); 165 | 166 | if ( 167 | [ 168 | 'jpg', 169 | 'jpeg', 170 | 'png', 171 | 'gif', 172 | 'bmp', 173 | 'webp', 174 | 'tiff', 175 | 'heic', 176 | 'heif', 177 | ].includes(extension || '') 178 | ) { 179 | return 'images'; 180 | } else if ( 181 | ['mp4', 'mov', 'avi', 'mkv', 'webm', 'flv', 'wmv', '3gp', 'm4v'].includes( 182 | extension || '' 183 | ) 184 | ) { 185 | return 'videos'; 186 | } else { 187 | return 'files'; 188 | } 189 | } 190 | 191 | private generateETag(buffer: Buffer): string { 192 | // Simple hash generation - in production, you might want to use a proper hash function 193 | const hash = Buffer.from(buffer).toString('base64').substring(0, 32); 194 | return `"${hash}"`; 195 | } 196 | 197 | // Configuration methods 198 | 199 | public getBucket(): string { 200 | return this.config.bucketName; 201 | } 202 | 203 | public getProjectId(): string { 204 | return this.config.projectId; 205 | } 206 | 207 | public getRegion(): string { 208 | return this.config.region; 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/utils/imageAnnotator.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Image annotation utilities using Sharp 3 | * Based on gemini_object_detection.js annotation logic 4 | */ 5 | 6 | import sharp from 'sharp'; 7 | import fs from 'fs/promises'; 8 | import path from 'path'; 9 | import os from 'os'; 10 | import crypto from 'crypto'; 11 | import type { DetectedObject } from '../types/ObjectDetection.js'; 12 | 13 | export interface AnnotationOptions { 14 | lineColor?: string; 15 | lineWidth?: number; 16 | labelColor?: string; 17 | 
labelHeight?: number; 18 | // REMOVED: pointColor and pointRadius (corner circles no longer used) 19 | } 20 | 21 | export class ImageAnnotator { 22 | private options: Required; 23 | 24 | constructor(options: AnnotationOptions = {}) { 25 | this.options = { 26 | lineColor: options.lineColor || 'red', 27 | lineWidth: options.lineWidth || 3, 28 | labelColor: options.labelColor || 'red', 29 | labelHeight: options.labelHeight || 20, 30 | // REMOVED: pointColor and pointRadius initialization (corner circles no longer used) 31 | }; 32 | } 33 | 34 | /** 35 | * Draw bounding boxes and labels on image using Sharp 36 | * Adapted from gemini_object_detection.js drawAnnotations function 37 | */ 38 | async drawAnnotations( 39 | imageBuffer: Buffer, 40 | detections: DetectedObject[], 41 | imageWidth: number, 42 | imageHeight: number 43 | ): Promise { 44 | let sharpImage = sharp(imageBuffer); 45 | 46 | // Prepare overlays for bounding boxes, corners, and text 47 | const overlays = []; 48 | 49 | for (let idx = 0; idx < detections.length; idx++) { 50 | const detection = detections[idx]; 51 | 52 | // Use normalized_box_2d coordinates (converted to pixels) 53 | if ( 54 | !detection.normalized_box_2d || 55 | detection.normalized_box_2d.length !== 4 56 | ) { 57 | console.warn( 58 | `[ImageAnnotator] Skipping detection without valid normalized_box_2d: ${detection.object}` 59 | ); 60 | continue; 61 | } 62 | 63 | // Convert normalized coordinates to pixels 64 | const [normY1, normX1, normY2, normX2] = detection.normalized_box_2d; 65 | const x1 = Math.round((normX1 / 1000) * imageWidth); // left edge 66 | const y1 = Math.round((normY1 / 1000) * imageHeight); // top edge 67 | const x2 = Math.round((normX2 / 1000) * imageWidth); // right edge 68 | const y2 = Math.round((normY2 / 1000) * imageHeight); // bottom edge 69 | 70 | // Create rectangle overlay (bounding box) 71 | const rectOverlay = await this.createRectangleOverlay( 72 | imageWidth, 73 | imageHeight, 74 | x1, 75 | y1, 76 | x2, 77 | 
y2 78 | ); 79 | overlays.push({ 80 | input: rectOverlay, 81 | left: 0, 82 | top: 0, 83 | }); 84 | 85 | // REMOVED: Corner circles (were causing "double boxing" visual clutter) 86 | // The 4 corner circles made it appear like buttons were boxed multiple times 87 | 88 | // Create text label 89 | const text = `${detection.object} - ${detection.label}`; 90 | const textOverlay = await this.createTextOverlay(text); 91 | 92 | // Calculate text position (above bounding box) 93 | const textX = x1; 94 | const textY = Math.max(y1 - this.options.labelHeight - 4, 0); 95 | 96 | overlays.push({ 97 | input: textOverlay, 98 | left: textX, 99 | top: textY, 100 | }); 101 | } 102 | 103 | // Composite all overlays onto the original image 104 | if (overlays.length > 0) { 105 | sharpImage = sharpImage.composite(overlays); 106 | } 107 | 108 | return sharpImage.toBuffer(); 109 | } 110 | 111 | /** 112 | * Create a rectangle overlay using SVG 113 | * Adapted from gemini_object_detection.js createRectangleOverlay function 114 | */ 115 | private async createRectangleOverlay( 116 | imageWidth: number, 117 | imageHeight: number, 118 | x1: number, 119 | y1: number, 120 | x2: number, 121 | y3: number 122 | ): Promise { 123 | const rectWidth = x2 - x1; 124 | const rectHeight = y3 - y1; 125 | 126 | const rectangleBuffer = await sharp({ 127 | create: { 128 | width: imageWidth, 129 | height: imageHeight, 130 | channels: 4, 131 | background: { r: 0, g: 0, b: 0, alpha: 0 }, 132 | }, 133 | }) 134 | .composite([ 135 | { 136 | input: Buffer.from( 137 | ` 138 | 140 | ` 141 | ), 142 | left: 0, 143 | top: 0, 144 | }, 145 | ]) 146 | .png() 147 | .toBuffer(); 148 | 149 | return rectangleBuffer; 150 | } 151 | 152 | // REMOVED: createCircleOverlay method (no longer needed since corner circles removed) 153 | 154 | /** 155 | * Create a text overlay using Sharp 156 | * Adapted from gemini_object_detection.js createTextOverlay function 157 | */ 158 | private async createTextOverlay(text: string): Promise { 159 | // 
Try to find a system font, fallback to default 160 | const fontPaths = [ 161 | 'C:/Windows/Fonts/arial.ttf', // Windows 162 | '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', // Linux 163 | '/System/Library/Fonts/Arial.ttf', // macOS 164 | ]; 165 | 166 | let fontfile = undefined; 167 | for (const fontPath of fontPaths) { 168 | try { 169 | await fs.access(fontPath); 170 | fontfile = fontPath; 171 | break; 172 | } catch { 173 | // Font not found, try next 174 | } 175 | } 176 | 177 | // Calculate approximate text width (rough estimate: 8 pixels per character) 178 | // Add padding for better visual appearance 179 | const estimatedWidth = Math.max(text.length * 8 + 8, 50); // Minimum 50px width 180 | 181 | const textBuffer = await sharp({ 182 | create: { 183 | width: estimatedWidth, 184 | height: this.options.labelHeight, 185 | channels: 4, 186 | background: { r: 255, g: 0, b: 0, alpha: 1 }, // Red background 187 | }, 188 | }) 189 | .composite([ 190 | { 191 | input: { 192 | text: { 193 | text: text, 194 | font: fontfile ? 
'Arial' : 'sans-serif', 195 | fontfile: fontfile, 196 | rgba: true, 197 | align: 'left', 198 | }, 199 | }, 200 | left: 2, 201 | top: 2, 202 | }, 203 | ]) 204 | .png() 205 | .toBuffer(); 206 | 207 | return textBuffer; 208 | } 209 | 210 | /** 211 | * Save buffer to a temporary file with unique name 212 | */ 213 | async saveToTempFile( 214 | buffer: Buffer, 215 | extension: string = 'png' 216 | ): Promise { 217 | const tempDir = os.tmpdir(); 218 | const randomId = crypto.randomBytes(8).toString('hex'); 219 | const filename = `ai-vision-mcp-${randomId}.${extension}`; 220 | const tempPath = path.join(tempDir, filename); 221 | 222 | await fs.writeFile(tempPath, buffer); 223 | return tempPath; 224 | } 225 | 226 | /** 227 | * Save buffer to temp file, or gracefully skip if permission denied 228 | */ 229 | async saveToTempFileOrSkip( 230 | buffer: Buffer, 231 | extension: string = 'png' 232 | ): Promise<{ path: string; method: 'temp_file' } | { method: 'skipped' }> { 233 | try { 234 | const tempDir = os.tmpdir(); 235 | const randomId = crypto.randomBytes(8).toString('hex'); 236 | const filename = `ai-vision-mcp-${randomId}.${extension}`; 237 | const tempPath = path.join(tempDir, filename); 238 | 239 | await fs.writeFile(tempPath, buffer); 240 | return { path: tempPath, method: 'temp_file' }; 241 | } catch (error) { 242 | const errorMessage = error instanceof Error ? error.message : String(error); 243 | console.warn(`[ImageAnnotator] Skipped temp file creation due to permission error: ${errorMessage}. 
Detection results will be returned without file output.`); 244 | return { method: 'skipped' }; 245 | } 246 | } 247 | 248 | /** 249 | * Save buffer to explicit path, ensuring directory exists 250 | */ 251 | async saveToExplicitPath(filePath: string, buffer: Buffer): Promise { 252 | const outputDir = path.dirname(filePath); 253 | await fs.mkdir(outputDir, { recursive: true }); 254 | await fs.writeFile(filePath, buffer); 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/utils/retry.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Retry logic and error handling utilities 3 | */ 4 | 5 | import { RateLimitExceededError, NetworkError } from '../types/Errors.js'; 6 | 7 | export interface RetryOptions { 8 | maxRetries?: number; 9 | baseDelay?: number; 10 | maxDelay?: number; 11 | backoffMultiplier?: number; 12 | jitter?: boolean; 13 | retryableErrors?: string[]; 14 | onRetry?: (attempt: number, error: Error) => void; 15 | } 16 | 17 | export interface RetryResult { 18 | result: T; 19 | attempts: number; 20 | totalDelay: number; 21 | } 22 | 23 | const DEFAULT_RETRY_OPTIONS: Required = { 24 | maxRetries: 3, 25 | baseDelay: 1000, 26 | maxDelay: 30000, 27 | backoffMultiplier: 2, 28 | jitter: true, 29 | retryableErrors: [ 30 | 'RATE_LIMIT_EXCEEDED', 31 | 'NETWORK_ERROR', 32 | 'ECONNRESET', 33 | 'ECONNREFUSED', 34 | 'ETIMEDOUT', 35 | 'ENOTFOUND', 36 | ], 37 | onRetry: () => {}, 38 | }; 39 | 40 | export class RetryHandler { 41 | /** 42 | * Execute an operation with retry logic 43 | */ 44 | static async withRetry( 45 | operation: () => Promise, 46 | options: RetryOptions = {} 47 | ): Promise> { 48 | const opts = { ...DEFAULT_RETRY_OPTIONS, ...options }; 49 | let lastError: Error; 50 | let totalDelay = 0; 51 | 52 | for (let attempt = 0; attempt <= opts.maxRetries; attempt++) { 53 | try { 54 | const result = await operation(); 55 | return { 56 | result, 57 | attempts: attempt + 
1, 58 | totalDelay, 59 | }; 60 | } catch (error) { 61 | lastError = error instanceof Error ? error : new Error(String(error)); 62 | 63 | // Don't retry on the last attempt 64 | if (attempt === opts.maxRetries || !this.shouldRetry(lastError, opts)) { 65 | throw lastError; 66 | } 67 | 68 | const delay = this.calculateDelay(attempt, opts); 69 | totalDelay += delay; 70 | 71 | opts.onRetry(attempt + 1, lastError); 72 | await this.sleep(delay); 73 | } 74 | } 75 | 76 | throw lastError!; 77 | } 78 | 79 | /** 80 | * Execute an operation with exponential backoff 81 | */ 82 | static async withExponentialBackoff( 83 | operation: () => Promise, 84 | options: RetryOptions = {} 85 | ): Promise { 86 | const result = await this.withRetry(operation, { 87 | ...options, 88 | backoffMultiplier: 2, 89 | jitter: true, 90 | }); 91 | return result.result; 92 | } 93 | 94 | /** 95 | * Execute an operation with linear backoff 96 | */ 97 | static async withLinearBackoff( 98 | operation: () => Promise, 99 | options: RetryOptions = {} 100 | ): Promise { 101 | const result = await this.withRetry(operation, { 102 | ...options, 103 | backoffMultiplier: 1, 104 | jitter: false, 105 | }); 106 | return result.result; 107 | } 108 | 109 | /** 110 | * Determine if an error is retryable 111 | */ 112 | private static shouldRetry( 113 | error: Error, 114 | options: Required 115 | ): boolean { 116 | // Check if it's a VisionError with a retryable code 117 | if ('code' in error && typeof error.code === 'string') { 118 | return options.retryableErrors.includes(error.code); 119 | } 120 | 121 | // Check if it's a RateLimitExceededError 122 | if (error instanceof RateLimitExceededError) { 123 | return true; 124 | } 125 | 126 | // Check if it's a NetworkError 127 | if (error instanceof NetworkError) { 128 | return true; 129 | } 130 | 131 | // Check error message for common network-related errors 132 | const message = error.message.toLowerCase(); 133 | const networkErrorPatterns = [ 134 | 'network error', 135 | 
'connection refused', 136 | 'connection reset', 137 | 'name resolution failed', 138 | ]; 139 | 140 | return networkErrorPatterns.some(pattern => message.includes(pattern)); 141 | } 142 | 143 | /** 144 | * Calculate delay before next retry 145 | */ 146 | private static calculateDelay( 147 | attempt: number, 148 | options: Required 149 | ): number { 150 | let delay = 151 | options.baseDelay * Math.pow(options.backoffMultiplier, attempt); 152 | 153 | // Apply jitter if enabled 154 | if (options.jitter) { 155 | delay = delay * (0.5 + Math.random() * 0.5); 156 | } 157 | 158 | // Ensure delay doesn't exceed maximum 159 | return Math.min(delay, options.maxDelay); 160 | } 161 | 162 | /** 163 | * Sleep for the specified number of milliseconds 164 | */ 165 | private static async sleep(ms: number): Promise { 166 | await new Promise(resolve => setTimeout(resolve, ms)); 167 | } 168 | 169 | /** 170 | * Create a retryable version of a function 171 | */ 172 | static wrap Promise>( 173 | fn: T, 174 | options: RetryOptions = {} 175 | ): T { 176 | return (async (...args: Parameters): Promise> => { 177 | const result = await this.withRetry(() => fn(...args), options); 178 | return result.result; 179 | }) as T; 180 | } 181 | } 182 | 183 | /** 184 | * Circuit breaker pattern for handling repeated failures 185 | */ 186 | export interface CircuitBreakerOptions { 187 | failureThreshold?: number; 188 | recoveryDelay?: number; 189 | monitoringPeriod?: number; 190 | onStateChange?: (state: 'CLOSED' | 'OPEN' | 'HALF_OPEN') => void; 191 | } 192 | 193 | export class CircuitBreaker { 194 | private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED'; 195 | private failureCount = 0; 196 | private lastFailureTime = 0; 197 | private successCount = 0; 198 | 199 | constructor(private options: CircuitBreakerOptions = {}) {} 200 | 201 | async execute(operation: () => Promise): Promise { 202 | const opts = { 203 | failureThreshold: 5, 204 | recoveryDelay: 60000, 205 | monitoringPeriod: 10000, 206 | 
...this.options, 207 | }; 208 | 209 | if (this.state === 'OPEN') { 210 | if (Date.now() - this.lastFailureTime > opts.recoveryDelay) { 211 | this.setState('HALF_OPEN'); 212 | } else { 213 | throw new Error('Circuit breaker is OPEN'); 214 | } 215 | } 216 | 217 | try { 218 | const result = await operation(); 219 | this.onSuccess(); 220 | return result; 221 | } catch (error) { 222 | this.onFailure(); 223 | throw error; 224 | } 225 | } 226 | 227 | private onSuccess(): void { 228 | this.failureCount = 0; 229 | this.successCount++; 230 | 231 | if (this.state === 'HALF_OPEN') { 232 | this.setState('CLOSED'); 233 | } 234 | } 235 | 236 | private onFailure(): void { 237 | this.failureCount++; 238 | this.lastFailureTime = Date.now(); 239 | 240 | if (this.failureCount >= this.options.failureThreshold!) { 241 | this.setState('OPEN'); 242 | } 243 | } 244 | 245 | private setState(state: 'CLOSED' | 'OPEN' | 'HALF_OPEN'): void { 246 | this.state = state; 247 | this.options.onStateChange?.(state); 248 | } 249 | 250 | getState(): 'CLOSED' | 'OPEN' | 'HALF_OPEN' { 251 | return this.state; 252 | } 253 | 254 | getFailureCount(): number { 255 | return this.failureCount; 256 | } 257 | 258 | getSuccessCount(): number { 259 | return this.successCount; 260 | } 261 | 262 | reset(): void { 263 | this.state = 'CLOSED'; 264 | this.failureCount = 0; 265 | this.successCount = 0; 266 | } 267 | } 268 | 269 | /** 270 | * Bulkhead pattern for limiting concurrent operations 271 | */ 272 | export class Bulkhead { 273 | private running = 0; 274 | private queue: Array<{ 275 | resolve: (value: any) => void; 276 | reject: (reason: any) => void; 277 | operation: () => Promise; 278 | }> = []; 279 | 280 | constructor(private maxConcurrency: number) {} 281 | 282 | async execute(operation: () => Promise): Promise { 283 | return new Promise((resolve, reject) => { 284 | this.queue.push({ resolve, reject, operation }); 285 | this.process(); 286 | }); 287 | } 288 | 289 | private async process(): Promise { 290 | if 
(this.running >= this.maxConcurrency || this.queue.length === 0) { 291 | return; 292 | } 293 | 294 | this.running++; 295 | const { resolve, reject, operation } = this.queue.shift()!; 296 | 297 | try { 298 | const result = await operation(); 299 | resolve(result); 300 | } catch (error) { 301 | reject(error); 302 | } finally { 303 | this.running--; 304 | this.process(); 305 | } 306 | } 307 | 308 | getRunningCount(): number { 309 | return this.running; 310 | } 311 | 312 | getQueueLength(): number { 313 | return this.queue.length; 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /docs/llm_logs/percentage-scale-text-summary-plan.md: -------------------------------------------------------------------------------- 1 | # Plan: Hybrid Element Identification + Spatial Reference for Object Detection 2 | 3 | **Date**: 2025-01-10 (Updated: 2025-01-10) 4 | **Author**: Claude Code 5 | **Issue**: Improve object detection output by combining CSS selector automation guidance with minimal spatial reference coordinates 6 | 7 | ## Problem Statement 8 | 9 | The current `detect_objects_in_image` output has two competing needs: 10 | 11 | 1. **Web Automation**: Requires CSS selectors, semantic targeting for reliable automation 12 | 2. **Spatial Awareness**: Needs position reference for layout understanding and debugging 13 | 3. **Information Overload**: Current verbose coordinate explanations obscure actionable guidance 14 | 4. 
**Mixed Priorities**: Unclear whether to focus on automation or spatial reference 15 | 16 | ## Solution: Hybrid Approach - CSS Selectors + Minimal Coordinates 17 | 18 | ### Core Approach 19 | - **Primary Focus**: CSS selectors and semantic targeting (automation best practices) 20 | - **Secondary Reference**: Concise percentage coordinates (spatial awareness) 21 | - **Information Hierarchy**: 1-2 lines per element, automation guidance first 22 | - **Clear Separation**: Distinct purposes for different information types 23 | 24 | ### Rationale for Hybrid Approach 25 | 26 | **Why CSS Selectors (Primary):** 27 | - **Automation Reliability**: Survives layout changes, responsive design, and viewport differences 28 | - **Industry Standard**: Aligns with modern web automation best practices (Playwright, Puppeteer) 29 | - **Maintenance Friendly**: Less brittle than coordinate-based approaches 30 | - **Semantic Accuracy**: Targets elements by their actual purpose and attributes 31 | 32 | **Why Minimal Coordinates (Secondary):** 33 | - **Spatial Reference**: Quick position orientation without overwhelming detail 34 | - **Visual Debugging**: Helps developers locate elements in complex layouts 35 | - **Design Validation**: Useful for QA and design review workflows 36 | - **Non-Automation Use Cases**: Screenshots annotation, layout documentation 37 | 38 | **Why Concise Format (1-2 Lines):** 39 | - **Reduced Cognitive Load**: Focus on essential information only 40 | - **Faster Scanning**: Developers can quickly find what they need 41 | - **Clear Hierarchy**: Automation guidance prominently featured 42 | - **Information Efficiency**: No redundant explanations or verbose calculations 43 | 44 | ### Benefits 45 | - ✅ **Automation-First Design**: CSS selectors prominently featured for web automation 46 | - ✅ **Spatial Context Preserved**: Percentage coordinates provide layout reference 47 | - ✅ **Information Efficiency**: Concise 1-2 line format reduces cognitive load 48 | - ✅ 
**Multi-Use Case Support**: Serves automation, debugging, and documentation needs 49 | - ✅ **Industry Alignment**: Follows modern web development and testing practices 50 | - ✅ **Reduced Verbosity**: Eliminates redundant coordinate calculations and explanations 51 | 52 | ## Implementation Plan 53 | 54 | ### Phase 1: Update Summary Generator (2 days) 55 | 56 | **File Changes Required**: 57 | - Modify `src/tools/detect_objects_in_image.ts` to generate hybrid summary format 58 | - Implement concise 2-line element description (automation + position) 59 | - Remove verbose coordinate explanations and automation guidance 60 | - Focus on CSS selector recommendations as primary automation method 61 | 62 | **Key Functions**: 63 | ```typescript 64 | function generateDetectionSummary( 65 | detections: DetectedObject[], 66 | imageMetadata: ImageMetadata, 67 | model: string, 68 | provider: string 69 | ): string { 70 | // Generate concise element summaries (1-2 lines each) 71 | // Line 1: CSS selector recommendations 72 | // Line 2: Percentage position reference 73 | // Remove verbose coordinate calculations 74 | } 75 | 76 | function suggestCSSSelectors(detection: DetectedObject): string[] { 77 | // Recommend CSS selectors based on element type and label 78 | // Return 2-3 most likely selectors 79 | } 80 | 81 | function formatPositionReference(detection: DetectedObject): string { 82 | // Return concise position: "78.5% across, 26.7% down (13% × 4.5% size)" 83 | } 84 | ``` 85 | 86 | ### Phase 2: Testing and Validation (1 day) 87 | - Test with various UI element types (buttons, inputs, links, etc.) 
88 | - Validate CSS selector recommendations are accurate and useful 89 | - Ensure percentage coordinates provide meaningful spatial reference 90 | - Verify 2-line format provides sufficient information without overload 91 | 92 | ### Phase 3: Documentation Update (1 day) 93 | - Update README.md with new hybrid summary examples 94 | - Document the automation-first approach with spatial reference 95 | - Remove verbose coordinate automation examples 96 | 97 | ## Technical Specifications 98 | 99 | ### Input Data (Simplified) 100 | ```typescript 101 | interface DetectedObject { 102 | object: string; // "button", "input" 103 | label: string; // "Submit button" 104 | normalized_box_2d: [number, number, number, number]; // [ymin, xmin, ymax, xmax] 0-1000 105 | // REMOVED: coordinates object (redundant, confusing) 106 | } 107 | ``` 108 | 109 | ### Output Enhancement (New + Simplified) 110 | ```typescript 111 | interface ObjectDetectionResponse { 112 | detections: DetectedObject[]; 113 | image_metadata: { width: number, height: number, size_bytes: number, format: string }; 114 | 115 | // NEW: Human-readable summary (primary coordinate interface) 116 | summary: string; // Percentage-based descriptions with automation code 117 | 118 | // REMOVED: Complex nested coordinate structures 119 | } 120 | ``` 121 | 122 | ### Coordinate Conversion Logic 123 | ```typescript 124 | // Convert normalized coordinates to percentages AND calculate pixel details 125 | const [ymin, xmin, ymax, xmax] = detection.normalized_box_2d; 126 | const { width: imageWidth, height: imageHeight } = imageMetadata; 127 | 128 | // Percentage calculations 129 | const percentageBox = { 130 | top: ymin / 10, // Convert 245 → 24.5% 131 | left: xmin / 10, // Convert 720 → 72.0% 132 | bottom: ymax / 10, // Convert 290 → 29.0% 133 | right: xmax / 10 // Convert 850 → 85.0% 134 | }; 135 | 136 | const centerX = (xmin + xmax) / 2 / 10; // 78.5% 137 | const centerY = (ymin + ymax) / 2 / 10; // 26.7% 138 | const 
widthPercent = (xmax - xmin) / 10; // 13.0%
const heightPercent = (ymax - ymin) / 10; // 4.5%

// Pixel calculations (derived from normalized + image dimensions)
const pixelBox = {
  x: Math.round((xmin / 1000) * imageWidth), // 1382
  y: Math.round((ymin / 1000) * imageHeight), // 265
  width: Math.round(((xmax - xmin) / 1000) * imageWidth), // 250
  height: Math.round(((ymax - ymin) / 1000) * imageHeight) // 49
};
```

## Sample Output

### Updated Text Summary (Hybrid Approach - CSS Selectors + Minimal Coordinates)
```
🖼️ IMAGE ANALYSIS COMPLETE

📏 Source Image: 1920×1080 pixels (PNG, 2.0MB)
🤖 Detection Model: gemini-2.5-flash-lite (google)
📊 Elements Found: 3 interactive elements detected

⚠️ FOR WEB AUTOMATION:
- **RECOMMENDED**: Use CSS selectors for reliable automation (primary approach)
- **REFERENCE ONLY**: Percentage coordinates for spatial context (secondary reference)
- **AVOID**: Direct coordinate-based clicking for automation

## 🔍 DETECTED ELEMENTS:

### 1. button - Submit Button
- **Automation**: `button[type="submit"]` or `button:has-text("Submit")`
- **Position**: 78.5% across, 26.7% down (13% × 4.5% size)

### 2. input - Email Address Field
- **Automation**: `input[type="email"]` or `input[name="email"]`
- **Position**: 40.0% across, 20.0% down (40% × 4% size)

### 3. 
select - Country Dropdown 176 | - **Automation**: `select[name="country"]` or `#country-select` 177 | - **Position**: 25.0% across, 45.0% down (35% × 3% size) 178 | ``` 179 | 180 | ## Risk Assessment 181 | 182 | ### Low Risk 183 | - **Backward Compatibility**: No changes to existing data structure 184 | - **Performance Impact**: Minimal text generation overhead (~1ms) 185 | - **Implementation Simplicity**: Straightforward 2-line format per element 186 | 187 | ### Medium Risk 188 | - **CSS Selector Accuracy**: Need to ensure recommended selectors are practical 189 | - **Balance Maintenance**: Keep automation focus while providing useful spatial reference 190 | 191 | ### High Value 192 | - **Automation-First Approach**: Prominently features industry-standard CSS selectors 193 | - **Information Efficiency**: Concise format reduces cognitive load 194 | - **Multi-Purpose Utility**: Serves both automation and spatial reference needs 195 | - **Developer Experience**: Clear hierarchy and actionable guidance 196 | 197 | ## Success Metrics 198 | 199 | 1. **Automation Adoption**: Increased use of CSS selectors over coordinate-based automation 200 | 2. **Information Efficiency**: Positive feedback on concise 2-line element format 201 | 3. **Dual-Purpose Utility**: Usage for both automation and spatial reference scenarios 202 | 4. 
**Developer Satisfaction**: Preference for automation-first approach with spatial context 203 | 204 | ## Future Enhancements 205 | 206 | If the hybrid approach proves successful, consider: 207 | - **Context-Aware HTML Elements**: Use specific HTML element names (button, input, select) when analyzing web pages 208 | - **Smart Selector Intelligence**: AI-powered CSS selector suggestions based on visual analysis and common patterns 209 | - **Accessibility Integration**: Include ARIA attributes and accessibility hints in selector recommendations 210 | - **Framework-Specific Guidance**: Tailored selector recommendations for different testing frameworks (Playwright, Puppeteer, Cypress) 211 | 212 | ## Implementation Timeline 213 | 214 | - **Day 1**: Update summary generator for hybrid format (CSS selectors + minimal coordinates) 215 | - **Day 2**: Implement 2-line element descriptions and remove verbose explanations 216 | - **Day 3**: Integration testing with various element types and validation 217 | - **Day 4**: Documentation updates and example refinements 218 | 219 | ## Conclusion 220 | 221 | This hybrid approach represents the optimal balance between automation best practices and spatial reference utility. By prominently featuring CSS selectors while maintaining concise percentage coordinates, the tool provides: 222 | 223 | 1. **Actionable Automation Guidance**: Industry-standard CSS selectors for reliable web automation 224 | 2. **Spatial Context**: Quick position reference without overwhelming detail 225 | 3. **Information Efficiency**: Concise 2-line format that reduces cognitive load 226 | 4. 
**Multi-Purpose Value**: Serves automation, debugging, and documentation workflows 227 | 228 | The enhancement transforms the object detection output from a coordinate-focused tool into an automation-first solution that still preserves essential spatial awareness - making it valuable for real-world web development and testing workflows while promoting robust, maintainable automation practices. -------------------------------------------------------------------------------- /src/utils/validation.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Zod schemas for data validation 3 | */ 4 | 5 | import { z } from 'zod'; 6 | import type { Config } from '../types/Config.js'; 7 | import type { AnalysisOptions } from '../types/Analysis.js'; 8 | import { 9 | FUNCTION_NAMES, 10 | type FunctionName, 11 | } from '../constants/FunctionNames.js'; 12 | 13 | // Provider selection schemas 14 | const ProviderSchema = z.enum(['google', 'vertex_ai']); 15 | 16 | // Log level schema 17 | const LogLevelSchema = z.enum(['info', 'debug', 'warn', 'error']); 18 | 19 | // Node environment schema 20 | const NodeEnvSchema = z.enum(['development', 'production']); 21 | 22 | // File format schemas 23 | const ImageFormatSchema = z.enum([ 24 | 'png', 25 | 'jpg', 26 | 'jpeg', 27 | 'webp', 28 | 'gif', 29 | 'bmp', 30 | 'tiff', 31 | ]); 32 | 33 | const VideoFormatSchema = z.enum([ 34 | 'mp4', 35 | 'mov', 36 | 'avi', 37 | 'mkv', 38 | 'webm', 39 | 'flv', 40 | 'wmv', 41 | '3gp', 42 | ]); 43 | 44 | // Configuration schema 45 | export const ConfigSchema = z.object({ 46 | IMAGE_PROVIDER: ProviderSchema.optional().default('google'), 47 | VIDEO_PROVIDER: ProviderSchema.optional().default('google'), 48 | 49 | // Model configuration 50 | IMAGE_MODEL: z.string().min(1).optional(), 51 | VIDEO_MODEL: z.string().min(1).optional(), 52 | 53 | // Function-specific model configuration 54 | ANALYZE_IMAGE_MODEL: z.string().min(1).optional(), 55 | COMPARE_IMAGES_MODEL: 
z.string().min(1).optional(), 56 | DETECT_OBJECTS_IN_IMAGE_MODEL: z.string().min(1).optional(), 57 | ANALYZE_VIDEO_MODEL: z.string().min(1).optional(), 58 | 59 | // Gemini API configuration 60 | GEMINI_API_KEY: z.string().min(1).optional(), 61 | GEMINI_BASE_URL: z 62 | .string() 63 | .url() 64 | .optional() 65 | .default('https://generativelanguage.googleapis.com'), 66 | 67 | // Vertex AI configuration 68 | VERTEX_CREDENTIALS: z.string().min(1).optional(), 69 | VERTEX_PROJECT_ID: z.string().min(1).optional(), 70 | VERTEX_LOCATION: z.string().min(1).optional().default('us-central1'), 71 | VERTEX_ENDPOINT: z 72 | .string() 73 | .url() 74 | .optional() 75 | .default('https://aiplatform.googleapis.com'), 76 | 77 | // Google Cloud Storage configuration (for Vertex AI file storage) 78 | GCS_BUCKET_NAME: z.string().min(1).optional(), 79 | GCS_PROJECT_ID: z.string().min(1).optional(), // Auto-derived from credentials 80 | GCS_CREDENTIALS: z.string().min(1).optional(), // Defaults to VERTEX_CREDENTIALS 81 | GCS_REGION: z.string().min(1).optional().default('us-central1'), 82 | 83 | // Universal API parameters 84 | TEMPERATURE: z.coerce.number().min(0).max(2).optional().default(0.8), 85 | TOP_P: z.coerce.number().min(0).max(1).optional().default(0.95), 86 | TOP_K: z.coerce.number().int().min(1).max(100).optional().default(30), 87 | MAX_TOKENS: z.coerce.number().int().min(1).max(8192).optional().default(1000), 88 | 89 | // Task-specific API parameters 90 | TEMPERATURE_FOR_IMAGE: z.number().min(0).max(2).optional(), 91 | TOP_P_FOR_IMAGE: z.number().min(0).max(1).optional(), 92 | TOP_K_FOR_IMAGE: z.number().int().positive().optional(), 93 | MAX_TOKENS_FOR_IMAGE: z.number().int().positive().optional(), 94 | TEMPERATURE_FOR_VIDEO: z.number().min(0).max(2).optional(), 95 | TOP_P_FOR_VIDEO: z.number().min(0).max(1).optional(), 96 | TOP_K_FOR_VIDEO: z.number().int().positive().optional(), 97 | MAX_TOKENS_FOR_VIDEO: z.number().int().positive().optional(), 98 | 99 | // 
Function-specific API parameters 100 | TEMPERATURE_FOR_ANALYZE_IMAGE: z.number().min(0).max(2).optional(), 101 | TOP_P_FOR_ANALYZE_IMAGE: z.number().min(0).max(1).optional(), 102 | TOP_K_FOR_ANALYZE_IMAGE: z.number().int().positive().optional(), 103 | MAX_TOKENS_FOR_ANALYZE_IMAGE: z.number().int().positive().optional(), 104 | TEMPERATURE_FOR_COMPARE_IMAGES: z.number().min(0).max(2).optional(), 105 | TOP_P_FOR_COMPARE_IMAGES: z.number().min(0).max(1).optional(), 106 | TOP_K_FOR_COMPARE_IMAGES: z.number().int().positive().optional(), 107 | MAX_TOKENS_FOR_COMPARE_IMAGES: z.number().int().positive().optional(), 108 | TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE: z 109 | .number() 110 | .min(0) 111 | .max(2) 112 | .optional() 113 | .default(0), 114 | TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE: z 115 | .number() 116 | .min(0) 117 | .max(1) 118 | .optional() 119 | .default(0.95), 120 | TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE: z 121 | .number() 122 | .int() 123 | .positive() 124 | .optional() 125 | .default(30), 126 | MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE: z 127 | .number() 128 | .int() 129 | .positive() 130 | .optional() 131 | .default(8192), 132 | TEMPERATURE_FOR_ANALYZE_VIDEO: z.number().min(0).max(2).optional(), 133 | TOP_P_FOR_ANALYZE_VIDEO: z.number().min(0).max(1).optional(), 134 | TOP_K_FOR_ANALYZE_VIDEO: z.number().int().positive().optional(), 135 | MAX_TOKENS_FOR_ANALYZE_VIDEO: z.number().int().positive().optional(), 136 | 137 | // File processing configuration 138 | MAX_IMAGE_SIZE: z.coerce 139 | .number() 140 | .int() 141 | .positive() 142 | .optional() 143 | .default(20 * 1024 * 1024), // 20MB 144 | MAX_VIDEO_SIZE: z.coerce 145 | .number() 146 | .int() 147 | .positive() 148 | .optional() 149 | .default(2 * 1024 * 1024 * 1024), // 2GB 150 | ALLOWED_IMAGE_FORMATS: z 151 | .array(ImageFormatSchema) 152 | .optional() 153 | .default(['png', 'jpg', 'jpeg', 'webp', 'gif', 'bmp', 'tiff']), 154 | ALLOWED_VIDEO_FORMATS: z 155 | .array(VideoFormatSchema) 156 | .optional() 157 | 
.default(['mp4', 'mov', 'avi', 'mkv', 'webm', 'flv', 'wmv', '3gp']), 158 | MAX_VIDEO_DURATION: z.coerce 159 | .number() 160 | .int() 161 | .positive() 162 | .optional() 163 | .default(3600), // 1 hour 164 | MAX_IMAGES_FOR_COMPARISON: z.coerce 165 | .number() 166 | .int() 167 | .positive() 168 | .optional() 169 | .default(4), // Maximum 4 images for comparison 170 | 171 | // File upload configuration 172 | GEMINI_FILES_API_THRESHOLD: z.coerce 173 | .number() 174 | .int() 175 | .positive() 176 | .optional() 177 | .default(10 * 1024 * 1024), // 10MB 178 | VERTEX_AI_FILES_API_THRESHOLD: z.coerce.number().int().optional().default(0), // Vertex AI requires external storage for all files 179 | 180 | // Logging configuration 181 | LOG_LEVEL: LogLevelSchema.optional().default('info'), 182 | 183 | // Development configuration 184 | NODE_ENV: NodeEnvSchema.optional().default('development'), 185 | }); 186 | 187 | // Analysis options schema 188 | export const AnalysisOptionsSchema = z.object({ 189 | temperature: z.number().min(0).max(2).optional(), 190 | topP: z.number().min(0).max(1).optional(), 191 | maxTokens: z.number().int().positive().optional(), 192 | stopSequences: z.array(z.string()).optional(), 193 | taskType: z.enum(['image', 'video']).optional(), 194 | functionName: z 195 | .enum(Object.values(FUNCTION_NAMES) as [FunctionName, ...FunctionName[]]) 196 | .optional(), 197 | }); 198 | 199 | // MCP tool argument schemas 200 | export const AnalyzeImageArgsSchema = z.object({ 201 | imageSource: z.string().min(1, 'Image source is required'), 202 | prompt: z.string().min(1, 'Prompt is required'), 203 | options: AnalysisOptionsSchema.optional(), 204 | }); 205 | 206 | export const AnalyzeVideoArgsSchema = z.object({ 207 | videoSource: z.string().min(1, 'Video source is required'), 208 | prompt: z.string().min(1, 'Prompt is required'), 209 | options: AnalysisOptionsSchema.optional(), 210 | }); 211 | 212 | // File validation schemas 213 | export const FileValidationSchema = 
z.object({ 214 | filename: z.string().min(1, 'Filename is required'), 215 | mimeType: z.string().min(1, 'MIME type is required'), 216 | size: z.number().int().nonnegative('File size must be non-negative'), 217 | }); 218 | 219 | // URL validation schema 220 | export const UrlSchema = z.string().url('Invalid URL format'); 221 | 222 | // Base64 validation schema 223 | export const Base64Schema = z 224 | .string() 225 | .regex(/^data:image\/[a-zA-Z]+;base64,/, 'Invalid base64 image format'); 226 | 227 | // Model name validation 228 | export const ModelNameSchema = z.string().min(1, 'Model name is required'); 229 | 230 | // Provider info validation 231 | export const ProviderInfoSchema = z.object({ 232 | name: z.string().min(1, 'Provider name is required'), 233 | type: z.enum(['image', 'video']), 234 | models: z.object({ 235 | image: ModelNameSchema, 236 | video: ModelNameSchema, 237 | }), 238 | credentials: z.record(z.string()), 239 | options: z.record(z.unknown()).optional(), 240 | }); 241 | 242 | // Health status validation 243 | export const HealthStatusSchema = z.object({ 244 | status: z.enum(['healthy', 'unhealthy', 'degraded']), 245 | message: z.string().optional(), 246 | lastCheck: z.string().datetime(), 247 | responseTime: z.number().nonnegative().optional(), 248 | }); 249 | 250 | // Usage metadata validation 251 | export const UsageMetadataSchema = z.object({ 252 | promptTokenCount: z.number().int().nonnegative(), 253 | candidatesTokenCount: z.number().int().nonnegative(), 254 | totalTokenCount: z.number().int().nonnegative(), 255 | }); 256 | 257 | // Analysis result validation 258 | export const AnalysisResultSchema = z.object({ 259 | text: z.string(), 260 | metadata: z.object({ 261 | model: z.string(), 262 | provider: z.string(), 263 | usage: UsageMetadataSchema.optional(), 264 | processingTime: z.number().nonnegative().optional(), 265 | fileType: z.string().optional(), 266 | fileSize: z.number().int().nonnegative().optional(), 267 | }), 268 | }); 269 | 270 
| // File reference validation 271 | export const FileReferenceSchema = z.union([ 272 | z.object({ 273 | type: z.literal('file_uri'), 274 | uri: z.string().min(1), 275 | mimeType: z.string().min(1), 276 | }), 277 | z.object({ 278 | type: z.literal('public_url'), 279 | url: z.string().url(), 280 | mimeType: z.string().min(1), 281 | }), 282 | z.object({ 283 | type: z.literal('base64'), 284 | data: z.string().min(1), 285 | mimeType: z.string().min(1), 286 | }), 287 | ]); 288 | 289 | // Validation functions 290 | export const validateConfig = (config: unknown): Config => { 291 | return ConfigSchema.parse(config); 292 | }; 293 | 294 | export const validateAnalysisOptions = (options: unknown): AnalysisOptions => { 295 | return AnalysisOptionsSchema.parse(options); 296 | }; 297 | 298 | export const validateAnalyzeImageArgs = (args: unknown) => { 299 | return AnalyzeImageArgsSchema.parse(args); 300 | }; 301 | 302 | export const validateAnalyzeVideoArgs = (args: unknown) => { 303 | return AnalyzeVideoArgsSchema.parse(args); 304 | }; 305 | 306 | export const validateFile = (file: unknown) => { 307 | return FileValidationSchema.parse(file); 308 | }; 309 | 310 | export const validateUrl = (url: unknown): string => { 311 | return UrlSchema.parse(url); 312 | }; 313 | 314 | export const validateBase64 = (base64: unknown): string => { 315 | return Base64Schema.parse(base64); 316 | }; 317 | 318 | export const validateModelName = (model: unknown): string => { 319 | return ModelNameSchema.parse(model); 320 | }; 321 | 322 | export const validateHealthStatus = (status: unknown) => { 323 | return HealthStatusSchema.parse(status); 324 | }; 325 | 326 | export const validateAnalysisResult = (result: unknown) => { 327 | return AnalysisResultSchema.parse(result); 328 | }; 329 | 330 | export const validateFileReference = (reference: unknown) => { 331 | return FileReferenceSchema.parse(reference); 332 | }; 333 | 334 | // Type guards 335 | export const isValidUrl = (value: unknown): value is 
string => { 336 | return UrlSchema.safeParse(value).success; 337 | }; 338 | 339 | export const isValidBase64 = (value: unknown): value is string => { 340 | return Base64Schema.safeParse(value).success; 341 | }; 342 | 343 | export const isImageFormat = (mimeType: string): boolean => { 344 | return mimeType.startsWith('image/'); 345 | }; 346 | 347 | export const isVideoFormat = (mimeType: string): boolean => { 348 | return mimeType.startsWith('video/'); 349 | }; 350 | 351 | export const isSupportedImageFormat = ( 352 | mimeType: string, 353 | supportedFormats: string[] 354 | ): boolean => { 355 | const extension = mimeType.split('/')[1]; 356 | return supportedFormats.includes(extension); 357 | }; 358 | 359 | export const isSupportedVideoFormat = ( 360 | mimeType: string, 361 | supportedFormats: string[] 362 | ): boolean => { 363 | const extension = mimeType.split('/')[1]; 364 | return supportedFormats.includes(extension); 365 | }; 366 | 367 | // Error formatting for validation errors 368 | export const formatZodError = (error: z.ZodError): string => { 369 | const errorMessages = error.errors.map(err => { 370 | const path = err.path.join('.'); 371 | return `${path}: ${err.message}`; 372 | }); 373 | return `Validation failed: ${errorMessages.join(', ')}`; 374 | }; 375 | -------------------------------------------------------------------------------- /docs/environment-variable-guide.md: -------------------------------------------------------------------------------- 1 | # Environment Variable Configuration Guide 2 | 3 | This guide provides comprehensive documentation for all environment variables used by the AI Vision MCP Server. 
4 | 5 | ## Table of Contents 6 | 7 | - [Quick Setup](#quick-setup) 8 | - [Configuration Priority](#configuration-priority) 9 | - [Environment Variables Reference](#environment-variables-reference) 10 | - [Configuration Examples](#configuration-examples) 11 | - [Troubleshooting](#troubleshooting) 12 | 13 | ## Quick Setup 14 | 15 | ### Google AI Studio Provider (Recommended for simplicity) 16 | 17 | ```bash 18 | export IMAGE_PROVIDER="google" 19 | export VIDEO_PROVIDER="google" 20 | export GEMINI_API_KEY="your-gemini-api-key" 21 | ``` 22 | 23 | ### Vertex AI Provider (Recommended for production) 24 | 25 | ```bash 26 | export IMAGE_PROVIDER="vertex_ai" 27 | export VIDEO_PROVIDER="vertex_ai" 28 | export VERTEX_CREDENTIALS="/path/to/service-account.json" 29 | export GCS_BUCKET_NAME="your-gcs-bucket" 30 | ``` 31 | 32 | Get your Google AI Studio API key [here](https://aistudio.google.com/app/api-keys). 33 | 34 | For Vertex AI setup, see [Vertex AI Setup Guide](provider/vertex-ai-setup-guide.md). 35 | 36 | ## Configuration Priority 37 | 38 | The AI Vision MCP Server uses a hierarchical configuration system where more specific settings override general ones. 39 | 40 | ### AI Parameters Priority (Highest to Lowest) 41 | 42 | 1. **LLM-assigned values** - Parameters passed directly in tool calls (e.g., `{"temperature": 0.1}`) 43 | 2. **Function-specific variables** - `TEMPERATURE_FOR_ANALYZE_IMAGE`, `MAX_TOKENS_FOR_COMPARE_IMAGES`, etc. 44 | 3. **Task-specific variables** - `TEMPERATURE_FOR_IMAGE`, `MAX_TOKENS_FOR_VIDEO`, etc. 45 | 4. **Universal variables** - `TEMPERATURE`, `MAX_TOKENS`, etc. 46 | 5. **System defaults** - Built-in fallback values 47 | 48 | ### Model Selection Priority (Highest to Lowest) 49 | 50 | 1. **Function-specific models** - `ANALYZE_IMAGE_MODEL`, `COMPARE_IMAGES_MODEL`, `ANALYZE_VIDEO_MODEL` 51 | 2. **Task-specific models** - `IMAGE_MODEL`, `VIDEO_MODEL` 52 | 3. 
**System defaults** - Built-in fallback models (`gemini-2.5-flash-lite`, `gemini-2.5-flash`) 53 | 54 | ## Environment Variables Reference 55 | 56 | ### Provider Selection 57 | 58 | | Variable | Required | Description | Default | 59 | |----------|-----------|-------------|---------| 60 | | `IMAGE_PROVIDER` | Yes | Provider for image analysis | `google` or `vertex_ai` | 61 | | `VIDEO_PROVIDER` | Yes | Provider for video analysis | `google` or `vertex_ai` | 62 | 63 | ### Model Selection 64 | 65 | | Variable | Required | Description | Default | 66 | |----------|-----------|-------------|---------| 67 | | `IMAGE_MODEL` | No | Model for image analysis | `gemini-2.5-flash-lite` | 68 | | `VIDEO_MODEL` | No | Model for video analysis | `gemini-2.5-flash` | 69 | 70 | ### Function-specific Model Selection 71 | 72 | | Variable | Required | Description | Default | 73 | |----------|-----------|-------------|---------| 74 | | `ANALYZE_IMAGE_MODEL` | No | Model for analyze_image function | Uses `IMAGE_MODEL` | 75 | | `COMPARE_IMAGES_MODEL` | No | Model for compare_images function | Uses `IMAGE_MODEL` | 76 | | `DETECT_OBJECTS_IN_IMAGE_MODEL` | No | Model for detect_objects_in_image function | Uses `IMAGE_MODEL` | 77 | | `ANALYZE_VIDEO_MODEL` | No | Model for analyze_video function | Uses `VIDEO_MODEL` | 78 | 79 | ### Google Gemini API Configuration 80 | 81 | | Variable | Required | Description | Default | 82 | |----------|-----------|-------------|---------| 83 | | `GEMINI_API_KEY` | Yes if using `google` provider | Google Gemini API key | Required for Gemini | 84 | | `GEMINI_BASE_URL` | No | Gemini API base URL | `https://generativelanguage.googleapis.com` | 85 | 86 | ### Vertex AI Configuration 87 | 88 | | Variable | Required | Description | Default | 89 | |----------|-----------|-------------|---------| 90 | | `VERTEX_CREDENTIALS` | Yes if using `vertex_ai` provider | Path to GCP service account JSON | Required for Vertex AI | 91 | | `VERTEX_PROJECT_ID` | Auto | Google Cloud 
project ID | Auto-derived from credentials | 92 | | `VERTEX_LOCATION` | No | Vertex AI region | `us-central1` | 93 | | `VERTEX_ENDPOINT` | No | Vertex AI endpoint URL | `https://aiplatform.googleapis.com` | 94 | 95 | ### Google Cloud Storage (Required for Vertex AI) 96 | 97 | | Variable | Required | Description | Default | 98 | |----------|-----------|-------------|---------| 99 | | `GCS_BUCKET_NAME` | Yes if using `vertex_ai` provider | GCS bucket name for Vertex AI uploads | Required for Vertex AI | 100 | | `GCS_CREDENTIALS` | No | Path to GCS credentials | Defaults to `VERTEX_CREDENTIALS` | 101 | | `GCS_PROJECT_ID` | No | GCS project ID | Auto-derived from `VERTEX_CREDENTIALS` | 102 | | `GCS_REGION` | No | GCS region | Defaults to `VERTEX_LOCATION` | 103 | 104 | ### Universal API Parameters 105 | 106 | | Variable | Required | Description | Range | Default | 107 | |----------|-----------|-------------|-------|---------| 108 | | `TEMPERATURE` | No | AI response temperature | 0.0–2.0 | `0.8` | 109 | | `TOP_P` | No | Top-p sampling parameter | 0.0–1.0 | `0.95` | 110 | | `TOP_K` | No | Top-k sampling parameter | 1–100 | `30` | 111 | | `MAX_TOKENS` | No | Maximum tokens for analysis | 1–8192 | `1000` | 112 | 113 | ### Task-specific API Parameters 114 | 115 | | Variable | Required | Description | Range | Default | 116 | |----------|-----------|-------------|-------|---------| 117 | | `TEMPERATURE_FOR_IMAGE` | No | Image-specific temperature | 0.0–2.0 | Uses `TEMPERATURE` | 118 | | `TOP_P_FOR_IMAGE` | No | Image-specific top-p | 0.0–1.0 | Uses `TOP_P` | 119 | | `TOP_K_FOR_IMAGE` | No | Image-specific top-k | 1–100 | Uses `TOP_K` | 120 | | `MAX_TOKENS_FOR_IMAGE` | No | Maximum tokens for image analysis | 1–8192 | Uses `MAX_TOKENS` | 121 | | `TEMPERATURE_FOR_VIDEO` | No | Video-specific temperature | 0.0–2.0 | Uses `TEMPERATURE` | 122 | | `TOP_P_FOR_VIDEO` | No | Video-specific top-p | 0.0–1.0 | Uses `TOP_P` | 123 | | `TOP_K_FOR_VIDEO` | No | Video-specific top-k | 1–100 
| Uses `TOP_K` | 124 | | `MAX_TOKENS_FOR_VIDEO` | No | Maximum tokens for video analysis | 1–8192 | Uses `MAX_TOKENS` | 125 | 126 | ### Function-specific API Parameters 127 | 128 | | Variable | Required | Description | Range | Default | 129 | |----------|-----------|-------------|-------|---------| 130 | | `TEMPERATURE_FOR_ANALYZE_IMAGE` | No | Temperature for analyze_image | 0.0–2.0 | Uses `TEMPERATURE_FOR_IMAGE` | 131 | | `TOP_P_FOR_ANALYZE_IMAGE` | No | Top-p for analyze_image | 0.0–1.0 | Uses `TOP_P_FOR_IMAGE` | 132 | | `TOP_K_FOR_ANALYZE_IMAGE` | No | Top-k for analyze_image | 1–100 | Uses `TOP_K_FOR_IMAGE` | 133 | | `MAX_TOKENS_FOR_ANALYZE_IMAGE` | No | Max tokens for analyze_image | 1–8192 | Uses `MAX_TOKENS_FOR_IMAGE` | 134 | | `TEMPERATURE_FOR_COMPARE_IMAGES` | No | Temperature for compare_images | 0.0–2.0 | Uses `TEMPERATURE_FOR_IMAGE` | 135 | | `TOP_P_FOR_COMPARE_IMAGES` | No | Top-p for compare_images | 0.0–1.0 | Uses `TOP_P_FOR_IMAGE` | 136 | | `TOP_K_FOR_COMPARE_IMAGES` | No | Top-k for compare_images | 1–100 | Uses `TOP_K_FOR_IMAGE` | 137 | | `MAX_TOKENS_FOR_COMPARE_IMAGES` | No | Max tokens for compare_images | 1–8192 | Uses `MAX_TOKENS_FOR_IMAGE` | 138 | | `TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Temperature for object detection | 0.0–2.0 | `0.0` | 139 | | `TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Top-p for object detection | 0.0–1.0 | `0.95` | 140 | | `TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Top-k for object detection | 1–100 | `30` | 141 | | `MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Max tokens for object detection | 1–8192 | `8192` | 142 | | `TEMPERATURE_FOR_ANALYZE_VIDEO` | No | Temperature for analyze_video | 0.0–2.0 | Uses `TEMPERATURE_FOR_VIDEO` | 143 | | `TOP_P_FOR_ANALYZE_VIDEO` | No | Top-p for analyze_video | 0.0–1.0 | Uses `TOP_P_FOR_VIDEO` | 144 | | `TOP_K_FOR_ANALYZE_VIDEO` | No | Top-k for analyze_video | 1–100 | Uses `TOP_K_FOR_VIDEO` | 145 | | `MAX_TOKENS_FOR_ANALYZE_VIDEO` | No | Max tokens for analyze_video | 
1–8192 | Uses `MAX_TOKENS_FOR_VIDEO` | 146 | 147 | ### File Processing Configuration 148 | 149 | | Variable | Required | Description | Default | 150 | |----------|-----------|-------------|---------| 151 | | `MAX_IMAGE_SIZE` | No | Maximum image size in bytes | `20971520` (20 MB) | 152 | | `MAX_VIDEO_SIZE` | No | Maximum video size in bytes | `2147483648` (2 GB) | 153 | | `MAX_VIDEO_DURATION` | No | Maximum video duration (seconds) | `3600` (1 hour) | 154 | | `MAX_IMAGES_FOR_COMPARISON` | No | Maximum images for comparison | `4` | 155 | | `ALLOWED_IMAGE_FORMATS` | No | Comma-separated image formats | `png,jpg,jpeg,webp,gif,bmp,tiff` | 156 | | `ALLOWED_VIDEO_FORMATS` | No | Comma-separated video formats | `mp4,mov,avi,mkv,webm,flv,wmv,3gp` | 157 | 158 | ### File Upload Configuration 159 | 160 | | Variable | Required | Description | Default | 161 | |----------|-----------|-------------|---------| 162 | | `GEMINI_FILES_API_THRESHOLD` | No | Size threshold for Gemini Files API | `10485760` (10 MB) | 163 | | `VERTEX_AI_FILES_API_THRESHOLD` | No | Size threshold for Vertex AI uploads | `0` | 164 | 165 | ### Development Configuration 166 | 167 | | Variable | Required | Description | Default | 168 | |----------|-----------|-------------|---------| 169 | | `LOG_LEVEL` | No | Logging level | `info` | 170 | | `NODE_ENV` | No | Environment mode | `development` | 171 | 172 | ## Configuration Examples 173 | 174 | ### Basic Development Setup 175 | 176 | ```bash 177 | # Provider selection 178 | export IMAGE_PROVIDER="google" 179 | export VIDEO_PROVIDER="google" 180 | export GEMINI_API_KEY="your-gemini-api-key" 181 | 182 | # Basic configuration 183 | export TEMPERATURE=0.7 184 | export MAX_TOKENS=1500 185 | export LOG_LEVEL="debug" 186 | ``` 187 | 188 | ### Production Setup with Vertex AI 189 | 190 | ```bash 191 | # Provider selection 192 | export IMAGE_PROVIDER="vertex_ai" 193 | export VIDEO_PROVIDER="vertex_ai" 194 | export VERTEX_CREDENTIALS="/path/to/service-account.json" 195 | 
export GCS_BUCKET_NAME="your-production-bucket" 196 | 197 | # Production models 198 | export IMAGE_MODEL="gemini-2.5-flash" 199 | export VIDEO_MODEL="gemini-2.5-flash-pro" 200 | 201 | # Production parameters 202 | export TEMPERATURE=0.3 203 | export MAX_TOKENS=2000 204 | export NODE_ENV="production" 205 | export LOG_LEVEL="info" 206 | ``` 207 | 208 | ### Function-specific Optimization 209 | 210 | ```bash 211 | # General settings 212 | export IMAGE_PROVIDER="google" 213 | export GEMINI_API_KEY="your-gemini-api-key" 214 | 215 | # Function-specific optimizations 216 | export TEMPERATURE_FOR_ANALYZE_IMAGE=0.1 # Precise image analysis 217 | export TEMPERATURE_FOR_COMPARE_IMAGES=0.5 # More creative comparisons 218 | export TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE=0.0 # Deterministic detection 219 | export MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE=8192 # High token limit for JSON 220 | 221 | # Function-specific models 222 | export ANALYZE_IMAGE_MODEL="gemini-2.5-flash-lite" 223 | export COMPARE_IMAGES_MODEL="gemini-2.5-flash" 224 | export DETECT_OBJECTS_IN_IMAGE_MODEL="gemini-2.5-flash-lite" 225 | ``` 226 | 227 | ### Mixed Provider Setup 228 | 229 | ```bash 230 | # Use Gemini for images (simpler, faster) 231 | export IMAGE_PROVIDER="google" 232 | export GEMINI_API_KEY="your-gemini-api-key" 233 | 234 | # Use Vertex AI for videos (enterprise features) 235 | export VIDEO_PROVIDER="vertex_ai" 236 | export VERTEX_CREDENTIALS="/path/to/service-account.json" 237 | export GCS_BUCKET_NAME="your-mixed-provider-bucket" 238 | 239 | # Task-specific parameters 240 | export TEMPERATURE_FOR_IMAGE=0.2 241 | export TEMPERATURE_FOR_VIDEO=0.5 242 | export MAX_TOKENS_FOR_IMAGE=1000 243 | export MAX_TOKENS_FOR_VIDEO=2000 244 | ``` 245 | 246 | ## File Upload Strategy Configuration 247 | 248 | ### Gemini Provider Strategy 249 | 250 | ```bash 251 | export GEMINI_FILES_API_THRESHOLD=10485760 # 10MB 252 | 253 | # Files ≤ 10MB: Use inline base64 data 254 | # Files > 10MB: Use Gemini Files API 255 | ``` 
256 | 257 | ### Vertex AI Provider Strategy 258 | 259 | ```bash 260 | export VERTEX_AI_FILES_API_THRESHOLD=0 # All files use GCS 261 | 262 | # All files: Upload to Google Cloud Storage and use gs:// URIs 263 | ``` 264 | 265 | ## Troubleshooting 266 | 267 | ### Common Issues 268 | 269 | 1. **Missing API Key Error** 270 | ``` 271 | Error: Missing required configuration for google: GEMINI_API_KEY 272 | ``` 273 | **Solution**: Set `GEMINI_API_KEY` environment variable when using `google` provider 274 | 275 | 2. **Vertex AI Authentication Error** 276 | ``` 277 | Error: Missing required configuration for vertex_ai: VERTEX_CREDENTIALS 278 | ``` 279 | **Solution**: Set `VERTEX_CREDENTIALS` and `GCS_BUCKET_NAME` for Vertex AI 280 | 281 | 3. **File Size Limit Exceeded** 282 | ``` 283 | Error: File size exceeds maximum limit 284 | ``` 285 | **Solution**: Increase `MAX_IMAGE_SIZE` or `MAX_VIDEO_SIZE`, or reduce file size 286 | 287 | 4. **Unsupported File Format** 288 | ``` 289 | Error: Unsupported file format 290 | ``` 291 | **Solution**: Check `ALLOWED_IMAGE_FORMATS` and `ALLOWED_VIDEO_FORMATS` settings 292 | 293 | 5. **Token Limit Exceeded** 294 | ``` 295 | Error: Response exceeds max tokens 296 | ``` 297 | **Solution**: Increase relevant `MAX_TOKENS_*` environment variable 298 | 299 | ### Debug Mode 300 | 301 | Enable debug logging to troubleshoot issues: 302 | 303 | ```bash 304 | export LOG_LEVEL="debug" 305 | ``` 306 | 307 | This will provide detailed information about: 308 | - Configuration loading 309 | - Provider initialization 310 | - File processing 311 | - API requests and responses 312 | - Error details 313 | 314 | ### Configuration Validation 315 | 316 | The server validates configuration on startup. 
Common validation errors: 317 | 318 | - Missing required provider-specific variables 319 | - Invalid file paths in credentials 320 | - Incompatible configuration combinations 321 | - Out-of-range parameter values 322 | 323 | Check the console output for detailed validation messages. 324 | 325 | ## Best Practices 326 | 327 | 1. **Use Environment-specific Files**: Create `.env.development` and `.env.production` files 328 | 2. **Secure Credentials**: Never commit API keys or credentials to version control 329 | 3. **Optimize Token Usage**: Set appropriate `MAX_TOKENS` values for each function type 330 | 4. **Monitor Usage**: Use appropriate temperature settings for your use case 331 | 5. **Test Configuration**: Validate configuration in development before production deployment 332 | 333 | ## Related Documentation 334 | 335 | - [Installation Guide](../README.md#installation) 336 | - [Vertex AI Setup Guide](provider/vertex-ai-setup-guide.md) 337 | - [Technical Specification](SPEC.md) 338 | - [Development Patterns](../CLAUDE.md) -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | Please always use context7 MCP, web search, or web fetch for additional information when fixing bugs or implementing new features. 5 | 6 | ## **CRITICAL: Documentation Maintenance Requirements** 7 | 8 | **BEFORE starting any coding work:** 9 | 1. **ALWAYS create a plan document** in the `docs/llm_logs/` folder before writing any code 10 | 2. **ALWAYS update README.md** when introducing changes that affect: 11 | - New MCP tools or parameters 12 | - Environment variables 13 | - Configuration options 14 | - Installation instructions 15 | - Breaking changes 16 | 3. 
**ALWAYS update docs/SPEC.md** when introducing changes that affect: 17 | - Architecture modifications 18 | - New provider implementations 19 | - API interface changes 20 | - File handling logic 21 | - Error handling patterns 22 | 23 | **Planning Process:** 24 | - Create plan documents in `docs/llm_logs/` folder (e.g., `docs/llm_logs/feature-name-plan.md`) 25 | - Include architecture decisions, implementation steps, and testing strategy 26 | - Reference this plan in your commit messages 27 | - Keep plan documents as documentation of implementation decisions 28 | 29 | **Solution Planning Best Practices:** 30 | - **ALWAYS present at least 3 options** when planning solutions to problems 31 | - Analyze trade-offs: effort vs. benefit, maintainability vs. speed, risk vs. reward 32 | - Provide clear recommendations with rationale (e.g., "Option 2 recommended because...") 33 | - Consider: quick fixes, balanced approaches, and comprehensive solutions 34 | - Include effort estimates, risk assessments, and rollback strategies for each option 35 | - Use structured format: Option 1 (Simple), Option 2 (Balanced), Option 3 (Comprehensive) 36 | 37 | **Example Planning Structure:** 38 | ``` 39 | ## Plan: [Problem Description] 40 | 41 | ### Option 1: Quick Fix (15 min) 42 | - ✅ Minimal change, fastest implementation 43 | - ❌ Technical debt, not future-proof 44 | - **When to use**: Urgent hotfixes, time pressure 45 | 46 | ### Option 2: Balanced Solution (45 min) - RECOMMENDED 47 | - ✅ Good maintainability, moderate effort 48 | - ✅ Addresses root cause, extensible 49 | - ❌ Longer implementation time 50 | - **When to use**: Most production scenarios 51 | 52 | ### Option 3: Comprehensive Refactor (2 hours) 53 | - ✅ Perfect architecture, future-proof 54 | - ❌ High effort, potential for new bugs 55 | - **When to use**: Major feature additions, architectural improvements 56 | 57 | ### Recommendation: Option 2 58 | **Rationale**: Balances immediate needs with long-term maintainability... 
59 | ``` 60 | 61 | **Documentation Synchronization:** 62 | - README.md is for **users** - installation, usage, and configuration 63 | - docs/SPEC.md is for **developers** - technical specifications and architecture 64 | - CLAUDE.md is for **AI assistants** - development patterns and constraints 65 | - All three documents must stay consistent with the actual implementation 66 | 67 | ## Development Commands 68 | 69 | ### Building and Testing 70 | - `npm run build` - Build TypeScript project to `dist/` directory 71 | - `npm run dev` - Start development server with watch mode (tsc --watch) 72 | - `npm start` - Start the built MCP server (node dist/index.js) 73 | 74 | ### Code Quality 75 | - `npm run lint` - Run ESLint on all TypeScript files 76 | - `npm run lint:fix` - Run ESLint with auto-fix 77 | - `npm run format` - Format code with Prettier 78 | 79 | ### Publishing 80 | - `npm run prepublishOnly` - Run lint before publish 81 | - `npm run preversion` - Run lint before version bump 82 | - `npm run version` - Format code and add to git before version 83 | - `npm run prepare` - Build project automatically on install 84 | 85 | ## Architecture Overview 86 | 87 | This is a Model Context Protocol (MCP) server that provides AI-powered image and video analysis using Google Gemini and Vertex AI models. 88 | 89 | ### Core Components 90 | 91 | **Server Architecture** (`src/server.ts`): 92 | - Main MCP server entry point using `@modelcontextprotocol/sdk` 93 | - Lazy-loaded services initialized on first request via `getServices()` function 94 | - Four primary tools: `analyze_image`, `compare_images`, `detect_objects_in_image`, and `analyze_video` 95 | - Comprehensive error handling with custom `VisionError` types 96 | - Graceful shutdown handling for SIGINT/SIGTERM 97 | 98 | **Configuration Hierarchy System**: 99 | The server implements a sophisticated 4-level configuration priority system: 100 | 1. 
**LLM-assigned values** - Parameters passed directly in tool calls (e.g., `{"temperature": 0.1}`) 101 | 2. **Function-specific variables** - `TEMPERATURE_FOR_ANALYZE_IMAGE`, `MAX_TOKENS_FOR_COMPARE_IMAGES`, etc. 102 | 3. **Task-specific variables** - `TEMPERATURE_FOR_IMAGE`, `MAX_TOKENS_FOR_VIDEO`, etc. 103 | 4. **Universal variables** - `TEMPERATURE`, `MAX_TOKENS`, etc. 104 | 105 | **Provider Factory** (`src/providers/factory/ProviderFactory.ts`): 106 | - Factory pattern for creating AI provider instances with validation (`VisionProviderFactory`) 107 | - Supports two providers: `google` (Gemini API) and `vertex_ai` (Vertex AI) 108 | - `createProviderWithValidation()` method ensures configuration validation before provider creation 109 | - Automatic provider initialization via `initializeDefaultProviders()` on module load 110 | - Configuration requirement validation and error handling with provider context 111 | - Dynamic provider registration support through `registerProvider()` method 112 | 113 | **Configuration Service** (`src/services/ConfigService.ts`): 114 | - Singleton pattern for configuration management via `ConfigService.getInstance()` 115 | - Environment variable validation with Zod schemas 116 | - Provider-specific configuration methods 117 | - Auto-derivation of related settings (e.g., project ID from credentials) 118 | - Hierarchical configuration resolution 119 | 120 | **Configuration Validation** (`src/types/Config.ts` and `src/utils/validation.ts`): 121 | - `Config.ts` defines TypeScript interfaces for all configuration options 122 | - `validation.ts` provides Zod schemas that validate environment variables against these interfaces 123 | - These files must stay synchronized - any new config field in Config.ts requires corresponding validation rules in validation.ts 124 | 125 | **Key Services**: 126 | - `FileService` - Handles file uploads, validation, and processing with support for URLs, local files, and base64, includes cross-platform path 
handling 127 | - `ConfigService` - Singleton pattern for environment variables and settings with validation 128 | - Vision providers in `src/providers/` - AI model implementations with consistent interfaces 129 | - Storage strategies in `src/storage/` - Google Cloud Storage integration 130 | - File upload strategies in `src/file-upload/` - Provider-specific upload handling 131 | - Image annotation utilities in `src/utils/` - Sharp-based image processing for object detection 132 | 133 | ### MCP Tools Implementation 134 | 135 | **All tools follow consistent patterns:** 136 | - Configuration hierarchy: function-specific → task-specific → universal variables 137 | - File source support: URLs, local files, base64 data 138 | - Error handling with custom `VisionError` types with provider context 139 | - Provider-agnostic interface through factory pattern 140 | - Structured output schemas for object detection 141 | 142 | **Tool-specific behaviors:** 143 | - `detect_objects_in_image`: Returns annotated images with bounding boxes, 2-step file handling (explicit path → temp file), uses structured JSON output with coordinates, includes CSS selector suggestions for web elements 144 | - `compare_images`: Supports 2-4 images with mixed source types, batch processing optimization 145 | - `analyze_image`: Special prompt handling for frontend UI comparison tasks, intelligent file processing based on size 146 | - `analyze_video`: YouTube URL and local file support, GCS integration for Vertex AI, duration and size validation 147 | 148 | ### Provider Implementation 149 | 150 | **Gemini Provider** (`src/providers/gemini/`): 151 | - Direct Google Gemini API integration using `@google/genai` 152 | - Files API for larger uploads (>10MB via `GEMINI_FILES_API_THRESHOLD`) 153 | - Base64 encoding for smaller files (inline data) 154 | - Structured output support for object detection with response schemas 155 | - Native support for both `google` and `vertex_ai` providers using same SDK 156 | 157 
| **Vertex AI Provider** (`src/providers/vertexai/`): 158 | - Google Cloud Vertex AI integration using `@google/genai` SDK 159 | - Requires GCS bucket for all file uploads (configured via `VERTEX_AI_FILES_API_THRESHOLD`) 160 | - Service account authentication with auto project ID extraction 161 | - Uses same underlying SDK as Gemini provider for consistency 162 | 163 | ### File Processing Flow 164 | 165 | 1. **Input Validation**: File size, format, and duration checks using configurable limits 166 | 2. **Upload Strategy Selection**: Based on provider and file size thresholds 167 | 3. **File Processing**: MIME type detection, path resolution, cross-platform support (Windows/Unix) 168 | 4. **AI Analysis**: Provider-specific API calls with structured output schemas 169 | 5. **Response Processing**: Structured JSON responses with comprehensive error handling 170 | 171 | ## Critical Development Constraints 172 | 173 | ### Configuration Synchronization 174 | - `src/types/Config.ts` and `src/utils/validation.ts` MUST stay synchronized 175 | - Every new config field in Config.ts requires corresponding Zod validation in validation.ts 176 | - Function-specific environment variables must follow the naming pattern: `TEMPERATURE_FOR_ANALYZE_IMAGE`, etc. 
177 | - When adding new configuration, always implement the 4-level hierarchy 178 | 179 | ### Error Handling Requirements 180 | - Always use custom `VisionError` types with provider context 181 | - Include error codes for proper client handling 182 | - Implement retry logic for network failures 183 | - Never expose sensitive credentials in error messages 184 | - Provider-specific error context for debugging 185 | 186 | ### TypeScript Configuration 187 | - ES2022 target with ESNext modules, strict type checking enabled 188 | - Path mapping with `@/*` pointing to `src/*` for clean imports 189 | - Declaration maps and source maps enabled for debugging 190 | - No implicit any, returns, or this allowed (strict mode) 191 | 192 | ### File Organization 193 | ``` 194 | src/ 195 | ├── providers/ # AI provider implementations 196 | │ ├── gemini/ # Google Gemini provider 197 | │ ├── vertexai/ # Vertex AI provider 198 | │ └── factory/ # Provider factory 199 | ├── services/ # Core services 200 | │ ├── ConfigService.ts 201 | │ └── FileService.ts 202 | ├── storage/ # Storage implementations 203 | ├── file-upload/ # File upload strategies 204 | ├── types/ # TypeScript type definitions 205 | ├── utils/ # Utility functions 206 | └── tools/ # MCP tool implementations 207 | ``` 208 | 209 | ## Development Patterns 210 | 211 | 1. **Lazy Loading**: Services initialized on first request via `getServices()` function 212 | 2. **Factory Pattern**: Providers created through `VisionProviderFactory` with validation 213 | 3. **Singleton Pattern**: `ConfigService.getInstance()` ensures consistency 214 | 4. **Strategy Pattern**: File upload strategies selected based on provider and size 215 | 5. **Zod Validation**: All inputs validated with Zod schemas for runtime type safety 216 | 6. **Configuration Hierarchy**: Always implement 4-level priority: LLM-assigned → function-specific → task-specific → universal 217 | 7. 
**Error Context**: Always include provider information in errors for debugging 218 | 8. **Cross-Platform Support**: Handle both Windows and Unix file paths correctly 219 | 9. **Config Building Pattern**: Use `buildConfigWithOptions()` helper from BaseVisionProvider for consistent config generation 220 | 221 | ### Config Building Pattern (IMPORTANT) 222 | 223 | When implementing provider methods that need AI configuration, **always use** the `buildConfigWithOptions()` helper: 224 | 225 | ```typescript 226 | // ✅ Correct - uses helper method 227 | const config = this.buildConfigWithOptions('image', options?.functionName, options); 228 | 229 | await this.client.models.generateContent({ 230 | model, 231 | contents, 232 | config, // Automatically includes responseSchema and systemInstruction if provided 233 | }); 234 | 235 | // ❌ Incorrect - manual config building (duplicates code) 236 | const config = { 237 | temperature: this.resolveTemperatureForFunction(...), 238 | topP: this.resolveTopPForFunction(...), 239 | topK: this.resolveTopKForFunction(...), 240 | maxOutputTokens: this.resolveMaxTokensForFunction(...), 241 | candidateCount: 1, 242 | }; 243 | if (options?.responseSchema) { 244 | config.responseMimeType = 'application/json'; 245 | config.responseSchema = options.responseSchema; 246 | } 247 | if (options?.systemInstruction) { 248 | config.systemInstruction = options.systemInstruction; 249 | } 250 | // ... manual config building creates maintenance burden 251 | ``` 252 | 253 | **Why use `buildConfigWithOptions()`?** 254 | 255 | 1. **DRY Principle**: Single source of truth for config generation 256 | 2. **Automatic Structured Output**: Handles `responseSchema` and `systemInstruction` automatically 257 | 3. **Consistency**: Same config format across all providers (Gemini, Vertex AI) 258 | 4. **Maintainability**: Adding new config options only requires updating one method 259 | 5. 
**Type Safety**: Centralized TypeScript type checking 260 | 261 | **This pattern is critical for:** 262 | - Object detection (`detect_objects_in_image`) - requires structured JSON output 263 | - Any future tools that need custom response schemas 264 | - Maintaining consistency between Gemini and Vertex AI providers 265 | 266 | **Reference Implementation:** 267 | - Helper method: `src/providers/base/VisionProvider.ts:354-395` 268 | - Usage in Gemini: `src/providers/gemini/GeminiProvider.ts:185-189, 348-352, 468-472` 269 | - Usage in Vertex AI: `src/providers/vertexai/VertexAIProvider.ts:84-88, 161-165, 246-250` 270 | 271 | ## Environment Variables 272 | 273 | **Required for Development:** 274 | - `IMAGE_PROVIDER` and `VIDEO_PROVIDER`: Set to `google` or `vertex_ai` 275 | - Provider-specific credentials (GEMINI_API_KEY or VERTEX_CREDENTIALS + GCS_BUCKET_NAME) 276 | 277 | **Common Development Overrides:** 278 | - `TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE=0` for deterministic object detection 279 | - `LOG_LEVEL=debug` for verbose logging during development 280 | - `NODE_ENV=development` for development-specific behavior 281 | 282 | ## Testing and Debugging 283 | 284 | - Use `npm run dev` for development with automatic rebuilding 285 | - Check console logs for detailed file processing information 286 | - Verify configuration hierarchy by setting different levels of environment variables 287 | - Test with multiple file sources (URLs, local files, base64) to ensure compatibility 288 | - Use structured logging patterns for consistent debugging output -------------------------------------------------------------------------------- /src/providers/base/VisionProvider.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Base vision provider interface and abstract class 3 | */ 4 | 5 | import type { 6 | VisionProvider, 7 | AnalysisOptions, 8 | AnalysisResult, 9 | UploadedFile, 10 | HealthStatus, 11 | ProviderCapabilities, 12 | 
ModelCapabilities, 13 | ProviderInfo, 14 | } from '../../types/Providers.js'; 15 | import type { TaskType } from '../../types/Analysis.js'; 16 | import { type FunctionName } from '../../constants/FunctionNames.js'; 17 | import { ConfigService } from '../../services/ConfigService.js'; 18 | 19 | export abstract class BaseVisionProvider implements VisionProvider { 20 | protected imageModel: string; 21 | protected videoModel: string; 22 | protected providerName: string; 23 | protected configService: ConfigService; 24 | 25 | constructor(providerName: string, imageModel: string, videoModel: string) { 26 | this.providerName = providerName; 27 | this.imageModel = imageModel; 28 | this.videoModel = videoModel; 29 | this.configService = ConfigService.getInstance(); 30 | } 31 | 32 | // Abstract methods that must be implemented by concrete providers 33 | abstract analyzeImage( 34 | imageSource: string, 35 | prompt: string, 36 | options?: AnalysisOptions 37 | ): Promise; 38 | abstract analyzeVideo( 39 | videoSource: string, 40 | prompt: string, 41 | options?: AnalysisOptions 42 | ): Promise; 43 | abstract compareImages( 44 | imageSources: string[], 45 | prompt: string, 46 | options?: AnalysisOptions 47 | ): Promise; 48 | abstract uploadFile( 49 | buffer: Buffer, 50 | filename: string, 51 | mimeType: string 52 | ): Promise; 53 | abstract downloadFile(fileId: string): Promise; 54 | abstract deleteFile(fileId: string): Promise; 55 | abstract getSupportedFormats(): ProviderCapabilities; 56 | abstract getModelCapabilities(): ModelCapabilities; 57 | abstract getProviderInfo(): ProviderInfo; 58 | abstract healthCheck(): Promise; 59 | 60 | // Concrete implementations for common functionality 61 | setModel(imageModel: string, videoModel: string): void { 62 | this.imageModel = imageModel; 63 | this.videoModel = videoModel; 64 | } 65 | 66 | getImageModel(): string { 67 | return this.imageModel; 68 | } 69 | 70 | getVideoModel(): string { 71 | return this.videoModel; 72 | } 73 | 74 | 
/**
 * Whether this provider can analyze video inputs, per its declared capabilities.
 */
supportsVideo(): boolean {
  const capabilities = this.getSupportedFormats();
  return capabilities.supportsVideo;
}

/**
 * Assemble a normalized AnalysisResult from a raw model response.
 *
 * @param text - The model's textual output.
 * @param model - Model identifier that produced the response.
 * @param usage - Optional token accounting reported by the API.
 * @param processingTime - Optional wall-clock duration of the call, in ms.
 * @param fileType - Optional MIME type of the analyzed file.
 * @param fileSize - Optional size in bytes of the analyzed file.
 * @param modelVersion - Optional exact model version string from the response.
 * @param responseId - Optional provider-assigned response identifier.
 */
protected createAnalysisResult(
  text: string,
  model: string,
  usage?: {
    promptTokenCount: number;
    candidatesTokenCount: number;
    totalTokenCount: number;
  },
  processingTime?: number,
  fileType?: string,
  fileSize?: number,
  modelVersion?: string,
  responseId?: string
): AnalysisResult {
  return {
    text,
    metadata: {
      model,
      provider: this.providerName,
      usage,
      processingTime,
      fileType,
      fileSize,
      modelVersion,
      responseId,
    },
  };
}

/**
 * Build a HealthStatus snapshot stamped with the current time (ISO 8601).
 *
 * @param status - Overall health verdict.
 * @param responseTime - Optional probe round-trip time, in ms.
 * @param message - Optional human-readable detail.
 */
protected createHealthStatus(
  status: 'healthy' | 'unhealthy' | 'degraded',
  responseTime?: number,
  message?: string
): HealthStatus {
  return {
    status,
    lastCheck: new Date().toISOString(),
    responseTime,
    message,
  };
}

/**
 * Run an async operation and measure its wall-clock duration.
 *
 * NOTE(review): the dump shows `Promise` with its type parameter stripped;
 * the `<T>` generic is restored here — `T` is already referenced in the
 * return type, so this matches the original intent.
 *
 * @param operation - The async operation to time.
 * @returns The operation's result together with its duration in ms.
 */
protected async measureAsync<T>(
  operation: () => Promise<T>
): Promise<{ result: T; duration: number }> {
  const startTime = Date.now();
  const result = await operation();
  const duration = Date.now() - startTime;
  return { result, duration };
}

/** True when the MIME type denotes an image (any `image/*`). */
protected isValidImageFormat(mimeType: string): boolean {
  return mimeType.startsWith('image/');
}

/** True when the MIME type denotes a video (any `video/*`). */
protected isValidVideoFormat(mimeType: string): boolean {
  return mimeType.startsWith('video/');
}

/**
 * Default capability envelope shared by providers; concrete providers may
 * override. Sizes and duration are hard limits enforced elsewhere.
 */
protected getProviderCapabilities(): ProviderCapabilities {
  return {
    supportedImageFormats: [
      'png',
      'jpg',
      'jpeg',
      'webp',
      'gif',
      'bmp',
      'tiff',
    ],
    supportedVideoFormats: [
      'mp4',
      'mov',
      'avi',
      'mkv',
      'webm',
      'flv',
      'wmv',
      '3gp',
    ],
    maxImageSize: 20 * 1024 * 1024, // 20MB
    maxVideoSize: 2 * 1024 * 1024 * 1024, // 2GB
    maxVideoDuration: 3600, // 1 hour
    supportsVideo: true,
    supportsFileUpload: true,
  };
}

/**
 * Baseline model capabilities derived from the provider's supported formats.
 * Token defaults are placeholders that concrete providers override.
 */
protected getBaseModelCapabilities(): ModelCapabilities {
  return {
    imageAnalysis: true,
    videoAnalysis: this.supportsVideo(),
    maxTokensForImage: 500, // Default, will be overridden by specific providers
    maxTokensForVideo: 2000, // Default, will be overridden by specific providers
    supportedFormats: this.getSupportedFormats().supportedImageFormats.concat(
      this.supportsVideo()
        ? this.getSupportedFormats().supportedVideoFormats
        : []
    ),
  };
}

/**
 * Resolve a generation parameter through the configuration hierarchy.
 *
 * Priority: LLM-assigned (directValue) > task-specific > universal > default.
 *
 * @param taskType - 'image' or 'video'.
 * @param directValue - Value passed explicitly by the caller, if any.
 * @param getTaskSpecificValue - Looks up a per-task configured value.
 * @param getUniversalValue - Looks up the universal configured value.
 * @param defaultValue - Final fallback when nothing is configured.
 */
protected resolveParameter(
  taskType: TaskType,
  directValue: number | undefined,
  getTaskSpecificValue: (taskType: TaskType) => number | undefined,
  getUniversalValue: () => number,
  defaultValue: number
): number {
  // Priority hierarchy: LLM-assigned > task-specific > universal > default
  if (directValue !== undefined) {
    return directValue;
  }

  const taskSpecificValue = getTaskSpecificValue(taskType);
  if (taskSpecificValue !== undefined) {
    return taskSpecificValue;
  }

  // FIX: use ?? rather than || so that a legitimately configured value of 0
  // (e.g. temperature 0 or topK 0) is not silently replaced by the default.
  // ?? also still covers a runtime undefined despite the declared type.
  return getUniversalValue() ?? defaultValue;
}

/**
 * Resolve a generation parameter with an extra function-specific tier.
 *
 * Priority: LLM-assigned > function-specific > task-specific > universal >
 * default.
 *
 * @param taskType - 'image' or 'video'.
 * @param functionName - The tool function being served, when known.
 * @param directValue - Value passed explicitly by the caller, if any.
 * @param getFunctionSpecificValue - Looks up a per-function configured value.
 * @param getTaskSpecificValue - Looks up a per-task configured value.
 * @param getUniversalValue - Looks up the universal configured value.
 * @param defaultValue - Final fallback when nothing is configured.
 */
protected resolveParameterWithFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined,
  getFunctionSpecificValue: (
    functionName: FunctionName
  ) => number | undefined,
  getTaskSpecificValue: (taskType: TaskType) => number | undefined,
  getUniversalValue: () => number,
  defaultValue: number
): number {
  // Priority hierarchy: LLM-assigned > function-specific > task-specific > universal > default
  if (directValue !== undefined) {
    return directValue;
  }

  if (functionName) {
    const functionSpecificValue = getFunctionSpecificValue(functionName);
    if (functionSpecificValue !== undefined) {
      return functionSpecificValue;
    }
  }

  const taskSpecificValue = getTaskSpecificValue(taskType);
  if (taskSpecificValue !== undefined) {
    return taskSpecificValue;
  }

  // FIX: ?? instead of || — see resolveParameter; 0 is a valid config value.
  return getUniversalValue() ?? defaultValue;
}

/** Resolve sampling temperature for a task (fallback default 0.8). */
protected resolveTemperature(
  taskType: TaskType,
  directValue: number | undefined
): number {
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getTemperatureForTask.bind(this.configService),
    () => this.configService.getApiConfig().temperature,
    0.8
  );
}

/** Resolve nucleus-sampling topP for a task (fallback default 0.95). */
protected resolveTopP(
  taskType: TaskType,
  directValue: number | undefined
): number {
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getTopPForTask.bind(this.configService),
    () => this.configService.getApiConfig().topP,
    0.95
  );
}

/** Resolve topK sampling for a task (fallback default 30). */
protected resolveTopK(
  taskType: TaskType,
  directValue: number | undefined
): number {
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getTopKForTask.bind(this.configService),
    () => this.configService.getApiConfig().topK,
    30
  );
}

/** Resolve max output tokens for a task (500 for image, 2000 for video). */
protected resolveMaxTokens(
  taskType: TaskType,
  directValue: number | undefined
): number {
  const defaultValue = taskType === 'image' ? 500 : 2000;
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getMaxTokensForTask.bind(this.configService),
    () => this.configService.getApiConfig().maxTokens,
    defaultValue
  );
}

// Function-specific resolution methods

/** Resolve temperature honoring the function-specific tier (default 0.8). */
protected resolveTemperatureForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getTemperatureForFunction.bind(this.configService),
    this.configService.getTemperatureForTask.bind(this.configService),
    () => this.configService.getApiConfig().temperature,
    0.8
  );
}

/** Resolve topP honoring the function-specific tier (default 0.95). */
protected resolveTopPForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getTopPForFunction.bind(this.configService),
    this.configService.getTopPForTask.bind(this.configService),
    () => this.configService.getApiConfig().topP,
    0.95
  );
}

/** Resolve topK honoring the function-specific tier (default 30). */
protected resolveTopKForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getTopKForFunction.bind(this.configService),
    this.configService.getTopKForTask.bind(this.configService),
    () => this.configService.getApiConfig().topK,
    30
  );
}

/**
 * Resolve max output tokens honoring the function-specific tier
 * (500 for image, 2000 for video when nothing is configured).
 */
protected resolveMaxTokensForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  const defaultValue = taskType === 'image' ? 500 : 2000;
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getMaxTokensForFunction.bind(this.configService),
    this.configService.getMaxTokensForTask.bind(this.configService),
    () => this.configService.getApiConfig().maxTokens,
    defaultValue
  );
}

/**
 * Build config object with all standard options including structured output support
 * @param taskType - 'image' or 'video'
 * @param functionName - Specific function being called (for function-specific config)
 * @param options - Analysis options from caller
 * @returns Config object ready for API call
 */
protected buildConfigWithOptions(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  options?: AnalysisOptions
): any {
  const config: any = {
    temperature: this.resolveTemperatureForFunction(
      taskType,
      functionName,
      options?.temperature
    ),
    topP: this.resolveTopPForFunction(taskType, functionName, options?.topP),
    topK: this.resolveTopKForFunction(taskType, functionName, options?.topK),
    maxOutputTokens: this.resolveMaxTokensForFunction(
      taskType,
      functionName,
      options?.maxTokens
    ),
    candidateCount: 1,
  };

  // Add structured output configuration if responseSchema is provided
  if (options?.responseSchema) {
    config.responseMimeType = 'application/json';
    config.responseSchema = options.responseSchema;
  }

  // Add system instruction if provided
  if (options?.systemInstruction) {
    config.systemInstruction = options.systemInstruction;
  }

  // Add thinking budget configuration for Gemini models
  const model = this.resolveModelForFunction(taskType, functionName);
  const thinkingBudget = this.getThinkingBudgetForModel(model);
  if (thinkingBudget !== undefined) {
    config.thinkingConfig = {
      thinkingBudget: thinkingBudget,
    };
  }

  return config;
}

// Function-specific model resolution methods

/**
 * Resolve the model name for a task and (optionally) a specific function.
 *
 * Priority hierarchy: Function-specific > Task-specific > System default.
 */
protected resolveModelForFunction(
  taskType: 'image' | 'video',
  functionName: FunctionName | undefined
): string {
  const systemDefault =
    taskType === 'image' ? 'gemini-2.5-flash-lite' : 'gemini-2.5-flash';

  if (functionName) {
    const functionSpecificModel =
      this.configService.getModelForFunction(functionName);
    if (functionSpecificModel) {
      return functionSpecificModel;
    }
  }

  const taskSpecificModel = this.getModelForTask(taskType);
  if (taskSpecificModel) {
    return taskSpecificModel;
  }

  return systemDefault;
}

/** Per-task model override from config (IMAGE_MODEL / VIDEO_MODEL), if set. */
private getModelForTask(taskType: 'image' | 'video'): string | undefined {
  return taskType === 'image'
    ? this.configService.getConfig().IMAGE_MODEL
    : this.configService.getConfig().VIDEO_MODEL;
}

/**
 * Determine the appropriate thinking budget for Gemini model variants
 * Applies to both Gemini API and Vertex AI providers when using Gemini models
 * Based on user requirements:
 * - gemini-2.5-flash-lite and gemini-2.5-flash: thinking_budget = 0
 * - gemini-2.5-pro: thinking_budget = 128
 * - Other models: no thinking budget (undefined)
 * @param model - The model name
 * @returns thinking budget value or undefined if not applicable
 */
protected getThinkingBudgetForModel(model: string): number | undefined {
  // Only apply thinking budget to specific Gemini model variants
  // This works for both direct Gemini API and Vertex AI when using Gemini models
  // NOTE(review): the first check is redundant ('gemini-2.5-flash-lite'
  // contains 'gemini-2.5-flash') but harmless; kept for readability.
  if (
    model.includes('gemini-2.5-flash-lite') ||
    model.includes('gemini-2.5-flash')
  ) {
    // For flash models, use minimal thinking budget for faster response
    return 0;
  } else if (model.includes('gemini-2.5-pro')) {
    // For pro models, use higher thinking budget for better reasoning
    return 128;
  }

  // For other models (older Gemini versions, non-Gemini models), don't set thinking budget
  return undefined;
}
}
--------------------------------------------------------------------------------
/docs/llm_logs/web-context-detection-plan.md:
--------------------------------------------------------------------------------

# Plan: Web Context Detection and HTML Element Classification

**Date**: 2025-01-10
**Author**: Claude Code
**Issue**: Enhance object detection to automatically detect web page contexts and use appropriate HTML element names for better automation compatibility

## Problem Statement

The current object detection system uses generic element names regardless of context:

1. 
**Context-Agnostic Naming**: Uses "button", "input", "text" for all interfaces (web, mobile, desktop) 12 | 2. **Missed Semantic Opportunities**: Web pages could benefit from HTML-specific element names 13 | 3. **Automation Mismatch**: Generic names don't align with CSS selector targeting for web automation 14 | 4. **Limited Specificity**: Cannot distinguish between HTML input types (text, email, password, etc.) 15 | 16 | ## Solution: Context-Aware System Instructions 17 | 18 | ### Core Approach 19 | - **Automatic Web Detection**: Enhance system instructions to identify web page interfaces 20 | - **HTML Element Classification**: Use semantic HTML element names when web context is detected 21 | - **Fallback Mechanism**: Maintain current generic naming for non-web contexts 22 | - **Progressive Enhancement**: Start with basic detection, enhance over time 23 | 24 | ## Implementation Strategy 25 | 26 | ### Phase 1: Enhanced System Instructions (Day 1-2) 27 | 28 | **Objective**: Modify the detection system instruction to include web context detection logic. 29 | 30 | **Key Changes**: 31 | 1. **Context Detection Prompting**: Add web interface identification step 32 | 2. **HTML Element Vocabulary**: Provide comprehensive HTML element list 33 | 3. **Conditional Logic**: Use HTML names for web contexts, generic names otherwise 34 | 4. **Input Type Specificity**: Detect specific input types when possible 35 | 36 | **Updated System Instruction Structure**: 37 | ``` 38 | 1. CONTEXT DETECTION: 39 | - Analyze if image shows a web page, browser interface, or web application 40 | - Look for indicators: address bars, browser UI, web-style layouts, form elements 41 | - If web context detected → use HTML element names 42 | - If non-web context → use generic object names 43 | 44 | 2. 
HTML ELEMENT CLASSIFICATION (Web Context Only): 45 | - Interactive Elements: button, input[type], select, textarea, a 46 | - Form Elements: form, label, fieldset, legend 47 | - Structural: nav, header, footer, main, section, article 48 | - Content: h1-h6, p, img, video, ul, ol, li 49 | 50 | 3. INPUT TYPE DETECTION: 51 | - Analyze visual cues for input specificity 52 | - text, email, password, search, tel, url, number, date 53 | - checkbox, radio, file, submit, reset 54 | 55 | 4. FALLBACK NAMING: 56 | - Non-web contexts: button, text, image, icon, object, container 57 | ``` 58 | 59 | ### Phase 2: Web Context Indicators (Day 2-3) 60 | 61 | **Visual Indicators for Web Detection**: 62 | - **Browser Elements**: Address bar, navigation buttons, tabs, bookmarks bar 63 | - **Web UI Patterns**: Navigation menus, breadcrumbs, pagination, form layouts 64 | - **Typography**: Web fonts, text rendering typical of browsers 65 | - **Layout Patterns**: Grid systems, responsive design indicators, web-style spacing 66 | - **Form Elements**: Standard HTML form controls with web styling 67 | 68 | **Detection Logic**: 69 | ```typescript 70 | const WEB_CONTEXT_INDICATORS = [ 71 | // Browser UI 72 | 'address bar', 'url bar', 'browser tab', 'bookmark bar', 73 | 'browser window', 'navigation buttons', 74 | 75 | // Web Interface Patterns 76 | 'navigation menu', 'breadcrumb', 'pagination', 'web form', 77 | 'login form', 'search bar', 'dropdown menu', 'checkbox', 78 | 'radio button', 'submit button', 'hyperlink', 79 | 80 | // Layout Patterns 81 | 'web page', 'website', 'web application', 'responsive design', 82 | 'grid layout', 'sidebar', 'header', 'footer', 'navigation' 83 | ]; 84 | ``` 85 | 86 | ### Phase 3: HTML Element Mapping (Day 3-4) 87 | 88 | **Interactive Elements** (High Priority): 89 | ``` 90 | button →