├── CLAUDE.md ├── src ├── providers │ ├── base │ │ ├── index.ts │ │ └── VisionProvider.ts │ ├── gemini │ │ └── index.ts │ ├── vertexai │ │ └── index.ts │ └── factory │ │ └── ProviderFactory.ts ├── utils │ ├── index.ts │ ├── credentialsParser.ts │ ├── imageAnnotator.ts │ ├── retry.ts │ └── validation.ts ├── storage │ ├── gcs │ │ ├── index.ts │ │ └── GCSStorage.ts │ └── index.ts ├── index.ts ├── tools │ ├── index.ts │ ├── analyze_video.ts │ ├── analyze_image.ts │ └── compare_images.ts ├── constants │ └── FunctionNames.ts ├── types │ ├── index.ts │ ├── Storage.ts │ ├── ObjectDetection.ts │ ├── Analysis.ts │ ├── Config.ts │ ├── Errors.ts │ └── Providers.ts ├── file-upload │ ├── vertexai │ │ └── VertexAIStorageStrategy.ts │ ├── gemini │ │ └── GeminiFilesAPI.ts │ └── factory │ │ └── FileUploadFactory.ts └── services │ └── FileService.ts ├── .prettierrc ├── .eslintrc.json ├── .claude └── settings.local.json ├── .gitignore ├── docs ├── TASK.md ├── PLAN.md ├── llm_logs │ ├── dynamic-schema-validation-fix-plan.md │ ├── vertex-ai-auth-fix-plan.md │ ├── percentage-scale-text-summary-plan.md │ └── web-context-detection-plan.md └── environment-variable-guide.md ├── LICENSE ├── tsconfig.json ├── package.json ├── .env.example ├── AGENTS.md └── README.md /CLAUDE.md: -------------------------------------------------------------------------------- 1 | AGENTS.md -------------------------------------------------------------------------------- /src/providers/base/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Base provider exports 3 | */ 4 | 5 | export * from './VisionProvider.js'; 6 | -------------------------------------------------------------------------------- /src/providers/gemini/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Gemini provider exports 3 | */ 4 | 5 | export * from './GeminiProvider.js'; 6 | 
-------------------------------------------------------------------------------- /src/providers/vertexai/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Vertex AI provider exports 3 | */ 4 | 5 | export * from './VertexAIProvider.js'; 6 | -------------------------------------------------------------------------------- /src/utils/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Utility exports 3 | */ 4 | 5 | export * from './validation.js'; 6 | export * from './retry.js'; 7 | -------------------------------------------------------------------------------- /src/storage/gcs/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Google Cloud Storage exports 3 | */ 4 | 5 | export { GCSStorageProvider } from './GCSStorage.js'; 6 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Main entry point for the AI Vision MCP server 5 | */ 6 | 7 | // Import and start the server 8 | import './server.js'; 9 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "trailingComma": "es5", 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2, 7 | "useTabs": false, 8 | "bracketSpacing": true, 9 | "arrowParens": "avoid", 10 | "endOfLine": "lf" 11 | } -------------------------------------------------------------------------------- /src/storage/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Storage providers exports 3 | */ 4 | 5 | export { 6 | StorageProvider, 7 | StorageFile, 8 | StorageConfig, 9 | } from '../types/Storage.js'; 10 | export { 
GCSStorageProvider } from './gcs/GCSStorage.js'; 11 | export type { GCSConfig } from '../types/Config.js'; 12 | -------------------------------------------------------------------------------- /src/tools/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Tools exports 3 | */ 4 | 5 | export { analyze_image, type AnalyzeImageArgs } from './analyze_image.js'; 6 | export { analyze_video, type AnalyzeVideoArgs } from './analyze_video.js'; 7 | export { 8 | detect_objects_in_image, 9 | type ObjectDetectionArgs, 10 | } from './detect_objects_in_image.js'; 11 | export { compare_images, type CompareImagesArgs } from './compare_images.js'; 12 | -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "es2022": true, 4 | "node": true, 5 | "jest": true 6 | }, 7 | "extends": [ 8 | "eslint:recommended" 9 | ], 10 | "parser": "@typescript-eslint/parser", 11 | "parserOptions": { 12 | "ecmaVersion": "latest", 13 | "sourceType": "module" 14 | }, 15 | "plugins": [ 16 | "@typescript-eslint" 17 | ], 18 | "rules": { 19 | "no-unused-vars": "off", 20 | "@typescript-eslint/no-unused-vars": "warn", 21 | "@typescript-eslint/no-explicit-any": "off", 22 | "no-console": "off", 23 | "prefer-const": "error", 24 | "no-case-declarations": "off", 25 | "no-useless-escape": "off" 26 | }, 27 | "ignorePatterns": ["dist/", "node_modules/", "*.js", "**/__tests__/**/*"] 28 | } -------------------------------------------------------------------------------- /.claude/settings.local.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "WebSearch", 5 | "WebFetch", 6 | "mcp__context7__resolve-library-id", 7 | "mcp__context7__get-library-docs", 8 | "Bash(npm run build:*)", 9 | "Bash(npm install:*)", 10 | "Bash(npm run lint:*)", 11 | 
"Bash(npm run format:*)", 12 | "Bash(npm run prepublishOnly:*)", 13 | "Bash(npx:*)", 14 | "Bash(mcp-inspector:*)", 15 | "Bash(node:*)", 16 | "mcp__ai-vision-mcp__analyze_image", 17 | "Bash(cat .env)", 18 | "mcp__ai-vision-mcp__analyze_video", 19 | "mcp__ai-vision-mcp__compare_images", 20 | "mcp__claude-context__index_codebase", 21 | "mcp__claude-context__search_code" 22 | ], 23 | "deny": [], 24 | "ask": [] 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Dependencies 4 | node_modules/ 5 | npm-debug.log* 6 | yarn-debug.log* 7 | yarn-error.log* 8 | 9 | # Build outputs 10 | dist/ 11 | build/ 12 | *.tsbuildinfo 13 | 14 | # Environment variables 15 | .env 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | # IDE 22 | .vscode/ 23 | .idea/ 24 | *.swp 25 | *.swo 26 | *~ 27 | 28 | # OS 29 | .DS_Store 30 | .DS_Store? 31 | ._* 32 | .Spotlight-V100 33 | .Trashes 34 | ehthumbs.db 35 | Thumbs.db 36 | 37 | # Logs 38 | logs 39 | *.log 40 | 41 | # Coverage 42 | coverage/ 43 | coverage-integration/ 44 | 45 | # Jest 46 | .nyc_output 47 | 48 | # Temporary files 49 | *.tmp 50 | *.temp 51 | 52 | # mcp-related configuration 53 | mcp-publisher.exe 54 | .mcpregistry_github_token 55 | .mcpregistry_registry_token -------------------------------------------------------------------------------- /docs/TASK.md: -------------------------------------------------------------------------------- 1 | ## Tasks to do: 2 | 3 | 1. what this image is about: https://images.pexels.com/photos/1391498/pexels-photo-1391498.jpeg 4 | 5 | 2. what this image is about: "C:\Users\tys\Downloads\rqo6ns_1024x1024.jpg" 6 | 7 | 3. compare the images: "C:\Users\tys\Downloads\rqo6ns_1024x1024.jpg" and https://images.pexels.com/photos/1391498/pexels-photo-1391498.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500 8 | 9 | 4. 
compare the images: https://img.freepik.com/free-photo/beautiful-girl-stands-park_8353-5084.jpg?semt=ais_hybrid&w=740&q=80 and https://images.pexels.com/photos/1391498/pexels-photo-1391498.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500 10 | 11 | 5. Detect the dress wear: https://ichef.bbci.co.uk/images/ic/480xn/p0529h01.jpg 12 | 13 | 6. Detect all buttons in this webpage: "C:\Users\tys\Downloads\export\static\input\tanyongsheng_screenshot.png" 14 | 15 | 7. what this video is about: https://www.youtube.com/watch?v=9hE5-98ZeCg 16 | 17 | 8. what this video is about: "C:\Users\tys\Downloads\test.mp4" 18 | -------------------------------------------------------------------------------- /src/constants/FunctionNames.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Centralized function name constants to avoid hard-coding 3 | */ 4 | 5 | export const FUNCTION_NAMES = { 6 | ANALYZE_IMAGE: 'analyze_image', 7 | COMPARE_IMAGES: 'compare_images', 8 | DETECT_OBJECTS_IN_IMAGE: 'detect_objects_in_image', 9 | ANALYZE_VIDEO: 'analyze_video', 10 | } as const; 11 | 12 | export type FunctionName = (typeof FUNCTION_NAMES)[keyof typeof FUNCTION_NAMES]; 13 | 14 | // Union types for different function groups 15 | export const IMAGE_FUNCTIONS = [ 16 | FUNCTION_NAMES.ANALYZE_IMAGE, 17 | FUNCTION_NAMES.COMPARE_IMAGES, 18 | FUNCTION_NAMES.DETECT_OBJECTS_IN_IMAGE, 19 | ] as const; 20 | 21 | export const VIDEO_FUNCTIONS = [FUNCTION_NAMES.ANALYZE_VIDEO] as const; 22 | 23 | export const ALL_FUNCTIONS = [...IMAGE_FUNCTIONS, ...VIDEO_FUNCTIONS] as const; 24 | 25 | // Type helpers 26 | export type ImageFunctionName = (typeof IMAGE_FUNCTIONS)[number]; 27 | export type VideoFunctionName = (typeof VIDEO_FUNCTIONS)[number]; 28 | export type AllFunctionName = (typeof ALL_FUNCTIONS)[number]; 29 | -------------------------------------------------------------------------------- /src/types/index.ts: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Central type exports for the Vision MCP Server 3 | */ 4 | 5 | // Export Config types with aliases to avoid conflicts 6 | export type { 7 | Config, 8 | GeminiConfig as GeminiProviderConfig, 9 | VertexAIConfig as VertexAIProviderConfig, 10 | FileUploadConfig, 11 | ApiConfig, 12 | FileProcessingConfig, 13 | LoggingConfig, 14 | DevelopmentConfig, 15 | } from './Config.js'; 16 | 17 | // Export all other types normally 18 | export * from './Analysis.js'; 19 | export type { 20 | StorageProvider, 21 | StorageFile, 22 | StorageConfig, 23 | UploadOptions, 24 | ListOptions, 25 | ListResult, 26 | SignedUrlOptions, 27 | } from './Storage.js'; 28 | export * from './Providers.js'; 29 | export type { 30 | VisionError, 31 | ConfigurationError, 32 | ProviderError, 33 | FileUploadError, 34 | FileNotFoundError, 35 | UnsupportedFileTypeError, 36 | FileSizeExceededError, 37 | RateLimitExceededError, 38 | AuthenticationError, 39 | AuthorizationError, 40 | NetworkError, 41 | ValidationError, 42 | StorageError, 43 | ErrorType, 44 | ErrorDetails, 45 | } from './Errors.js'; 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 AI Vision MCP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "ESNext", 5 | "moduleResolution": "node", 6 | "lib": ["ES2022"], 7 | "outDir": "./dist", 8 | "rootDir": "./src", 9 | "strict": true, 10 | "esModuleInterop": true, 11 | "skipLibCheck": true, 12 | "forceConsistentCasingInFileNames": true, 13 | "declaration": true, 14 | "declarationMap": true, 15 | "sourceMap": true, 16 | "removeComments": true, 17 | "noImplicitAny": true, 18 | "noImplicitReturns": true, 19 | "noImplicitThis": true, 20 | // "noUnusedLocals": true, 21 | // "noUnusedParameters": true, 22 | // "exactOptionalPropertyTypes": true, 23 | "noImplicitOverride": true, 24 | // "noPropertyAccessFromIndexSignature": true, 25 | // "noUncheckedIndexedAccess": true, 26 | "resolveJsonModule": true, 27 | "allowSyntheticDefaultImports": true, 28 | "experimentalDecorators": true, 29 | "emitDecoratorMetadata": true, 30 | "baseUrl": ".", 31 | "paths": { 32 | "@/*": ["src/*"] 33 | } 34 | }, 35 | "include": ["src/**/*"], 36 | "exclude": ["node_modules", "dist", "**/*.test.ts", "**/*.spec.ts"] 37 | } -------------------------------------------------------------------------------- /src/types/Storage.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Storage types for cloud storage providers 3 | 
*/ 4 | 5 | export interface StorageProvider { 6 | uploadFile( 7 | buffer: Buffer, 8 | filename: string, 9 | mimeType: string 10 | ): Promise; 11 | downloadFile(fileId: string): Promise; 12 | deleteFile(fileId: string): Promise; 13 | getPublicUrl(fileId: string): Promise; 14 | getSignedUrl(fileId: string, expiresIn: number): Promise; 15 | listFiles(prefix?: string): Promise; 16 | } 17 | 18 | export interface StorageFile { 19 | id: string; 20 | filename: string; 21 | mimeType: string; 22 | size: number; 23 | url: string; 24 | lastModified: string; 25 | etag?: string; 26 | metadata?: Record; 27 | } 28 | 29 | export interface StorageConfig { 30 | accessKey: string; 31 | secretKey: string; 32 | region: string; 33 | bucket: string; 34 | endpoint: string; 35 | cdnUrl?: string; 36 | forcePathStyle?: boolean; 37 | signatureVersion?: string; 38 | } 39 | 40 | export interface UploadOptions { 41 | metadata?: Record; 42 | contentType?: string; 43 | cacheControl?: string; 44 | expires?: Date; 45 | tags?: Record; 46 | } 47 | 48 | export interface ListOptions { 49 | prefix?: string; 50 | maxKeys?: number; 51 | continuationToken?: string; 52 | } 53 | 54 | export interface ListResult { 55 | files: StorageFile[]; 56 | isTruncated: boolean; 57 | nextContinuationToken?: string; 58 | count: number; 59 | } 60 | 61 | export interface SignedUrlOptions { 62 | expiresIn: number; 63 | method?: 'GET' | 'PUT' | 'DELETE'; 64 | contentType?: string; 65 | checksum?: string; 66 | } 67 | 68 | export interface StorageError extends Error { 69 | code: string; 70 | statusCode?: number; 71 | region?: string; 72 | time: Date; 73 | request_id?: string; 74 | } 75 | -------------------------------------------------------------------------------- /src/file-upload/vertexai/VertexAIStorageStrategy.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Vertex AI storage upload strategy using external storage 3 | */ 4 | 5 | import type { 6 | FileUploadStrategy, 7 | 
UploadedFile, 8 | FileReference, 9 | } from '../../types/Providers.js'; 10 | import type { StorageProvider } from '../../types/Storage.js'; 11 | import { FileUploadError } from '../../types/Errors.js'; 12 | 13 | export class VertexAIStorageStrategy implements FileUploadStrategy { 14 | constructor(private storageProvider: StorageProvider) {} 15 | 16 | async uploadFile( 17 | buffer: Buffer, 18 | filename: string, 19 | mimeType: string 20 | ): Promise { 21 | try { 22 | return await this.storageProvider.uploadFile(buffer, filename, mimeType); 23 | } catch (error) { 24 | throw new FileUploadError( 25 | `Failed to upload file to external storage: ${error instanceof Error ? error.message : String(error)}`, 26 | 'vertex_ai', 27 | error instanceof Error ? error : new Error(String(error)) 28 | ); 29 | } 30 | } 31 | 32 | async getFileForAnalysis(uploadedFile: UploadedFile): Promise { 33 | if (!uploadedFile.id) { 34 | throw new FileUploadError( 35 | 'Uploaded file does not have an ID for analysis', 36 | 'vertex_ai' 37 | ); 38 | } 39 | 40 | // For Vertex AI with native GCS, the URL is already in gs:// format 41 | const gcsUri = await this.storageProvider.getPublicUrl(uploadedFile.id); 42 | 43 | return { 44 | type: 'file_uri', 45 | uri: gcsUri, 46 | mimeType: uploadedFile.mimeType, 47 | }; 48 | } 49 | 50 | async cleanup(fileId: string): Promise { 51 | try { 52 | await this.storageProvider.deleteFile(fileId); 53 | } catch (error) { 54 | // Log error but don't throw - cleanup failures shouldn't block the main flow 55 | console.warn(`Failed to cleanup storage file ${fileId}:`, error); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/file-upload/gemini/GeminiFilesAPI.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Gemini Files API upload strategy 3 | */ 4 | 5 | import type { 6 | FileUploadStrategy, 7 | UploadedFile, 8 | FileReference, 9 | } from 
'../../types/Providers.js'; 10 | import { GeminiProvider } from '../../providers/gemini/GeminiProvider.js'; 11 | import { FileUploadError } from '../../types/Errors.js'; 12 | 13 | export class GeminiFilesAPI implements FileUploadStrategy { 14 | constructor(private geminiProvider: GeminiProvider) {} 15 | 16 | async uploadFile( 17 | buffer: Buffer, 18 | filename: string, 19 | mimeType: string 20 | ): Promise { 21 | try { 22 | return await this.geminiProvider.uploadFile(buffer, filename, mimeType); 23 | } catch (error) { 24 | throw new FileUploadError( 25 | `Failed to upload file to Gemini Files API: ${error instanceof Error ? error.message : String(error)}`, 26 | 'gemini', 27 | error instanceof Error ? error : new Error(String(error)) 28 | ); 29 | } 30 | } 31 | 32 | async getFileForAnalysis(uploadedFile: UploadedFile): Promise { 33 | if (!uploadedFile.uri) { 34 | throw new FileUploadError( 35 | 'Uploaded file does not have a URI for analysis', 36 | 'gemini' 37 | ); 38 | } 39 | 40 | // Wait for the file to become ACTIVE before returning it for analysis 41 | await this.waitForFileProcessing(uploadedFile.id); 42 | 43 | return { 44 | type: 'file_uri', 45 | uri: uploadedFile.uri, 46 | mimeType: uploadedFile.mimeType, 47 | }; 48 | } 49 | 50 | private async waitForFileProcessing(fileId: string): Promise { 51 | await this.geminiProvider.waitForFileProcessing(fileId); 52 | } 53 | 54 | async cleanup(fileId: string): Promise { 55 | try { 56 | await this.geminiProvider.deleteFile(fileId); 57 | } catch (error) { 58 | // Log error but don't throw - cleanup failures shouldn't block the main flow 59 | console.warn(`Failed to cleanup Gemini file ${fileId}:`, error); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/file-upload/factory/FileUploadFactory.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * File upload strategy factory 3 | */ 4 | 5 | import type { 
FileUploadStrategy } from '../../types/Providers.js'; 6 | import type { Config } from '../../types/Config.js'; 7 | import { GeminiFilesAPI } from '../gemini/GeminiFilesAPI.js'; 8 | import { VertexAIStorageStrategy } from '../vertexai/VertexAIStorageStrategy.js'; 9 | import { GCSStorageProvider } from '../../storage/gcs/GCSStorage.js'; 10 | import { GeminiProvider } from '../../providers/gemini/GeminiProvider.js'; 11 | import { ConfigurationError } from '../../types/Errors.js'; 12 | import { ConfigService } from '../../services/ConfigService.js'; 13 | 14 | export class FileUploadFactory { 15 | static createStrategy( 16 | config: Config, 17 | type: 'image' | 'video', 18 | visionProvider: GeminiProvider 19 | ): FileUploadStrategy { 20 | const providerName = 21 | type === 'image' ? config.IMAGE_PROVIDER : config.VIDEO_PROVIDER; 22 | 23 | switch (providerName) { 24 | case 'google': 25 | return new GeminiFilesAPI(visionProvider); 26 | 27 | case 'vertex_ai': 28 | // For Vertex AI, we need Google Cloud Storage with native GCS SDK 29 | const gcsConfig = ConfigService.getInstance().getGCSConfig(); 30 | const storageProvider = new GCSStorageProvider(gcsConfig); 31 | return new VertexAIStorageStrategy(storageProvider); 32 | 33 | default: 34 | throw new ConfigurationError( 35 | `Unsupported provider for file upload: ${providerName}` 36 | ); 37 | } 38 | } 39 | 40 | static getThreshold(config: Config, type: 'image' | 'video'): number { 41 | const providerName = 42 | type === 'image' ? 
config.IMAGE_PROVIDER : config.VIDEO_PROVIDER; 43 | 44 | if (providerName === 'google') { 45 | return config.GEMINI_FILES_API_THRESHOLD || 10 * 1024 * 1024; // 10MB default 46 | } 47 | 48 | if (providerName === 'vertex_ai') { 49 | // Vertex AI requires external storage for all files 50 | return config.VERTEX_AI_FILES_API_THRESHOLD || 0; 51 | } 52 | 53 | return 0; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /docs/PLAN.md: -------------------------------------------------------------------------------- 1 | ## DONE 2 | [x] should we split MAX_TOKENS environment variable into MAX_TOKENS_FOR_IMAGE and MAX_TOKENS_FOR_VIDEO? Justify and don't write the code first. 3 | [x] add detect_objects_in_image mcp function 4 | [x] add native support to change GEMINI_BASE_URL to be switched to another proxy provider 5 | 6 | ## TODO 7 | 8 | **IMPLEMENTED BUT TO BE TESTED** 9 | - add TEMPERATURE_FOR_IMAGE, TOP_P_FOR_IMAGE, TOP_K_FOR_IMAGE, TEMPERATURE_FOR_VIDEO, TOP_P_FOR_VIDEO, TOP_K_FOR_VIDEO (prepare for future....) 10 | - add MAX_TOKENS, TOP_P, TOP_K, TEMPERATURE configurable as environment variable for each function 11 | - add MAX_TOKENS, since above set the standard for TOP_P, TOP_K, TEMPERATURE... 12 | - add ANALYZE_IMAGE_MODEL, COMPARE_IMAGES_MODEL , ANALYZE_VIDEO_MODEL environment variable... 13 | 14 | **URGENT** 15 | - .. 16 | 17 | **ICEBOX** 18 | - optional dependencies to download when define, for example, ai-vision-mcp[google] (Reason: a bit hard to manage, currently only two providers so the dependencies not that large yet ...) 19 | 20 | **DISPOSAL** 21 | 22 | - remove MAX_VIDEO_DURATION environment variable... 23 | 24 | - let user to add their custom SYSTEM_INSTRUCTIONS_FOR_IMAGE_MODEL and SYSTEM_INSTRUCTIONS_FOR_VIDEO_MODEL ... (Reason: hard to control the behaviour, for example, somebody may inject harmful prompt?) 25 | 26 | - add analyze_image description for prompt params: "Detailed text prompt. 
If the task is **front-end code replication**, the prompt you provide must be: "Describe in detail the layout structure, color style, main components, and interactive elements of the website in this image to facilitate subsequent code generation by the model." + your additional requirements. \ For **other tasks**, the prompt you provide must clearly describe what to analyze, extract, or understand from the image." (Reason: wait too long for such task to complete, but can try to add `timeout` params to mcp client in future) 27 | 28 | - add metadata params per model level like supportsThinking, supportsNoThinking - set thinkingbudget=0 for all models (except gemini 2.5 pro) - (Reason: hard to add another layer `thinking` as I think thinking_budget is not that useful for image analysis - unsure about this...) 29 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ai-vision-mcp", 3 | "version": "0.0.5", 4 | "description": "Vision MCP server that provides AI-powered image and video analysis using Google Gemini and Vertex AI", 5 | "main": "dist/index.js", 6 | "type": "module", 7 | "scripts": { 8 | "build": "tsc", 9 | "dev": "tsc --watch", 10 | "start": "node dist/index.js", 11 | "lint": "eslint src/**/*.ts", 12 | "lint:fix": "eslint src/**/*.ts --fix", 13 | "format": "prettier --write src/**/*.ts", 14 | "prepare": "npm run build", 15 | "prepublishOnly": "npm run lint", 16 | "preversion": "npm run lint", 17 | "version": "npm run format && git add -A src", 18 | "postversion": "git push && git push --tags", 19 | "publish-registry": "mcp-publisher publish", 20 | "publish-registry:dry-run": "mcp-publisher publish --dry-run" 21 | }, 22 | "keywords": [ 23 | "mcp", 24 | "vision", 25 | "ai", 26 | "image-analysis", 27 | "video-analysis", 28 | "gemini", 29 | "vertex-ai", 30 | "google-ai" 31 | ], 32 | "author": "Tan Yong Sheng ", 33 | 
"license": "MIT", 34 | "bin": { 35 | "ai-vision-mcp": "dist/index.js" 36 | }, 37 | "files": [ 38 | "dist", 39 | "README.md", 40 | "LICENSE" 41 | ], 42 | "repository": { 43 | "type": "git", 44 | "url": "https://github.com/tan-yong-sheng/ai-vision-mcp.git" 45 | }, 46 | "bugs": { 47 | "url": "https://github.com/tan-yong-sheng/ai-vision-mcp/issues" 48 | }, 49 | "homepage": "https://github.com/tan-yong-sheng/ai-vision-mcp#readme", 50 | "dependencies": { 51 | "@google-cloud/storage": "^7.17.1", 52 | "@google/genai": "^1.24.0", 53 | "@modelcontextprotocol/sdk": "^1.0.0", 54 | "dotenv": "^16.4.5", 55 | "mime-types": "^2.1.35", 56 | "node-fetch": "^3.3.2", 57 | "sharp": "^0.33.5", 58 | "zod": "^3.23.8" 59 | }, 60 | "devDependencies": { 61 | "@types/glob": "^8.1.0", 62 | "@types/html-to-text": "^9.0.4", 63 | "@types/http-cache-semantics": "^4.0.4", 64 | "@types/long": "^4.0.2", 65 | "@types/mime-types": "^2.1.4", 66 | "@types/node": "^20.14.0", 67 | "@types/phoenix": "^1.6.6", 68 | "@types/ws": "^8.18.1", 69 | "@types/yauzl": "^2.10.3", 70 | "@typescript-eslint/eslint-plugin": "^7.18.0", 71 | "@typescript-eslint/parser": "^7.18.0", 72 | "eslint": "^8.57.0", 73 | "eslint-config-prettier": "^9.1.2", 74 | "eslint-plugin-prettier": "^5.5.4", 75 | "prettier": "^3.3.2", 76 | "typescript": "^5.5.0" 77 | }, 78 | "engines": { 79 | "node": ">=18.0.0" 80 | }, 81 | "publishConfig": { 82 | "access": "public" 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/tools/analyze_video.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * MCP Tool: analyze_video 3 | * Analyzes a video using AI vision models. Supports URLs and local file paths. 
4 | */ 5 | 6 | import type { AnalysisOptions, AnalysisResult } from '../types/Providers.js'; 7 | import type { VisionProvider } from '../types/Providers.js'; 8 | import { FileService } from '../services/FileService.js'; 9 | import type { Config } from '../types/Config.js'; 10 | import { VisionError } from '../types/Errors.js'; 11 | import { FUNCTION_NAMES } from '../constants/FunctionNames.js'; 12 | 13 | export interface AnalyzeVideoArgs { 14 | videoSource: string; // Can be URL or local file path 15 | prompt: string; 16 | options?: AnalysisOptions; 17 | } 18 | 19 | export async function analyze_video( 20 | args: AnalyzeVideoArgs, 21 | config: Config, 22 | videoProvider: VisionProvider, 23 | videoFileService: FileService 24 | ): Promise { 25 | try { 26 | // Validate arguments 27 | if (!args.videoSource) { 28 | throw new VisionError('videoSource is required', 'MISSING_ARGUMENT'); 29 | } 30 | if (!args.prompt) { 31 | throw new VisionError('prompt is required', 'MISSING_ARGUMENT'); 32 | } 33 | 34 | // Handle video source (URL vs local file) 35 | const processedVideoSource = await videoFileService.handleVideoSource( 36 | args.videoSource 37 | ); 38 | 39 | // Merge default options with provided options 40 | const options: AnalysisOptions = { 41 | temperature: 42 | config.TEMPERATURE_FOR_ANALYZE_VIDEO ?? 43 | config.TEMPERATURE_FOR_VIDEO ?? 44 | config.TEMPERATURE, 45 | topP: 46 | config.TOP_P_FOR_ANALYZE_VIDEO ?? 47 | config.TOP_P_FOR_VIDEO ?? 48 | config.TOP_P, 49 | topK: 50 | config.TOP_K_FOR_ANALYZE_VIDEO ?? 51 | config.TOP_K_FOR_VIDEO ?? 52 | config.TOP_K, 53 | maxTokens: 54 | config.MAX_TOKENS_FOR_ANALYZE_VIDEO ?? 55 | config.MAX_TOKENS_FOR_VIDEO ?? 
56 | config.MAX_TOKENS, 57 | taskType: 'video', 58 | functionName: FUNCTION_NAMES.ANALYZE_VIDEO, 59 | ...args.options, // User options override defaults 60 | }; 61 | 62 | // Analyze the video 63 | const result = await videoProvider.analyzeVideo( 64 | processedVideoSource, 65 | args.prompt, 66 | options 67 | ); 68 | 69 | return result; 70 | } catch (error) { 71 | console.error('Error in analyze_video tool:', error); 72 | 73 | if (error instanceof VisionError) { 74 | throw error; 75 | } 76 | 77 | throw new VisionError( 78 | `Failed to analyze video: ${error instanceof Error ? error.message : String(error)}`, 79 | 'ANALYSIS_ERROR', 80 | 'gemini', 81 | error instanceof Error ? error : undefined 82 | ); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /docs/llm_logs/dynamic-schema-validation-fix-plan.md: -------------------------------------------------------------------------------- 1 | # Plan: Fix Dynamic MAX_IMAGES_FOR_COMPARISON Schema Validation 2 | 3 | ## Problem Description 4 | 5 | The `compare_images` MCP tool has inconsistent validation: 6 | - **Schema validation** (server.ts): Hardcoded to max 4 images 7 | - **Tool implementation** (compare_images.ts): Respects `MAX_IMAGES_FOR_COMPARISON` environment variable 8 | 9 | This means users cannot use more than 4 images even if they configure `MAX_IMAGES_FOR_COMPARISON=6`. 10 | 11 | ## Root Cause 12 | 13 | The MCP tool registration in `server.ts` uses a static Zod schema that's defined at module load time, before configuration is available: 14 | 15 | ```typescript 16 | // Current - HARDCODED 17 | imageSources: z 18 | .array(z.string()) 19 | .min(2) 20 | .max(4) // ← Static value, ignores config 21 | ``` 22 | 23 | ## Solution Strategy 24 | 25 | ### Option 1: Lazy Schema Generation (Recommended) 26 | Move the schema creation inside the tool handler where config is available. 
27 | 28 | ### Option 2: Dynamic Schema Factory 29 | Create a schema factory function that accepts max images parameter. 30 | 31 | ### Option 3: Configuration-based Registration 32 | Register tools after configuration is loaded. 33 | 34 | **Selected: Option 1** - Most straightforward and maintains existing patterns. 35 | 36 | ## Implementation Plan 37 | 38 | 1. **Modify server.ts**: 39 | - Move schema validation from registration to handler 40 | - Use manual validation with config values 41 | - Keep Zod for type safety but make limits dynamic 42 | 43 | 2. **Update validation logic**: 44 | - Read `MAX_IMAGES_FOR_COMPARISON` from config 45 | - Apply dynamic validation in handler 46 | - Maintain backward compatibility 47 | 48 | 3. **Preserve error consistency**: 49 | - Same error format as current Zod validation 50 | - Clear error messages for users 51 | 52 | ## Implementation Details 53 | 54 | ```typescript 55 | // Before (hardcoded) 56 | inputSchema: { 57 | imageSources: z.array(z.string()).min(2).max(4) 58 | } 59 | 60 | // After (dynamic) 61 | inputSchema: { 62 | imageSources: z.array(z.string()).min(2) // Remove max, validate in handler 63 | } 64 | 65 | // Handler validation: 66 | const { config } = getServices(); 67 | const maxImages = config.MAX_IMAGES_FOR_COMPARISON || 4; 68 | if (imageSources.length > maxImages) { 69 | throw new Error(`Maximum ${maxImages} images allowed`); 70 | } 71 | ``` 72 | 73 | ## Benefits 74 | 75 | - ✅ Respects user configuration 76 | - ✅ Consistent behavior across schema and implementation 77 | - ✅ No breaking changes to existing API 78 | - ✅ Maintains type safety 79 | 80 | ## Risk Assessment 81 | 82 | - **Risk Level**: Low 83 | - **Breaking Changes**: None 84 | - **Backward Compatibility**: Full 85 | - **Testing**: Can validate with different MAX_IMAGES_FOR_COMPARISON values 86 | 87 | ## Expected Outcome 88 | 89 | After fix: 90 | - `MAX_IMAGES_FOR_COMPARISON=6` → Users can compare up to 6 images 91 | - `MAX_IMAGES_FOR_COMPARISON=2` → 
Users can compare up to 2 images 92 | - Default behavior unchanged (max 4 images) -------------------------------------------------------------------------------- /src/types/ObjectDetection.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Object detection types for AI Vision MCP 3 | */ 4 | 5 | import type { AnalysisOptions } from './Providers.js'; 6 | 7 | export interface DetectedObject { 8 | object: string; // Generic category for detected object 9 | label: string; // Descriptive label or instance-specific detail 10 | normalized_box_2d: [number, number, number, number]; // [ymin, xmin, ymax, xmax] normalized to 0-1000 11 | } 12 | 13 | export interface ObjectDetectionResult { 14 | detections: DetectedObject[]; 15 | image_metadata: { 16 | width: number; 17 | height: number; 18 | size_bytes: number; 19 | format: string; 20 | }; 21 | processing_time?: number; 22 | model: string; 23 | provider: string; 24 | } 25 | 26 | export interface ObjectDetectionArgs { 27 | imageSource: string; // URL, base64, or local file path 28 | prompt?: string; // Optional custom detection prompt 29 | outputFilePath?: string; // Optional explicit output path 30 | options?: AnalysisOptions; // Optional API configuration parameters 31 | } 32 | 33 | // Enhanced metadata interface for object detection responses 34 | export interface ObjectDetectionMetadata { 35 | model: string; // "gemini-2.5-flash-lite" 36 | provider: string; // "google" | "vertex_ai" 37 | usage?: { 38 | promptTokenCount: number; 39 | candidatesTokenCount: number; 40 | totalTokenCount: number; 41 | }; 42 | processingTime: number; // milliseconds 43 | fileType?: string; // "image/png" 44 | fileSize?: number; // bytes 45 | modelVersion?: string; // "gemini-2.5-flash-lite" 46 | responseId?: string; // "abc123..." 
47 | fileSaveStatus?: 'saved' | 'skipped_due_to_permissions'; // File save status 48 | } 49 | 50 | // MCP response types for different output scenarios 51 | export interface DetectionWithFile { 52 | detections: DetectedObject[]; 53 | file: { 54 | path: string; 55 | size_bytes: number; 56 | format: string; 57 | }; 58 | image_metadata: { 59 | width: number; 60 | height: number; 61 | original_size: number; 62 | }; 63 | summary: string; // Human-readable summary with percentage coordinates 64 | metadata: ObjectDetectionMetadata; // Enhanced metadata 65 | } 66 | 67 | export interface DetectionWithTempFile { 68 | detections: DetectedObject[]; 69 | tempFile: { 70 | path: string; 71 | size_bytes: number; 72 | format: string; 73 | }; 74 | image_metadata: { 75 | width: number; 76 | height: number; 77 | original_size: number; 78 | }; 79 | summary: string; // Human-readable summary with percentage coordinates 80 | metadata: ObjectDetectionMetadata; // Enhanced metadata 81 | } 82 | 83 | export interface DetectionOnly { 84 | detections: DetectedObject[]; 85 | image_metadata: { 86 | width: number; 87 | height: number; 88 | original_size: number; 89 | }; 90 | summary: string; // Human-readable summary with percentage coordinates 91 | metadata: ObjectDetectionMetadata; // Enhanced metadata 92 | } 93 | 94 | // Union type for all possible response types 95 | export type ObjectDetectionResponse = DetectionWithFile | DetectionWithTempFile | DetectionOnly; 96 | -------------------------------------------------------------------------------- /src/types/Analysis.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Analysis types for vision providers 3 | */ 4 | 5 | import { type FunctionName } from '../constants/FunctionNames.js'; 6 | 7 | export type TaskType = 'image' | 'video'; 8 | 9 | export interface AnalysisOptions { 10 | temperature?: number | undefined; 11 | topP?: number | undefined; 12 | topK?: number | undefined; 13 | maxTokens?: 
number | undefined; 14 | maxTokensForImage?: number | undefined; 15 | maxTokensForVideo?: number | undefined; 16 | stopSequences?: string[] | undefined; 17 | taskType?: TaskType; 18 | functionName?: FunctionName; 19 | responseSchema?: any; // Structured output schema for object detection 20 | systemInstruction?: string | undefined; // System instruction to guide model behavior 21 | } 22 | 23 | export interface AnalysisResult { 24 | text: string; 25 | metadata: AnalysisMetadata; 26 | } 27 | 28 | export interface AnalysisMetadata { 29 | model: string; 30 | provider: string; 31 | usage?: UsageMetadata; 32 | processingTime?: number; 33 | fileType?: string; 34 | fileSize?: number; 35 | modelVersion?: string; // "gemini-2.5-flash-lite" 36 | responseId?: string; // "abc123..." 37 | } 38 | 39 | export interface UsageMetadata { 40 | promptTokenCount: number; 41 | candidatesTokenCount: number; 42 | totalTokenCount: number; 43 | } 44 | 45 | export interface UploadedFile { 46 | id: string; 47 | filename: string; 48 | mimeType: string; 49 | size: number; 50 | url?: string; 51 | uri?: string; 52 | displayName?: string; 53 | state?: 'PROCESSING' | 'ACTIVE' | 'FAILED'; 54 | createTime?: string; 55 | updateTime?: string; 56 | expirationTime?: string; 57 | sha256Hash?: string; 58 | } 59 | 60 | export interface FileReference { 61 | type: 'file_uri' | 'public_url' | 'base64'; 62 | uri?: string; 63 | url?: string; 64 | data?: string; 65 | mimeType: string; 66 | } 67 | 68 | export interface HealthStatus { 69 | status: 'healthy' | 'unhealthy' | 'degraded'; 70 | message?: string; 71 | lastCheck: string; 72 | responseTime?: number; 73 | } 74 | 75 | export interface RateLimitInfo { 76 | requestsPerMinute?: number; 77 | requestsPerDay?: number; 78 | currentUsage?: { 79 | requestsPerMinute: number; 80 | requestsPerDay: number; 81 | }; 82 | resetTime?: string; 83 | } 84 | 85 | export interface ProviderCapabilities { 86 | supportedImageFormats: string[]; 87 | supportedVideoFormats: string[]; 88 
| maxImageSize: number; 89 | maxVideoSize: number; 90 | maxVideoDuration: number; 91 | supportsVideo: boolean; 92 | supportsFileUpload: boolean; 93 | } 94 | 95 | export interface ModelCapabilities { 96 | imageAnalysis: boolean; 97 | videoAnalysis: boolean; 98 | maxTokensForImage: number; 99 | maxTokensForVideo: number; 100 | supportedFormats: string[]; 101 | } 102 | 103 | export interface ProviderInfo { 104 | name: string; 105 | version: string; 106 | description: string; 107 | capabilities: ProviderCapabilities; 108 | modelCapabilities: ModelCapabilities; 109 | rateLimit?: RateLimitInfo; // Optional - rate limits vary by user tier/project 110 | } 111 | -------------------------------------------------------------------------------- /src/tools/analyze_image.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * MCP Tool: analyze_image 3 | * Analyzes an image using AI vision models. Supports URLs, base64 data, and local file paths. 4 | */ 5 | 6 | import type { AnalysisOptions, AnalysisResult } from '../types/Providers.js'; 7 | import type { VisionProvider } from '../types/Providers.js'; 8 | import { FileService } from '../services/FileService.js'; 9 | import type { Config } from '../types/Config.js'; 10 | import { VisionError } from '../types/Errors.js'; 11 | import { FUNCTION_NAMES } from '../constants/FunctionNames.js'; 12 | 13 | export interface AnalyzeImageArgs { 14 | imageSource: string; // Can be URL, base64 data, or local file path 15 | prompt: string; 16 | options?: AnalysisOptions; 17 | } 18 | 19 | export async function analyze_image( 20 | args: AnalyzeImageArgs, 21 | config: Config, 22 | imageProvider: VisionProvider, 23 | imageFileService: FileService 24 | ): Promise { 25 | try { 26 | // Validate arguments 27 | if (!args.imageSource) { 28 | throw new VisionError('imageSource is required', 'MISSING_ARGUMENT'); 29 | } 30 | if (!args.prompt) { 31 | throw new VisionError('prompt is required', 'MISSING_ARGUMENT'); 32 
| } 33 | 34 | // Handle image source (URL vs local file vs base64) 35 | const processedImageSource = await imageFileService.handleImageSource( 36 | args.imageSource 37 | ); 38 | console.error( 39 | `[analyze_image] Processed image source: ${processedImageSource.substring(0, 100)}${processedImageSource.length > 100 ? '...' : ''}` 40 | ); 41 | console.error(`[analyze_image] Original source: ${args.imageSource}`); 42 | console.error( 43 | `[analyze_image] Processed source starts with data:image: ${processedImageSource.startsWith('data:image/')}` 44 | ); 45 | 46 | // Merge default options with provided options 47 | const options: AnalysisOptions = { 48 | temperature: 49 | config.TEMPERATURE_FOR_ANALYZE_IMAGE ?? 50 | config.TEMPERATURE_FOR_IMAGE ?? 51 | config.TEMPERATURE, 52 | topP: 53 | config.TOP_P_FOR_ANALYZE_IMAGE ?? 54 | config.TOP_P_FOR_IMAGE ?? 55 | config.TOP_P, 56 | topK: 57 | config.TOP_K_FOR_ANALYZE_IMAGE ?? 58 | config.TOP_K_FOR_IMAGE ?? 59 | config.TOP_K, 60 | maxTokens: 61 | config.MAX_TOKENS_FOR_ANALYZE_IMAGE ?? 62 | config.MAX_TOKENS_FOR_IMAGE ?? 63 | config.MAX_TOKENS, 64 | taskType: 'image', 65 | functionName: FUNCTION_NAMES.ANALYZE_IMAGE, 66 | ...args.options, // User options override defaults 67 | }; 68 | 69 | // Analyze the image 70 | const result = await imageProvider.analyzeImage( 71 | processedImageSource, 72 | args.prompt, 73 | options 74 | ); 75 | 76 | return result; 77 | } catch (error) { 78 | console.error('Error in analyze_image tool:', error); 79 | 80 | if (error instanceof VisionError) { 81 | throw error; 82 | } 83 | 84 | throw new VisionError( 85 | `Failed to analyze image: ${error instanceof Error ? error.message : String(error)}`, 86 | 'ANALYSIS_ERROR', 87 | 'gemini', 88 | error instanceof Error ?
error : undefined 89 | ); 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/utils/credentialsParser.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Utility functions for parsing Google Cloud service account credentials 3 | */ 4 | 5 | import fs from 'fs'; 6 | import { ConfigurationError } from '../types/Errors.js'; 7 | 8 | export interface ServiceAccountCredentials { 9 | type: string; 10 | project_id: string; 11 | private_key_id: string; 12 | private_key: string; 13 | client_email: string; 14 | client_id: string; 15 | auth_uri: string; 16 | token_uri: string; 17 | auth_provider_x509_cert_url: string; 18 | client_x509_cert_url: string; 19 | universe_domain?: string; 20 | } 21 | 22 | /** 23 | * Parse Google Cloud service account credentials from a file path 24 | * @param credentialsPath - Path to the service account JSON file 25 | * @returns Parsed service account credentials 26 | */ 27 | export function parseServiceAccountCredentials( 28 | credentialsPath: string 29 | ): ServiceAccountCredentials { 30 | try { 31 | // Check if file exists 32 | if (!fs.existsSync(credentialsPath)) { 33 | throw new ConfigurationError( 34 | `Service account credentials file not found: ${credentialsPath}`, 35 | 'VERTEX_CREDENTIALS' 36 | ); 37 | } 38 | 39 | // Read and parse the JSON file 40 | const fileContent = fs.readFileSync(credentialsPath, 'utf-8'); 41 | const credentials = JSON.parse(fileContent) as ServiceAccountCredentials; 42 | 43 | // Validate required fields 44 | if (!credentials.project_id) { 45 | throw new ConfigurationError( 46 | 'Service account credentials file is missing "project_id" field', 47 | 'VERTEX_CREDENTIALS' 48 | ); 49 | } 50 | 51 | if (!credentials.private_key) { 52 | throw new ConfigurationError( 53 | 'Service account credentials file is missing "private_key" field', 54 | 'VERTEX_CREDENTIALS' 55 | ); 56 | } 57 | 58 | if (!credentials.client_email) { 59 | 
throw new ConfigurationError( 60 | 'Service account credentials file is missing "client_email" field', 61 | 'VERTEX_CREDENTIALS' 62 | ); 63 | } 64 | 65 | return credentials; 66 | } catch (error) { 67 | if (error instanceof ConfigurationError) { 68 | throw error; 69 | } 70 | 71 | if (error instanceof SyntaxError) { 72 | throw new ConfigurationError( 73 | `Invalid JSON in service account credentials file: ${credentialsPath}`, 74 | 'VERTEX_CREDENTIALS' 75 | ); 76 | } 77 | 78 | throw new ConfigurationError( 79 | `Failed to parse service account credentials: ${error instanceof Error ? error.message : String(error)}`, 80 | 'VERTEX_CREDENTIALS' 81 | ); 82 | } 83 | } 84 | 85 | /** 86 | * Extract project ID from service account credentials file 87 | * @param credentialsPath - Path to the service account JSON file 88 | * @returns Project ID from the credentials 89 | */ 90 | export function extractProjectIdFromCredentials( 91 | credentialsPath: string 92 | ): string { 93 | const credentials = parseServiceAccountCredentials(credentialsPath); 94 | return credentials.project_id; 95 | } 96 | 97 | /** 98 | * Validate service account credentials file format 99 | * @param credentialsPath - Path to the service account JSON file 100 | * @returns true if valid, throws error otherwise 101 | */ 102 | export function validateServiceAccountCredentials( 103 | credentialsPath: string 104 | ): boolean { 105 | parseServiceAccountCredentials(credentialsPath); 106 | return true; 107 | } 108 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | #=============================================== 2 | # PROVIDER SELECTION 3 | #=============================================== 4 | IMAGE_PROVIDER=google # or vertex_ai 5 | VIDEO_PROVIDER=google # or vertex_ai 6 | 7 | #=============================================== 8 | # MODEL SELECTION 9 | 
#=============================================== 10 | # IMAGE_MODEL=gemini-2.5-flash-lite 11 | # VIDEO_MODEL=gemini-2.5-flash-pro 12 | 13 | #=============================================== 14 | # FUNCTION-SPECIFIC MODEL SELECTION 15 | #=============================================== 16 | # ANALYZE_IMAGE_MODEL= 17 | # COMPARE_IMAGES_MODEL= 18 | # DETECT_OBJECTS_IN_IMAGE_MODEL= 19 | # ANALYZE_VIDEO_MODEL= 20 | 21 | #=============================================== 22 | # GEMINI API CONFIGURATION (AI Studio) 23 | #=============================================== 24 | GEMINI_API_KEY=your_gemini_api_key_here 25 | # GEMINI_BASE_URL=https://generativelanguage.googleapis.com 26 | 27 | #=============================================== 28 | # VERTEX AI CONFIGURATION 29 | #=============================================== 30 | VERTEX_CREDENTIALS=path/to/service-account.json 31 | # The following are optional and auto-derived from VERTEX_CREDENTIALS: 32 | # VERTEX_PROJECT_ID=your-gcp-project-id 33 | # VERTEX_LOCATION=us-central1 34 | # VERTEX_ENDPOINT=https://aiplatform.googleapis.com 35 | 36 | #=============================================== 37 | # GOOGLE CLOUD STORAGE CONFIGURATION (Required for Vertex AI) 38 | #=============================================== 39 | GCS_BUCKET_NAME=your-vision-files-bucket 40 | # The following are optional and auto-derived from VERTEX_CREDENTIALS: 41 | # GCS_PROJECT_ID - Auto-derived from VERTEX_CREDENTIALS 42 | # GCS_CREDENTIALS - Defaults to VERTEX_CREDENTIALS 43 | # GCS_REGION - Defaults to VERTEX_LOCATION 44 | 45 | #=============================================== 46 | # UNIVERSAL API PARAMETERS 47 | #=============================================== 48 | TEMPERATURE=0.8 49 | TOP_P=0.95 50 | TOP_K=30 51 | MAX_TOKENS=16384 52 | 53 | #=============================================== 54 | # TASK-SPECIFIC API PARAMETERS 55 | #=============================================== 56 | # TEMPERATURE_FOR_IMAGE= 57 | # TOP_P_FOR_IMAGE= 58 | # TOP_K_FOR_IMAGE= 59 
| # MAX_TOKENS_FOR_IMAGE= 60 | # TEMPERATURE_FOR_VIDEO= 61 | # TOP_P_FOR_VIDEO= 62 | # TOP_K_FOR_VIDEO= 63 | # MAX_TOKENS_FOR_VIDEO= 64 | 65 | #=============================================== 66 | # FUNCTION-SPECIFIC API PARAMETERS 67 | #=============================================== 68 | # TEMPERATURE_FOR_ANALYZE_IMAGE= 69 | # TOP_P_FOR_ANALYZE_IMAGE= 70 | # TOP_K_FOR_ANALYZE_IMAGE= 71 | # MAX_TOKENS_FOR_ANALYZE_IMAGE= 72 | # TEMPERATURE_FOR_COMPARE_IMAGES= 73 | # TOP_P_FOR_COMPARE_IMAGES= 74 | # TOP_K_FOR_COMPARE_IMAGES= 75 | # MAX_TOKENS_FOR_COMPARE_IMAGES= 76 | # TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE= 77 | # TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE= 78 | # TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE= 79 | # MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE= 80 | # TEMPERATURE_FOR_ANALYZE_VIDEO= 81 | # TOP_P_FOR_ANALYZE_VIDEO= 82 | # TOP_K_FOR_ANALYZE_VIDEO= 83 | # MAX_TOKENS_FOR_ANALYZE_VIDEO= 84 | 85 | #=============================================== 86 | # FILE PROCESSING CONFIGURATION 87 | #=============================================== 88 | MAX_IMAGE_SIZE=20971520 89 | MAX_VIDEO_SIZE=2147483648 90 | ALLOWED_IMAGE_FORMATS=png,jpg,jpeg,webp,gif,bmp,tiff 91 | ALLOWED_VIDEO_FORMATS=mp4,mov,avi,mkv,webm,flv,wmv,3gp 92 | MAX_VIDEO_DURATION=3600 93 | 94 | #=============================================== 95 | # FILE UPLOAD CONFIGURATION 96 | #=============================================== 97 | GEMINI_FILES_API_THRESHOLD=10485760 98 | VERTEX_AI_FILES_API_THRESHOLD=0 99 | 100 | #=============================================== 101 | # LOGGING CONFIGURATION 102 | #=============================================== 103 | LOG_LEVEL=info 104 | 105 | #=============================================== 106 | # DEVELOPMENT CONFIGURATION 107 | #=============================================== 108 | NODE_ENV=production # or 'development' -------------------------------------------------------------------------------- /src/tools/compare_images.ts: 
-------------------------------------------------------------------------------- 1 | /** 2 | * MCP Tool: compare_images 3 | * Compares multiple images using AI vision models. Supports URLs, base64 data, and local file paths. 4 | */ 5 | 6 | import type { AnalysisOptions, AnalysisResult } from '../types/Providers.js'; 7 | import type { VisionProvider } from '../types/Providers.js'; 8 | import { FileService } from '../services/FileService.js'; 9 | import type { Config } from '../types/Config.js'; 10 | import { VisionError } from '../types/Errors.js'; 11 | import { FUNCTION_NAMES } from '../constants/FunctionNames.js'; 12 | 13 | export interface CompareImagesArgs { 14 | imageSources: string[]; // Array of image sources (URLs/base64/file paths) 15 | prompt: string; 16 | options?: AnalysisOptions; 17 | } 18 | 19 | export async function compare_images( 20 | args: CompareImagesArgs, 21 | config: Config, 22 | imageProvider: VisionProvider, 23 | imageFileService: FileService 24 | ): Promise { 25 | try { 26 | // Validate arguments 27 | if (!args.imageSources || !Array.isArray(args.imageSources)) { 28 | throw new VisionError( 29 | 'imageSources must be an array', 30 | 'MISSING_ARGUMENT' 31 | ); 32 | } 33 | if (!args.prompt) { 34 | throw new VisionError('prompt is required', 'MISSING_ARGUMENT'); 35 | } 36 | 37 | // Validate image count 38 | const maxImages = config.MAX_IMAGES_FOR_COMPARISON || 4; 39 | if (args.imageSources.length < 2) { 40 | throw new VisionError( 41 | 'At least 2 images are required for comparison', 42 | 'INVALID_ARGUMENT' 43 | ); 44 | } 45 | if (args.imageSources.length > maxImages) { 46 | throw new VisionError( 47 | `Maximum ${maxImages} images allowed for comparison, received ${args.imageSources.length}`, 48 | 'INVALID_ARGUMENT' 49 | ); 50 | } 51 | 52 | // Validate each image source 53 | for (let i = 0; i < args.imageSources.length; i++) { 54 | if (!args.imageSources[i] || typeof args.imageSources[i] !== 'string') { 55 | throw new VisionError( 56 | `Image 
source at index ${i} is invalid`, 57 | 'INVALID_ARGUMENT' 58 | ); 59 | } 60 | } 61 | 62 | console.error( 63 | `[compare_images] Processing ${args.imageSources.length} images for comparison` 64 | ); 65 | 66 | // Process all image sources 67 | const processedImageSources = await Promise.all( 68 | args.imageSources.map(async (imageSource, index) => { 69 | console.error( 70 | `[compare_images] Processing image ${index + 1}: ${imageSource.substring(0, 100)}${imageSource.length > 100 ? '...' : ''}` 71 | ); 72 | return await imageFileService.handleImageSource(imageSource); 73 | }) 74 | ); 75 | 76 | console.error( 77 | `[compare_images] All ${processedImageSources.length} images processed successfully` 78 | ); 79 | 80 | // Merge default options with provided options 81 | const options: AnalysisOptions = { 82 | temperature: 83 | config.TEMPERATURE_FOR_COMPARE_IMAGES ?? 84 | config.TEMPERATURE_FOR_IMAGE ?? 85 | config.TEMPERATURE, 86 | topP: 87 | config.TOP_P_FOR_COMPARE_IMAGES ?? 88 | config.TOP_P_FOR_IMAGE ?? 89 | config.TOP_P, 90 | topK: 91 | config.TOP_K_FOR_COMPARE_IMAGES ?? 92 | config.TOP_K_FOR_IMAGE ?? 93 | config.TOP_K, 94 | maxTokens: 95 | config.MAX_TOKENS_FOR_COMPARE_IMAGES ?? 96 | config.MAX_TOKENS_FOR_IMAGE ?? 97 | config.MAX_TOKENS, 98 | taskType: 'image', 99 | functionName: FUNCTION_NAMES.COMPARE_IMAGES, 100 | ...args.options, // User options override defaults 101 | }; 102 | 103 | // Call the provider's comparison method 104 | const result = await imageProvider.compareImages( 105 | processedImageSources, 106 | args.prompt, 107 | options 108 | ); 109 | 110 | return result; 111 | } catch (error) { 112 | console.error('Error in compare_images tool:', error); 113 | 114 | if (error instanceof VisionError) { 115 | throw error; 116 | } 117 | 118 | throw new VisionError( 119 | `Failed to compare images: ${error instanceof Error ? error.message : String(error)}`, 120 | 'ANALYSIS_ERROR', 121 | 'gemini', 122 | error instanceof Error ?
error : undefined 123 | ); 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /docs/llm_logs/vertex-ai-auth-fix-plan.md: -------------------------------------------------------------------------------- 1 | # Plan: Fix Vertex AI Authentication Issue 2 | 3 | ## Problem Description 4 | 5 | The VertexAI provider is failing with `aiplatform.endpoints.predict` permission denied errors because the GoogleGenAI client is not being initialized with proper authentication credentials. 6 | 7 | ## Root Cause Analysis 8 | 9 | 1. **Missing Authentication**: The `VertexAIProvider.ts` constructor initializes the GoogleGenAI client without any authentication configuration 10 | 2. **Credentials Available but Unused**: The `VertexAIConfig` includes a `credentials` field from `VERTEX_CREDENTIALS` environment variable, but it's not passed to the GoogleGenAI client 11 | 3. **No GoogleAuthOptions**: The client config lacks the required `googleAuthOptions` parameter 12 | 13 | ## Current Error Flow 14 | 15 | ``` 16 | Environment Variables → ConfigService → VertexAIConfig → VertexAIProvider Constructor 17 | ↓ 18 | VERTEX_CREDENTIALS ✓ → credentials: "path/to/file.json" → NOT USED ✗ 19 | ↓ 20 | GoogleGenAI client with NO AUTH → 403 Permission Denied 21 | ``` 22 | 23 | ## Solution Options Analysis 24 | 25 | ### Option 1: Environment Variable (Simple) 26 | - ✅ Quick implementation 27 | - ❌ Modifies global process environment 28 | - ❌ Not ideal for multiple concurrent instances 29 | 30 | ### Option 2: GoogleAuthOptions (Recommended) ⭐ 31 | - ✅ Explicit authentication configuration 32 | - ✅ Clean separation of concerns 33 | - ✅ Supports both file paths and credential objects 34 | - ✅ No global environment modification 35 | 36 | ### Option 3: Credential Object Parsing (Complex) 37 | - ✅ Most flexible 38 | - ❌ Higher complexity 39 | - ❌ Requires file system operations 40 | 41 | ## Selected Solution: Option 2 - GoogleAuthOptions 42 | 43 | ### 
Implementation Steps 44 | 45 | 1. **Modify VertexAI Provider Constructor**: 46 | - Add authentication logic before GoogleGenAI client initialization 47 | - Check if `config.credentials` is provided 48 | - Add `googleAuthOptions` to client configuration 49 | 50 | 2. **Support Multiple Credential Types**: 51 | - File path (most common from README examples) 52 | - JSON string (future flexibility) 53 | 54 | 3. **Maintain Backward Compatibility**: 55 | - Keep existing environment variable support 56 | - No breaking changes to public API 57 | 58 | ### Implementation Details 59 | 60 | ```typescript 61 | // Before (current - broken) 62 | const clientConfig: any = { 63 | vertexai: true, 64 | project: config.projectId, 65 | location: config.location, 66 | }; 67 | 68 | // After (fixed) 69 | const clientConfig: any = { 70 | vertexai: true, 71 | project: config.projectId, 72 | location: config.location, 73 | }; 74 | 75 | // Add authentication if credentials are provided 76 | if (config.credentials) { 77 | clientConfig.googleAuthOptions = { 78 | keyFile: config.credentials 79 | }; 80 | } 81 | ``` 82 | 83 | ### Environment Variables Respected 84 | 85 | From README.md requirements: 86 | - ✅ `VERTEX_CREDENTIALS` - Path to service account JSON file 87 | - ✅ `VERTEX_PROJECT_ID` - Auto-derived from credentials or explicit 88 | - ✅ `VERTEX_LOCATION` - Defaults to 'us-central1' 89 | - ✅ `VERTEX_ENDPOINT` - Defaults to 'https://aiplatform.googleapis.com' 90 | 91 | ### Testing Strategy 92 | 93 | 1. **Verify Current Error**: Confirm 403 permission denied 94 | 2. **Apply Fix**: Implement GoogleAuthOptions 95 | 3. **Test Authentication**: Verify successful API calls 96 | 4. 
**Test Edge Cases**: Missing credentials, invalid paths 97 | 98 | ### Risk Assessment 99 | 100 | - **Risk Level**: Low 101 | - **Rollback**: Simple revert if issues arise 102 | - **Breaking Changes**: None 103 | - **Dependencies**: No new dependencies required 104 | 105 | ### Expected Outcome 106 | 107 | After implementation: 108 | ``` 109 | Environment Variables → ConfigService → VertexAIConfig → VertexAIProvider Constructor 110 | ↓ 111 | VERTEX_CREDENTIALS ✓ → credentials: "path/to/file.json" → googleAuthOptions ✓ 112 | ↓ 113 | GoogleGenAI client with AUTH → ✅ Success 114 | ``` 115 | 116 | ## Recommendation: Option 2 117 | 118 | **Rationale**: Provides the best balance of implementation simplicity, maintainability, and explicit configuration while respecting all environment variables outlined in the README. -------------------------------------------------------------------------------- /src/types/Config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Configuration types for the Vision MCP Server 3 | */ 4 | 5 | export interface Config { 6 | // Provider selection 7 | IMAGE_PROVIDER: 'google' | 'vertex_ai'; 8 | VIDEO_PROVIDER: 'google' | 'vertex_ai'; 9 | 10 | // Gemini API configuration 11 | GEMINI_API_KEY?: string | undefined; 12 | GEMINI_BASE_URL?: string | undefined; 13 | 14 | // Vertex AI configuration 15 | VERTEX_CREDENTIALS?: string; 16 | VERTEX_PROJECT_ID?: string; 17 | VERTEX_LOCATION?: string; 18 | VERTEX_ENDPOINT?: string; 19 | 20 | // Model configuration 21 | IMAGE_MODEL?: string; 22 | VIDEO_MODEL?: string; 23 | 24 | // Function-specific model configuration 25 | ANALYZE_IMAGE_MODEL?: string; 26 | COMPARE_IMAGES_MODEL?: string; 27 | DETECT_OBJECTS_IN_IMAGE_MODEL?: string; 28 | ANALYZE_VIDEO_MODEL?: string; 29 | 30 | // Google Cloud Storage configuration (for Vertex AI file storage) 31 | GCS_BUCKET_NAME?: string; 32 | GCS_PROJECT_ID?: string; // Auto-derived from VERTEX_CREDENTIALS if not provided 33 | 
GCS_CREDENTIALS?: string; // Optional: defaults to VERTEX_CREDENTIALS 34 | GCS_REGION?: string; // Optional: defaults to VERTEX_LOCATION 35 | 36 | // Universal API parameters 37 | TEMPERATURE?: number; 38 | TOP_P?: number; 39 | TOP_K?: number; 40 | MAX_TOKENS?: number; 41 | 42 | // Task-specific API parameters 43 | TEMPERATURE_FOR_IMAGE?: number; 44 | TOP_P_FOR_IMAGE?: number; 45 | TOP_K_FOR_IMAGE?: number; 46 | MAX_TOKENS_FOR_IMAGE?: number; 47 | TEMPERATURE_FOR_VIDEO?: number; 48 | TOP_P_FOR_VIDEO?: number; 49 | TOP_K_FOR_VIDEO?: number; 50 | MAX_TOKENS_FOR_VIDEO?: number; 51 | 52 | // Function-specific API parameters 53 | TEMPERATURE_FOR_ANALYZE_IMAGE?: number; 54 | TOP_P_FOR_ANALYZE_IMAGE?: number; 55 | TOP_K_FOR_ANALYZE_IMAGE?: number; 56 | MAX_TOKENS_FOR_ANALYZE_IMAGE?: number; 57 | TEMPERATURE_FOR_COMPARE_IMAGES?: number; 58 | TOP_P_FOR_COMPARE_IMAGES?: number; 59 | TOP_K_FOR_COMPARE_IMAGES?: number; 60 | MAX_TOKENS_FOR_COMPARE_IMAGES?: number; 61 | TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 62 | TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 63 | TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 64 | MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE?: number; 65 | TEMPERATURE_FOR_ANALYZE_VIDEO?: number; 66 | TOP_P_FOR_ANALYZE_VIDEO?: number; 67 | TOP_K_FOR_ANALYZE_VIDEO?: number; 68 | MAX_TOKENS_FOR_ANALYZE_VIDEO?: number; 69 | 70 | // File processing configuration 71 | MAX_IMAGE_SIZE?: number; 72 | MAX_VIDEO_SIZE?: number; 73 | ALLOWED_IMAGE_FORMATS?: string[]; 74 | ALLOWED_VIDEO_FORMATS?: string[]; 75 | MAX_VIDEO_DURATION?: number; 76 | MAX_IMAGES_FOR_COMPARISON?: number; 77 | 78 | // File upload configuration 79 | GEMINI_FILES_API_THRESHOLD?: number; 80 | VERTEX_AI_FILES_API_THRESHOLD?: number; 81 | 82 | // Logging configuration 83 | LOG_LEVEL?: 'info' | 'debug' | 'warn' | 'error'; 84 | 85 | // Development configuration 86 | NODE_ENV?: 'development' | 'production'; 87 | } 88 | 89 | export interface GeminiConfig { 90 | apiKey: string; 91 | baseUrl: string; 92 | 
imageModel: string; 93 | videoModel: string; 94 | } 95 | 96 | export interface VertexAIConfig { 97 | projectId: string; 98 | location: string; 99 | endpoint: string; 100 | credentials?: string; 101 | imageModel: string; 102 | videoModel: string; 103 | } 104 | 105 | export interface GCSConfig { 106 | bucketName: string; 107 | projectId: string; 108 | credentials: string; 109 | region: string; 110 | } 111 | 112 | export interface FileUploadConfig { 113 | useProviderFilesApi: boolean; 114 | geminiFilesApiThreshold: number; 115 | vertexAIFilesApiThreshold: number; 116 | } 117 | 118 | export interface ApiConfig { 119 | temperature: number; 120 | topP: number; 121 | topK: number; 122 | maxTokens: number; 123 | maxTokensForImage: number; 124 | maxTokensForVideo: number; 125 | temperatureForImage?: number; 126 | topPForImage?: number; 127 | topKForImage?: number; 128 | temperatureForVideo?: number; 129 | topPForVideo?: number; 130 | topKForVideo?: number; 131 | temperatureForAnalyzeImage?: number; 132 | topPForAnalyzeImage?: number; 133 | topKForAnalyzeImage?: number; 134 | maxTokensForAnalyzeImage?: number; 135 | temperatureForCompareImages?: number; 136 | topPForCompareImages?: number; 137 | topKForCompareImages?: number; 138 | maxTokensForCompareImages?: number; 139 | temperatureForDetectObjectsInImage?: number; 140 | topPForDetectObjectsInImage?: number; 141 | topKForDetectObjectsInImage?: number; 142 | maxTokensForDetectObjectsInImage?: number; 143 | temperatureForAnalyzeVideo?: number; 144 | topPForAnalyzeVideo?: number; 145 | topKForAnalyzeVideo?: number; 146 | maxTokensForAnalyzeVideo?: number; 147 | // Model configuration 148 | analyzeImageModel?: string; 149 | compareImagesModel?: string; 150 | detectObjectsInImageModel?: string; 151 | analyzeVideoModel?: string; 152 | } 153 | 154 | export interface FileProcessingConfig { 155 | maxImageSize: number; 156 | maxVideoSize: number; 157 | allowedImageFormats: string[]; 158 | allowedVideoFormats: string[]; 159 | 
maxVideoDuration: number; 160 | maxImagesForComparison: number; 161 | } 162 | 163 | export interface LoggingConfig { 164 | logLevel: 'info' | 'debug' | 'warn' | 'error'; 165 | } 166 | 167 | export interface DevelopmentConfig { 168 | nodeEnv: 'development' | 'production'; 169 | } 170 | -------------------------------------------------------------------------------- /src/types/Errors.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Error types for the Vision MCP Server 3 | */ 4 | 5 | export class VisionError extends Error { 6 | constructor( 7 | message: string, 8 | public code: string, 9 | public provider?: string, 10 | public originalError?: Error, 11 | public statusCode?: number 12 | ) { 13 | super(message); 14 | this.name = 'VisionError'; 15 | 16 | // Maintains proper stack trace for where our error was thrown (only available on V8) 17 | if (Error.captureStackTrace) { 18 | Error.captureStackTrace(this, VisionError); 19 | } 20 | } 21 | } 22 | 23 | export class ConfigurationError extends VisionError { 24 | constructor(message: string, variable?: string) { 25 | super(message, 'CONFIG_ERROR', undefined, undefined, 400); 26 | this.name = 'ConfigurationError'; 27 | this.variable = variable; 28 | } 29 | 30 | public variable?: string; 31 | } 32 | 33 | export class ProviderError extends VisionError { 34 | constructor( 35 | message: string, 36 | provider: string, 37 | originalError?: Error, 38 | statusCode?: number 39 | ) { 40 | super(message, 'PROVIDER_ERROR', provider, originalError, statusCode); 41 | this.name = 'ProviderError'; 42 | } 43 | } 44 | 45 | export class FileUploadError extends VisionError { 46 | constructor( 47 | message: string, 48 | provider?: string, 49 | originalError?: Error, 50 | statusCode?: number 51 | ) { 52 | super(message, 'FILE_UPLOAD_ERROR', provider, originalError, statusCode); 53 | this.name = 'FileUploadError'; 54 | } 55 | } 56 | 57 | export class FileNotFoundError extends VisionError { 58 | 
constructor(fileId: string, provider?: string) { 59 | super( 60 | `File not found: ${fileId}`, 61 | 'FILE_NOT_FOUND', 62 | provider, 63 | undefined, 64 | 404 65 | ); 66 | this.name = 'FileNotFoundError'; 67 | this.fileId = fileId; 68 | } 69 | 70 | public fileId: string; 71 | } 72 | 73 | export class UnsupportedFileTypeError extends VisionError { 74 | constructor(mimeType: string, supportedTypes?: string[]) { 75 | const message = supportedTypes 76 | ? `Unsupported file type: ${mimeType}. Supported types: ${supportedTypes.join(', ')}` 77 | : `Unsupported file type: ${mimeType}`; 78 | super(message, 'UNSUPPORTED_FILE_TYPE', undefined, undefined, 400); 79 | this.name = 'UnsupportedFileTypeError'; 80 | this.mimeType = mimeType; 81 | this.supportedTypes = supportedTypes; 82 | } 83 | 84 | public mimeType: string; 85 | public supportedTypes?: string[]; 86 | } 87 | 88 | export class FileSizeExceededError extends VisionError { 89 | constructor(fileSize: number, maxSize: number) { 90 | const message = `File size ${fileSize} bytes exceeds maximum allowed size ${maxSize} bytes`; 91 | super(message, 'FILE_SIZE_EXCEEDED', undefined, undefined, 400); 92 | this.name = 'FileSizeExceededError'; 93 | this.fileSize = fileSize; 94 | this.maxSize = maxSize; 95 | } 96 | 97 | public fileSize: number; 98 | public maxSize: number; 99 | } 100 | 101 | export class RateLimitExceededError extends VisionError { 102 | constructor(message: string, provider?: string, retryAfter?: number) { 103 | super(message, 'RATE_LIMIT_EXCEEDED', provider, undefined, 429); 104 | this.name = 'RateLimitExceededError'; 105 | this.retryAfter = retryAfter; 106 | } 107 | 108 | public retryAfter?: number; 109 | } 110 | 111 | export class AuthenticationError extends VisionError { 112 | constructor(message: string, provider?: string) { 113 | super(message, 'AUTHENTICATION_ERROR', provider, undefined, 401); 114 | this.name = 'AuthenticationError'; 115 | } 116 | } 117 | 118 | export class AuthorizationError extends 
VisionError { 119 | constructor(message: string, provider?: string) { 120 | super(message, 'AUTHORIZATION_ERROR', provider, undefined, 403); 121 | this.name = 'AuthorizationError'; 122 | } 123 | } 124 | 125 | export class NetworkError extends VisionError { 126 | constructor(message: string, originalError?: Error) { 127 | super(message, 'NETWORK_ERROR', undefined, originalError); 128 | this.name = 'NetworkError'; 129 | } 130 | } 131 | 132 | export class ValidationError extends VisionError { 133 | constructor(message: string, field?: string) { 134 | super(message, 'VALIDATION_ERROR', undefined, undefined, 400); 135 | this.name = 'ValidationError'; 136 | this.field = field; 137 | } 138 | 139 | public field?: string; 140 | } 141 | 142 | export class StorageError extends VisionError { 143 | constructor( 144 | message: string, 145 | storageType?: string, 146 | originalError?: Error, 147 | statusCode?: number 148 | ) { 149 | super(message, 'STORAGE_ERROR', storageType, originalError, statusCode); 150 | this.name = 'StorageError'; 151 | } 152 | } 153 | 154 | export type ErrorType = 155 | | 'CONFIG_ERROR' 156 | | 'PROVIDER_ERROR' 157 | | 'FILE_UPLOAD_ERROR' 158 | | 'FILE_NOT_FOUND' 159 | | 'UNSUPPORTED_FILE_TYPE' 160 | | 'FILE_SIZE_EXCEEDED' 161 | | 'RATE_LIMIT_EXCEEDED' 162 | | 'AUTHENTICATION_ERROR' 163 | | 'AUTHORIZATION_ERROR' 164 | | 'NETWORK_ERROR' 165 | | 'VALIDATION_ERROR' 166 | | 'STORAGE_ERROR'; 167 | 168 | export interface ErrorDetails { 169 | code: ErrorType; 170 | message: string; 171 | provider?: string; 172 | statusCode?: number; 173 | originalError?: string; 174 | timestamp: string; 175 | requestId?: string; 176 | } 177 | -------------------------------------------------------------------------------- /src/providers/factory/ProviderFactory.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Provider factory for creating and managing vision providers 3 | */ 4 | 5 | import type { VisionProvider } from 
'../../types/Providers.js'; 6 | import type { Config } from '../../types/Config.js'; 7 | import { GeminiProvider } from '../gemini/GeminiProvider.js'; 8 | import { VertexAIProvider } from '../vertexai/VertexAIProvider.js'; 9 | import { ConfigurationError, ProviderError } from '../../types/Errors.js'; 10 | import { ConfigService } from '../../services/ConfigService.js'; 11 | 12 | export class VisionProviderFactory { 13 | private static providers = new Map VisionProvider>(); 14 | 15 | /** 16 | * Register a new provider with the factory 17 | */ 18 | static registerProvider(name: string, factory: () => VisionProvider): void { 19 | this.providers.set(name, factory); 20 | } 21 | 22 | /** 23 | * Create a provider instance based on configuration 24 | */ 25 | static createProvider( 26 | config: Config, 27 | type: 'image' | 'video' 28 | ): VisionProvider { 29 | const providerName = 30 | (config as any)[`${type.toUpperCase()}_PROVIDER`] || 'google'; 31 | const factory = this.providers.get(providerName); 32 | 33 | if (!factory) { 34 | throw new ConfigurationError(`Unsupported provider: ${providerName}`); 35 | } 36 | 37 | try { 38 | const provider = factory(); 39 | 40 | // Set default models if not configured 41 | const defaultModels = this.getDefaultModels(providerName); 42 | provider.setModel( 43 | config.IMAGE_MODEL || defaultModels.image, 44 | config.VIDEO_MODEL || defaultModels.video 45 | ); 46 | 47 | return provider; 48 | } catch (error) { 49 | throw new ProviderError( 50 | `Failed to create ${providerName} provider: ${error instanceof Error ? error.message : String(error)}`, 51 | providerName, 52 | error instanceof Error ? 
error : new Error(String(error)) 53 | ); 54 | } 55 | } 56 | 57 | /** 58 | * Get list of supported providers 59 | */ 60 | static getSupportedProviders(): string[] { 61 | return Array.from(this.providers.keys()); 62 | } 63 | 64 | /** 65 | * Check if a provider is supported 66 | */ 67 | static isProviderSupported(providerName: string): boolean { 68 | return this.providers.has(providerName); 69 | } 70 | 71 | /** 72 | * Get provider-specific configuration validation rules 73 | */ 74 | static getProviderConfigRequirements(providerName: string): string[] { 75 | switch (providerName) { 76 | case 'google': 77 | return ['GEMINI_API_KEY']; 78 | 79 | case 'vertex_ai': 80 | return ['VERTEX_CREDENTIALS', 'VERTEX_PROJECT_ID', 'VERTEX_LOCATION']; 81 | 82 | default: 83 | return []; 84 | } 85 | } 86 | 87 | /** 88 | * Validate provider configuration 89 | */ 90 | static validateProviderConfig(config: Config, providerName: string): void { 91 | const requirements = this.getProviderConfigRequirements(providerName); 92 | const missing = requirements.filter(req => { 93 | const value = config[req as keyof Config]; 94 | return !value || (typeof value === 'string' && value.trim() === ''); 95 | }); 96 | 97 | if (missing.length > 0) { 98 | throw new ConfigurationError( 99 | `Missing required configuration for ${providerName}: ${missing.join(', ')}` 100 | ); 101 | } 102 | } 103 | 104 | /** 105 | * Get default models for each provider 106 | */ 107 | private static getDefaultModels(providerName: string): { 108 | image: string; 109 | video: string; 110 | } { 111 | const config = ConfigService.getInstance().getConfig(); 112 | 113 | // Resolution priority: 114 | // 1. IMAGE_MODEL/VIDEO_MODEL (if set) - User's explicit choice 115 | // 2. 
Hardcoded defaults - Last resort 116 | return { 117 | image: config.IMAGE_MODEL || 'gemini-2.5-flash-lite', 118 | video: config.VIDEO_MODEL || 'gemini-2.5-flash', 119 | }; 120 | } 121 | 122 | /** 123 | * Initialize default providers 124 | */ 125 | static initializeDefaultProviders(): void { 126 | // Register Gemini API provider 127 | this.registerProvider('google', () => { 128 | const geminiConfig = ConfigService.getInstance().getGeminiConfig(); 129 | return new GeminiProvider(geminiConfig); 130 | }); 131 | 132 | // Register Vertex AI provider 133 | this.registerProvider('vertex_ai', () => { 134 | const vertexConfig = ConfigService.getInstance().getVertexAIConfig(); 135 | return new VertexAIProvider(vertexConfig); 136 | }); 137 | } 138 | 139 | /** 140 | * Create provider with configuration validation 141 | */ 142 | static createProviderWithValidation( 143 | config: Config, 144 | type: 'image' | 'video' 145 | ): VisionProvider { 146 | const providerName = 147 | (config as any)[`${type.toUpperCase()}_PROVIDER`] || 'google'; 148 | 149 | // Validate configuration before creating provider 150 | this.validateProviderConfig(config, providerName); 151 | 152 | // Create the provider through factory (which now properly initializes with config) 153 | const factory = this.providers.get(providerName); 154 | if (!factory) { 155 | throw new ConfigurationError(`Unsupported provider: ${providerName}`); 156 | } 157 | 158 | try { 159 | const provider = factory(); 160 | 161 | // Set default models if not configured 162 | const defaultModels = this.getDefaultModels(providerName); 163 | provider.setModel( 164 | config.IMAGE_MODEL || defaultModels.image, 165 | config.VIDEO_MODEL || defaultModels.video 166 | ); 167 | 168 | return provider; 169 | } catch (error) { 170 | throw new ProviderError( 171 | `Failed to create ${providerName} provider: ${error instanceof Error ? error.message : String(error)}`, 172 | providerName, 173 | error instanceof Error ? 
error : new Error(String(error)) 174 | ); 175 | } 176 | } 177 | } 178 | 179 | // Initialize default providers when module is loaded 180 | VisionProviderFactory.initializeDefaultProviders(); 181 | -------------------------------------------------------------------------------- /src/types/Providers.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Provider interface and types 3 | */ 4 | 5 | import type { 6 | AnalysisOptions, 7 | AnalysisResult, 8 | UploadedFile, 9 | FileReference, 10 | HealthStatus, 11 | ProviderCapabilities, 12 | ModelCapabilities, 13 | ProviderInfo, 14 | } from './Analysis.js'; 15 | 16 | export type { 17 | AnalysisOptions, 18 | AnalysisResult, 19 | UploadedFile, 20 | FileReference, 21 | HealthStatus, 22 | ProviderCapabilities, 23 | ModelCapabilities, 24 | ProviderInfo, 25 | } from './Analysis.js'; 26 | 27 | export interface VisionProvider { 28 | // Core capabilities 29 | analyzeImage( 30 | imageSource: string, 31 | prompt: string, 32 | options?: AnalysisOptions 33 | ): Promise; 34 | analyzeVideo( 35 | videoSource: string, 36 | prompt: string, 37 | options?: AnalysisOptions 38 | ): Promise; 39 | compareImages( 40 | imageSources: string[], 41 | prompt: string, 42 | options?: AnalysisOptions 43 | ): Promise; 44 | 45 | // File operations 46 | uploadFile( 47 | buffer: Buffer, 48 | filename: string, 49 | mimeType: string 50 | ): Promise; 51 | downloadFile(fileId: string): Promise; 52 | deleteFile(fileId: string): Promise; 53 | 54 | // Model configuration 55 | setModel(imageModel: string, videoModel: string): void; 56 | getImageModel(): string; 57 | getVideoModel(): string; 58 | 59 | // Provider information 60 | getSupportedFormats(): ProviderCapabilities; 61 | getModelCapabilities(): ModelCapabilities; 62 | getProviderInfo(): ProviderInfo; 63 | 64 | // Health and status 65 | healthCheck(): Promise; 66 | supportsVideo(): boolean; 67 | } 68 | 69 | export interface FileUploadStrategy { 70 | uploadFile( 71 | 
buffer: Buffer, 72 | filename: string, 73 | mimeType: string 74 | ): Promise; 75 | getFileForAnalysis(uploadedFile: UploadedFile): Promise; 76 | cleanup?(fileId: string): Promise; 77 | } 78 | 79 | export interface ProviderConfig { 80 | name: string; 81 | type: 'image' | 'video'; 82 | models: { 83 | image: string; 84 | video: string; 85 | }; 86 | credentials: Record; 87 | options: Record; 88 | } 89 | 90 | export interface ProviderFactory { 91 | createProvider(config: ProviderConfig): VisionProvider; 92 | getSupportedProviders(): string[]; 93 | registerProvider(name: string, factory: () => VisionProvider): void; 94 | } 95 | 96 | // Gemini-specific types 97 | export interface GeminiConfig { 98 | apiKey: string; 99 | baseUrl: string; 100 | imageModel: string; 101 | videoModel: string; 102 | } 103 | 104 | export interface GeminiFileMetadata { 105 | name: string; 106 | displayName: string; 107 | mimeType: string; 108 | sizeBytes: string; 109 | createTime: string; 110 | updateTime: string; 111 | expirationTime: string; 112 | sha256Hash: string; 113 | uri: string; 114 | state: 'PROCESSING' | 'ACTIVE' | 'FAILED'; 115 | } 116 | 117 | export interface GeminiGenerateContentRequest { 118 | contents: GeminiContent[]; 119 | generationConfig?: GeminiGenerationConfig; 120 | safetySettings?: GeminiSafetySetting[]; 121 | } 122 | 123 | export interface GeminiContent { 124 | role: 'user' | 'model'; 125 | parts: GeminiPart[]; 126 | } 127 | 128 | export type GeminiPart = 129 | | { text: string } 130 | | { inlineData: { mimeType: string; data: string } } 131 | | { fileData: { mimeType: string; fileUri: string } }; 132 | 133 | export interface GeminiGenerationConfig { 134 | temperature?: number; 135 | topP?: number; 136 | topK?: number; 137 | maxOutputTokens?: number; 138 | candidateCount?: number; 139 | stopSequences?: string[]; 140 | } 141 | 142 | export interface GeminiSafetySetting { 143 | category: string; 144 | threshold: string; 145 | } 146 | 147 | export interface 
GeminiGenerateContentResponse { 148 | candidates: GeminiCandidate[]; 149 | usageMetadata: { 150 | promptTokenCount: number; 151 | candidatesTokenCount: number; 152 | totalTokenCount: number; 153 | }; 154 | } 155 | 156 | export interface GeminiCandidate { 157 | content: GeminiContent; 158 | finishReason: string; 159 | index: number; 160 | safetyRatings?: GeminiSafetyRating[]; 161 | } 162 | 163 | export interface GeminiSafetyRating { 164 | category: string; 165 | probability: string; 166 | blocked: boolean; 167 | } 168 | 169 | // Vertex AI-specific types 170 | export interface VertexAIConfig { 171 | projectId: string; 172 | location: string; 173 | endpoint: string; 174 | credentials?: string; 175 | imageModel: string; 176 | videoModel: string; 177 | } 178 | 179 | export interface VertexAIGenerateContentRequest { 180 | contents: VertexAIContent[]; 181 | generationConfig?: VertexAIGenerationConfig; 182 | safetySettings?: VertexAISafetySetting[]; 183 | } 184 | 185 | export interface VertexAIContent { 186 | role: 'user' | 'model'; 187 | parts: VertexAIPart[]; 188 | } 189 | 190 | export type VertexAIPart = 191 | | { text: string } 192 | | { inlineData: { mimeType: string; data: string } } 193 | | { fileData: { mimeType: string; fileUri: string } }; 194 | 195 | export interface VertexAIGenerationConfig { 196 | temperature?: number; 197 | topP?: number; 198 | topK?: number; 199 | maxOutputTokens?: number; 200 | candidateCount?: number; 201 | stopSequences?: string[]; 202 | } 203 | 204 | export interface VertexAISafetySetting { 205 | category: string; 206 | threshold: string; 207 | } 208 | 209 | export interface VertexAIGenerateContentResponse { 210 | candidates: VertexAICandidate[]; 211 | usageMetadata: { 212 | promptTokenCount: number; 213 | candidatesTokenCount: number; 214 | totalTokenCount: number; 215 | }; 216 | modelVersion?: string; 217 | } 218 | 219 | export interface VertexAICandidate { 220 | content: VertexAIContent; 221 | finishReason: string; 222 | index: 
number; 223 | safetyRatings?: VertexAISafetyRating[]; 224 | } 225 | 226 | export interface VertexAISafetyRating { 227 | category: string; 228 | probability: string; 229 | blocked: boolean; 230 | } 231 | -------------------------------------------------------------------------------- /src/storage/gcs/GCSStorage.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Google Cloud Storage provider implementation using native GCS SDK 3 | */ 4 | 5 | import { Storage, Bucket, File } from '@google-cloud/storage'; 6 | import { StorageProvider, StorageFile } from '../../types/Storage.js'; 7 | import { StorageError } from '../../types/Errors.js'; 8 | import type { GCSConfig } from '../../types/Config.js'; 9 | 10 | export class GCSStorageProvider implements StorageProvider { 11 | private storage: Storage; 12 | private bucket: Bucket; 13 | private config: GCSConfig; 14 | 15 | constructor(config: GCSConfig) { 16 | this.config = config; 17 | 18 | // Initialize native GCS Storage client 19 | this.storage = new Storage({ 20 | projectId: config.projectId, 21 | keyFilename: config.credentials, 22 | }); 23 | 24 | this.bucket = this.storage.bucket(config.bucketName); 25 | } 26 | 27 | async uploadFile( 28 | buffer: Buffer, 29 | filename: string, 30 | mimeType: string 31 | ): Promise { 32 | try { 33 | const key = this.generateKey(filename); 34 | const file: File = this.bucket.file(key); 35 | 36 | await file.save(buffer, { 37 | contentType: mimeType, 38 | metadata: { 39 | cacheControl: 'public, max-age=31536000', // 1 year 40 | }, 41 | }); 42 | 43 | // Get the file metadata 44 | const [metadata] = await file.getMetadata(); 45 | 46 | return { 47 | id: key, 48 | filename, 49 | mimeType, 50 | size: buffer.length, 51 | url: `gs://${this.config.bucketName}/${key}`, 52 | lastModified: metadata.updated || new Date().toISOString(), 53 | etag: metadata.etag || this.generateETag(buffer), 54 | }; 55 | } catch (error) { 56 | throw new StorageError( 57 | 
`Failed to upload file to GCS: ${error instanceof Error ? error.message : String(error)}`, 58 | 'gcs', 59 | error instanceof Error ? error : new Error(String(error)) 60 | ); 61 | } 62 | } 63 | 64 | async downloadFile(fileId: string): Promise { 65 | try { 66 | const file: File = this.bucket.file(fileId); 67 | const [buffer] = await file.download(); 68 | return buffer; 69 | } catch (error) { 70 | throw new StorageError( 71 | `Failed to download file from GCS: ${error instanceof Error ? error.message : String(error)}`, 72 | 'gcs', 73 | error instanceof Error ? error : new Error(String(error)) 74 | ); 75 | } 76 | } 77 | 78 | async deleteFile(fileId: string): Promise { 79 | try { 80 | const file: File = this.bucket.file(fileId); 81 | await file.delete(); 82 | } catch (error) { 83 | // Don't throw error if file doesn't exist (404) 84 | if (error instanceof Error && error.message.includes('No such object')) { 85 | return; 86 | } 87 | throw new StorageError( 88 | `Failed to delete file from GCS: ${error instanceof Error ? error.message : String(error)}`, 89 | 'gcs', 90 | error instanceof Error ? error : new Error(String(error)) 91 | ); 92 | } 93 | } 94 | 95 | async getPublicUrl(fileId: string): Promise { 96 | // Return GCS URI format (gs://bucket/path) 97 | return `gs://${this.config.bucketName}/${fileId}`; 98 | } 99 | 100 | async getSignedUrl(fileId: string, expiresIn: number): Promise { 101 | try { 102 | const file: File = this.bucket.file(fileId); 103 | const [signedUrl] = await file.getSignedUrl({ 104 | version: 'v4', 105 | action: 'read', 106 | expires: Date.now() + expiresIn * 1000, // Convert seconds to milliseconds 107 | }); 108 | return signedUrl; 109 | } catch (error) { 110 | throw new StorageError( 111 | `Failed to generate signed URL: ${error instanceof Error ? error.message : String(error)}`, 112 | 'gcs', 113 | error instanceof Error ? 
error : new Error(String(error)) 114 | ); 115 | } 116 | } 117 | 118 | async listFiles(prefix?: string): Promise { 119 | try { 120 | const [files] = await this.bucket.getFiles({ prefix }); 121 | const storageFiles: StorageFile[] = []; 122 | 123 | for (const file of files) { 124 | const [metadata] = await file.getMetadata(); 125 | const filename = file.name.split('/').pop() || file.name; 126 | 127 | storageFiles.push({ 128 | id: file.name, 129 | filename, 130 | mimeType: metadata.contentType || 'application/octet-stream', 131 | size: parseInt(String(metadata.size || '0'), 10), 132 | url: `gs://${this.config.bucketName}/${file.name}`, 133 | lastModified: metadata.updated || new Date().toISOString(), 134 | etag: metadata.etag || '', 135 | }); 136 | } 137 | 138 | return storageFiles; 139 | } catch (error) { 140 | throw new StorageError( 141 | `Failed to list files from GCS: ${error instanceof Error ? error.message : String(error)}`, 142 | 'gcs', 143 | error instanceof Error ? error : new Error(String(error)) 144 | ); 145 | } 146 | } 147 | 148 | // Private helper methods 149 | 150 | private generateKey(filename: string): string { 151 | // Generate a unique key with timestamp and random UUID 152 | const timestamp = new Date().toISOString().split('T')[0]; // YYYY-MM-DD 153 | const randomId = Math.random().toString(36).substring(2, 15); 154 | const extension = filename.includes('.') 155 | ? 
`.${filename.split('.').pop()}` 156 | : ''; 157 | 158 | // Organize files by date and type 159 | const type = this.getFileType(filename); 160 | return `${type}/${timestamp}/${randomId}${extension}`; 161 | } 162 | 163 | private getFileType(filename: string): string { 164 | const extension = filename.split('.').pop()?.toLowerCase(); 165 | 166 | if ( 167 | [ 168 | 'jpg', 169 | 'jpeg', 170 | 'png', 171 | 'gif', 172 | 'bmp', 173 | 'webp', 174 | 'tiff', 175 | 'heic', 176 | 'heif', 177 | ].includes(extension || '') 178 | ) { 179 | return 'images'; 180 | } else if ( 181 | ['mp4', 'mov', 'avi', 'mkv', 'webm', 'flv', 'wmv', '3gp', 'm4v'].includes( 182 | extension || '' 183 | ) 184 | ) { 185 | return 'videos'; 186 | } else { 187 | return 'files'; 188 | } 189 | } 190 | 191 | private generateETag(buffer: Buffer): string { 192 | // Simple hash generation - in production, you might want to use a proper hash function 193 | const hash = Buffer.from(buffer).toString('base64').substring(0, 32); 194 | return `"${hash}"`; 195 | } 196 | 197 | // Configuration methods 198 | 199 | public getBucket(): string { 200 | return this.config.bucketName; 201 | } 202 | 203 | public getProjectId(): string { 204 | return this.config.projectId; 205 | } 206 | 207 | public getRegion(): string { 208 | return this.config.region; 209 | } 210 | } 211 | -------------------------------------------------------------------------------- /src/utils/imageAnnotator.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Image annotation utilities using Sharp 3 | * Based on gemini_object_detection.js annotation logic 4 | */ 5 | 6 | import sharp from 'sharp'; 7 | import fs from 'fs/promises'; 8 | import path from 'path'; 9 | import os from 'os'; 10 | import crypto from 'crypto'; 11 | import type { DetectedObject } from '../types/ObjectDetection.js'; 12 | 13 | export interface AnnotationOptions { 14 | lineColor?: string; 15 | lineWidth?: number; 16 | labelColor?: string; 17 | 
labelHeight?: number; 18 | // REMOVED: pointColor and pointRadius (corner circles no longer used) 19 | } 20 | 21 | export class ImageAnnotator { 22 | private options: Required; 23 | 24 | constructor(options: AnnotationOptions = {}) { 25 | this.options = { 26 | lineColor: options.lineColor || 'red', 27 | lineWidth: options.lineWidth || 3, 28 | labelColor: options.labelColor || 'red', 29 | labelHeight: options.labelHeight || 20, 30 | // REMOVED: pointColor and pointRadius initialization (corner circles no longer used) 31 | }; 32 | } 33 | 34 | /** 35 | * Draw bounding boxes and labels on image using Sharp 36 | * Adapted from gemini_object_detection.js drawAnnotations function 37 | */ 38 | async drawAnnotations( 39 | imageBuffer: Buffer, 40 | detections: DetectedObject[], 41 | imageWidth: number, 42 | imageHeight: number 43 | ): Promise { 44 | let sharpImage = sharp(imageBuffer); 45 | 46 | // Prepare overlays for bounding boxes, corners, and text 47 | const overlays = []; 48 | 49 | for (let idx = 0; idx < detections.length; idx++) { 50 | const detection = detections[idx]; 51 | 52 | // Use normalized_box_2d coordinates (converted to pixels) 53 | if ( 54 | !detection.normalized_box_2d || 55 | detection.normalized_box_2d.length !== 4 56 | ) { 57 | console.warn( 58 | `[ImageAnnotator] Skipping detection without valid normalized_box_2d: ${detection.object}` 59 | ); 60 | continue; 61 | } 62 | 63 | // Convert normalized coordinates to pixels 64 | const [normY1, normX1, normY2, normX2] = detection.normalized_box_2d; 65 | const x1 = Math.round((normX1 / 1000) * imageWidth); // left edge 66 | const y1 = Math.round((normY1 / 1000) * imageHeight); // top edge 67 | const x2 = Math.round((normX2 / 1000) * imageWidth); // right edge 68 | const y2 = Math.round((normY2 / 1000) * imageHeight); // bottom edge 69 | 70 | // Create rectangle overlay (bounding box) 71 | const rectOverlay = await this.createRectangleOverlay( 72 | imageWidth, 73 | imageHeight, 74 | x1, 75 | y1, 76 | x2, 77 | 
y2 78 | ); 79 | overlays.push({ 80 | input: rectOverlay, 81 | left: 0, 82 | top: 0, 83 | }); 84 | 85 | // REMOVED: Corner circles (were causing "double boxing" visual clutter) 86 | // The 4 corner circles made it appear like buttons were boxed multiple times 87 | 88 | // Create text label 89 | const text = `${detection.object} - ${detection.label}`; 90 | const textOverlay = await this.createTextOverlay(text); 91 | 92 | // Calculate text position (above bounding box) 93 | const textX = x1; 94 | const textY = Math.max(y1 - this.options.labelHeight - 4, 0); 95 | 96 | overlays.push({ 97 | input: textOverlay, 98 | left: textX, 99 | top: textY, 100 | }); 101 | } 102 | 103 | // Composite all overlays onto the original image 104 | if (overlays.length > 0) { 105 | sharpImage = sharpImage.composite(overlays); 106 | } 107 | 108 | return sharpImage.toBuffer(); 109 | } 110 | 111 | /** 112 | * Create a rectangle overlay using SVG 113 | * Adapted from gemini_object_detection.js createRectangleOverlay function 114 | */ 115 | private async createRectangleOverlay( 116 | imageWidth: number, 117 | imageHeight: number, 118 | x1: number, 119 | y1: number, 120 | x2: number, 121 | y3: number 122 | ): Promise { 123 | const rectWidth = x2 - x1; 124 | const rectHeight = y3 - y1; 125 | 126 | const rectangleBuffer = await sharp({ 127 | create: { 128 | width: imageWidth, 129 | height: imageHeight, 130 | channels: 4, 131 | background: { r: 0, g: 0, b: 0, alpha: 0 }, 132 | }, 133 | }) 134 | .composite([ 135 | { 136 | input: Buffer.from( 137 | ` 138 | 140 | ` 141 | ), 142 | left: 0, 143 | top: 0, 144 | }, 145 | ]) 146 | .png() 147 | .toBuffer(); 148 | 149 | return rectangleBuffer; 150 | } 151 | 152 | // REMOVED: createCircleOverlay method (no longer needed since corner circles removed) 153 | 154 | /** 155 | * Create a text overlay using Sharp 156 | * Adapted from gemini_object_detection.js createTextOverlay function 157 | */ 158 | private async createTextOverlay(text: string): Promise { 159 | // 
Try to find a system font, fallback to default 160 | const fontPaths = [ 161 | 'C:/Windows/Fonts/arial.ttf', // Windows 162 | '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', // Linux 163 | '/System/Library/Fonts/Arial.ttf', // macOS 164 | ]; 165 | 166 | let fontfile = undefined; 167 | for (const fontPath of fontPaths) { 168 | try { 169 | await fs.access(fontPath); 170 | fontfile = fontPath; 171 | break; 172 | } catch { 173 | // Font not found, try next 174 | } 175 | } 176 | 177 | // Calculate approximate text width (rough estimate: 8 pixels per character) 178 | // Add padding for better visual appearance 179 | const estimatedWidth = Math.max(text.length * 8 + 8, 50); // Minimum 50px width 180 | 181 | const textBuffer = await sharp({ 182 | create: { 183 | width: estimatedWidth, 184 | height: this.options.labelHeight, 185 | channels: 4, 186 | background: { r: 255, g: 0, b: 0, alpha: 1 }, // Red background 187 | }, 188 | }) 189 | .composite([ 190 | { 191 | input: { 192 | text: { 193 | text: text, 194 | font: fontfile ? 
'Arial' : 'sans-serif', 195 | fontfile: fontfile, 196 | rgba: true, 197 | align: 'left', 198 | }, 199 | }, 200 | left: 2, 201 | top: 2, 202 | }, 203 | ]) 204 | .png() 205 | .toBuffer(); 206 | 207 | return textBuffer; 208 | } 209 | 210 | /** 211 | * Save buffer to a temporary file with unique name 212 | */ 213 | async saveToTempFile( 214 | buffer: Buffer, 215 | extension: string = 'png' 216 | ): Promise { 217 | const tempDir = os.tmpdir(); 218 | const randomId = crypto.randomBytes(8).toString('hex'); 219 | const filename = `ai-vision-mcp-${randomId}.${extension}`; 220 | const tempPath = path.join(tempDir, filename); 221 | 222 | await fs.writeFile(tempPath, buffer); 223 | return tempPath; 224 | } 225 | 226 | /** 227 | * Save buffer to temp file, or gracefully skip if permission denied 228 | */ 229 | async saveToTempFileOrSkip( 230 | buffer: Buffer, 231 | extension: string = 'png' 232 | ): Promise<{ path: string; method: 'temp_file' } | { method: 'skipped' }> { 233 | try { 234 | const tempDir = os.tmpdir(); 235 | const randomId = crypto.randomBytes(8).toString('hex'); 236 | const filename = `ai-vision-mcp-${randomId}.${extension}`; 237 | const tempPath = path.join(tempDir, filename); 238 | 239 | await fs.writeFile(tempPath, buffer); 240 | return { path: tempPath, method: 'temp_file' }; 241 | } catch (error) { 242 | const errorMessage = error instanceof Error ? error.message : String(error); 243 | console.warn(`[ImageAnnotator] Skipped temp file creation due to permission error: ${errorMessage}. 
Detection results will be returned without file output.`); 244 | return { method: 'skipped' }; 245 | } 246 | } 247 | 248 | /** 249 | * Save buffer to explicit path, ensuring directory exists 250 | */ 251 | async saveToExplicitPath(filePath: string, buffer: Buffer): Promise { 252 | const outputDir = path.dirname(filePath); 253 | await fs.mkdir(outputDir, { recursive: true }); 254 | await fs.writeFile(filePath, buffer); 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/utils/retry.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Retry logic and error handling utilities 3 | */ 4 | 5 | import { RateLimitExceededError, NetworkError } from '../types/Errors.js'; 6 | 7 | export interface RetryOptions { 8 | maxRetries?: number; 9 | baseDelay?: number; 10 | maxDelay?: number; 11 | backoffMultiplier?: number; 12 | jitter?: boolean; 13 | retryableErrors?: string[]; 14 | onRetry?: (attempt: number, error: Error) => void; 15 | } 16 | 17 | export interface RetryResult { 18 | result: T; 19 | attempts: number; 20 | totalDelay: number; 21 | } 22 | 23 | const DEFAULT_RETRY_OPTIONS: Required = { 24 | maxRetries: 3, 25 | baseDelay: 1000, 26 | maxDelay: 30000, 27 | backoffMultiplier: 2, 28 | jitter: true, 29 | retryableErrors: [ 30 | 'RATE_LIMIT_EXCEEDED', 31 | 'NETWORK_ERROR', 32 | 'ECONNRESET', 33 | 'ECONNREFUSED', 34 | 'ETIMEDOUT', 35 | 'ENOTFOUND', 36 | ], 37 | onRetry: () => {}, 38 | }; 39 | 40 | export class RetryHandler { 41 | /** 42 | * Execute an operation with retry logic 43 | */ 44 | static async withRetry( 45 | operation: () => Promise, 46 | options: RetryOptions = {} 47 | ): Promise> { 48 | const opts = { ...DEFAULT_RETRY_OPTIONS, ...options }; 49 | let lastError: Error; 50 | let totalDelay = 0; 51 | 52 | for (let attempt = 0; attempt <= opts.maxRetries; attempt++) { 53 | try { 54 | const result = await operation(); 55 | return { 56 | result, 57 | attempts: attempt + 
1, 58 | totalDelay, 59 | }; 60 | } catch (error) { 61 | lastError = error instanceof Error ? error : new Error(String(error)); 62 | 63 | // Don't retry on the last attempt 64 | if (attempt === opts.maxRetries || !this.shouldRetry(lastError, opts)) { 65 | throw lastError; 66 | } 67 | 68 | const delay = this.calculateDelay(attempt, opts); 69 | totalDelay += delay; 70 | 71 | opts.onRetry(attempt + 1, lastError); 72 | await this.sleep(delay); 73 | } 74 | } 75 | 76 | throw lastError!; 77 | } 78 | 79 | /** 80 | * Execute an operation with exponential backoff 81 | */ 82 | static async withExponentialBackoff( 83 | operation: () => Promise, 84 | options: RetryOptions = {} 85 | ): Promise { 86 | const result = await this.withRetry(operation, { 87 | ...options, 88 | backoffMultiplier: 2, 89 | jitter: true, 90 | }); 91 | return result.result; 92 | } 93 | 94 | /** 95 | * Execute an operation with linear backoff 96 | */ 97 | static async withLinearBackoff( 98 | operation: () => Promise, 99 | options: RetryOptions = {} 100 | ): Promise { 101 | const result = await this.withRetry(operation, { 102 | ...options, 103 | backoffMultiplier: 1, 104 | jitter: false, 105 | }); 106 | return result.result; 107 | } 108 | 109 | /** 110 | * Determine if an error is retryable 111 | */ 112 | private static shouldRetry( 113 | error: Error, 114 | options: Required 115 | ): boolean { 116 | // Check if it's a VisionError with a retryable code 117 | if ('code' in error && typeof error.code === 'string') { 118 | return options.retryableErrors.includes(error.code); 119 | } 120 | 121 | // Check if it's a RateLimitExceededError 122 | if (error instanceof RateLimitExceededError) { 123 | return true; 124 | } 125 | 126 | // Check if it's a NetworkError 127 | if (error instanceof NetworkError) { 128 | return true; 129 | } 130 | 131 | // Check error message for common network-related errors 132 | const message = error.message.toLowerCase(); 133 | const networkErrorPatterns = [ 134 | 'network error', 135 | 
'connection refused', 136 | 'connection reset', 137 | 'name resolution failed', 138 | ]; 139 | 140 | return networkErrorPatterns.some(pattern => message.includes(pattern)); 141 | } 142 | 143 | /** 144 | * Calculate delay before next retry 145 | */ 146 | private static calculateDelay( 147 | attempt: number, 148 | options: Required 149 | ): number { 150 | let delay = 151 | options.baseDelay * Math.pow(options.backoffMultiplier, attempt); 152 | 153 | // Apply jitter if enabled 154 | if (options.jitter) { 155 | delay = delay * (0.5 + Math.random() * 0.5); 156 | } 157 | 158 | // Ensure delay doesn't exceed maximum 159 | return Math.min(delay, options.maxDelay); 160 | } 161 | 162 | /** 163 | * Sleep for the specified number of milliseconds 164 | */ 165 | private static async sleep(ms: number): Promise { 166 | await new Promise(resolve => setTimeout(resolve, ms)); 167 | } 168 | 169 | /** 170 | * Create a retryable version of a function 171 | */ 172 | static wrap Promise>( 173 | fn: T, 174 | options: RetryOptions = {} 175 | ): T { 176 | return (async (...args: Parameters): Promise> => { 177 | const result = await this.withRetry(() => fn(...args), options); 178 | return result.result; 179 | }) as T; 180 | } 181 | } 182 | 183 | /** 184 | * Circuit breaker pattern for handling repeated failures 185 | */ 186 | export interface CircuitBreakerOptions { 187 | failureThreshold?: number; 188 | recoveryDelay?: number; 189 | monitoringPeriod?: number; 190 | onStateChange?: (state: 'CLOSED' | 'OPEN' | 'HALF_OPEN') => void; 191 | } 192 | 193 | export class CircuitBreaker { 194 | private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED'; 195 | private failureCount = 0; 196 | private lastFailureTime = 0; 197 | private successCount = 0; 198 | 199 | constructor(private options: CircuitBreakerOptions = {}) {} 200 | 201 | async execute(operation: () => Promise): Promise { 202 | const opts = { 203 | failureThreshold: 5, 204 | recoveryDelay: 60000, 205 | monitoringPeriod: 10000, 206 | 
...this.options, 207 | }; 208 | 209 | if (this.state === 'OPEN') { 210 | if (Date.now() - this.lastFailureTime > opts.recoveryDelay) { 211 | this.setState('HALF_OPEN'); 212 | } else { 213 | throw new Error('Circuit breaker is OPEN'); 214 | } 215 | } 216 | 217 | try { 218 | const result = await operation(); 219 | this.onSuccess(); 220 | return result; 221 | } catch (error) { 222 | this.onFailure(); 223 | throw error; 224 | } 225 | } 226 | 227 | private onSuccess(): void { 228 | this.failureCount = 0; 229 | this.successCount++; 230 | 231 | if (this.state === 'HALF_OPEN') { 232 | this.setState('CLOSED'); 233 | } 234 | } 235 | 236 | private onFailure(): void { 237 | this.failureCount++; 238 | this.lastFailureTime = Date.now(); 239 | 240 | if (this.failureCount >= this.options.failureThreshold!) { 241 | this.setState('OPEN'); 242 | } 243 | } 244 | 245 | private setState(state: 'CLOSED' | 'OPEN' | 'HALF_OPEN'): void { 246 | this.state = state; 247 | this.options.onStateChange?.(state); 248 | } 249 | 250 | getState(): 'CLOSED' | 'OPEN' | 'HALF_OPEN' { 251 | return this.state; 252 | } 253 | 254 | getFailureCount(): number { 255 | return this.failureCount; 256 | } 257 | 258 | getSuccessCount(): number { 259 | return this.successCount; 260 | } 261 | 262 | reset(): void { 263 | this.state = 'CLOSED'; 264 | this.failureCount = 0; 265 | this.successCount = 0; 266 | } 267 | } 268 | 269 | /** 270 | * Bulkhead pattern for limiting concurrent operations 271 | */ 272 | export class Bulkhead { 273 | private running = 0; 274 | private queue: Array<{ 275 | resolve: (value: any) => void; 276 | reject: (reason: any) => void; 277 | operation: () => Promise; 278 | }> = []; 279 | 280 | constructor(private maxConcurrency: number) {} 281 | 282 | async execute(operation: () => Promise): Promise { 283 | return new Promise((resolve, reject) => { 284 | this.queue.push({ resolve, reject, operation }); 285 | this.process(); 286 | }); 287 | } 288 | 289 | private async process(): Promise { 290 | if 
(this.running >= this.maxConcurrency || this.queue.length === 0) { 291 | return; 292 | } 293 | 294 | this.running++; 295 | const { resolve, reject, operation } = this.queue.shift()!; 296 | 297 | try { 298 | const result = await operation(); 299 | resolve(result); 300 | } catch (error) { 301 | reject(error); 302 | } finally { 303 | this.running--; 304 | this.process(); 305 | } 306 | } 307 | 308 | getRunningCount(): number { 309 | return this.running; 310 | } 311 | 312 | getQueueLength(): number { 313 | return this.queue.length; 314 | } 315 | } 316 | -------------------------------------------------------------------------------- /docs/llm_logs/percentage-scale-text-summary-plan.md: -------------------------------------------------------------------------------- 1 | # Plan: Hybrid Element Identification + Spatial Reference for Object Detection 2 | 3 | **Date**: 2025-01-10 (Updated: 2025-01-10) 4 | **Author**: Claude Code 5 | **Issue**: Improve object detection output by combining CSS selector automation guidance with minimal spatial reference coordinates 6 | 7 | ## Problem Statement 8 | 9 | The current `detect_objects_in_image` output has two competing needs: 10 | 11 | 1. **Web Automation**: Requires CSS selectors, semantic targeting for reliable automation 12 | 2. **Spatial Awareness**: Needs position reference for layout understanding and debugging 13 | 3. **Information Overload**: Current verbose coordinate explanations obscure actionable guidance 14 | 4. 
**Mixed Priorities**: Unclear whether to focus on automation or spatial reference 15 | 16 | ## Solution: Hybrid Approach - CSS Selectors + Minimal Coordinates 17 | 18 | ### Core Approach 19 | - **Primary Focus**: CSS selectors and semantic targeting (automation best practices) 20 | - **Secondary Reference**: Concise percentage coordinates (spatial awareness) 21 | - **Information Hierarchy**: 1-2 lines per element, automation guidance first 22 | - **Clear Separation**: Distinct purposes for different information types 23 | 24 | ### Rationale for Hybrid Approach 25 | 26 | **Why CSS Selectors (Primary):** 27 | - **Automation Reliability**: Survives layout changes, responsive design, and viewport differences 28 | - **Industry Standard**: Aligns with modern web automation best practices (Playwright, Puppeteer) 29 | - **Maintenance Friendly**: Less brittle than coordinate-based approaches 30 | - **Semantic Accuracy**: Targets elements by their actual purpose and attributes 31 | 32 | **Why Minimal Coordinates (Secondary):** 33 | - **Spatial Reference**: Quick position orientation without overwhelming detail 34 | - **Visual Debugging**: Helps developers locate elements in complex layouts 35 | - **Design Validation**: Useful for QA and design review workflows 36 | - **Non-Automation Use Cases**: Screenshots annotation, layout documentation 37 | 38 | **Why Concise Format (1-2 Lines):** 39 | - **Reduced Cognitive Load**: Focus on essential information only 40 | - **Faster Scanning**: Developers can quickly find what they need 41 | - **Clear Hierarchy**: Automation guidance prominently featured 42 | - **Information Efficiency**: No redundant explanations or verbose calculations 43 | 44 | ### Benefits 45 | - ✅ **Automation-First Design**: CSS selectors prominently featured for web automation 46 | - ✅ **Spatial Context Preserved**: Percentage coordinates provide layout reference 47 | - ✅ **Information Efficiency**: Concise 1-2 line format reduces cognitive load 48 | - ✅ 
**Multi-Use Case Support**: Serves automation, debugging, and documentation needs 49 | - ✅ **Industry Alignment**: Follows modern web development and testing practices 50 | - ✅ **Reduced Verbosity**: Eliminates redundant coordinate calculations and explanations 51 | 52 | ## Implementation Plan 53 | 54 | ### Phase 1: Update Summary Generator (2 days) 55 | 56 | **File Changes Required**: 57 | - Modify `src/tools/detect_objects_in_image.ts` to generate hybrid summary format 58 | - Implement concise 2-line element description (automation + position) 59 | - Remove verbose coordinate explanations and automation guidance 60 | - Focus on CSS selector recommendations as primary automation method 61 | 62 | **Key Functions**: 63 | ```typescript 64 | function generateDetectionSummary( 65 | detections: DetectedObject[], 66 | imageMetadata: ImageMetadata, 67 | model: string, 68 | provider: string 69 | ): string { 70 | // Generate concise element summaries (1-2 lines each) 71 | // Line 1: CSS selector recommendations 72 | // Line 2: Percentage position reference 73 | // Remove verbose coordinate calculations 74 | } 75 | 76 | function suggestCSSSelectors(detection: DetectedObject): string[] { 77 | // Recommend CSS selectors based on element type and label 78 | // Return 2-3 most likely selectors 79 | } 80 | 81 | function formatPositionReference(detection: DetectedObject): string { 82 | // Return concise position: "78.5% across, 26.7% down (13% × 4.5% size)" 83 | } 84 | ``` 85 | 86 | ### Phase 2: Testing and Validation (1 day) 87 | - Test with various UI element types (buttons, inputs, links, etc.) 
88 | - Validate CSS selector recommendations are accurate and useful 89 | - Ensure percentage coordinates provide meaningful spatial reference 90 | - Verify 2-line format provides sufficient information without overload 91 | 92 | ### Phase 3: Documentation Update (1 day) 93 | - Update README.md with new hybrid summary examples 94 | - Document the automation-first approach with spatial reference 95 | - Remove verbose coordinate automation examples 96 | 97 | ## Technical Specifications 98 | 99 | ### Input Data (Simplified) 100 | ```typescript 101 | interface DetectedObject { 102 | object: string; // "button", "input" 103 | label: string; // "Submit button" 104 | normalized_box_2d: [number, number, number, number]; // [ymin, xmin, ymax, xmax] 0-1000 105 | // REMOVED: coordinates object (redundant, confusing) 106 | } 107 | ``` 108 | 109 | ### Output Enhancement (New + Simplified) 110 | ```typescript 111 | interface ObjectDetectionResponse { 112 | detections: DetectedObject[]; 113 | image_metadata: { width: number, height: number, size_bytes: number, format: string }; 114 | 115 | // NEW: Human-readable summary (primary coordinate interface) 116 | summary: string; // Percentage-based descriptions with automation code 117 | 118 | // REMOVED: Complex nested coordinate structures 119 | } 120 | ``` 121 | 122 | ### Coordinate Conversion Logic 123 | ```typescript 124 | // Convert normalized coordinates to percentages AND calculate pixel details 125 | const [ymin, xmin, ymax, xmax] = detection.normalized_box_2d; 126 | const { width: imageWidth, height: imageHeight } = imageMetadata; 127 | 128 | // Percentage calculations 129 | const percentageBox = { 130 | top: ymin / 10, // Convert 245 → 24.5% 131 | left: xmin / 10, // Convert 720 → 72.0% 132 | bottom: ymax / 10, // Convert 290 → 29.0% 133 | right: xmax / 10 // Convert 850 → 85.0% 134 | }; 135 | 136 | const centerX = (xmin + xmax) / 2 / 10; // 78.5% 137 | const centerY = (ymin + ymax) / 2 / 10; // 26.7% 138 | const 
widthPercent = (xmax - xmin) / 10; // 13.0%
const heightPercent = (ymax - ymin) / 10; // 4.5%

// Pixel calculations (derived from normalized + image dimensions)
const pixelBox = {
  x: Math.round((xmin / 1000) * imageWidth), // 1382
  y: Math.round((ymin / 1000) * imageHeight), // 265
  width: Math.round(((xmax - xmin) / 1000) * imageWidth), // 250
  height: Math.round(((ymax - ymin) / 1000) * imageHeight) // 49
};
```

## Sample Output

### Updated Text Summary (Hybrid Approach - CSS Selectors + Minimal Coordinates)
```
🖼️ IMAGE ANALYSIS COMPLETE

📏 Source Image: 1920×1080 pixels (PNG, 2.0MB)
🤖 Detection Model: gemini-2.5-flash-lite (google)
📊 Elements Found: 3 interactive elements detected

⚠️ FOR WEB AUTOMATION:
- **RECOMMENDED**: Use CSS selectors for reliable automation (primary approach)
- **REFERENCE ONLY**: Percentage coordinates for spatial context (secondary reference)
- **AVOID**: Direct coordinate-based clicking for automation

## 🔍 DETECTED ELEMENTS:

### 1. button - Submit Button
- **Automation**: `button[type="submit"]` or `button:has-text("Submit")`
- **Position**: 78.5% across, 26.7% down (13% × 4.5% size)

### 2. input - Email Address Field
- **Automation**: `input[type="email"]` or `input[name="email"]`
- **Position**: 40.0% across, 20.0% down (40% × 4% size)

### 3. 
select - Country Dropdown 176 | - **Automation**: `select[name="country"]` or `#country-select` 177 | - **Position**: 25.0% across, 45.0% down (35% × 3% size) 178 | ``` 179 | 180 | ## Risk Assessment 181 | 182 | ### Low Risk 183 | - **Backward Compatibility**: No changes to existing data structure 184 | - **Performance Impact**: Minimal text generation overhead (~1ms) 185 | - **Implementation Simplicity**: Straightforward 2-line format per element 186 | 187 | ### Medium Risk 188 | - **CSS Selector Accuracy**: Need to ensure recommended selectors are practical 189 | - **Balance Maintenance**: Keep automation focus while providing useful spatial reference 190 | 191 | ### High Value 192 | - **Automation-First Approach**: Prominently features industry-standard CSS selectors 193 | - **Information Efficiency**: Concise format reduces cognitive load 194 | - **Multi-Purpose Utility**: Serves both automation and spatial reference needs 195 | - **Developer Experience**: Clear hierarchy and actionable guidance 196 | 197 | ## Success Metrics 198 | 199 | 1. **Automation Adoption**: Increased use of CSS selectors over coordinate-based automation 200 | 2. **Information Efficiency**: Positive feedback on concise 2-line element format 201 | 3. **Dual-Purpose Utility**: Usage for both automation and spatial reference scenarios 202 | 4. 
**Developer Satisfaction**: Preference for automation-first approach with spatial context 203 | 204 | ## Future Enhancements 205 | 206 | If the hybrid approach proves successful, consider: 207 | - **Context-Aware HTML Elements**: Use specific HTML element names (button, input, select) when analyzing web pages 208 | - **Smart Selector Intelligence**: AI-powered CSS selector suggestions based on visual analysis and common patterns 209 | - **Accessibility Integration**: Include ARIA attributes and accessibility hints in selector recommendations 210 | - **Framework-Specific Guidance**: Tailored selector recommendations for different testing frameworks (Playwright, Puppeteer, Cypress) 211 | 212 | ## Implementation Timeline 213 | 214 | - **Day 1**: Update summary generator for hybrid format (CSS selectors + minimal coordinates) 215 | - **Day 2**: Implement 2-line element descriptions and remove verbose explanations 216 | - **Day 3**: Integration testing with various element types and validation 217 | - **Day 4**: Documentation updates and example refinements 218 | 219 | ## Conclusion 220 | 221 | This hybrid approach represents the optimal balance between automation best practices and spatial reference utility. By prominently featuring CSS selectors while maintaining concise percentage coordinates, the tool provides: 222 | 223 | 1. **Actionable Automation Guidance**: Industry-standard CSS selectors for reliable web automation 224 | 2. **Spatial Context**: Quick position reference without overwhelming detail 225 | 3. **Information Efficiency**: Concise 2-line format that reduces cognitive load 226 | 4. 
**Multi-Purpose Value**: Serves automation, debugging, and documentation workflows 227 | 228 | The enhancement transforms the object detection output from a coordinate-focused tool into an automation-first solution that still preserves essential spatial awareness - making it valuable for real-world web development and testing workflows while promoting robust, maintainable automation practices. -------------------------------------------------------------------------------- /src/utils/validation.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Zod schemas for data validation 3 | */ 4 | 5 | import { z } from 'zod'; 6 | import type { Config } from '../types/Config.js'; 7 | import type { AnalysisOptions } from '../types/Analysis.js'; 8 | import { 9 | FUNCTION_NAMES, 10 | type FunctionName, 11 | } from '../constants/FunctionNames.js'; 12 | 13 | // Provider selection schemas 14 | const ProviderSchema = z.enum(['google', 'vertex_ai']); 15 | 16 | // Log level schema 17 | const LogLevelSchema = z.enum(['info', 'debug', 'warn', 'error']); 18 | 19 | // Node environment schema 20 | const NodeEnvSchema = z.enum(['development', 'production']); 21 | 22 | // File format schemas 23 | const ImageFormatSchema = z.enum([ 24 | 'png', 25 | 'jpg', 26 | 'jpeg', 27 | 'webp', 28 | 'gif', 29 | 'bmp', 30 | 'tiff', 31 | ]); 32 | 33 | const VideoFormatSchema = z.enum([ 34 | 'mp4', 35 | 'mov', 36 | 'avi', 37 | 'mkv', 38 | 'webm', 39 | 'flv', 40 | 'wmv', 41 | '3gp', 42 | ]); 43 | 44 | // Configuration schema 45 | export const ConfigSchema = z.object({ 46 | IMAGE_PROVIDER: ProviderSchema.optional().default('google'), 47 | VIDEO_PROVIDER: ProviderSchema.optional().default('google'), 48 | 49 | // Model configuration 50 | IMAGE_MODEL: z.string().min(1).optional(), 51 | VIDEO_MODEL: z.string().min(1).optional(), 52 | 53 | // Function-specific model configuration 54 | ANALYZE_IMAGE_MODEL: z.string().min(1).optional(), 55 | COMPARE_IMAGES_MODEL: 
z.string().min(1).optional(), 56 | DETECT_OBJECTS_IN_IMAGE_MODEL: z.string().min(1).optional(), 57 | ANALYZE_VIDEO_MODEL: z.string().min(1).optional(), 58 | 59 | // Gemini API configuration 60 | GEMINI_API_KEY: z.string().min(1).optional(), 61 | GEMINI_BASE_URL: z 62 | .string() 63 | .url() 64 | .optional() 65 | .default('https://generativelanguage.googleapis.com'), 66 | 67 | // Vertex AI configuration 68 | VERTEX_CREDENTIALS: z.string().min(1).optional(), 69 | VERTEX_PROJECT_ID: z.string().min(1).optional(), 70 | VERTEX_LOCATION: z.string().min(1).optional().default('us-central1'), 71 | VERTEX_ENDPOINT: z 72 | .string() 73 | .url() 74 | .optional() 75 | .default('https://aiplatform.googleapis.com'), 76 | 77 | // Google Cloud Storage configuration (for Vertex AI file storage) 78 | GCS_BUCKET_NAME: z.string().min(1).optional(), 79 | GCS_PROJECT_ID: z.string().min(1).optional(), // Auto-derived from credentials 80 | GCS_CREDENTIALS: z.string().min(1).optional(), // Defaults to VERTEX_CREDENTIALS 81 | GCS_REGION: z.string().min(1).optional().default('us-central1'), 82 | 83 | // Universal API parameters 84 | TEMPERATURE: z.coerce.number().min(0).max(2).optional().default(0.8), 85 | TOP_P: z.coerce.number().min(0).max(1).optional().default(0.95), 86 | TOP_K: z.coerce.number().int().min(1).max(100).optional().default(30), 87 | MAX_TOKENS: z.coerce.number().int().min(1).max(8192).optional().default(1000), 88 | 89 | // Task-specific API parameters 90 | TEMPERATURE_FOR_IMAGE: z.number().min(0).max(2).optional(), 91 | TOP_P_FOR_IMAGE: z.number().min(0).max(1).optional(), 92 | TOP_K_FOR_IMAGE: z.number().int().positive().optional(), 93 | MAX_TOKENS_FOR_IMAGE: z.number().int().positive().optional(), 94 | TEMPERATURE_FOR_VIDEO: z.number().min(0).max(2).optional(), 95 | TOP_P_FOR_VIDEO: z.number().min(0).max(1).optional(), 96 | TOP_K_FOR_VIDEO: z.number().int().positive().optional(), 97 | MAX_TOKENS_FOR_VIDEO: z.number().int().positive().optional(), 98 | 99 | // 
Function-specific API parameters 100 | TEMPERATURE_FOR_ANALYZE_IMAGE: z.number().min(0).max(2).optional(), 101 | TOP_P_FOR_ANALYZE_IMAGE: z.number().min(0).max(1).optional(), 102 | TOP_K_FOR_ANALYZE_IMAGE: z.number().int().positive().optional(), 103 | MAX_TOKENS_FOR_ANALYZE_IMAGE: z.number().int().positive().optional(), 104 | TEMPERATURE_FOR_COMPARE_IMAGES: z.number().min(0).max(2).optional(), 105 | TOP_P_FOR_COMPARE_IMAGES: z.number().min(0).max(1).optional(), 106 | TOP_K_FOR_COMPARE_IMAGES: z.number().int().positive().optional(), 107 | MAX_TOKENS_FOR_COMPARE_IMAGES: z.number().int().positive().optional(), 108 | TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE: z 109 | .number() 110 | .min(0) 111 | .max(2) 112 | .optional() 113 | .default(0), 114 | TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE: z 115 | .number() 116 | .min(0) 117 | .max(1) 118 | .optional() 119 | .default(0.95), 120 | TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE: z 121 | .number() 122 | .int() 123 | .positive() 124 | .optional() 125 | .default(30), 126 | MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE: z 127 | .number() 128 | .int() 129 | .positive() 130 | .optional() 131 | .default(8192), 132 | TEMPERATURE_FOR_ANALYZE_VIDEO: z.number().min(0).max(2).optional(), 133 | TOP_P_FOR_ANALYZE_VIDEO: z.number().min(0).max(1).optional(), 134 | TOP_K_FOR_ANALYZE_VIDEO: z.number().int().positive().optional(), 135 | MAX_TOKENS_FOR_ANALYZE_VIDEO: z.number().int().positive().optional(), 136 | 137 | // File processing configuration 138 | MAX_IMAGE_SIZE: z.coerce 139 | .number() 140 | .int() 141 | .positive() 142 | .optional() 143 | .default(20 * 1024 * 1024), // 20MB 144 | MAX_VIDEO_SIZE: z.coerce 145 | .number() 146 | .int() 147 | .positive() 148 | .optional() 149 | .default(2 * 1024 * 1024 * 1024), // 2GB 150 | ALLOWED_IMAGE_FORMATS: z 151 | .array(ImageFormatSchema) 152 | .optional() 153 | .default(['png', 'jpg', 'jpeg', 'webp', 'gif', 'bmp', 'tiff']), 154 | ALLOWED_VIDEO_FORMATS: z 155 | .array(VideoFormatSchema) 156 | .optional() 157 | 
.default(['mp4', 'mov', 'avi', 'mkv', 'webm', 'flv', 'wmv', '3gp']), 158 | MAX_VIDEO_DURATION: z.coerce 159 | .number() 160 | .int() 161 | .positive() 162 | .optional() 163 | .default(3600), // 1 hour 164 | MAX_IMAGES_FOR_COMPARISON: z.coerce 165 | .number() 166 | .int() 167 | .positive() 168 | .optional() 169 | .default(4), // Maximum 4 images for comparison 170 | 171 | // File upload configuration 172 | GEMINI_FILES_API_THRESHOLD: z.coerce 173 | .number() 174 | .int() 175 | .positive() 176 | .optional() 177 | .default(10 * 1024 * 1024), // 10MB 178 | VERTEX_AI_FILES_API_THRESHOLD: z.coerce.number().int().optional().default(0), // Vertex AI requires external storage for all files 179 | 180 | // Logging configuration 181 | LOG_LEVEL: LogLevelSchema.optional().default('info'), 182 | 183 | // Development configuration 184 | NODE_ENV: NodeEnvSchema.optional().default('development'), 185 | }); 186 | 187 | // Analysis options schema 188 | export const AnalysisOptionsSchema = z.object({ 189 | temperature: z.number().min(0).max(2).optional(), 190 | topP: z.number().min(0).max(1).optional(), 191 | maxTokens: z.number().int().positive().optional(), 192 | stopSequences: z.array(z.string()).optional(), 193 | taskType: z.enum(['image', 'video']).optional(), 194 | functionName: z 195 | .enum(Object.values(FUNCTION_NAMES) as [FunctionName, ...FunctionName[]]) 196 | .optional(), 197 | }); 198 | 199 | // MCP tool argument schemas 200 | export const AnalyzeImageArgsSchema = z.object({ 201 | imageSource: z.string().min(1, 'Image source is required'), 202 | prompt: z.string().min(1, 'Prompt is required'), 203 | options: AnalysisOptionsSchema.optional(), 204 | }); 205 | 206 | export const AnalyzeVideoArgsSchema = z.object({ 207 | videoSource: z.string().min(1, 'Video source is required'), 208 | prompt: z.string().min(1, 'Prompt is required'), 209 | options: AnalysisOptionsSchema.optional(), 210 | }); 211 | 212 | // File validation schemas 213 | export const FileValidationSchema = 
z.object({ 214 | filename: z.string().min(1, 'Filename is required'), 215 | mimeType: z.string().min(1, 'MIME type is required'), 216 | size: z.number().int().nonnegative('File size must be non-negative'), 217 | }); 218 | 219 | // URL validation schema 220 | export const UrlSchema = z.string().url('Invalid URL format'); 221 | 222 | // Base64 validation schema 223 | export const Base64Schema = z 224 | .string() 225 | .regex(/^data:image\/[a-zA-Z]+;base64,/, 'Invalid base64 image format'); 226 | 227 | // Model name validation 228 | export const ModelNameSchema = z.string().min(1, 'Model name is required'); 229 | 230 | // Provider info validation 231 | export const ProviderInfoSchema = z.object({ 232 | name: z.string().min(1, 'Provider name is required'), 233 | type: z.enum(['image', 'video']), 234 | models: z.object({ 235 | image: ModelNameSchema, 236 | video: ModelNameSchema, 237 | }), 238 | credentials: z.record(z.string()), 239 | options: z.record(z.unknown()).optional(), 240 | }); 241 | 242 | // Health status validation 243 | export const HealthStatusSchema = z.object({ 244 | status: z.enum(['healthy', 'unhealthy', 'degraded']), 245 | message: z.string().optional(), 246 | lastCheck: z.string().datetime(), 247 | responseTime: z.number().nonnegative().optional(), 248 | }); 249 | 250 | // Usage metadata validation 251 | export const UsageMetadataSchema = z.object({ 252 | promptTokenCount: z.number().int().nonnegative(), 253 | candidatesTokenCount: z.number().int().nonnegative(), 254 | totalTokenCount: z.number().int().nonnegative(), 255 | }); 256 | 257 | // Analysis result validation 258 | export const AnalysisResultSchema = z.object({ 259 | text: z.string(), 260 | metadata: z.object({ 261 | model: z.string(), 262 | provider: z.string(), 263 | usage: UsageMetadataSchema.optional(), 264 | processingTime: z.number().nonnegative().optional(), 265 | fileType: z.string().optional(), 266 | fileSize: z.number().int().nonnegative().optional(), 267 | }), 268 | }); 269 | 270 
| // File reference validation 271 | export const FileReferenceSchema = z.union([ 272 | z.object({ 273 | type: z.literal('file_uri'), 274 | uri: z.string().min(1), 275 | mimeType: z.string().min(1), 276 | }), 277 | z.object({ 278 | type: z.literal('public_url'), 279 | url: z.string().url(), 280 | mimeType: z.string().min(1), 281 | }), 282 | z.object({ 283 | type: z.literal('base64'), 284 | data: z.string().min(1), 285 | mimeType: z.string().min(1), 286 | }), 287 | ]); 288 | 289 | // Validation functions 290 | export const validateConfig = (config: unknown): Config => { 291 | return ConfigSchema.parse(config); 292 | }; 293 | 294 | export const validateAnalysisOptions = (options: unknown): AnalysisOptions => { 295 | return AnalysisOptionsSchema.parse(options); 296 | }; 297 | 298 | export const validateAnalyzeImageArgs = (args: unknown) => { 299 | return AnalyzeImageArgsSchema.parse(args); 300 | }; 301 | 302 | export const validateAnalyzeVideoArgs = (args: unknown) => { 303 | return AnalyzeVideoArgsSchema.parse(args); 304 | }; 305 | 306 | export const validateFile = (file: unknown) => { 307 | return FileValidationSchema.parse(file); 308 | }; 309 | 310 | export const validateUrl = (url: unknown): string => { 311 | return UrlSchema.parse(url); 312 | }; 313 | 314 | export const validateBase64 = (base64: unknown): string => { 315 | return Base64Schema.parse(base64); 316 | }; 317 | 318 | export const validateModelName = (model: unknown): string => { 319 | return ModelNameSchema.parse(model); 320 | }; 321 | 322 | export const validateHealthStatus = (status: unknown) => { 323 | return HealthStatusSchema.parse(status); 324 | }; 325 | 326 | export const validateAnalysisResult = (result: unknown) => { 327 | return AnalysisResultSchema.parse(result); 328 | }; 329 | 330 | export const validateFileReference = (reference: unknown) => { 331 | return FileReferenceSchema.parse(reference); 332 | }; 333 | 334 | // Type guards 335 | export const isValidUrl = (value: unknown): value is 
string => { 336 | return UrlSchema.safeParse(value).success; 337 | }; 338 | 339 | export const isValidBase64 = (value: unknown): value is string => { 340 | return Base64Schema.safeParse(value).success; 341 | }; 342 | 343 | export const isImageFormat = (mimeType: string): boolean => { 344 | return mimeType.startsWith('image/'); 345 | }; 346 | 347 | export const isVideoFormat = (mimeType: string): boolean => { 348 | return mimeType.startsWith('video/'); 349 | }; 350 | 351 | export const isSupportedImageFormat = ( 352 | mimeType: string, 353 | supportedFormats: string[] 354 | ): boolean => { 355 | const extension = mimeType.split('/')[1]; 356 | return supportedFormats.includes(extension); 357 | }; 358 | 359 | export const isSupportedVideoFormat = ( 360 | mimeType: string, 361 | supportedFormats: string[] 362 | ): boolean => { 363 | const extension = mimeType.split('/')[1]; 364 | return supportedFormats.includes(extension); 365 | }; 366 | 367 | // Error formatting for validation errors 368 | export const formatZodError = (error: z.ZodError): string => { 369 | const errorMessages = error.errors.map(err => { 370 | const path = err.path.join('.'); 371 | return `${path}: ${err.message}`; 372 | }); 373 | return `Validation failed: ${errorMessages.join(', ')}`; 374 | }; 375 | -------------------------------------------------------------------------------- /docs/environment-variable-guide.md: -------------------------------------------------------------------------------- 1 | # Environment Variable Configuration Guide 2 | 3 | This guide provides comprehensive documentation for all environment variables used by the AI Vision MCP Server. 
4 | 5 | ## Table of Contents 6 | 7 | - [Quick Setup](#quick-setup) 8 | - [Configuration Priority](#configuration-priority) 9 | - [Environment Variables Reference](#environment-variables-reference) 10 | - [Configuration Examples](#configuration-examples) 11 | - [Troubleshooting](#troubleshooting) 12 | 13 | ## Quick Setup 14 | 15 | ### Google AI Studio Provider (Recommended for simplicity) 16 | 17 | ```bash 18 | export IMAGE_PROVIDER="google" 19 | export VIDEO_PROVIDER="google" 20 | export GEMINI_API_KEY="your-gemini-api-key" 21 | ``` 22 | 23 | ### Vertex AI Provider (Recommended for production) 24 | 25 | ```bash 26 | export IMAGE_PROVIDER="vertex_ai" 27 | export VIDEO_PROVIDER="vertex_ai" 28 | export VERTEX_CREDENTIALS="/path/to/service-account.json" 29 | export GCS_BUCKET_NAME="your-gcs-bucket" 30 | ``` 31 | 32 | Get your Google AI Studio API key [here](https://aistudio.google.com/app/api-keys). 33 | 34 | For Vertex AI setup, see [Vertex AI Setup Guide](provider/vertex-ai-setup-guide.md). 35 | 36 | ## Configuration Priority 37 | 38 | The AI Vision MCP Server uses a hierarchical configuration system where more specific settings override general ones. 39 | 40 | ### AI Parameters Priority (Highest to Lowest) 41 | 42 | 1. **LLM-assigned values** - Parameters passed directly in tool calls (e.g., `{"temperature": 0.1}`) 43 | 2. **Function-specific variables** - `TEMPERATURE_FOR_ANALYZE_IMAGE`, `MAX_TOKENS_FOR_COMPARE_IMAGES`, etc. 44 | 3. **Task-specific variables** - `TEMPERATURE_FOR_IMAGE`, `MAX_TOKENS_FOR_VIDEO`, etc. 45 | 4. **Universal variables** - `TEMPERATURE`, `MAX_TOKENS`, etc. 46 | 5. **System defaults** - Built-in fallback values 47 | 48 | ### Model Selection Priority (Highest to Lowest) 49 | 50 | 1. **Function-specific models** - `ANALYZE_IMAGE_MODEL`, `COMPARE_IMAGES_MODEL`, `ANALYZE_VIDEO_MODEL` 51 | 2. **Task-specific models** - `IMAGE_MODEL`, `VIDEO_MODEL` 52 | 3. 
**System defaults** - Built-in fallback models (`gemini-2.5-flash-lite`, `gemini-2.5-flash`) 53 | 54 | ## Environment Variables Reference 55 | 56 | ### Provider Selection 57 | 58 | | Variable | Required | Description | Default | 59 | |----------|-----------|-------------|---------| 60 | | `IMAGE_PROVIDER` | Yes | Provider for image analysis | `google` or `vertex_ai` | 61 | | `VIDEO_PROVIDER` | Yes | Provider for video analysis | `google` or `vertex_ai` | 62 | 63 | ### Model Selection 64 | 65 | | Variable | Required | Description | Default | 66 | |----------|-----------|-------------|---------| 67 | | `IMAGE_MODEL` | No | Model for image analysis | `gemini-2.5-flash-lite` | 68 | | `VIDEO_MODEL` | No | Model for video analysis | `gemini-2.5-flash` | 69 | 70 | ### Function-specific Model Selection 71 | 72 | | Variable | Required | Description | Default | 73 | |----------|-----------|-------------|---------| 74 | | `ANALYZE_IMAGE_MODEL` | No | Model for analyze_image function | Uses `IMAGE_MODEL` | 75 | | `COMPARE_IMAGES_MODEL` | No | Model for compare_images function | Uses `IMAGE_MODEL` | 76 | | `DETECT_OBJECTS_IN_IMAGE_MODEL` | No | Model for detect_objects_in_image function | Uses `IMAGE_MODEL` | 77 | | `ANALYZE_VIDEO_MODEL` | No | Model for analyze_video function | Uses `VIDEO_MODEL` | 78 | 79 | ### Google Gemini API Configuration 80 | 81 | | Variable | Required | Description | Default | 82 | |----------|-----------|-------------|---------| 83 | | `GEMINI_API_KEY` | Yes if using `google` provider | Google Gemini API key | Required for Gemini | 84 | | `GEMINI_BASE_URL` | No | Gemini API base URL | `https://generativelanguage.googleapis.com` | 85 | 86 | ### Vertex AI Configuration 87 | 88 | | Variable | Required | Description | Default | 89 | |----------|-----------|-------------|---------| 90 | | `VERTEX_CREDENTIALS` | Yes if using `vertex_ai` provider | Path to GCP service account JSON | Required for Vertex AI | 91 | | `VERTEX_PROJECT_ID` | Auto | Google Cloud 
project ID | Auto-derived from credentials | 92 | | `VERTEX_LOCATION` | No | Vertex AI region | `us-central1` | 93 | | `VERTEX_ENDPOINT` | No | Vertex AI endpoint URL | `https://aiplatform.googleapis.com` | 94 | 95 | ### Google Cloud Storage (Required for Vertex AI) 96 | 97 | | Variable | Required | Description | Default | 98 | |----------|-----------|-------------|---------| 99 | | `GCS_BUCKET_NAME` | Yes if using `vertex_ai` provider | GCS bucket name for Vertex AI uploads | Required for Vertex AI | 100 | | `GCS_CREDENTIALS` | No | Path to GCS credentials | Defaults to `VERTEX_CREDENTIALS` | 101 | | `GCS_PROJECT_ID` | No | GCS project ID | Auto-derived from `VERTEX_CREDENTIALS` | 102 | | `GCS_REGION` | No | GCS region | Defaults to `VERTEX_LOCATION` | 103 | 104 | ### Universal API Parameters 105 | 106 | | Variable | Required | Description | Range | Default | 107 | |----------|-----------|-------------|-------|---------| 108 | | `TEMPERATURE` | No | AI response temperature | 0.0–2.0 | `0.8` | 109 | | `TOP_P` | No | Top-p sampling parameter | 0.0–1.0 | `0.95` | 110 | | `TOP_K` | No | Top-k sampling parameter | 1–100 | `30` | 111 | | `MAX_TOKENS` | No | Maximum tokens for analysis | 1–8192 | `1000` | 112 | 113 | ### Task-specific API Parameters 114 | 115 | | Variable | Required | Description | Range | Default | 116 | |----------|-----------|-------------|-------|---------| 117 | | `TEMPERATURE_FOR_IMAGE` | No | Image-specific temperature | 0.0–2.0 | Uses `TEMPERATURE` | 118 | | `TOP_P_FOR_IMAGE` | No | Image-specific top-p | 0.0–1.0 | Uses `TOP_P` | 119 | | `TOP_K_FOR_IMAGE` | No | Image-specific top-k | 1–100 | Uses `TOP_K` | 120 | | `MAX_TOKENS_FOR_IMAGE` | No | Maximum tokens for image analysis | 1–8192 | Uses `MAX_TOKENS` | 121 | | `TEMPERATURE_FOR_VIDEO` | No | Video-specific temperature | 0.0–2.0 | Uses `TEMPERATURE` | 122 | | `TOP_P_FOR_VIDEO` | No | Video-specific top-p | 0.0–1.0 | Uses `TOP_P` | 123 | | `TOP_K_FOR_VIDEO` | No | Video-specific top-k | 1–100 
| Uses `TOP_K` | 124 | | `MAX_TOKENS_FOR_VIDEO` | No | Maximum tokens for video analysis | 1–8192 | Uses `MAX_TOKENS` | 125 | 126 | ### Function-specific API Parameters 127 | 128 | | Variable | Required | Description | Range | Default | 129 | |----------|-----------|-------------|-------|---------| 130 | | `TEMPERATURE_FOR_ANALYZE_IMAGE` | No | Temperature for analyze_image | 0.0–2.0 | Uses `TEMPERATURE_FOR_IMAGE` | 131 | | `TOP_P_FOR_ANALYZE_IMAGE` | No | Top-p for analyze_image | 0.0–1.0 | Uses `TOP_P_FOR_IMAGE` | 132 | | `TOP_K_FOR_ANALYZE_IMAGE` | No | Top-k for analyze_image | 1–100 | Uses `TOP_K_FOR_IMAGE` | 133 | | `MAX_TOKENS_FOR_ANALYZE_IMAGE` | No | Max tokens for analyze_image | 1–8192 | Uses `MAX_TOKENS_FOR_IMAGE` | 134 | | `TEMPERATURE_FOR_COMPARE_IMAGES` | No | Temperature for compare_images | 0.0–2.0 | Uses `TEMPERATURE_FOR_IMAGE` | 135 | | `TOP_P_FOR_COMPARE_IMAGES` | No | Top-p for compare_images | 0.0–1.0 | Uses `TOP_P_FOR_IMAGE` | 136 | | `TOP_K_FOR_COMPARE_IMAGES` | No | Top-k for compare_images | 1–100 | Uses `TOP_K_FOR_IMAGE` | 137 | | `MAX_TOKENS_FOR_COMPARE_IMAGES` | No | Max tokens for compare_images | 1–8192 | Uses `MAX_TOKENS_FOR_IMAGE` | 138 | | `TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Temperature for object detection | 0.0–2.0 | `0.0` | 139 | | `TOP_P_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Top-p for object detection | 0.0–1.0 | `0.95` | 140 | | `TOP_K_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Top-k for object detection | 1–100 | `30` | 141 | | `MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE` | No | Max tokens for object detection | 1–8192 | `8192` | 142 | | `TEMPERATURE_FOR_ANALYZE_VIDEO` | No | Temperature for analyze_video | 0.0–2.0 | Uses `TEMPERATURE_FOR_VIDEO` | 143 | | `TOP_P_FOR_ANALYZE_VIDEO` | No | Top-p for analyze_video | 0.0–1.0 | Uses `TOP_P_FOR_VIDEO` | 144 | | `TOP_K_FOR_ANALYZE_VIDEO` | No | Top-k for analyze_video | 1–100 | Uses `TOP_K_FOR_VIDEO` | 145 | | `MAX_TOKENS_FOR_ANALYZE_VIDEO` | No | Max tokens for analyze_video | 
1–8192 | Uses `MAX_TOKENS_FOR_VIDEO` | 146 | 147 | ### File Processing Configuration 148 | 149 | | Variable | Required | Description | Default | 150 | |----------|-----------|-------------|---------| 151 | | `MAX_IMAGE_SIZE` | No | Maximum image size in bytes | `20971520` (20 MB) | 152 | | `MAX_VIDEO_SIZE` | No | Maximum video size in bytes | `2147483648` (2 GB) | 153 | | `MAX_VIDEO_DURATION` | No | Maximum video duration (seconds) | `3600` (1 hour) | 154 | | `MAX_IMAGES_FOR_COMPARISON` | No | Maximum images for comparison | `4` | 155 | | `ALLOWED_IMAGE_FORMATS` | No | Comma-separated image formats | `png,jpg,jpeg,webp,gif,bmp,tiff` | 156 | | `ALLOWED_VIDEO_FORMATS` | No | Comma-separated video formats | `mp4,mov,avi,mkv,webm,flv,wmv,3gp` | 157 | 158 | ### File Upload Configuration 159 | 160 | | Variable | Required | Description | Default | 161 | |----------|-----------|-------------|---------| 162 | | `GEMINI_FILES_API_THRESHOLD` | No | Size threshold for Gemini Files API | `10485760` (10 MB) | 163 | | `VERTEX_AI_FILES_API_THRESHOLD` | No | Size threshold for Vertex AI uploads | `0` | 164 | 165 | ### Development Configuration 166 | 167 | | Variable | Required | Description | Default | 168 | |----------|-----------|-------------|---------| 169 | | `LOG_LEVEL` | No | Logging level | `info` | 170 | | `NODE_ENV` | No | Environment mode | `development` | 171 | 172 | ## Configuration Examples 173 | 174 | ### Basic Development Setup 175 | 176 | ```bash 177 | # Provider selection 178 | export IMAGE_PROVIDER="google" 179 | export VIDEO_PROVIDER="google" 180 | export GEMINI_API_KEY="your-gemini-api-key" 181 | 182 | # Basic configuration 183 | export TEMPERATURE=0.7 184 | export MAX_TOKENS=1500 185 | export LOG_LEVEL="debug" 186 | ``` 187 | 188 | ### Production Setup with Vertex AI 189 | 190 | ```bash 191 | # Provider selection 192 | export IMAGE_PROVIDER="vertex_ai" 193 | export VIDEO_PROVIDER="vertex_ai" 194 | export VERTEX_CREDENTIALS="/path/to/service-account.json" 195 | 
export GCS_BUCKET_NAME="your-production-bucket" 196 | 197 | # Production models 198 | export IMAGE_MODEL="gemini-2.5-flash" 199 | export VIDEO_MODEL="gemini-2.5-flash-pro" 200 | 201 | # Production parameters 202 | export TEMPERATURE=0.3 203 | export MAX_TOKENS=2000 204 | export NODE_ENV="production" 205 | export LOG_LEVEL="info" 206 | ``` 207 | 208 | ### Function-specific Optimization 209 | 210 | ```bash 211 | # General settings 212 | export IMAGE_PROVIDER="google" 213 | export GEMINI_API_KEY="your-gemini-api-key" 214 | 215 | # Function-specific optimizations 216 | export TEMPERATURE_FOR_ANALYZE_IMAGE=0.1 # Precise image analysis 217 | export TEMPERATURE_FOR_COMPARE_IMAGES=0.5 # More creative comparisons 218 | export TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE=0.0 # Deterministic detection 219 | export MAX_TOKENS_FOR_DETECT_OBJECTS_IN_IMAGE=8192 # High token limit for JSON 220 | 221 | # Function-specific models 222 | export ANALYZE_IMAGE_MODEL="gemini-2.5-flash-lite" 223 | export COMPARE_IMAGES_MODEL="gemini-2.5-flash" 224 | export DETECT_OBJECTS_IN_IMAGE_MODEL="gemini-2.5-flash-lite" 225 | ``` 226 | 227 | ### Mixed Provider Setup 228 | 229 | ```bash 230 | # Use Gemini for images (simpler, faster) 231 | export IMAGE_PROVIDER="google" 232 | export GEMINI_API_KEY="your-gemini-api-key" 233 | 234 | # Use Vertex AI for videos (enterprise features) 235 | export VIDEO_PROVIDER="vertex_ai" 236 | export VERTEX_CREDENTIALS="/path/to/service-account.json" 237 | export GCS_BUCKET_NAME="your-mixed-provider-bucket" 238 | 239 | # Task-specific parameters 240 | export TEMPERATURE_FOR_IMAGE=0.2 241 | export TEMPERATURE_FOR_VIDEO=0.5 242 | export MAX_TOKENS_FOR_IMAGE=1000 243 | export MAX_TOKENS_FOR_VIDEO=2000 244 | ``` 245 | 246 | ## File Upload Strategy Configuration 247 | 248 | ### Gemini Provider Strategy 249 | 250 | ```bash 251 | export GEMINI_FILES_API_THRESHOLD=10485760 # 10MB 252 | 253 | # Files ≤ 10MB: Use inline base64 data 254 | # Files > 10MB: Use Gemini Files API 255 | ``` 
256 | 257 | ### Vertex AI Provider Strategy 258 | 259 | ```bash 260 | export VERTEX_AI_FILES_API_THRESHOLD=0 # All files use GCS 261 | 262 | # All files: Upload to Google Cloud Storage and use gs:// URIs 263 | ``` 264 | 265 | ## Troubleshooting 266 | 267 | ### Common Issues 268 | 269 | 1. **Missing API Key Error** 270 | ``` 271 | Error: Missing required configuration for google: GEMINI_API_KEY 272 | ``` 273 | **Solution**: Set `GEMINI_API_KEY` environment variable when using `google` provider 274 | 275 | 2. **Vertex AI Authentication Error** 276 | ``` 277 | Error: Missing required configuration for vertex_ai: VERTEX_CREDENTIALS 278 | ``` 279 | **Solution**: Set `VERTEX_CREDENTIALS` and `GCS_BUCKET_NAME` for Vertex AI 280 | 281 | 3. **File Size Limit Exceeded** 282 | ``` 283 | Error: File size exceeds maximum limit 284 | ``` 285 | **Solution**: Increase `MAX_IMAGE_SIZE` or `MAX_VIDEO_SIZE`, or reduce file size 286 | 287 | 4. **Unsupported File Format** 288 | ``` 289 | Error: Unsupported file format 290 | ``` 291 | **Solution**: Check `ALLOWED_IMAGE_FORMATS` and `ALLOWED_VIDEO_FORMATS` settings 292 | 293 | 5. **Token Limit Exceeded** 294 | ``` 295 | Error: Response exceeds max tokens 296 | ``` 297 | **Solution**: Increase relevant `MAX_TOKENS_*` environment variable 298 | 299 | ### Debug Mode 300 | 301 | Enable debug logging to troubleshoot issues: 302 | 303 | ```bash 304 | export LOG_LEVEL="debug" 305 | ``` 306 | 307 | This will provide detailed information about: 308 | - Configuration loading 309 | - Provider initialization 310 | - File processing 311 | - API requests and responses 312 | - Error details 313 | 314 | ### Configuration Validation 315 | 316 | The server validates configuration on startup. 
Common validation errors: 317 | 318 | - Missing required provider-specific variables 319 | - Invalid file paths in credentials 320 | - Incompatible configuration combinations 321 | - Out-of-range parameter values 322 | 323 | Check the console output for detailed validation messages. 324 | 325 | ## Best Practices 326 | 327 | 1. **Use Environment-specific Files**: Create `.env.development` and `.env.production` files 328 | 2. **Secure Credentials**: Never commit API keys or credentials to version control 329 | 3. **Optimize Token Usage**: Set appropriate `MAX_TOKENS` values for each function type 330 | 4. **Monitor Usage**: Use appropriate temperature settings for your use case 331 | 5. **Test Configuration**: Validate configuration in development before production deployment 332 | 333 | ## Related Documentation 334 | 335 | - [Installation Guide](../README.md#installation) 336 | - [Vertex AI Setup Guide](provider/vertex-ai-setup-guide.md) 337 | - [Technical Specification](SPEC.md) 338 | - [Development Patterns](../CLAUDE.md) -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | Please always use context7 MCP, web search, or web fetch for additional information when fixing bugs or implementing new features. 5 | 6 | ## **CRITICAL: Documentation Maintenance Requirements** 7 | 8 | **BEFORE starting any coding work:** 9 | 1. **ALWAYS create a plan document** in the `docs/llm_logs/` folder before writing any code 10 | 2. **ALWAYS update README.md** when introducing changes that affect: 11 | - New MCP tools or parameters 12 | - Environment variables 13 | - Configuration options 14 | - Installation instructions 15 | - Breaking changes 16 | 3. 
**ALWAYS update docs/SPEC.md** when introducing changes that affect: 17 | - Architecture modifications 18 | - New provider implementations 19 | - API interface changes 20 | - File handling logic 21 | - Error handling patterns 22 | 23 | **Planning Process:** 24 | - Create plan documents in `docs/llm_logs/` folder (e.g., `docs/llm_logs/feature-name-plan.md`) 25 | - Include architecture decisions, implementation steps, and testing strategy 26 | - Reference this plan in your commit messages 27 | - Keep plan documents as documentation of implementation decisions 28 | 29 | **Solution Planning Best Practices:** 30 | - **ALWAYS present at least 3 options** when planning solutions to problems 31 | - Analyze trade-offs: effort vs. benefit, maintainability vs. speed, risk vs. reward 32 | - Provide clear recommendations with rationale (e.g., "Option 2 recommended because...") 33 | - Consider: quick fixes, balanced approaches, and comprehensive solutions 34 | - Include effort estimates, risk assessments, and rollback strategies for each option 35 | - Use structured format: Option 1 (Simple), Option 2 (Balanced), Option 3 (Comprehensive) 36 | 37 | **Example Planning Structure:** 38 | ``` 39 | ## Plan: [Problem Description] 40 | 41 | ### Option 1: Quick Fix (15 min) 42 | - ✅ Minimal change, fastest implementation 43 | - ❌ Technical debt, not future-proof 44 | - **When to use**: Urgent hotfixes, time pressure 45 | 46 | ### Option 2: Balanced Solution (45 min) - RECOMMENDED 47 | - ✅ Good maintainability, moderate effort 48 | - ✅ Addresses root cause, extensible 49 | - ❌ Longer implementation time 50 | - **When to use**: Most production scenarios 51 | 52 | ### Option 3: Comprehensive Refactor (2 hours) 53 | - ✅ Perfect architecture, future-proof 54 | - ❌ High effort, potential for new bugs 55 | - **When to use**: Major feature additions, architectural improvements 56 | 57 | ### Recommendation: Option 2 58 | **Rationale**: Balances immediate needs with long-term maintainability... 
59 | ``` 60 | 61 | **Documentation Synchronization:** 62 | - README.md is for **users** - installation, usage, and configuration 63 | - docs/SPEC.md is for **developers** - technical specifications and architecture 64 | - CLAUDE.md is for **AI assistants** - development patterns and constraints 65 | - All three documents must stay consistent with the actual implementation 66 | 67 | ## Development Commands 68 | 69 | ### Building and Testing 70 | - `npm run build` - Build TypeScript project to `dist/` directory 71 | - `npm run dev` - Start development server with watch mode (tsc --watch) 72 | - `npm start` - Start the built MCP server (node dist/index.js) 73 | 74 | ### Code Quality 75 | - `npm run lint` - Run ESLint on all TypeScript files 76 | - `npm run lint:fix` - Run ESLint with auto-fix 77 | - `npm run format` - Format code with Prettier 78 | 79 | ### Publishing 80 | - `npm run prepublishOnly` - Run lint before publish 81 | - `npm run preversion` - Run lint before version bump 82 | - `npm run version` - Format code and add to git before version 83 | - `npm run prepare` - Build project automatically on install 84 | 85 | ## Architecture Overview 86 | 87 | This is a Model Context Protocol (MCP) server that provides AI-powered image and video analysis using Google Gemini and Vertex AI models. 88 | 89 | ### Core Components 90 | 91 | **Server Architecture** (`src/server.ts`): 92 | - Main MCP server entry point using `@modelcontextprotocol/sdk` 93 | - Lazy-loaded services initialized on first request via `getServices()` function 94 | - Four primary tools: `analyze_image`, `compare_images`, `detect_objects_in_image`, and `analyze_video` 95 | - Comprehensive error handling with custom `VisionError` types 96 | - Graceful shutdown handling for SIGINT/SIGTERM 97 | 98 | **Configuration Hierarchy System**: 99 | The server implements a sophisticated 4-level configuration priority system: 100 | 1. 
**LLM-assigned values** - Parameters passed directly in tool calls (e.g., `{"temperature": 0.1}`) 101 | 2. **Function-specific variables** - `TEMPERATURE_FOR_ANALYZE_IMAGE`, `MAX_TOKENS_FOR_COMPARE_IMAGES`, etc. 102 | 3. **Task-specific variables** - `TEMPERATURE_FOR_IMAGE`, `MAX_TOKENS_FOR_VIDEO`, etc. 103 | 4. **Universal variables** - `TEMPERATURE`, `MAX_TOKENS`, etc. 104 | 105 | **Provider Factory** (`src/providers/factory/ProviderFactory.ts`): 106 | - Factory pattern for creating AI provider instances with validation (`VisionProviderFactory`) 107 | - Supports two providers: `google` (Gemini API) and `vertex_ai` (Vertex AI) 108 | - `createProviderWithValidation()` method ensures configuration validation before provider creation 109 | - Automatic provider initialization via `initializeDefaultProviders()` on module load 110 | - Configuration requirement validation and error handling with provider context 111 | - Dynamic provider registration support through `registerProvider()` method 112 | 113 | **Configuration Service** (`src/services/ConfigService.ts`): 114 | - Singleton pattern for configuration management via `ConfigService.getInstance()` 115 | - Environment variable validation with Zod schemas 116 | - Provider-specific configuration methods 117 | - Auto-derivation of related settings (e.g., project ID from credentials) 118 | - Hierarchical configuration resolution 119 | 120 | **Configuration Validation** (`src/types/Config.ts` and `src/utils/validation.ts`): 121 | - `Config.ts` defines TypeScript interfaces for all configuration options 122 | - `validation.ts` provides Zod schemas that validate environment variables against these interfaces 123 | - These files must stay synchronized - any new config field in Config.ts requires corresponding validation rules in validation.ts 124 | 125 | **Key Services**: 126 | - `FileService` - Handles file uploads, validation, and processing with support for URLs, local files, and base64, includes cross-platform path 
handling 127 | - `ConfigService` - Singleton pattern for environment variables and settings with validation 128 | - Vision providers in `src/providers/` - AI model implementations with consistent interfaces 129 | - Storage strategies in `src/storage/` - Google Cloud Storage integration 130 | - File upload strategies in `src/file-upload/` - Provider-specific upload handling 131 | - Image annotation utilities in `src/utils/` - Sharp-based image processing for object detection 132 | 133 | ### MCP Tools Implementation 134 | 135 | **All tools follow consistent patterns:** 136 | - Configuration hierarchy: function-specific → task-specific → universal variables 137 | - File source support: URLs, local files, base64 data 138 | - Error handling with custom `VisionError` types with provider context 139 | - Provider-agnostic interface through factory pattern 140 | - Structured output schemas for object detection 141 | 142 | **Tool-specific behaviors:** 143 | - `detect_objects_in_image`: Returns annotated images with bounding boxes, 2-step file handling (explicit path → temp file), uses structured JSON output with coordinates, includes CSS selector suggestions for web elements 144 | - `compare_images`: Supports 2-4 images with mixed source types, batch processing optimization 145 | - `analyze_image`: Special prompt handling for frontend UI comparison tasks, intelligent file processing based on size 146 | - `analyze_video`: YouTube URL and local file support, GCS integration for Vertex AI, duration and size validation 147 | 148 | ### Provider Implementation 149 | 150 | **Gemini Provider** (`src/providers/gemini/`): 151 | - Direct Google Gemini API integration using `@google/genai` 152 | - Files API for larger uploads (>10MB via `GEMINI_FILES_API_THRESHOLD`) 153 | - Base64 encoding for smaller files (inline data) 154 | - Structured output support for object detection with response schemas 155 | - Native support for both `google` and `vertex_ai` providers using same SDK 156 | 157 
| **Vertex AI Provider** (`src/providers/vertexai/`): 158 | - Google Cloud Vertex AI integration using `@google/genai` SDK 159 | - Requires GCS bucket for all file uploads (configured via `VERTEX_AI_FILES_API_THRESHOLD`) 160 | - Service account authentication with auto project ID extraction 161 | - Uses same underlying SDK as Gemini provider for consistency 162 | 163 | ### File Processing Flow 164 | 165 | 1. **Input Validation**: File size, format, and duration checks using configurable limits 166 | 2. **Upload Strategy Selection**: Based on provider and file size thresholds 167 | 3. **File Processing**: MIME type detection, path resolution, cross-platform support (Windows/Unix) 168 | 4. **AI Analysis**: Provider-specific API calls with structured output schemas 169 | 5. **Response Processing**: Structured JSON responses with comprehensive error handling 170 | 171 | ## Critical Development Constraints 172 | 173 | ### Configuration Synchronization 174 | - `src/types/Config.ts` and `src/utils/validation.ts` MUST stay synchronized 175 | - Every new config field in Config.ts requires corresponding Zod validation in validation.ts 176 | - Function-specific environment variables must follow the naming pattern: `TEMPERATURE_FOR_ANALYZE_IMAGE`, etc. 
177 | - When adding new configuration, always implement the 4-level hierarchy 178 | 179 | ### Error Handling Requirements 180 | - Always use custom `VisionError` types with provider context 181 | - Include error codes for proper client handling 182 | - Implement retry logic for network failures 183 | - Never expose sensitive credentials in error messages 184 | - Provider-specific error context for debugging 185 | 186 | ### TypeScript Configuration 187 | - ES2022 target with ESNext modules, strict type checking enabled 188 | - Path mapping with `@/*` pointing to `src/*` for clean imports 189 | - Declaration maps and source maps enabled for debugging 190 | - No implicit any, returns, or this allowed (strict mode) 191 | 192 | ### File Organization 193 | ``` 194 | src/ 195 | ├── providers/ # AI provider implementations 196 | │ ├── gemini/ # Google Gemini provider 197 | │ ├── vertexai/ # Vertex AI provider 198 | │ └── factory/ # Provider factory 199 | ├── services/ # Core services 200 | │ ├── ConfigService.ts 201 | │ └── FileService.ts 202 | ├── storage/ # Storage implementations 203 | ├── file-upload/ # File upload strategies 204 | ├── types/ # TypeScript type definitions 205 | ├── utils/ # Utility functions 206 | └── tools/ # MCP tool implementations 207 | ``` 208 | 209 | ## Development Patterns 210 | 211 | 1. **Lazy Loading**: Services initialized on first request via `getServices()` function 212 | 2. **Factory Pattern**: Providers created through `VisionProviderFactory` with validation 213 | 3. **Singleton Pattern**: `ConfigService.getInstance()` ensures consistency 214 | 4. **Strategy Pattern**: File upload strategies selected based on provider and size 215 | 5. **Zod Validation**: All inputs validated with Zod schemas for runtime type safety 216 | 6. **Configuration Hierarchy**: Always implement 4-level priority: LLM-assigned → function-specific → task-specific → universal 217 | 7. 
**Error Context**: Always include provider information in errors for debugging 218 | 8. **Cross-Platform Support**: Handle both Windows and Unix file paths correctly 219 | 9. **Config Building Pattern**: Use `buildConfigWithOptions()` helper from BaseVisionProvider for consistent config generation 220 | 221 | ### Config Building Pattern (IMPORTANT) 222 | 223 | When implementing provider methods that need AI configuration, **always use** the `buildConfigWithOptions()` helper: 224 | 225 | ```typescript 226 | // ✅ Correct - uses helper method 227 | const config = this.buildConfigWithOptions('image', options?.functionName, options); 228 | 229 | await this.client.models.generateContent({ 230 | model, 231 | contents, 232 | config, // Automatically includes responseSchema and systemInstruction if provided 233 | }); 234 | 235 | // ❌ Incorrect - manual config building (duplicates code) 236 | const config = { 237 | temperature: this.resolveTemperatureForFunction(...), 238 | topP: this.resolveTopPForFunction(...), 239 | topK: this.resolveTopKForFunction(...), 240 | maxOutputTokens: this.resolveMaxTokensForFunction(...), 241 | candidateCount: 1, 242 | }; 243 | if (options?.responseSchema) { 244 | config.responseMimeType = 'application/json'; 245 | config.responseSchema = options.responseSchema; 246 | } 247 | if (options?.systemInstruction) { 248 | config.systemInstruction = options.systemInstruction; 249 | } 250 | // ... manual config building creates maintenance burden 251 | ``` 252 | 253 | **Why use `buildConfigWithOptions()`?** 254 | 255 | 1. **DRY Principle**: Single source of truth for config generation 256 | 2. **Automatic Structured Output**: Handles `responseSchema` and `systemInstruction` automatically 257 | 3. **Consistency**: Same config format across all providers (Gemini, Vertex AI) 258 | 4. **Maintainability**: Adding new config options only requires updating one method 259 | 5. 
**Type Safety**: Centralized TypeScript type checking 260 | 261 | **This pattern is critical for:** 262 | - Object detection (`detect_objects_in_image`) - requires structured JSON output 263 | - Any future tools that need custom response schemas 264 | - Maintaining consistency between Gemini and Vertex AI providers 265 | 266 | **Reference Implementation:** 267 | - Helper method: `src/providers/base/VisionProvider.ts:354-395` 268 | - Usage in Gemini: `src/providers/gemini/GeminiProvider.ts:185-189, 348-352, 468-472` 269 | - Usage in Vertex AI: `src/providers/vertexai/VertexAIProvider.ts:84-88, 161-165, 246-250` 270 | 271 | ## Environment Variables 272 | 273 | **Required for Development:** 274 | - `IMAGE_PROVIDER` and `VIDEO_PROVIDER`: Set to `google` or `vertex_ai` 275 | - Provider-specific credentials (GEMINI_API_KEY or VERTEX_CREDENTIALS + GCS_BUCKET_NAME) 276 | 277 | **Common Development Overrides:** 278 | - `TEMPERATURE_FOR_DETECT_OBJECTS_IN_IMAGE=0` for deterministic object detection 279 | - `LOG_LEVEL=debug` for verbose logging during development 280 | - `NODE_ENV=development` for development-specific behavior 281 | 282 | ## Testing and Debugging 283 | 284 | - Use `npm run dev` for development with automatic rebuilding 285 | - Check console logs for detailed file processing information 286 | - Verify configuration hierarchy by setting different levels of environment variables 287 | - Test with multiple file sources (URLs, local files, base64) to ensure compatibility 288 | - Use structured logging patterns for consistent debugging output -------------------------------------------------------------------------------- /src/providers/base/VisionProvider.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Base vision provider interface and abstract class 3 | */ 4 | 5 | import type { 6 | VisionProvider, 7 | AnalysisOptions, 8 | AnalysisResult, 9 | UploadedFile, 10 | HealthStatus, 11 | ProviderCapabilities, 12 | 
ModelCapabilities, 13 | ProviderInfo, 14 | } from '../../types/Providers.js'; 15 | import type { TaskType } from '../../types/Analysis.js'; 16 | import { type FunctionName } from '../../constants/FunctionNames.js'; 17 | import { ConfigService } from '../../services/ConfigService.js'; 18 | 19 | export abstract class BaseVisionProvider implements VisionProvider { 20 | protected imageModel: string; 21 | protected videoModel: string; 22 | protected providerName: string; 23 | protected configService: ConfigService; 24 | 25 | constructor(providerName: string, imageModel: string, videoModel: string) { 26 | this.providerName = providerName; 27 | this.imageModel = imageModel; 28 | this.videoModel = videoModel; 29 | this.configService = ConfigService.getInstance(); 30 | } 31 | 32 | // Abstract methods that must be implemented by concrete providers 33 | abstract analyzeImage( 34 | imageSource: string, 35 | prompt: string, 36 | options?: AnalysisOptions 37 | ): Promise; 38 | abstract analyzeVideo( 39 | videoSource: string, 40 | prompt: string, 41 | options?: AnalysisOptions 42 | ): Promise; 43 | abstract compareImages( 44 | imageSources: string[], 45 | prompt: string, 46 | options?: AnalysisOptions 47 | ): Promise; 48 | abstract uploadFile( 49 | buffer: Buffer, 50 | filename: string, 51 | mimeType: string 52 | ): Promise; 53 | abstract downloadFile(fileId: string): Promise; 54 | abstract deleteFile(fileId: string): Promise; 55 | abstract getSupportedFormats(): ProviderCapabilities; 56 | abstract getModelCapabilities(): ModelCapabilities; 57 | abstract getProviderInfo(): ProviderInfo; 58 | abstract healthCheck(): Promise; 59 | 60 | // Concrete implementations for common functionality 61 | setModel(imageModel: string, videoModel: string): void { 62 | this.imageModel = imageModel; 63 | this.videoModel = videoModel; 64 | } 65 | 66 | getImageModel(): string { 67 | return this.imageModel; 68 | } 69 | 70 | getVideoModel(): string { 71 | return this.videoModel; 72 | } 73 | 74 | 
/**
 * Whether this provider can analyze video inputs, per its declared capabilities.
 */
supportsVideo(): boolean {
  const capabilities = this.getSupportedFormats();
  return capabilities.supportsVideo;
}

/**
 * Assemble a normalized AnalysisResult from a raw model response.
 *
 * @param text - The model's textual output.
 * @param model - Model identifier that produced the response.
 * @param usage - Optional token accounting reported by the API.
 * @param processingTime - Optional wall-clock duration of the call, in ms.
 * @param fileType - Optional MIME type of the analyzed file.
 * @param fileSize - Optional size in bytes of the analyzed file.
 * @param modelVersion - Optional exact model version string from the response.
 * @param responseId - Optional provider-assigned response identifier.
 */
protected createAnalysisResult(
  text: string,
  model: string,
  usage?: {
    promptTokenCount: number;
    candidatesTokenCount: number;
    totalTokenCount: number;
  },
  processingTime?: number,
  fileType?: string,
  fileSize?: number,
  modelVersion?: string,
  responseId?: string
): AnalysisResult {
  return {
    text,
    metadata: {
      model,
      provider: this.providerName,
      usage,
      processingTime,
      fileType,
      fileSize,
      modelVersion,
      responseId,
    },
  };
}

/**
 * Build a HealthStatus snapshot stamped with the current time (ISO 8601).
 *
 * @param status - Overall health verdict.
 * @param responseTime - Optional probe round-trip time, in ms.
 * @param message - Optional human-readable detail.
 */
protected createHealthStatus(
  status: 'healthy' | 'unhealthy' | 'degraded',
  responseTime?: number,
  message?: string
): HealthStatus {
  return {
    status,
    lastCheck: new Date().toISOString(),
    responseTime,
    message,
  };
}

/**
 * Run an async operation and measure its wall-clock duration.
 *
 * NOTE(review): the dump shows `Promise` with its type parameter stripped;
 * the `<T>` generic is restored here — `T` is already referenced in the
 * return type, so this matches the original intent.
 *
 * @param operation - The async operation to time.
 * @returns The operation's result together with its duration in ms.
 */
protected async measureAsync<T>(
  operation: () => Promise<T>
): Promise<{ result: T; duration: number }> {
  const startTime = Date.now();
  const result = await operation();
  const duration = Date.now() - startTime;
  return { result, duration };
}

/** True when the MIME type denotes an image (any `image/*`). */
protected isValidImageFormat(mimeType: string): boolean {
  return mimeType.startsWith('image/');
}

/** True when the MIME type denotes a video (any `video/*`). */
protected isValidVideoFormat(mimeType: string): boolean {
  return mimeType.startsWith('video/');
}

/**
 * Default capability envelope shared by providers; concrete providers may
 * override. Sizes and duration are hard limits enforced elsewhere.
 */
protected getProviderCapabilities(): ProviderCapabilities {
  return {
    supportedImageFormats: [
      'png',
      'jpg',
      'jpeg',
      'webp',
      'gif',
      'bmp',
      'tiff',
    ],
    supportedVideoFormats: [
      'mp4',
      'mov',
      'avi',
      'mkv',
      'webm',
      'flv',
      'wmv',
      '3gp',
    ],
    maxImageSize: 20 * 1024 * 1024, // 20MB
    maxVideoSize: 2 * 1024 * 1024 * 1024, // 2GB
    maxVideoDuration: 3600, // 1 hour
    supportsVideo: true,
    supportsFileUpload: true,
  };
}

/**
 * Baseline model capabilities derived from the provider's supported formats.
 * Token defaults are placeholders that concrete providers override.
 */
protected getBaseModelCapabilities(): ModelCapabilities {
  return {
    imageAnalysis: true,
    videoAnalysis: this.supportsVideo(),
    maxTokensForImage: 500, // Default, will be overridden by specific providers
    maxTokensForVideo: 2000, // Default, will be overridden by specific providers
    supportedFormats: this.getSupportedFormats().supportedImageFormats.concat(
      this.supportsVideo()
        ? this.getSupportedFormats().supportedVideoFormats
        : []
    ),
  };
}

/**
 * Resolve a generation parameter through the configuration hierarchy.
 *
 * Priority: LLM-assigned (directValue) > task-specific > universal > default.
 *
 * @param taskType - 'image' or 'video'.
 * @param directValue - Value passed explicitly by the caller, if any.
 * @param getTaskSpecificValue - Looks up a per-task configured value.
 * @param getUniversalValue - Looks up the universal configured value.
 * @param defaultValue - Final fallback when nothing is configured.
 */
protected resolveParameter(
  taskType: TaskType,
  directValue: number | undefined,
  getTaskSpecificValue: (taskType: TaskType) => number | undefined,
  getUniversalValue: () => number,
  defaultValue: number
): number {
  // Priority hierarchy: LLM-assigned > task-specific > universal > default
  if (directValue !== undefined) {
    return directValue;
  }

  const taskSpecificValue = getTaskSpecificValue(taskType);
  if (taskSpecificValue !== undefined) {
    return taskSpecificValue;
  }

  // FIX: use ?? rather than || so that a legitimately configured value of 0
  // (e.g. temperature 0 or topK 0) is not silently replaced by the default.
  // ?? also still covers a runtime undefined despite the declared type.
  return getUniversalValue() ?? defaultValue;
}

/**
 * Resolve a generation parameter with an extra function-specific tier.
 *
 * Priority: LLM-assigned > function-specific > task-specific > universal >
 * default.
 *
 * @param taskType - 'image' or 'video'.
 * @param functionName - The tool function being served, when known.
 * @param directValue - Value passed explicitly by the caller, if any.
 * @param getFunctionSpecificValue - Looks up a per-function configured value.
 * @param getTaskSpecificValue - Looks up a per-task configured value.
 * @param getUniversalValue - Looks up the universal configured value.
 * @param defaultValue - Final fallback when nothing is configured.
 */
protected resolveParameterWithFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined,
  getFunctionSpecificValue: (
    functionName: FunctionName
  ) => number | undefined,
  getTaskSpecificValue: (taskType: TaskType) => number | undefined,
  getUniversalValue: () => number,
  defaultValue: number
): number {
  // Priority hierarchy: LLM-assigned > function-specific > task-specific > universal > default
  if (directValue !== undefined) {
    return directValue;
  }

  if (functionName) {
    const functionSpecificValue = getFunctionSpecificValue(functionName);
    if (functionSpecificValue !== undefined) {
      return functionSpecificValue;
    }
  }

  const taskSpecificValue = getTaskSpecificValue(taskType);
  if (taskSpecificValue !== undefined) {
    return taskSpecificValue;
  }

  // FIX: ?? instead of || — see resolveParameter; 0 is a valid config value.
  return getUniversalValue() ?? defaultValue;
}

/** Resolve sampling temperature for a task (fallback default 0.8). */
protected resolveTemperature(
  taskType: TaskType,
  directValue: number | undefined
): number {
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getTemperatureForTask.bind(this.configService),
    () => this.configService.getApiConfig().temperature,
    0.8
  );
}

/** Resolve nucleus-sampling topP for a task (fallback default 0.95). */
protected resolveTopP(
  taskType: TaskType,
  directValue: number | undefined
): number {
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getTopPForTask.bind(this.configService),
    () => this.configService.getApiConfig().topP,
    0.95
  );
}

/** Resolve topK sampling for a task (fallback default 30). */
protected resolveTopK(
  taskType: TaskType,
  directValue: number | undefined
): number {
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getTopKForTask.bind(this.configService),
    () => this.configService.getApiConfig().topK,
    30
  );
}

/** Resolve max output tokens for a task (500 for image, 2000 for video). */
protected resolveMaxTokens(
  taskType: TaskType,
  directValue: number | undefined
): number {
  const defaultValue = taskType === 'image' ? 500 : 2000;
  return this.resolveParameter(
    taskType,
    directValue,
    this.configService.getMaxTokensForTask.bind(this.configService),
    () => this.configService.getApiConfig().maxTokens,
    defaultValue
  );
}

// Function-specific resolution methods

/** Resolve temperature honoring the function-specific tier (default 0.8). */
protected resolveTemperatureForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getTemperatureForFunction.bind(this.configService),
    this.configService.getTemperatureForTask.bind(this.configService),
    () => this.configService.getApiConfig().temperature,
    0.8
  );
}

/** Resolve topP honoring the function-specific tier (default 0.95). */
protected resolveTopPForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getTopPForFunction.bind(this.configService),
    this.configService.getTopPForTask.bind(this.configService),
    () => this.configService.getApiConfig().topP,
    0.95
  );
}

/** Resolve topK honoring the function-specific tier (default 30). */
protected resolveTopKForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getTopKForFunction.bind(this.configService),
    this.configService.getTopKForTask.bind(this.configService),
    () => this.configService.getApiConfig().topK,
    30
  );
}

/**
 * Resolve max output tokens honoring the function-specific tier
 * (500 for image, 2000 for video when nothing is configured).
 */
protected resolveMaxTokensForFunction(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  directValue: number | undefined
): number {
  const defaultValue = taskType === 'image' ? 500 : 2000;
  return this.resolveParameterWithFunction(
    taskType,
    functionName,
    directValue,
    this.configService.getMaxTokensForFunction.bind(this.configService),
    this.configService.getMaxTokensForTask.bind(this.configService),
    () => this.configService.getApiConfig().maxTokens,
    defaultValue
  );
}

/**
 * Build config object with all standard options including structured output support
 * @param taskType - 'image' or 'video'
 * @param functionName - Specific function being called (for function-specific config)
 * @param options - Analysis options from caller
 * @returns Config object ready for API call
 */
protected buildConfigWithOptions(
  taskType: TaskType,
  functionName: FunctionName | undefined,
  options?: AnalysisOptions
): any {
  const config: any = {
    temperature: this.resolveTemperatureForFunction(
      taskType,
      functionName,
      options?.temperature
    ),
    topP: this.resolveTopPForFunction(taskType, functionName, options?.topP),
    topK: this.resolveTopKForFunction(taskType, functionName, options?.topK),
    maxOutputTokens: this.resolveMaxTokensForFunction(
      taskType,
      functionName,
      options?.maxTokens
    ),
    candidateCount: 1,
  };

  // Add structured output configuration if responseSchema is provided
  if (options?.responseSchema) {
    config.responseMimeType = 'application/json';
    config.responseSchema = options.responseSchema;
  }

  // Add system instruction if provided
  if (options?.systemInstruction) {
    config.systemInstruction = options.systemInstruction;
  }

  // Add thinking budget configuration for Gemini models
  const model = this.resolveModelForFunction(taskType, functionName);
  const thinkingBudget = this.getThinkingBudgetForModel(model);
  if (thinkingBudget !== undefined) {
    config.thinkingConfig = {
      thinkingBudget: thinkingBudget,
    };
  }

  return config;
}

// Function-specific model resolution methods

/**
 * Resolve the model name for a task and (optionally) a specific function.
 *
 * Priority hierarchy: Function-specific > Task-specific > System default.
 */
protected resolveModelForFunction(
  taskType: 'image' | 'video',
  functionName: FunctionName | undefined
): string {
  const systemDefault =
    taskType === 'image' ? 'gemini-2.5-flash-lite' : 'gemini-2.5-flash';

  if (functionName) {
    const functionSpecificModel =
      this.configService.getModelForFunction(functionName);
    if (functionSpecificModel) {
      return functionSpecificModel;
    }
  }

  const taskSpecificModel = this.getModelForTask(taskType);
  if (taskSpecificModel) {
    return taskSpecificModel;
  }

  return systemDefault;
}

/** Per-task model override from config (IMAGE_MODEL / VIDEO_MODEL), if set. */
private getModelForTask(taskType: 'image' | 'video'): string | undefined {
  return taskType === 'image'
    ? this.configService.getConfig().IMAGE_MODEL
    : this.configService.getConfig().VIDEO_MODEL;
}

/**
 * Determine the appropriate thinking budget for Gemini model variants
 * Applies to both Gemini API and Vertex AI providers when using Gemini models
 * Based on user requirements:
 * - gemini-2.5-flash-lite and gemini-2.5-flash: thinking_budget = 0
 * - gemini-2.5-pro: thinking_budget = 128
 * - Other models: no thinking budget (undefined)
 * @param model - The model name
 * @returns thinking budget value or undefined if not applicable
 */
protected getThinkingBudgetForModel(model: string): number | undefined {
  // Only apply thinking budget to specific Gemini model variants
  // This works for both direct Gemini API and Vertex AI when using Gemini models
  // NOTE(review): the first check is redundant ('gemini-2.5-flash-lite'
  // contains 'gemini-2.5-flash') but harmless; kept for readability.
  if (
    model.includes('gemini-2.5-flash-lite') ||
    model.includes('gemini-2.5-flash')
  ) {
    // For flash models, use minimal thinking budget for faster response
    return 0;
  } else if (model.includes('gemini-2.5-pro')) {
    // For pro models, use higher thinking budget for better reasoning
    return 128;
  }

  // For other models (older Gemini versions, non-Gemini models), don't set thinking budget
  return undefined;
}
}
--------------------------------------------------------------------------------
/docs/llm_logs/web-context-detection-plan.md:
--------------------------------------------------------------------------------

# Plan: Web Context Detection and HTML Element Classification

**Date**: 2025-01-10
**Author**: Claude Code
**Issue**: Enhance object detection to automatically detect web page contexts and use appropriate HTML element names for better automation compatibility

## Problem Statement

The current object detection system uses generic element names regardless of context:

1. 
**Context-Agnostic Naming**: Uses "button", "input", "text" for all interfaces (web, mobile, desktop) 12 | 2. **Missed Semantic Opportunities**: Web pages could benefit from HTML-specific element names 13 | 3. **Automation Mismatch**: Generic names don't align with CSS selector targeting for web automation 14 | 4. **Limited Specificity**: Cannot distinguish between HTML input types (text, email, password, etc.) 15 | 16 | ## Solution: Context-Aware System Instructions 17 | 18 | ### Core Approach 19 | - **Automatic Web Detection**: Enhance system instructions to identify web page interfaces 20 | - **HTML Element Classification**: Use semantic HTML element names when web context is detected 21 | - **Fallback Mechanism**: Maintain current generic naming for non-web contexts 22 | - **Progressive Enhancement**: Start with basic detection, enhance over time 23 | 24 | ## Implementation Strategy 25 | 26 | ### Phase 1: Enhanced System Instructions (Day 1-2) 27 | 28 | **Objective**: Modify the detection system instruction to include web context detection logic. 29 | 30 | **Key Changes**: 31 | 1. **Context Detection Prompting**: Add web interface identification step 32 | 2. **HTML Element Vocabulary**: Provide comprehensive HTML element list 33 | 3. **Conditional Logic**: Use HTML names for web contexts, generic names otherwise 34 | 4. **Input Type Specificity**: Detect specific input types when possible 35 | 36 | **Updated System Instruction Structure**: 37 | ``` 38 | 1. CONTEXT DETECTION: 39 | - Analyze if image shows a web page, browser interface, or web application 40 | - Look for indicators: address bars, browser UI, web-style layouts, form elements 41 | - If web context detected → use HTML element names 42 | - If non-web context → use generic object names 43 | 44 | 2. 
HTML ELEMENT CLASSIFICATION (Web Context Only): 45 | - Interactive Elements: button, input[type], select, textarea, a 46 | - Form Elements: form, label, fieldset, legend 47 | - Structural: nav, header, footer, main, section, article 48 | - Content: h1-h6, p, img, video, ul, ol, li 49 | 50 | 3. INPUT TYPE DETECTION: 51 | - Analyze visual cues for input specificity 52 | - text, email, password, search, tel, url, number, date 53 | - checkbox, radio, file, submit, reset 54 | 55 | 4. FALLBACK NAMING: 56 | - Non-web contexts: button, text, image, icon, object, container 57 | ``` 58 | 59 | ### Phase 2: Web Context Indicators (Day 2-3) 60 | 61 | **Visual Indicators for Web Detection**: 62 | - **Browser Elements**: Address bar, navigation buttons, tabs, bookmarks bar 63 | - **Web UI Patterns**: Navigation menus, breadcrumbs, pagination, form layouts 64 | - **Typography**: Web fonts, text rendering typical of browsers 65 | - **Layout Patterns**: Grid systems, responsive design indicators, web-style spacing 66 | - **Form Elements**: Standard HTML form controls with web styling 67 | 68 | **Detection Logic**: 69 | ```typescript 70 | const WEB_CONTEXT_INDICATORS = [ 71 | // Browser UI 72 | 'address bar', 'url bar', 'browser tab', 'bookmark bar', 73 | 'browser window', 'navigation buttons', 74 | 75 | // Web Interface Patterns 76 | 'navigation menu', 'breadcrumb', 'pagination', 'web form', 77 | 'login form', 'search bar', 'dropdown menu', 'checkbox', 78 | 'radio button', 'submit button', 'hyperlink', 79 | 80 | // Layout Patterns 81 | 'web page', 'website', 'web application', 'responsive design', 82 | 'grid layout', 'sidebar', 'header', 'footer', 'navigation' 83 | ]; 84 | ``` 85 | 86 | ### Phase 3: HTML Element Mapping (Day 3-4) 87 | 88 | **Interactive Elements** (High Priority): 89 | ``` 90 | button →