├── .npmignore ├── .gitignore ├── src ├── vision-client.ts ├── utils │ ├── helpers.ts │ └── logger.ts ├── config.ts ├── qwen-client.ts ├── siliconflow-client.ts ├── volcengine-client.ts ├── zhipu-client.ts ├── image-processor.ts └── index.ts ├── tsconfig.json ├── LICENSE ├── .env.example ├── package.json ├── .github └── workflows │ └── release.yml ├── test ├── test-data-uri.ts ├── test-qwen.ts ├── test-deepseek-raw.ts └── test-local.ts ├── CHANGELOG.md ├── README.md └── docs └── README_EN.md /.npmignore: -------------------------------------------------------------------------------- 1 | src/ 2 | docs/ 3 | examples/ 4 | *.log 5 | .DS_Store 6 | .env 7 | tsconfig.json 8 | PUBLISHING.md 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | 4 | # Build output 5 | build/ 6 | 7 | # Logs 8 | *.log 9 | logs/ 10 | .luma-mcp/ 11 | 12 | # Environment variables 13 | .env 14 | .env.local 15 | 16 | # OS files 17 | .DS_Store 18 | Thumbs.db 19 | 20 | # IDE 21 | .vscode/ 22 | .idea/ 23 | *.swp 24 | *.swo 25 | 26 | # Test files 27 | test/image.png 28 | test/*.jpg 29 | test/*.jpeg 30 | 31 | # Temporary files 32 | *.tmp 33 | *.temp 34 | mcp-server 35 | .claude -------------------------------------------------------------------------------- /src/vision-client.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 视觉模型客户端统一接口 3 | */ 4 | 5 | export interface VisionClient { 6 | /** 7 | * 分析图片 8 | * @param imageDataUrl 图片 Data URL 或 URL 9 | * @param prompt 分析提示词 10 | * @param enableThinking 是否启用思考模式(如果模型支持) 11 | * @returns 分析结果文本 12 | */ 13 | analyzeImage( 14 | imageDataUrl: string, 15 | prompt: string, 16 | enableThinking?: boolean 17 | ): Promise; 18 | 19 | /** 20 | * 获取模型名称 21 | */ 22 | getModelName(): string; 23 | } 24 | 
-------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "Node16", 5 | "moduleResolution": "Node16", 6 | "outDir": "./build", 7 | "rootDir": "./src", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "resolveJsonModule": true, 13 | "declaration": true, 14 | "declarationMap": true, 15 | "sourceMap": true 16 | }, 17 | "include": ["src/**/*"], 18 | "exclude": ["node_modules", "build"] 19 | } 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Jochen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # 模型提供商选择:zhipu | siliconflow | qwen | volcengine 2 | MODEL_PROVIDER=zhipu 3 | 4 | # ========== Zhipu AI (推荐 - GLM-4.6V) ========== 5 | # 获取API Key: https://open.bigmodel.cn/ 6 | ZHIPU_API_KEY=your-zhipu-api-key-here 7 | MODEL_NAME=glm-4.6v 8 | 9 | # ========== SiliconFlow (免费 - DeepSeek-OCR) ========== 10 | # 获取API Key: https://siliconflow.cn/ 11 | # SILICONFLOW_API_KEY=your-siliconflow-api-key 12 | # MODEL_NAME=deepseek-ai/DeepSeek-OCR 13 | 14 | # ========== 阿里云 Qwen (Qwen3-VL-Flash) ========== 15 | # 获取API Key: https://dashscope.aliyun.com/ 16 | # DASHSCOPE_API_KEY=your-dashscope-api-key 17 | # MODEL_NAME=qwen3-vl-flash 18 | 19 | # ========== 火山方舟 Volcengine (Doubao-Seed-1.6) ========== 20 | # 获取API Key: https://console.volcengine.com/ark 21 | # VOLCENGINE_API_KEY=your-volcengine-api-key 22 | # MODEL_NAME=doubao-seed-1-6-flash-250828 23 | # 可选模型: 24 | # - doubao-seed-1-6-flash-250828 (性价比高,256k上下文) 25 | # - doubao-seed-1-6-vision-250815 (视觉优化,64k输出) 26 | # - doubao-seed-1-6-lite-251015 (轻量级) 27 | # 注意: 使用控制台中的实际模型ID 28 | 29 | # ========== 通用参数 ========== 30 | MAX_TOKENS=16384 31 | TEMPERATURE=0.7 32 | TOP_P=0.7 33 | ENABLE_THINKING=true 34 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "luma-mcp", 3 | "version": "1.2.7", 4 | "description": "Multi-model vision understanding MCP server. 
Supports GLM-4.6V (Zhipu), DeepSeek-OCR (SiliconFlow - Free), Qwen3-VL-Flash (Aliyun), and Doubao-Seed-1.6 (Volcengine)", 5 | "type": "module", 6 | "bin": { 7 | "luma-mcp": "build/index.js" 8 | }, 9 | "main": "./build/index.js", 10 | "scripts": { 11 | "build": "tsc", 12 | "watch": "tsc --watch", 13 | "prepare": "npm run build", 14 | "test:local": "tsx test/test-local.ts" 15 | }, 16 | "keywords": [ 17 | "mcp", 18 | "vision", 19 | "ai", 20 | "glm-4.6v", 21 | "zhipu", 22 | "deepseek-ocr", 23 | "siliconflow", 24 | "qwen3-vl", 25 | "aliyun", 26 | "dashscope", 27 | "doubao", 28 | "volcengine", 29 | "ark", 30 | "ocr", 31 | "free", 32 | "image-understanding", 33 | "multi-model" 34 | ], 35 | "author": "Jochen", 36 | "license": "MIT", 37 | "repository": { 38 | "type": "git", 39 | "url": "git+https://github.com/JochenYang/luma-mcp.git" 40 | }, 41 | "bugs": { 42 | "url": "https://github.com/JochenYang/luma-mcp/issues" 43 | }, 44 | "homepage": "https://github.com/JochenYang/luma-mcp#readme", 45 | "dependencies": { 46 | "@modelcontextprotocol/sdk": "^1.0.4", 47 | "axios": "^1.7.9", 48 | "sharp": "^0.33.5", 49 | "zod": "^3.25.76" 50 | }, 51 | "devDependencies": { 52 | "@types/node": "^22.10.2", 53 | "tsx": "^4.20.6", 54 | "typescript": "^5.7.2" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Create Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | release: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Extract version from tag 21 | id: version 22 | run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT 23 | 24 | - name: Extract changelog for this version 25 | id: changelog 26 | run: | 27 | VERSION=${{ steps.version.outputs.VERSION }} 
28 | echo "Extracting changelog for version $VERSION" 29 | 30 | # Extract changelog content between version headers 31 | sed -n "/## \[${VERSION}\]/,/## \[/p" CHANGELOG.md | sed '$d' > release_notes.md 32 | 33 | # If empty, use a default message 34 | if [ ! -s release_notes.md ]; then 35 | echo "Release version ${VERSION}" > release_notes.md 36 | fi 37 | 38 | cat release_notes.md 39 | 40 | - name: Create GitHub Release 41 | uses: softprops/action-gh-release@v1 42 | with: 43 | tag_name: v${{ steps.version.outputs.VERSION }} 44 | name: Release v${{ steps.version.outputs.VERSION }} 45 | body_path: release_notes.md 46 | draft: false 47 | prerelease: false 48 | env: 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | -------------------------------------------------------------------------------- /src/utils/helpers.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 工具函数 3 | */ 4 | 5 | /** 6 | * 带重试机制的异步函数包装器 7 | */ 8 | export function withRetry( 9 | fn: (...args: any[]) => Promise, 10 | maxRetries: number = 2, 11 | initialDelay: number = 1000 12 | ): (...args: any[]) => Promise { 13 | return async (...args: any[]): Promise => { 14 | let lastError: Error; 15 | 16 | for (let attempt = 0; attempt <= maxRetries; attempt++) { 17 | try { 18 | return await fn(...args); 19 | } catch (error) { 20 | lastError = error instanceof Error ? 
error : new Error(String(error)); 21 | 22 | if (attempt === maxRetries) { 23 | throw lastError; 24 | } 25 | 26 | // 指数退避 27 | const delay = initialDelay * Math.pow(2, attempt); 28 | await new Promise(resolve => setTimeout(resolve, delay)); 29 | } 30 | } 31 | 32 | throw lastError!; 33 | }; 34 | } 35 | 36 | /** 37 | * 检查字符串是否为 URL 38 | */ 39 | export function isUrl(source: string): boolean { 40 | try { 41 | const url = new URL(source); 42 | return url.protocol === 'http:' || url.protocol === 'https:'; 43 | } catch { 44 | return false; 45 | } 46 | } 47 | 48 | /** 49 | * 创建成功响应 50 | */ 51 | export function createSuccessResponse(data: string) { 52 | return { 53 | content: [{ type: 'text' as const, text: data }], 54 | }; 55 | } 56 | 57 | /** 58 | * 创建错误响应 59 | */ 60 | export function createErrorResponse(message: string) { 61 | return { 62 | content: [{ type: 'text' as const, text: `错误: ${message}` }], 63 | isError: true, 64 | }; 65 | } 66 | -------------------------------------------------------------------------------- /src/config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 配置管理模块 3 | * 从环境变量读取配置 4 | */ 5 | 6 | export type ModelProvider = 'zhipu' | 'siliconflow' | 'qwen' | 'volcengine'; 7 | 8 | export interface LumaConfig { 9 | provider: ModelProvider; 10 | apiKey: string; 11 | model: string; 12 | maxTokens: number; 13 | temperature: number; 14 | topP: number; 15 | enableThinking: boolean; 16 | } 17 | 18 | /** 19 | * 从环境变量加载配置 20 | */ 21 | export function loadConfig(): LumaConfig { 22 | // 确定使用的模型提供商 23 | const provider = (process.env.MODEL_PROVIDER?.toLowerCase() || 'zhipu') as ModelProvider; 24 | 25 | // 根据提供商获取 API Key 26 | let apiKey: string | undefined; 27 | let defaultModel: string; 28 | 29 | if (provider === 'siliconflow') { 30 | apiKey = process.env.SILICONFLOW_API_KEY; 31 | defaultModel = 'deepseek-ai/DeepSeek-OCR'; 32 | 33 | if (!apiKey) { 34 | throw new Error('SILICONFLOW_API_KEY environment variable is 
required when using SiliconFlow provider'); 35 | } 36 | } else if (provider === 'qwen') { 37 | apiKey = process.env.DASHSCOPE_API_KEY; 38 | defaultModel = 'qwen3-vl-flash'; 39 | 40 | if (!apiKey) { 41 | throw new Error('DASHSCOPE_API_KEY environment variable is required when using Qwen provider'); 42 | } 43 | } else if (provider === 'volcengine') { 44 | apiKey = process.env.VOLCENGINE_API_KEY; 45 | defaultModel = 'doubao-seed-1-6-flash-250828'; 46 | 47 | if (!apiKey) { 48 | throw new Error('VOLCENGINE_API_KEY environment variable is required when using Volcengine provider'); 49 | } 50 | } else { 51 | apiKey = process.env.ZHIPU_API_KEY; 52 | defaultModel = 'glm-4.6v'; 53 | 54 | if (!apiKey) { 55 | throw new Error('ZHIPU_API_KEY environment variable is required when using Zhipu provider'); 56 | } 57 | } 58 | 59 | return { 60 | provider, 61 | apiKey, 62 | model: process.env.MODEL_NAME || defaultModel, 63 | maxTokens: parseInt(process.env.MAX_TOKENS || '16384', 10), 64 | temperature: parseFloat(process.env.TEMPERATURE || '0.7'), 65 | topP: parseFloat(process.env.TOP_P || '0.7'), 66 | enableThinking: process.env.ENABLE_THINKING !== 'false', 67 | }; 68 | } 69 | -------------------------------------------------------------------------------- /src/utils/logger.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 日志工具 3 | * 将日志输出到 stderr,避免污染 MCP 的 stdout JSON 通信 4 | */ 5 | 6 | import { writeFileSync, appendFileSync, mkdirSync } from 'fs'; 7 | import { dirname, join } from 'path'; 8 | import { homedir } from 'os'; 9 | 10 | class Logger { 11 | private logFilePath?: string; 12 | 13 | constructor() { 14 | this.initLogFile(); 15 | } 16 | 17 | private initLogFile() { 18 | try { 19 | const homeDir = homedir(); 20 | const now = new Date(); 21 | const dateStr = now.toISOString().split('T')[0]; // YYYY-MM-DD 22 | const logDir = join(homeDir, '.luma-mcp'); 23 | 24 | mkdirSync(logDir, { recursive: true }); 25 | this.logFilePath = 
join(logDir, `luma-mcp-${dateStr}.log`); 26 | } catch (error) { 27 | // 如果无法创建日志文件,只输出到 stderr 28 | process.stderr.write(`[WARN] Failed to initialize log file: ${error}\n`); 29 | } 30 | } 31 | 32 | private write(level: string, message: string, ...args: any[]) { 33 | const timestamp = new Date().toISOString(); 34 | const argsStr = args.length > 0 ? ` ${JSON.stringify(args)}` : ''; 35 | const logMessage = `[${timestamp}] ${level.toUpperCase()}: ${message}${argsStr}`; 36 | 37 | // 输出到 stderr 38 | process.stderr.write(logMessage + '\n'); 39 | 40 | // 写入日志文件 41 | if (this.logFilePath) { 42 | try { 43 | appendFileSync(this.logFilePath, logMessage + '\n'); 44 | } catch { 45 | // 忽略文件写入错误 46 | } 47 | } 48 | } 49 | 50 | info(message: string, ...args: any[]) { 51 | this.write('info', message, ...args); 52 | } 53 | 54 | error(message: string, ...args: any[]) { 55 | this.write('error', message, ...args); 56 | } 57 | 58 | warn(message: string, ...args: any[]) { 59 | this.write('warn', message, ...args); 60 | } 61 | 62 | debug(message: string, ...args: any[]) { 63 | this.write('debug', message, ...args); 64 | } 65 | } 66 | 67 | export const logger = new Logger(); 68 | 69 | /** 70 | * 重定向 console 到 logger,避免污染 stdout 71 | */ 72 | export function setupConsoleRedirection() { 73 | console.log = logger.info.bind(logger); 74 | console.info = logger.info.bind(logger); 75 | console.error = logger.error.bind(logger); 76 | console.warn = logger.warn.bind(logger); 77 | console.debug = logger.debug.bind(logger); 78 | } 79 | -------------------------------------------------------------------------------- /test/test-data-uri.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 测试 Data URI 支持 3 | */ 4 | 5 | import { validateImageSource, imageToBase64 } from '../src/image-processor.js'; 6 | 7 | // 一个有效的 1x1 像素 PNG 图片的 Data URI 8 | const validDataUri = 
'data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=='; 9 | 10 | // 无效的 Data URI(不支持的格式) 11 | const invalidDataUri = 'data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxMDAiIGhlaWdodD0iMTAwIj48Y2lyY2xlIGN4PSI1MCIgY3k9IjUwIiByPSI0MCIgc3R5bGU9ImZpbGw6I2ZmZiIgLz48L3N2Zz4='; 12 | 13 | async function testDataUri() { 14 | console.log('🧪 测试 Data URI 支持\n'); 15 | 16 | // 测试 1: 验证有效的 Data URI 17 | try { 18 | console.log('测试 1: 验证有效的 PNG Data URI'); 19 | await validateImageSource(validDataUri); 20 | console.log('✅ 通过:有效的 Data URI 验证成功\n'); 21 | } catch (error) { 22 | console.log(`❌ 失败: ${error instanceof Error ? error.message : String(error)}\n`); 23 | } 24 | 25 | // 测试 2: 验证无效的 Data URI(不支持的格式) 26 | try { 27 | console.log('测试 2: 验证不支持的格式 (SVG)'); 28 | await validateImageSource(invalidDataUri); 29 | console.log('❌ 失败:应该抛出错误\n'); 30 | } catch (error) { 31 | console.log(`✅ 通过:正确拒绝不支持的格式 - ${error instanceof Error ? error.message : String(error)}\n`); 32 | } 33 | 34 | // 测试 3: Data URI 转换(应该直接返回) 35 | try { 36 | console.log('测试 3: Data URI 转换'); 37 | const result = await imageToBase64(validDataUri); 38 | if (result === validDataUri) { 39 | console.log('✅ 通过:Data URI 正确传递(未修改)\n'); 40 | } else { 41 | console.log('❌ 失败:Data URI 被修改了\n'); 42 | } 43 | } catch (error) { 44 | console.log(`❌ 失败: ${error instanceof Error ? error.message : String(error)}\n`); 45 | } 46 | 47 | // 测试 4: 大小验证(创建一个超过10MB的Data URI) 48 | try { 49 | console.log('测试 4: 验证大小限制 (>10MB)'); 50 | // 创建一个约 15MB 的 base64 字符串(20MB * 3/4 = 15MB) 51 | const largeBase64 = 'A'.repeat(20 * 1024 * 1024); 52 | const largeDataUri = `data:image/png;base64,${largeBase64}`; 53 | await validateImageSource(largeDataUri); 54 | console.log('❌ 失败:应该拒绝过大的文件\n'); 55 | } catch (error) { 56 | console.log(`✅ 通过:正确拒绝超大文件 - ${error instanceof Error ? 
error.message : String(error)}\n`); 57 | } 58 | 59 | console.log('=========================================='); 60 | console.log('✅ Data URI 测试完成!'); 61 | console.log('==========================================\n'); 62 | } 63 | 64 | testDataUri().catch(console.error); 65 | -------------------------------------------------------------------------------- /test/test-qwen.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Qwen 客户端测试 3 | * 测试阿里云通义千问VL视觉理解 4 | */ 5 | 6 | import { QwenClient } from '../src/qwen-client.js'; 7 | import { imageToBase64 } from '../src/image-processor.js'; 8 | 9 | async function testQwen() { 10 | const apiKey = process.env.DASHSCOPE_API_KEY; 11 | 12 | if (!apiKey) { 13 | console.error('❌ 错误: 需要设置 DASHSCOPE_API_KEY 环境变量'); 14 | console.log('设置方法:'); 15 | console.log(' macOS/Linux: export DASHSCOPE_API_KEY="your-api-key"'); 16 | console.log(' Windows: $env:DASHSCOPE_API_KEY="your-api-key"'); 17 | process.exit(1); 18 | } 19 | 20 | // 获取图片路径 21 | const imagePath = process.argv[2]; 22 | if (!imagePath) { 23 | console.error('❌ 错误: 请提供图片路径'); 24 | console.log('用法: tsx test/test-qwen.ts <图片路径>'); 25 | console.log('示例: tsx test/test-qwen.ts ./test.png'); 26 | process.exit(1); 27 | } 28 | 29 | console.log('🚀 开始测试 Qwen3-VL-Flash...\n'); 30 | 31 | try { 32 | // 1. 初始化客户端 33 | console.log('1️⃣ 初始化 Qwen 客户端...'); 34 | const client = new QwenClient( 35 | apiKey, 36 | 'qwen3-vl-flash', // 使用高性价比的 Flash 版本 37 | 4096, 38 | 0.7 39 | ); 40 | console.log(`✅ 客户端初始化成功: ${client.getModelName()}\n`); 41 | 42 | // 2. 读取图片 43 | console.log('2️⃣ 读取图片...'); 44 | const imageData = await imageToBase64(imagePath); 45 | console.log(`✅ 图片读取成功 (${imagePath})\n`); 46 | 47 | // 3. 
测试基础分析 48 | console.log('3️⃣ 测试基础分析(不启用思考模式)...'); 49 | const basicResult = await client.analyzeImage( 50 | imageData, 51 | '请详细分析这张图片的内容', 52 | false 53 | ); 54 | console.log('📊 基础分析结果:'); 55 | console.log(basicResult); 56 | console.log('\n'); 57 | 58 | // 4. 测试思考模式 59 | console.log('4️⃣ 测试思考模式(enable_thinking=true)...'); 60 | const thinkingResult = await client.analyzeImage( 61 | imageData, 62 | '请详细分析这张图片的内容,包括所有细节', 63 | true // 启用思考模式 64 | ); 65 | console.log('🧠 思考模式分析结果:'); 66 | console.log(thinkingResult); 67 | console.log('\n'); 68 | 69 | // 5. 测试 OCR 70 | console.log('5️⃣ 测试 OCR 能力...'); 71 | const ocrResult = await client.analyzeImage( 72 | imageData, 73 | '识别图片中的所有文字', 74 | false 75 | ); 76 | console.log('📝 OCR 结果:'); 77 | console.log(ocrResult); 78 | console.log('\n'); 79 | 80 | console.log('✅ 所有测试完成!'); 81 | 82 | } catch (error) { 83 | console.error('❌ 测试失败:', error instanceof Error ? error.message : error); 84 | process.exit(1); 85 | } 86 | } 87 | 88 | testQwen(); 89 | -------------------------------------------------------------------------------- /src/qwen-client.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 阿里云通义千问VL客户端 3 | * 使用 OpenAI 兼容接口 4 | * API 文档: https://help.aliyun.com/zh/model-studio/vision 5 | */ 6 | 7 | import axios, { AxiosInstance } from 'axios'; 8 | import { VisionClient } from './vision-client.js'; 9 | import type { LumaConfig } from './config.js'; 10 | 11 | export class QwenClient implements VisionClient { 12 | private client: AxiosInstance; 13 | private apiKey: string; 14 | private model: string; 15 | private maxTokens: number; 16 | private temperature: number; 17 | 18 | constructor(config: LumaConfig) { 19 | this.apiKey = config.apiKey; 20 | this.model = config.model; 21 | this.maxTokens = config.maxTokens; 22 | this.temperature = config.temperature; 23 | 24 | // 使用阿里云百炼的 OpenAI 兼容接口 25 | this.client = axios.create({ 26 | baseURL: 
'https://dashscope.aliyuncs.com/compatible-mode/v1', 27 | headers: { 28 | 'Authorization': `Bearer ${config.apiKey}`, 29 | 'Content-Type': 'application/json', 30 | }, 31 | timeout: 180000, // 180秒超时 32 | }); 33 | } 34 | 35 | async analyzeImage(imageDataUrl: string, prompt: string, enableThinking?: boolean): Promise { 36 | try { 37 | // Qwen3-VL 支持思考模式,使用 extra_body 传递非标准参数 38 | const requestBody: any = { 39 | model: this.model, 40 | messages: [ 41 | { 42 | role: 'user', 43 | content: [ 44 | { 45 | type: 'image_url', 46 | image_url: { 47 | url: imageDataUrl 48 | } 49 | }, 50 | { 51 | type: 'text', 52 | text: prompt 53 | } 54 | ] 55 | } 56 | ], 57 | max_tokens: this.maxTokens, 58 | temperature: this.temperature, 59 | stream: false 60 | }; 61 | 62 | // 如果启用思考模式,添加 extra_body 参数 63 | if (enableThinking) { 64 | requestBody.extra_body = { 65 | enable_thinking: true, 66 | thinking_budget: 81920 // 最大思考 Token 数 67 | }; 68 | } 69 | 70 | const response = await this.client.post('/chat/completions', requestBody); 71 | 72 | if (!response.data?.choices?.[0]?.message?.content) { 73 | throw new Error('Invalid response format from Qwen API'); 74 | } 75 | 76 | return response.data.choices[0].message.content; 77 | 78 | } catch (error) { 79 | if (axios.isAxiosError(error)) { 80 | const errorMessage = error.response?.data?.error?.message || error.message; 81 | throw new Error(`Qwen API error: ${errorMessage}`); 82 | } 83 | throw error; 84 | } 85 | } 86 | 87 | getModelName(): string { 88 | return `Qwen (${this.model})`; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /test/test-deepseek-raw.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 直接测试 DeepSeek-OCR API(无任何包装) 3 | */ 4 | 5 | import axios from 'axios'; 6 | import * as fs from 'fs'; 7 | import * as path from 'path'; 8 | 9 | async function testDeepSeekOCR(imagePath: string) { 10 | console.log('\n🧪 测试 DeepSeek-OCR API(原始调用)\n'); 11 | 
12 | const apiKey = process.env.SILICONFLOW_API_KEY; 13 | 14 | if (!apiKey) { 15 | console.error('❌ 错误: 需要设置 SILICONFLOW_API_KEY 环境变量'); 16 | console.error('示例: $env:SILICONFLOW_API_KEY="your-api-key"'); 17 | process.exit(1); 18 | } 19 | 20 | // 读取图片并转为 base64 21 | const imageBuffer = fs.readFileSync(imagePath); 22 | const base64Image = imageBuffer.toString('base64'); 23 | const mimeType = imagePath.endsWith('.png') ? 'image/png' : 'image/jpeg'; 24 | const imageDataUrl = `data:${mimeType};base64,${base64Image}`; 25 | 26 | console.log(`📸 图片: ${imagePath}`); 27 | console.log(`📦 大小: ${(imageBuffer.length / 1024).toFixed(2)} KB\n`); 28 | 29 | // 测试不同的 prompt 30 | const prompts = [ 31 | '识别图片中的所有文字', 32 | 'OCR', 33 | 'Extract all text from this image', 34 | 'What do you see in this image?', 35 | '请详细描述这张图片' 36 | ]; 37 | 38 | for (const prompt of prompts) { 39 | console.log(`\n🔍 测试 Prompt: "${prompt}"`); 40 | console.log('─'.repeat(50)); 41 | 42 | try { 43 | const response = await axios.post( 44 | 'https://api.siliconflow.cn/v1/chat/completions', 45 | { 46 | model: 'deepseek-ai/DeepSeek-OCR', 47 | messages: [ 48 | { 49 | role: 'user', 50 | content: [ 51 | { 52 | type: 'image_url', 53 | image_url: { 54 | url: imageDataUrl, 55 | }, 56 | }, 57 | { 58 | type: 'text', 59 | text: prompt, 60 | }, 61 | ], 62 | }, 63 | ], 64 | temperature: 0.7, 65 | max_tokens: 4096, 66 | }, 67 | { 68 | headers: { 69 | 'Authorization': `Bearer ${apiKey}`, 70 | 'Content-Type': 'application/json', 71 | }, 72 | timeout: 60000, 73 | } 74 | ); 75 | 76 | const result = response.data.choices[0].message.content; 77 | const usage = response.data.usage; 78 | 79 | console.log(`✅ Tokens: ${usage.total_tokens} (prompt: ${usage.prompt_tokens}, completion: ${usage.completion_tokens})`); 80 | console.log(`📝 响应长度: ${result?.length || 0} 字符`); 81 | 82 | if (result && result.trim().length > 0) { 83 | console.log('\n📊 结果:'); 84 | console.log('─'.repeat(50)); 85 | console.log(result); 86 | 
console.log('─'.repeat(50)); 87 | console.log('\n✅ 找到有效响应!'); 88 | break; 89 | } else { 90 | console.log('❌ 空响应'); 91 | } 92 | } catch (error: any) { 93 | console.log(`❌ 错误: ${error.message}`); 94 | } 95 | } 96 | } 97 | 98 | // 运行测试 99 | const imagePath = path.join(process.cwd(), 'test.png'); 100 | testDeepSeekOCR(imagePath).catch(console.error); 101 | -------------------------------------------------------------------------------- /test/test-local.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Luma MCP 本地测试脚本 3 | * 直接测试图片分析功能,不需要MCP客户端 4 | */ 5 | 6 | import { loadConfig } from '../src/config.js'; 7 | import type { VisionClient } from '../src/vision-client.js'; 8 | import { ZhipuClient } from '../src/zhipu-client.js'; 9 | import { SiliconFlowClient } from '../src/siliconflow-client.js'; 10 | import { imageToBase64, validateImageSource } from '../src/image-processor.js'; 11 | import { buildAnalysisPrompt } from '../src/prompts.js'; 12 | import { logger } from '../src/utils/logger.js'; 13 | 14 | async function testImageAnalysis(imagePath: string, question?: string) { 15 | console.log('\n=========================================='); 16 | console.log('🧪 测试 Luma MCP 图片分析'); 17 | console.log('==========================================\n'); 18 | 19 | try { 20 | // 1. 加载配置 21 | console.log('📝 加载配置...'); 22 | const config = loadConfig(); 23 | console.log(`✅ 配置加载成功: 提供商 ${config.provider}, 模型 ${config.model}\n`); 24 | 25 | // 2. 验证图片 26 | console.log('🔍 验证图片来源...'); 27 | await validateImageSource(imagePath); 28 | console.log(`✅ 图片验证通过: ${imagePath}\n`); 29 | 30 | // 3. 处理图片 31 | console.log('🖼️ 处理图片...'); 32 | const imageDataUrl = await imageToBase64(imagePath); 33 | const isUrl = imagePath.startsWith('http'); 34 | console.log(`✅ 图片处理完成: ${isUrl ? 'URL' : 'Base64编码'}\n`); 35 | 36 | // 4. 构建提示词 37 | console.log('💬 构建提示词...'); 38 | // DeepSeek-OCR 需要简洁 prompt 39 | const prompt = config.provider === 'siliconflow' 40 | ? 
(question || '请详细分析这张图片的内容') 41 | : buildAnalysisPrompt(question); 42 | console.log(`✅ 提示词: ${question || '通用描述'}\n`); 43 | 44 | // 5. 创建客户端并调用API 45 | const client: VisionClient = config.provider === 'siliconflow' 46 | ? new SiliconFlowClient(config) 47 | : new ZhipuClient(config); 48 | 49 | const modelName = config.provider === 'siliconflow' ? 'DeepSeek-OCR' : 'GLM-4.5V'; 50 | console.log(`🤖 调用 ${modelName} API...`); 51 | const result = await client.analyzeImage(imageDataUrl, prompt); 52 | 53 | // 6. 显示结果 54 | console.log('\n=========================================='); 55 | console.log('📊 分析结果'); 56 | console.log('==========================================\n'); 57 | console.log(result); 58 | console.log('\n=========================================='); 59 | console.log('✅ 测试完成!'); 60 | console.log('==========================================\n'); 61 | 62 | } catch (error) { 63 | console.error('\n❌ 测试失败:'); 64 | console.error(error instanceof Error ? error.message : String(error)); 65 | process.exit(1); 66 | } 67 | } 68 | 69 | // 解析命令行参数 70 | const args = process.argv.slice(2); 71 | 72 | if (args.length === 0) { 73 | console.log(` 74 | 使用方法: 75 | npm run test:local <图片路径或URL> [问题] 76 | 77 | 示例: 78 | # 分析本地图片 79 | npm run test:local ./test.png 80 | 81 | # 分析本地图片并提问 82 | npm run test:local ./code-error.png "这段代码为什么报错?" 
83 | 84 | # 分析远程图片 85 | npm run test:local https://example.com/image.jpg 86 | 87 | 环境变量: 88 | # 使用智谱 GLM-4.5V 89 | ZHIPU_API_KEY=your-api-key 90 | 91 | # 使用硅基流动 DeepSeek-OCR 92 | MODEL_PROVIDER=siliconflow 93 | SILICONFLOW_API_KEY=your-api-key 94 | `); 95 | process.exit(1); 96 | } 97 | 98 | const imagePath = args[0]; 99 | const question = args.slice(1).join(' ') || undefined; 100 | 101 | testImageAnalysis(imagePath, question); 102 | -------------------------------------------------------------------------------- /src/siliconflow-client.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 硅基流动 DeepSeek-OCR API 客户端 3 | * 基于 OpenAI 兼容 API 4 | */ 5 | 6 | import axios from 'axios'; 7 | import type { VisionClient } from './vision-client.js'; 8 | import type { LumaConfig } from './config.js'; 9 | import { logger } from './utils/logger.js'; 10 | 11 | interface SiliconFlowMessage { 12 | role: string; 13 | content: Array<{ 14 | type: string; 15 | text?: string; 16 | image_url?: { 17 | url: string; 18 | }; 19 | }>; 20 | } 21 | 22 | interface SiliconFlowRequest { 23 | model: string; 24 | messages: SiliconFlowMessage[]; 25 | temperature?: number; 26 | max_tokens?: number; 27 | top_p?: number; 28 | stream?: boolean; 29 | } 30 | 31 | interface SiliconFlowResponse { 32 | id: string; 33 | object: string; 34 | created: number; 35 | model: string; 36 | choices: Array<{ 37 | index: number; 38 | message: { 39 | role: string; 40 | content: string; 41 | }; 42 | finish_reason: string; 43 | }>; 44 | usage: { 45 | prompt_tokens: number; 46 | completion_tokens: number; 47 | total_tokens: number; 48 | }; 49 | } 50 | 51 | /** 52 | * 硅基流动 API 客户端 53 | */ 54 | export class SiliconFlowClient implements VisionClient { 55 | private apiKey: string; 56 | private model: string; 57 | private maxTokens: number; 58 | private temperature: number; 59 | private apiEndpoint = 'https://api.siliconflow.cn/v1/chat/completions'; 60 | 61 | constructor(config: 
LumaConfig) { 62 | this.apiKey = config.apiKey; 63 | this.model = config.model; 64 | this.maxTokens = config.maxTokens; 65 | this.temperature = config.temperature; 66 | } 67 | 68 | /** 69 | * 分析图片 70 | */ 71 | async analyzeImage(imageDataUrl: string, prompt: string, enableThinking?: boolean): Promise { 72 | const requestBody: SiliconFlowRequest = { 73 | model: this.model, 74 | messages: [ 75 | { 76 | role: 'user', 77 | content: [ 78 | { 79 | type: 'image_url', 80 | image_url: { 81 | url: imageDataUrl, 82 | }, 83 | }, 84 | { 85 | type: 'text', 86 | text: prompt, 87 | }, 88 | ], 89 | }, 90 | ], 91 | temperature: this.temperature, 92 | max_tokens: this.maxTokens, 93 | stream: false, 94 | }; 95 | 96 | logger.info('Calling SiliconFlow DeepSeek-OCR API', { 97 | model: this.model, 98 | }); 99 | 100 | try { 101 | const response = await axios.post( 102 | this.apiEndpoint, 103 | requestBody, 104 | { 105 | headers: { 106 | 'Authorization': `Bearer ${this.apiKey}`, 107 | 'Content-Type': 'application/json', 108 | }, 109 | timeout: 60000, // 60秒超时 110 | } 111 | ); 112 | 113 | if (!response.data.choices || response.data.choices.length === 0) { 114 | throw new Error('No response from DeepSeek-OCR'); 115 | } 116 | 117 | const result = response.data.choices[0].message.content; 118 | const usage = response.data.usage; 119 | 120 | logger.info('SiliconFlow API call successful', { 121 | tokens: usage?.total_tokens || 0, 122 | model: response.data.model 123 | }); 124 | 125 | return result; 126 | } catch (error) { 127 | logger.error('SiliconFlow API call failed', { 128 | error: error instanceof Error ? 
error.message : String(error) 129 | }); 130 | 131 | if (axios.isAxiosError(error)) { 132 | const message = error.response?.data?.error?.message || error.message; 133 | const status = error.response?.status; 134 | throw new Error(`SiliconFlow API error (${status || 'unknown'}): ${message}`); 135 | } 136 | throw error; 137 | } 138 | } 139 | 140 | /** 141 | * 获取模型名称 142 | */ 143 | getModelName(): string { 144 | return `DeepSeek (${this.model})`; 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /src/volcengine-client.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 火山方舟 Volcengine Doubao 视觉模型客户端 3 | * 支持 Doubao-Seed-1.6 系列模型(flash、vision、lite) 4 | * 使用 Chat Completions API 格式 5 | */ 6 | 7 | import axios from "axios"; 8 | import type { VisionClient } from "./vision-client.js"; 9 | import type { LumaConfig } from "./config.js"; 10 | import { logger } from "./utils/logger.js"; 11 | 12 | interface VolcengineMessage { 13 | role: string; 14 | content: Array<{ 15 | type: string; 16 | text?: string; 17 | image_url?: { 18 | url: string; 19 | }; 20 | }>; 21 | } 22 | 23 | interface VolcengineRequest { 24 | model: string; 25 | messages: VolcengineMessage[]; 26 | temperature?: number; 27 | max_tokens?: number; 28 | top_p?: number; 29 | stream?: boolean; 30 | } 31 | 32 | interface VolcengineResponse { 33 | id: string; 34 | object: string; 35 | created: number; 36 | model: string; 37 | choices: Array<{ 38 | index: number; 39 | message: { 40 | role: string; 41 | content: string; 42 | }; 43 | finish_reason: string; 44 | }>; 45 | usage: { 46 | prompt_tokens: number; 47 | completion_tokens: number; 48 | total_tokens: number; 49 | }; 50 | } 51 | 52 | /** 53 | * 火山方舟客户端 54 | */ 55 | export class VolcengineClient implements VisionClient { 56 | private apiKey: string; 57 | private model: string; 58 | private maxTokens: number; 59 | private temperature: number; 60 | private apiEndpoint = 61 
| "https://ark.cn-beijing.volces.com/api/v3/chat/completions"; 62 | 63 | constructor(config: LumaConfig) { 64 | this.apiKey = config.apiKey; 65 | this.model = config.model; 66 | this.maxTokens = config.maxTokens; 67 | this.temperature = config.temperature; 68 | } 69 | 70 | /** 71 | * 分析图片 72 | */ 73 | async analyzeImage( 74 | imageDataUrl: string, 75 | prompt: string, 76 | enableThinking?: boolean 77 | ): Promise { 78 | const requestBody: VolcengineRequest = { 79 | model: this.model, 80 | messages: [ 81 | { 82 | role: "user", 83 | content: [ 84 | { 85 | type: "image_url", 86 | image_url: { 87 | url: imageDataUrl, 88 | }, 89 | }, 90 | { 91 | type: "text", 92 | text: prompt, 93 | }, 94 | ], 95 | }, 96 | ], 97 | temperature: this.temperature, 98 | max_tokens: this.maxTokens, 99 | stream: false, 100 | }; 101 | 102 | logger.info("Calling Volcengine Doubao API", { 103 | model: this.model, 104 | thinking: !!enableThinking, 105 | }); 106 | 107 | try { 108 | const response = await axios.post( 109 | this.apiEndpoint, 110 | requestBody, 111 | { 112 | headers: { 113 | Authorization: `Bearer ${this.apiKey}`, 114 | "Content-Type": "application/json", 115 | }, 116 | timeout: 120000, // 120秒超时 117 | } 118 | ); 119 | 120 | if (!response.data.choices || response.data.choices.length === 0) { 121 | throw new Error("No response from Volcengine Doubao"); 122 | } 123 | 124 | const result = response.data.choices[0].message.content; 125 | const usage = response.data.usage; 126 | 127 | logger.info("Volcengine Doubao API call successful", { 128 | tokens: usage?.total_tokens || 0, 129 | model: response.data.model, 130 | }); 131 | 132 | return result; 133 | } catch (error) { 134 | logger.error("Volcengine Doubao API call failed", { 135 | error: error instanceof Error ? 
error.message : String(error), 136 | }); 137 | 138 | if (axios.isAxiosError(error)) { 139 | const message = error.response?.data?.error?.message || error.message; 140 | const status = error.response?.status; 141 | throw new Error( 142 | `Volcengine Doubao API error (${status || "unknown"}): ${message}` 143 | ); 144 | } 145 | throw error; 146 | } 147 | } 148 | 149 | /** 150 | * 获取模型名称 151 | */ 152 | getModelName(): string { 153 | return `Doubao (${this.model})`; 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/zhipu-client.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 智谱 GLM-4.6V API 客户端 3 | */ 4 | 5 | import axios from "axios"; 6 | import type { VisionClient } from "./vision-client.js"; 7 | import type { LumaConfig } from "./config.js"; 8 | import { logger } from "./utils/logger.js"; 9 | 10 | interface ZhipuMessage { 11 | role: string; 12 | content: Array<{ 13 | type: string; 14 | text?: string; 15 | image_url?: { 16 | url: string; 17 | }; 18 | }>; 19 | } 20 | 21 | interface ZhipuRequest { 22 | model: string; 23 | messages: ZhipuMessage[]; 24 | temperature: number; 25 | max_tokens: number; 26 | top_p: number; 27 | thinking?: { 28 | type: string; 29 | }; 30 | } 31 | 32 | interface ZhipuResponse { 33 | id: string; 34 | created: number; 35 | model: string; 36 | choices: Array<{ 37 | index: number; 38 | message: { 39 | role: string; 40 | content: string; 41 | }; 42 | finish_reason: string; 43 | }>; 44 | usage: { 45 | prompt_tokens: number; 46 | completion_tokens: number; 47 | total_tokens: number; 48 | }; 49 | } 50 | 51 | /** 52 | * 智谱 API 客户端 53 | */ 54 | export class ZhipuClient implements VisionClient { 55 | private apiKey: string; 56 | private model: string; 57 | private maxTokens: number; 58 | private temperature: number; 59 | private topP: number; 60 | private apiEndpoint = "https://open.bigmodel.cn/api/paas/v4/chat/completions"; 61 | 62 | 
constructor(config: LumaConfig) { 63 | this.apiKey = config.apiKey; 64 | this.model = config.model; 65 | this.maxTokens = config.maxTokens; 66 | this.temperature = config.temperature; 67 | this.topP = config.topP; 68 | } 69 | 70 | /** 71 | * 分析图片 72 | */ 73 | async analyzeImage( 74 | imageDataUrl: string, 75 | prompt: string, 76 | enableThinking?: boolean 77 | ): Promise { 78 | const requestBody: ZhipuRequest = { 79 | model: this.model, 80 | messages: [ 81 | { 82 | role: "user", 83 | content: [ 84 | { 85 | type: "image_url", 86 | image_url: { 87 | url: imageDataUrl, 88 | }, 89 | }, 90 | { 91 | type: "text", 92 | text: prompt, 93 | }, 94 | ], 95 | }, 96 | ], 97 | temperature: this.temperature, 98 | max_tokens: this.maxTokens, 99 | top_p: this.topP, 100 | thinking: { type: "enabled" }, // 默认启用思考模式,提高分析准确性 101 | }; 102 | 103 | // 允许显式禁用 thinking(如需要更快速度) 104 | if (enableThinking === false) { 105 | delete requestBody.thinking; 106 | } 107 | 108 | logger.info("Calling GLM-4.6V API", { 109 | model: this.model, 110 | thinking: !!requestBody.thinking, 111 | }); 112 | 113 | try { 114 | const response = await axios.post( 115 | this.apiEndpoint, 116 | requestBody, 117 | { 118 | headers: { 119 | Authorization: `Bearer ${this.apiKey}`, 120 | "Content-Type": "application/json", 121 | }, 122 | timeout: 60000, // 60秒超时 123 | } 124 | ); 125 | 126 | if (!response.data.choices || response.data.choices.length === 0) { 127 | throw new Error("No response from GLM-4.6V"); 128 | } 129 | 130 | const result = response.data.choices[0].message.content; 131 | const usage = response.data.usage; 132 | 133 | logger.info("GLM-4.6V API call successful", { 134 | tokens: usage?.total_tokens || 0, 135 | model: response.data.model, 136 | }); 137 | 138 | return result; 139 | } catch (error) { 140 | logger.error("GLM-4.6V API call failed", { 141 | error: error instanceof Error ? 
error.message : String(error), 142 | }); 143 | 144 | if (axios.isAxiosError(error)) { 145 | const message = error.response?.data?.error?.message || error.message; 146 | const status = error.response?.status; 147 | throw new Error( 148 | `GLM-4.6V API error (${status || "unknown"}): ${message}` 149 | ); 150 | } 151 | throw error; 152 | } 153 | } 154 | 155 | /** 156 | * 获取模型名称 157 | */ 158 | getModelName(): string { 159 | return `GLM (${this.model})`; 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/image-processor.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * 图片处理工具 3 | * 负责读取、压缩和编码图片(支持本地文件和远程 URL) 4 | */ 5 | 6 | import { readFile, stat } from "fs/promises"; 7 | import sharp from "sharp"; 8 | import { isUrl } from "./utils/helpers.js"; 9 | import { logger } from "./utils/logger.js"; 10 | 11 | // 判断是否为 Data URI(data:image/png;base64,....) 12 | function isDataUri(input: string): boolean { 13 | return ( 14 | typeof input === "string" && 15 | input.startsWith("data:") && 16 | /;base64,/.test(input) 17 | ); 18 | } 19 | 20 | // 从 Data URI 获取 mimeType 21 | function getMimeFromDataUri(input: string): string | null { 22 | const match = input.match(/^data:([^;]+);base64,/i); 23 | return match ? 
match[1].toLowerCase() : null; 24 | } 25 | 26 | // 估算 Data URI 的原始字节大小(不含头部) 27 | function estimateBytesFromDataUri(input: string): number { 28 | try { 29 | const base64 = input.split(",")[1] || ""; 30 | // base64 长度 * 3/4,忽略 padding 近似即可 31 | return Math.floor((base64.length * 3) / 4); 32 | } catch { 33 | return 0; 34 | } 35 | } 36 | 37 | /** 38 | * 规范化本地图像路径(例如去掉前缀符号) 39 | * 某些客户端会使用 "@path/to/file" 作为文件引用,这里统一转换为真实路径 40 | */ 41 | function normalizeImageSourcePath(source: string): string { 42 | if (typeof source === "string" && source.startsWith("@")) { 43 | const normalized = source.slice(1); 44 | logger.debug("Normalized @-prefixed image path", { 45 | original: source, 46 | normalized, 47 | }); 48 | return normalized; 49 | } 50 | return source; 51 | } 52 | 53 | /** 54 | * 验证图片来源(文件或URL) 55 | */ 56 | export async function validateImageSource( 57 | imageSource: string, 58 | maxSizeMB: number = 10 59 | ): Promise { 60 | // 先规范化可能带有前缀符号的本地路径(如 "@image.png") 61 | const normalizedSource = normalizeImageSourcePath(imageSource); 62 | 63 | // 如果是 URL,直接返回 64 | if (isUrl(normalizedSource)) { 65 | logger.debug("Image source is URL, skipping validation"); 66 | return; 67 | } 68 | 69 | // 验证本地文件 70 | try { 71 | const stats = await stat(normalizedSource); 72 | const fileSizeMB = stats.size / (1024 * 1024); 73 | 74 | if (fileSizeMB > maxSizeMB) { 75 | throw new Error( 76 | `Image file too large: ${fileSizeMB.toFixed(2)}MB (max: ${maxSizeMB}MB)` 77 | ); 78 | } 79 | 80 | // 验证文件格式 81 | const ext = normalizedSource.toLowerCase().split(".").pop(); 82 | const supportedFormats = ["jpg", "jpeg", "png", "webp", "gif"]; 83 | 84 | if (!ext || !supportedFormats.includes(ext)) { 85 | throw new Error( 86 | `Unsupported image format: ${ext}. 
Supported: ${supportedFormats.join( 87 | ", " 88 | )}` 89 | ); 90 | } 91 | } catch (error) { 92 | if ((error as any).code === "ENOENT") { 93 | throw new Error(`Image file not found: ${normalizedSource}`); 94 | } 95 | throw error; 96 | } 97 | } 98 | 99 | /** 100 | * 将图片转换为 base64 data URL 或返回URL 101 | */ 102 | export async function imageToBase64(imagePath: string): Promise { 103 | try { 104 | // 规范化本地路径(处理可能的前缀符号) 105 | const normalizedPath = normalizeImageSourcePath(imagePath); 106 | 107 | // 如果是 URL,直接返回 108 | if (isUrl(normalizedPath)) { 109 | logger.info("Using remote image URL", { url: normalizedPath }); 110 | return normalizedPath; 111 | } 112 | 113 | // 本地文件:读取并编码 114 | let imageBuffer: Buffer = await readFile(normalizedPath); 115 | 116 | // 检查文件大小,如果超过 2MB 则压缩 117 | if (imageBuffer.length > 2 * 1024 * 1024) { 118 | logger.info("Compressing large image", { 119 | originalSize: `${(imageBuffer.length / (1024 * 1024)).toFixed(2)}MB`, 120 | }); 121 | imageBuffer = Buffer.from(await compressImage(imageBuffer)); 122 | } 123 | 124 | // 转换为 base64 125 | const base64 = imageBuffer.toString("base64"); 126 | const mimeType = getMimeType(normalizedPath); 127 | 128 | return `data:${mimeType};base64,${base64}`; 129 | } catch (error) { 130 | throw new Error( 131 | `Failed to process image: ${ 132 | error instanceof Error ? 
error.message : "Unknown error" 133 | }` 134 | ); 135 | } 136 | } 137 | 138 | /** 139 | * 压缩图片 140 | */ 141 | async function compressImage(imageBuffer: Buffer): Promise { 142 | return sharp(imageBuffer) 143 | .resize(2048, 2048, { 144 | fit: "inside", 145 | withoutEnlargement: true, 146 | }) 147 | .jpeg({ quality: 85 }) 148 | .toBuffer(); 149 | } 150 | 151 | /** 152 | * 根据文件扩展名获取 MIME 类型 153 | */ 154 | function getMimeType(filePath: string): string { 155 | const ext = filePath.toLowerCase().split(".").pop(); 156 | 157 | switch (ext) { 158 | case "jpg": 159 | case "jpeg": 160 | return "image/jpeg"; 161 | case "png": 162 | return "image/png"; 163 | case "webp": 164 | return "image/webp"; 165 | case "gif": 166 | return "image/gif"; 167 | default: 168 | return "image/jpeg"; // 默认使用 jpeg 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Luma MCP Server 5 | * 通用图像理解 MCP 服务器,支持多家视觉模型提供商 6 | */ 7 | 8 | // 第一件事:重定向console到stderr,避免污染MCP的stdout 9 | import { setupConsoleRedirection, logger } from "./utils/logger.js"; 10 | setupConsoleRedirection(); 11 | 12 | import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; 13 | import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; 14 | import { z } from "zod"; 15 | 16 | import { loadConfig } from "./config.js"; 17 | import type { VisionClient } from "./vision-client.js"; 18 | import { ZhipuClient } from "./zhipu-client.js"; 19 | import { SiliconFlowClient } from "./siliconflow-client.js"; 20 | import { QwenClient } from "./qwen-client.js"; 21 | import { VolcengineClient } from "./volcengine-client.js"; 22 | import { imageToBase64, validateImageSource } from "./image-processor.js"; 23 | import { 24 | withRetry, 25 | createSuccessResponse, 26 | createErrorResponse, 27 | } from "./utils/helpers.js"; 28 | 29 | 
/** 30 | * 创建 MCP 服务器 31 | */ 32 | async function createServer() { 33 | logger.info("Initializing Luma MCP Server"); 34 | 35 | // 加载配置 36 | const config = loadConfig(); 37 | 38 | // 根据配置选择模型客户端 39 | let visionClient: VisionClient; 40 | 41 | if (config.provider === "siliconflow") { 42 | visionClient = new SiliconFlowClient(config); 43 | } else if (config.provider === "qwen") { 44 | visionClient = new QwenClient(config); 45 | } else if (config.provider === "volcengine") { 46 | visionClient = new VolcengineClient(config); 47 | } else { 48 | visionClient = new ZhipuClient(config); 49 | } 50 | 51 | logger.info("Vision client initialized", { 52 | provider: config.provider, 53 | model: visionClient.getModelName(), 54 | }); 55 | 56 | // 创建服务器 - 使用 McpServer 57 | const server = new McpServer( 58 | { 59 | name: "luma-mcp", 60 | version: "1.0.0", 61 | }, 62 | { 63 | capabilities: { 64 | tools: {}, 65 | }, 66 | } 67 | ); 68 | 69 | // 创建带重试的分析函数 70 | const analyzeWithRetry = withRetry( 71 | async (imageSource: string, prompt: string) => { 72 | // 1. 验证图片来源 73 | await validateImageSource(imageSource); 74 | 75 | // 2. 处理图片(读取或返回URL) 76 | const imageDataUrl = await imageToBase64(imageSource); 77 | 78 | // 3. 直接使用原始提示词(不进行包装或增强) 79 | const fullPrompt = prompt; 80 | 81 | // 4. 
调用视觉模型分析图片 82 | return await visionClient.analyzeImage(imageDataUrl, fullPrompt); 83 | }, 84 | 2, // 最多重试2次 85 | 1000 // 初始延补1秒 86 | ); 87 | 88 | // 注册工具 - 使用 McpServer.tool() API 89 | server.tool( 90 | "analyze_image", 91 | "图像分析工具:支持三种使用方式:1) 用户粘贴图片时直接调用,无需手动指定路径 2) 指定本地图片路径,如./screenshot.png 3) 指定图片URL,如https://example.com/image.png。AI应根据用户问题生成专业的分析提示词(如用户问'网站布局有什么问题',应生成'请详细分析这个网站界面的布局问题,包括视觉层次、对齐方式、间距、响应式设计等方面的问题'),然后传递提示词和图片进行调用。", 92 | { 93 | image_source: z 94 | .string() 95 | .describe( 96 | "要分析的图片来源:支持三种方式 1) 用户粘贴图片时由Claude Desktop自动提供路径 2) 本地文件路径,如./screenshot.png 3) HTTP(S)图片URL,如https://example.com/image.png(支持 PNG、JPG、JPEG、WebP、GIF,最大 10MB)" 97 | ), 98 | prompt: z 99 | .string() 100 | .describe( 101 | '分析提示词:AI根据用户问题生成的专业分析提示词。应该包含具体的分析要求和期望的输出格式。' 102 | ), 103 | }, 104 | async (params) => { 105 | try { 106 | // AI应该已经根据用户问题生成了合适的prompt 107 | const prompt = params.prompt; 108 | 109 | logger.info("Analyzing image", { 110 | source: params.image_source, 111 | prompt, 112 | }); 113 | 114 | // 执行分析(带重试) 115 | const result = await analyzeWithRetry(params.image_source, prompt); 116 | 117 | logger.info("Image analysis completed successfully"); 118 | return createSuccessResponse(result); 119 | } catch (error) { 120 | logger.error("Image analysis failed", { 121 | error: error instanceof Error ? error.message : String(error), 122 | }); 123 | 124 | return createErrorResponse( 125 | error instanceof Error ? error.message : "Unknown error" 126 | ); 127 | } 128 | } 129 | ); 130 | 131 | return server; 132 | } 133 | 134 | /** 135 | * 主函数 136 | */ 137 | async function main() { 138 | try { 139 | const server = await createServer(); 140 | const transport = new StdioServerTransport(); 141 | await server.connect(transport); 142 | 143 | logger.info("Luma MCP server started successfully on stdio"); 144 | } catch (error) { 145 | logger.error("Failed to start Luma MCP server", { 146 | error: error instanceof Error ? 
error.message : String(error), 147 | }); 148 | process.exit(1); 149 | } 150 | } 151 | 152 | // 全局错误处理 153 | process.on("uncaughtException", (error) => { 154 | logger.error("Uncaught exception", { 155 | error: error.message, 156 | stack: error.stack, 157 | }); 158 | process.exit(1); 159 | }); 160 | 161 | process.on("unhandledRejection", (reason) => { 162 | logger.error("Unhandled rejection", { reason }); 163 | process.exit(1); 164 | }); 165 | 166 | process.on("SIGINT", () => { 167 | logger.info("Received SIGINT, shutting down gracefully"); 168 | process.exit(0); 169 | }); 170 | 171 | process.on("SIGTERM", () => { 172 | logger.info("Received SIGTERM, shutting down gracefully"); 173 | process.exit(0); 174 | }); 175 | 176 | main(); 177 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 本项目的所有重大变更都将记录在此文件中。 4 | 5 | ## [1.2.7] - 2025-12-17 6 | 7 | ### Added 8 | 9 | - 🆕 **火山方舟 Provider**: 新增第四个视觉模型提供商 - 火山方舟 Volcengine 10 | - 🎯 **Doubao-Seed-1.6 系列**: 支持 flash、vision、lite 多种版本 11 | - 🔧 **统一配置架构**: 客户端构造函数改为接受 LumaConfig 对象,实现配置集中管理 12 | - 🖼️ **完整图片格式支持**: 火山方舟支持 base64 数据、URL 链接和本地文件 13 | 14 | ### Changed 15 | 16 | - 🏗️ **架构重构**: 三个现有客户端(Zhipu、SiliconFlow、Qwen)重构为统一配置对象模式 17 | - 🗃️ **客户端优化**: 移除硬编码默认值,所有配置统一从环境变量读取 18 | - 📝 **API 格式统一**: 火山方舟客户端改为使用 Chat Completions API 格式,与其他 provider 保持一致 19 | - 📚 **文档完善**: 更新中英文 README,添加火山方舟配置示例和模型对比 20 | 21 | ### Technical Details 22 | 23 | - `src/config.ts`: 新增 volcengine provider 支持,添加 VOLCENGINE_API_KEY 环境变量 24 | - `src/volcengine-client.ts`: 新文件,完整实现 VolcengineClient 类,支持 Chat Completions API 25 | - `src/zhipu-client.ts`: 重构构造函数,移除硬编码参数,支持 LumaConfig 26 | - `src/siliconflow-client.ts`: 重构构造函数,支持统一配置对象 27 | - `src/qwen-client.ts`: 重构构造函数,支持统一配置对象 28 | - `src/index.ts`: 添加 VolcengineClient 导入和实例化逻辑 29 | - `.env.example`: 添加火山方舟配置示例和说明 30 | - `README.md` & `docs/README_EN.md`: 
新增火山方舟特性说明和配置示例 31 | 32 | ### Provider Summary 33 | 34 | 现在支持 4 个视觉模型提供商: 35 | 36 | 1. **智谱 GLM-4.6V** (默认): 中文理解优秀,16384 tokens 37 | 2. **硅基流动 DeepSeek-OCR**: 免费使用,OCR 能力强 38 | 3. **阿里云 Qwen3-VL-Flash**: 速度快成本低,支持思考模式 39 | 4. **火山方舟 Doubao-Seed-1.6**: 性价比高,256k 上下文,支持多种版本 40 | 41 | ## [1.2.6] - 2025-12-16 42 | 43 | ### Changed 44 | 45 | - 🚀 **模型升级**: 更新智谱模型从 GLM-4.5V 升级至 GLM-4.6V,性能和理解能力提升 46 | - 📈 **Token 限制提升**: 默认 maxTokens 从 8192 提升至 16384,支持更详细的分析输出 47 | - 💡 **思考模式默认开启**: ENABLE_THINKING 默认为 true,提供更准确的分析结果 48 | - 🧹 **代码清理**: 移除 prompts.ts 提示词模板文件,简化架构 49 | - 🔧 **TypeScript 优化**: 清理未使用的类型导入,修复 TS6133 警告 50 | - 📝 **文档完善**: 更新中英文 README,强化三种使用方式说明(粘贴图片、本地路径、URL) 51 | 52 | ### Technical Details 53 | 54 | - `src/config.ts`: 更新默认模型为 glm-4.6v,默认 maxTokens 改为 16384,enableThinking 默认为 true 55 | - `src/zhipu-client.ts`: 更新模型引用,清理未使用导入 56 | - `src/siliconflow-client.ts`: 清理未使用的类型导入 57 | - `src/index.ts`: 简化 prompt 处理逻辑,直接使用原始提示词 58 | - 删除 `src/prompts.ts`: 移除 buildAnalysisPrompt 函数 59 | - README 更新: 模型信息、Token 配置、项目结构、思考模式配置 60 | 61 | ## [1.2.4] - 2025-12-16 (Reverted) 62 | 63 | ### Note 64 | 65 | 此版本因代码回滚问题被回退,所有优化内容已整合至 v1.2.6 66 | 67 | ## [1.2.3] - 2025-11-21 68 | 69 | ### Changed 70 | 71 | - 🧹 **代码清理**: 移除 Claude 特定调试注释和实验性代码 72 | - 📝 **工具描述优化**: 简化和专业化工具说明,提升 AI 模型调用成功率 73 | - 🔧 **路径处理通用化**: 重构 @ 前缀路径处理,移除平台特定命名 74 | 75 | ### Technical Details 76 | 77 | - 移除 Claude 资源读取相关的实验性代码 78 | - 重命名 `stripAtPrefix()` 为 `normalizeImageSourcePath()` 79 | - 清理所有客户端适配器中的调试日志和注释 80 | - 统一代码风格和注释规范 81 | 82 | ## [1.2.2] - 2025-11-20 83 | 84 | ### Added 85 | 86 | - ✨ **@ 路径支持**: 自动处理 Claude Code 的 @ 文件引用前缀,修复第一次调用失败的问题 87 | - 📝 **智能 Prompt**: 通用请求自动添加详细指引,保证全面分析 88 | 89 | ### Changed 90 | 91 | - 🔧 **Prompt 统一**: 简化为单一通用 prompt,智能处理不同场景 92 | - ✨ **表述优化**: 融合 Minimax 的经典表述,强调“不遗漏细节”和“完整提取” 93 | - 📚 **文档更新**: 更新项目结构,添加 qwen-client.ts 和测试文件 94 | 95 | ### Fixed 96 | 97 | - 🐛 **@ 路径问题**: 修复 Claude Code 中 `@folder/image.png` 导致的路径错误 98 | - 🐛 **编译错误**: 修复 image-processor.ts 中重复声明的变量 99 | 
100 | ### Technical Details 101 | 102 | - 新增 `stripAtPrefix()` 函数处理 Claude Code 的文件引用语法 103 | - 简化 `buildAnalysisPrompt()` 从两套逻辑到单一逻辑 104 | - 添加智能请求检测,自动补充详细分析指引 105 | 106 | ## [1.2.1] - 2025-11-18 107 | 108 | ### Changed 109 | 110 | - 📝 **文档优化**: 精简 README,移除冲余配置文件路径说明 111 | - 📝 **更新日志简化**: 将 README 中的详细更新日志替换为 CHANGELOG.md 链接 112 | - ✨ **Qwen 测试示例**: 添加 Qwen3-VL-Flash 本地测试命令 113 | - 💰 **定价信息**: 添加阿里云通义千问定价参考链接 114 | - 📋 **模型对比**: 更新模型选择表,完善 Qwen3-VL-Flash 信息 115 | - 🔗 **API Key 获取**: 添加阿里云百炼 API Key 获取指南 116 | - 📚 **相关链接**: 新增阿里云百炼平台和 Qwen3-VL 文档链接 117 | - 🐛 **错误信息**: 优化 API 调用失败排查提示,包含阿里云账户 118 | 119 | ### Fixed 120 | 121 | - 🐛 **描述修正**: 修正 package.json 中模型名称为 qwen3-vl-flash 122 | - 📝 **注释精简**: 简化 prompts.ts 注释头 123 | 124 | ## [1.2.0] - 2025-11-17 125 | 126 | ### Added 127 | 128 | - 🎉 **第三个视觉模型**: 新增阿里云通义千问 Qwen3-VL-Flash 支持 129 | - 💡 **思考模式**: Qwen3-VL-Flash 支持深度思考模式(enable_thinking),提升复杂场景分析准确性 130 | - ⚡ **高性价比**: Flash 版本速度更快、成本更低,适合大量使用 131 | - 🔌 **OpenAI 兼容**: 使用阿里云百炼的 OpenAI 兼容 API,统一接口设计 132 | - 🌐 **多地域支持**: 默认使用北京地域,支持新加坡地域配置 133 | 134 | ### Changed 135 | 136 | - ⚙️ 新增 `MODEL_PROVIDER=qwen` 和 `DASHSCOPE_API_KEY` 环境变量配置 137 | - 📝 更新所有文档(中英文),添加 Qwen3-VL-Flash 配置示例 138 | - 💰 默认使用 qwen3-vl-flash 模型,兹顾性能与成本 139 | - 🏗️ 重构客户端构造函数,统一参数传递方式 140 | 141 | ### Technical Details 142 | 143 | - 新增文件: 144 | - `src/qwen-client.ts` - 阿里云通义千问 VL API 客户端实现 145 | - 修改文件: 146 | - `src/config.ts` - 添加 'qwen' 提供商支持 147 | - `src/zhipu-client.ts` - 重构构造函数,支持独立参数 148 | - `src/siliconflow-client.ts` - 重构构造函数,支持独立参数 149 | - `src/index.ts` - 添加 Qwen 客户端初始化逻辑 150 | - `package.json` - 更新版本至 1.2.0,添加 qwen/aliyun/dashscope 关键词 151 | 152 | ## [1.1.1] - 2025-11-13 153 | 154 | ### Added 155 | 156 | - 🖼️ **Data URI 支持**: 支持接收 base64 编码的图片数据 (data:image/png;base64,...) 
157 | - 🚀 **为未来做准备**: 当 MCP 客户端支持时,可直接传递用户粘贴的图片 158 | 159 | ### Changed 160 | 161 | - 📝 更新工具描述,说明支持三种输入格式:本地路径、URL、Data URI 162 | - ✅ 新增 Data URI 格式验证(MIME 类型、大小限制) 163 | 164 | ## [1.1.0] - 2025-11-13 165 | 166 | ### Added 167 | 168 | - 🎉 **多模型支持**: 新增硅基流动 DeepSeek-OCR 支持 169 | - 🆓 **免费选项**: DeepSeek-OCR 通过硅基流动提供完全免费的 OCR 服务 170 | - 📐 **统一接口**: 创建 VisionClient 接口,支持灵活扩展更多视觉模型 171 | - ⚙️ **灵活配置**: 通过 `MODEL_PROVIDER` 环境变量轻松切换模型 172 | 173 | ### Changed 174 | 175 | - 🔧 环境变量命名优化,支持通用配置(`MODEL_NAME`、`MAX_TOKENS` 等) 176 | - 📝 更新文档,提供双模型配置说明和选择建议 177 | - 🏗️ 重构代码结构,提升可维护性 178 | 179 | ### Technical Details 180 | 181 | - 新增文件: 182 | - `src/vision-client.ts` - 视觉模型客户端统一接口 183 | - `src/siliconflow-client.ts` - 硅基流动 API 客户端实现 184 | - `.env.example` - 配置示例文件 185 | - 修改文件: 186 | - `src/config.ts` - 支持多提供商配置 187 | - `src/zhipu-client.ts` - 实现 VisionClient 接口 188 | - `src/index.ts` - 根据配置动态选择客户端 189 | - `README.md` - 完整的双模型使用文档 190 | 191 | ## [1.0.3] - 2025-11-12 192 | 193 | ### Features 194 | 195 | - 基于智谱 GLM-4.5V 的视觉理解能力 196 | - 支持本地文件和远程 URL 197 | - 内置重试机制 198 | - 思考模式支持 199 | 200 | --- 201 | 202 | **模型对比**: 203 | 204 | || 特性 | GLM-4.5V | DeepSeek-OCR | Qwen3-VL-Flash | 205 | ||----------|----------|--------------|----------------| 206 | || 提供商 | 智谱清言 | 硅基流动 | 阿里云百炼 | 207 | || 费用 | 收费 | **免费** | 收费 | 208 | || 中文理解 | 优秀 | 良好 | **优秀** | 209 | || OCR 能力 | 良好 | **优秀** | 优秀 | 210 | || 思考模式 | ✅ | ❌ | ✅ | 211 | || 速度/成本 | 中等 | 免费 | **快/低** | 212 | || 综合能力 | 良好 | OCR 专精 | **优秀** | 213 | || 3D 定位 | ❌ | ❌ | ✅ | 214 | 215 | **推荐使用场景**: 216 | 217 | - 需要 OCR/文字识别 → **DeepSeek-OCR** (免费) 218 | - 需要深度图片理解 → **Qwen3-VL-Flash** 或 **GLM-4.5V** 219 | - 需要思考模式 → **Qwen3-VL-Flash** 或 **GLM-4.5V** 220 | - 需要高性价比 → **Qwen3-VL-Flash** (速度快、成本低) 221 | - 需要 3D 定位/复杂分析 → **Qwen3-VL-Flash** 222 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Luma MCP 2 | 3 | 多模型视觉理解 MCP 
服务器,为不支持图片理解的 AI 助手提供视觉能力。 4 | 5 | [English](./docs/README_EN.md) | 中文 6 | 7 | ## 特性 8 | 9 | - **多模型支持**: 支持四个视觉模型 10 | - GLM-4.6V(智谱清言)- 付费,中文理解优秀 11 | - DeepSeek-OCR(硅基流动)- **免费使用**,OCR 能力强 12 | - Qwen3-VL-Flash(阿里云通义千问)- 付费,速度快成本低,支持思考模式 13 | - Doubao-Seed-1.6(火山方舟)- 付费,性价比高,支持多种版本 14 | - **简单设计**: 单一 `analyze_image` 工具处理所有图片分析任务 15 | - **智能理解**: 自动识别代码、UI、错误等不同场景 16 | - **全面支持**: 代码截图、界面设计、错误诊断、OCR 文字识别 17 | - **标准 MCP 协议**: 无缝集成 Claude Desktop、Cline 等 MCP 客户端 18 | - **URL 支持**: 支持本地文件和远程图片 URL 19 | - **重试机制**: 内置指数退避重试,提高可靠性 20 | 21 | ## 快速开始 22 | 23 | ### 前置要求 24 | 25 | - Node.js >= 18.0.0 26 | - **选择一种模型**: 27 | - **方案 A**: 智谱 AI API Key ([获取地址](https://open.bigmodel.cn/)) - 中文理解优秀 28 | - **方案 B**: 硅基流动 API Key ([获取地址](https://cloud.siliconflow.cn/)) - **免费使用**,OCR 能力强 29 | - **方案 C**: 阿里云百炼 API Key ([获取地址](https://bailian.console.aliyun.com/)) - 速度快成本低,支持思考模式 30 | - **方案 D**: 火山方舟 API Key ([获取地址](https://console.volcengine.com/ark)) - 性价比高,支持多种版本 31 | 32 | ### 安装 33 | 34 | #### 方式 1: 本地开发(推荐用于测试) 35 | 36 | ```bash 37 | git clone https://github.com/JochenYang/luma-mcp.git 38 | cd luma-mcp 39 | npm install 40 | npm run build 41 | ``` 42 | 43 | #### 方式 2: 使用 npx(需要先发布到 npm) 44 | 45 | ```bash 46 | npx luma-mcp 47 | ``` 48 | 49 | ### 配置 50 | 51 | #### Claude Desktop 52 | 53 | **方案 A: 使用智谱 GLM-4.6V**: 54 | 55 | ```json 56 | { 57 | "mcpServers": { 58 | "luma": { 59 | "command": "npx", 60 | "args": ["-y", "luma-mcp"], 61 | "env": { 62 | "ZHIPU_API_KEY": "your-zhipu-api-key" 63 | } 64 | } 65 | } 66 | } 67 | ``` 68 | 69 | **方案 B: 使用硅基流动 DeepSeek-OCR(免费)**: 70 | 71 | ```json 72 | { 73 | "mcpServers": { 74 | "luma": { 75 | "command": "npx", 76 | "args": ["-y", "luma-mcp"], 77 | "env": { 78 | "MODEL_PROVIDER": "siliconflow", 79 | "SILICONFLOW_API_KEY": "your-siliconflow-api-key" 80 | } 81 | } 82 | } 83 | } 84 | ``` 85 | 86 | **方案 C: 使用阿里云通义千问 Qwen3-VL-Flash**: 87 | 88 | ```json 89 | { 90 | "mcpServers": { 91 | "luma": { 92 | "command": "npx", 93 | "args": ["-y", 
"luma-mcp"], 94 | "env": { 95 | "MODEL_PROVIDER": "qwen", 96 | "DASHSCOPE_API_KEY": "your-dashscope-api-key" 97 | } 98 | } 99 | } 100 | } 101 | ``` 102 | 103 | **方案 D: 使用火山方舟 Doubao-Seed-1.6**: 104 | 105 | ```json 106 | { 107 | "mcpServers": { 108 | "luma": { 109 | "command": "npx", 110 | "args": ["-y", "luma-mcp"], 111 | "env": { 112 | "MODEL_PROVIDER": "volcengine", 113 | "VOLCENGINE_API_KEY": "your-volcengine-api-key", 114 | "MODEL_NAME": "doubao-seed-1-6-flash-250828" 115 | } 116 | } 117 | } 118 | } 119 | ``` 120 | 121 | **本地开发(智谱)**: 122 | 123 | ```json 124 | { 125 | "mcpServers": { 126 | "luma": { 127 | "command": "node", 128 | "args": ["D:\\codes\\Luma_mcp\\build\\index.js"], 129 | "env": { 130 | "ZHIPU_API_KEY": "your-zhipu-api-key" 131 | } 132 | } 133 | } 134 | } 135 | ``` 136 | 137 | **本地开发(硅基流动)**: 138 | 139 | ```json 140 | { 141 | "mcpServers": { 142 | "luma": { 143 | "command": "node", 144 | "args": ["D:\\codes\\Luma_mcp\\build\\index.js"], 145 | "env": { 146 | "MODEL_PROVIDER": "siliconflow", 147 | "SILICONFLOW_API_KEY": "your-siliconflow-api-key" 148 | } 149 | } 150 | } 151 | } 152 | ``` 153 | 154 | 配置完成后重启 Claude Desktop。 155 | 156 | #### Cline (VSCode) 157 | 158 | 在项目根目录或 `.vscode/` 目录下创建 `mcp.json` 159 | 160 | **方案 A: 使用智谱 GLM-4.6V**: 161 | 162 | ```json 163 | { 164 | "mcpServers": { 165 | "luma": { 166 | "command": "npx", 167 | "args": ["-y", "luma-mcp"], 168 | "env": { 169 | "ZHIPU_API_KEY": "your-zhipu-api-key" 170 | } 171 | } 172 | } 173 | } 174 | ``` 175 | 176 | **方案 B: 使用硅基流动 DeepSeek-OCR(免费)**: 177 | 178 | ```json 179 | { 180 | "mcpServers": { 181 | "luma": { 182 | "command": "npx", 183 | "args": ["-y", "luma-mcp"], 184 | "env": { 185 | "MODEL_PROVIDER": "siliconflow", 186 | "SILICONFLOW_API_KEY": "your-siliconflow-api-key" 187 | } 188 | } 189 | } 190 | } 191 | ``` 192 | 193 | **方案 C: 使用阿里云通义千问 Qwen3-VL-Flash**: 194 | 195 | ```json 196 | { 197 | "mcpServers": { 198 | "luma": { 199 | "command": "npx", 200 | "args": ["-y", "luma-mcp"], 201 | 
"env": { 202 | "MODEL_PROVIDER": "qwen", 203 | "DASHSCOPE_API_KEY": "your-dashscope-api-key" 204 | } 205 | } 206 | } 207 | } 208 | ``` 209 | 210 | #### Claude Code (命令行) 211 | 212 | **使用智谱 GLM-4.6V**: 213 | 214 | ```bash 215 | claude mcp add -s user luma-mcp --env ZHIPU_API_KEY=your-api-key -- npx -y luma-mcp 216 | ``` 217 | 218 | **使用硅基流动 DeepSeek-OCR(免费)**: 219 | 220 | ```bash 221 | claude mcp add -s user luma-mcp --env MODEL_PROVIDER=siliconflow --env SILICONFLOW_API_KEY=your-api-key -- npx -y luma-mcp 222 | ``` 223 | 224 | **使用阿里云通义千问 Qwen3-VL-Flash**: 225 | 226 | ```bash 227 | claude mcp add -s user luma-mcp --env MODEL_PROVIDER=qwen --env DASHSCOPE_API_KEY=your-api-key -- npx -y luma-mcp 228 | ``` 229 | 230 | #### 其他工具 231 | 232 | 更多 MCP 客户端配置方法请参考[智谱官方文档](https://docs.bigmodel.cn/cn/coding-plan/mcp/vision-mcp-server#claude-code) 233 | 234 | ## 使用方法 235 | 236 | ### 重要提示 237 | 238 | **MCP 工具调用机制**: 239 | 240 | - MCP 工具需要 AI 模型**主动调用**才会执行 241 | - 如果使用的 AI 模型本身支持视觉(如 Claude 4.5 Sonnet),它会优先使用自己的视觉能力 242 | - Luma MCP 主要服务于**不支持视觉的模型**(如 GPT-4、Claude Opus 等文本模型) 243 | 244 | **如何确保工具被调用**: 245 | 246 | 1. 使用完整工具名:`使用 mcp__luma-mcp__analyze_image 工具分析这张图片` 247 | 2. 使用简化名称:`用 analyze_image 工具查看 ./screenshot.png` 248 | 3. 提供图片路径:`请用图片分析工具查看 ./screenshot.png 中的代码错误` 249 | 4. 明确提及服务器:`通过 luma-mcp 服务器分析这张图片` 250 | 251 | **注意**: 直接在聊天框粘贴图片,非视觉模型不会自动调用 Luma,需要明确指示。 252 | 253 | ### 在 Claude code 中使用 254 | 255 | 配置完成后,在 Claude 对话中可以这样使用: 256 | 257 | **推荐用法(明确指示)**: 258 | 259 | ``` 260 | 用户: 使用 Luma 分析 ./code-error.png,这段代码为什么报错? 
261 | Claude: [调用 Luma 分析图片,返回详细分析] 262 | ``` 263 | 264 | **或提供图片路径**: 265 | 266 | ``` 267 | 用户: 请分析 https://example.com/screenshot.jpg 中的界面问题 268 | Claude: [自动调用 analyze_image 工具] 269 | ``` 270 | 271 | ### 本地测试 272 | 273 | 不需要 MCP 客户端即可测试: 274 | 275 | **测试智谱 GLM-4.6V**: 276 | 277 | ```bash 278 | # 设置 API Key 279 | export ZHIPU_API_KEY="your-api-key" # macOS/Linux 280 | $env:ZHIPU_API_KEY="your-api-key" # Windows PowerShell 281 | 282 | # 测试本地图片 283 | npm run test:local ./test.png 284 | ``` 285 | 286 | **测试硅基流动 DeepSeek-OCR**: 287 | 288 | ```bash 289 | # 设置 API Key 和提供商 290 | export MODEL_PROVIDER=siliconflow 291 | export SILICONFLOW_API_KEY="your-api-key" # macOS/Linux 292 | 293 | $env:MODEL_PROVIDER="siliconflow" 294 | $env:SILICONFLOW_API_KEY="your-api-key" # Windows PowerShell 295 | 296 | # 测试本地图片 297 | npm run test:local ./test.png 298 | ``` 299 | 300 | **测试阿里云通义千问 Qwen3-VL-Flash**: 301 | 302 | ```bash 303 | # 设置 API Key 和提供商 304 | export MODEL_PROVIDER=qwen 305 | export DASHSCOPE_API_KEY="your-api-key" # macOS/Linux 306 | 307 | $env:MODEL_PROVIDER="qwen" 308 | $env:DASHSCOPE_API_KEY="your-api-key" # Windows PowerShell 309 | 310 | # 测试本地图片 311 | npm run test:local ./test.png 312 | ``` 313 | 314 | **其他测试命令**: 315 | 316 | ```bash 317 | # 测试并提问 318 | npm run test:local ./code-error.png "这段代码有什么问题?" 

# 测试远程URL
npm run test:local https://example.com/image.jpg
```

## 工具说明

### analyze_image

分析图片内容的通用工具。

**参数**:

- `image_source` (必需): 图片来源,支持三种格式
  - **本地文件**: 绝对路径或相对路径(例:`./image.png`, `C:\Users\...\image.jpg`)
  - **远程 URL**: https:// 开头的 URL(例:`https://example.com/pic.jpg`)
  - **Data URI**: Base64 编码的图片数据(例:`data:image/png;base64,iVBORw0KGg...`)
  - 支持格式: JPG, PNG, WebP, GIF
- `prompt` (必需): 分析指令或问题

**示例**:

```typescript
// 通用分析
analyze_image({
  image_source: "./screenshot.png",
  prompt: "请详细分析这张图片的内容",
});

// 代码分析
analyze_image({
  image_source: "./code-error.png",
  prompt: "这段代码为什么报错?请提供修复建议",
});

// UI 分析
analyze_image({
  image_source: "https://example.com/ui.png",
  prompt: "分析这个界面的布局和可用性问题",
});

// Data URI (当客户端支持时)
analyze_image({
  image_source: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...",
  prompt: "识别图片中的所有文字",
});
```

## 环境变量

### 通用配置

| 变量名 | 必需 | 默认值 | 说明 |
| ----------------- | ---- | ------- | ---------------------------------------------- |
| `MODEL_PROVIDER` | 否 | `zhipu` | 模型提供商:`zhipu`、`siliconflow`、`qwen` 或 `volcengine` |
| `MODEL_NAME` | 否 | 见下文 | 模型名称(自动根据提供商选择) |
| `MAX_TOKENS` | 否 | `16384` | 最大生成 tokens |
| `TEMPERATURE` | 否 | `0.7` | 温度参数 (0-1) |
| `TOP_P` | 否 | `0.7` | Top-p 参数 (0-1) |
| `ENABLE_THINKING` | 否 | `true` | 是否启用思考模式(GLM-4.6V 和 Qwen3-VL-Flash) |

### 智谱 GLM-4.6V 专用

| 变量名 | 必需 | 默认值 | 说明 |
| --------------- | ---------------- | ------ | ------------------- |
| `ZHIPU_API_KEY` | 是(使用智谱时) | - | 智谱 AI 的 API 密钥 |

默认模型:`glm-4.6v`

### 硅基流动 DeepSeek-OCR 专用

| 变量名 | 必需 | 默认值 | 说明 |
| --------------------- | 
-------------------- | ------ | ------------------- | 392 | | `SILICONFLOW_API_KEY` | 是(使用硅基流动时) | - | 硅基流动的 API 密钥 | 393 | 394 | 默认模型:`deepseek-ai/DeepSeek-OCR` 395 | 396 | ### 阿里云通义千问 Qwen3-VL-Flash 专用 397 | 398 | | 变量名 | 必需 | 默认值 | 说明 | 399 | | ------------------- | ---------------- | ------ | --------------------- | 400 | | `DASHSCOPE_API_KEY` | 是(使用千问时) | - | 阿里云百炼的 API 密钥 | 401 | 402 | 默认模型:`qwen3-vl-flash` 403 | 404 | **思考模式说明**: 405 | 406 | - 默认开启,提高图片分析的准确性和详细程度 407 | - 如需关闭(提高速度、降低成本),请在配置文件中设置: 408 | ```json 409 | { 410 | "mcpServers": { 411 | "luma": { 412 | "command": "npx", 413 | "args": ["-y", "luma-mcp"], 414 | "env": { 415 | "ZHIPU_API_KEY": "your-api-key", 416 | "ENABLE_THINKING": "false" 417 | } 418 | } 419 | } 420 | } 421 | ``` 422 | - 关闭后可节省 20-30% tokens 消耗,响应速度提升约 30% 423 | 424 | ## 开发 425 | 426 | ```bash 427 | # 开发模式(监听文件变化) 428 | npm run watch 429 | 430 | # 构建 431 | npm run build 432 | 433 | # 本地测试 434 | npm run test:local <图片路径> [问题] 435 | ``` 436 | 437 | ## 项目结构 438 | 439 | ``` 440 | luma-mcp/ 441 | ├── src/ 442 | │ ├── index.ts # MCP 服务器入口 443 | │ ├── config.ts # 配置管理(支持多模型) 444 | │ ├── vision-client.ts # 视觉模型客户端接口 445 | │ ├── zhipu-client.ts # GLM-4.6V API 客户端 446 | │ ├── siliconflow-client.ts # DeepSeek-OCR API 客户端 447 | │ ├── qwen-client.ts # Qwen3-VL API 客户端 448 | │ ├── volcengine-client.ts # Doubao-Seed-1.6 API 客户端 449 | │ ├── image-processor.ts # 图片处理 450 | │ └── utils/ 451 | │ ├── logger.ts # 日志工具 452 | │ └── helpers.ts # 工具函数 453 | ├── test/ 454 | │ ├── test-local.ts # 本地测试脚本 455 | │ ├── test-qwen.ts # Qwen 测试脚本 456 | │ ├── test-deepseek-raw.ts # DeepSeek 原始测试脚本 457 | │ └── test-data-uri.ts # Data URI 测试脚本 458 | ├── docs/ 459 | │ ├── design.md # 设计文档 460 | │ ├── installation.md # 安装指南 461 | │ └── README_EN.md # 英文文档 462 | ├── build/ # 编译输出 463 | └── package.json 464 | ``` 465 | 466 | ## 常见问题 467 | 468 | ### 如何获取 API Key? 469 | 470 | **智谱 GLM-4.6V**: 471 | 472 | 1. 访问 [智谱开放平台](https://open.bigmodel.cn/) 473 | 2. 注册/登录账号 474 | 3. 
进入控制台创建 API Key 475 | 4. 复制 API Key 到配置文件 476 | 477 | **硅基流动 DeepSeek-OCR(免费)**: 478 | 479 | 1. 访问 [硅基流动平台](https://cloud.siliconflow.cn/) 480 | 2. 注册/登录账号 481 | 3. 进入 API 管理创建 API Key 482 | 4. 复制 API Key 到配置文件 483 | 484 | **阿里云通义千问 Qwen3-VL-Flash**: 485 | 486 | 1. 访问 [阿里云百炼平台](https://bailian.console.aliyun.com/) 487 | 2. 注册/登录账号 488 | 3. 进入 API-KEY 管理创建 API Key 489 | 4. 复制 API Key 到配置文件 490 | 491 | ### 支持哪些图片格式? 492 | 493 | 支持 JPG、PNG、WebP、GIF 格式。建议使用 JPG 格式以获得更好的压缩率。 494 | 495 | ### 什么是 Data URI? 496 | 497 | Data URI 是一种将图片数据嵌入字符串的方式,格式为: 498 | 499 | ``` 500 | data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA... 501 | ``` 502 | 503 | **使用场景**: 504 | 505 | - 当 MCP 客户端(如 Claude Desktop)支持时,可以直接传递用户粘贴的图片 506 | - 无需保存为临时文件,更加高效 507 | - 当前支持状态:**服务器已支持**,等待客户端实现 508 | 509 | ### 图片大小限制? 510 | 511 | - 最大文件大小: 10MB 512 | - 超过 2MB 的图片会自动压缩 513 | - 推荐分辨率: 800-2048 像素 514 | 515 | ### 如何查看日志? 516 | 517 | 日志文件位置: `~/.luma-mcp/luma-mcp-YYYY-MM-DD.log` 518 | 519 | ### API 调用失败怎么办? 520 | 521 | 1. 检查 API Key 是否正确 522 | 2. 确认账户余额充足(智谱/阿里云) 523 | 3. 检查网络连接 524 | 4. 查看日志文件了解详细错误信息 525 | 526 | ### 成本如何? 527 | 528 | **硅基流动 DeepSeek-OCR**: **完全免费**,无需付费! 529 | 530 | **智谱 GLM-4.6V**: 定价请参考[智谱官方定价](https://open.bigmodel.cn/pricing)。 531 | 532 | **阿里云通义千问 Qwen3-VL-Flash**: 定价请参考[阿里云百炼定价](https://help.aliyun.com/zh/model-studio/getting-started/models)。 533 | 534 | 典型场景估算(已启用思考模式): 535 | 536 | - 简单图片理解: 500-1000 tokens 537 | - 代码截图分析: 1500-2500 tokens 538 | - 详细 UI 分析: 2000-3000 tokens 539 | 540 | 关闭思考模式可节省约 20-30% tokens。如需关闭,请设置 `ENABLE_THINKING=false`。 541 | 542 | ### 如何选择模型? 
543 | 544 | | 特性 | GLM-4.6V(智谱) | DeepSeek-OCR(硅基流动) | Qwen3-VL-Flash(阿里云) | 545 | | ------------- | ---------------- | ------------------------ | ------------------------ | 546 | | **费用** | 收费 | **完全免费** | 收费 | 547 | | **中文理解** | 优秀 | 良好 | **优秀** | 548 | | **OCR 能力** | 良好 | **优秀** | 优秀 | 549 | | **思考模式** | 支持 | 不支持 | 支持 | 550 | | **速度/成本** | 中等 | 免费 | **快速/低成本** | 551 | | **适用场景** | 通用图片分析 | OCR、文字识别 | 快速分析、3D 定位 | 552 | 553 | **推荐**: 554 | 555 | - 需要 OCR 或文字识别:选择 **DeepSeek-OCR**(免费) 556 | - 需要快速低成本分析:选择 **Qwen3-VL-Flash** 557 | - 需要深度图片理解:选择 **GLM-4.6V** 558 | 559 | ## 贡献 560 | 561 | 欢迎提交 Issue 和 Pull Request! 562 | 563 | ## 许可证 564 | 565 | MIT License 566 | 567 | ## 相关链接 568 | 569 | - [智谱 AI 开放平台](https://open.bigmodel.cn/) 570 | - [GLM-4.6V 文档](https://docs.bigmodel.cn/cn/guide/models/vlm/glm-4.6v) 571 | - [硅基流动平台](https://cloud.siliconflow.cn/) 572 | - [DeepSeek-OCR 文档](https://docs.siliconflow.cn/cn/api-reference/chat-completions/chat-completions) 573 | - [阿里云百炼平台](https://bailian.console.aliyun.com/) 574 | - [Qwen3-VL 文档](https://help.aliyun.com/zh/model-studio/getting-started/models) 575 | - [MCP 协议文档](https://modelcontextprotocol.io/) 576 | 577 | ## 更新日志 578 | 579 | 更多更新历史请查看 [CHANGELOG.md](./CHANGELOG.md) 580 | 581 | ## 作者 582 | 583 | Jochen 584 | 585 | --- 586 | 587 | **注意**: 请勿在公开仓库中提交包含真实 API Key 的配置文件。 588 | -------------------------------------------------------------------------------- /docs/README_EN.md: -------------------------------------------------------------------------------- 1 | # Luma MCP 2 | 3 | Multi-model vision understanding MCP server, providing visual capabilities to AI assistants that don't natively support image understanding. 
4 | 5 | English | [中文](../README.md) 6 | 7 | ## Features 8 | 9 | - **Multi-Model Support**: Supports four vision models 10 | - GLM-4.6V (Zhipu) - Paid, excellent Chinese understanding 11 | - DeepSeek-OCR (SiliconFlow) - **Free to use**, strong OCR capability 12 | - Qwen3-VL-Flash (Aliyun) - Paid, fast and cost-effective, supports thinking mode 13 | - Doubao-Seed-1.6 (Volcengine) - Paid, cost-effective, supports multiple versions 14 | - **Simple Design**: Single `analyze_image` tool handles all image analysis tasks 15 | - **Smart Understanding**: Automatically recognizes different scenarios (code, UI, errors, etc.) 16 | - **Comprehensive Support**: Code screenshots, UI design, error diagnosis, OCR text recognition 17 | - **Standard MCP Protocol**: Seamless integration with Claude Desktop, Cline, and other MCP clients 18 | - **URL Support**: Handles both local files and remote image URLs 19 | - **Retry Mechanism**: Built-in exponential backoff retry for reliability 20 | 21 | ## Quick Start 22 | 23 | ### Prerequisites 24 | 25 | - Node.js >= 18.0.0 26 | - **Choose one model**: 27 | - **Option A**: Zhipu AI API Key ([Get it here](https://open.bigmodel.cn/)) - Excellent Chinese understanding 28 | - **Option B**: SiliconFlow API Key ([Get it here](https://cloud.siliconflow.cn/)) - **Free to use**, Strong OCR capability 29 | - **Option C**: Aliyun Bailian API Key ([Get it here](https://bailian.console.aliyun.com/)) - Fast and cost-effective, supports thinking mode 30 | - **Option D**: Volcengine API Key ([Get it here](https://console.volcengine.com/ark)) - Cost-effective, supports multiple versions 31 | 32 | ### Installation 33 | 34 | #### Method 1: Local Development (Recommended for testing) 35 | 36 | ```bash 37 | git clone https://github.com/yourusername/luma-mcp.git 38 | cd luma-mcp 39 | npm install 40 | npm run build 41 | ``` 42 | 43 | #### Method 2: Using npx (After publishing to npm) 44 | 45 | ```bash 46 | npx luma-mcp 47 | ``` 48 | 49 | ### Configuration 50 | 51 | 
#### Claude Desktop 52 | 53 | **Option A: Using Zhipu GLM-4.6V**: 54 | 55 | ```json 56 | { 57 | "mcpServers": { 58 | "luma": { 59 | "command": "npx", 60 | "args": ["-y", "luma-mcp"], 61 | "env": { 62 | "ZHIPU_API_KEY": "your-zhipu-api-key" 63 | } 64 | } 65 | } 66 | } 67 | ``` 68 | 69 | **Option B: Using SiliconFlow DeepSeek-OCR (Free)**: 70 | 71 | ```json 72 | { 73 | "mcpServers": { 74 | "luma": { 75 | "command": "npx", 76 | "args": ["-y", "luma-mcp"], 77 | "env": { 78 | "MODEL_PROVIDER": "siliconflow", 79 | "SILICONFLOW_API_KEY": "your-siliconflow-api-key" 80 | } 81 | } 82 | } 83 | } 84 | ``` 85 | 86 | **Option C: Using Aliyun Qwen3-VL-Flash**: 87 | 88 | ```json 89 | { 90 | "mcpServers": { 91 | "luma": { 92 | "command": "npx", 93 | "args": ["-y", "luma-mcp"], 94 | "env": { 95 | "MODEL_PROVIDER": "qwen", 96 | "DASHSCOPE_API_KEY": "your-dashscope-api-key" 97 | } 98 | } 99 | } 100 | } 101 | ``` 102 | 103 | **Option D: Using Volcengine Doubao-Seed-1.6**: 104 | 105 | ```json 106 | { 107 | "mcpServers": { 108 | "luma": { 109 | "command": "npx", 110 | "args": ["-y", "luma-mcp"], 111 | "env": { 112 | "MODEL_PROVIDER": "volcengine", 113 | "VOLCENGINE_API_KEY": "your-volcengine-api-key", 114 | "MODEL_NAME": "doubao-seed-1-6-flash-250828" 115 | } 116 | } 117 | } 118 | } 119 | ``` 120 | 121 | **Local Development (Zhipu)**: 122 | 123 | ```json 124 | { 125 | "mcpServers": { 126 | "luma": { 127 | "command": "node", 128 | "args": ["D:\\codes\\Luma_mcp\\build\\index.js"], 129 | "env": { 130 | "ZHIPU_API_KEY": "your-zhipu-api-key" 131 | } 132 | } 133 | } 134 | } 135 | ``` 136 | 137 | **Local Development (SiliconFlow)**: 138 | 139 | ```json 140 | { 141 | "mcpServers": { 142 | "luma": { 143 | "command": "node", 144 | "args": ["D:\\codes\\Luma_mcp\\build\\index.js"], 145 | "env": { 146 | "MODEL_PROVIDER": "siliconflow", 147 | "SILICONFLOW_API_KEY": "your-siliconflow-api-key" 148 | } 149 | } 150 | } 151 | } 152 | ``` 153 | 154 | Restart Claude Desktop after configuration. 
155 | 156 | #### Cline (VSCode) 157 | 158 | Create `mcp.json` in project root or `.vscode/` directory 159 | 160 | **Option A: Using Zhipu GLM-4.6V**: 161 | 162 | ```json 163 | { 164 | "mcpServers": { 165 | "luma": { 166 | "command": "npx", 167 | "args": ["-y", "luma-mcp"], 168 | "env": { 169 | "ZHIPU_API_KEY": "your-zhipu-api-key" 170 | } 171 | } 172 | } 173 | } 174 | ``` 175 | 176 | **Option B: Using SiliconFlow DeepSeek-OCR (Free)**: 177 | 178 | ```json 179 | { 180 | "mcpServers": { 181 | "luma": { 182 | "command": "npx", 183 | "args": ["-y", "luma-mcp"], 184 | "env": { 185 | "MODEL_PROVIDER": "siliconflow", 186 | "SILICONFLOW_API_KEY": "your-siliconflow-api-key" 187 | } 188 | } 189 | } 190 | } 191 | ``` 192 | 193 | **Option C: Using Aliyun Qwen3-VL-Flash**: 194 | 195 | ```json 196 | { 197 | "mcpServers": { 198 | "luma": { 199 | "command": "npx", 200 | "args": ["-y", "luma-mcp"], 201 | "env": { 202 | "MODEL_PROVIDER": "qwen", 203 | "DASHSCOPE_API_KEY": "your-dashscope-api-key" 204 | } 205 | } 206 | } 207 | } 208 | ``` 209 | 210 | #### Claude Code (CLI) 211 | 212 | **Using Zhipu GLM-4.6V**: 213 | ```bash 214 | claude mcp add -s user luma-mcp --env ZHIPU_API_KEY=your-api-key -- npx -y luma-mcp 215 | ``` 216 | 217 | **Using SiliconFlow DeepSeek-OCR (Free)**: 218 | ```bash 219 | claude mcp add -s user luma-mcp --env MODEL_PROVIDER=siliconflow --env SILICONFLOW_API_KEY=your-api-key -- npx -y luma-mcp 220 | ``` 221 | 222 | **Using Aliyun Qwen3-VL-Flash**: 223 | ```bash 224 | claude mcp add -s user luma-mcp --env MODEL_PROVIDER=qwen --env DASHSCOPE_API_KEY=your-api-key -- npx -y luma-mcp 225 | ``` 226 | 227 | #### Other Tools 228 | 229 | For more MCP client configuration methods, refer to [Zhipu Official Documentation](https://docs.bigmodel.cn/cn/coding-plan/mcp/vision-mcp-server#claude-code) 230 | 231 | ## Usage 232 | 233 | ### Important Notes 234 | 235 | **MCP Tool Invocation Mechanism**: 236 | - MCP tools require the AI model to **actively call** them to execute 237 
| - If the AI model itself supports vision (like Claude 4.5 Sonnet), it will prioritize its native vision capabilities 238 | - Luma MCP primarily serves **non-vision models** (like GPT-4, Claude Opus, etc.) 239 | 240 | **How to Ensure Tool Invocation**: 241 | 1. Use full tool name: `Use mcp__luma-mcp__analyze_image tool to analyze this image` 242 | 2. Use simplified name: `Use analyze_image tool to view ./screenshot.png` 243 | 3. Provide image path: `Use image analysis tool to check ./screenshot.png for code errors` 244 | 4. Mention server explicitly: `Analyze this image via luma-mcp server` 245 | 246 | **Note**: Simply pasting an image in the chat box won't automatically trigger Luma for non-vision models - explicit instruction is required. 247 | 248 | ### Using in Claude Desktop 249 | 250 | After configuration, use it in Claude conversations like this: 251 | 252 | **Recommended Usage (Explicit Instruction)**: 253 | ``` 254 | User: Use Luma to analyze ./code-error.png, why is this code throwing an error? 
255 | Claude: [Calls Luma to analyze the image and returns detailed analysis] 256 | ``` 257 | 258 | **Or Provide Image Path**: 259 | ``` 260 | User: Please analyze the interface issues in https://example.com/screenshot.jpg 261 | Claude: [Automatically calls analyze_image tool] 262 | ``` 263 | 264 | ### Local Testing 265 | 266 | Test without MCP clients: 267 | 268 | **Test Zhipu GLM-4.6V**: 269 | ```bash 270 | # Set API Key 271 | export ZHIPU_API_KEY="your-api-key" # macOS/Linux 272 | $env:ZHIPU_API_KEY="your-api-key" # Windows PowerShell 273 | 274 | # Test local image 275 | npm run test:local ./test.png 276 | ``` 277 | 278 | **Test SiliconFlow DeepSeek-OCR**: 279 | ```bash 280 | # Set API Key and provider 281 | export MODEL_PROVIDER=siliconflow 282 | export SILICONFLOW_API_KEY="your-api-key" # macOS/Linux 283 | 284 | $env:MODEL_PROVIDER="siliconflow" 285 | $env:SILICONFLOW_API_KEY="your-api-key" # Windows PowerShell 286 | 287 | # Test local image 288 | npm run test:local ./test.png 289 | ``` 290 | 291 | **Test Aliyun Qwen3-VL-Flash**: 292 | ```bash 293 | # Set API Key and provider 294 | export MODEL_PROVIDER=qwen 295 | export DASHSCOPE_API_KEY="your-api-key" # macOS/Linux 296 | 297 | $env:MODEL_PROVIDER="qwen" 298 | $env:DASHSCOPE_API_KEY="your-api-key" # Windows PowerShell 299 | 300 | # Test local image 301 | npm run test:local ./test.png 302 | ``` 303 | 304 | **Other test commands**: 305 | ```bash 306 | # Test with question 307 | npm run test:local ./code-error.png "What's wrong with this code?" 308 | 309 | # Test remote URL 310 | npm run test:local https://example.com/image.jpg 311 | ``` 312 | 313 | ## Tool Reference 314 | 315 | ### analyze_image 316 | 317 | Universal tool for analyzing image content. 
318 | 319 | **Parameters**: 320 | 321 | - `image_source` (required): Image source, supports three formats 322 | - **Local file**: Absolute or relative path (e.g., `./image.png`, `C:\Users\...\image.jpg`) 323 | - **Remote URL**: URL starting with https:// (e.g., `https://example.com/pic.jpg`) 324 | - **Data URI**: Base64-encoded image data (e.g., `data:image/png;base64,iVBORw0KGg...`) 325 | - Supported formats: JPG, PNG, WebP, GIF 326 | - `prompt` (required): Analysis instruction or question about the image 327 | 328 | **Examples**: 329 | 330 | ```typescript 331 | // General analysis 332 | analyze_image({ 333 | image_source: "./screenshot.png", 334 | prompt: "Please analyze this image in detail" 335 | }) 336 | 337 | // Code analysis 338 | analyze_image({ 339 | image_source: "./code-error.png", 340 | prompt: "Why is this code throwing an error? Provide fix suggestions" 341 | }) 342 | 343 | // UI analysis 344 | analyze_image({ 345 | image_source: "https://example.com/ui.png", 346 | prompt: "Analyze the layout and usability issues of this interface" 347 | }) 348 | 349 | // Data URI (when client supports it) 350 | analyze_image({ 351 | image_source: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA...", 352 | prompt: "Extract all text from the image" 353 | }) 354 | ``` 355 | 356 | ## Environment Variables 357 | 358 | ### General Configuration 359 | 360 | | Variable | Required | Default | Description | 361 | |-------------------|----------|-----------|--------------------------------------------------------------| 362 | | `MODEL_PROVIDER` | No | `zhipu` | Model provider: `zhipu`, `siliconflow`, `qwen`, or `volcengine` | 363 | | `MODEL_NAME` | No | See below | Model name (auto-selected based on provider) | 364 | | `MAX_TOKENS` | No | `16384` | Maximum tokens to generate | 365 | | `TEMPERATURE` | No | `0.7` | Temperature (0-1) | 366 | | `TOP_P` | No | `0.7` | Top-p parameter (0-1) | 367 | | `ENABLE_THINKING` | No | `true` | Enable thinking mode (GLM-4.6V and Qwen3-VL-Flash) | 368 | 
369 | ### Zhipu GLM-4.6V Specific 370 | 371 | | Variable | Required | Default | Description | 372 | |------------------|----------------------|------------|-------------------| 373 | | `ZHIPU_API_KEY` | Yes (when using Zhipu) | - | Zhipu AI API key | 374 | 375 | Default model: `glm-4.6v` 376 | 377 | ### SiliconFlow DeepSeek-OCR Specific 378 | 379 | | Variable | Required | Default | Description | 380 | |------------------------|-------------------------------|------------------------------|------------------------| 381 | | `SILICONFLOW_API_KEY` | Yes (when using SiliconFlow) | - | SiliconFlow API key | 382 | 383 | Default model: `deepseek-ai/DeepSeek-OCR` 384 | 385 | ### Aliyun Qwen3-VL-Flash Specific 386 | 387 | | Variable | Required | Default | Description | 388 | |-----------------------|----------------------------|----------|------------------------------| 389 | | `DASHSCOPE_API_KEY` | Yes (when using Qwen) | - | Aliyun Bailian API key | 390 | 391 | Default model: `qwen3-vl-flash` 392 | 393 | **Thinking Mode**: 394 | - Enabled by default for better accuracy and detailed analysis 395 | - To disable (faster speed, lower cost), set in config: 396 | ```json 397 | { 398 | "mcpServers": { 399 | "luma": { 400 | "command": "npx", 401 | "args": ["-y", "luma-mcp"], 402 | "env": { 403 | "ZHIPU_API_KEY": "your-api-key", 404 | "ENABLE_THINKING": "false" 405 | } 406 | } 407 | } 408 | } 409 | ``` 410 | - Disabling saves ~20-30% tokens and improves speed by ~30% 411 | 412 | ## Development 413 | 414 | ```bash 415 | # Development mode (watch for changes) 416 | npm run watch 417 | 418 | # Build 419 | npm run build 420 | 421 | # Local test 422 | npm run test:local <image-path> [question] 423 | ``` 424 | 425 | ## Project Structure 426 | 427 | ``` 428 | luma-mcp/ 429 | ├── src/ 430 | │ ├── index.ts # MCP server entry 431 | │ ├── config.ts # Configuration management (multi-model) 432 | │ ├── vision-client.ts # Vision model client interface 433 | │ ├── zhipu-client.ts # GLM-4.6V API client 
434 | │ ├── siliconflow-client.ts # DeepSeek-OCR API client 435 | │ ├── qwen-client.ts # Qwen3-VL API client 436 | │ ├── volcengine-client.ts # Doubao-Seed-1.6 API client 437 | │ ├── image-processor.ts # Image processing 438 | │ └── utils/ 439 | │ ├── logger.ts # Logging utilities 440 | │ └── helpers.ts # Helper functions 441 | ├── test/ 442 | │ ├── test-local.ts # Local testing script 443 | │ ├── test-qwen.ts # Qwen testing script 444 | │ ├── test-deepseek-raw.ts # DeepSeek raw testing script 445 | │ └── test-data-uri.ts # Data URI testing script 446 | ├── docs/ 447 | │ ├── design.md # Design documentation 448 | │ ├── installation.md # Installation guide 449 | │ └── README_EN.md # English documentation 450 | ├── build/ # Build output 451 | └── package.json 452 | ``` 453 | 454 | ## FAQ 455 | 456 | ### How to get API Key? 457 | 458 | **Zhipu GLM-4.6V**: 459 | 1. Visit [Zhipu Open Platform](https://open.bigmodel.cn/) 460 | 2. Register/Login 461 | 3. Go to console and create API Key 462 | 4. Copy API Key to configuration file 463 | 464 | **SiliconFlow DeepSeek-OCR (Free)**: 465 | 1. Visit [SiliconFlow Platform](https://cloud.siliconflow.cn/) 466 | 2. Register/Login 467 | 3. Go to API management and create API Key 468 | 4. Copy API Key to configuration file 469 | 470 | **Aliyun Qwen3-VL-Flash**: 471 | 1. Visit [Aliyun Bailian Platform](https://bailian.console.aliyun.com/) 472 | 2. Register/Login 473 | 3. Go to API-KEY management and create API Key 474 | 4. Copy API Key to configuration file 475 | 476 | ### What image formats are supported? 477 | 478 | Supports JPG, PNG, WebP, GIF. JPG format is recommended for better compression. 479 | 480 | ### What is a Data URI? 481 | 482 | A Data URI is a way to embed image data into a string, formatted as: 483 | ``` 484 | data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA... 
485 | ``` 486 | 487 | **Use cases**: 488 | - When MCP clients (like Claude Desktop) support it, can directly pass user-pasted images 489 | - No need to save as temporary files, more efficient 490 | - Current status: **Server supports**, waiting for client implementation 491 | 492 | ### Image size limits? 493 | 494 | - Maximum file size: 10MB 495 | - Images over 2MB will be automatically compressed 496 | - Recommended resolution: 800-2048 pixels 497 | 498 | ### How to view logs? 499 | 500 | Log file location: `~/.luma-mcp/luma-mcp-YYYY-MM-DD.log` 501 | 502 | ### What if API call fails? 503 | 504 | 1. Check if API Key is correct 505 | 2. Confirm sufficient balance in your account (Zhipu/Aliyun) 506 | 3. Check network connection 507 | 4. View log file for detailed error information 508 | 509 | ### What's the cost? 510 | 511 | **SiliconFlow DeepSeek-OCR**: **Completely free**, no charges! 512 | 513 | **Zhipu GLM-4.6V**: For pricing, refer to [Zhipu Official Pricing](https://open.bigmodel.cn/pricing). 514 | 515 | **Aliyun Qwen3-VL-Flash**: For pricing, refer to [Aliyun Bailian Pricing](https://help.aliyun.com/zh/model-studio/getting-started/models). 516 | 517 | Typical scenario estimates: 518 | - Simple image understanding: 500-1000 tokens 519 | - Code screenshot analysis: 1500-2500 tokens 520 | - Detailed UI analysis: 2000-3000 tokens 521 | 522 | Enabling thinking mode (GLM-4.6V/Qwen3-VL-Flash) increases tokens by approximately 20-30%. 523 | 524 | ### How to choose a model? 
525 | 526 | | Feature | GLM-4.6V (Zhipu) | DeepSeek-OCR (SiliconFlow) | Qwen3-VL-Flash (Aliyun) | 527 | |------------------|-------------------|----------------------------|-------------------------| 528 | | **Cost** | Paid | **Completely Free** | Paid | 529 | | **Chinese** | Excellent | Good | **Excellent** | 530 | | **OCR** | Good | **Excellent** | Excellent | 531 | | **Thinking Mode**| Supported | Not supported | Supported | 532 | | **Speed/Cost** | Medium | Free | **Fast/Low Cost** | 533 | | **Use Cases** | General analysis | OCR, Text recognition | Fast analysis, 3D positioning | 534 | 535 | **Recommendations**: 536 | - Need OCR/text recognition → **DeepSeek-OCR** (free) 537 | - Need fast and cost-effective analysis → **Qwen3-VL-Flash** 538 | - Need deep image understanding → **GLM-4.6V** 539 | 540 | ## Contributing 541 | 542 | Issues and Pull Requests are welcome! 543 | 544 | ## License 545 | 546 | MIT License 547 | 548 | ## Related Links 549 | 550 | - [Zhipu AI Open Platform](https://open.bigmodel.cn/) 551 | - [GLM-4.6V Documentation](https://docs.bigmodel.cn/cn/guide/models/vlm/glm-4.6v) 552 | - [SiliconFlow Platform](https://cloud.siliconflow.cn/) 553 | - [DeepSeek-OCR Documentation](https://docs.siliconflow.cn/cn/api-reference/chat-completions/chat-completions) 554 | - [Aliyun Bailian Platform](https://bailian.console.aliyun.com/) 555 | - [Qwen3-VL Documentation](https://help.aliyun.com/zh/model-studio/getting-started/models) 556 | - [MCP Protocol Documentation](https://modelcontextprotocol.io/) 557 | 558 | ## Changelog 559 | 560 | For more update history, see [CHANGELOG.md](../CHANGELOG.md) 561 | 562 | ## Author 563 | 564 | Jochen 565 | 566 | --- 567 | 568 | **Note**: Do not commit configuration files containing real API Keys to public repositories. 569 | --------------------------------------------------------------------------------