├── .clinerules
│   └── mcp-ragdocs_clinerules.md
├── src
│   ├── tools
│   │   ├── index.ts
│   │   ├── base-tool.ts
│   │   ├── clear-queue.ts
│   │   ├── list-queue.ts
│   │   ├── list-sources.ts
│   │   ├── search-documentation.ts
│   │   ├── remove-documentation.ts
│   │   ├── run-queue.ts
│   │   └── extract-urls.ts
│   ├── types
│   │   └── ollama.d.ts
│   ├── handlers
│   │   ├── clear-queue.ts
│   │   ├── base-handler.ts
│   │   ├── index.ts
│   │   ├── prompts-list.ts
│   │   ├── resources-list.ts
│   │   ├── list-queue.ts
│   │   ├── search-documentation.ts
│   │   ├── remove-documentation.ts
│   │   ├── remove-repository.ts
│   │   ├── list-repositories.ts
│   │   ├── run-queue.ts
│   │   ├── extract-urls.ts
│   │   ├── get-indexing-status.ts
│   │   ├── add-documentation.ts
│   │   ├── list-sources.ts
│   │   ├── watch-repository.ts
│   │   ├── update-repository.ts
│   │   └── local-repository.ts
│   ├── types.ts
│   ├── utils
│   │   ├── language-detection.ts
│   │   ├── repository-watcher.ts
│   │   ├── indexing-status-manager.ts
│   │   └── repository-config-loader.ts
│   ├── index.ts
│   ├── api-client.ts
│   ├── services
│   │   └── embeddings.ts
│   ├── public
│   │   └── index.html
│   ├── server.ts
│   └── handler-registry.ts
├── tsconfig.json
├── docker-compose.yml
├── repositories.json
├── .gitignore
├── smithery.yaml
├── TASK.md
├── Dockerfile
├── LICENSE
├── repositories.json.example
├── CHANGELOG.md
├── package.json
├── KNOWLEDGE.md
└── README.md

/.clinerules/mcp-ragdocs_clinerules.md:
--------------------------------------------------------------------------------
- Refer to KNOWLEDGE.md and README.md for useful context
- Keep both files up to date after completing a task
--------------------------------------------------------------------------------

/src/tools/index.ts:
--------------------------------------------------------------------------------
export * from './search-documentation.js';
export * from './list-sources.js';
export * from './extract-urls.js';
export * from './remove-documentation.js';
export * from './list-queue.js';
export * from './run-queue.js';
export * from './clear-queue.js';
--------------------------------------------------------------------------------

/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "Node16",
    "moduleResolution": "Node16",
    "outDir": "./build",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules"]
}
--------------------------------------------------------------------------------

/src/types/ollama.d.ts:
--------------------------------------------------------------------------------
declare module 'ollama' {
  export interface EmbeddingsRequest {
    model: string;
    prompt: string;
    options?: Record<string, unknown>;
  }

  export interface EmbeddingsResponse {
    embedding: number[];
  }

  const ollama: {
    embeddings(request: EmbeddingsRequest): Promise<EmbeddingsResponse>;
  };

  export default ollama;
}
--------------------------------------------------------------------------------

/docker-compose.yml:
--------------------------------------------------------------------------------
services:
  qdrant:
    image: qdrant/qdrant:latest
    restart: always
    ports:
      - "6333:6333" # REST API
      - "6334:6334" # GRPC API
    volumes:
      - qdrant_storage:/qdrant/storage
    environment:
      - QDRANT_ALLOW_RECOVERY=true
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:6333/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3

volumes:
  qdrant_storage:
--------------------------------------------------------------------------------
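
The compose file above only provisions Qdrant; connecting to it from the Node side is a one-liner with the @qdrant/js-client-rest dependency declared in package.json. A minimal sketch (not a file from this repo; the fallback URL mirrors the compose port mapping):

import { QdrantClient } from '@qdrant/js-client-rest';

// Connects to the Qdrant instance exposed on port 6333 by docker-compose.yml.
// QDRANT_URL is the same environment variable the rest of this repo reads.
const qdrant = new QdrantClient({
  url: process.env.QDRANT_URL ?? 'http://localhost:6333',
});

// Sanity check: list collections (the server code uses one named 'documentation').
const collections = await qdrant.getCollections();
console.log(collections.collections.map((c) => c.name));
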

/repositories.json:
--------------------------------------------------------------------------------
{
  "repositories": [
    {
      "path": "/Users/guillaumeb/dev",
      "name": "goali",
      "include": ["**/*.js", "**/*.ts", "**/*.md"],
      "exclude": ["**/node_modules/**", "**/.git/**"],
      "watchMode": true,
      "watchInterval": 60000,
      "chunkSize": 1000,
      "fileTypeConfig": {
        ".py": { "include": true, "chunkStrategy": "semantic" },
        ".md": { "include": true, "chunkStrategy": "semantic" }
      }
    }
  ],
  "autoWatch": true
}
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
# Dependencies
node_modules/
.pnp/
.pnp.js

# Build output
build/
dist/
*.tsbuildinfo

# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local

# Logs
logs/
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Editor directories and files
.idea/
.vscode/
*.swp
*.swo
.DS_Store

# Test coverage
coverage/

# Local documentation files
INTERNAL.TXT
queue.txt
MCPguide.txt
--------------------------------------------------------------------------------

/src/handlers/clear-queue.ts:
--------------------------------------------------------------------------------
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { ApiClient } from '../api-client.js';
import { ClearQueueTool } from '../tools/clear-queue.js';

export class ClearQueueHandler extends ClearQueueTool {
  constructor(server: Server, apiClient: ApiClient) {
    super();
  }

  async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }) {
    // ClearQueueTool.execute doesn't use callContext, so we don't pass it.
    return this.execute(args);
  }
}
--------------------------------------------------------------------------------
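
repositories.json above pairs include/exclude glob lists with each repository. The config loader itself (src/utils/repository-config-loader.ts) is not part of this dump, so the following is only an illustrative sketch of how those globs could be resolved with the glob package this project depends on:

import { glob } from 'glob';
import path from 'path';

// Hypothetical helper (not repo code): expand a repository's include patterns
// relative to its path, dropping anything matched by the exclude patterns.
async function resolveRepositoryFiles(repo: {
  path: string;
  include: string[];
  exclude: string[];
}): Promise<string[]> {
  const matches = await glob(repo.include, {
    cwd: repo.path,
    ignore: repo.exclude,
    nodir: true, // files only
  });
  return matches.map((file) => path.join(repo.path, file));
}
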

/src/handlers/base-handler.ts:
--------------------------------------------------------------------------------
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { ApiClient } from '../api-client.js';
import { McpToolResponse } from '../types.js';

export abstract class BaseHandler {
  protected server: Server;
  protected apiClient: ApiClient;

  constructor(server: Server, apiClient: ApiClient) {
    this.server = server;
    this.apiClient = apiClient;
  }

  protected abstract handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse>;
}
--------------------------------------------------------------------------------

/src/handlers/index.ts:
--------------------------------------------------------------------------------
export * from './base-handler.js';
export * from './add-documentation.js';
export * from './search-documentation.js';
export * from './list-sources.js';
export * from './extract-urls.js';
export * from './remove-documentation.js';
export * from './list-queue.js';
export * from './run-queue.js';
export * from './clear-queue.js';
export * from './prompts-list.js';
export * from './resources-list.js';
export * from './local-repository.js';
export * from './list-repositories.js';
export * from './remove-repository.js';
export * from './update-repository.js';
export * from './watch-repository.js';
export * from './get-indexing-status.js';
--------------------------------------------------------------------------------

/src/tools/base-tool.ts:
--------------------------------------------------------------------------------
import { ToolDefinition, McpToolResponse } from '../types.js';

export abstract class BaseTool {
  abstract get definition(): ToolDefinition;
  abstract execute(args: unknown): Promise<McpToolResponse>;

  protected formatResponse(data: unknown): McpToolResponse {
    return {
      content: [
        {
          type: 'text',
          text: JSON.stringify(data, null, 2),
        },
      ],
    };
  }

  protected handleError(error: any): McpToolResponse {
    return {
      content: [
        {
          type: 'text',
          text: `Error: ${error}`,
        },
      ],
      isError: true,
    };
  }
}
--------------------------------------------------------------------------------

/smithery.yaml:
--------------------------------------------------------------------------------
# Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml

startCommand:
  type: stdio
  configSchema:
    # JSON Schema defining the configuration options for the MCP.
    type: object
    required:
      - openAiApiKey
      - qdrantUrl
    properties:
      openAiApiKey:
        type: string
        description: API key for accessing OpenAI's services.
      qdrantUrl:
        type: string
        description: URL for the Qdrant vector database.
  commandFunction:
    # A function that produces the CLI command to start the MCP on stdio.
    |-
    (config) => ({command:'node', args:['build/index.js'], env:{OPENAI_API_KEY:config.openAiApiKey, QDRANT_URL:config.qdrantUrl}})
--------------------------------------------------------------------------------
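
src/tools/base-tool.ts above fixes the contract every tool follows: a declarative definition plus an execute method, with shared response and error formatting. A hypothetical minimal subclass (not part of this repo) to make the pattern concrete:

import { BaseTool } from './base-tool.js';
import { ToolDefinition, McpToolResponse } from '../types.js';

// Hypothetical example tool showing the BaseTool contract: expose a
// definition, implement execute, and reuse formatResponse/handleError.
export class PingTool extends BaseTool {
  get definition(): ToolDefinition {
    return {
      name: 'ping',
      description: 'Return a static response to verify the server is alive',
      inputSchema: { type: 'object', properties: {}, required: [] },
    };
  }

  async execute(_args: unknown): Promise<McpToolResponse> {
    try {
      return this.formatResponse({ ok: true, timestamp: new Date().toISOString() });
    } catch (error) {
      return this.handleError(error);
    }
  }
}
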

/TASK.md:
--------------------------------------------------------------------------------
# MCP RAG Docs Tasks

## Pending Tasks

### Repository Indexing Enhancements
- [ ] Implement incremental indexing for faster updates
- [ ] Add support for custom chunking strategies
- [ ] Improve language detection for better code chunking
- [ ] Add support for binary file indexing (e.g., PDFs)

### Web Interface Improvements
- [ ] Add repository management to web interface
- [ ] Implement real-time indexing status display
- [ ] Add search interface for testing queries
- [ ] Create dashboard for system monitoring

### Documentation
- [ ] Create comprehensive API documentation
- [ ] Add examples for all tools
- [ ] Create user guide with common workflows
- [ ] Add developer documentation for extending the system
--------------------------------------------------------------------------------

/src/handlers/prompts-list.ts:
--------------------------------------------------------------------------------
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { ApiClient } from '../api-client.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse } from '../types.js';

export class PromptsListHandler extends BaseHandler {
  constructor(server: Server, apiClient: ApiClient) {
    super(server, apiClient);
  }

  async handle(_args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    // Return an empty list of prompts.
    // Minimal implementation so prompts/list requests from clients
    // succeed instead of erroring out.
    return {
      content: [
        {
          type: 'text',
          text: JSON.stringify({ prompts: [] })
        }
      ]
    };
  }
}
--------------------------------------------------------------------------------

/src/handlers/resources-list.ts:
--------------------------------------------------------------------------------
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { ApiClient } from '../api-client.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse } from '../types.js';

export class ResourcesListHandler extends BaseHandler {
  constructor(server: Server, apiClient: ApiClient) {
    super(server, apiClient);
  }

  async handle(_args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    // Return an empty list of resources.
    // Minimal implementation so resources/list requests from clients
    // succeed instead of erroring out.
    return {
      content: [
        {
          type: 'text',
          text: JSON.stringify({ resources: [] })
        }
      ]
    };
  }
}
--------------------------------------------------------------------------------
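
Both handlers above answer prompts/list and resources/list with empty lists. How they get wired to the MCP SDK lives in src/handler-registry.ts, which this dump does not include; the sketch below is one plausible registration using the SDK's request schemas, returning the protocol-level shape directly. Treat the wiring as an assumption, not the repo's actual code:

import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import {
  ListPromptsRequestSchema,
  ListResourcesRequestSchema,
} from '@modelcontextprotocol/sdk/types.js';

// Assumed wiring: answer prompts/list and resources/list with empty lists so
// clients that probe these capabilities do not receive errors.
function registerListHandlers(server: Server) {
  server.setRequestHandler(ListPromptsRequestSchema, async () => ({ prompts: [] }));
  server.setRequestHandler(ListResourcesRequestSchema, async () => ({ resources: [] }));
}
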

/Dockerfile:
--------------------------------------------------------------------------------
# Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
# Stage 1: Build the TypeScript project
FROM node:18 AS builder

# Set working directory
WORKDIR /app

# Copy package.json and package-lock.json for npm install
COPY package.json package-lock.json ./

# Install dependencies
RUN npm install

# Copy the entire source code to the working directory
COPY . .

# Build the project
RUN npm run build

# Stage 2: Run the project
FROM node:18

# Set working directory
WORKDIR /app

# Copy built files from the builder stage
COPY --from=builder /app/build /app/build
COPY --from=builder /app/node_modules /app/node_modules
COPY --from=builder /app/package.json /app/package.json

# Define environment variables
ENV OPENAI_API_KEY=your-api-key-here
ENV QDRANT_URL=http://localhost:6333

# Start the application
CMD ["node", "build/index.js"]
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
This project is a fork of qpd-v/mcp-ragdocs, originally developed by qpd-v.
The fork has been enhanced with additional features and improvements by Rahul Retnan.

MIT License

Copyright (c) 2025 Rahul Retnan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/repositories.json.example:
--------------------------------------------------------------------------------
{
  "repositories": [
    {
      "path": "/path/to/your/first/repo",
      "name": "my-project",
      "include": ["**/*.js", "**/*.ts", "**/*.md"],
      "exclude": [
        "**/node_modules/**",
        "**/.git/**",
        "**/build/**",
        "**/dist/**",
        "**/*.min.js",
        "**/*.map",
        "**/package-lock.json",
        "**/yarn.lock"
      ],
      "watchMode": true,
      "watchInterval": 60000,
      "chunkSize": 1000,
      "fileTypeConfig": {
        ".js": { "include": true, "chunkStrategy": "semantic" },
        ".ts": { "include": true, "chunkStrategy": "semantic" },
        ".md": { "include": true, "chunkStrategy": "semantic" },
        ".json": { "include": true, "chunkStrategy": "line" }
      }
    },
    {
      "path": "/path/to/your/second/repo",
      "name": "documentation",
      "include": ["**/*.md", "**/*.txt", "**/*.rst"],
      "exclude": [
        "**/.git/**",
        "**/node_modules/**"
      ],
      "watchMode": false,
      "watchInterval": 300000,
      "chunkSize": 1500,
      "fileTypeConfig": {
        ".md": { "include": true, "chunkStrategy": "semantic" },
        ".txt": { "include": true, "chunkStrategy": "line" },
        ".rst": { "include": true, "chunkStrategy": "semantic" }
      }
    }
  ],
  "autoWatch": true
}
--------------------------------------------------------------------------------
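
The example config maps file extensions to per-type chunking options. Using the RepositoryConfig shape defined in src/types.ts, a small illustrative helper (not from this repo) shows how those settings would be resolved for a given file:

import path from 'path';
import type { RepositoryConfig } from '../types.js';

// Pick the chunking settings for one file: the per-extension entry wins,
// falling back to the repository-wide chunkSize and a line-based strategy
// (the 'line' default here is an assumption for illustration).
function chunkSettingsFor(repo: RepositoryConfig, filePath: string) {
  const ext = path.extname(filePath).toLowerCase();
  const fileConfig = repo.fileTypeConfig[ext];
  if (fileConfig && !fileConfig.include) {
    return null; // this file type is explicitly excluded from indexing
  }
  return {
    strategy: fileConfig?.chunkStrategy ?? 'line',
    chunkSize: fileConfig?.chunkSize ?? repo.chunkSize,
  };
}
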

/CHANGELOG.md:
--------------------------------------------------------------------------------
# Changelog

## [1.1.0] - 2024-03-14

### Initial Feature Addition

- Implemented new clear_queue tool for queue management
- Created src/tools/clear-queue.ts with core functionality
- Added handler in src/handlers/clear-queue.ts
- Integrated with existing queue management system
- Added tool exports and registration

### Code Organization

- Improved tool ordering in handler-registry.ts
- Moved remove_documentation before extract_urls
- Enhanced logical grouping of related tools
- Updated imports to match new ordering

### Documentation Enhancement Phase 1

- Enhanced tool descriptions in handler-registry.ts:
  1. search_documentation
     - Added natural language query support details
     - Clarified result ranking and context
     - Improved limit parameter documentation
  2. list_sources
     - Added details about indexed documentation
     - Clarified source information returned
  3. extract_urls
     - Enhanced URL crawling explanation
     - Added queue integration details
     - Clarified URL validation requirements
  4. remove_documentation
     - Added permanence warning
     - Clarified URL matching requirements
  5. list_queue
     - Added queue monitoring details
     - Clarified status checking capabilities
  6. run_queue
     - Added processing behavior details
     - Documented error handling
  7. clear_queue
     - Detailed queue clearing behavior
     - Added permanence warnings
     - Documented URL re-adding requirements

### Documentation Enhancement Phase 2

- Updated README.md
  - Removed add_documentation and queue_documentation tools
  - Updated tool descriptions to match handler-registry.ts
  - Added parameter format requirements
  - Enhanced usage guidance
--------------------------------------------------------------------------------

/package.json:
--------------------------------------------------------------------------------
{
  "name": "@rahulretnan/mcp-ragdocs",
  "version": "1.0.0",
  "description": "An MCP server for semantic documentation search and retrieval using vector databases to augment LLM capabilities.",
  "private": false,
  "type": "module",
  "bin": {
    "@rahulretnan/mcp-ragdocs": "./build/index.js"
  },
  "files": [
    "build",
    "README.md",
    "LICENSE"
  ],
  "scripts": {
    "build": "tsc && node -e \"require('fs').chmodSync('build/index.js', '755')\"",
    "prepare": "npm run build",
    "watch": "tsc --watch",
    "inspector": "npx @modelcontextprotocol/inspector build/index.js",
    "start": "node build/index.js"
  },
  "keywords": [
    "mcp",
    "model-context-protocol",
    "rag",
    "documentation",
    "vector-database",
    "qdrant",
    "claude",
    "llm"
  ],
  "author": "rahul",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/rahulretnan/mcp-ragdocs.git"
  },
  "bugs": {
    "url": "https://github.com/rahulretnan/mcp-ragdocs/issues"
  },
  "homepage": "https://github.com/rahulretnan/mcp-ragdocs#readme",
  "dependencies": {
    "@azure/openai": "2.0.0",
    "@modelcontextprotocol/sdk": "1.0.3",
    "@qdrant/js-client-rest": "1.12.0",
    "axios": "^1.9.0",
    "cheerio": "1.0.0",
    "cors": "^2.8.5",
    "express": "^4.21.2",
    "file-type": "^18.7.0",
    "glob": "^10.4.5",
    "net": "^1.0.2",
    "ollama": "^0.5.11",
    "openai": "4.76.2",
    "playwright": "1.49.1"
  },
  "devDependencies": {
    "@types/cors": "^2.8.17",
    "@types/express": "^5.0.0",
    "@types/node": "^20.17.10",
    "ts-node": "^10.9.2",
    "typescript": "^5.7.2"
  },
  "publishConfig": {
    "access": "public"
  }
}
--------------------------------------------------------------------------------

/src/handlers/list-queue.ts:
--------------------------------------------------------------------------------
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { ApiClient } from '../api-client.js';
import { BaseHandler } from './base-handler.js';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

// Get current directory in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const QUEUE_FILE = path.join(__dirname, '..', '..', 'queue.txt');

export class ListQueueHandler extends BaseHandler {
  constructor(server: Server, apiClient: ApiClient) {
    super(server, apiClient);
  }

  async handle(_args: any, callContext?: { progressToken?: string | number, requestId: string | number }) {
    try {
      // Check if queue file exists
      try {
        await fs.access(QUEUE_FILE);
      } catch {
        return {
          content: [
            {
              type: 'text',
              text: 'Queue is empty (queue file does not exist)',
            },
          ],
        };
      }

      // Read queue file
      const content = await fs.readFile(QUEUE_FILE, 'utf-8');
      const urls = content.split('\n').filter(url => url.trim() !== '');

      if (urls.length === 0) {
        return {
          content: [
            {
              type: 'text',
              text: 'Queue is empty',
            },
          ],
        };
      }

      return {
        content: [
          {
            type: 'text',
            text: `Queue contains ${urls.length} URLs:\n${urls.join('\n')}`,
          },
        ],
      };
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Failed to read queue: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/tools/clear-queue.ts:
--------------------------------------------------------------------------------
import { BaseTool } from './base-tool.js';
import { ToolDefinition, McpToolResponse } from '../types.js';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

// Get current directory in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const QUEUE_FILE = path.join(__dirname, '..', '..', 'queue.txt');

export class ClearQueueTool extends BaseTool {
  get definition(): ToolDefinition {
    return {
      name: 'clear_queue',
      description: 'Clear all URLs from the queue',
      inputSchema: {
        type: 'object',
        properties: {},
        required: [],
      },
    };
  }

  async execute(_args: any): Promise<McpToolResponse> {
    try {
      // Check if queue file exists
      try {
        await fs.access(QUEUE_FILE);
      } catch {
        return {
          content: [
            {
              type: 'text',
              text: 'Queue is already empty (queue file does not exist)',
            },
          ],
        };
      }

      // Read current queue to get count of URLs being cleared
      const content = await fs.readFile(QUEUE_FILE, 'utf-8');
      const urlCount = content.split('\n').filter(url => url.trim() !== '').length;

      // Clear the queue by emptying the file
      await fs.writeFile(QUEUE_FILE, '');

      return {
        content: [
          {
            type: 'text',
            text: `Queue cleared successfully. Removed ${urlCount} URL${urlCount === 1 ? '' : 's'} from the queue.`,
          },
        ],
      };
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Failed to clear queue: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------
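
Both queue implementations above treat queue.txt as a newline-delimited list of URLs, with a missing file meaning an empty queue. Two illustrative helpers (not part of the repo) that state this contract explicitly:

import fs from 'fs/promises';

// Read the queue: one URL per line, blank lines ignored, missing file = empty.
async function readQueue(queueFile: string): Promise<string[]> {
  try {
    const content = await fs.readFile(queueFile, 'utf-8');
    return content.split('\n').filter((url) => url.trim() !== '');
  } catch {
    return []; // a missing file is treated as an empty queue
  }
}

// Append URLs, keeping the trailing-newline convention the tools rely on.
async function enqueue(queueFile: string, urls: string[]): Promise<void> {
  if (urls.length === 0) return;
  await fs.appendFile(queueFile, urls.join('\n') + '\n');
}
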

/src/tools/list-queue.ts:
--------------------------------------------------------------------------------
import fs from "fs/promises";
import path, { dirname } from "path";
import { fileURLToPath } from "url";
import { McpToolResponse, ToolDefinition } from "../types.js";
import { BaseTool } from "./base-tool.js";

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const rootDir = path.join(__dirname, "../..");
const QUEUE_FILE = path.join(rootDir, "queue.txt");

export class ListQueueTool extends BaseTool {
  constructor() {
    super();
  }

  get definition(): ToolDefinition {
    return {
      name: "list_queue",
      description:
        "List all URLs currently in the documentation processing queue",
      inputSchema: {
        type: "object",
        properties: {},
        required: [],
      },
    };
  }

  async execute(_args: any): Promise<McpToolResponse> {
    try {
      // Check if queue file exists
      try {
        await fs.access(QUEUE_FILE);
      } catch {
        return {
          content: [
            {
              type: "text",
              text: "",
            },
          ],
        };
      }

      // Read queue file
      const content = await fs.readFile(QUEUE_FILE, "utf-8");
      const urls = content.split("\n").filter((url) => url.trim() !== "");

      if (urls.length === 0) {
        return {
          content: [
            {
              type: "text",
              text: "",
            },
          ],
        };
      }

      // Return just the URLs, one per line
      return {
        content: [
          {
            type: "text",
            text: urls.join("\n"),
          },
        ],
      };
    } catch (error) {
      console.error("Error reading queue:", error);
      return {
        content: [
          {
            type: "text",
            text: "",
          },
        ],
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/types.ts:
--------------------------------------------------------------------------------
export interface DocumentChunk {
  text: string;
  url: string;
  title: string;
  timestamp: string;
  filePath?: string;
  language?: string;
  chunkIndex?: number;
  totalChunks?: number;
  repository?: string;
  isRepositoryFile?: boolean;
}

export interface DocumentPayload extends DocumentChunk {
  _type: 'DocumentChunk';
  [key: string]: unknown;
}

export function isDocumentPayload(payload: unknown): payload is DocumentPayload {
  if (!payload || typeof payload !== 'object') return false;
  const p = payload as Partial<DocumentPayload>;
  return (
    p._type === 'DocumentChunk' &&
    typeof p.text === 'string' &&
    typeof p.url === 'string' &&
    typeof p.title === 'string' &&
    typeof p.timestamp === 'string'
  );
}

export interface ToolDefinition {
  name: string;
  description: string;
  inputSchema: {
    type: string;
    properties: Record<string, unknown>;
    required: string[];
  };
}

export interface McpToolResponse {
  content: Array<{
    type: string;
    text: string;
  }>;
  isError?: boolean;
}

export interface RepositoryConfig {
  path: string;          // Absolute path to repository
  name: string;          // User-friendly name
  include: string[];     // Glob patterns to include
  exclude: string[];     // Glob patterns to exclude
  watchMode: boolean;    // Whether to watch for changes
  watchInterval: number; // Polling interval in ms
  chunkSize: number;     // Default chunk size for files
  fileTypeConfig: {      // Per file type configuration
    [extension: string]: {
      include: boolean;
      chunkSize?: number;
      chunkStrategy?: 'line' | 'character' | 'semantic';
    }
  }
}

export interface IndexingStatus {
  repositoryName: string;
  status: 'pending' | 'processing' | 'completed' | 'failed';
  startTime: string;
  endTime?: string;
  totalFiles?: number;
  processedFiles?: number;
  skippedFiles?: number;
  totalChunks?: number;
  indexedChunks?: number;
  currentBatch?: number;
  totalBatches?: number;
  percentageComplete?: number;
  error?: string;
  lastUpdated: string;
}
--------------------------------------------------------------------------------

/src/handlers/search-documentation.ts:
--------------------------------------------------------------------------------
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse, isDocumentPayload } from '../types.js';

const COLLECTION_NAME = 'documentation';

export class SearchDocumentationHandler extends BaseHandler {
  async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    if (!args.query || typeof args.query !== 'string') {
      throw new McpError(ErrorCode.InvalidParams, 'Query is required');
    }

    const limit = args.limit || 5;

    try {
      const queryEmbedding = await this.apiClient.getEmbeddings(args.query);

      const searchResults = await this.apiClient.qdrantClient.search(COLLECTION_NAME, {
        vector: queryEmbedding,
        limit,
        with_payload: true,
        with_vector: false, // Optimize network transfer by not retrieving vectors
        score_threshold: 0.7, // Only return relevant results
      });

      const formattedResults = searchResults.map(result => {
        if (!isDocumentPayload(result.payload)) {
          throw new Error('Invalid payload type');
        }
        return `[${result.payload.title}](${result.payload.url})\nScore: ${result.score.toFixed(3)}\nContent: ${result.payload.text}\n`;
      }).join('\n---\n');

      return {
        content: [
          {
            type: 'text',
            text: formattedResults || 'No results found matching the query.',
          },
        ],
      };
    } catch (error) {
      if (error instanceof Error) {
        if (error.message.includes('unauthorized')) {
          throw new McpError(
            ErrorCode.InvalidRequest,
            'Failed to authenticate with Qdrant cloud while searching'
          );
        } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
          throw new McpError(
            ErrorCode.InternalError,
            'Connection to Qdrant cloud failed while searching'
          );
        }
      }
      return {
        content: [
          {
            type: 'text',
            text: `Search failed: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------
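
The handler above searches the whole documentation collection. Because repository chunks carry the repository and isRepositoryFile payload fields from types.ts, the same search can be scoped to one repository with a Qdrant filter; the filter shape below matches the one remove-repository.ts uses for deletes. A sketch, not repo code:

import { QdrantClient } from '@qdrant/js-client-rest';

// Illustrative only: restrict a semantic search to chunks indexed from a
// single repository by filtering on the payload fields defined in types.ts.
async function searchRepository(
  qdrant: QdrantClient,
  vector: number[],
  repositoryName: string,
  limit = 5
) {
  return qdrant.search('documentation', {
    vector,
    limit,
    with_payload: true,
    with_vector: false,
    score_threshold: 0.7,
    filter: {
      must: [
        { key: 'repository', match: { value: repositoryName } },
        { key: 'isRepositoryFile', match: { value: true } },
      ],
    },
  });
}
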

/src/handlers/remove-documentation.ts:
--------------------------------------------------------------------------------
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse } from '../types.js';

const COLLECTION_NAME = 'documentation';

export class RemoveDocumentationHandler extends BaseHandler {
  async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    if (!args.urls || !Array.isArray(args.urls) || args.urls.length === 0) {
      throw new McpError(ErrorCode.InvalidParams, 'urls must be a non-empty array');
    }

    if (!args.urls.every((url: string) => typeof url === 'string')) {
      throw new McpError(ErrorCode.InvalidParams, 'All URLs must be strings');
    }

    try {
      // Delete using filter to match any of the provided URLs
      const result = await this.apiClient.qdrantClient.delete(COLLECTION_NAME, {
        filter: {
          should: args.urls.map((url: string) => ({
            key: 'url',
            match: { value: url }
          }))
        },
        wait: true // Ensure deletion is complete before responding
      });

      if (!['acknowledged', 'completed'].includes(result.status)) {
        throw new Error('Delete operation failed');
      }

      return {
        content: [
          {
            type: 'text',
            text: `Successfully removed documentation from ${args.urls.length} source${args.urls.length > 1 ? 's' : ''}: ${args.urls.join(', ')}`,
          },
        ],
      };
    } catch (error) {
      if (error instanceof Error) {
        if (error.message.includes('unauthorized')) {
          throw new McpError(
            ErrorCode.InvalidRequest,
            'Failed to authenticate with Qdrant cloud while removing documentation'
          );
        } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
          throw new McpError(
            ErrorCode.InternalError,
            'Connection to Qdrant cloud failed while removing documentation'
          );
        }
      }
      return {
        content: [
          {
            type: 'text',
            text: `Failed to remove documentation: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/handlers/remove-repository.ts:
--------------------------------------------------------------------------------
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse } from '../types.js';
import fs from 'fs/promises';
import path from 'path';
import { RepositoryConfigLoader } from '../utils/repository-config-loader.js';

const REPO_CONFIG_DIR = path.join(process.cwd(), 'repo-configs');
const COLLECTION_NAME = 'documentation';

export class RemoveRepositoryHandler extends BaseHandler {
  async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    if (!args.name || typeof args.name !== 'string') {
      throw new McpError(ErrorCode.InvalidParams, 'Repository name is required');
    }

    const repoName = args.name;
    const configPath = path.join(REPO_CONFIG_DIR, `${repoName}.json`);

    try {
      // Check if the repository config exists
      try {
        await fs.access(configPath);
      } catch {
        throw new McpError(ErrorCode.InvalidParams, `Repository not found: ${repoName}`);
      }

      // Read the config to get repository details
      const configContent = await fs.readFile(configPath, 'utf-8');
      const config = JSON.parse(configContent);

      // Remove the repository config file
      await fs.unlink(configPath);

      // Update the repositories.json configuration file
      const configLoader = new RepositoryConfigLoader(this.server, this.apiClient);
      await configLoader.removeRepositoryFromConfig(repoName);

      // Remove repository documents from the vector database
      const result = await this.apiClient.qdrantClient.delete(COLLECTION_NAME, {
        filter: {
          must: [
            {
              key: 'repository',
              match: { value: repoName }
            },
            {
              key: 'isRepositoryFile',
              match: { value: true }
            }
          ]
        },
        wait: true
      });

      return {
        content: [
          {
            type: 'text',
            text: `Successfully removed repository: ${repoName} (${config.path})`,
          },
        ],
      };
    } catch (error) {
      if (error instanceof McpError) {
        throw error;
      }
      return {
        content: [
          {
            type: 'text',
            text: `Failed to remove repository: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/tools/list-sources.ts:
--------------------------------------------------------------------------------
import { BaseTool } from './base-tool.js';
import { ToolDefinition, McpToolResponse, isDocumentPayload } from '../types.js';
import { ApiClient } from '../api-client.js';
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';

const COLLECTION_NAME = 'documentation';

export class ListSourcesTool extends BaseTool {
  private apiClient: ApiClient;

  constructor(apiClient: ApiClient) {
    super();
    this.apiClient = apiClient;
  }

  get definition(): ToolDefinition {
    return {
      name: 'list_sources',
      description: 'List all documentation sources currently stored',
      inputSchema: {
        type: 'object',
        properties: {},
        required: [],
      },
    };
  }

  async execute(args: any): Promise<McpToolResponse> {
    try {
      // Use pagination for better performance with large datasets
      const pageSize = 100;
      let offset: string | null = null;
      const sources = new Set<string>();

      while (true) {
        const scroll = await this.apiClient.qdrantClient.scroll(COLLECTION_NAME, {
          with_payload: true,
          with_vector: false, // Optimize network transfer
          limit: pageSize,
          offset,
        });

        if (scroll.points.length === 0) break;

        for (const point of scroll.points) {
          if (isDocumentPayload(point.payload)) {
            sources.add(`${point.payload.title} (${point.payload.url})`);
          }
        }

        if (scroll.points.length < pageSize) break;
        offset = scroll.points[scroll.points.length - 1].id as string;
      }

      return {
        content: [
          {
            type: 'text',
            text: Array.from(sources).join('\n') || 'No documentation sources found in the cloud collection.',
          },
        ],
      };
    } catch (error) {
      if (error instanceof Error) {
        if (error.message.includes('unauthorized')) {
          throw new McpError(
            ErrorCode.InvalidRequest,
            'Failed to authenticate with Qdrant cloud while listing sources'
          );
        } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
          throw new McpError(
            ErrorCode.InternalError,
            'Connection to Qdrant cloud failed while listing sources'
          );
        }
      }
      return {
        content: [
          {
            type: 'text',
            text: `Failed to list sources: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------
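
The scroll loop above derives the next offset from the last point's id. The Qdrant client also hands back a next_page_offset cursor on each page; a sketch of the same loop driven by that cursor (assuming the @qdrant/js-client-rest response shape):

import { QdrantClient } from '@qdrant/js-client-rest';
import { isDocumentPayload } from '../types.js';

// Cursor-based variant of the pagination in ListSourcesTool.execute.
async function collectSources(qdrant: QdrantClient): Promise<string[]> {
  const sources = new Set<string>();
  let offset: string | number | Record<string, unknown> | null | undefined;

  do {
    const page = await qdrant.scroll('documentation', {
      with_payload: true,
      with_vector: false,
      limit: 100,
      offset: offset ?? undefined,
    });
    for (const point of page.points) {
      if (isDocumentPayload(point.payload)) {
        sources.add(`${point.payload.title} (${point.payload.url})`);
      }
    }
    offset = page.next_page_offset; // null/undefined when no more pages
  } while (offset !== null && offset !== undefined);

  return Array.from(sources);
}
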

/src/handlers/list-repositories.ts:
--------------------------------------------------------------------------------
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse, RepositoryConfig } from '../types.js';
import fs from 'fs/promises';
import path from 'path';

const REPO_CONFIG_DIR = path.join(process.cwd(), 'repo-configs');

export class ListRepositoriesHandler extends BaseHandler {
  async handle(_args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    try {
      // Ensure the config directory exists
      try {
        await fs.mkdir(REPO_CONFIG_DIR, { recursive: true });
      } catch (error) {
        console.error('Error creating repository config directory:', error);
      }

      // Get all repository config files
      let configFiles: string[];
      try {
        configFiles = await fs.readdir(REPO_CONFIG_DIR);
      } catch (error) {
        return {
          content: [
            {
              type: 'text',
              text: 'No repositories found (config directory is empty)',
            },
          ],
        };
      }

      // Filter for JSON files
      configFiles = configFiles.filter(file => file.endsWith('.json'));

      if (configFiles.length === 0) {
        return {
          content: [
            {
              type: 'text',
              text: 'No repositories found',
            },
          ],
        };
      }

      // Load each repository config
      const repositories: RepositoryConfig[] = [];
      for (const file of configFiles) {
        try {
          const configPath = path.join(REPO_CONFIG_DIR, file);
          const configContent = await fs.readFile(configPath, 'utf-8');
          const config = JSON.parse(configContent) as RepositoryConfig;
          repositories.push(config);
        } catch (error) {
          console.error(`Error loading repository config ${file}:`, error);
        }
      }

      // Format the response
      const repoList = repositories.map(repo => {
        return `- ${repo.name} (${repo.path})
  Include: ${repo.include.join(', ')}
  Exclude: ${repo.exclude.join(', ')}
  Watch Mode: ${repo.watchMode ? 'Enabled' : 'Disabled'}
  File Types: ${Object.keys(repo.fileTypeConfig).length} configured`;
      });

      return {
        content: [
          {
            type: 'text',
            text: repositories.length > 0
              ? `Found ${repositories.length} repositories:\n\n${repoList.join('\n\n')}`
              : 'No valid repositories found',
          },
        ],
      };
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Failed to list repositories: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/utils/language-detection.ts:
--------------------------------------------------------------------------------
import path from 'path';

// Map of file extensions to language names
const EXTENSION_TO_LANGUAGE: Record<string, string> = {
  // JavaScript and TypeScript
  '.js': 'javascript',
  '.jsx': 'javascript',
  '.ts': 'typescript',
  '.tsx': 'typescript',

  // Web
  '.html': 'html',
  '.css': 'css',
  '.scss': 'scss',
  '.less': 'less',

  // Python
  '.py': 'python',
  '.ipynb': 'jupyter',

  // Java and JVM languages
  '.java': 'java',
  '.kt': 'kotlin',
  '.scala': 'scala',
  '.groovy': 'groovy',

  // C-family
  '.c': 'c',
  '.cpp': 'cpp',
  '.cc': 'cpp',
  '.h': 'c',
  '.hpp': 'cpp',
  '.cs': 'csharp',

  // Ruby
  '.rb': 'ruby',
  '.erb': 'ruby',

  // PHP
  '.php': 'php',

  // Go
  '.go': 'go',

  // Rust
  '.rs': 'rust',

  // Swift
  '.swift': 'swift',

  // Shell
  '.sh': 'bash',
  '.bash': 'bash',
  '.zsh': 'bash',

  // Data formats
  '.json': 'json',
  '.xml': 'xml',
  '.yaml': 'yaml',
  '.yml': 'yaml',
  '.toml': 'toml',

  // Documentation
  '.md': 'markdown',
  '.markdown': 'markdown',
  '.rst': 'restructuredtext',
  '.txt': 'text',

  // Configuration
  '.ini': 'ini',
  '.cfg': 'ini',
  '.conf': 'ini',
  '.properties': 'properties',

  // Other
  '.sql': 'sql',
  '.graphql': 'graphql',
  '.proto': 'protobuf',
  '.dockerfile': 'dockerfile',
  '.Dockerfile': 'dockerfile',
};

// Shebang patterns for script files
const SHEBANG_PATTERNS: Array<[RegExp, string]> = [
  [/^#!.*\bpython\b/, 'python'],
  [/^#!.*\bnode\b/, 'javascript'],
  [/^#!.*\bbash\b/, 'bash'],
  [/^#!.*\bsh\b/, 'bash'],
  [/^#!.*\bruby\b/, 'ruby'],
  [/^#!.*\bperl\b/, 'perl'],
  [/^#!.*\bphp\b/, 'php'],
];

/**
 * Detect the programming language of a file based on its extension and content
 *
 * @param filePath The path to the file
 * @param content The content of the file
 * @returns The detected language or 'unknown'
 */
export function detectLanguage(filePath: string, content: string): string {
  const extension = path.extname(filePath).toLowerCase();

  // Check if we have a direct mapping for this extension
  if (extension in EXTENSION_TO_LANGUAGE) {
    return EXTENSION_TO_LANGUAGE[extension];
  }

  // Special case for Dockerfiles
  if (path.basename(filePath) === 'Dockerfile') {
    return 'dockerfile';
  }

  // Check for shebang in the first line for script files
  const firstLine = content.split('\n')[0];
  for (const [pattern, language] of SHEBANG_PATTERNS) {
    if (pattern.test(firstLine)) {
      return language;
    }
  }

  // Default to 'unknown' if we couldn't determine the language
  return 'unknown';
}
--------------------------------------------------------------------------------
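
detectLanguage's precedence is: extension map first, then the Dockerfile basename special case, then shebang sniffing, else 'unknown'. A few illustrative calls:

import { detectLanguage } from './language-detection.js';

console.log(detectLanguage('src/app.tsx', ''));                        // "typescript" via extension map
console.log(detectLanguage('Dockerfile', 'FROM node:18'));             // "dockerfile" via basename special case
console.log(detectLanguage('deploy', '#!/usr/bin/env bash\necho hi')); // "bash" via shebang pattern
console.log(detectLanguage('blob.bin', ''));                           // "unknown" fallback
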

/src/tools/search-documentation.ts:
--------------------------------------------------------------------------------
import { BaseTool } from './base-tool.js';
import { ToolDefinition, McpToolResponse, isDocumentPayload } from '../types.js';
import { ApiClient } from '../api-client.js';
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';

const COLLECTION_NAME = 'documentation';

export class SearchDocumentationTool extends BaseTool {
  private apiClient: ApiClient;

  constructor(apiClient: ApiClient) {
    super();
    this.apiClient = apiClient;
  }

  get definition(): ToolDefinition {
    return {
      name: 'search_documentation',
      description: 'Search through stored documentation',
      inputSchema: {
        type: 'object',
        properties: {
          query: {
            type: 'string',
            description: 'Search query',
          },
          limit: {
            type: 'number',
            description: 'Maximum number of results to return',
            default: 5,
          },
        },
        required: ['query'],
      },
    };
  }

  async execute(args: any): Promise<McpToolResponse> {
    if (!args.query || typeof args.query !== 'string') {
      throw new McpError(ErrorCode.InvalidParams, 'Query is required');
    }

    const limit = args.limit || 5;

    try {
      const queryEmbedding = await this.apiClient.getEmbeddings(args.query);

      const searchResults = await this.apiClient.qdrantClient.search(COLLECTION_NAME, {
        vector: queryEmbedding,
        limit,
        with_payload: true,
        with_vector: false, // Optimize network transfer by not retrieving vectors
        score_threshold: 0.7, // Only return relevant results
      });

      const formattedResults = searchResults.map(result => {
        if (!isDocumentPayload(result.payload)) {
          throw new Error('Invalid payload type');
        }
        return `[${result.payload.title}](${result.payload.url})\nScore: ${result.score.toFixed(3)}\nContent: ${result.payload.text}\n`;
      }).join('\n---\n');

      return {
        content: [
          {
            type: 'text',
            text: formattedResults || 'No results found matching the query.',
          },
        ],
      };
    } catch (error) {
      if (error instanceof Error) {
        if (error.message.includes('unauthorized')) {
          throw new McpError(
            ErrorCode.InvalidRequest,
            'Failed to authenticate with Qdrant cloud while searching'
          );
        } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
          throw new McpError(
            ErrorCode.InternalError,
            'Connection to Qdrant cloud failed while searching'
          );
        }
      }
      return {
        content: [
          {
            type: 'text',
            text: `Search failed: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/tools/remove-documentation.ts:
--------------------------------------------------------------------------------
import { BaseTool } from './base-tool.js';
import { ToolDefinition, McpToolResponse } from '../types.js';
import { ApiClient } from '../api-client.js';
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';

const COLLECTION_NAME = 'documentation';

export class RemoveDocumentationTool extends BaseTool {
  private apiClient: ApiClient;

  constructor(apiClient: ApiClient) {
    super();
    this.apiClient = apiClient;
  }

  get definition(): ToolDefinition {
    return {
      name: 'remove_documentation',
      description: 'Remove one or more documentation sources by their URLs',
      inputSchema: {
        type: 'object',
        properties: {
          urls: {
            type: 'array',
            items: {
              type: 'string',
              description: 'URL of a documentation source to remove'
            },
            description: 'Array of URLs to remove. Can be a single URL or multiple URLs.',
            minItems: 1
          }
        },
        required: ['urls'],
      },
    };
  }

  async execute(args: { urls: string[] }): Promise<McpToolResponse> {
    if (!Array.isArray(args.urls) || args.urls.length === 0) {
      throw new McpError(ErrorCode.InvalidParams, 'At least one URL is required');
    }

    if (!args.urls.every(url => typeof url === 'string')) {
      throw new McpError(ErrorCode.InvalidParams, 'All URLs must be strings');
    }

    try {
      // Delete using filter to match any of the provided URLs
      const result = await this.apiClient.qdrantClient.delete(COLLECTION_NAME, {
        filter: {
          should: args.urls.map(url => ({
            key: 'url',
            match: { value: url }
          }))
        },
        wait: true
      });

      if (!['acknowledged', 'completed'].includes(result.status)) {
        throw new Error('Delete operation failed');
      }

      return {
        content: [
          {
            type: 'text',
            text: `Successfully removed documentation from ${args.urls.length} source${args.urls.length > 1 ? 's' : ''}: ${args.urls.join(', ')}`,
          },
        ],
      };
    } catch (error) {
      if (error instanceof Error) {
        if (error.message.includes('unauthorized')) {
          throw new McpError(
            ErrorCode.InvalidRequest,
            'Failed to authenticate with Qdrant cloud while removing documentation'
          );
        } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) {
          throw new McpError(
            ErrorCode.InternalError,
            'Connection to Qdrant cloud failed while removing documentation'
          );
        }
      }
      return {
        content: [
          {
            type: 'text',
            text: `Failed to remove documentation: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/handlers/run-queue.ts:
--------------------------------------------------------------------------------
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { ApiClient } from '../api-client.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse } from '../types.js';
import { AddDocumentationHandler } from './add-documentation.js';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

// Get current directory in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const QUEUE_FILE = path.join(__dirname, '..', '..', 'queue.txt');

export class RunQueueHandler extends BaseHandler {
  private addDocHandler: AddDocumentationHandler;

  constructor(server: Server, apiClient: ApiClient) {
    super(server, apiClient);
    this.addDocHandler = new AddDocumentationHandler(server, apiClient);
  }

  async handle(_args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    try {
      // Check if queue file exists
      try {
        await fs.access(QUEUE_FILE);
      } catch {
        return {
          content: [
            {
              type: 'text',
              text: 'Queue is empty (queue file does not exist)',
            },
          ],
        };
      }

      let processedCount = 0;
      let failedCount = 0;
      const failedUrls: string[] = [];

      while (true) {
        // Read current queue
        const content = await fs.readFile(QUEUE_FILE, 'utf-8');
        const urls = content.split('\n').filter(url => url.trim() !== '');

        if (urls.length === 0) {
          break; // Queue is empty
        }

        const currentUrl = urls[0]; // Get first URL

        try {
          // Process the URL using add_documentation handler
          // Pass the callContext along if it exists
          await this.addDocHandler.handle({ url: currentUrl }, callContext);
          processedCount++;
        } catch (error) {
          failedCount++;
          failedUrls.push(currentUrl);
          console.error(`Failed to process URL ${currentUrl}:`, error);
        }

        // Remove the processed URL from queue
        const remainingUrls = urls.slice(1);
        await fs.writeFile(QUEUE_FILE, remainingUrls.join('\n') + (remainingUrls.length > 0 ? '\n' : ''));
      }

      let resultText = `Queue processing complete.\nProcessed: ${processedCount} URLs\nFailed: ${failedCount} URLs`;
      if (failedUrls.length > 0) {
        resultText += `\n\nFailed URLs:\n${failedUrls.join('\n')}`;
      }

      return {
        content: [
          {
            type: 'text',
            text: resultText,
          },
        ],
      };
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Failed to process queue: ${error}`,
          },
        ],
        isError: true,
      };
    }
  }
}
--------------------------------------------------------------------------------

/src/handlers/extract-urls.ts:
--------------------------------------------------------------------------------
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
import { BaseHandler } from './base-handler.js';
import { McpToolResponse } from '../types.js';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import { fileURLToPath } from 'url';

// Get current directory in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const QUEUE_FILE = path.join(__dirname, '..', '..', 'queue.txt');

export class ExtractUrlsHandler extends BaseHandler {
  async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> {
    if (!args.url || typeof args.url !== 'string') {
      throw new McpError(ErrorCode.InvalidParams, 'URL is required');
    }

    await this.apiClient.initBrowser();
    const page = await this.apiClient.browser.newPage();

    try {
      const baseUrl = new URL(args.url);
      const basePath = baseUrl.pathname.split('/').slice(0, 3).join('/'); // Get the base path (e.g., /3/ for Python docs)

      await page.goto(args.url, { waitUntil: 'networkidle' });
      const content = await page.content();
      const $ = cheerio.load(content);
      const urls = new Set<string>();

      $('a[href]').each((_, element) => {
        const href = $(element).attr('href');
        if (href) {
          try {
            const url = new URL(href, args.url);
            // Only include URLs from the same documentation section
            if (url.hostname === baseUrl.hostname &&
                url.pathname.startsWith(basePath) &&
                !url.hash &&
                !url.href.endsWith('#')) {
              urls.add(url.href);
            }
          } catch (e) {
            // Ignore invalid URLs
          }
        }
      });

      const urlArray = Array.from(urls);

      if (args.add_to_queue) {
        try {
          // Ensure queue file exists
          try {
            await fs.access(QUEUE_FILE);
          } catch {
            await fs.writeFile(QUEUE_FILE, '');
          }

          // Append URLs to queue
          const urlsToAdd = urlArray.join('\n') + (urlArray.length > 0 ? '\n' : '');
          await fs.appendFile(QUEUE_FILE, urlsToAdd);

          return {
            content: [
              {
                type: 'text',
                text: `Successfully added ${urlArray.length} URLs to the queue`,
              },
            ],
          };
        } catch (error) {
          return {
            content: [
              {
                type: 'text',
                text: `Failed to add URLs to queue: ${error}`,
              },
            ],
            isError: true,
          };
        }
      }

      return {
        content: [
          {
            type: 'text',
            text: urlArray.join('\n') || 'No URLs found on this page.',
          },
        ],
      };
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Failed to extract URLs: ${error}`,
          },
        ],
        isError: true,
      };
    } finally {
      await page.close();
    }
  }
}
--------------------------------------------------------------------------------
process.stderr.write(args.map(arg => String(arg)).join(' ') + '\n'); 69 | }; 70 | console.warn = (...args) => { 71 | process.stderr.write(args.map(arg => String(arg)).join(' ') + '\n'); 72 | }; 73 | console.error = (...args) => { 74 | process.stderr.write(args.map(arg => String(arg)).join(' ') + '\n'); 75 | }; 76 | 77 | // Initialize Qdrant collection 78 | console.log("Initializing Qdrant collection..."); 79 | await this.apiClient.initCollection(COLLECTION_NAME); 80 | console.log("Qdrant collection initialized successfully"); 81 | 82 | // Start web interface 83 | await this.webInterface.start(); 84 | console.log("Web interface is running"); 85 | 86 | // Load repositories from configuration 87 | console.log("Loading repositories from configuration..."); 88 | await this.repoConfigLoader.loadRepositories(); 89 | 90 | // Start MCP server 91 | const transport = new StdioServerTransport(); 92 | await this.server.connect(transport); 93 | console.log("RAG Docs MCP server running on stdio"); 94 | } catch (error) { 95 | process.stderr.write(`Failed to initialize server: ${error}\n`); 96 | process.exit(1); 97 | } 98 | } 99 | } 100 | 101 | const server = new RagDocsServer(); 102 | server.run().catch((error) => { 103 | process.stderr.write(`Fatal error: ${error}\n`); 104 | process.exit(1); 105 | }); 106 | -------------------------------------------------------------------------------- /src/tools/run-queue.ts: -------------------------------------------------------------------------------- 1 | import { BaseTool } from './base-tool.js'; 2 | import { ToolDefinition, McpToolResponse } from '../types.js'; 3 | import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js'; 4 | import fs from 'fs/promises'; 5 | import path from 'path'; 6 | import { fileURLToPath } from 'url'; 7 | import { ApiClient } from '../api-client.js'; 8 | import { AddDocumentationHandler } from '../handlers/add-documentation.js'; 9 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 10 | 11 | // Get current directory in ES modules 12 | const __filename = fileURLToPath(import.meta.url); 13 | const __dirname = path.dirname(__filename); 14 | const QUEUE_FILE = path.join(__dirname, '..', '..', 'queue.txt'); 15 | 16 | export class RunQueueTool extends BaseTool { 17 | private apiClient: ApiClient; 18 | private addDocHandler: AddDocumentationHandler; 19 | 20 | constructor(apiClient: ApiClient) { 21 | super(); 22 | this.apiClient = apiClient; 23 | // Create a temporary server instance just for the handler 24 | const tempServer = new Server( 25 | { name: 'temp', version: '0.0.0' }, 26 | { capabilities: { tools: {} } } 27 | ); 28 | this.addDocHandler = new AddDocumentationHandler(tempServer, apiClient); 29 | } 30 | 31 | get definition(): ToolDefinition { 32 | return { 33 | name: 'run_queue', 34 | description: 'Process URLs from the queue one at a time until complete', 35 | inputSchema: { 36 | type: 'object', 37 | properties: {}, 38 | required: [], 39 | }, 40 | }; 41 | } 42 | 43 | async execute(_args: any): Promise { 44 | try { 45 | // Check if queue file exists 46 | try { 47 | await fs.access(QUEUE_FILE); 48 | } catch { 49 | return { 50 | content: [ 51 | { 52 | type: 'text', 53 | text: 'Queue is empty (queue file does not exist)', 54 | }, 55 | ], 56 | }; 57 | } 58 | 59 | let processedCount = 0; 60 | let failedCount = 0; 61 | const failedUrls: string[] = []; 62 | 63 | while (true) { 64 | // Read current queue 65 | const content = await fs.readFile(QUEUE_FILE, 'utf-8'); 66 | const urls = content.split('\n').filter(url 
=> url.trim() !== ''); 67 | 68 | if (urls.length === 0) { 69 | break; // Queue is empty 70 | } 71 | 72 | const currentUrl = urls[0]; // Get first URL 73 | 74 | try { 75 | // Process the URL using the handler 76 | await this.addDocHandler.handle({ url: currentUrl }); 77 | processedCount++; 78 | } catch (error) { 79 | failedCount++; 80 | failedUrls.push(currentUrl); 81 | console.error(`Failed to process URL ${currentUrl}:`, error); 82 | } 83 | 84 | // Remove the processed URL from queue 85 | const remainingUrls = urls.slice(1); 86 | await fs.writeFile(QUEUE_FILE, remainingUrls.join('\n') + (remainingUrls.length > 0 ? '\n' : '')); 87 | } 88 | 89 | let resultText = `Queue processing complete.\nProcessed: ${processedCount} URLs\nFailed: ${failedCount} URLs`; 90 | if (failedUrls.length > 0) { 91 | resultText += `\n\nFailed URLs:\n${failedUrls.join('\n')}`; 92 | } 93 | 94 | return { 95 | content: [ 96 | { 97 | type: 'text', 98 | text: resultText, 99 | }, 100 | ], 101 | }; 102 | } catch (error) { 103 | return { 104 | content: [ 105 | { 106 | type: 'text', 107 | text: `Failed to process queue: ${error}`, 108 | }, 109 | ], 110 | isError: true, 111 | }; 112 | } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/tools/extract-urls.ts: -------------------------------------------------------------------------------- 1 | import { ErrorCode, McpError } from "@modelcontextprotocol/sdk/types.js"; 2 | import * as cheerio from "cheerio"; 3 | import fs from "fs/promises"; 4 | import path from "path"; 5 | import { fileURLToPath } from "url"; 6 | import { ApiClient } from "../api-client.js"; 7 | import { McpToolResponse, ToolDefinition } from "../types.js"; 8 | import { BaseTool } from "./base-tool.js"; 9 | 10 | // Get current directory in ES modules 11 | const __filename = fileURLToPath(import.meta.url); 12 | const __dirname = path.dirname(__filename); 13 | const QUEUE_FILE = path.join(__dirname, "..", "..", "queue.txt"); 14 | 15 | export class ExtractUrlsTool extends BaseTool { 16 | private apiClient: ApiClient; 17 | 18 | constructor(apiClient: ApiClient) { 19 | super(); 20 | this.apiClient = apiClient; 21 | } 22 | 23 | get definition(): ToolDefinition { 24 | return { 25 | name: "extract_urls", 26 | description: "Extract all URLs from a given web page", 27 | inputSchema: { 28 | type: "object", 29 | properties: { 30 | url: { 31 | type: "string", 32 | description: "URL of the page to extract URLs from", 33 | }, 34 | add_to_queue: { 35 | type: "boolean", 36 | description: 37 | "If true, automatically add extracted URLs to the queue", 38 | default: false, 39 | }, 40 | }, 41 | required: ["url"], 42 | }, 43 | }; 44 | } 45 | 46 | async execute(args: any): Promise { 47 | if (!args.url || typeof args.url !== "string") { 48 | throw new McpError(ErrorCode.InvalidParams, "URL is required"); 49 | } 50 | 51 | await this.apiClient.initBrowser(); 52 | const page = await this.apiClient.browser.newPage(); 53 | 54 | try { 55 | await page.goto(args.url, { waitUntil: "networkidle" }); 56 | const content = await page.content(); 57 | const $ = cheerio.load(content); 58 | const urls = new Set(); 59 | 60 | $("a[href]").each((_, element) => { 61 | const href = $(element).attr("href"); 62 | if (href) { 63 | try { 64 | const url = new URL(href, args.url); 65 | // Only include URLs from the same domain to avoid external links 66 | if ( 67 | url.origin === new URL(args.url).origin && 68 | !url.hash && 69 | !url.href.endsWith("#") 70 | ) { 71 | urls.add(url.href); 72 | } 73 | } catch 
(e) { 74 | // Ignore invalid URLs 75 | } 76 | } 77 | }); 78 | 79 | const urlArray = Array.from(urls); 80 | 81 | if (args.add_to_queue) { 82 | try { 83 | // Ensure queue file exists 84 | try { 85 | await fs.access(QUEUE_FILE); 86 | } catch { 87 | await fs.writeFile(QUEUE_FILE, ""); 88 | } 89 | 90 | // Append URLs to queue 91 | const urlsToAdd = 92 | urlArray.join("\n") + (urlArray.length > 0 ? "\n" : ""); 93 | await fs.appendFile(QUEUE_FILE, urlsToAdd); 94 | 95 | return { 96 | content: [ 97 | { 98 | type: "text", 99 | text: `Successfully added ${urlArray.length} URLs to the queue`, 100 | }, 101 | ], 102 | }; 103 | } catch (error) { 104 | return { 105 | content: [ 106 | { 107 | type: "text", 108 | text: `Failed to add URLs to queue: ${error}`, 109 | }, 110 | ], 111 | isError: true, 112 | }; 113 | } 114 | } 115 | 116 | return { 117 | content: [ 118 | { 119 | type: "text", 120 | text: urlArray.join("\n") || "No URLs found on this page.", 121 | }, 122 | ], 123 | }; 124 | } catch (error) { 125 | return { 126 | content: [ 127 | { 128 | type: "text", 129 | text: `Failed to extract URLs: ${error}`, 130 | }, 131 | ], 132 | isError: true, 133 | }; 134 | } finally { 135 | await page.close(); 136 | } 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/api-client.ts: -------------------------------------------------------------------------------- 1 | import { ErrorCode, McpError } from "@modelcontextprotocol/sdk/types.js"; 2 | import { QdrantClient } from "@qdrant/js-client-rest"; 3 | import { chromium } from "playwright"; 4 | import { EmbeddingService } from "./services/embeddings.js"; 5 | 6 | // Environment variables for configuration 7 | const EMBEDDING_PROVIDER = process.env.EMBEDDING_PROVIDER || 'ollama'; 8 | const EMBEDDING_MODEL = process.env.EMBEDDING_MODEL; 9 | const OPENAI_API_KEY = process.env.OPENAI_API_KEY; 10 | const FALLBACK_PROVIDER = process.env.FALLBACK_PROVIDER; 11 | const FALLBACK_MODEL = process.env.FALLBACK_MODEL; 12 | const QDRANT_URL = process.env.QDRANT_URL || 'http://localhost:6333'; 13 | const QDRANT_API_KEY = process.env.QDRANT_API_KEY; 14 | 15 | if (!QDRANT_URL) { 16 | throw new Error( 17 | "QDRANT_URL environment variable is required for cloud storage" 18 | ); 19 | } 20 | 21 | if ((EMBEDDING_PROVIDER === 'openai' || FALLBACK_PROVIDER === 'openai') && !OPENAI_API_KEY) { 22 | throw new Error( 23 | "OPENAI_API_KEY environment variable is required when using OpenAI as either primary or fallback provider" 24 | ); 25 | } 26 | 27 | if (EMBEDDING_PROVIDER === 'ollama') { 28 | console.warn('Using Ollama as primary provider. Make sure Ollama is running locally.'); 29 | } 30 | 31 | export class ApiClient { 32 | qdrantClient: QdrantClient; 33 | embeddingService: EmbeddingService; 34 | browser: any; 35 | vectorSize: number; 36 | 37 | constructor() { 38 | // Initialize Qdrant client with cloud configuration 39 | this.qdrantClient = new QdrantClient({ 40 | url: QDRANT_URL, 41 | apiKey: QDRANT_API_KEY, 42 | }); 43 | 44 | // Initialize embedding service with configured provider 45 | this.embeddingService = EmbeddingService.createFromConfig({ 46 | provider: EMBEDDING_PROVIDER as 'ollama' | 'openai', 47 | apiKey: EMBEDDING_PROVIDER === 'openai' ? OPENAI_API_KEY : undefined, 48 | model: EMBEDDING_MODEL, 49 | fallbackProvider: FALLBACK_PROVIDER as 'ollama' | 'openai' | undefined, 50 | fallbackApiKey: FALLBACK_PROVIDER === 'openai' ? 
OPENAI_API_KEY : undefined, 51 | fallbackModel: FALLBACK_MODEL 52 | }); 53 | 54 | this.vectorSize = this.embeddingService.getVectorSize(); 55 | } 56 | 57 | async initBrowser() { 58 | if (!this.browser) { 59 | this.browser = await chromium.launch(); 60 | } 61 | } 62 | 63 | async cleanup() { 64 | if (this.browser) { 65 | await this.browser.close(); 66 | } 67 | } 68 | 69 | async getEmbeddings(text: string): Promise { 70 | try { 71 | return await this.embeddingService.generateEmbeddings(text); 72 | } catch (error) { 73 | if (error instanceof McpError) { 74 | throw error; 75 | } 76 | throw new McpError( 77 | ErrorCode.InternalError, 78 | `Failed to generate embeddings: ${error}` 79 | ); 80 | } 81 | } 82 | 83 | async initCollection(COLLECTION_NAME: string) { 84 | try { 85 | const collections = await this.qdrantClient.getCollections(); 86 | const exists = collections.collections.some( 87 | (c) => c.name === COLLECTION_NAME 88 | ); 89 | 90 | if (!exists) { 91 | await this.qdrantClient.createCollection(COLLECTION_NAME, { 92 | vectors: { 93 | size: this.vectorSize, // Dynamic size based on provider 94 | distance: "Cosine", 95 | }, 96 | optimizers_config: { 97 | default_segment_number: 2, 98 | memmap_threshold: 20000, 99 | }, 100 | replication_factor: 2, 101 | }); 102 | } 103 | } catch (error) { 104 | if (error instanceof Error) { 105 | if (error.message.includes("unauthorized")) { 106 | throw new McpError( 107 | ErrorCode.InvalidRequest, 108 | "Failed to authenticate with Qdrant cloud. Please check your API key." 109 | ); 110 | } else if ( 111 | error.message.includes("ECONNREFUSED") || 112 | error.message.includes("ETIMEDOUT") 113 | ) { 114 | throw new McpError( 115 | ErrorCode.InternalError, 116 | "Failed to connect to Qdrant cloud. Please check your QDRANT_URL." 
117 | ); 118 | } 119 | } 120 | throw new McpError( 121 | ErrorCode.InternalError, 122 | `Failed to initialize Qdrant cloud collection: ${error}` 123 | ); 124 | } 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/handlers/get-indexing-status.ts: -------------------------------------------------------------------------------- 1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 2 | import { BaseHandler } from './base-handler.js'; 3 | import { McpToolResponse } from '../types.js'; 4 | import { IndexingStatusManager } from '../utils/indexing-status-manager.js'; 5 | 6 | export class GetIndexingStatusHandler extends BaseHandler { 7 | private statusManager: IndexingStatusManager; 8 | 9 | constructor(server: any, apiClient: any) { 10 | super(server, apiClient); 11 | this.statusManager = new IndexingStatusManager(); 12 | } 13 | 14 | async handle(args: any): Promise { 15 | // If name is provided, get status for specific repository 16 | if (args.name && typeof args.name === 'string') { 17 | const status = await this.statusManager.getStatus(args.name); 18 | 19 | if (!status) { 20 | return { 21 | content: [ 22 | { 23 | type: 'text', 24 | text: `No indexing status found for repository: ${args.name}`, 25 | }, 26 | ], 27 | }; 28 | } 29 | 30 | // Format the status information 31 | const formattedStatus = this.formatStatus(status); 32 | 33 | return { 34 | content: [ 35 | { 36 | type: 'text', 37 | text: formattedStatus, 38 | }, 39 | ], 40 | }; 41 | } 42 | // Otherwise, get all statuses 43 | else { 44 | const allStatuses = await this.statusManager.getAllStatuses(); 45 | 46 | if (allStatuses.length === 0) { 47 | return { 48 | content: [ 49 | { 50 | type: 'text', 51 | text: 'No repository indexing operations found.', 52 | }, 53 | ], 54 | }; 55 | } 56 | 57 | // Format all statuses 58 | const formattedStatuses = allStatuses.map(status => this.formatStatus(status)).join('\n\n---\n\n'); 59 | 60 | return { 61 | content: [ 62 | { 63 | type: 'text', 64 | text: formattedStatuses, 65 | }, 66 | ], 67 | }; 68 | } 69 | } 70 | 71 | private formatStatus(status: any): string { 72 | const startTime = new Date(status.startTime).toLocaleString(); 73 | const endTime = status.endTime ? new Date(status.endTime).toLocaleString() : 'In progress'; 74 | const duration = status.endTime 75 | ? 
this.formatDuration(new Date(status.endTime).getTime() - new Date(status.startTime).getTime()) 76 | : this.formatDuration(Date.now() - new Date(status.startTime).getTime()); 77 | 78 | let statusText = ''; 79 | 80 | switch (status.status) { 81 | case 'pending': 82 | statusText = '⏳ Pending'; 83 | break; 84 | case 'processing': 85 | statusText = '🔄 Processing'; 86 | break; 87 | case 'completed': 88 | statusText = '✅ Completed'; 89 | break; 90 | case 'failed': 91 | statusText = '❌ Failed'; 92 | break; 93 | default: 94 | statusText = status.status; 95 | } 96 | 97 | let result = `Repository: ${status.repositoryName}\n`; 98 | result += `Status: ${statusText}\n`; 99 | result += `Progress: ${status.percentageComplete || 0}%\n`; 100 | result += `Started: ${startTime}\n`; 101 | 102 | if (status.status === 'completed' || status.status === 'failed') { 103 | result += `Ended: ${endTime}\n`; 104 | } 105 | 106 | result += `Duration: ${duration}\n`; 107 | 108 | if (status.totalFiles !== undefined) { 109 | result += `Files: ${status.processedFiles || 0} processed, ${status.skippedFiles || 0} skipped (of ${status.totalFiles})\n`; 110 | } 111 | 112 | if (status.totalChunks !== undefined) { 113 | result += `Chunks: ${status.indexedChunks || 0} indexed (of ${status.totalChunks})\n`; 114 | } 115 | 116 | if (status.currentBatch !== undefined && status.totalBatches !== undefined) { 117 | result += `Batch: ${status.currentBatch} of ${status.totalBatches}\n`; 118 | } 119 | 120 | if (status.error) { 121 | result += `Error: ${status.error}\n`; 122 | } 123 | 124 | return result; 125 | } 126 | 127 | private formatDuration(ms: number): string { 128 | const seconds = Math.floor(ms / 1000); 129 | const minutes = Math.floor(seconds / 60); 130 | const hours = Math.floor(minutes / 60); 131 | 132 | if (hours > 0) { 133 | return `${hours}h ${minutes % 60}m ${seconds % 60}s`; 134 | } else if (minutes > 0) { 135 | return `${minutes}m ${seconds % 60}s`; 136 | } else { 137 | return `${seconds}s`; 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /KNOWLEDGE.md: -------------------------------------------------------------------------------- 1 | # Knowledge Base for MCP RAG Docs 2 | 3 | ## Architecture 4 | 5 | ### Handler Registry 6 | The system uses a handler registry pattern to manage tools. The key components are: 7 | 8 | 1. **HandlerRegistry Class** (`src/handler-registry.ts`): 9 | - Manages all tool handlers 10 | - Registers handlers with the MCP server 11 | - Defines tool schemas and descriptions 12 | 13 | 2. **Handler Registration Process**: 14 | - Handlers are set up in the `setupHandlers` method 15 | - Tools are exposed to clients via the `ListToolsRequestSchema` handler 16 | - **Important**: Tools must be included in both places to be available to clients 17 | 18 | 3. **Tool Definition Structure**: 19 | ```typescript 20 | { 21 | name: 'tool_name', 22 | description: 'Tool description...', 23 | inputSchema: { 24 | type: 'object', 25 | properties: { 26 | // Tool parameters 27 | }, 28 | required: ['param1', 'param2'] 29 | } 30 | } as ToolDefinition 31 | ``` 32 | 33 | ## Tools 34 | 35 | ### Documentation Management Tools 36 | 37 | 1. **add_documentation**: 38 | - Directly adds documentation from a URL 39 | - Processes content immediately 40 | - Chunks text and creates embeddings 41 | - Stores in Qdrant vector database 42 | - Required parameter: `url` 43 | 44 | 2. 
**Queue-Based Processing**: 45 | - `extract_urls`: Extracts URLs from a page 46 | - `list_queue`: Shows pending URLs 47 | - `run_queue`: Processes all queued URLs 48 | - `clear_queue`: Empties the queue 49 | 50 | ## Client Integration 51 | 52 | ### Claude Desktop Configuration 53 | Claude Desktop requires explicit configuration to recognize tools: 54 | 55 | 1. **Tool Registration**: Tools must be properly registered in the server code 56 | 2. **Auto-Approval**: Tools must be listed in the `autoApprove` array in the configuration 57 | 3. **Configuration File**: Located at `claude_desktop_config.json` 58 | 59 | ### Common Issues 60 | - Tools registered as handlers but not included in the `ListToolsRequestSchema` response won't appear in clients 61 | - Changes to tool definitions require server restart 62 | - Client applications may cache tool listings, requiring restart 63 | 64 | ## Troubleshooting 65 | 66 | ### Missing Tools 67 | If tools are missing from client applications: 68 | 1. Check the tool is registered in `setupHandlers` 69 | 2. Verify the tool is included in the `tools` array in the `ListToolsRequestSchema` handler 70 | 3. Ensure the client configuration includes the tool in any approval lists 71 | 4. Restart both server and client applications 72 | 73 | ### Server Logs 74 | Server logs provide valuable debugging information. MCP servers typically redirect `console.log`, `console.info`, `console.error`, etc., to `stderr` to avoid interfering with the JSON-RPC communication over `stdout`. When troubleshooting or monitoring: 75 | - Check `stderr` output for logs from handlers (e.g., progress during long operations like repository indexing). 76 | - Logs can reveal tool registration issues. 77 | - Client connection details are often logged. 78 | - Request/response patterns can be observed. 79 | - For long-running tools like `add_repository` and `update_repository`: 80 | - Detailed progress logs are sent to `stderr` to act as a server-side heartbeat. 81 | - MCP `$/progress` notifications are sent to the client to prevent request timeouts and provide client-side progress updates. 
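A minimal sketch of that heartbeat-plus-notification pattern, mirroring the `(this.server as any).sendProgress(...)` calls in `update-repository.ts` (the exact `sendProgress` signature is an assumption, not a documented SDK API):

```typescript
// Sketch only: progress reporting for a long-running handler.
// `server.sendProgress` mirrors the (this.server as any).sendProgress(...)
// calls in update-repository.ts; treat the signature as an assumption.
function reportProgress(
  server: any,
  progressToken: string | number | undefined,
  repoName: string,
  done: number,
  total: number
): void {
  // Server-side heartbeat on stderr
  console.error(`[${repoName}] Indexed ${done}/${total} chunks`);
  // Client-side progress notification, only when the client supplied a token
  if (progressToken !== undefined) {
    server.sendProgress(progressToken, {
      message: `Indexed ${done}/${total} chunks`,
      percentageComplete: total > 0 ? Math.round((done / total) * 100) : 100,
    });
  }
}
```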
82 | - **Timeout Issue Solution**: The timeout issue with large repositories has been addressed by implementing asynchronous processing: 83 | - Repository indexing now runs in the background after initial setup 84 | - The MCP request returns quickly with a success message, preventing timeout 85 | - A new `get_indexing_status` tool allows checking the progress of ongoing indexing operations 86 | - Batch size reduced from 100 to 50 chunks per batch for more frequent progress updates 87 | - Status tracking implemented via the `IndexingStatusManager` class 88 | - Detailed status information includes progress percentage, file counts, and timing data 89 | 90 | - **Implementation Details**: 91 | - Added `IndexingStatus` type to track indexing progress 92 | - Created `IndexingStatusManager` class to manage status persistence 93 | - Modified `LocalRepositoryHandler` to use asynchronous processing 94 | - Added `processRepositoryAsync` method that runs in the background 95 | - Created `GetIndexingStatusHandler` for checking indexing status 96 | - Updated documentation to reflect the new asynchronous approach 97 | 98 | - **Additional Improvements**: 99 | - More robust error handling in batch processing 100 | - Better progress reporting with detailed status information 101 | - Status persistence across server restarts 102 | - Ability to monitor multiple concurrent indexing operations 103 | -------------------------------------------------------------------------------- /src/services/embeddings.ts: -------------------------------------------------------------------------------- 1 | import ollama from 'ollama'; 2 | import OpenAI from 'openai'; 3 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 4 | 5 | export interface EmbeddingProvider { 6 | generateEmbeddings(text: string): Promise; 7 | getVectorSize(): number; 8 | } 9 | 10 | export class OllamaProvider implements EmbeddingProvider { 11 | private model: string; 12 | 13 | constructor(model: string = 'nomic-embed-text') { 14 | this.model = model; 15 | } 16 | 17 | async generateEmbeddings(text: string): Promise { 18 | try { 19 | console.error('Generating Ollama embeddings for text:', text.substring(0, 50) + '...'); 20 | const response = await ollama.embeddings({ 21 | model: this.model, 22 | prompt: text 23 | }); 24 | console.error('Successfully generated Ollama embeddings with size:', response.embedding.length); 25 | return response.embedding; 26 | } catch (error) { 27 | console.error('Ollama embedding error:', error); 28 | throw new McpError( 29 | ErrorCode.InternalError, 30 | `Failed to generate embeddings with Ollama: ${error}` 31 | ); 32 | } 33 | } 34 | 35 | getVectorSize(): number { 36 | // nomic-embed-text produces 768-dimensional vectors 37 | return 768; 38 | } 39 | } 40 | 41 | export class OpenAIProvider implements EmbeddingProvider { 42 | private client: OpenAI; 43 | private model: string; 44 | 45 | constructor(apiKey: string, model: string = 'text-embedding-3-small') { 46 | this.client = new OpenAI({ apiKey }); 47 | this.model = model; 48 | } 49 | 50 | async generateEmbeddings(text: string): Promise { 51 | try { 52 | console.error('Generating OpenAI embeddings for text:', text.substring(0, 50) + '...'); 53 | const response = await this.client.embeddings.create({ 54 | model: this.model, 55 | input: text, 56 | }); 57 | const embedding = response.data[0].embedding; 58 | console.error('Successfully generated OpenAI embeddings with size:', embedding.length); 59 | return embedding; 60 | } catch (error) { 61 | 
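// Log the raw OpenAI failure before wrapping it in an McpError; if a
// fallback provider is configured, EmbeddingService.generateEmbeddings
// (below) catches this and retries with the fallback.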
console.error('OpenAI embedding error:', error); 62 | throw new McpError( 63 | ErrorCode.InternalError, 64 | `Failed to generate embeddings with OpenAI: ${error}` 65 | ); 66 | } 67 | } 68 | 69 | getVectorSize(): number { 70 | // text-embedding-3-small produces 1536-dimensional vectors 71 | return 1536; 72 | } 73 | } 74 | 75 | export class EmbeddingService { 76 | private provider: EmbeddingProvider; 77 | private fallbackProvider?: EmbeddingProvider; 78 | 79 | constructor(provider: EmbeddingProvider, fallbackProvider?: EmbeddingProvider) { 80 | this.provider = provider; 81 | this.fallbackProvider = fallbackProvider; 82 | } 83 | 84 | async generateEmbeddings(text: string): Promise { 85 | try { 86 | return await this.provider.generateEmbeddings(text); 87 | } catch (error) { 88 | if (this.fallbackProvider) { 89 | console.error('Primary provider failed, trying fallback provider...'); 90 | return this.fallbackProvider.generateEmbeddings(text); 91 | } 92 | throw error; 93 | } 94 | } 95 | 96 | getVectorSize(): number { 97 | return this.provider.getVectorSize(); 98 | } 99 | 100 | static createFromConfig(config: { 101 | provider: 'ollama' | 'openai'; 102 | apiKey?: string; 103 | model?: string; 104 | fallbackProvider?: 'ollama' | 'openai'; 105 | fallbackApiKey?: string; 106 | fallbackModel?: string; 107 | }): EmbeddingService { 108 | const primaryProvider = EmbeddingService.createProvider( 109 | config.provider, 110 | config.apiKey, 111 | config.model 112 | ); 113 | 114 | let fallbackProvider: EmbeddingProvider | undefined; 115 | if (config.fallbackProvider) { 116 | fallbackProvider = EmbeddingService.createProvider( 117 | config.fallbackProvider, 118 | config.fallbackApiKey, 119 | config.fallbackModel 120 | ); 121 | } 122 | 123 | return new EmbeddingService(primaryProvider, fallbackProvider); 124 | } 125 | 126 | private static createProvider( 127 | provider: 'ollama' | 'openai', 128 | apiKey?: string, 129 | model?: string 130 | ): EmbeddingProvider { 131 | switch (provider) { 132 | case 'ollama': 133 | return new OllamaProvider(model); 134 | case 'openai': 135 | if (!apiKey) { 136 | throw new McpError( 137 | ErrorCode.InvalidParams, 138 | 'OpenAI API key is required' 139 | ); 140 | } 141 | return new OpenAIProvider(apiKey, model); 142 | default: 143 | throw new McpError( 144 | ErrorCode.InvalidParams, 145 | `Unknown embedding provider: ${provider}` 146 | ); 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /src/handlers/add-documentation.ts: -------------------------------------------------------------------------------- 1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 2 | import { BaseHandler } from './base-handler.js'; 3 | import { DocumentChunk, McpToolResponse } from '../types.js'; 4 | import * as cheerio from 'cheerio'; 5 | import crypto from 'crypto'; 6 | 7 | const COLLECTION_NAME = 'documentation'; 8 | 9 | export class AddDocumentationHandler extends BaseHandler { 10 | async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise { 11 | if (!args.url || typeof args.url !== 'string') { 12 | throw new McpError(ErrorCode.InvalidParams, 'URL is required'); 13 | } 14 | 15 | try { 16 | const chunks = await this.fetchAndProcessUrl(args.url); 17 | 18 | // Batch process chunks for better performance 19 | const batchSize = 100; 20 | for (let i = 0; i < chunks.length; i += batchSize) { 21 | const batch = chunks.slice(i, i + batchSize); 22 | const points = await 
Promise.all( 23 | batch.map(async (chunk) => { 24 | const embedding = await this.apiClient.getEmbeddings(chunk.text); 25 | return { 26 | id: this.generatePointId(), 27 | vector: embedding, 28 | payload: { 29 | ...chunk, 30 | _type: 'DocumentChunk' as const, 31 | } as Record, 32 | }; 33 | }) 34 | ); 35 | 36 | try { 37 | await this.apiClient.qdrantClient.upsert(COLLECTION_NAME, { 38 | wait: true, 39 | points, 40 | }); 41 | } catch (error) { 42 | if (error instanceof Error) { 43 | if (error.message.includes('unauthorized')) { 44 | throw new McpError( 45 | ErrorCode.InvalidRequest, 46 | 'Failed to authenticate with Qdrant cloud while adding documents' 47 | ); 48 | } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) { 49 | throw new McpError( 50 | ErrorCode.InternalError, 51 | 'Connection to Qdrant cloud failed while adding documents' 52 | ); 53 | } 54 | } 55 | throw error; 56 | } 57 | } 58 | 59 | return { 60 | content: [ 61 | { 62 | type: 'text', 63 | text: `Successfully added documentation from ${args.url} (${chunks.length} chunks processed in ${Math.ceil(chunks.length / batchSize)} batches)`, 64 | }, 65 | ], 66 | }; 67 | } catch (error) { 68 | if (error instanceof McpError) { 69 | throw error; 70 | } 71 | return { 72 | content: [ 73 | { 74 | type: 'text', 75 | text: `Failed to add documentation: ${error}`, 76 | }, 77 | ], 78 | isError: true, 79 | }; 80 | } 81 | } 82 | 83 | private async fetchAndProcessUrl(url: string): Promise { 84 | await this.apiClient.initBrowser(); 85 | const page = await this.apiClient.browser.newPage(); 86 | 87 | try { 88 | await page.goto(url, { waitUntil: 'networkidle' }); 89 | const content = await page.content(); 90 | const $ = cheerio.load(content); 91 | 92 | // Remove script tags, style tags, and comments 93 | $('script').remove(); 94 | $('style').remove(); 95 | $('noscript').remove(); 96 | 97 | // Extract main content 98 | const title = $('title').text() || url; 99 | const mainContent = $('main, article, .content, .documentation, body').text(); 100 | 101 | // Split content into chunks 102 | const chunks = this.chunkText(mainContent, 1000); 103 | 104 | return chunks.map(chunk => ({ 105 | text: chunk, 106 | url, 107 | title, 108 | timestamp: new Date().toISOString(), 109 | })); 110 | } catch (error) { 111 | throw new McpError( 112 | ErrorCode.InternalError, 113 | `Failed to fetch URL ${url}: ${error}` 114 | ); 115 | } finally { 116 | await page.close(); 117 | } 118 | } 119 | 120 | private chunkText(text: string, maxChunkSize: number): string[] { 121 | const words = text.split(/\s+/); 122 | const chunks: string[] = []; 123 | let currentChunk: string[] = []; 124 | 125 | for (const word of words) { 126 | currentChunk.push(word); 127 | const currentLength = currentChunk.join(' ').length; 128 | 129 | if (currentLength >= maxChunkSize) { 130 | chunks.push(currentChunk.join(' ')); 131 | currentChunk = []; 132 | } 133 | } 134 | 135 | if (currentChunk.length > 0) { 136 | chunks.push(currentChunk.join(' ')); 137 | } 138 | 139 | return chunks; 140 | } 141 | 142 | private generatePointId(): string { 143 | return crypto.randomBytes(16).toString('hex'); 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/handlers/list-sources.ts: -------------------------------------------------------------------------------- 1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 2 | import { BaseHandler } from './base-handler.js'; 3 | import { McpToolResponse, isDocumentPayload 
} from '../types.js';
4 | 
5 | const COLLECTION_NAME = 'documentation';
6 | 
7 | interface Source {
8 |   title: string;
9 |   url: string;
10 | }
11 | 
12 | interface GroupedSources {
13 |   [domain: string]: {
14 |     [subdomain: string]: Source[];
15 |   };
16 | }
17 | 
18 | export class ListSourcesHandler extends BaseHandler {
19 |   private groupSourcesByDomainAndSubdomain(sources: Source[]): GroupedSources {
20 |     const grouped: GroupedSources = {};
21 | 
22 |     for (const source of sources) {
23 |       try {
24 |         const url = new URL(source.url);
25 |         const domain = url.hostname;
26 |         const pathParts = url.pathname.split('/').filter(p => p);
27 |         const subdomain = pathParts[0] || '/';
28 | 
29 |         if (!grouped[domain]) {
30 |           grouped[domain] = {};
31 |         }
32 |         if (!grouped[domain][subdomain]) {
33 |           grouped[domain][subdomain] = [];
34 |         }
35 |         grouped[domain][subdomain].push(source);
36 |       } catch (error) {
37 |         console.error(`Invalid URL: ${source.url}`);
38 |       }
39 |     }
40 | 
41 |     return grouped;
42 |   }
43 | 
44 |   private formatGroupedSources(grouped: GroupedSources): string {
45 |     const output: string[] = [];
46 |     let domainCounter = 1;
47 | 
48 |     for (const [domain, subdomains] of Object.entries(grouped)) {
49 |       output.push(`${domainCounter}. ${domain}`);
50 | 
51 |       // Deduplicate sources across subdomains, keyed by URL
52 |       const uniqueSources = new Map<string, Source>();
53 |       for (const sources of Object.values(subdomains)) {
54 |         for (const source of sources) {
55 |           uniqueSources.set(source.url, source);
56 |         }
57 |       }
58 | 
59 |       // Convert to array and sort
60 |       const sortedSources = Array.from(uniqueSources.values())
61 |         .sort((a, b) => a.title.localeCompare(b.title));
62 | 
63 |       // Number each entry within its domain (1.1, 1.2, ...)
64 |       sortedSources.forEach((source, index) => {
65 |         output.push(`${domainCounter}.${index + 1}. 
${source.title} (${source.url})`); 66 | }); 67 | 68 | output.push(''); // Add blank line between domains 69 | domainCounter++; 70 | } 71 | 72 | return output.join('\n'); 73 | } 74 | 75 | async handle(args?: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise { 76 | try { 77 | await this.apiClient.initCollection(COLLECTION_NAME); 78 | 79 | const pageSize = 100; 80 | let offset = null; 81 | const sources: Source[] = []; 82 | 83 | while (true) { 84 | const scroll = await this.apiClient.qdrantClient.scroll(COLLECTION_NAME, { 85 | with_payload: true, 86 | with_vector: false, 87 | limit: pageSize, 88 | offset, 89 | }); 90 | 91 | if (scroll.points.length === 0) break; 92 | 93 | for (const point of scroll.points) { 94 | if (point.payload && typeof point.payload === 'object' && 'url' in point.payload && 'title' in point.payload) { 95 | const payload = point.payload as any; 96 | sources.push({ 97 | title: payload.title, 98 | url: payload.url 99 | }); 100 | } 101 | } 102 | 103 | if (scroll.points.length < pageSize) break; 104 | offset = scroll.points[scroll.points.length - 1].id; 105 | } 106 | 107 | if (sources.length === 0) { 108 | return { 109 | content: [ 110 | { 111 | type: 'text', 112 | text: 'No documentation sources found.', 113 | }, 114 | ], 115 | }; 116 | } 117 | 118 | const grouped = this.groupSourcesByDomainAndSubdomain(sources); 119 | const formattedOutput = this.formatGroupedSources(grouped); 120 | 121 | return { 122 | content: [ 123 | { 124 | type: 'text', 125 | text: formattedOutput, 126 | }, 127 | ], 128 | }; 129 | } catch (error) { 130 | if (error instanceof Error) { 131 | if (error.message.includes('unauthorized')) { 132 | throw new McpError( 133 | ErrorCode.InvalidRequest, 134 | 'Failed to authenticate with Qdrant cloud while listing sources' 135 | ); 136 | } else if (error.message.includes('ECONNREFUSED') || error.message.includes('ETIMEDOUT')) { 137 | throw new McpError( 138 | ErrorCode.InternalError, 139 | 'Connection to Qdrant cloud failed while listing sources' 140 | ); 141 | } 142 | } 143 | return { 144 | content: [ 145 | { 146 | type: 'text', 147 | text: `Failed to list sources: ${error}`, 148 | }, 149 | ], 150 | isError: true, 151 | }; 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /src/handlers/watch-repository.ts: -------------------------------------------------------------------------------- 1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 2 | import { BaseHandler } from './base-handler.js'; 3 | import { McpToolResponse, RepositoryConfig } from '../types.js'; 4 | import fs from 'fs/promises'; 5 | import path from 'path'; 6 | import { RepositoryWatcher } from '../utils/repository-watcher.js'; 7 | import { UpdateRepositoryHandler } from './update-repository.js'; 8 | import { RepositoryConfigLoader } from '../utils/repository-config-loader.js'; 9 | 10 | const REPO_CONFIG_DIR = path.join(process.cwd(), 'repo-configs'); 11 | 12 | // Map to store active watchers 13 | const activeWatchers = new Map(); 14 | 15 | export class WatchRepositoryHandler extends BaseHandler { 16 | private updateHandler: UpdateRepositoryHandler; 17 | 18 | constructor(server: any, apiClient: any) { 19 | super(server, apiClient); 20 | this.updateHandler = new UpdateRepositoryHandler(server, apiClient); 21 | } 22 | 23 | async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise { 24 | if (!args.name || typeof args.name 
!== 'string') { 25 | throw new McpError(ErrorCode.InvalidParams, 'Repository name is required'); 26 | } 27 | 28 | if (args.action !== 'start' && args.action !== 'stop') { 29 | throw new McpError(ErrorCode.InvalidParams, 'Action must be either "start" or "stop"'); 30 | } 31 | 32 | const repoName = args.name; 33 | const configPath = path.join(REPO_CONFIG_DIR, `${repoName}.json`); 34 | 35 | try { 36 | // Check if the repository config exists 37 | try { 38 | await fs.access(configPath); 39 | } catch { 40 | throw new McpError(ErrorCode.InvalidParams, `Repository not found: ${repoName}`); 41 | } 42 | 43 | // Read the config 44 | const configContent = await fs.readFile(configPath, 'utf-8'); 45 | const config = JSON.parse(configContent) as RepositoryConfig; 46 | 47 | if (args.action === 'start') { 48 | // Check if already watching 49 | if (activeWatchers.has(repoName)) { 50 | return { 51 | content: [ 52 | { 53 | type: 'text', 54 | text: `Repository ${repoName} is already being watched`, 55 | }, 56 | ], 57 | }; 58 | } 59 | 60 | // Create a new watcher 61 | const watcher = new RepositoryWatcher( 62 | config, 63 | async (changedFiles, removedFiles) => { 64 | console.log(`Repository ${repoName} changed: ${changedFiles.length} files changed, ${removedFiles.length} files removed`); 65 | 66 | // Update the repository index 67 | if (changedFiles.length > 0 || removedFiles.length > 0) { 68 | try { 69 | // Pass the callContext along if it exists 70 | await this.updateHandler.handle({ name: repoName }, callContext); 71 | console.log(`Repository ${repoName} index updated successfully`); 72 | } catch (error) { 73 | console.error(`Failed to update repository ${repoName} index:`, error); 74 | } 75 | } 76 | } 77 | ); 78 | 79 | // Start watching 80 | await watcher.start(); 81 | activeWatchers.set(repoName, watcher); 82 | 83 | // Update the config to reflect watch mode 84 | config.watchMode = true; 85 | await fs.writeFile(configPath, JSON.stringify(config, null, 2), 'utf-8'); 86 | 87 | // Update the repositories.json configuration file 88 | const configLoader = new RepositoryConfigLoader(this.server, this.apiClient); 89 | await configLoader.addRepositoryToConfig(config); 90 | 91 | return { 92 | content: [ 93 | { 94 | type: 'text', 95 | text: `Started watching repository: ${repoName} (${config.path})`, 96 | }, 97 | ], 98 | }; 99 | } else { 100 | // Stop watching 101 | const watcher = activeWatchers.get(repoName); 102 | if (!watcher) { 103 | return { 104 | content: [ 105 | { 106 | type: 'text', 107 | text: `Repository ${repoName} is not currently being watched`, 108 | }, 109 | ], 110 | }; 111 | } 112 | 113 | watcher.stop(); 114 | activeWatchers.delete(repoName); 115 | 116 | // Update the config to reflect watch mode 117 | config.watchMode = false; 118 | await fs.writeFile(configPath, JSON.stringify(config, null, 2), 'utf-8'); 119 | 120 | // Update the repositories.json configuration file 121 | const configLoader = new RepositoryConfigLoader(this.server, this.apiClient); 122 | await configLoader.addRepositoryToConfig(config); 123 | 124 | return { 125 | content: [ 126 | { 127 | type: 'text', 128 | text: `Stopped watching repository: ${repoName} (${config.path})`, 129 | }, 130 | ], 131 | }; 132 | } 133 | } catch (error) { 134 | if (error instanceof McpError) { 135 | throw error; 136 | } 137 | return { 138 | content: [ 139 | { 140 | type: 'text', 141 | text: `Failed to ${args.action} watching repository: ${error}`, 142 | }, 143 | ], 144 | isError: true, 145 | }; 146 | } 147 | } 148 | } 149 | 
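A minimal usage sketch for the handler above, mirroring how `RepositoryConfigLoader` (later in this repo) starts watch mode; typing `server` and `apiClient` as `any` is a simplification for illustration:

```typescript
import { WatchRepositoryHandler } from './watch-repository.js';

// Start watching a previously indexed repository, then stop it again.
async function toggleWatch(server: any, apiClient: any, repoName: string): Promise<void> {
  const handler = new WatchRepositoryHandler(server, apiClient);

  const started = await handler.handle({ name: repoName, action: 'start' });
  console.error(started.content[0].text); // "Started watching repository: ..."

  // ...later, e.g. on shutdown:
  const stopped = await handler.handle({ name: repoName, action: 'stop' });
  console.error(stopped.content[0].text); // "Stopped watching repository: ..."
}
```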
-------------------------------------------------------------------------------- /src/utils/repository-watcher.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path from 'path'; 3 | import { glob } from 'glob'; 4 | import crypto from 'crypto'; 5 | import { RepositoryConfig } from '../types.js'; 6 | 7 | interface FileState { 8 | path: string; 9 | hash: string; 10 | lastModified: number; 11 | } 12 | 13 | export class RepositoryWatcher { 14 | private config: RepositoryConfig; 15 | private fileStates: Map = new Map(); 16 | private watchInterval: NodeJS.Timeout | null = null; 17 | private onFileChanged: (changedFiles: string[], removedFiles: string[]) => Promise; 18 | 19 | constructor( 20 | config: RepositoryConfig, 21 | onFileChanged: (changedFiles: string[], removedFiles: string[]) => Promise 22 | ) { 23 | this.config = config; 24 | this.onFileChanged = onFileChanged; 25 | } 26 | 27 | /** 28 | * Start watching the repository for changes 29 | */ 30 | async start(): Promise { 31 | // Initialize the file states 32 | await this.initializeFileStates(); 33 | 34 | // Start the watch interval 35 | this.watchInterval = setInterval( 36 | () => this.checkForChanges(), 37 | this.config.watchInterval 38 | ); 39 | 40 | console.log(`Started watching repository: ${this.config.name} (${this.config.path})`); 41 | } 42 | 43 | /** 44 | * Stop watching the repository 45 | */ 46 | stop(): void { 47 | if (this.watchInterval) { 48 | clearInterval(this.watchInterval); 49 | this.watchInterval = null; 50 | console.log(`Stopped watching repository: ${this.config.name}`); 51 | } 52 | } 53 | 54 | /** 55 | * Initialize the file states by scanning the repository 56 | */ 57 | private async initializeFileStates(): Promise { 58 | const files = await glob(this.config.include, { 59 | cwd: this.config.path, 60 | ignore: this.config.exclude, 61 | absolute: true, 62 | nodir: true, 63 | }); 64 | 65 | for (const file of files) { 66 | try { 67 | const stats = fs.statSync(file); 68 | const content = fs.readFileSync(file, 'utf-8'); 69 | const hash = this.hashContent(content); 70 | 71 | this.fileStates.set(file, { 72 | path: file, 73 | hash, 74 | lastModified: stats.mtimeMs, 75 | }); 76 | } catch (error) { 77 | console.error(`Error initializing file state for ${file}:`, error); 78 | } 79 | } 80 | 81 | console.log(`Initialized file states for ${this.fileStates.size} files in repository: ${this.config.name}`); 82 | } 83 | 84 | /** 85 | * Check for changes in the repository 86 | */ 87 | private async checkForChanges(): Promise { 88 | try { 89 | const currentFiles = await glob(this.config.include, { 90 | cwd: this.config.path, 91 | ignore: this.config.exclude, 92 | absolute: true, 93 | nodir: true, 94 | }); 95 | 96 | const currentFilePaths = new Set(currentFiles); 97 | const previousFilePaths = new Set(this.fileStates.keys()); 98 | 99 | // Find added or modified files 100 | const changedFiles: string[] = []; 101 | for (const file of currentFiles) { 102 | try { 103 | const stats = fs.statSync(file); 104 | const previousState = this.fileStates.get(file); 105 | 106 | // If the file is new or the modification time has changed 107 | if (!previousState || previousState.lastModified !== stats.mtimeMs) { 108 | const content = fs.readFileSync(file, 'utf-8'); 109 | const hash = this.hashContent(content); 110 | 111 | // If the file is new or the content has changed 112 | if (!previousState || previousState.hash !== hash) { 113 | changedFiles.push(file); 114 | 115 | // Update the 
file state 116 | this.fileStates.set(file, { 117 | path: file, 118 | hash, 119 | lastModified: stats.mtimeMs, 120 | }); 121 | } else if (previousState) { 122 | // Update just the modification time if only that changed 123 | this.fileStates.set(file, { 124 | ...previousState, 125 | lastModified: stats.mtimeMs, 126 | }); 127 | } 128 | } 129 | } catch (error) { 130 | console.error(`Error checking file ${file}:`, error); 131 | } 132 | } 133 | 134 | // Find removed files 135 | const removedFiles: string[] = []; 136 | for (const file of previousFilePaths) { 137 | if (!currentFilePaths.has(file)) { 138 | removedFiles.push(file); 139 | this.fileStates.delete(file); 140 | } 141 | } 142 | 143 | // If there are changes, notify the callback 144 | if (changedFiles.length > 0 || removedFiles.length > 0) { 145 | console.log(`Detected changes in repository ${this.config.name}:`); 146 | if (changedFiles.length > 0) { 147 | console.log(`- Changed files: ${changedFiles.length}`); 148 | } 149 | if (removedFiles.length > 0) { 150 | console.log(`- Removed files: ${removedFiles.length}`); 151 | } 152 | 153 | await this.onFileChanged(changedFiles, removedFiles); 154 | } 155 | } catch (error) { 156 | console.error(`Error checking for changes in repository ${this.config.name}:`, error); 157 | } 158 | } 159 | 160 | /** 161 | * Generate a hash of the file content 162 | */ 163 | private hashContent(content: string): string { 164 | return crypto.createHash('md5').update(content).digest('hex'); 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/utils/indexing-status-manager.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs/promises'; 2 | import path from 'path'; 3 | import { fileURLToPath } from 'url'; 4 | import { IndexingStatus } from '../types.js'; 5 | 6 | const __dirname = path.dirname(fileURLToPath(import.meta.url)); 7 | const STATUS_DIR = path.join(__dirname, '..', 'indexing-status'); 8 | const STATUS_FILE_PREFIX = 'status-'; 9 | 10 | /** 11 | * Manages the status of repository indexing operations 12 | */ 13 | export class IndexingStatusManager { 14 | /** 15 | * Initialize the status manager 16 | */ 17 | constructor() { 18 | this.ensureStatusDirectory(); 19 | } 20 | 21 | /** 22 | * Create a new indexing status entry 23 | */ 24 | async createStatus(repositoryName: string): Promise { 25 | await this.ensureStatusDirectory(); 26 | 27 | const status: IndexingStatus = { 28 | repositoryName, 29 | status: 'pending', 30 | startTime: new Date().toISOString(), 31 | lastUpdated: new Date().toISOString() 32 | }; 33 | 34 | await this.saveStatus(status); 35 | return status; 36 | } 37 | 38 | /** 39 | * Update an existing indexing status 40 | */ 41 | async updateStatus(status: Partial & { repositoryName: string }): Promise { 42 | const currentStatus = await this.getStatus(status.repositoryName); 43 | 44 | if (!currentStatus) { 45 | throw new Error(`No status found for repository: ${status.repositoryName}`); 46 | } 47 | 48 | const updatedStatus: IndexingStatus = { 49 | ...currentStatus, 50 | ...status, 51 | lastUpdated: new Date().toISOString() 52 | }; 53 | 54 | await this.saveStatus(updatedStatus); 55 | return updatedStatus; 56 | } 57 | 58 | /** 59 | * Get the current status for a repository 60 | */ 61 | async getStatus(repositoryName: string): Promise { 62 | try { 63 | const filePath = this.getStatusFilePath(repositoryName); 64 | const content = await fs.readFile(filePath, 'utf-8'); 65 | return JSON.parse(content) as 
IndexingStatus; 66 | } catch (error) { 67 | // If file doesn't exist, return null 68 | if ((error as NodeJS.ErrnoException).code === 'ENOENT') { 69 | return null; 70 | } 71 | throw error; 72 | } 73 | } 74 | 75 | /** 76 | * Get all indexing statuses 77 | */ 78 | async getAllStatuses(): Promise { 79 | await this.ensureStatusDirectory(); 80 | 81 | try { 82 | const files = await fs.readdir(STATUS_DIR); 83 | const statusFiles = files.filter(file => file.startsWith(STATUS_FILE_PREFIX)); 84 | 85 | const statuses: IndexingStatus[] = []; 86 | for (const file of statusFiles) { 87 | try { 88 | const content = await fs.readFile(path.join(STATUS_DIR, file), 'utf-8'); 89 | statuses.push(JSON.parse(content) as IndexingStatus); 90 | } catch (error) { 91 | console.error(`Error reading status file ${file}:`, error); 92 | } 93 | } 94 | 95 | return statuses; 96 | } catch (error) { 97 | console.error('Error reading status directory:', error); 98 | return []; 99 | } 100 | } 101 | 102 | /** 103 | * Delete a status entry 104 | */ 105 | async deleteStatus(repositoryName: string): Promise { 106 | try { 107 | const filePath = this.getStatusFilePath(repositoryName); 108 | await fs.unlink(filePath); 109 | } catch (error) { 110 | // Ignore if file doesn't exist 111 | if ((error as NodeJS.ErrnoException).code !== 'ENOENT') { 112 | throw error; 113 | } 114 | } 115 | } 116 | 117 | /** 118 | * Complete an indexing operation 119 | */ 120 | async completeStatus( 121 | repositoryName: string, 122 | success: boolean, 123 | stats?: { 124 | processedFiles: number, 125 | skippedFiles: number, 126 | totalChunks: number, 127 | indexedChunks: number 128 | }, 129 | error?: string 130 | ): Promise { 131 | const status = await this.getStatus(repositoryName); 132 | 133 | if (!status) { 134 | throw new Error(`No status found for repository: ${repositoryName}`); 135 | } 136 | 137 | const updatedStatus: IndexingStatus = { 138 | ...status, 139 | status: success ? 'completed' : 'failed', 140 | endTime: new Date().toISOString(), 141 | lastUpdated: new Date().toISOString(), 142 | percentageComplete: success ? 
100 : status.percentageComplete,
143 |       error: error || status.error
144 |     };
145 | 
146 |     if (stats) {
147 |       updatedStatus.processedFiles = stats.processedFiles;
148 |       updatedStatus.skippedFiles = stats.skippedFiles;
149 |       updatedStatus.totalChunks = stats.totalChunks;
150 |       updatedStatus.indexedChunks = stats.indexedChunks;
151 |     }
152 | 
153 |     await this.saveStatus(updatedStatus);
154 |     return updatedStatus;
155 |   }
156 | 
157 |   /**
158 |    * Save status to file
159 |    */
160 |   private async saveStatus(status: IndexingStatus): Promise<void> {
161 |     await this.ensureStatusDirectory();
162 |     const filePath = this.getStatusFilePath(status.repositoryName);
163 |     await fs.writeFile(filePath, JSON.stringify(status, null, 2), 'utf-8');
164 |   }
165 | 
166 |   /**
167 |    * Get the file path for a status file
168 |    */
169 |   private getStatusFilePath(repositoryName: string): string {
170 |     return path.join(STATUS_DIR, `${STATUS_FILE_PREFIX}${repositoryName}.json`);
171 |   }
172 | 
173 |   /**
174 |    * Ensure the status directory exists
175 |    */
176 |   private async ensureStatusDirectory(): Promise<void> {
177 |     try {
178 |       await fs.mkdir(STATUS_DIR, { recursive: true });
179 |     } catch (error) {
180 |       console.error('Error creating status directory:', error);
181 |       throw error;
182 |     }
183 |   }
184 | }
185 | 
-------------------------------------------------------------------------------- /src/public/index.html: --------------------------------------------------------------------------------
[Markup lost during extraction; only stray line numbers and text nodes survived. Recoverable structure: an HTML page titled "MCP RAG Docs" with a header and four panels: "Add Documentation" (URL input plus submit button), "Processing Queue" (queue controls with a "Loading queue..." placeholder), "Search Documentation" (query input plus search button), and "Available Documents" (a "Loading documents..." list with "Page 1 of 1" pagination controls). An inline style block (original lines 10-23) and an inline script (original lines 120-167) that wires the panels to the server were also stripped.]
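The page's inline script did not survive extraction. A hypothetical sketch of the kind of polling logic such a script would contain; the `/api/queue` endpoint and the `queue` element id are illustrative assumptions, not names taken from `server.ts`:

```typescript
// Hypothetical reconstruction of the page's polling logic (endpoint and
// element id are assumed; the real ones live in src/server.ts and the
// stripped markup).
async function refreshQueue(): Promise<void> {
  const res = await fetch('/api/queue');
  const urls: string[] = await res.json();
  const panel = document.getElementById('queue');
  if (panel) {
    panel.textContent = urls.length > 0 ? urls.join('\n') : 'Queue is empty';
  }
}

// Poll while the page is open.
setInterval(refreshQueue, 5000);
void refreshQueue();
```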
-------------------------------------------------------------------------------- /src/utils/repository-config-loader.ts: --------------------------------------------------------------------------------
1 | import fs from 'fs/promises';
2 | import path from 'path';
3 | import { fileURLToPath } from 'url';
4 | import { RepositoryConfig } from '../types.js';
5 | import { Server } from '@modelcontextprotocol/sdk/server/index.js';
6 | import { ApiClient } from '../api-client.js';
7 | import { UpdateRepositoryHandler } from '../handlers/update-repository.js';
8 | import { LocalRepositoryHandler } from '../handlers/local-repository.js';
9 | import { WatchRepositoryHandler } from '../handlers/watch-repository.js';
10 | 
11 | const __dirname = path.dirname(fileURLToPath(import.meta.url));
12 | const CONFIG_FILE_PATH = path.join(__dirname, '..', '..', 'repositories.json');
13 | const REPO_CONFIG_DIR = path.join(__dirname, '..', '..', 'repo-configs');
14 | 
15 | /**
16 |  * Interface for the repositories configuration file
17 |  */
18 | interface RepositoriesConfig {
19 |   repositories: RepositoryConfig[];
20 |   autoWatch: boolean;
21 | }
22 | 
23 | /**
24 |  * Class for loading and managing repository configurations from a JSON file
25 |  */
26 | export class RepositoryConfigLoader {
27 |   private server: Server;
28 |   private apiClient: ApiClient;
29 |   private updateHandler: UpdateRepositoryHandler;
30 |   private addHandler: LocalRepositoryHandler;
31 |   private watchHandler: WatchRepositoryHandler;
32 | 
33 |   constructor(server: Server, apiClient: ApiClient) {
34 |     this.server = server;
35 |     this.apiClient = apiClient;
36 |     this.updateHandler = new UpdateRepositoryHandler(server, apiClient);
37 |     this.addHandler = new LocalRepositoryHandler(server, apiClient);
38 |     this.watchHandler = new WatchRepositoryHandler(server, apiClient);
39 |   }
40 | 
41 |   /**
42 |    * Load repositories from the configuration file and initialize them
43 |    */
44 |   async loadRepositories(): Promise<void> {
45 |     try {
46 |       // Check if the config file exists
47 |       try {
48 |         await fs.access(CONFIG_FILE_PATH);
49 |       } catch {
50 |         console.log('No repositories.json configuration file found. 
Creating default configuration...'); 51 | await this.createDefaultConfig(); 52 | return; 53 | } 54 | 55 | // Read the config file 56 | const configContent = await fs.readFile(CONFIG_FILE_PATH, 'utf-8'); 57 | const config = JSON.parse(configContent) as RepositoriesConfig; 58 | 59 | // Ensure the repo-configs directory exists 60 | await fs.mkdir(REPO_CONFIG_DIR, { recursive: true }); 61 | 62 | // Process each repository in the config 63 | console.log(`Loading ${config.repositories.length} repositories from configuration...`); 64 | 65 | for (const repoConfig of config.repositories) { 66 | try { 67 | // Check if the repository path exists 68 | try { 69 | const stats = await fs.stat(repoConfig.path); 70 | if (!stats.isDirectory()) { 71 | console.error(`Repository path is not a directory: ${repoConfig.path}`); 72 | continue; 73 | } 74 | } catch { 75 | console.error(`Repository path does not exist: ${repoConfig.path}`); 76 | continue; 77 | } 78 | 79 | // Check if the repository is already indexed 80 | const configPath = path.join(REPO_CONFIG_DIR, `${repoConfig.name}.json`); 81 | let isUpdate = false; 82 | 83 | try { 84 | await fs.access(configPath); 85 | isUpdate = true; 86 | } catch { 87 | // Repository doesn't exist yet, will be added 88 | } 89 | 90 | if (isUpdate) { 91 | // Update existing repository 92 | console.log(`Updating repository: ${repoConfig.name}`); 93 | await this.updateHandler.handle(repoConfig); 94 | } else { 95 | // Add new repository 96 | console.log(`Adding repository: ${repoConfig.name}`); 97 | await this.addHandler.handle(repoConfig); 98 | } 99 | 100 | // Start watching if configured 101 | if (config.autoWatch && repoConfig.watchMode) { 102 | console.log(`Starting watch for repository: ${repoConfig.name}`); 103 | await this.watchHandler.handle({ 104 | name: repoConfig.name, 105 | action: 'start' 106 | }); 107 | } 108 | } catch (error) { 109 | console.error(`Error processing repository ${repoConfig.name}:`, error); 110 | } 111 | } 112 | 113 | console.log('Repositories loaded successfully from configuration'); 114 | } catch (error) { 115 | console.error('Error loading repositories from configuration:', error); 116 | } 117 | } 118 | 119 | /** 120 | * Create a default configuration file if none exists 121 | */ 122 | private async createDefaultConfig(): Promise { 123 | const defaultConfig: RepositoriesConfig = { 124 | repositories: [], 125 | autoWatch: true 126 | }; 127 | 128 | try { 129 | await fs.writeFile(CONFIG_FILE_PATH, JSON.stringify(defaultConfig, null, 2), 'utf-8'); 130 | console.log(`Created default repositories configuration at ${CONFIG_FILE_PATH}`); 131 | } catch (error) { 132 | console.error('Error creating default configuration:', error); 133 | } 134 | } 135 | 136 | /** 137 | * Update the configuration file with the current state of repositories 138 | */ 139 | async updateConfigFile(): Promise { 140 | try { 141 | // Get all repository config files 142 | const configFiles = await fs.readdir(REPO_CONFIG_DIR); 143 | const jsonFiles = configFiles.filter(file => file.endsWith('.json')); 144 | 145 | // Load each repository config 146 | const repositories: RepositoryConfig[] = []; 147 | for (const file of jsonFiles) { 148 | try { 149 | const configPath = path.join(REPO_CONFIG_DIR, file); 150 | const configContent = await fs.readFile(configPath, 'utf-8'); 151 | const config = JSON.parse(configContent) as RepositoryConfig; 152 | repositories.push(config); 153 | } catch (error) { 154 | console.error(`Error loading repository config ${file}:`, error); 155 | } 156 | } 157 | 158 
| // Check if the config file exists 159 | let existingConfig: RepositoriesConfig = { repositories: [], autoWatch: true }; 160 | try { 161 | const configContent = await fs.readFile(CONFIG_FILE_PATH, 'utf-8'); 162 | existingConfig = JSON.parse(configContent) as RepositoriesConfig; 163 | } catch { 164 | // Config file doesn't exist yet, will use default 165 | } 166 | 167 | // Update the config file 168 | const updatedConfig: RepositoriesConfig = { 169 | repositories, 170 | autoWatch: existingConfig.autoWatch 171 | }; 172 | 173 | await fs.writeFile(CONFIG_FILE_PATH, JSON.stringify(updatedConfig, null, 2), 'utf-8'); 174 | console.log(`Updated repositories configuration at ${CONFIG_FILE_PATH}`); 175 | } catch (error) { 176 | console.error('Error updating configuration file:', error); 177 | } 178 | } 179 | 180 | /** 181 | * Add a repository to the configuration file 182 | */ 183 | async addRepositoryToConfig(config: RepositoryConfig): Promise { 184 | try { 185 | // Check if the config file exists 186 | let existingConfig: RepositoriesConfig = { repositories: [], autoWatch: true }; 187 | try { 188 | const configContent = await fs.readFile(CONFIG_FILE_PATH, 'utf-8'); 189 | existingConfig = JSON.parse(configContent) as RepositoriesConfig; 190 | } catch { 191 | // Config file doesn't exist yet, will use default 192 | } 193 | 194 | // Check if the repository already exists 195 | const existingIndex = existingConfig.repositories.findIndex(repo => repo.name === config.name); 196 | if (existingIndex >= 0) { 197 | // Update existing repository 198 | existingConfig.repositories[existingIndex] = config; 199 | } else { 200 | // Add new repository 201 | existingConfig.repositories.push(config); 202 | } 203 | 204 | // Update the config file 205 | await fs.writeFile(CONFIG_FILE_PATH, JSON.stringify(existingConfig, null, 2), 'utf-8'); 206 | console.log(`Added repository ${config.name} to configuration`); 207 | } catch (error) { 208 | console.error(`Error adding repository ${config.name} to configuration:`, error); 209 | } 210 | } 211 | 212 | /** 213 | * Remove a repository from the configuration file 214 | */ 215 | async removeRepositoryFromConfig(name: string): Promise { 216 | try { 217 | // Check if the config file exists 218 | try { 219 | await fs.access(CONFIG_FILE_PATH); 220 | } catch { 221 | console.log('No repositories.json configuration file found.'); 222 | return; 223 | } 224 | 225 | // Read the config file 226 | const configContent = await fs.readFile(CONFIG_FILE_PATH, 'utf-8'); 227 | const config = JSON.parse(configContent) as RepositoriesConfig; 228 | 229 | // Remove the repository 230 | const initialLength = config.repositories.length; 231 | config.repositories = config.repositories.filter(repo => repo.name !== name); 232 | 233 | if (config.repositories.length === initialLength) { 234 | console.log(`Repository ${name} not found in configuration`); 235 | return; 236 | } 237 | 238 | // Update the config file 239 | await fs.writeFile(CONFIG_FILE_PATH, JSON.stringify(config, null, 2), 'utf-8'); 240 | console.log(`Removed repository ${name} from configuration`); 241 | } catch (error) { 242 | console.error(`Error removing repository ${name} from configuration:`, error); 243 | } 244 | } 245 | } 246 | -------------------------------------------------------------------------------- /src/handlers/update-repository.ts: -------------------------------------------------------------------------------- 1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 2 | import { BaseHandler } from 
'./base-handler.js'; 3 | import { DocumentChunk, McpToolResponse, RepositoryConfig } from '../types.js'; 4 | import fs from 'fs/promises'; 5 | import path from 'path'; 6 | import { glob } from 'glob'; 7 | import crypto from 'crypto'; 8 | import { detectLanguage } from '../utils/language-detection.js'; 9 | import { RepositoryConfigLoader } from '../utils/repository-config-loader.js'; 10 | 11 | const COLLECTION_NAME = 'documentation'; 12 | const REPO_CONFIG_DIR = path.join(process.cwd(), 'repo-configs'); 13 | 14 | export class UpdateRepositoryHandler extends BaseHandler { 15 | private activeProgressToken: string | number | undefined; 16 | 17 | async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> { 18 | this.activeProgressToken = callContext?.progressToken || callContext?.requestId; 19 | 20 | if (!args.name || typeof args.name !== 'string') { 21 | throw new McpError(ErrorCode.InvalidParams, 'Repository name is required'); 22 | } 23 | 24 | const repoName = args.name; 25 | const configPath = path.join(REPO_CONFIG_DIR, `${repoName}.json`); 26 | 27 | try { 28 | // Check if the repository config exists 29 | try { 30 | await fs.access(configPath); 31 | } catch { 32 | throw new McpError(ErrorCode.InvalidParams, `Repository not found: ${repoName}`); 33 | } 34 | 35 | // Read the config 36 | const configContent = await fs.readFile(configPath, 'utf-8'); 37 | let config = JSON.parse(configContent) as RepositoryConfig; 38 | 39 | // Update config with any provided parameters 40 | if (args.include) config.include = args.include; 41 | if (args.exclude) config.exclude = args.exclude; 42 | if (args.watchMode !== undefined) config.watchMode = args.watchMode; 43 | if (args.watchInterval) config.watchInterval = args.watchInterval; 44 | if (args.chunkSize) config.chunkSize = args.chunkSize; 45 | if (args.fileTypeConfig) config.fileTypeConfig = { ...config.fileTypeConfig, ...args.fileTypeConfig }; 46 | 47 | // Check if the repository path exists 48 | try { 49 | const stats = await fs.stat(config.path); 50 | if (!stats.isDirectory()) { 51 | throw new McpError(ErrorCode.InvalidParams, `Path is not a directory: ${config.path}`); 52 | } 53 | } catch (error) { 54 | throw new McpError(ErrorCode.InvalidParams, `Invalid repository path: ${config.path}`); 55 | } 56 | 57 | // Save the updated config 58 | await fs.writeFile(configPath, JSON.stringify(config, null, 2), 'utf-8'); 59 | 60 | // Update the repositories.json configuration file 61 | const configLoader = new RepositoryConfigLoader(this.server, this.apiClient); 62 | await configLoader.addRepositoryToConfig(config); 63 | console.info(`[${config.name}] Repository configuration updated and saved.`); 64 | if (this.activeProgressToken) { 65 | (this.server as any).sendProgress(this.activeProgressToken, { message: "Repository configuration updated." }); 66 | } 67 | 68 | // Process the repository 69 | console.info(`[${config.name}] Starting to re-process repository files...`); 70 | if (this.activeProgressToken) { 71 | (this.server as any).sendProgress(this.activeProgressToken, { message: "Starting to re-process repository files..." }); 72 | } 73 | const { chunks, processedFiles, skippedFiles } = await this.processRepository(config); 74 | console.info(`[${config.name}] Finished re-processing repository files.
Found ${chunks.length} chunks from ${processedFiles} processed files (${skippedFiles} skipped).`); 75 | if (this.activeProgressToken) { 76 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Finished re-processing files. Found ${chunks.length} chunks.`, percentageComplete: 25 }); // 25% for file processing 77 | } 78 | 79 | // Remove existing repository documents from the vector database 80 | console.info(`[${config.name}] Removing existing documents from vector database...`); 81 | if (this.activeProgressToken) { 82 | (this.server as any).sendProgress(this.activeProgressToken, { message: "Removing existing documents...", percentageComplete: 50 }); // 50% after deletion 83 | } 84 | await this.apiClient.qdrantClient.delete(COLLECTION_NAME, { 85 | filter: { 86 | must: [ 87 | { 88 | key: 'repository', 89 | match: { value: repoName } 90 | }, 91 | { 92 | key: 'isRepositoryFile', 93 | match: { value: true } 94 | } 95 | ] 96 | }, 97 | wait: true 98 | }); 99 | 100 | // Batch process chunks for better performance 101 | const batchSize = 100; 102 | let indexedChunks = 0; 103 | const totalChunks = chunks.length; 104 | 105 | console.info(`[${config.name}] Starting to generate embeddings and re-index ${totalChunks} chunks...`); 106 | if (this.activeProgressToken) { 107 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Starting to generate embeddings for ${totalChunks} chunks...`, percentageComplete: 50 }); 108 | } 109 | 110 | for (let i = 0; i < totalChunks; i += batchSize) { 111 | const batchChunks = chunks.slice(i, i + batchSize); 112 | 113 | const embeddingResults = await Promise.allSettled( 114 | batchChunks.map(async (chunk) => { 115 | try { 116 | const embedding = await this.apiClient.getEmbeddings(chunk.text); 117 | return { 118 | id: this.generatePointId(), 119 | vector: embedding, 120 | payload: { 121 | ...chunk, 122 | _type: 'DocumentChunk' as const, 123 | repository: config.name, 124 | isRepositoryFile: true, 125 | } as Record<string, unknown>, 126 | }; 127 | } catch (embeddingError) { 128 | console.error(`[${config.name}] Failed to generate embedding for chunk from ${chunk.filePath || chunk.url} during update: ${embeddingError instanceof Error ? embeddingError.message : String(embeddingError)}`); 129 | throw embeddingError; // Re-throw to be caught by Promise.allSettled 130 | } 131 | }) 132 | ); 133 | 134 | const successfulPoints = embeddingResults 135 | .filter(result => result.status === 'fulfilled') 136 | .map(result => (result as PromiseFulfilledResult<{ id: string; vector: number[]; payload: Record<string, unknown> }>).value); 137 | 138 | const failedEmbeddingsCount = embeddingResults.filter(result => result.status === 'rejected').length; 139 | if (failedEmbeddingsCount > 0) { 140 | console.warn(`[${config.name}] Failed to generate embeddings for ${failedEmbeddingsCount} of ${batchChunks.length} chunks in this batch during update.`); 141 | } 142 | 143 | if (successfulPoints.length > 0) { 144 | try { 145 | await this.apiClient.qdrantClient.upsert(COLLECTION_NAME, { 146 | wait: true, 147 | points: successfulPoints, 148 | }); 149 | indexedChunks += successfulPoints.length; 150 | } catch (upsertError) { 151 | console.error(`[${config.name}] Failed to upsert batch of ${successfulPoints.length} points to Qdrant during update: ${upsertError instanceof Error ?
upsertError.message : String(upsertError)}`); 152 | } 153 | } 154 | 155 | const percentageComplete = 50 + Math.round(((i + batchChunks.length) / totalChunks) * 50); // Remaining 50% for indexing 156 | console.info(`[${config.name}] Re-processed batch ${Math.floor(i / batchSize) + 1} of ${Math.ceil(totalChunks / batchSize)}. Successfully re-indexed in this batch: ${successfulPoints.length}. Total re-indexed so far: ${indexedChunks} chunks.`); 157 | if (this.activeProgressToken) { 158 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Re-processed ${i + batchChunks.length} of ${totalChunks} chunks for embedding/indexing. Successfully re-indexed: ${indexedChunks}.`, percentageComplete }); 159 | } 160 | } 161 | console.info(`[${config.name}] Finished generating embeddings and re-indexing. Total indexed: ${indexedChunks} chunks.`); 162 | if (this.activeProgressToken) { 163 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Finished re-indexing ${indexedChunks} chunks.`, percentageComplete: 100 }); 164 | } 165 | 166 | return { 167 | content: [ 168 | { 169 | type: 'text', 170 | text: `Successfully updated repository: ${config.name} (${config.path})\n` + 171 | `Processed ${processedFiles} files, skipped ${skippedFiles} files\n` + 172 | `Created ${chunks.length} chunks, indexed ${indexedChunks} chunks\n` + 173 | `Watch mode: ${config.watchMode ? 'enabled' : 'disabled'}`, 174 | }, 175 | ], 176 | }; 177 | } catch (error) { 178 | if (error instanceof McpError) { 179 | throw error; 180 | } 181 | return { 182 | content: [ 183 | { 184 | type: 'text', 185 | text: `Failed to update repository: ${error}`, 186 | }, 187 | ], 188 | isError: true, 189 | }; 190 | } 191 | } 192 | 193 | private async processRepository(config: RepositoryConfig): Promise<{ 194 | chunks: DocumentChunk[], 195 | processedFiles: number, 196 | skippedFiles: number 197 | }> { 198 | const chunks: DocumentChunk[] = []; 199 | let processedFiles = 0; 200 | let skippedFiles = 0; 201 | let fileCounter = 0; 202 | 203 | // Get all files matching the include/exclude patterns 204 | const files = await glob(config.include, { 205 | cwd: config.path, 206 | ignore: config.exclude, 207 | absolute: true, 208 | nodir: true, 209 | }); 210 | const totalFiles = files.length; 211 | 212 | console.info(`[${config.name}] Found ${totalFiles} files to re-process based on include/exclude patterns.`); 213 | if (this.activeProgressToken) { 214 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Found ${totalFiles} files to re-process.` }); 215 | } 216 | 217 | 218 | for (const file of files) { 219 | fileCounter++; 220 | try { 221 | const relativePath = path.relative(config.path, file); 222 | const extension = path.extname(file); 223 | const fileTypeConfig = config.fileTypeConfig[extension]; 224 | 225 | // Skip files that should be excluded based on file type config 226 | if (fileTypeConfig && fileTypeConfig.include === false) { 227 | skippedFiles++; 228 | continue; 229 | } 230 | 231 | // Read file content 232 | const content = await fs.readFile(file, 'utf-8'); 233 | 234 | // Skip empty files 235 | if (!content.trim()) { 236 | skippedFiles++; 237 | continue; 238 | } 239 | 240 | // Detect language for better processing 241 | const language = detectLanguage(file, content); 242 | 243 | // Process the file content into chunks 244 | const fileChunks = this.chunkFileContent( 245 | content, 246 | file, 247 | relativePath, 248 | config, 249 | language, 250 | fileTypeConfig?.chunkStrategy || 'line' 251 | ); 252 
| 253 | chunks.push(...fileChunks); 254 | processedFiles++; 255 | if (fileCounter % 50 === 0 && fileCounter > 0 && this.activeProgressToken) { 256 | const percentageComplete = Math.round((fileCounter / totalFiles) * 25); // File processing is ~1/4 of the job here 257 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Re-processed ${fileCounter} of ${totalFiles} files...`, percentageComplete }); 258 | console.info(`[${config.name}] Re-processed ${fileCounter} of ${totalFiles} files... (${processedFiles} successful, ${skippedFiles} skipped/errored)`); 259 | } 260 | } catch (error) { 261 | console.error(`[${config.name}] Error processing file ${file}:`, error); 262 | skippedFiles++; 263 | } 264 | } 265 | console.info(`[${config.name}] Completed file re-iteration. Processed: ${processedFiles}, Skipped/Errored: ${skippedFiles}.`); 266 | 267 | return { chunks, processedFiles, skippedFiles }; 268 | } 269 | 270 | private chunkFileContent( 271 | content: string, 272 | filePath: string, 273 | relativePath: string, 274 | config: RepositoryConfig, 275 | language: string, 276 | chunkStrategy: string 277 | ): DocumentChunk[] { 278 | const chunks: DocumentChunk[] = []; 279 | const timestamp = new Date().toISOString(); 280 | const fileUrl = `file://${filePath}`; 281 | const title = `${config.name}/${relativePath}`; 282 | 283 | // Different chunking strategies based on file type 284 | let textChunks: string[] = []; 285 | 286 | switch (chunkStrategy) { 287 | case 'semantic': 288 | // For semantic chunking, we'd ideally use a more sophisticated approach 289 | // For now, we'll use a simple paragraph-based approach 290 | textChunks = this.chunkByParagraphs(content, config.chunkSize); 291 | break; 292 | case 'line': 293 | // Chunk by lines, respecting max chunk size 294 | textChunks = this.chunkByLines(content, config.chunkSize); 295 | break; 296 | default: 297 | // Default to simple text chunking 298 | textChunks = this.chunkText(content, config.chunkSize); 299 | } 300 | 301 | // Create document chunks with metadata 302 | chunks.push(...textChunks.map((text, index) => ({ 303 | text, 304 | url: fileUrl, 305 | title, 306 | timestamp, 307 | filePath: relativePath, 308 | language, 309 | chunkIndex: index, 310 | totalChunks: textChunks.length, 311 | }))); 312 | 313 | return chunks; 314 | } 315 | 316 | private chunkText(text: string, maxChunkSize: number): string[] { 317 | const words = text.split(/\s+/); 318 | const chunks: string[] = []; 319 | let currentChunk: string[] = []; 320 | 321 | for (const word of words) { 322 | currentChunk.push(word); 323 | const currentLength = currentChunk.join(' ').length; 324 | 325 | if (currentLength >= maxChunkSize) { 326 | chunks.push(currentChunk.join(' ')); 327 | currentChunk = []; 328 | } 329 | } 330 | 331 | if (currentChunk.length > 0) { 332 | chunks.push(currentChunk.join(' ')); 333 | } 334 | 335 | return chunks; 336 | } 337 | 338 | private chunkByLines(text: string, maxChunkSize: number): string[] { 339 | const lines = text.split(/\r?\n/); 340 | const chunks: string[] = []; 341 | let currentChunk: string[] = []; 342 | let currentLength = 0; 343 | 344 | for (const line of lines) { 345 | const lineLength = line.length + 1; // +1 for the newline 346 | 347 | if (currentLength + lineLength > maxChunkSize && currentChunk.length > 0) { 348 | chunks.push(currentChunk.join('\n')); 349 | currentChunk = []; 350 | currentLength = 0; 351 | } 352 | 353 | currentChunk.push(line); 354 | currentLength += lineLength; 355 | } 356 | 357 | if (currentChunk.length > 0) { 
358 | chunks.push(currentChunk.join('\n')); 359 | } 360 | 361 | return chunks; 362 | } 363 | 364 | private chunkByParagraphs(text: string, maxChunkSize: number): string[] { 365 | // Split by double newlines (paragraphs) 366 | const paragraphs = text.split(/\r?\n\r?\n/); 367 | const chunks: string[] = []; 368 | let currentChunk: string[] = []; 369 | let currentLength = 0; 370 | 371 | for (const paragraph of paragraphs) { 372 | const paragraphLength = paragraph.length + 2; // +2 for the double newline 373 | 374 | if (currentLength + paragraphLength > maxChunkSize && currentChunk.length > 0) { 375 | chunks.push(currentChunk.join('\n\n')); 376 | currentChunk = []; 377 | currentLength = 0; 378 | } 379 | 380 | currentChunk.push(paragraph); 381 | currentLength += paragraphLength; 382 | } 383 | 384 | if (currentChunk.length > 0) { 385 | chunks.push(currentChunk.join('\n\n')); 386 | } 387 | 388 | return chunks; 389 | } 390 | 391 | private generatePointId(): string { 392 | return crypto.randomBytes(16).toString('hex'); 393 | } 394 | } 395 | -------------------------------------------------------------------------------- /src/server.ts: -------------------------------------------------------------------------------- 1 | import cors from "cors"; 2 | import express, { Application, NextFunction, Request, Response } from "express"; 3 | import fs from "fs"; 4 | import { dirname, join } from "path"; 5 | import { fileURLToPath } from "url"; 6 | import { ApiClient } from "./api-client.js"; 7 | import { ClearQueueTool } from "./tools/clear-queue.js"; 8 | import { ExtractUrlsTool } from "./tools/extract-urls.js"; 9 | import { ListQueueTool } from "./tools/list-queue.js"; 10 | import { ListSourcesTool } from "./tools/list-sources.js"; 11 | import { RemoveDocumentationTool } from "./tools/remove-documentation.js"; 12 | import { RunQueueTool } from "./tools/run-queue.js"; 13 | import { SearchDocumentationTool } from "./tools/search-documentation.js"; 14 | 15 | const __filename = fileURLToPath(import.meta.url); 16 | const __dirname = dirname(__filename); 17 | const rootDir = join(__dirname, ".."); 18 | 19 | interface ApiError extends Error { 20 | status?: number; 21 | } 22 | 23 | interface SearchResponse { 24 | results: Array<{ 25 | url: string; 26 | title: string; 27 | content: string; 28 | snippet?: string; 29 | }>; 30 | } 31 | 32 | interface ErrorResponse { 33 | error: string; 34 | details?: string; 35 | } 36 | 37 | interface Document { 38 | url: string; 39 | title: string; 40 | timestamp: string; 41 | status: string; 42 | } 43 | 44 | interface QueueItem { 45 | id: number; 46 | url: string; 47 | status: string; 48 | timestamp: string; 49 | } 50 | 51 | import net from 'net'; 52 | 53 | function getAvailablePort(startPort: number): Promise<number> { 54 | return new Promise((resolve, reject) => { 55 | const server = net.createServer(); 56 | server.listen(startPort, () => { 57 | const { port } = server.address() as net.AddressInfo; 58 | server.close(() => resolve(port)); 59 | }); 60 | server.on('error', (err: any) => { 61 | if (err.code === 'EADDRINUSE') { 62 | resolve(getAvailablePort(startPort + 1)); 63 | } else { 64 | reject(err); 65 | } 66 | }); 67 | }); 68 | } 69 | 70 | export class WebInterface { 71 | private app: Application; 72 | private server: any; 73 | private apiClient: ApiClient; 74 | private searchTool: SearchDocumentationTool; 75 | private runQueueTool: RunQueueTool; 76 | private listQueueTool: ListQueueTool; 77 | private listSourcesTool: ListSourcesTool; 78 | private clearQueueTool: ClearQueueTool; 79 |
private removeDocTool: RemoveDocumentationTool; 80 | private extractUrlsTool: ExtractUrlsTool; 81 | private queuePath: string; 82 | 83 | constructor(apiClient: ApiClient) { 84 | this.apiClient = apiClient; 85 | this.app = express(); 86 | this.queuePath = join(rootDir, "queue.txt"); 87 | 88 | // Initialize tools 89 | this.searchTool = new SearchDocumentationTool(apiClient); 90 | this.runQueueTool = new RunQueueTool(apiClient); 91 | this.listQueueTool = new ListQueueTool(); 92 | this.listSourcesTool = new ListSourcesTool(apiClient); 93 | this.clearQueueTool = new ClearQueueTool(); 94 | this.removeDocTool = new RemoveDocumentationTool(apiClient); 95 | this.extractUrlsTool = new ExtractUrlsTool(apiClient); 96 | 97 | // Ensure queue file exists 98 | this.initializeQueueFile(); 99 | 100 | this.setupMiddleware(); 101 | this.setupRoutes(); 102 | } 103 | 104 | private async initializeQueueFile() { 105 | try { 106 | // Check if queue file exists 107 | if (!fs.existsSync(this.queuePath)) { 108 | // Create the file if it doesn't exist 109 | await fs.promises.writeFile(this.queuePath, "", "utf8"); 110 | console.log("Queue file created at:", this.queuePath); 111 | } 112 | } catch (error) { 113 | console.error("Error initializing queue file:", error); 114 | } 115 | } 116 | 117 | private setupMiddleware() { 118 | this.app.use(cors()); 119 | this.app.use(express.json()); 120 | this.app.use(express.static(join(rootDir, "src/public"))); 121 | this.app.get("/", (req: Request, res: Response) => { 122 | res.sendFile(join(rootDir, "src/public/index.html")); 123 | }); 124 | } 125 | 126 | private setupRoutes() { 127 | const errorHandler = ( 128 | err: ApiError, 129 | req: Request, 130 | res: Response, 131 | next: NextFunction 132 | ) => { 133 | console.error("API Error:", err); 134 | const status = err.status || 500; 135 | const response: ErrorResponse = { 136 | error: err.message || "Internal server error", 137 | }; 138 | if (process.env.NODE_ENV === "development" && err.stack) { 139 | response.details = err.stack; 140 | } 141 | res.status(status).json(response); 142 | }; 143 | 144 | // Get all available documents 145 | this.app.get( 146 | "/documents", 147 | async ( 148 | req: Request, 149 | res: Response, 150 | next: NextFunction 151 | ): Promise<void> => { 152 | try { 153 | const response = await this.listSourcesTool.execute({}); 154 | const sourcesText = response.content[0].text; 155 | 156 | if ( 157 | sourcesText === 158 | "No documentation sources found in the cloud collection." 159 | ) { 160 | res.json([]); 161 | return; 162 | } 163 | 164 | const documents = sourcesText 165 | .split("\n") 166 | .map((line) => { 167 | const match = line.match(/(.*?)
\((.*?)\)/); 168 | if (match) { 169 | const [_, title, url] = match; 170 | return { 171 | url, 172 | title, 173 | timestamp: new Date().toISOString(), // Timestamp not available from list-sources 174 | status: "COMPLETED", 175 | }; 176 | } 177 | return null; 178 | }) 179 | .filter(Boolean); 180 | 181 | res.json(documents); 182 | } catch (error) { 183 | next(error); 184 | } 185 | } 186 | ); 187 | 188 | // Get queue status 189 | this.app.get("/queue", async (req: Request, res: Response) => { 190 | try { 191 | // Ensure queue file exists 192 | if (!fs.existsSync(this.queuePath)) { 193 | await this.initializeQueueFile(); 194 | res.json([]); 195 | return; 196 | } 197 | 198 | // Read the queue file directly to get pending items 199 | const queueContent = await fs.promises.readFile(this.queuePath, "utf8"); 200 | console.log("Queue file content:", queueContent); 201 | 202 | const pendingUrls = queueContent 203 | .split("\n") 204 | .filter((line) => line.trim()); 205 | console.log("Pending URLs:", pendingUrls); 206 | 207 | // Get processing status from list-queue tool 208 | const response = await this.listQueueTool.execute({}); 209 | console.log("List queue tool response:", response); 210 | 211 | const queueText = response.content[0].text; 212 | console.log("Queue text from tool:", queueText); 213 | 214 | const processingItems = queueText 215 | .split("\n") 216 | .filter((line) => line.trim()) 217 | .map((line) => { 218 | const [url, status, timestamp] = line.split(" | "); 219 | return { 220 | id: Buffer.from(url).toString("base64"), 221 | url, 222 | status: status || "PROCESSING", 223 | timestamp: timestamp || new Date().toISOString(), 224 | }; 225 | }); 226 | console.log("Processing items:", processingItems); 227 | 228 | // Combine pending and processing items 229 | const queue = [ 230 | // Add pending items that aren't in processing 231 | ...pendingUrls 232 | .filter((url) => !processingItems.some((item) => item.url === url)) 233 | .map((url) => ({ 234 | id: Buffer.from(url).toString("base64"), 235 | url, 236 | status: "PENDING", 237 | timestamp: new Date().toISOString(), 238 | })), 239 | // Add processing items 240 | ...processingItems, 241 | ]; 242 | console.log("Final queue:", queue); 243 | 244 | res.json(queue); 245 | } catch (error) { 246 | console.error("Error getting queue:", error); 247 | res.json([]); 248 | } 249 | }); 250 | 251 | // Add document to queue 252 | this.app.post( 253 | "/add-doc", 254 | async (req: Request, res: Response, next: NextFunction) => { 255 | try { 256 | const { url, urls } = req.body; 257 | 258 | if (!url && (!urls || !Array.isArray(urls))) { 259 | const error: ApiError = new Error( 260 | "URL or array of URLs is required" 261 | ); 262 | error.status = 400; 263 | throw error; 264 | } 265 | 266 | // Ensure queue file exists 267 | if (!fs.existsSync(this.queuePath)) { 268 | await this.initializeQueueFile(); 269 | } 270 | 271 | const urlsToAdd = urls || [url]; 272 | const addedItems: QueueItem[] = []; 273 | 274 | for (const u of urlsToAdd) { 275 | // Add newline only if file is not empty 276 | const fileContent = await fs.promises.readFile( 277 | this.queuePath, 278 | "utf8" 279 | ); 280 | const separator = fileContent.length > 0 ? 
"\n" : ""; 281 | await fs.promises.appendFile(this.queuePath, separator + u); 282 | 283 | addedItems.push({ 284 | id: Date.now(), 285 | url: u, 286 | status: "PENDING", 287 | timestamp: new Date().toISOString(), 288 | }); 289 | } 290 | 291 | // Start processing queue in background 292 | this.runQueueTool.execute({}).catch((error) => { 293 | console.error("Error processing queue:", error); 294 | }); 295 | 296 | res.json(addedItems); 297 | } catch (error) { 298 | next(error); 299 | } 300 | } 301 | ); 302 | 303 | // Search documentation 304 | this.app.post( 305 | "/search", 306 | async ( 307 | req: Request, 308 | res: Response, 309 | next: NextFunction 310 | ): Promise => { 311 | try { 312 | const { query } = req.body; 313 | if (!query) { 314 | const error: ApiError = new Error("Query is required"); 315 | error.status = 400; 316 | throw error; 317 | } 318 | 319 | const searchResponse = await this.searchTool.execute({ query }); 320 | const searchText = searchResponse.content[0].text; 321 | 322 | if (searchText === "No results found matching the query.") { 323 | res.json({ results: [] }); 324 | } 325 | 326 | // Parse the markdown formatted results 327 | const results = searchText 328 | .split("---") 329 | .filter((block) => block.trim()) 330 | .map((block) => { 331 | const titleMatch = block.match(/\[(.*?)\]\((.*?)\)/); 332 | const contentMatch = block.match(/Content: (.*?)(?=\n|$)/s); 333 | 334 | return { 335 | title: titleMatch ? titleMatch[1] : "Unknown", 336 | url: titleMatch ? titleMatch[2] : "", 337 | content: contentMatch ? contentMatch[1] : "", 338 | snippet: contentMatch 339 | ? contentMatch[1].substring(0, 200) + "..." 340 | : undefined, 341 | }; 342 | }); 343 | 344 | const response: SearchResponse = { results }; 345 | res.json(response); 346 | } catch (error) { 347 | next(error); 348 | } 349 | } 350 | ); 351 | 352 | // Clear queue 353 | this.app.post( 354 | "/clear-queue", 355 | async (req: Request, res: Response, next: NextFunction) => { 356 | try { 357 | // Call the clear queue tool 358 | const response = await this.clearQueueTool.execute({}); 359 | 360 | if (response.isError) { 361 | throw new Error(response.content[0].text); 362 | } 363 | 364 | // Also clear any running processes 365 | await this.runQueueTool.execute({ action: "stop" }); 366 | 367 | // Ensure the queue file is empty 368 | await fs.promises.writeFile(this.queuePath, "", "utf8"); 369 | 370 | res.json({ message: "Queue cleared successfully" }); 371 | } catch (error) { 372 | next(error); 373 | } 374 | } 375 | ); 376 | 377 | // Process queue 378 | this.app.post( 379 | "/process-queue", 380 | async (req: Request, res: Response, next: NextFunction) => { 381 | try { 382 | // Start processing queue in background 383 | this.runQueueTool.execute({}).catch((error) => { 384 | console.error("Error processing queue:", error); 385 | }); 386 | 387 | res.json({ message: "Queue processing started" }); 388 | } catch (error) { 389 | next(error); 390 | } 391 | } 392 | ); 393 | 394 | // Remove documentation (single or multiple) 395 | this.app.delete( 396 | "/documents", 397 | async (req: Request, res: Response, next: NextFunction) => { 398 | try { 399 | const { url, urls } = req.body; 400 | if (!url && (!urls || !Array.isArray(urls))) { 401 | const error: ApiError = new Error( 402 | "URL or array of URLs is required" 403 | ); 404 | error.status = 400; 405 | throw error; 406 | } 407 | 408 | const urlsToRemove = urls || [url]; 409 | await this.removeDocTool.execute({ urls: urlsToRemove }); 410 | res.json({ 411 | message: 
`${urlsToRemove.length} document${ 412 | urlsToRemove.length === 1 ? "" : "s" 413 | } removed successfully`, 414 | count: urlsToRemove.length, 415 | }); 416 | } catch (error) { 417 | next(error); 418 | } 419 | } 420 | ); 421 | 422 | // Remove all documents 423 | this.app.delete( 424 | "/documents/all", 425 | async (req: Request, res: Response, next: NextFunction) => { 426 | try { 427 | // First get all documents 428 | const response = await this.listSourcesTool.execute({}); 429 | const sourcesText = response.content[0].text; 430 | 431 | if ( 432 | sourcesText === 433 | "No documentation sources found in the cloud collection." 434 | ) { 435 | res.json({ message: "No documents to remove", count: 0 }); 436 | return; 437 | } 438 | 439 | // Extract URLs from the sources 440 | const urls = sourcesText 441 | .split("\n") 442 | .map((line) => { 443 | const match = line.match(/(.*?) \((.*?)\)/); 444 | return match ? match[2] : null; 445 | }) 446 | .filter((url): url is string => url !== null); 447 | 448 | if (urls.length === 0) { 449 | res.json({ message: "No documents to remove", count: 0 }); 450 | return; 451 | } 452 | 453 | // Remove all documents 454 | await this.removeDocTool.execute({ urls }); 455 | res.json({ 456 | message: `${urls.length} document${ 457 | urls.length === 1 ? "" : "s" 458 | } removed successfully`, 459 | count: urls.length, 460 | }); 461 | } catch (error) { 462 | next(error); 463 | } 464 | } 465 | ); 466 | 467 | // Extract URLs 468 | this.app.post( 469 | "/extract-urls", 470 | async (req: Request, res: Response, next: NextFunction) => { 471 | try { 472 | const { url } = req.body; 473 | if (!url) { 474 | const error: ApiError = new Error("URL is required"); 475 | error.status = 400; 476 | throw error; 477 | } 478 | 479 | const response = await this.extractUrlsTool.execute({ url }); 480 | const urls = response.content[0].text 481 | .split("\n") 482 | .filter((url) => url.trim()); 483 | 484 | res.json({ urls }); 485 | } catch (error) { 486 | next(error); 487 | } 488 | } 489 | ); 490 | 491 | this.app.use(errorHandler); 492 | } 493 | 494 | async start() { 495 | const port = await getAvailablePort(3030); 496 | this.server = this.app.listen(port, () => { 497 | console.log(`Web interface running at http://localhost:${port}`); 498 | }); 499 | } 500 | 501 | async stop() { 502 | if (this.server) { 503 | return new Promise((resolve) => { 504 | this.server.close(() => { 505 | console.log("Web interface stopped"); 506 | resolve(true); 507 | }); 508 | }); 509 | } 510 | } 511 | } 512 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG Documentation MCP Server 2 | [![smithery badge](https://smithery.ai/badge/@rahulretnan/mcp-ragdocs)](https://smithery.ai/server/@rahulretnan/mcp-ragdocs) 3 | 4 | An MCP server implementation that provides tools for retrieving and processing documentation through vector search, enabling AI assistants to augment their responses with relevant documentation context. 
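
As a quick illustration, a typical tool invocation looks like this (the invocation style mirrors the examples later in this README; the query text is only an example):

```
search_documentation with {
  "query": "How do I configure the embedding provider?",
  "limit": 5
}
```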
5 | 6 | ## Table of Contents 7 | 8 | - [Features](#features) 9 | - [Quick Start](#quick-start) 10 | - [Docker Compose Setup](#docker-compose-setup) 11 | - [Web Interface](#web-interface) 12 | - [Configuration](#configuration) 13 | - [Cline Configuration](#cline-configuration) 14 | - [Claude Desktop Configuration](#claude-desktop-configuration) 15 | - [Acknowledgments](#acknowledgments) 16 | - [Troubleshooting](#troubleshooting) 17 | 18 | ## Features 19 | 20 | ### Tools 21 | 22 | 1. **search_documentation** 23 | 24 | - Search through the documentation using vector search 25 | - Returns relevant chunks of documentation with source information 26 | 27 | 2. **list_sources** 28 | 29 | - List all available documentation sources 30 | - Provides metadata about each source 31 | 32 | 3. **extract_urls** 33 | 34 | - Extract URLs from text and check if they're already in the documentation 35 | - Useful for preventing duplicate documentation 36 | 37 | 4. **remove_documentation** 38 | 39 | - Remove documentation from a specific source 40 | - Cleans up outdated or irrelevant documentation 41 | 42 | 5. **list_queue** 43 | 44 | - List all items in the processing queue 45 | - Shows status of pending documentation processing 46 | 47 | 6. **run_queue** 48 | 49 | - Process all items in the queue 50 | - Automatically adds new documentation to the vector store 51 | 52 | 7. **clear_queue** 53 | 54 | - Clear all items from the processing queue 55 | - Useful for resetting the system 56 | 57 | 8. **add_documentation** 58 | - Add new documentation directly to the system by providing a URL 59 | - Automatically fetches, processes, and indexes the content 60 | - Supports various web page formats and extracts relevant content 61 | - Chunks content intelligently for optimal retrieval 62 | - Required parameter: `url` (must include protocol, e.g., https://) 63 | 64 | 9. **add_repository** 65 | - Index a local code repository for documentation 66 | - Configure include/exclude patterns for files and directories 67 | - Supports different chunking strategies based on file types 68 | - Uses asynchronous processing to avoid MCP timeouts with large repositories 69 | - Provides detailed progress logging (heartbeat) to `stderr` during indexing 70 | - Required parameter: `path` (absolute path to repository) 71 | 72 | 10. **list_repositories** 73 | - List all indexed repositories with their configurations 74 | - Shows include/exclude patterns and watch status 75 | 76 | 11. **update_repository** 77 | - Re-index a repository with updated configuration 78 | - Can modify include/exclude patterns and other settings 79 | - Provides detailed progress logging (heartbeat) to `stderr` during re-indexing 80 | - Required parameter: `name` (repository name) 81 | 82 | 12. **remove_repository** 83 | - Remove a repository from the index 84 | - Deletes all associated documents from the vector database 85 | - Required parameter: `name` (repository name) 86 | 87 | 13. **watch_repository** 88 | - Start or stop watching a repository for changes 89 | - Automatically updates the index when files change 90 | - Required parameters: `name` (repository name) and `action` ("start" or "stop") 91 | 92 | 14. 
**get_indexing_status** 93 | - Get the current status of repository indexing operations 94 | - Provides detailed information about ongoing or completed indexing processes 95 | - Shows progress percentage, file counts, and timing information 96 | - Optional parameter: `name` (repository name) - if not provided, returns status for all repositories 97 | 98 | ## Quick Start 99 | 100 | The RAG Documentation tool is designed for: 101 | 102 | - Enhancing AI responses with relevant documentation 103 | - Building documentation-aware AI assistants 104 | - Creating context-aware tooling for developers 105 | - Implementing semantic documentation search 106 | - Augmenting existing knowledge bases 107 | 108 | ## Docker Compose Setup 109 | 110 | The project includes a `docker-compose.yml` file for easy containerized deployment. To start the services: 111 | 112 | ```bash 113 | docker-compose up -d 114 | ``` 115 | 116 | To stop the services: 117 | 118 | ```bash 119 | docker-compose down 120 | ``` 121 | 122 | ## Web Interface 123 | 124 | The system includes a web interface that can be accessed after starting the Docker Compose services: 125 | 126 | 1. Open your browser and navigate to: `http://localhost:3030` 127 | 2. The interface provides: 128 | - Real-time queue monitoring 129 | - Documentation source management 130 | - Search interface for testing queries 131 | - System status and health checks 132 | 133 | ## Configuration 134 | 135 | ### Embeddings Configuration 136 | 137 | The system uses Ollama as the default embedding provider for local embeddings generation, with OpenAI available as a fallback option. This setup prioritizes local processing while maintaining reliability through cloud-based fallback. 138 | 139 | #### Environment Variables 140 | 141 | - `EMBEDDING_PROVIDER`: Choose the primary embedding provider ('ollama' or 'openai', default: 'ollama') 142 | - `EMBEDDING_MODEL`: Specify the model to use (optional) 143 | - For OpenAI: defaults to 'text-embedding-3-small' 144 | - For Ollama: defaults to 'nomic-embed-text' 145 | - `OPENAI_API_KEY`: Required when using OpenAI as provider 146 | - `FALLBACK_PROVIDER`: Optional backup provider ('ollama' or 'openai') 147 | - `FALLBACK_MODEL`: Optional model for fallback provider 148 | 149 | ### Cline Configuration 150 | 151 | Add this to your `cline_mcp_settings.json`: 152 | 153 | ```json 154 | { 155 | "mcpServers": { 156 | "rag-docs": { 157 | "command": "node", 158 | "args": ["/path/to/your/mcp-ragdocs/build/index.js"], 159 | "env": { 160 | "EMBEDDING_PROVIDER": "ollama", // default 161 | "EMBEDDING_MODEL": "nomic-embed-text", // optional 162 | "OPENAI_API_KEY": "your-api-key-here", // required for fallback 163 | "FALLBACK_PROVIDER": "openai", // recommended for reliability 164 | "FALLBACK_MODEL": "text-embedding-3-small", // optional 165 | "QDRANT_URL": "http://localhost:6333" 166 | }, 167 | "disabled": false, 168 | "autoApprove": [ 169 | "search_documentation", 170 | "list_sources", 171 | "extract_urls", 172 | "remove_documentation", 173 | "list_queue", 174 | "run_queue", 175 | "clear_queue", 176 | "add_documentation", 177 | "add_repository", 178 | "list_repositories", 179 | "update_repository", 180 | "remove_repository", 181 | "watch_repository", 182 | "get_indexing_status" 183 | ] 184 | } 185 | } 186 | } 187 | ``` 188 | 189 | ### Claude Desktop Configuration 190 | 191 | Add this to your `claude_desktop_config.json`: 192 | 193 | ```json 194 | { 195 | "mcpServers": { 196 | "rag-docs": { 197 | "command": "node", 198 | "args":
["/path/to/your/mcp-ragdocs/build/index.js"], 199 | "env": { 200 | "EMBEDDING_PROVIDER": "ollama", // default 201 | "EMBEDDING_MODEL": "nomic-embed-text", // optional 202 | "OPENAI_API_KEY": "your-api-key-here", // required for fallback 203 | "FALLBACK_PROVIDER": "openai", // recommended for reliability 204 | "FALLBACK_MODEL": "nomic-embed-text", // optional 205 | "QDRANT_URL": "http://localhost:6333" 206 | }, 207 | "autoApprove": [ 208 | "search_documentation", 209 | "list_sources", 210 | "extract_urls", 211 | "remove_documentation", 212 | "list_queue", 213 | "run_queue", 214 | "clear_queue", 215 | "add_documentation", 216 | "add_repository", 217 | "list_repositories", 218 | "update_repository", 219 | "remove_repository", 220 | "watch_repository", 221 | "get_indexing_status" 222 | ] 223 | } 224 | } 225 | } 226 | ``` 227 | 228 | ### Default Configuration 229 | 230 | The system uses Ollama by default for efficient local embedding generation. For optimal reliability: 231 | 232 | 1. Install and run Ollama locally 233 | 2. Configure OpenAI as fallback (recommended): 234 | ```json 235 | { 236 | // Ollama is used by default, no need to specify EMBEDDING_PROVIDER 237 | "EMBEDDING_MODEL": "nomic-embed-text", // optional 238 | "FALLBACK_PROVIDER": "openai", 239 | "FALLBACK_MODEL": "text-embedding-3-small", 240 | "OPENAI_API_KEY": "your-api-key-here" 241 | } 242 | ``` 243 | 244 | This configuration ensures: 245 | - Fast, local embedding generation with Ollama 246 | - Automatic fallback to OpenAI if Ollama fails 247 | - No external API calls unless necessary 248 | 249 | Note: The system will automatically use the appropriate vector dimensions based on the provider: 250 | - Ollama (nomic-embed-text): 768 dimensions 251 | - OpenAI (text-embedding-3-small): 1536 dimensions 252 | 253 | ## Documentation Management 254 | 255 | ### Direct vs. Queue-Based Documentation Addition 256 | 257 | The system provides two complementary approaches for adding documentation: 258 | 259 | 1. **Direct Addition (`add_documentation` tool)** 260 | - Immediately processes and indexes the documentation from a URL 261 | - Best for adding individual documentation sources 262 | - Provides immediate feedback on processing success/failure 263 | - Example usage: `add_documentation` with `url: "https://example.com/docs"` 264 | 265 | 2. **Queue-Based Processing** 266 | - Add URLs to a processing queue (`extract_urls` with `add_to_queue: true`) 267 | - Process multiple URLs in batch later (`run_queue`) 268 | - Better for large-scale documentation ingestion 269 | - Allows for scheduled processing of many documentation sources 270 | - Provides resilience through the queue system 271 | 272 | Choose the approach that best fits your documentation management needs. For small numbers of important documents, direct addition provides immediate results. For large documentation sets or recursive crawling, the queue-based approach offers better scalability. 273 | 274 | ### Local Repository Indexing 275 | 276 | The system supports indexing local code repositories, making their content searchable alongside web documentation: 277 | 278 | 1. **Repository Configuration** 279 | - Define which files to include/exclude using glob patterns 280 | - Configure chunking strategies per file type 281 | - Set up automatic change detection with watch mode 282 | 283 | 2. 
**File Processing** 284 | - Files are processed based on their type and language 285 | - Code is chunked intelligently to preserve context 286 | - Metadata like file path and language are preserved 287 | 288 | 3. **Asynchronous Processing** 289 | - Large repositories are processed asynchronously to avoid MCP timeouts 290 | - Indexing continues in the background after the initial response 291 | - Progress can be monitored using the `get_indexing_status` tool 292 | - Smaller batch sizes (50 chunks per batch) improve responsiveness 293 | 294 | 4. **Change Detection** 295 | - Repositories can be watched for changes 296 | - Modified files are automatically re-indexed 297 | - Deleted files are removed from the index 298 | 299 | Example usage: 300 | ``` 301 | add_repository with { 302 | "path": "/path/to/your/repo", 303 | "name": "my-project", 304 | "include": ["**/*.js", "**/*.ts", "**/*.md"], 305 | "exclude": ["**/node_modules/**", "**/dist/**"], 306 | "watchMode": true 307 | } 308 | ``` 309 | 310 | After starting the indexing process, you can check its status: 311 | ``` 312 | get_indexing_status with { 313 | "name": "my-project" 314 | } 315 | ``` 316 | 317 | This will return detailed information about the indexing progress: 318 | ``` 319 | Repository: my-project 320 | Status: 🔄 Processing 321 | Progress: 45% 322 | Started: 5/11/2025, 2:45:30 PM 323 | Duration: 3m 15s 324 | Files: 120 processed, 15 skipped (of 250) 325 | Chunks: 1500 indexed (of 3300) 326 | Batch: 15 of 33 327 | ``` 328 | 329 | ### Repository Configuration File 330 | 331 | The system supports a `repositories.json` configuration file that allows you to define repositories to be automatically indexed at startup: 332 | 333 | ```json 334 | { 335 | "repositories": [ 336 | { 337 | "path": "/path/to/your/repo", 338 | "name": "my-project", 339 | "include": ["**/*.js", "**/*.ts", "**/*.md"], 340 | "exclude": ["**/node_modules/**", "**/.git/**"], 341 | "watchMode": true, 342 | "watchInterval": 60000, 343 | "chunkSize": 1000, 344 | "fileTypeConfig": { 345 | ".js": { "include": true, "chunkStrategy": "semantic" }, 346 | ".ts": { "include": true, "chunkStrategy": "semantic" }, 347 | ".md": { "include": true, "chunkStrategy": "semantic" } 348 | } 349 | } 350 | ], 351 | "autoWatch": true 352 | } 353 | ``` 354 | 355 | The configuration file is automatically updated when repositories are added, updated, or removed using the repository management tools. You can also manually edit the file to configure repositories before starting the server. The paths within the configuration file, such as the `path` for each repository and the location of `repositories.json` itself, are resolved relative to the project root directory where the server is executed. A sketch of the structure the server expects is shown below.
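
For reference, the structure the server parses this file into can be sketched with the following TypeScript interfaces (an illustrative reconstruction based on how the configuration is used by the loader and handlers; the authoritative definitions live in `src/types.ts`, and the `FileTypeSettings` name here is hypothetical):

```typescript
// Sketch only -- field names mirror the options documented below.
interface FileTypeSettings {
  include?: boolean;                 // whether files of this type are indexed
  chunkStrategy?: 'semantic' | 'line' | 'character';
  chunkSize?: number;                // optional per-type override
}

interface RepositoryConfig {
  path: string;                      // absolute path to the repository
  name: string;                      // unique repository name
  include: string[];                 // glob patterns to include
  exclude: string[];                 // glob patterns to exclude
  watchMode: boolean;                // watch the repository for changes
  watchInterval: number;             // polling interval in milliseconds
  chunkSize: number;                 // default chunk size in characters
  fileTypeConfig: Record<string, FileTypeSettings>; // keyed by extension, e.g. ".ts"
}

interface RepositoriesConfig {
  repositories: RepositoryConfig[];
  autoWatch: boolean;                // auto-start watching watchMode repositories
}
```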
364 | 365 | **Configuration Options:** 366 | 367 | - `repositories`: Array of repository configurations 368 | - `path`: Absolute path to the repository directory 369 | - `name`: Unique name for the repository 370 | - `include`: Array of glob patterns to include 371 | - `exclude`: Array of glob patterns to exclude 372 | - `watchMode`: Whether to watch for changes 373 | - `watchInterval`: Polling interval in milliseconds 374 | - `chunkSize`: Default chunk size for files 375 | - `fileTypeConfig`: Configuration for specific file types 376 | - `include`: Whether to include this file type 377 | - `chunkStrategy`: Chunking strategy ("semantic", "line", or "character") 378 | - `chunkSize`: Optional override for chunk size 379 | 380 | - `autoWatch`: Whether to automatically start watching repositories with `watchMode: true` at startup 381 | 382 | ## Acknowledgments 383 | 384 | This project is a fork of [qpd-v/mcp-ragdocs](https://github.com/qpd-v/mcp-ragdocs), originally developed by qpd-v. The original project provided the foundation for this implementation. 385 | 386 | Special thanks to the original creator, qpd-v, for their innovative work on the initial version of this MCP server. This fork has been enhanced with additional features and improvements by Rahul Retnan. 387 | 388 | ## Troubleshooting 389 | 390 | ### Server Not Starting (Port Conflict) 391 | 392 | If the MCP server fails to start due to a port conflict, follow these steps: 393 | 394 | 1. Identify and kill the process using port 3030: 395 | 396 | ```bash 397 | npx kill-port 3030 398 | ``` 399 | 400 | 2. Restart the MCP server 401 | 402 | 3. If the issue persists, check for other processes using the port: 403 | 404 | ```bash 405 | lsof -i :3030 406 | ``` 407 | 408 | 4. You can also change the default port in the configuration if needed 409 | 410 | ### Missing Tools in Claude Desktop 411 | 412 | If certain tools (like `add_documentation`) are not appearing in Claude Desktop: 413 | 414 | 1. Verify that the tool is properly registered in the server's `handler-registry.ts` file 415 | 2. Make sure the tool is included in the `ListToolsRequestSchema` handler response 416 | 3. Check that your Claude Desktop configuration includes the tool in the `autoApprove` array 417 | 4. Restart the Claude Desktop application and the MCP server 418 | 5. Check the server logs for any errors related to tool registration 419 | 420 | The most common cause of missing tools is that they are registered as handlers but not included in the `tools` array returned by the `ListToolsRequestSchema` handler. 421 | 422 | ### Timeout Issues with Large Repositories 423 | 424 | If you encounter timeout errors when indexing large repositories: 425 | 426 | 1. The system now uses asynchronous processing to avoid MCP timeouts 427 | 2. When adding a repository with `add_repository`, the indexing will continue in the background 428 | 3. Use the `get_indexing_status` tool to monitor progress 429 | 4. 
If you still experience issues, try these solutions: 430 | - Reduce the scope of indexing with more specific include/exclude patterns 431 | - Break up very large repositories into smaller logical units 432 | - Increase the batch size in the code if your system has more resources available 433 | - Check system resources (memory, CPU) during indexing to identify bottlenecks 434 | -------------------------------------------------------------------------------- /src/handler-registry.ts: -------------------------------------------------------------------------------- 1 | import { 2 | CallToolRequestSchema, 3 | ErrorCode, 4 | ListToolsRequestSchema, 5 | McpError, 6 | ListPromptsRequestSchema, 7 | ListResourcesRequestSchema, 8 | } from '@modelcontextprotocol/sdk/types.js'; 9 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 10 | import { ApiClient } from './api-client.js'; 11 | import { ToolDefinition } from './types.js'; 12 | import { 13 | AddDocumentationHandler, 14 | SearchDocumentationHandler, 15 | ListSourcesHandler, 16 | RemoveDocumentationHandler, 17 | ExtractUrlsHandler, 18 | ListQueueHandler, 19 | RunQueueHandler, 20 | ClearQueueHandler, 21 | PromptsListHandler, 22 | ResourcesListHandler, 23 | LocalRepositoryHandler, 24 | ListRepositoriesHandler, 25 | RemoveRepositoryHandler, 26 | UpdateRepositoryHandler, 27 | WatchRepositoryHandler, 28 | GetIndexingStatusHandler, 29 | } from './handlers/index.js'; 30 | 31 | const COLLECTION_NAME = 'documentation'; 32 | 33 | export class HandlerRegistry { 34 | private server: Server; 35 | private apiClient: ApiClient; 36 | private handlers: Map<string, any>; 37 | 38 | constructor(server: Server, apiClient: ApiClient) { 39 | this.server = server; 40 | this.apiClient = apiClient; 41 | this.handlers = new Map(); 42 | this.setupHandlers(); 43 | this.registerHandlers(); 44 | } 45 | 46 | private setupHandlers() { 47 | // Web documentation handlers 48 | this.handlers.set('add_documentation', new AddDocumentationHandler(this.server, this.apiClient)); 49 | this.handlers.set('search_documentation', new SearchDocumentationHandler(this.server, this.apiClient)); 50 | this.handlers.set('list_sources', new ListSourcesHandler(this.server, this.apiClient)); 51 | this.handlers.set('remove_documentation', new RemoveDocumentationHandler(this.server, this.apiClient)); 52 | this.handlers.set('extract_urls', new ExtractUrlsHandler(this.server, this.apiClient)); 53 | this.handlers.set('list_queue', new ListQueueHandler(this.server, this.apiClient)); 54 | this.handlers.set('run_queue', new RunQueueHandler(this.server, this.apiClient)); 55 | this.handlers.set('clear_queue', new ClearQueueHandler(this.server, this.apiClient)); 56 | 57 | // Repository handlers 58 | this.handlers.set('add_repository', new LocalRepositoryHandler(this.server, this.apiClient)); 59 | this.handlers.set('list_repositories', new ListRepositoriesHandler(this.server, this.apiClient)); 60 | this.handlers.set('remove_repository', new RemoveRepositoryHandler(this.server, this.apiClient)); 61 | this.handlers.set('update_repository', new UpdateRepositoryHandler(this.server, this.apiClient)); 62 | this.handlers.set('watch_repository', new WatchRepositoryHandler(this.server, this.apiClient)); 63 | this.handlers.set('get_indexing_status', new GetIndexingStatusHandler(this.server, this.apiClient)); 64 | 65 | // Setup prompts and resources handlers 66 | this.handlers.set('prompts/list', new PromptsListHandler(this.server, this.apiClient)); 67 | this.handlers.set('resources/list', new
ResourcesListHandler(this.server, this.apiClient)); 68 | } 69 | 70 | private registerHandlers() { 71 | this.server.setRequestHandler(ListToolsRequestSchema, async () => ({ 72 | tools: [ 73 | { 74 | name: 'search_documentation', 75 | description: 'Search through stored documentation using natural language queries. Use this tool to find relevant information across all stored documentation sources. Returns matching excerpts with context, ranked by relevance. Useful for finding specific information, code examples, or related documentation.', 76 | inputSchema: { 77 | type: 'object', 78 | properties: { 79 | query: { 80 | type: 'string', 81 | description: 'The text to search for in the documentation. Can be a natural language query, specific terms, or code snippets.', 82 | }, 83 | limit: { 84 | type: 'number', 85 | description: 'Maximum number of results to return (1-20). Higher limits provide more comprehensive results but may take longer to process. Default is 5.', 86 | default: 5, 87 | }, 88 | }, 89 | required: ['query'], 90 | }, 91 | } as ToolDefinition, 92 | { 93 | name: 'add_documentation', 94 | description: 'Add new documentation to the system by providing a URL. The tool will fetch the content, process it into chunks, and store it in the vector database for future searches. Supports various web page formats and automatically extracts relevant content.', 95 | inputSchema: { 96 | type: 'object', 97 | properties: { 98 | url: { 99 | type: 'string', 100 | description: 'The complete URL of the documentation to add (must include protocol, e.g., https://). The page must be publicly accessible.', 101 | }, 102 | }, 103 | required: ['url'], 104 | }, 105 | } as ToolDefinition, 106 | { 107 | name: 'list_sources', 108 | description: 'List all documentation sources currently stored in the system. Returns a comprehensive list of all indexed documentation including source URLs, titles, and last update times. Use this to understand what documentation is available for searching or to verify if specific sources have been indexed.', 109 | inputSchema: { 110 | type: 'object', 111 | properties: {}, 112 | }, 113 | } as ToolDefinition, 114 | { 115 | name: 'extract_urls', 116 | description: 'Extract and analyze all URLs from a given web page. This tool crawls the specified webpage, identifies all hyperlinks, and optionally adds them to the processing queue. Useful for discovering related documentation pages, API references, or building a documentation graph. Handles various URL formats and validates links before extraction.', 117 | inputSchema: { 118 | type: 'object', 119 | properties: { 120 | url: { 121 | type: 'string', 122 | description: 'The complete URL of the webpage to analyze (must include protocol, e.g., https://). The page must be publicly accessible.', 123 | }, 124 | add_to_queue: { 125 | type: 'boolean', 126 | description: 'If true, automatically add extracted URLs to the processing queue for later indexing. This enables recursive documentation discovery. Use with caution on large sites to avoid excessive queuing.', 127 | default: false, 128 | }, 129 | }, 130 | required: ['url'], 131 | }, 132 | } as ToolDefinition, 133 | { 134 | name: 'remove_documentation', 135 | description: 'Remove specific documentation sources from the system by their URLs. Use this tool to clean up outdated documentation, remove incorrect sources, or manage the documentation collection. The removal is permanent and will affect future search results. 
Supports removing multiple URLs in a single operation.', 136 | inputSchema: { 137 | type: 'object', 138 | properties: { 139 | urls: { 140 | type: 'array', 141 | items: { 142 | type: 'string', 143 | description: 'The complete URL of the documentation source to remove. Must exactly match the URL used when the documentation was added.', 144 | }, 145 | description: 'Array of URLs to remove from the database', 146 | }, 147 | }, 148 | required: ['urls'], 149 | }, 150 | } as ToolDefinition, 151 | { 152 | name: 'list_queue', 153 | description: 'List all URLs currently waiting in the documentation processing queue. Shows pending documentation sources that will be processed when run_queue is called. Use this to monitor queue status, verify URLs were added correctly, or check processing backlog. Returns URLs in the order they will be processed.', 154 | inputSchema: { 155 | type: 'object', 156 | properties: {}, 157 | }, 158 | } as ToolDefinition, 159 | { 160 | name: 'run_queue', 161 | description: 'Process and index all URLs currently in the documentation queue. Each URL is processed sequentially, with proper error handling and retry logic. Progress updates are provided as processing occurs. Use this after adding new URLs to ensure all documentation is indexed and searchable. Long-running operations will process until the queue is empty or an unrecoverable error occurs.', 162 | inputSchema: { 163 | type: 'object', 164 | properties: {}, 165 | }, 166 | } as ToolDefinition, 167 | { 168 | name: 'clear_queue', 169 | description: 'Remove all pending URLs from the documentation processing queue. Use this to reset the queue when you want to start fresh, remove unwanted URLs, or cancel pending processing. This operation is immediate and permanent - URLs will need to be re-added if you want to process them later. Returns the number of URLs that were cleared from the queue.', 170 | inputSchema: { 171 | type: 'object', 172 | properties: {}, 173 | }, 174 | } as ToolDefinition, 175 | { 176 | name: 'add_repository', 177 | description: 'Add a local code repository to the documentation system. This tool indexes all files in the repository according to the specified configuration, processes them into searchable chunks, and stores them in the vector database for future searches.', 178 | inputSchema: { 179 | type: 'object', 180 | properties: { 181 | path: { 182 | type: 'string', 183 | description: 'The absolute path to the repository directory on the local file system.', 184 | }, 185 | name: { 186 | type: 'string', 187 | description: 'A user-friendly name for the repository. If not provided, the directory name will be used.', 188 | }, 189 | include: { 190 | type: 'array', 191 | items: { 192 | type: 'string', 193 | }, 194 | description: 'Array of glob patterns to include. Default is ["**/*"] (all files).', 195 | }, 196 | exclude: { 197 | type: 'array', 198 | items: { 199 | type: 'string', 200 | }, 201 | description: 'Array of glob patterns to exclude. Default excludes common non-source directories and files.', 202 | }, 203 | watchMode: { 204 | type: 'boolean', 205 | description: 'Whether to watch the repository for changes and automatically update the index. Default is false.', 206 | }, 207 | watchInterval: { 208 | type: 'number', 209 | description: 'Interval in milliseconds to check for changes when watch mode is enabled. Default is 60000 (1 minute).', 210 | }, 211 | chunkSize: { 212 | type: 'number', 213 | description: 'Default maximum size of text chunks in characters. 
Default is 1000.', 214 | }, 215 | fileTypeConfig: { 216 | type: 'object', 217 | description: 'Configuration for specific file types. Keys are file extensions, values are objects with include, chunkSize, and chunkStrategy properties.', 218 | }, 219 | }, 220 | required: ['path'], 221 | }, 222 | } as ToolDefinition, 223 | { 224 | name: 'list_repositories', 225 | description: 'List all local repositories currently indexed in the system. Returns details about each repository including path, include/exclude patterns, and watch mode status.', 226 | inputSchema: { 227 | type: 'object', 228 | properties: {}, 229 | }, 230 | } as ToolDefinition, 231 | { 232 | name: 'remove_repository', 233 | description: 'Remove a repository from the system by its name. This removes both the repository configuration and all indexed documents from the vector database.', 234 | inputSchema: { 235 | type: 'object', 236 | properties: { 237 | name: { 238 | type: 'string', 239 | description: 'The name of the repository to remove.', 240 | }, 241 | }, 242 | required: ['name'], 243 | }, 244 | } as ToolDefinition, 245 | { 246 | name: 'update_repository', 247 | description: 'Update an existing repository index. This re-processes all files in the repository according to the current configuration and updates the vector database.', 248 | inputSchema: { 249 | type: 'object', 250 | properties: { 251 | name: { 252 | type: 'string', 253 | description: 'The name of the repository to update.', 254 | }, 255 | include: { 256 | type: 'array', 257 | items: { 258 | type: 'string', 259 | }, 260 | description: 'Array of glob patterns to include. If provided, replaces the existing include patterns.', 261 | }, 262 | exclude: { 263 | type: 'array', 264 | items: { 265 | type: 'string', 266 | }, 267 | description: 'Array of glob patterns to exclude. If provided, replaces the existing exclude patterns.', 268 | }, 269 | watchMode: { 270 | type: 'boolean', 271 | description: 'Whether to watch the repository for changes. If provided, updates the existing watch mode setting.', 272 | }, 273 | watchInterval: { 274 | type: 'number', 275 | description: 'Interval in milliseconds to check for changes when watch mode is enabled.', 276 | }, 277 | chunkSize: { 278 | type: 'number', 279 | description: 'Default maximum size of text chunks in characters.', 280 | }, 281 | fileTypeConfig: { 282 | type: 'object', 283 | description: 'Configuration for specific file types. If provided, merges with the existing file type configuration.', 284 | }, 285 | }, 286 | required: ['name'], 287 | }, 288 | } as ToolDefinition, 289 | { 290 | name: 'watch_repository', 291 | description: 'Start or stop watching a repository for changes. When watching is enabled, the system automatically detects file changes and updates the index accordingly.', 292 | inputSchema: { 293 | type: 'object', 294 | properties: { 295 | name: { 296 | type: 'string', 297 | description: 'The name of the repository to watch or unwatch.', 298 | }, 299 | action: { 300 | type: 'string', 301 | enum: ['start', 'stop'], 302 | description: 'The action to perform: "start" to begin watching, "stop" to end watching.', 303 | }, 304 | }, 305 | required: ['name', 'action'], 306 | }, 307 | } as ToolDefinition, 308 | { 309 | name: 'get_indexing_status', 310 | description: 'Get the current status of repository indexing operations. 
This tool provides detailed information about ongoing or completed indexing processes, including progress percentage, file counts, and timing information.', 311 | inputSchema: { 312 | type: 'object', 313 | properties: { 314 | name: { 315 | type: 'string', 316 | description: 'Optional. The name of the repository to get status for. If not provided, returns status for all repositories.', 317 | }, 318 | }, 319 | required: [], 320 | }, 321 | } as ToolDefinition, 322 | ], 323 | })); 324 | 325 | // Register the prompts/list handler 326 | this.server.setRequestHandler(ListPromptsRequestSchema, async (request) => { 327 | const handler = this.handlers.get('prompts/list'); 328 | if (!handler) { 329 | throw new McpError( 330 | ErrorCode.MethodNotFound, 331 | 'Method prompts/list not found' 332 | ); 333 | } 334 | 335 | // Call the handler but ignore the response 336 | await handler.handle(request.params); 337 | // Return an empty list of prompts 338 | return { prompts: [] }; 339 | }); 340 | 341 | // Register the resources/list handler 342 | this.server.setRequestHandler(ListResourcesRequestSchema, async (request) => { 343 | const handler = this.handlers.get('resources/list'); 344 | if (!handler) { 345 | throw new McpError( 346 | ErrorCode.MethodNotFound, 347 | 'Method resources/list not found' 348 | ); 349 | } 350 | 351 | // Call the handler but ignore the response 352 | await handler.handle(request.params); 353 | // Return an empty list of resources 354 | return { resources: [] }; 355 | }); 356 | 357 | this.server.setRequestHandler(CallToolRequestSchema, async (request) => { 358 | await this.apiClient.initCollection(COLLECTION_NAME); 359 | 360 | const handler = this.handlers.get(request.params.name); 361 | if (!handler) { 362 | throw new McpError( 363 | ErrorCode.MethodNotFound, 364 | `Unknown tool: ${request.params.name}` 365 | ); 366 | } 367 | 368 | // Extract progressToken or use requestId as fallback 369 | const typedRequest = request as any; // Cast to any to access id 370 | const callContext = { 371 | progressToken: typedRequest.params._meta?.progressToken, 372 | requestId: typedRequest.id 373 | }; 374 | 375 | const response = await handler.handle(typedRequest.params.arguments, callContext); 376 | return { 377 | _meta: {}, // Ensure _meta is always present in the response 378 | ...response 379 | }; 380 | }); 381 | } 382 | } 383 | -------------------------------------------------------------------------------- /src/handlers/local-repository.ts: -------------------------------------------------------------------------------- 1 | import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js'; 2 | import { BaseHandler } from './base-handler.js'; 3 | import { DocumentChunk, McpToolResponse, RepositoryConfig, IndexingStatus } from '../types.js'; 4 | import fs from 'fs/promises'; 5 | import path from 'path'; 6 | import { fileURLToPath } from 'url'; 7 | import crypto from 'crypto'; 8 | import { glob } from 'glob'; 9 | import { fileTypeFromFile } from 'file-type'; 10 | import { detectLanguage } from '../utils/language-detection.js'; 11 | import { RepositoryConfigLoader } from '../utils/repository-config-loader.js'; 12 | import { IndexingStatusManager } from '../utils/indexing-status-manager.js'; 13 | 14 | const COLLECTION_NAME = 'documentation'; 15 | const __dirname = path.dirname(fileURLToPath(import.meta.url)); 16 | const REPO_CONFIG_DIR = path.join(__dirname, '..', 'repo-configs'); 17 | const DEFAULT_CHUNK_SIZE = 1000; 18 | 19 | export class LocalRepositoryHandler extends BaseHandler { 20 | 
private activeProgressToken: string | number | undefined; 21 | private statusManager: IndexingStatusManager; 22 | // Track active indexing processes 23 | private static activeIndexingProcesses: Map<string, boolean> = new Map(); 24 | // Smaller batch size to reduce processing time per batch 25 | private static BATCH_SIZE = 50; 26 | 27 | constructor(server: any, apiClient: any) { 28 | super(server, apiClient); 29 | this.statusManager = new IndexingStatusManager(); 30 | } 31 | 32 | async handle(args: any, callContext?: { progressToken?: string | number, requestId: string | number }): Promise<McpToolResponse> { 33 | this.activeProgressToken = callContext?.progressToken || callContext?.requestId; 34 | 35 | // Validate required parameters 36 | if (!args.path || typeof args.path !== 'string') { 37 | throw new McpError(ErrorCode.InvalidParams, 'Repository path is required'); 38 | } 39 | 40 | // Normalize the repository path 41 | const repoPath = path.resolve(args.path); 42 | 43 | // Check if the repository path exists 44 | try { 45 | const stats = await fs.stat(repoPath); 46 | if (!stats.isDirectory()) { 47 | throw new McpError(ErrorCode.InvalidParams, `Path is not a directory: ${repoPath}`); 48 | } 49 | } catch (error) { 50 | throw error instanceof McpError ? error : new McpError(ErrorCode.InvalidParams, `Invalid repository path: ${repoPath}`); // don't mask the more specific "not a directory" error 51 | } 52 | 53 | // Create repository configuration 54 | const config: RepositoryConfig = { 55 | path: repoPath, 56 | name: args.name || path.basename(repoPath), 57 | include: args.include || ['**/*'], 58 | exclude: args.exclude || [ 59 | '**/node_modules/**', 60 | '**/.git/**', 61 | '**/build/**', 62 | '**/dist/**', 63 | '**/*.min.js', 64 | '**/*.map', 65 | '**/package-lock.json', 66 | '**/yarn.lock' 67 | ], 68 | watchMode: args.watchMode || false, 69 | watchInterval: args.watchInterval || 60000, // Default: 1 minute 70 | chunkSize: args.chunkSize || DEFAULT_CHUNK_SIZE, 71 | fileTypeConfig: args.fileTypeConfig || { 72 | // Default file type configurations 73 | '.js': { include: true, chunkStrategy: 'semantic' }, 74 | '.ts': { include: true, chunkStrategy: 'semantic' }, 75 | '.jsx': { include: true, chunkStrategy: 'semantic' }, 76 | '.tsx': { include: true, chunkStrategy: 'semantic' }, 77 | '.py': { include: true, chunkStrategy: 'semantic' }, 78 | '.java': { include: true, chunkStrategy: 'semantic' }, 79 | '.md': { include: true, chunkStrategy: 'semantic' }, 80 | '.txt': { include: true, chunkStrategy: 'line' }, 81 | '.json': { include: true, chunkStrategy: 'semantic' }, 82 | '.html': { include: true, chunkStrategy: 'semantic' }, 83 | '.css': { include: true, chunkStrategy: 'semantic' }, 84 | '.scss': { include: true, chunkStrategy: 'semantic' }, 85 | '.xml': { include: true, chunkStrategy: 'semantic' }, 86 | '.yaml': { include: true, chunkStrategy: 'semantic' }, 87 | '.yml': { include: true, chunkStrategy: 'semantic' }, 88 | } 89 | }; 90 | 91 | try { 92 | // Check if indexing is already in progress for this repository 93 | if (LocalRepositoryHandler.activeIndexingProcesses.has(config.name)) { 94 | // Get current status 95 | const status = await this.statusManager.getStatus(config.name); 96 | if (status && status.status === 'processing') { 97 | return { 98 | content: [ 99 | { 100 | type: 'text', 101 | text: `Repository indexing already in progress for ${config.name}.\n` + 102 | `Current progress: ${status.percentageComplete || 0}%\n` + 103 | `Files processed: ${status.processedFiles || 0} of ${status.totalFiles || 'unknown'}\n` + 104 | `Chunks indexed: ${status.indexedChunks || 0} of ${status.totalChunks || 'unknown'}\n` +
105 | `Started at: ${new Date(status.startTime).toLocaleString()}` 106 | }, 107 | ], 108 | }; 109 | } 110 | } 111 | 112 | // Save the repository configuration 113 | await this.saveRepositoryConfig(config); 114 | 115 | // Update the repositories.json configuration file 116 | const configLoader = new RepositoryConfigLoader(this.server, this.apiClient); 117 | await configLoader.addRepositoryToConfig(config); 118 | console.info(`[${config.name}] Repository configuration saved and loaded.`); 119 | if (this.activeProgressToken) { 120 | (this.server as any).sendProgress(this.activeProgressToken, { message: "Repository configuration saved." }); 121 | } 122 | 123 | // Create initial status 124 | await this.statusManager.createStatus(config.name); 125 | 126 | // Start the indexing process asynchronously 127 | this.processRepositoryAsync(config, this.activeProgressToken); 128 | 129 | return { 130 | content: [ 131 | { 132 | type: 'text', 133 | text: `Repository configuration saved for ${config.name} (${repoPath}).\n` + 134 | `Indexing has started in the background and will continue after this response.\n` + 135 | `You can check the status using the 'get_indexing_status' tool with parameter name="${config.name}".\n` + 136 | `Watch mode: ${config.watchMode ? 'enabled' : 'disabled'}` 137 | }, 138 | ], 139 | }; 140 | } catch (error) { 141 | if (error instanceof McpError) { 142 | throw error; 143 | } 144 | return { 145 | content: [ 146 | { 147 | type: 'text', 148 | text: `Failed to index repository: ${error}`, 149 | }, 150 | ], 151 | isError: true, 152 | }; 153 | } 154 | } 155 | 156 | private async processRepository(config: RepositoryConfig): Promise<{ 157 | chunks: DocumentChunk[], 158 | processedFiles: number, 159 | skippedFiles: number 160 | }> { 161 | const chunks: DocumentChunk[] = []; 162 | let processedFiles = 0; 163 | let skippedFiles = 0; 164 | let fileCounter = 0; 165 | 166 | // Get all files matching the include/exclude patterns 167 | const files = await glob(config.include, { 168 | cwd: config.path, 169 | ignore: config.exclude, 170 | absolute: true, 171 | nodir: true, 172 | }); 173 | const totalFiles = files.length; 174 | 175 | console.info(`[${config.name}] Found ${totalFiles} files to process based on include/exclude patterns.`); 176 | if (this.activeProgressToken) { 177 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Found ${totalFiles} files to process.` }); 178 | } 179 | 180 | for (const file of files) { 181 | fileCounter++; 182 | try { 183 | const relativePath = path.relative(config.path, file); 184 | const extension = path.extname(file); 185 | const fileTypeConfig = config.fileTypeConfig[extension]; 186 | 187 | // Skip files that should be excluded based on file type config 188 | if (fileTypeConfig && fileTypeConfig.include === false) { 189 | skippedFiles++; 190 | continue; 191 | } 192 | 193 | // Read file content 194 | const content = await fs.readFile(file, 'utf-8'); 195 | 196 | // Skip empty files 197 | if (!content.trim()) { 198 | skippedFiles++; 199 | continue; 200 | } 201 | 202 | // Detect language for better processing 203 | const language = detectLanguage(file, content); 204 | 205 | // Process the file content into chunks 206 | const fileChunks = this.chunkFileContent( 207 | content, 208 | file, 209 | relativePath, 210 | config, 211 | language, 212 | fileTypeConfig?.chunkStrategy || 'line' 213 | ); 214 | 215 | chunks.push(...fileChunks); 216 | processedFiles++; 217 | if (fileCounter % 50 === 0 && fileCounter > 0 && this.activeProgressToken) { 218 | 
const percentageComplete = Math.round((fileCounter / totalFiles) * 33); // File processing is ~1/3 of the job 219 | (this.server as any).sendProgress(this.activeProgressToken, { message: `Processed ${fileCounter} of ${totalFiles} files...`, percentageComplete }); 220 | console.info(`[${config.name}] Processed ${fileCounter} of ${totalFiles} files... (${processedFiles} successful, ${skippedFiles} skipped/errored)`); 221 | } 222 | } catch (error) { 223 | console.error(`[${config.name}] Error processing file ${file}: ${error instanceof Error ? error.message : String(error)}`); 224 | skippedFiles++; 225 | } 226 | } 227 | console.info(`[${config.name}] Completed file iteration. Processed: ${processedFiles}, Skipped/Errored: ${skippedFiles}.`); 228 | 229 | return { chunks, processedFiles, skippedFiles }; 230 | } 231 | 232 | private chunkFileContent( 233 | content: string, 234 | filePath: string, 235 | relativePath: string, 236 | config: RepositoryConfig, 237 | language: string, 238 | chunkStrategy: string 239 | ): DocumentChunk[] { 240 | const chunks: DocumentChunk[] = []; 241 | const timestamp = new Date().toISOString(); 242 | const fileUrl = `file://${filePath}`; 243 | const title = `${config.name}/${relativePath}`; 244 | 245 | // Different chunking strategies based on file type 246 | let textChunks: string[] = []; 247 | 248 | switch (chunkStrategy) { 249 | case 'semantic': 250 | // For semantic chunking, we'd ideally use a more sophisticated approach 251 | // For now, we'll use a simple paragraph-based approach 252 | textChunks = this.chunkByParagraphs(content, config.chunkSize); 253 | break; 254 | case 'line': 255 | // Chunk by lines, respecting max chunk size 256 | textChunks = this.chunkByLines(content, config.chunkSize); 257 | break; 258 | default: 259 | // Default to simple text chunking 260 | textChunks = this.chunkText(content, config.chunkSize); 261 | } 262 | 263 | // Create document chunks with metadata 264 | chunks.push(...textChunks.map((text, index) => ({ 265 | text, 266 | url: fileUrl, 267 | title, 268 | timestamp, 269 | filePath: relativePath, 270 | language, 271 | chunkIndex: index, 272 | totalChunks: textChunks.length, 273 | }))); 274 | 275 | return chunks; 276 | } 277 | 278 | private chunkText(text: string, maxChunkSize: number): string[] { 279 | const words = text.split(/\s+/); 280 | const chunks: string[] = []; 281 | let currentChunk: string[] = []; 282 | 283 | for (const word of words) { 284 | currentChunk.push(word); 285 | const currentLength = currentChunk.join(' ').length; 286 | 287 | if (currentLength >= maxChunkSize) { 288 | chunks.push(currentChunk.join(' ')); 289 | currentChunk = []; 290 | } 291 | } 292 | 293 | if (currentChunk.length > 0) { 294 | chunks.push(currentChunk.join(' ')); 295 | } 296 | 297 | return chunks; 298 | } 299 | 300 | private chunkByLines(text: string, maxChunkSize: number): string[] { 301 | const lines = text.split(/\r?\n/); 302 | const chunks: string[] = []; 303 | let currentChunk: string[] = []; 304 | let currentLength = 0; 305 | 306 | for (const line of lines) { 307 | const lineLength = line.length + 1; // +1 for the newline 308 | 309 | if (currentLength + lineLength > maxChunkSize && currentChunk.length > 0) { 310 | chunks.push(currentChunk.join('\n')); 311 | currentChunk = []; 312 | currentLength = 0; 313 | } 314 | 315 | currentChunk.push(line); 316 | currentLength += lineLength; 317 | } 318 | 319 | if (currentChunk.length > 0) { 320 | chunks.push(currentChunk.join('\n')); 321 | } 322 | 323 | return chunks; 324 | } 325 | 326 | private 
chunkByParagraphs(text: string, maxChunkSize: number): string[] { 327 | // Split by double newlines (paragraphs) 328 | const paragraphs = text.split(/\r?\n\r?\n/); 329 | const chunks: string[] = []; 330 | let currentChunk: string[] = []; 331 | let currentLength = 0; 332 | 333 | for (const paragraph of paragraphs) { 334 | const paragraphLength = paragraph.length + 2; // +2 for the double newline 335 | 336 | if (currentLength + paragraphLength > maxChunkSize && currentChunk.length > 0) { 337 | chunks.push(currentChunk.join('\n\n')); 338 | currentChunk = []; 339 | currentLength = 0; 340 | } 341 | 342 | currentChunk.push(paragraph); 343 | currentLength += paragraphLength; 344 | } 345 | 346 | if (currentChunk.length > 0) { 347 | chunks.push(currentChunk.join('\n\n')); 348 | } 349 | 350 | return chunks; 351 | } 352 | 353 | private async saveRepositoryConfig(config: RepositoryConfig): Promise<void> { 354 | // Ensure the config directory exists 355 | try { 356 | await fs.mkdir(REPO_CONFIG_DIR, { recursive: true }); 357 | } catch (error) { 358 | console.error('Error creating repository config directory:', error); 359 | throw new McpError(ErrorCode.InternalError, 'Failed to create repository config directory'); 360 | } 361 | 362 | // Save the config file 363 | const configPath = path.join(REPO_CONFIG_DIR, `${config.name}.json`); 364 | await fs.writeFile(configPath, JSON.stringify(config, null, 2), 'utf-8'); 365 | } 366 | 367 | private generatePointId(): string { 368 | return crypto.randomBytes(16).toString('hex'); 369 | } 370 | 371 | /** 372 | * Process repository asynchronously to avoid MCP timeout 373 | */ 374 | private async processRepositoryAsync(config: RepositoryConfig, progressToken?: string | number): Promise<void> { 375 | try { 376 | // Mark this repository as being processed 377 | LocalRepositoryHandler.activeIndexingProcesses.set(config.name, true); 378 | 379 | // Update status to processing 380 | await this.statusManager.updateStatus({ 381 | repositoryName: config.name, 382 | status: 'processing' 383 | }); 384 | 385 | console.info(`[${config.name}] Starting to process repository files asynchronously...`); 386 | 387 | // Process the repository files 388 | const { chunks, processedFiles, skippedFiles } = await this.processRepository(config); 389 | 390 | // Update status with file processing results 391 | await this.statusManager.updateStatus({ 392 | repositoryName: config.name, 393 | totalFiles: processedFiles + skippedFiles, 394 | processedFiles, 395 | skippedFiles, 396 | totalChunks: chunks.length, 397 | percentageComplete: 33 398 | }); 399 | 400 | console.info(`[${config.name}] Finished processing repository files.
Found ${chunks.length} chunks from ${processedFiles} files (${skippedFiles} skipped).`); 401 | 402 | // Batch process chunks with smaller batch size for better responsiveness 403 | const batchSize = LocalRepositoryHandler.BATCH_SIZE; 404 | let indexedChunks = 0; 405 | const totalChunks = chunks.length; 406 | const totalBatches = Math.ceil(totalChunks / batchSize); 407 | 408 | console.info(`[${config.name}] Starting to generate embeddings and index ${totalChunks} chunks in ${totalBatches} batches...`); 409 | 410 | const COLLECTION_NAME = 'documentation'; 411 | 412 | for (let i = 0; i < totalChunks; i += batchSize) { 413 | const batchChunks = chunks.slice(i, i + batchSize); 414 | const currentBatch = Math.floor(i / batchSize) + 1; 415 | 416 | // Update status before processing batch 417 | await this.statusManager.updateStatus({ 418 | repositoryName: config.name, 419 | currentBatch, 420 | totalBatches, 421 | indexedChunks, 422 | percentageComplete: 33 + Math.round((i / totalChunks) * 66) 423 | }); 424 | 425 | console.info(`[${config.name}] Processing batch ${currentBatch} of ${totalBatches}...`); 426 | 427 | try { 428 | const embeddingResults = await Promise.allSettled( 429 | batchChunks.map(async (chunk) => { 430 | try { 431 | const embedding = await this.apiClient.getEmbeddings(chunk.text); 432 | return { 433 | id: this.generatePointId(), 434 | vector: embedding, 435 | payload: { 436 | ...chunk, 437 | _type: 'DocumentChunk' as const, 438 | repository: config.name, 439 | isRepositoryFile: true, 440 | } as Record<string, unknown>, 441 | }; 442 | } catch (embeddingError) { 443 | console.error(`[${config.name}] Failed to generate embedding for chunk from ${chunk.filePath || chunk.url}: ${embeddingError instanceof Error ? embeddingError.message : String(embeddingError)}`); 444 | throw embeddingError; // Re-throw to be caught by Promise.allSettled 445 | } 446 | }) 447 | ); 448 | 449 | const successfulPoints = embeddingResults 450 | .filter(result => result.status === 'fulfilled') 451 | .map(result => (result as PromiseFulfilledResult<any>).value); 452 | 453 | const failedEmbeddingsCount = embeddingResults.filter(result => result.status === 'rejected').length; 454 | if (failedEmbeddingsCount > 0) { 455 | console.warn(`[${config.name}] Failed to generate embeddings for ${failedEmbeddingsCount} of ${batchChunks.length} chunks in batch ${currentBatch}.`); 456 | } 457 | 458 | if (successfulPoints.length > 0) { 459 | try { 460 | await this.apiClient.qdrantClient.upsert(COLLECTION_NAME, { 461 | wait: true, 462 | points: successfulPoints, 463 | }); 464 | indexedChunks += successfulPoints.length; 465 | } catch (upsertError) { 466 | console.error(`[${config.name}] Failed to upsert batch ${currentBatch} of ${successfulPoints.length} points to Qdrant: ${upsertError instanceof Error ? upsertError.message : String(upsertError)}`); 467 | } 468 | } 469 | 470 | const percentageComplete = 33 + Math.round(((i + batchChunks.length) / totalChunks) * 66); 471 | console.info(`[${config.name}] Processed batch ${currentBatch} of ${totalBatches}. Successfully indexed in this batch: ${successfulPoints.length}.
Total indexed so far: ${indexedChunks} chunks.`); 472 | 473 | // Update status after processing batch 474 | await this.statusManager.updateStatus({ 475 | repositoryName: config.name, 476 | currentBatch, 477 | totalBatches, 478 | indexedChunks, 479 | percentageComplete 480 | }); 481 | } catch (batchError) { 482 | console.error(`[${config.name}] Error processing batch ${currentBatch}:`, batchError); 483 | // Continue with next batch despite errors 484 | } 485 | } 486 | 487 | // Mark indexing as completed 488 | console.info(`[${config.name}] Finished generating embeddings and indexing. Total indexed: ${indexedChunks} of ${totalChunks} chunks.`); 489 | 490 | await this.statusManager.completeStatus(config.name, true, { 491 | processedFiles, 492 | skippedFiles, 493 | totalChunks, 494 | indexedChunks 495 | }); 496 | 497 | // If watch mode is enabled, start the watcher 498 | if (config.watchMode) { 499 | // This would be implemented in a separate class 500 | // this.startRepositoryWatcher(config); 501 | } 502 | } catch (error) { 503 | console.error(`[${config.name}] Error during async repository processing:`, error); 504 | 505 | // Update status to failed 506 | await this.statusManager.completeStatus( 507 | config.name, 508 | false, 509 | undefined, 510 | error instanceof Error ? error.message : String(error) 511 | ); 512 | } finally { 513 | // Remove from active processes 514 | LocalRepositoryHandler.activeIndexingProcesses.delete(config.name); 515 | } 516 | } 517 | } 518 | --------------------------------------------------------------------------------
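A few usage sketches follow for the tool surface defined in handler-registry.ts and the indexing pipeline in local-repository.ts. First, the add_repository schema (path required; name, include, exclude, watchMode, watchInterval, chunkSize, fileTypeConfig optional) seen end to end. This is a minimal sketch assuming the @modelcontextprotocol/sdk client over stdio; the server entry point, repository path, and glob patterns are all illustrative:

```typescript
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

async function main() {
  // Spawn the MCP server over stdio; 'build/index.js' is an assumed entry point.
  const transport = new StdioClientTransport({ command: 'node', args: ['build/index.js'] });
  const client = new Client({ name: 'example-client', version: '0.0.1' });
  await client.connect(transport);

  // Mirrors the add_repository inputSchema: only `path` is required.
  const result = await client.callTool({
    name: 'add_repository',
    arguments: {
      path: '/home/me/projects/my-repo', // hypothetical local repository
      name: 'my-repo',
      include: ['src/**/*.ts', '**/*.md'],
      exclude: ['**/node_modules/**', '**/dist/**'],
      watchMode: true,
      watchInterval: 120_000, // re-check for changes every 2 minutes
    },
  });
  console.log(result.content);

  await client.close();
}

main().catch(console.error);
```

The response comes back immediately because handle() kicks off processRepositoryAsync without awaiting it; indexing continues in the background.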
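Since the tool call returns before indexing finishes, a caller is expected to poll get_indexing_status, as the returned text itself suggests. A sketch of that loop, reusing the connected client from the previous example; the string check on 'processing' is an assumption about the status handler's output format:

```typescript
// Poll get_indexing_status until the repository leaves the 'processing' state.
// `client` is the connected Client instance from the previous sketch.
async function waitForIndexing(client: Client, repositoryName: string): Promise<void> {
  for (;;) {
    const result: any = await client.callTool({
      name: 'get_indexing_status',
      arguments: { name: repositoryName },
    });
    const text: string = result.content?.[0]?.text ?? '';
    console.log(text);
    // IndexingStatusManager reports 'processing' while file and batch work is
    // ongoing; treat anything else (completed/failed) as terminal.
    if (!text.includes('processing')) break;
    await new Promise((resolve) => setTimeout(resolve, 5_000)); // poll every 5s
  }
}
```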
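The fileTypeConfig argument is described only loosely in the schema ('keys are file extensions, values are objects with include, chunkSize, and chunkStrategy properties'). Written out against the defaults in local-repository.ts, a caller-supplied value might look like the following. The FileTypeConfig alias here is illustrative (the authoritative shape lives in src/types.ts), and note that the chunking code above only reads the repository-level config.chunkSize, so the per-type chunkSize is schema-advertised but not visibly honored:

```typescript
// Illustrative alias; see RepositoryConfig in src/types.ts for the real shape.
type FileTypeConfig = Record<string, {
  include: boolean;
  chunkSize?: number;                   // advertised per-type override
  chunkStrategy?: 'semantic' | 'line';  // anything else falls back to chunkText
}>;

const fileTypeConfig: FileTypeConfig = {
  '.ts':  { include: true,  chunkStrategy: 'semantic' },
  '.md':  { include: true,  chunkStrategy: 'semantic', chunkSize: 2000 },
  '.csv': { include: true,  chunkStrategy: 'line', chunkSize: 500 },
  '.log': { include: false },           // processRepository skips these files
};
```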
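All three chunkers share the same accumulate-and-flush shape; chunkByLines is the easiest to sanity-check in isolation. Below is a standalone copy of its logic plus a small driver. The behavior matches the method above: a chunk is flushed just before a line would push it past maxChunkSize, and a single over-long line still becomes its own chunk:

```typescript
function chunkByLines(text: string, maxChunkSize: number): string[] {
  const lines = text.split(/\r?\n/);
  const chunks: string[] = [];
  let currentChunk: string[] = [];
  let currentLength = 0;

  for (const line of lines) {
    const lineLength = line.length + 1; // +1 for the newline
    // Flush before adding a line that would exceed the limit.
    if (currentLength + lineLength > maxChunkSize && currentChunk.length > 0) {
      chunks.push(currentChunk.join('\n'));
      currentChunk = [];
      currentLength = 0;
    }
    currentChunk.push(line);
    currentLength += lineLength;
  }
  if (currentChunk.length > 0) {
    chunks.push(currentChunk.join('\n'));
  }
  return chunks;
}

// With maxChunkSize = 12, the running length (line length + 1 each) reaches 11
// after "alpha" and "beta", so "gamma" triggers a flush:
// => [ 'alpha\nbeta', 'gamma\ndelta' ]
console.log(chunkByLines('alpha\nbeta\ngamma\ndelta', 12));
```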
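The progress numbers in processRepositoryAsync follow a fixed two-phase scale: walking and chunking files maps onto 0-33%, while embedding and upserting maps onto the remaining 66 points, which is why both batch-status updates compute 33 + round(fraction * 66). Restated as two small functions (a restatement of the arithmetic above, not code exported by the handler):

```typescript
// Phase 1: file processing occupies 0..33% of the reported progress.
function fileProcessingProgress(filesDone: number, totalFiles: number): number {
  return Math.round((filesDone / totalFiles) * 33);
}

// Phase 2: embedding + indexing occupies 33..99%; the completeStatus call
// at the end is what marks the run finished.
function indexingProgress(chunksDone: number, totalChunks: number): number {
  return 33 + Math.round((chunksDone / totalChunks) * 66);
}

console.log(fileProcessingProgress(50, 200)); // 8
console.log(indexingProgress(150, 300));      // 66
```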
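The batch loop's central design choice is Promise.allSettled: a failed embedding drops one chunk rather than aborting the batch, and a failed upsert drops one batch rather than the whole run. A condensed sketch of the same pattern against the @qdrant/js-client-rest API; the embed parameter and the 'documentation' collection name stand in for apiClient.getEmbeddings and COLLECTION_NAME:

```typescript
import { QdrantClient } from '@qdrant/js-client-rest';
import crypto from 'crypto';

type Chunk = { text: string; url: string; title: string };
type Point = { id: string; vector: number[]; payload: Record<string, unknown> };

// `embed` stands in for apiClient.getEmbeddings.
async function indexBatch(
  qdrant: QdrantClient,
  embed: (text: string) => Promise<number[]>,
  chunks: Chunk[],
): Promise<number> {
  const results = await Promise.allSettled(
    chunks.map(async (chunk): Promise<Point> => ({
      id: crypto.randomBytes(16).toString('hex'), // same scheme as generatePointId
      vector: await embed(chunk.text),
      payload: { ...chunk },
    })),
  );

  // Keep only the chunks whose embeddings succeeded; count the rest.
  const points = results
    .filter((r): r is PromiseFulfilledResult<Point> => r.status === 'fulfilled')
    .map((r) => r.value);
  const failed = results.length - points.length;
  if (failed > 0) console.warn(`Dropped ${failed} chunks with failed embeddings.`);

  if (points.length > 0) {
    // wait: true blocks until Qdrant has durably applied the batch.
    await qdrant.upsert('documentation', { wait: true, points });
  }
  return points.length;
}
```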
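Finally, watch_repository is the one tool above whose arguments carry an enum constraint. Toggling watching on and off (inside an async context, with client as in the first sketch) looks like this; anything other than 'start' or 'stop' should be rejected at the schema level:

```typescript
// Begin watching: the RepositoryWatcher utility is then expected to pick up
// file changes at the repository's configured watchInterval.
await client.callTool({
  name: 'watch_repository',
  arguments: { name: 'my-repo', action: 'start' },
});

// ...and later, stop watching the same repository.
await client.callTool({
  name: 'watch_repository',
  arguments: { name: 'my-repo', action: 'stop' },
});
```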