├── .nvmrc ├── docs ├── apify-logo.png ├── mcp-clients.png ├── apify-mcp-server.png ├── claude-desktop.png ├── apify_mcp_server_dark_background.png ├── apify_mcp_server_white_background.png └── apify-logo.svg ├── src ├── errors.ts ├── tsconfig.json ├── index.ts ├── prompts │ ├── index.ts │ └── latest-news-on-topic.ts ├── actor │ ├── README.md │ ├── types.ts │ ├── const.ts │ └── utils.ts ├── utils │ ├── version.ts │ ├── html-to-md.ts │ ├── mcp-clients.ts │ ├── userid-cache.ts │ ├── tool-status.ts │ ├── ajv.ts │ ├── mcp.ts │ ├── actor-details.ts │ ├── html.ts │ ├── logging.ts │ ├── tools.ts │ ├── ttl-lru.ts │ ├── actor-response.ts │ ├── schema-generation.ts │ ├── progress.ts │ ├── generic.ts │ ├── actor.ts │ └── apify-docs.ts ├── mcp │ ├── const.ts │ ├── proxy.ts │ ├── actors.ts │ ├── utils.ts │ └── client.ts ├── state.ts ├── index-internals.ts ├── main.ts ├── tools │ ├── run_collection.ts │ ├── key_value_store_collection.ts │ ├── dataset_collection.ts │ ├── helpers.ts │ ├── index.ts │ ├── fetch-actor-details.ts │ ├── build.ts │ ├── run.ts │ ├── fetch-apify-docs.ts │ ├── key_value_store.ts │ ├── search-apify-docs.ts │ └── get-html-skeleton.ts ├── apify-client.ts ├── telemetry.ts └── input.ts ├── glama.json ├── .npmignore ├── .env.example ├── tsconfig.eslint.json ├── .dockerignore ├── tsconfig.json ├── tests ├── integration │ ├── stdio.test.ts │ ├── utils │ │ └── port.ts │ ├── actor.server-sse.test.ts │ ├── actor.server-streamable.test.ts │ └── internals.test.ts ├── const.ts ├── README.md └── unit │ ├── tools.actor.test.ts │ ├── utils.tool-status.test.ts │ ├── utils.ttl-lru.test.ts │ ├── mcp.utils.test.ts │ ├── utils.progress.test.ts │ ├── mcp.actors.test.ts │ ├── schema-generation.test.ts │ ├── telemetry.test.ts │ ├── utils.html.test.ts │ └── utils.actor.test.ts ├── vitest.config.ts ├── evals ├── tsconfig.json ├── eval-single.ts └── create-dataset.ts ├── .actor ├── actor.json ├── Dockerfile └── input_schema.json ├── smithery.yaml ├── .gitignore ├── Dockerfile ├── 
/**
 * Error subclass whose `name` property is fixed to the literal `'TimeoutError'`,
 * so it can be told apart from a plain `Error` by name or by `instanceof`.
 *
 * NOTE(review): presumably thrown when an operation exceeds its time limit — confirm against call sites.
 */
export class TimeoutError extends Error {
    override readonly name: 'TimeoutError';

    constructor(...errorArgs: ConstructorParameters<typeof Error>) {
        super(...errorArgs);
        // Assigned after super() so it shadows the default 'Error' name.
        this.name = 'TimeoutError';
    }
}
} 5 | -------------------------------------------------------------------------------- /docs/apify_mcp_server_dark_background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/apify-mcp-server/HEAD/docs/apify_mcp_server_dark_background.png -------------------------------------------------------------------------------- /docs/apify_mcp_server_white_background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apify/apify-mcp-server/HEAD/docs/apify_mcp_server_white_background.png -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # .npmignore 2 | # Exclude everything by default 3 | * 4 | 5 | # Include specific files and folders 6 | !dist/ 7 | !README.md 8 | !LICENSE 9 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | APIFY_TOKEN= 2 | 3 | # EVALS 4 | PHOENIX_API_KEY= 5 | PHOENIX_HOST= 6 | 7 | OPENROUTER_API_KEY= 8 | OPENROUTER_BASE_URL=https://openrouter.ai/api/v1 9 | -------------------------------------------------------------------------------- /tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "./tsconfig.json", 3 | "include": [ 4 | "evals", 5 | "src", 6 | "test", 7 | "tests", 8 | "vitest.config.ts" 9 | ], 10 | } 11 | -------------------------------------------------------------------------------- /src/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.json", 3 | "compilerOptions": { 4 | "rootDir": "./", 5 | "outDir": "../dist", 6 | "noEmit": false, 7 | }, 8 | "include": [ 9 | "./**/*" 10 | ] 11 | } 12 | 
-------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # configurations 2 | .idea 3 | 4 | # crawlee and apify storage folders 5 | apify_storage 6 | crawlee_storage 7 | storage 8 | 9 | # installed files 10 | node_modules 11 | 12 | # git folder 13 | .git 14 | 15 | # data 16 | data 17 | src/storage 18 | dist 19 | .env 20 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | /* 2 | This file provides essential functions and tools for MCP servers, serving as a library. 3 | The ActorsMcpServer should be the only class exported from the package 4 | */ 5 | 6 | import { ActorsMcpServer } from './mcp/server.js'; 7 | 8 | export { ActorsMcpServer }; 9 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@apify/tsconfig", 3 | "compilerOptions": { 4 | "module": "ES2022", 5 | "skipLibCheck": true, 6 | "noEmit": true, 7 | }, 8 | "include": [ 9 | "src/**/*", 10 | "tests/**/*" 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /src/prompts/index.ts: -------------------------------------------------------------------------------- 1 | import type { PromptBase } from '../types.js'; 2 | import { latestNewsOnTopicPrompt } from './latest-news-on-topic.js'; 3 | 4 | /** 5 | * List of all enabled prompts. 
/**
 * Vitest configuration for the repository's test suite.
 */
export default defineConfig({
    test: {
        // Make the Vitest API (describe/it/expect) available without explicit imports.
        globals: true,
        // Run tests in a plain Node.js environment (no DOM emulation).
        environment: 'node',
        // Only files ending in `.test.ts` under `tests/` are collected.
        include: ['tests/**/*.test.ts'],
        // Per-test timeout: 120 000 ms (2 minutes).
        testTimeout: 120_000,
    },
});
/** Maximum length of a tool name, in characters. */
export const MAX_TOOL_NAME_LENGTH = 64;
/**
 * Length of a server ID, in characters.
 * NOTE(review): presumably the length of the ID derived from an MCP server URL — confirm against its users.
 */
export const SERVER_ID_LENGTH = 8;
/** Timeout for a call to an external tool, in milliseconds (2 minutes). */
export const EXTERNAL_TOOL_CALL_TIMEOUT_MSEC = 120_000; // 2 minutes
/** Timeout for connecting to an Actorized MCP server, in milliseconds (30 seconds). */
export const ACTORIZED_MCP_CONNECTION_TIMEOUT_MSEC = 30_000; // 30 seconds

/**
 * Maps a logging level name to its numeric severity, ascending from
 * `debug` (0, least severe) to `emergency` (7, most severe).
 *
 * `Record` needs explicit key and value type arguments; the bare `Record`
 * form does not type-check.
 */
export const LOG_LEVEL_MAP: Record<string, number> = {
    debug: 0,
    info: 1,
    notice: 2,
    warning: 3,
    error: 4,
    critical: 5,
    alert: 6,
    emergency: 7,
};
/**
 * Metadata describing an Actor run that is still in progress.
 *
 * The shape is fixed to a running run: `status` is always the literal `'RUNNING'`
 * and `finishedAt` is always `null`. All other scalar fields are optional strings.
 */
export type ActorRunData = {
    id?: string;
    actId?: string;
    userId?: string;
    startedAt?: string;
    // Always null: this type does not model a finished run.
    finishedAt: null;
    // Always 'RUNNING': no other status is representable.
    status: 'RUNNING';
    meta: {
        origin?: string;
    };
    options: {
        build?: string;
        // NOTE(review): typed as string rather than number — presumably because the value is read
        // from an environment variable without conversion; confirm against the producer.
        memoryMbytes?: string;
    };
    buildId?: string;
    defaultKeyValueStoreId?: string;
    defaultDatasetId?: string;
    defaultRequestQueueId?: string;
    buildNumber?: string;
    containerUrl?: string;
    standbyUrl?: string;
};
/**
 * Finds an available TCP port by letting the OS assign one dynamically (listening on port 0).
 * This avoids "address already in use" errors that make tests flaky.
 *
 * The temporary server is closed before the promise settles, so the returned port is free
 * at that moment (another process may still claim it afterwards).
 *
 * @returns A promise resolving to the port number assigned by the OS.
 * @throws Rejects if the temporary server fails to listen, or if it does not report a TCP address.
 */
export async function getAvailablePort(): Promise<number> {
    return new Promise<number>((resolve, reject) => {
        const server = createServer();
        // Register the error handler before listen() so a bind failure always rejects the promise.
        server.on('error', reject);
        server.listen(0, () => {
            const address = server.address();
            // address() is null when not listening and a string for pipe/UNIX sockets; neither carries a port.
            if (address === null || typeof address === 'string') {
                server.close(() => reject(new Error('Unable to determine the port assigned by the OS')));
                return;
            }
            const { port } = address;
            server.close(() => resolve(port));
        });
    });
}
Opencode client 30 | opencode.json 31 | 32 | # Python cache files 33 | __pycache__/ 34 | *.pyc 35 | *.pyo 36 | *.pyd 37 | .Python 38 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Stage 1: Build the project 2 | FROM node:24-alpine AS builder 3 | 4 | # Set working directory 5 | WORKDIR /app 6 | 7 | # Copy package files and install dependencies 8 | COPY package.json package-lock.json ./ 9 | RUN npm install 10 | 11 | # Copy source files 12 | COPY src ./src 13 | COPY tsconfig.json ./ 14 | 15 | # Build the project 16 | RUN npm run build 17 | 18 | # Stage 2: Set up the runtime environment 19 | FROM node:24-alpine 20 | 21 | # Set working directory 22 | WORKDIR /app 23 | 24 | # Copy only the necessary files from the build stage 25 | COPY --from=builder /app/dist ./dist 26 | COPY package.json package-lock.json ./ 27 | 28 | # Install production dependencies only 29 | RUN npm ci --omit=dev 30 | 31 | # Set the entry point for the container 32 | ENTRYPOINT ["node", "dist/stdio.js"] 33 | -------------------------------------------------------------------------------- /src/actor/const.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Constants for the Actor. 
3 | */ 4 | export const HEADER_READINESS_PROBE = 'x-apify-container-server-readiness-probe'; 5 | 6 | export enum TransportType { 7 | HTTP = 'HTTP', 8 | SSE = 'SSE', 9 | } 10 | 11 | export enum Routes { 12 | ROOT = '/', 13 | MCP = '/mcp', 14 | SSE = '/sse', 15 | MESSAGE = '/message', 16 | } 17 | 18 | export const getHelpMessage = (host: string) => `To interact with the server you can either: 19 | - send request to ${host}${Routes.MCP}?token=YOUR-APIFY-TOKEN and receive a response 20 | or 21 | - connect for Server-Sent Events (SSE) via GET request to: ${host}${Routes.SSE}?token=YOUR-APIFY-TOKEN 22 | - send messages via POST request to: ${host}${Routes.MESSAGE}?token=YOUR-APIFY-TOKEN 23 | (Include your message content in the request body.)`; 24 | -------------------------------------------------------------------------------- /server.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://static.modelcontextprotocol.io/schemas/2025-07-09/server.schema.json", 3 | "name": "com.apify/apify-mcp-server", 4 | "description": "Extract data from any website with thousands of scrapers, crawlers, and automations on Apify Store ⚡", 5 | "status": "active", 6 | "repository": { 7 | "url": "https://github.com/apify/apify-mcp-server", 8 | "source": "github" 9 | }, 10 | "version": "0.6.5", 11 | "remotes": [ 12 | { 13 | "type": "streamable-http", 14 | "url": "https://mcp.apify.com/", 15 | "headers": [ 16 | { 17 | "name": "Authorization", 18 | "description": "Apify API token for authentication with Apify platform services. For example 'Bearer '", 19 | "is_required": true, 20 | "is_secret": true 21 | } 22 | ] 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | This directory contains **unit** and **integration** tests for the `actors-mcp-server` project. 
const turndown = new TurndownService();

/**
 * Tags that are dropped from the output together with their content.
 * They are registered with Turndown one by one, in the order listed here.
 */
const REMOVED_TAGS = [
    // Non-visible elements
    'script',
    'style',
    'noscript',
    // Multimedia elements
    'svg',
    'img',
    'figure',
    'video',
    'audio',
    'picture',
    // Interactive elements
    'canvas',
    'button',
    'select',
    'input',
    // Embedded content
    'iframe',
    'embed',
    'object',
    // Navigation and footer elements
    'aside',
    'nav',
    'footer',
] as const;

for (const tagName of REMOVED_TAGS) {
    turndown.remove(tagName);
}

/**
 * Converts an HTML string to Markdown with the shared Turndown instance configured above.
 *
 * @param html - HTML markup to convert.
 * @returns The Markdown produced by Turndown, without the removed elements.
 */
export function htmlToMarkdown(html: string): string {
    const markdown = turndown.turndown(html);
    return markdown;
}
/**
 * Determines whether the MCP client supports dynamic tools (tool list change notifications),
 * based on the data from its InitializeRequest.
 *
 * The client is looked up by name in the `mcpClients` registry. The registry entry is trusted
 * only when its protocol version is exactly the one the client sent in the request.
 *
 * @param initializeRequestData - The client's InitializeRequest, if one is available.
 * @returns `true` only when the client is known, the protocol versions match exactly, and the
 *          registry entry declares `tools.listChanged === true`; `false` otherwise.
 */
export function doesMcpClientSupportDynamicTools(initializeRequestData?: InitializeRequest): boolean {
    const clientName = initializeRequestData?.params?.clientInfo?.name;
    // No request or no client name: nothing to look up.
    if (!clientName) return false;

    const knownCapabilities = mcpClients[clientName];
    if (!knownCapabilities) return false;

    // Version recorded in the registry for this client.
    const knownProtocolVersion = knownCapabilities.protocolVersion;
    // Version the client actually sent in its InitializeRequest.
    const clientProtocolVersion = initializeRequestData?.params?.protocolVersion;

    // Compare the protocol versions to check that the registry entry is up to date for this client.
    // Strict equality is required: if the versions differ, the recorded capabilities may not apply.
    if (clientProtocolVersion !== knownProtocolVersion) {
        return false;
    }

    return knownCapabilities.tools?.listChanged === true;
}
/**
 * Lists the tools exposed by an MCP server and wraps each one as a proxied `actor-mcp` tool entry.
 *
 * @param actorID - Actor ID stored on every returned entry as `actorId`.
 * @param client - MCP client whose `listTools()` is called to discover the tools.
 * @param serverUrl - URL of the MCP server; used to derive the server ID and the proxied tool names.
 * @returns One entry per tool reported by the server, each carrying a compiled validator for its input schema.
 */
export async function getMCPServerTools(
    actorID: string,
    client: Client,
    // URL of the MCP server
    serverUrl: string,
): Promise<ToolEntry[]> {
    const { tools } = await client.listTools();

    return tools.map((tool): ActorMcpTool => ({
        type: 'actor-mcp',
        actorId: actorID,
        serverId: getMCPServerID(serverUrl),
        serverUrl,
        // Keep the original name so the call can be forwarded to the server unchanged.
        originToolName: tool.name,

        name: getProxyMCPServerToolName(serverUrl, tool.name),
        description: tool.description || '',
        inputSchema: tool.inputSchema,
        ajvValidate: fixedAjvCompile(ajv, tool.inputSchema),
    }));
}
// Module-level caches. Each one is a TTLLRUCache constructed from a pair of constants
// imported from const.js: a maximum size and a TTL in seconds.
//
// NOTE(review): no explicit type arguments are visible on the constructors below, while
// `ActorDefinitionWithInfo` and `ApifyDocsSearchResult` are imported as types and not used
// elsewhere in this file — presumably they are the intended value types of the first two
// caches; confirm and state them explicitly.

/** Cache for pruned Actor definitions (NOTE(review): key semantics are not visible here — confirm). */
export const actorDefinitionPrunedCache = new TTLLRUCache(ACTOR_CACHE_MAX_SIZE, ACTOR_CACHE_TTL_SECS);
/** Cache for Apify docs search results; shares its size and TTL constants with `fetchApifyDocsCache`. */
export const searchApifyDocsCache = new TTLLRUCache(APIFY_DOCS_CACHE_MAX_SIZE, APIFY_DOCS_CACHE_TTL_SECS);
/** Stores processed Markdown content */
export const fetchApifyDocsCache = new TTLLRUCache(APIFY_DOCS_CACHE_MAX_SIZE, APIFY_DOCS_CACHE_TTL_SECS);
/** Stores HTML content per URL so we can paginate the tool output */
export const getHtmlSkeletonCache = new TTLLRUCache(GET_HTML_SKELETON_CACHE_MAX_SIZE, GET_HTML_SKELETON_CACHE_TTL_SECS);
/**
 * Stores MCP server resolution per actor:
 * - false: not an MCP server
 * - string: MCP server URL
 */
export const mcpServerCache = new TTLLRUCache(MCP_SERVER_CACHE_MAX_SIZE, MCP_SERVER_CACHE_TTL_SECS);
# EditorConfig is a file format and collection of text editor plugins
# for maintaining consistent coding styles between different editors and IDEs
# See https://editorconfig.org for more information

root = true

# Apply to all files
[*]
indent_style = space
indent_size = 4
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true
end_of_line = lf
# Maximum line length (160 characters)
# Note: editorconfig-tools is unable to ignore long strings or URLs, so this is informational
# ESLint will enforce this limit with its max-len rule
max_line_length = 160

# IntelliJ IDEA / WebStorm specific settings
# These settings configure code formatting behavior in JetBrains IDEs
# They ensure consistent formatting when using IDE auto-format features
# - Adds spaces within TypeScript import braces: import { a, b } instead of import {a,b}
ij_typescript_spaces_within_imports = true
# - Adds spaces within JavaScript import braces: import { a, b } instead of import {a,b}
ij_javascript_spaces_within_imports = true
# - Adds spaces within TypeScript union types: string | number instead of string|number
ij_typescript_spaces_within_union_types = true

# YAML files use 2-space indentation (YAML standard)
# Note: whitespace inside the braces is significant, so the alternatives must not be
# separated by ", " (that would make the second alternative " yml" and never match).
[*.{yaml,yml}]
indent_size = 2
unauthEnabledToolCategories, 33 | unauthEnabledTools, 34 | parseCommaSeparatedList, 35 | parseQueryParamList, 36 | }; 37 | -------------------------------------------------------------------------------- /res/INDEX.md: -------------------------------------------------------------------------------- 1 | # Resources Directory Index 2 | 3 | This directory contains useful documents and insights about the repository architecture, design decisions, and implementation details that don't belong in code comments or JSDoc. 4 | 5 | ## Files 6 | 7 | ### [ALGOLIA.md](./ALGOLIA.md) 8 | Technical analysis of Algolia search API responses for each documentation source. 9 | - Data structure overview for each doc source (apify, crawlee-js, crawlee-py) 10 | - Field availability patterns (content, hierarchy, anchors) 11 | - Example response payloads 12 | - Recommendations for response processing logic 13 | - **Use case**: Understand what data is actually returned by Algolia to inform simplification decisions 14 | 15 | --- 16 | 17 | ## Purpose 18 | 19 | Resources in this directory serve as: 20 | - **Technical references** for complex subsystems (e.g., Algolia integration) 21 | - **Decision documentation** explaining why certain approaches were chosen 22 | - **Data analysis** for optimization and refactoring efforts 23 | - **Integration guides** for external services and APIs 24 | 25 | ## Guidelines 26 | 27 | - Keep documents **short and technical** - avoid duplicating code logic 28 | - Focus on **insights and patterns** rather than implementation details 29 | - Use **tables, examples, and structured data** for clarity 30 | - Link to relevant source files when explaining code flow 31 | - Update when making significant changes to documented systems 32 | -------------------------------------------------------------------------------- /tests/integration/actor.server-sse.test.ts: -------------------------------------------------------------------------------- 1 | import type { Server as 
HttpServer } from 'node:http'; 2 | 3 | import type { Express } from 'express'; 4 | 5 | import log from '@apify/log'; 6 | 7 | import { createExpressApp } from '../../src/actor/server.js'; 8 | import { createMcpSseClient } from '../helpers.js'; 9 | import { createIntegrationTestsSuite } from './suite.js'; 10 | import { getAvailablePort } from './utils/port.js'; 11 | 12 | let app: Express; 13 | let httpServer: HttpServer; 14 | let httpServerPort: number; 15 | let httpServerHost: string; 16 | let mcpUrl: string; 17 | 18 | createIntegrationTestsSuite({ 19 | suiteName: 'Apify MCP Server SSE', 20 | transport: 'sse', 21 | createClientFn: async (options) => await createMcpSseClient(mcpUrl, options), 22 | beforeAllFn: async () => { 23 | log.setLevel(log.LEVELS.OFF); 24 | 25 | // Get an available port 26 | httpServerPort = await getAvailablePort(); 27 | httpServerHost = `http://localhost:${httpServerPort}`; 28 | mcpUrl = `${httpServerHost}/sse`; 29 | 30 | // Create an express app 31 | app = createExpressApp(httpServerHost); 32 | 33 | // Start a test server 34 | await new Promise((resolve) => { 35 | httpServer = app.listen(httpServerPort, () => resolve()); 36 | }); 37 | }, 38 | afterAllFn: async () => { 39 | await new Promise((resolve) => { 40 | httpServer.close(() => resolve()); 41 | }); 42 | }, 43 | }); 44 | -------------------------------------------------------------------------------- /tests/integration/actor.server-streamable.test.ts: -------------------------------------------------------------------------------- 1 | import type { Server as HttpServer } from 'node:http'; 2 | 3 | import type { Express } from 'express'; 4 | 5 | import log from '@apify/log'; 6 | 7 | import { createExpressApp } from '../../src/actor/server.js'; 8 | import { createMcpStreamableClient } from '../helpers.js'; 9 | import { createIntegrationTestsSuite } from './suite.js'; 10 | import { getAvailablePort } from './utils/port.js'; 11 | 12 | let app: Express; 13 | let httpServer: HttpServer; 14 | 
let httpServerPort: number; 15 | let httpServerHost: string; 16 | let mcpUrl: string; 17 | 18 | createIntegrationTestsSuite({ 19 | suiteName: 'Apify MCP Server Streamable HTTP', 20 | transport: 'streamable-http', 21 | createClientFn: async (options) => await createMcpStreamableClient(mcpUrl, options), 22 | beforeAllFn: async () => { 23 | log.setLevel(log.LEVELS.OFF); 24 | 25 | // Get an available port 26 | httpServerPort = await getAvailablePort(); 27 | httpServerHost = `http://localhost:${httpServerPort}`; 28 | mcpUrl = `${httpServerHost}/mcp`; 29 | 30 | // Create an express app 31 | app = createExpressApp(httpServerHost); 32 | 33 | // Start a test server 34 | await new Promise((resolve) => { 35 | httpServer = app.listen(httpServerPort, () => resolve()); 36 | }); 37 | }, 38 | afterAllFn: async () => { 39 | await new Promise((resolve) => { 40 | httpServer.close(() => resolve()); 41 | }); 42 | }, 43 | }); 44 | -------------------------------------------------------------------------------- /tests/unit/utils.tool-status.test.ts: -------------------------------------------------------------------------------- 1 | import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js'; 2 | import { describe, expect, it } from 'vitest'; 3 | 4 | import { TOOL_STATUS } from '../../src/const.js'; 5 | import { getToolStatusFromError } from '../../src/utils/tool-status.js'; 6 | 7 | describe('getToolStatusFromError', () => { 8 | it('returns aborted when isAborted is true', () => { 9 | const status = getToolStatusFromError(new Error('any'), true); 10 | expect(status).toBe(TOOL_STATUS.ABORTED); 11 | }); 12 | 13 | it('classifies HTTP 4xx errors as soft_fail', () => { 14 | const error = Object.assign(new Error('Bad Request'), { statusCode: 400 }); 15 | const status = getToolStatusFromError(error, false); 16 | expect(status).toBe(TOOL_STATUS.SOFT_FAIL); 17 | }); 18 | 19 | it('classifies HTTP 5xx errors as failed', () => { 20 | const error = Object.assign(new Error('Internal 
Error'), { statusCode: 500 }); 21 | const status = getToolStatusFromError(error, false); 22 | expect(status).toBe(TOOL_STATUS.FAILED); 23 | }); 24 | 25 | it('classifies McpError InvalidParams as soft_fail', () => { 26 | const error = new McpError(ErrorCode.InvalidParams, 'invalid', undefined); 27 | const status = getToolStatusFromError(error, false); 28 | expect(status).toBe(TOOL_STATUS.SOFT_FAIL); 29 | }); 30 | 31 | it('classifies unknown errors without status code as failed', () => { 32 | const status = getToolStatusFromError(new Error('unknown'), false); 33 | expect(status).toBe(TOOL_STATUS.FAILED); 34 | }); 35 | }); 36 | -------------------------------------------------------------------------------- /src/utils/ajv.ts: -------------------------------------------------------------------------------- 1 | import type { ValidateFunction } from 'ajv'; 2 | import Ajv from 'ajv'; 3 | 4 | export const ajv = new Ajv({ coerceTypes: 'array', strict: false }); 5 | 6 | /** 7 | * Removes the $schema property and fixes the required array from a JSON schema. 8 | * The z.toJSONSchema() function in Zod 4.x has two issues: 9 | * 1. Includes a $schema reference that can cause issues when compiling with AJV 10 | * 2. Incorrectly marks fields with default values as required 11 | * 12 | * This function fixes both issues to ensure proper schema validation. 
/**
 * Removes the $schema property and fixes the required array from a JSON schema.
 * The z.toJSONSchema() function in Zod 4.x has two issues:
 * 1. Includes a $schema reference that can cause issues when compiling with AJV
 * 2. Incorrectly marks fields with default values as required
 *
 * The input is not mutated: a shallow copy is returned (nested property schemas are shared,
 * not cloned). The `required` array is only rewritten when both `required` is an array and
 * `properties` is a non-null object; otherwise it is left as-is.
 *
 * @param schema - JSON schema object, e.g. the output of z.toJSONSchema().
 * @returns A shallow copy without `$schema` and with defaulted fields removed from `required`.
 */
function cleanJsonSchema(schema: Record<string, unknown>): Record<string, unknown> {
    const cleaned = { ...schema };
    delete cleaned.$schema;

    // Fix the required array: remove fields that have default values
    if (Array.isArray(cleaned.required) && typeof cleaned.properties === 'object' && cleaned.properties !== null) {
        const properties = cleaned.properties as Record<string, unknown>;
        cleaned.required = (cleaned.required as string[]).filter((fieldName) => {
            const fieldSchema = properties[fieldName];
            // Only include in required if the field doesn't have a default value
            return !(typeof fieldSchema === 'object' && fieldSchema !== null && 'default' in fieldSchema);
        });
    }

    return cleaned;
}

/**
 * Compiles a JSON schema with AJV, automatically cleaning the $schema property
 * and fixing the required array.
 * This wrapper ensures compatibility with z.toJSONSchema() output.
 *
 * @param schema - JSON schema object to compile.
 * @returns The AJV validate function for the cleaned schema.
 */
export function compileSchema(schema: Record<string, unknown>): ValidateFunction {
    return ajv.compile(cleanJsonSchema(schema));
}
/**
 * Helper to build a response for MCP from an array of text strings.
 *
 * @param options - Object containing response configuration
 * @param options.texts - Array of text strings to include in the response; each one becomes
 *                        a `{ type: 'text', text }` content item, in the same order.
 * @param options.isError - Optional flag to mark the response as an error (default: false).
 *                          This must remain MCP compliant: true for any tool-level error.
 * @param options.toolStatus - Optional internal tool status used for telemetry. When provided,
 *                             it is attached as `internalToolStatus`; the server is expected to
 *                             read it and strip it before sending the response to the MCP client.
 * @param options.structuredContent - Optional structured content of unknown type. It is attached
 *                                    whenever it is not `undefined` (so `null` is preserved).
 * @returns The response object with `content` and `isError`, plus the optional fields above.
 */
export function buildMCPResponse(options: {
    texts: string[];
    isError?: boolean;
    toolStatus?: ToolStatus;
    structuredContent?: unknown;
}) {
    const {
        texts,
        isError = false,
        toolStatus,
        structuredContent,
    } = options;

    const response: {
        content: { type: 'text'; text: string }[];
        isError: boolean;
        internalToolStatus?: ToolStatus;
        structuredContent?: unknown;
    } = {
        content: texts.map((text) => ({ type: 'text', text })),
        isError,
    };

    // Attach internal tool status for telemetry; server will read and strip it
    if (toolStatus) {
        response.internalToolStatus = toolStatus;
    }

    // Add structured content if provided
    if (structuredContent !== undefined) {
        response.structuredContent = structuredContent;
    }

    return response;
}
3 | 4 | name: MCP tool calling evaluations 5 | 6 | on: 7 | # Run evaluations on PR merges to master or PRs with 'validated' label 8 | pull_request: 9 | types: [closed, labeled] 10 | branches: 11 | - 'master' 12 | 13 | jobs: 14 | evaluations: 15 | name: MCP tool calling evaluations 16 | runs-on: ubuntu-latest 17 | # Run on PR merges to master or PRs with 'validated' label 18 | if: | 19 | (github.event.action == 'closed' && github.event.pull_request.merged == true) || 20 | (github.event.action == 'labeled' && github.event.label.name == 'validated') 21 | 22 | steps: 23 | - name: Checkout code 24 | uses: actions/checkout@v4 25 | 26 | - name: Use Node.js 27 | uses: actions/setup-node@v6 28 | with: 29 | node-version-file: '.nvmrc' 30 | cache: 'npm' 31 | cache-dependency-path: 'package-lock.json' 32 | 33 | - name: Install Node dependencies 34 | run: npm ci --force --include=dev 35 | 36 | - name: Build project 37 | run: npm run build 38 | 39 | - name: Run evaluations 40 | run: npm run evals:run 41 | env: 42 | GITHUB_PR_NUMBER: ${{ github.event.pull_request.number }} 43 | PHOENIX_API_KEY: ${{ secrets.PHOENIX_API_KEY }} 44 | PHOENIX_BASE_URL: ${{ secrets.PHOENIX_BASE_URL }} 45 | OPENROUTER_BASE_URL: ${{ secrets.OPENROUTER_BASE_URL }} 46 | OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} 47 | -------------------------------------------------------------------------------- /.actor/Dockerfile: -------------------------------------------------------------------------------- 1 | # Specify the base Docker image. You can read more about 2 | # the available images at https://docs.apify.com/sdk/js/docs/guides/docker-images 3 | # You can also use any other image from Docker Hub. 4 | FROM apify/actor-node:24 AS builder 5 | 6 | # Check preinstalled packages 7 | RUN npm ls crawlee apify puppeteer playwright 8 | 9 | # Copy just package.json and package-lock.json 10 | # to speed up the build using Docker layer cache. 11 | COPY package*.json ./ 12 | 13 | # Install all dependencies. 
Don't audit to speed up the installation. 14 | RUN npm install --include=dev --audit=false 15 | 16 | # Next, copy the source files using the user set 17 | # in the base image. 18 | COPY . ./ 19 | 20 | # Build the project. 21 | # Dependencies were already installed by the npm install step above. 22 | RUN npm run build 23 | 24 | # Create final image 25 | FROM apify/actor-node:24 26 | 27 | # Check preinstalled packages 28 | RUN npm ls crawlee apify puppeteer playwright 29 | 30 | # Copy just package.json and package-lock.json 31 | # to speed up the build using Docker layer cache. 32 | COPY package*.json ./ 33 | 34 | # Install NPM packages, skip optional and development dependencies to 35 | # keep the image small. Avoid logging too much and print the dependency 36 | # tree for debugging 37 | RUN npm --quiet set progress=false \ 38 | && npm install --omit=dev --omit=optional \ 39 | && echo "Installed NPM packages:" \ 40 | && (npm list --omit=dev --all || true) \ 41 | && echo "Node.js version:" \ 42 | && node --version \ 43 | && echo "NPM version:" \ 44 | && npm --version \ 45 | && rm -r ~/.npm 46 | 47 | # Copy built JS files from builder image 48 | COPY --from=builder /usr/src/app/dist ./dist 49 | 50 | # Next, copy the remaining files and directories with the source code. 51 | # Since we do this after NPM install, quick build will be really fast 52 | # for most source file changes. 53 | COPY . ./ 54 | 55 | 56 | # Run the image.
57 | CMD npm run start:prod --silent 58 | -------------------------------------------------------------------------------- /tests/unit/utils.ttl-lru.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from 'vitest'; 2 | 3 | import { TTLLRUCache } from '../../src/utils/ttl-lru.js'; 4 | 5 | describe('TTLLRUCache', () => { 6 | it('should set and get values before TTL expires', () => { 7 | const cache = new TTLLRUCache(2, 2); // 2 seconds TTL 8 | cache.set('a', 'valueA'); 9 | expect(cache.get('a')).toBe('valueA'); 10 | }); 11 | 12 | it('should return null after TTL expires', async () => { 13 | const cache = new TTLLRUCache(2, 1); // 1 second TTL 14 | cache.set('a', 'valueA'); 15 | await new Promise((r) => { setTimeout(r, 1100); }); 16 | expect(cache.get('a')).toBeNull(); 17 | }); 18 | 19 | it('should evict least recently used items when maxLength is exceeded', () => { 20 | const cache = new TTLLRUCache(2, 10); // Large TTL 21 | cache.set('a', 'valueA'); 22 | cache.set('b', 'valueB'); 23 | cache.set('c', 'valueC'); // Should evict 'a' 24 | expect(cache.get('a')).toBeNull(); 25 | expect(cache.get('b')).toBe('valueB'); 26 | expect(cache.get('c')).toBe('valueC'); 27 | }); 28 | 29 | it('should update value and TTL on set for existing key', async () => { 30 | const cache = new TTLLRUCache(2, 1); // 1 second TTL 31 | cache.set('a', 'valueA'); 32 | await new Promise((r) => { setTimeout(r, 700); }); 33 | cache.set('a', 'valueA2'); // Reset TTL 34 | await new Promise((r) => { setTimeout(r, 700); }); 35 | expect(cache.get('a')).toBe('valueA2'); 36 | }); 37 | 38 | it('should remove expired entry on get', async () => { 39 | const cache = new TTLLRUCache(2, 1); // 1 second TTL 40 | cache.set('a', 'valueA'); 41 | await new Promise((r) => { setTimeout(r, 1100); }); 42 | expect(cache.get('a')).toBeNull(); 43 | // Should not throw if called again 44 | expect(cache.get('a')).toBeNull(); 45 | }); 46 | }); 47 | 
import type { PromptArgument } from '@modelcontextprotocol/sdk/types.js';

import { fixedAjvCompile } from '../tools/utils.js';
import type { PromptBase } from '../types.js';
import { ajv } from '../utils/ajv.js';

/**
 * Arguments the prompt accepts, in the MCP prompt-argument format.
 */
const args: PromptArgument[] = [
    {
        name: 'topic',
        description: 'The topic to retrieve the latest news on.',
        required: true,
    },
    {
        name: 'timespan',
        description: 'The timespan for which to retrieve news articles. Defaults to "7 days". For example "1 day", "3 days", "7 days", "1 month", etc.',
        required: false,
    },
];

/**
 * AJV validator for the prompt arguments, derived from `args`:
 * every argument is a string property, and those flagged `required` are required.
 */
const argsSchema = fixedAjvCompile(ajv, {
    type: 'object',
    properties: Object.fromEntries(
        args.map(({ name, description }) => [name, { type: 'string', description }]),
    ),
    required: args.filter((arg) => arg.required).map((arg) => arg.name),
});

/**
 * The prompt definition itself.
 */
export const latestNewsOnTopicPrompt: PromptBase = {
    name: 'GetLatestNewsOnTopic',
    description: 'This prompt retrieves the latest news articles on a selected topic.',
    arguments: args,
    ajvValidate: argsSchema,
    render: (data) => {
        // Date part (YYYY-MM-DD) of the current UTC timestamp.
        const [currentDateUtc] = new Date().toISOString().split('T');
        // Missing or blank timespan falls back to the default window.
        const requestedTimespan = data.timespan;
        const timespan = requestedTimespan && requestedTimespan.trim() !== '' ? requestedTimespan : '7 days';
        return `I want you to use the RAG web browser to search the web for the latest news on the "${data.topic}" topic. Retrieve news from the last ${timespan}. The RAG web browser accepts a query parameter that supports all Google input, including filters and flags—be sure to use them to accomplish my goal. Today is ${currentDateUtc} UTC.`;
    },
};
import type { Actor, Build } from 'apify-client';

import type { ApifyClient } from '../apify-client.js';
import { filterSchemaProperties, shortenProperties } from '../tools/utils.js';
import type { ActorInputSchema, StructuredActorCard } from '../types.js';
import { formatActorToActorCard, formatActorToStructuredCard } from './actor-card.js';
import { logHttpError } from './logging.js';

// Keep the type here since it is a self-contained module
export type ActorDetailsResult = {
    actorInfo: Actor;
    buildInfo: Build;
    actorCard: string;
    actorCardStructured: StructuredActorCard;
    inputSchema: ActorInputSchema;
    readme: string;
};

/**
 * Fetches an Actor and its default build and assembles the details derived from them.
 *
 * The Actor record and the default build are requested in parallel. The input schema comes
 * from the build's actor definition (an empty object schema is used when the build has none),
 * and its properties are passed through `filterSchemaProperties` and then `shortenProperties`.
 *
 * @param apifyClient - Client used to query the Actor and its default build.
 * @param actorName - Identifier handed to `apifyClient.actor()`.
 * @returns The collected details, or `null` when the Actor, the build or the build's actor
 *          definition is missing, or when any step throws (the error is reported through
 *          `logHttpError` and not rethrown).
 */
export async function fetchActorDetails(apifyClient: ApifyClient, actorName: string): Promise<ActorDetailsResult | null> {
    try {
        const [actorInfo, buildInfo]: [Actor | undefined, Build | undefined] = await Promise.all([
            apifyClient.actor(actorName).get(),
            apifyClient.actor(actorName).defaultBuild().then(async (build) => build.get()),
        ]);
        if (!actorInfo || !buildInfo || !buildInfo.actorDefinition) return null;
        // NOTE: when the build provides an input schema, `inputSchema` is that same object
        // (no copy is made), so the assignments below also change `buildInfo.actorDefinition.input`.
        const inputSchema = (buildInfo.actorDefinition.input || {
            type: 'object',
            properties: {},
        }) as ActorInputSchema;
        inputSchema.properties = filterSchemaProperties(inputSchema.properties);
        inputSchema.properties = shortenProperties(inputSchema.properties);
        const actorCard = formatActorToActorCard(actorInfo);
        const actorCardStructured = formatActorToStructuredCard(actorInfo);
        return {
            actorInfo,
            buildInfo,
            actorCard,
            actorCardStructured,
            inputSchema,
            readme: buildInfo.actorDefinition.readme || 'No README provided.',
        };
    } catch (error) {
        logHttpError(error, `Failed to fetch actor details for '${actorName}'`, { actorName });
        return null;
    }
}
"F401", # Unused imports 59 | ] 60 | 61 | [tool.ruff.lint.flake8-quotes] 62 | docstring-quotes = "double" 63 | inline-quotes = "single" 64 | 65 | [tool.ruff.lint.flake8-type-checking] 66 | runtime-evaluated-base-classes = [ 67 | "pydantic.BaseModel", 68 | "pydantic_settings.BaseSettings", 69 | ] 70 | 71 | [tool.ruff.lint.flake8-builtins] 72 | builtins-ignorelist = ["id"] 73 | -------------------------------------------------------------------------------- /src/utils/html.ts: -------------------------------------------------------------------------------- 1 | import * as cheerio from 'cheerio'; 2 | 3 | type CheerioElementLike = { 4 | attribs: Record; 5 | tagName: string; 6 | }; 7 | 8 | type NodeLike = { 9 | type: string; 10 | }; 11 | 12 | /** 13 | * Strips HTML and keeps only the structure. 14 | * 15 | * Removes styles, scripts, and other non-content elements. 16 | * Collapses whitespace and trims the result. 17 | * Keeps only href, src, alt, id, class, title, name, data-* attributes. 18 | * Removes HTML comments and spaces between tags. 19 | * Removes base64 encoded images. 20 | */ 21 | export function stripHtml(html: string): string { 22 | const $ = cheerio.load(html); 23 | 24 | // Remove all attributes except href (only on a), src, alt, id, class, title, name, data-* 25 | const allowedAttrs = ['href', 'src', 'alt', 'id', 'class', 'title', 'name']; 26 | $('*').each((_, element) => { 27 | const { attribs } = (element as CheerioElementLike); 28 | if (attribs) { 29 | Object.keys(attribs).forEach((attr) => { 30 | if (attr === 'href' && (element as CheerioElementLike).tagName !== 'a') { 31 | $(element).removeAttr(attr); 32 | } else if (!allowedAttrs.includes(attr) && !attr.startsWith('data-')) { 33 | $(element).removeAttr(attr); 34 | } 35 | }); 36 | } 37 | }); 38 | 39 | // Remove

Content

'; 26 | const expected = '

Content

'; 27 | expect(stripHtml(input)).toBe(expected); 28 | }); 29 | 30 | it('should remove noscript, iframe, svg, canvas, math tags', () => { 31 | const input = '

Text

'; 32 | const expected = '

Text

'; 33 | expect(stripHtml(input)).toBe(expected); 34 | }); 35 | 36 | it('should remove HTML comments', () => { 37 | const input = '

Content

'; 38 | const expected = '

Content

'; 39 | expect(stripHtml(input)).toBe(expected); 40 | }); 41 | 42 | it('should remove base64 encoded images', () => { 43 | const input = '

Text

'; 45 | const expected = '

Text

'; 46 | expect(stripHtml(input)).toBe(expected); 47 | }); 48 | 49 | it('should keep regular images with http src', () => { 50 | const input = 'Image'; 51 | const expected = 'Image'; 52 | expect(stripHtml(input)).toBe(expected); 53 | }); 54 | 55 | it('should collapse multiple spaces and remove spaces between tags', () => { 56 | const input = '

Text

'; 57 | const expected = '

Text

'; 58 | expect(stripHtml(input)).toBe(expected); 59 | }); 60 | 61 | it('should trim the result', () => { 62 | const input = '
Content
'; 63 | const expected = '
Content
'; 64 | expect(stripHtml(input)).toBe(expected); 65 | }); 66 | 67 | it('should handle empty string', () => { 68 | expect(stripHtml('')).toBe(''); 69 | }); 70 | 71 | it('should handle plain text', () => { 72 | const input = 'Just plain text'; 73 | expect(stripHtml(input)).toBe('Just plain text'); 74 | }); 75 | 76 | it('should handle malformed HTML', () => { 77 | const input = '

Unclosed tag'; 78 | const expected = '

Unclosed tag

/**
 * Splits a comma-separated string into its trimmed, non-empty parts.
 *
 * @param input - The comma-separated string to split. A missing or empty value yields `[]`.
 * @returns The trimmed parts in their original order, with blank parts dropped.
 * @example
 * parseCommaSeparatedList("a, b, c"); // ["a", "b", "c"]
 * parseCommaSeparatedList("a, , b"); // ["a", "b"]
 */
export function parseCommaSeparatedList(input?: string): string[] {
    if (!input) return [];

    const parts: string[] = [];
    for (const rawPart of input.split(',')) {
        const part = rawPart.trim();
        if (part.length > 0) parts.push(part);
    }
    return parts;
}

/**
 * Normalizes a query parameter, given either as one string or as an array of strings,
 * into a flat list. Every string may itself be comma-separated; blank values are dropped.
 *
 * @param param - A query parameter that can be a string, array of strings, or undefined
 * @returns A flat array of trimmed, non-empty strings
 * @example
 * parseQueryParamList("a,b,c"); // ["a", "b", "c"]
 * parseQueryParamList(["a", "b"]); // ["a", "b"]
 * parseQueryParamList(undefined); // []
 */
export function parseQueryParamList(param?: string | string[]): string[] {
    if (!param) return [];

    // Treat a single string as a one-element list so both shapes share one code path.
    const values = Array.isArray(param) ? param : [param];
    return values.flatMap((value) => parseCommaSeparatedList(value));
}
/**
 * Recursively gets the value in a nested object for each key in the keys array.
 * Each key can be a dot-separated path (e.g. 'a.b.c').
 * Returns an object mapping each key to its resolved value (or undefined if not found).
 *
 * Only own properties are followed: a path segment that is missing, inherited, or reached
 * through a non-object value resolves the whole key to `undefined`.
 *
 * @param obj - The object to read from.
 * @param keys - Dot-separated paths to resolve.
 * @returns An object with one entry per requested key.
 * @example
 * const obj = { a: { b: { c: 42 } }, nested: { d: 100 } };
 * const value = getValuesByDotKeys(obj, ['a.b.c', 'a.b.d', 'nested']);
 * value; // { 'a.b.c': 42, 'a.b.d': undefined, 'nested': { d: 100 } }
 */
export function getValuesByDotKeys(obj: Record<string, unknown>, keys: string[]): Record<string, unknown> {
    const result: Record<string, unknown> = {};
    for (const key of keys) {
        let current: unknown = obj;
        for (const segment of key.split('.')) {
            if (
                current !== null
                && typeof current === 'object'
                && Object.prototype.hasOwnProperty.call(current, segment)
            ) {
                // Use index signature to avoid 'any' and type errors
                current = (current as Record<string, unknown>)[segment];
            } else {
                current = undefined;
                break;
            }
        }
        result[key] = current;
    }
    return result;
}

/**
 * Validates whether a given string is a well-formed URL.
 *
 * Allows only valid HTTP or HTTPS URLs: the string must start with the exact
 * (case-sensitive) prefix `http://` or `https://` and must be parseable by `URL`.
 *
 * @param urlString - The string to check.
 * @returns `true` when the string has the required prefix and parses as a URL.
 */
export function isValidHttpUrl(urlString: string): boolean {
    if (!urlString.startsWith('http://') && !urlString.startsWith('https://')) {
        return false;
    }
    try {
        // The constructor is called only for its validation side effect (it throws on bad input).
        // eslint-disable-next-line no-new
        new URL(urlString);
        return true;
    } catch {
        return false;
    }
}
95 | */ 96 | export function parseBooleanFromString(value: string | boolean | undefined | null): boolean | undefined { 97 | // If already a boolean, return it directly 98 | if (typeof value === 'boolean') { 99 | return value; 100 | } 101 | // Handle undefined/null 102 | if (value === undefined || value === null) { 103 | return undefined; 104 | } 105 | // Handle empty string (after trim) 106 | const normalized = value.toLowerCase().trim(); 107 | if (normalized === '') { 108 | return undefined; 109 | } 110 | if (normalized === 'true' || normalized === '1') { 111 | return true; 112 | } 113 | if (normalized === 'false' || normalized === '0') { 114 | return false; 115 | } 116 | return undefined; 117 | } 118 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | /** 2 | * ESLint Configuration 3 | * 4 | * This configuration follows the apify-core style and uses the shared @apify/eslint-config package. 5 | * It follows the shared config as much as possible, only adding project-specific overrides where necessary. 
6 | * 7 | * The shared config provides: 8 | * - Import ordering via simple-import-sort/imports (groups: side effects, node:, external, @apify/, @apify-packages/, relative) 9 | * - max-len rule (160 chars, ignores URLs and template literals) 10 | * - TypeScript-specific rules and best practices 11 | * 12 | * Project-specific overrides: 13 | * - import/no-extraneous-dependencies: Adds vitest.config.ts and evals/** patterns 14 | * - @typescript-eslint/consistent-type-definitions: Prefers 'type' over 'interface' (use interface only for class implementations) 15 | * - @typescript-eslint/no-unused-vars: Detects unused variables, functions, and parameters (allows _ prefix) 16 | * - import/no-default-export: Allows default exports in config files 17 | */ 18 | import apifyTypeScriptConfig from '@apify/eslint-config/ts.js'; 19 | 20 | export default [ 21 | { 22 | // Ignores must be defined first in flat config format 23 | // These directories/files are excluded from linting 24 | ignores: [ 25 | '**/dist', // Build output directory 26 | '**/.venv', // Python virtual environment (if present) 27 | 'evals/**', // Evaluation scripts directory 28 | ], 29 | }, 30 | // Apply the shared Apify TypeScript ESLint configuration 31 | // This includes TypeScript-specific rules, import ordering, and other best practices 32 | ...apifyTypeScriptConfig, 33 | { 34 | rules: { 35 | // Prevent importing devDependencies in production code 36 | // This helps catch accidental imports of test/build tools in source code 37 | 'import/no-extraneous-dependencies': [ 38 | 'error', 39 | { 40 | // Allow importing devDependencies in these specific file patterns: 41 | devDependencies: [ 42 | '**/eslint.config.mjs', // ESLint config files 43 | '**/vitest.config.ts', // Vitest config files 44 | '**/*.test.{js,ts,jsx,tsx}', // Test files 45 | '**/{test,tests}/**/*.{js,ts,jsx,tsx,mjs,mts,cjs,cts}', // Test directories 46 | 'evals/**/*.{js,ts,jsx,tsx,mjs,mts,cjs,cts}', // Evaluation scripts 47 | ], 48 | }, 49 | ], 
50 | }, 51 | languageOptions: { 52 | // Use ES modules (import/export syntax) 53 | sourceType: 'module', 54 | parserOptions: { 55 | // Use the ESLint-specific tsconfig that includes test files 56 | // This ensures TypeScript-aware linting works for all files 57 | project: './tsconfig.eslint.json', 58 | }, 59 | }, 60 | }, 61 | // TypeScript-specific rules (applied only to .ts files) 62 | // These rules require the @typescript-eslint plugin which is included in apifyTypeScriptConfig 63 | { 64 | files: ['**/*.ts', '**/*.tsx'], 65 | rules: { 66 | // Prefer 'type' over 'interface' for flexibility 67 | // Use 'interface' only when required for class implementations (implements) 68 | '@typescript-eslint/consistent-type-definitions': ['error', 'type'], 69 | // Detect unused variables, functions, and parameters 70 | // This prevents dead code and helps maintain clean code 71 | '@typescript-eslint/no-unused-vars': [ 72 | 'error', 73 | { 74 | // Allow variables starting with underscore (intentionally unused) 75 | argsIgnorePattern: '^_', 76 | // Allow destructured variables starting with underscore 77 | destructuredArrayIgnorePattern: '^_', 78 | // Allow variables starting with underscore 79 | varsIgnorePattern: '^_', 80 | }, 81 | ], 82 | }, 83 | }, 84 | // Override rules for configuration files 85 | // Config files (like this one) typically use default exports, which is acceptable 86 | { 87 | files: ['**/eslint.config.mjs', '**/vitest.config.ts'], 88 | rules: { 89 | // Allow default exports in config files (standard practice for config files) 90 | 'import/no-default-export': 'off', 91 | }, 92 | }, 93 | ]; 94 | -------------------------------------------------------------------------------- /src/tools/build.ts: -------------------------------------------------------------------------------- 1 | import type { ApifyClient } from '../apify-client.js'; 2 | import { ACTOR_README_MAX_LENGTH } from '../const.js'; 3 | import type { 4 | ActorDefinitionPruned, 5 | 
/**
 * Get Actor input schema by Actor name.
 * First, fetch the Actor details to get the default build tag and buildId.
 * Then, fetch the build details and return actorName, description, and input schema.
 *
 * "Not found" style API errors (status code 404 or 400) are reported as `null`;
 * every other error is re-thrown unchanged so the caller can log and handle it.
 *
 * @param {string} actorIdOrName - Actor ID or Actor full name.
 * @param {ApifyClient} apifyClient - The Apify client instance.
 * @param {number} limit - Truncate the README to this limit.
 * @returns {Promise<ActorDefinitionWithInfo | null>} - The Actor definition with info or null if not found.
 */
export async function getActorDefinition(
    actorIdOrName: string,
    apifyClient: ApifyClient,
    limit: number = ACTOR_README_MAX_LENGTH,
): Promise<ActorDefinitionWithInfo | null> {
    const actorClient = apifyClient.actor(actorIdOrName);
    try {
        // Fetch Actor details
        const actor = await actorClient.get();
        if (!actor) {
            return null;
        }

        const defaultBuildClient = await actorClient.defaultBuild();
        const buildDetails = await defaultBuildClient.get();
        if (!buildDetails?.actorDefinition) {
            return null;
        }

        const actorDefinitions = buildDetails.actorDefinition as ActorDefinitionWithDesc;
        // We set actorDefinition ID to Actor ID
        actorDefinitions.id = actor.id;
        actorDefinitions.readme = truncateActorReadme(actorDefinitions.readme || '', limit);
        actorDefinitions.description = actor.description || '';
        actorDefinitions.actorFullName = `${actor.username}/${actor.name}`;
        actorDefinitions.defaultRunOptions = actor.defaultRunOptions;
        // Pass pictureUrl from actor object (untyped property but present in API response)
        (actorDefinitions as Record<string, unknown>).pictureUrl = (actor as unknown as Record<string, unknown>).pictureUrl;
        return {
            definition: pruneActorDefinition(actorDefinitions),
            info: actor,
        };
    } catch (error) {
        // Check if it's a "not found" error (404 or 400 status codes)
        const isNotFound = typeof error === 'object'
            && error !== null
            && 'statusCode' in error
            && (error.statusCode === 404 || error.statusCode === 400);

        if (isNotFound) {
            // Return null for not found - caller will log appropriately
            return null;
        }

        // For server errors, throw the original error (preserve error type)
        // Caller should catch and log
        throw error;
    }
}

/**
 * Reduces an Actor definition to the fields exposed by this server.
 * The input schema is kept only when it carries both `type` and `properties`;
 * `webServerMcpPath` and `pictureUrl` are copied only when present on the response.
 */
function pruneActorDefinition(response: ActorDefinitionWithDesc): ActorDefinitionPruned {
    const { input } = response;
    const hasUsableInput = input && 'type' in input && 'properties' in input;
    return {
        id: response.id,
        actorFullName: response.actorFullName || '',
        buildTag: response.buildTag || '',
        readme: response.readme || '',
        input: hasUsableInput
            ? {
                ...input,
                type: input.type as string,
                properties: input.properties as Record<string, SchemaProperties>,
            }
            : undefined,
        description: response.description,
        defaultRunOptions: response.defaultRunOptions,
        webServerMcpPath: 'webServerMcpPath' in response ? response.webServerMcpPath as string : undefined,
        pictureUrl: 'pictureUrl' in response ? response.pictureUrl as string | undefined : undefined,
    };
}

/** Prune Actor README if it is too long
 * If the README is too long
 * - We keep the README as it is up to the limit.
 * - After the limit, we keep heading only (lines starting with '#').
 * - We add a note that the README was truncated because it was too long.
 *
 * When the cut lands in the middle of a line, the rest of that line is not a heading
 * of the remaining text (its beginning is already part of the kept prefix), so it is
 * skipped even if the leftover fragment happens to start with '#'.
 */
function truncateActorReadme(readme: string, limit = ACTOR_README_MAX_LENGTH): string {
    if (readme.length <= limit) {
        return readme;
    }
    const readmeFirst = readme.slice(0, limit);
    const remainingLines = readme.slice(limit).split('\n');
    // The first remaining "line" is only a complete line when the kept prefix ended on a newline.
    const cutMidLine = limit > 0 && readme[limit - 1] !== '\n';
    const candidateLines = cutMidLine ? remainingLines.slice(1) : remainingLines;
    const prunedReadme = candidateLines.filter((line) => line.startsWith('#'));
    return `${readmeFirst}\n\nREADME was truncated because it was too long. Remaining headers:\n${prunedReadme.join(', ')}`;
}
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { SSEClientTransport } from '@modelcontextprotocol/sdk/client/sse.js';
import { StreamableHTTPClientTransport } from '@modelcontextprotocol/sdk/client/streamableHttp.js';

import log from '@apify/log';

import { TimeoutError } from '../errors.js';
import { logHttpError } from '../utils/logging.js';
import { ACTORIZED_MCP_CONNECTION_TIMEOUT_MSEC } from './const.js';
import { getMCPServerID } from './utils.js';

/**
 * Creates and connects a ModelContextProtocol client.
 * First tries streamable HTTP transport, then falls back to SSE transport.
 *
 * @param url - URL of the MCP server.
 * @param token - Token sent as `Bearer` authorization.
 * @returns The connected client, or `null` when a connection attempt timed out
 *          (a timeout on the streamable HTTP transport does NOT trigger the SSE fallback).
 * @throws The original error when the SSE fallback fails for a reason other than a timeout.
 */
export async function connectMCPClient(
    url: string, token: string,
): Promise<Client | null> {
    try {
        return await createMCPStreamableClient(url, token);
    } catch (error) {
        // A timeout ends the whole attempt: the SSE transport is not tried
        if (error instanceof TimeoutError) {
            log.warning('Connection to MCP server using streamable HTTP transport timed out', { url });
            return null;
        }

        // Any other streamable HTTP failure: fall back to SSE transport
        log.debug('Streamable HTTP transport failed, falling back to SSE transport', {
            url,
        });
    }

    try {
        return await createMCPSSEClient(url, token);
    } catch (error) {
        if (error instanceof TimeoutError) {
            log.warning('Connection to MCP server using SSE transport timed out', { url });
            return null;
        }
        logHttpError(error, 'Failed to connect to MCP server using SSE transport', { url, cause: error });
        throw error;
    }
}

/**
 * Races `promise` against a timer.
 * Rejects with `TimeoutError` when `millis` elapse first; the timer is always cleared afterwards.
 * The raced promise itself is not cancelled.
 */
async function withTimeout<T>(millis: number, promise: Promise<T>): Promise<T> {
    let timeoutPid: NodeJS.Timeout | undefined;
    const timeout = new Promise<never>((_resolve, reject) => {
        timeoutPid = setTimeout(
            () => reject(new TimeoutError(`Timed out after ${millis} ms.`)),
            millis,
        );
    });

    return Promise.race([
        promise,
        timeout,
    ]).finally(() => {
        if (timeoutPid) {
            clearTimeout(timeoutPid);
        }
    });
}

/**
 * Connects `client` over `transport` within ACTORIZED_MCP_CONNECTION_TIMEOUT_MSEC.
 * On timeout the pending `connect()` keeps running in the background, so the client is closed
 * (best effort, not awaited) to release the underlying connection before the error is re-thrown.
 */
async function connectWithTimeout(client: Client, transport: Parameters<Client['connect']>[0]): Promise<void> {
    try {
        await withTimeout(ACTORIZED_MCP_CONNECTION_TIMEOUT_MSEC, client.connect(transport));
    } catch (error) {
        if (error instanceof TimeoutError) {
            // Cleanup failures are irrelevant here: the timeout is the error the caller needs to see.
            void client.close().catch(() => undefined);
        }
        throw error;
    }
}

/**
 * Creates and connects a ModelContextProtocol client using the SSE transport.
 */
async function createMCPSSEClient(
    url: string, token: string,
): Promise<Client> {
    const transport = new SSEClientTransport(
        new URL(url),
        {
            requestInit: {
                headers: {
                    authorization: `Bearer ${token}`,
                },
            },
            eventSourceInit: {
                // The EventSource package augments EventSourceInit with a "fetch" parameter.
                // You can use this to set additional headers on the outgoing request.
                // Based on this example: https://github.com/modelcontextprotocol/typescript-sdk/issues/118
                async fetch(input: Request | URL | string, init?: RequestInit) {
                    const headers = new Headers(init?.headers || {});
                    headers.set('authorization', `Bearer ${token}`);
                    return fetch(input, { ...init, headers });
                },
            // We have to cast to "any" to use it, since it's non-standard
            } as any, // eslint-disable-line @typescript-eslint/no-explicit-any
        });

    const client = new Client({
        name: getMCPServerID(url),
        version: '1.0.0',
    });

    await connectWithTimeout(client, transport);

    return client;
}

/**
 * Creates and connects a ModelContextProtocol client using the streamable HTTP transport.
 */
async function createMCPStreamableClient(
    url: string, token: string,
): Promise<Client> {
    const transport = new StreamableHTTPClientTransport(
        new URL(url),
        {
            requestInit: {
                headers: {
                    authorization: `Bearer ${token}`,
                },
            },
        });

    const client = new Client({
        name: getMCPServerID(url),
        version: '1.0.0',
    });

    await connectWithTimeout(client, transport);

    return client;
}
/**
 * Resolve and cache the MCP server URL for the given Actor.
 * - Returns a string URL when the Actor exposes an MCP server
 * - Returns false when the Actor is not an MCP server (or does not exist)
 * Results are stored in `mcpServerCache` to avoid repeated API calls.
 *
 * @throws Any error that is not a "not found" error (status code 404 or 400); such errors are not cached.
 */
export async function getActorMcpUrlCached(
    actorIdOrName: string,
    apifyClient: ApifyClient,
): Promise<string | false> {
    const cached = mcpServerCache.get(actorIdOrName);
    // `false` is a valid cached answer, so only null/undefined count as a cache miss
    if (cached !== null && cached !== undefined) {
        return cached as string | false;
    }

    try {
        const actorDefinitionWithInfo = await getActorDefinition(actorIdOrName, apifyClient);
        const definition = actorDefinitionWithInfo?.definition;
        const mcpPath = definition && getActorMCPServerPath(definition);
        if (definition && mcpPath) {
            const url = await getActorMCPServerURL(definition.id, mcpPath);
            mcpServerCache.set(actorIdOrName, url);
            return url;
        }

        mcpServerCache.set(actorIdOrName, false);
        return false;
    } catch (error) {
        // Check if it's a "not found" error (404 or 400 status codes)
        const isNotFound = typeof error === 'object'
            && error !== null
            && 'statusCode' in error
            && (error.statusCode === 404 || error.statusCode === 400);

        if (isNotFound) {
            // Actor doesn't exist - cache false and return false
            mcpServerCache.set(actorIdOrName, false);
            return false;
        }
        // Real server error - don't cache, let it propagate
        throw error;
    }
}

/**
 * Returns an array of all field names mentioned in the display.properties
 * and transformation.fields of all views in the given ActorDefinitionStorage object.
 * Each name is returned once; an object without `views` yields an empty array.
 */
export function getActorDefinitionStorageFieldNames(storage: ActorDefinitionStorage | object): string[] {
    const fieldSet = new Set<string>();
    if ('views' in storage && typeof storage.views === 'object' && storage.views !== null) {
        for (const view of Object.values(storage.views)) {
            // Collect from display.properties
            if (view.display && view.display.properties) {
                for (const field of Object.keys(view.display.properties)) {
                    fieldSet.add(field);
                }
            }
            // Collect from transformation.fields
            if (view.transformation && Array.isArray(view.transformation.fields)) {
                for (const field of view.transformation.fields) {
                    if (typeof field === 'string') fieldSet.add(field);
                }
            }
        }
    }
    return Array.from(fieldSet);
}

/**
 * Ensures the Actor output items are within the character limit.
 *
 * First checks if all items fit into the limit, then tries only the important fields and as a last resort
 * keeps the longest prefix of items that still fits. In worst scenario return empty array.
 *
 * This is primarily used to ensure the tool output does not exceed the LLM context length or tool output limit.
 *
 * @param items - Dataset items to return.
 * @param importantFields - Dot-separated field paths to keep when the full items do not fit.
 * @param charLimit - Maximum length of `JSON.stringify(result)`.
 * @returns The items (possibly reduced to important fields and/or truncated) fitting the limit.
 */
export function ensureOutputWithinCharLimit(items: DatasetItem[], importantFields: string[], charLimit: number): DatasetItem[] {
    // Check if all items fit into the limit
    if (JSON.stringify(items).length <= charLimit) {
        return items;
    }

    /**
     * Items used for the final fallback - removing items until within the limit.
     * If important fields are defined, use only those fields for that fallback step.
     */
    let sourceItems = items;
    // Try keeping only the important fields
    if (importantFields.length > 0) {
        const importantItems = items.map((item) => getValuesByDotKeys(item, importantFields));
        if (JSON.stringify(importantItems).length <= charLimit) {
            return importantItems;
        }
        sourceItems = importantItems;
    }

    // Keep items while the serialized array stays within the limit.
    // The array length is tracked incrementally ("[" + "]" + items + separating commas) so each
    // item is serialized once instead of re-serializing the whole accumulated array per item.
    const result: DatasetItem[] = [];
    let usedChars = 2;
    for (const item of sourceItems) {
        // Inside an array a non-serializable value is written as "null"
        const itemChars = (JSON.stringify(item) ?? 'null').length;
        const separatorChars = result.length > 0 ? 1 : 0;
        if (usedChars + separatorChars + itemChars > charLimit) {
            break;
        }
        usedChars += separatorChars + itemChars;
        result.push(item);
    }
    return result;
}
4 | 5 | ## Key Findings 6 | 7 | ### URL Handling 8 | - **All hits always have `url_without_anchor`** ✓ 9 | - No need to skip hits for missing URLs (the check can be simplified or removed) 10 | - The `url_without_anchor` field is always populated across all documentation sources 11 | 12 | ### Anchor/Fragment Field 13 | - **Initial finding** (search: "api"): No hits had anchors 14 | - **Updated finding** (search: "actor"): **80% of Apify hits have anchors** (16/20 hits) 15 | - **Fragment Distribution**: 16 | - Apify source: 80% of results include anchors pointing to specific sections 17 | - Same URLs appear multiple times with different anchors (e.g., `/actors` page has 4 different section anchors) 18 | - Crawlee sources: No anchors (due to `typeFilter: 'lvl1'` which returns page-level only) 19 | - **Decision**: Fragments are important for Apify source and should be preserved in results 20 | 21 | ### Content Field 22 | 23 | | Source | Has Content | Count | Note | 24 | |--------|------------|-------|------| 25 | | **apify** | ✓ YES | 20/20 (100%) | Always has content provided in full text | 26 | | **crawlee-js** | ✗ NO | 0/3 (0%) | Content is `null`, must use hierarchy | 27 | | **crawlee-py** | ✗ NO | 0/20 (0%) | Content is `null`, must use hierarchy | 28 | 29 | ### Hierarchy Field 30 | - **All hits have hierarchy object** with fields: `lvl0`, `lvl1`, `lvl2`, `lvl3`, `lvl4`, `lvl5`, `lvl6` 31 | - Most of these fields are `null` in responses 32 | - Only the first 1-2 levels typically contain values 33 | - **Apify source**: Has `content` field, so hierarchy is used less 34 | - **Crawlee sources**: No `content` field, must rely on hierarchy for display 35 | 36 | ## Data Structure Examples 37 | 38 | ### Raw Algolia Response (Apify with fragments) 39 | ```json 40 | { 41 | "url_without_anchor": "https://docs.apify.com/platform/actors", 42 | "anchor": "actors-overview", 43 | "content": "Actors are serverless cloud programs that can perform anything...", 44 | "type": 
"content", 45 | "hierarchy": { "lvl0": "Platform", "lvl1": "Actors", ... } 46 | } 47 | ``` 48 | 49 | ### Processed Result (After processAlgoliaResponse) 50 | ```json 51 | { 52 | "url": "https://docs.apify.com/platform/actors#actors-overview", 53 | "content": "Actors are serverless cloud programs that can perform anything..." 54 | } 55 | ``` 56 | 57 | ### Multiple Sections Same Page 58 | When searching "actor", the Apify index returns multiple hits from the same page with different anchors: 59 | ``` 60 | https://docs.apify.com/platform/actors#actors-overview 61 | https://docs.apify.com/platform/actors#actor-components 62 | https://docs.apify.com/platform/actors#build-actors 63 | https://docs.apify.com/platform/actors#running-actors 64 | ``` 65 | 66 | This gives LLM access to different sections of the same page. 67 | 68 | ### Crawlee (No fragments) 69 | ```json 70 | // Raw Algolia Response 71 | { 72 | "url_without_anchor": "https://crawlee.dev/js/api", 73 | "anchor": "", 74 | "content": null, 75 | "type": "lvl1" 76 | } 77 | 78 | // Processed Result 79 | { 80 | "url": "https://crawlee.dev/js/api" 81 | // Note: no content field since Crawlee doesn't provide it 82 | } 83 | ``` 84 | 85 | ## Simplification & Design Decisions 86 | 87 | ### Fragment Handling Strategy 88 | **Decision**: Embed fragments directly in returned URLs instead of returning as separate field. 89 | 90 | **Rationale**: 91 | - Simpler type definition (`ApifyDocsSearchResult` has only `url` and `content`) 92 | - LLM receives ready-to-use URLs (e.g., `https://docs.apify.com/actors#build-actors`) 93 | - Fetch tool already handles fragments correctly (splits on `#`) 94 | - No need for complex logic to reconstruct URL+fragment 95 | 96 | **Implementation**: 97 | ```typescript 98 | // Returns: 99 | { url: "https://docs.apify.com/actors#build-actors", content: "..." } 100 | 101 | // Instead of: 102 | { url: "https://docs.apify.com/actors", fragment: "build-actors", content: "..." 
/**
 * Runs `getActorDefinitionStorageFieldNames` and returns a sorted copy of the result,
 * so assertions do not depend on the order in which field names are collected.
 */
function sortedFieldNames(storage: Parameters<typeof getActorDefinitionStorageFieldNames>[0]) {
    return [...getActorDefinitionStorageFieldNames(storage)].sort();
}

describe('getActorDefinitionStorageFieldNames', () => {
    it('should return an array of field names from a single view (display.properties and transformation.fields)', () => {
        // `baz` appears in both places and must be reported once.
        const singleView = {
            views: {
                view1: {
                    display: { properties: { foo: {}, bar: {}, baz: {} } },
                    transformation: { fields: ['baz', 'qux', 'extra'] },
                },
            },
        };
        expect(sortedFieldNames(singleView)).toEqual(['bar', 'baz', 'extra', 'foo', 'qux']);
    });

    it('should return unique field names from multiple views (display.properties and transformation.fields)', () => {
        // Names repeat both inside a view and across views.
        const twoViews = {
            views: {
                view1: {
                    display: { properties: { foo: {}, bar: {} } },
                    transformation: { fields: ['foo', 'alpha'] },
                },
                view2: {
                    display: { properties: { bar: {}, baz: {} } },
                    transformation: { fields: ['baz', 'beta', 'alpha'] },
                },
            },
        };
        expect(sortedFieldNames(twoViews)).toEqual(['alpha', 'bar', 'baz', 'beta', 'foo']);
    });

    it('should return an empty array if no properties or fields are present', () => {
        const emptyView = {
            views: {
                view1: {
                    display: { properties: {} },
                    transformation: { fields: [] },
                },
            },
        };
        expect(getActorDefinitionStorageFieldNames(emptyView)).toEqual([]);
    });

    it('should handle empty views object', () => {
        const noViews = { views: {} };
        expect(getActorDefinitionStorageFieldNames(noViews)).toEqual([]);
    });

    it('should handle missing transformation or display', () => {
        // Each view lacks a different part; view3 has neither.
        const partialViews = {
            views: {
                view1: { display: { properties: { foo: {} } } },
                view2: { transformation: { fields: ['bar', 'baz'] } },
                view3: {},
            },
        };
        expect(sortedFieldNames(partialViews)).toEqual(['bar', 'baz', 'foo']);
    });
});

describe('ensureOutputWithinCharLimit', () => {
    it('should return all items when limit is high', () => {
        const items = [
            { id: 1, name: 'Item 1', value: 'test' },
            { id: 2, name: 'Item 2', value: 'test' },
        ];
        // The limit equals the serialized size exactly, so nothing has to be dropped.
        const exactFitLimit = JSON.stringify(items).length;
        expect(ensureOutputWithinCharLimit(items, [], exactFitLimit)).toEqual(items);
    });

    it('should use important fields when all items exceed limit', () => {
        const items = [
            { id: 1, name: 'Item 1', description: 'Very long description that makes this item exceed the limit', extra: 'unnecessary data' },
            { id: 2, name: 'Item 2', description: 'Another long description', extra: 'more unnecessary data' },
        ];
        const importantFields = ['id', 'name'];
        const tinyLimit = 100; // Very small limit

        const trimmed = ensureOutputWithinCharLimit(items, importantFields, tinyLimit);

        expect(trimmed).toEqual([
            { id: 1, name: 'Item 1' },
            { id: 2, name: 'Item 2' },
        ]);
    });

    it('should remove all items when limit is extremely small', () => {
        const items = [
            { id: 1, name: 'Item 1' },
            { id: 2, name: 'Item 2' },
        ];
        const minimalLimit = 10; // Extremely small limit - even empty array JSON "[]" is 2 chars

        const remaining = ensureOutputWithinCharLimit(items, [], minimalLimit);

        expect(remaining).toEqual([]);
        expect(JSON.stringify(remaining).length).toBeLessThanOrEqual(minimalLimit);
    });
});
/** Input accepted by the "get Actor run" tool: a non-empty run ID. */
const getActorRunArgs = z.object({
    runId: z.string()
        .min(1)
        .describe('The ID of the Actor run.'),
});

/**
 * Internal tool that looks up one Actor run and returns it as pretty-printed JSON.
 * When the client returns a falsy value for the run, a soft-fail error response is returned instead.
 *
 * https://docs.apify.com/api/v2/actor-run-get
 */
export const getActorRun: ToolEntry = {
    type: 'internal',
    name: HelperTools.ACTOR_RUNS_GET,
    description: `Get detailed information about a specific Actor run by runId.
The results will include run metadata (status, timestamps), performance stats, and resource IDs (datasetId, keyValueStoreId, requestQueueId).

USAGE:
- Use when you need to inspect run status or retrieve associated resource IDs (e.g., datasetId for output).

USAGE EXAMPLES:
- user_input: Show details of run y2h7sK3Wc
- user_input: What is the datasetId for run y2h7sK3Wc?`,
    inputSchema: z.toJSONSchema(getActorRunArgs) as ToolInputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(getActorRunArgs)),
    annotations: {
        title: 'Get Actor run',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async ({ args, apifyToken }: InternalToolArgs) => {
        const { runId } = getActorRunArgs.parse(args);
        const apifyClient = new ApifyClient({ token: apifyToken });
        const run = await apifyClient.run(runId).get();

        if (run) {
            // Wrap the indented JSON in a fenced block so clients render it as code.
            const prettyJson = JSON.stringify(run, null, 2);
            return buildMCPResponse({ texts: [['```json', prettyJson, '```'].join('\n')] });
        }

        return buildMCPResponse({
            texts: [`Run with ID '${runId}' not found.`],
            isError: true,
            toolStatus: TOOL_STATUS.SOFT_FAIL,
        });
    },
} as const;
/** Input accepted by the "get Actor run log" tool. `lines` defaults to 10 and is capped at 50. */
const GetRunLogArgs = z.object({
    runId: z.string().describe('The ID of the Actor run.'),
    lines: z.number()
        .max(50)
        .describe('Output the last NUM lines, instead of the last 10')
        .default(10),
});

/**
 * Returns the tail of `logText`: the last `lineCount + 1` newline-separated entries,
 * or the whole text when it has fewer entries than that.
 *
 * The extra entry is kept from the original implementation; it presumably accounts for the
 * empty string that `split('\n')` yields after a trailing newline - TODO confirm.
 *
 * @param logText - Full log text.
 * @param lineCount - Number of trailing lines requested.
 * @returns The selected entries joined back with `\n`.
 */
function tailLogLines(logText: string, lineCount: number): string {
    const entries = logText.split('\n');
    // Clamp to 0: a negative start would make `slice` count from the END of the array,
    // so a log shorter than the requested tail would lose lines instead of being returned whole
    // (e.g. 8 entries with 10 requested used to yield only the last 3).
    const start = Math.max(0, entries.length - lineCount - 1);
    return entries.slice(start).join('\n');
}

/**
 * Internal tool that returns the most recent lines of an Actor run's log as plain text.
 * A missing log (nullish client result) is treated as an empty string.
 *
 * https://docs.apify.com/api/v2/actor-run-get
 * /v2/actor-runs/{runId}/log{?token}
 */
export const getActorRunLog: ToolEntry = {
    type: 'internal',
    name: HelperTools.ACTOR_RUNS_LOG,
    description: `Retrieve recent log lines for a specific Actor run.
The results will include the last N lines of the run's log output (plain text).

USAGE:
- Use when you need to inspect recent logs to debug or monitor a run.

USAGE EXAMPLES:
- user_input: Show last 20 lines of logs for run y2h7sK3Wc
- user_input: Get logs for run y2h7sK3Wc`,
    inputSchema: z.toJSONSchema(GetRunLogArgs) as ToolInputSchema,
    // It does not make sense to add structured output here since the log API just returns plain text
    ajvValidate: compileSchema(z.toJSONSchema(GetRunLogArgs)),
    annotations: {
        title: 'Get Actor run log',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async (toolArgs: InternalToolArgs) => {
        const { args, apifyToken } = toolArgs;
        const parsed = GetRunLogArgs.parse(args);
        const client = new ApifyClient({ token: apifyToken });
        const logText = await client.run(parsed.runId).log().get() ?? '';
        const text = tailLogLines(logText, parsed.lines);
        return { content: [{ type: 'text', text }] };
    },
} as const;
/** Input accepted by the "abort Actor run" tool: the run ID plus an optional graceful-abort flag. */
const abortRunArgs = z.object({
    runId: z.string()
        .min(1)
        .describe('The ID of the Actor run to abort.'),
    gracefully: z.boolean().optional().describe('If true, the Actor run will abort gracefully with a 30-second timeout.'),
});

/**
 * Internal tool that requests an abort of an Actor run and returns the client's result
 * serialized as compact JSON inside a fenced code block.
 *
 * https://docs.apify.com/api/v2/actor-run-abort-post
 */
export const abortActorRun: ToolEntry = {
    type: 'internal',
    name: HelperTools.ACTOR_RUNS_ABORT,
    description: `Abort an Actor run that is currently starting or running.
For runs with status FINISHED, FAILED, ABORTING, or TIMED-OUT, this call has no effect.
The results will include the updated run details after the abort request.

USAGE:
- Use when you need to stop a run that is taking too long or misconfigured.

USAGE EXAMPLES:
- user_input: Abort run y2h7sK3Wc
- user_input: Gracefully abort run y2h7sK3Wc`,
    inputSchema: z.toJSONSchema(abortRunArgs) as ToolInputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(abortRunArgs)),
    annotations: {
        title: 'Abort Actor run',
        openWorldHint: false,
    },
    call: async ({ args, apifyToken }: InternalToolArgs) => {
        const { runId, gracefully } = abortRunArgs.parse(args);
        const apifyClient = new ApifyClient({ token: apifyToken });
        const abortedRun = await apifyClient.run(runId).abort({ gracefully });
        // String concatenation (not join) so the serialized value is embedded exactly as a template literal would.
        const text = '```json\n' + JSON.stringify(abortedRun) + '\n```';
        return { content: [{ type: 'text', text }] };
    },
} as const;
/** Input accepted by the docs-fetch tool: the full URL of the documentation page. */
const fetchApifyDocsToolArgsSchema = z.object({
    url: z.string()
        .min(1)
        .describe(`URL of the Apify documentation page to fetch. This should be the full URL, including the protocol (e.g., https://docs.apify.com/).`),
});

/**
 * Decides whether `url` points at one of the allowed documentation sites.
 *
 * The URL must start with an entry of `ALLOWED_DOC_DOMAINS` (the original rule) AND its parsed
 * origin must equal that entry's origin. A bare prefix match alone would also accept
 * look-alike hosts such as `<allowed-host>.example.com` or `<allowed-prefix>@example.com`
 * whenever the configured prefix has no trailing slash.
 *
 * @param url - Trimmed URL supplied by the caller.
 * @returns `true` only for parseable URLs on an allowed origin.
 */
function isAllowedDocsUrl(url: string): boolean {
    let target: URL;
    try {
        target = new URL(url);
    } catch {
        // Not a parseable absolute URL, so it cannot be fetched either.
        return false;
    }
    return ALLOWED_DOC_DOMAINS.some((allowedPrefix) => {
        if (!url.startsWith(allowedPrefix)) return false;
        try {
            return new URL(allowedPrefix).origin === target.origin;
        } catch {
            // The configured entry is not a full URL; keep the plain prefix semantics for it.
            return true;
        }
    });
}

/**
 * Internal tool that downloads a documentation page, converts it to Markdown and returns it
 * both as text and as structured content. Converted pages are cached under the URL without
 * its fragment, so different anchors of the same page share one cache entry.
 */
export const fetchApifyDocsTool: ToolEntry = {
    type: 'internal',
    name: HelperTools.DOCS_FETCH,
    description: `Fetch the full content of an Apify or Crawlee documentation page by its URL.
Use this after finding a relevant page with the ${HelperTools.DOCS_SEARCH} tool.

USAGE:
- Use when you need the complete content of a specific docs page for detailed answers.

USAGE EXAMPLES:
- user_input: Fetch https://docs.apify.com/platform/actors/running#builds
- user_input: Fetch https://docs.apify.com/academy
- user_input: Fetch https://crawlee.dev/docs/guides/basic-concepts`,
    inputSchema: z.toJSONSchema(fetchApifyDocsToolArgsSchema) as ToolInputSchema,
    outputSchema: fetchApifyDocsToolOutputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(fetchApifyDocsToolArgsSchema)),
    annotations: {
        title: 'Fetch Apify docs',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async (toolArgs: InternalToolArgs) => {
        const { args } = toolArgs;

        const parsed = fetchApifyDocsToolArgsSchema.parse(args);
        const url = parsed.url.trim();
        const urlWithoutFragment = url.split('#')[0];

        // Allow URLs from Apify and Crawlee documentation only
        if (!isAllowedDocsUrl(url)) {
            log.softFail(`[fetch-apify-docs] Invalid URL domain: ${url}`);
            return buildMCPResponse({
                texts: [`Invalid URL: "${url}".
Only documentation URLs from Apify and Crawlee are allowed (starting with ${ALLOWED_DOC_DOMAINS.map((d) => `"${d}"`).join(' or ')}).
Please provide a valid documentation URL. You can find documentation URLs using the ${HelperTools.DOCS_SEARCH} tool.`],
                isError: true,
                toolStatus: TOOL_STATUS.SOFT_FAIL,
            });
        }

        // Cache URL without fragment to avoid fetching the same page multiple times
        let markdown = fetchApifyDocsCache.get(urlWithoutFragment);
        // If the content is not cached, fetch it from the URL
        if (!markdown) {
            try {
                const response = await fetch(url);
                if (!response.ok) {
                    const error = Object.assign(new Error(`HTTP ${response.status} ${response.statusText}`), {
                        statusCode: response.status,
                    });
                    logHttpError(error, 'Failed to fetch the documentation page', { url, statusText: response.statusText });
                    // HTTP 4xx = user error (soft_fail); any other non-OK status is reported as FAILED.
                    // Both cases are answered right here; nothing is thrown to the catch block below.
                    const isUserError = response.status >= 400 && response.status < 500;
                    return buildMCPResponse({
                        texts: [`Failed to fetch the documentation page at "${url}".
HTTP Status: ${response.status} ${response.statusText}.
Please verify the URL is correct and accessible. You can search for available documentation pages using the ${HelperTools.DOCS_SEARCH} tool.`],
                        isError: true,
                        toolStatus: isUserError ? TOOL_STATUS.SOFT_FAIL : TOOL_STATUS.FAILED,
                    });
                }
                const html = await response.text();
                markdown = htmlToMarkdown(html);
                // Cache the processed Markdown content
                // Use the URL without fragment as the key to avoid caching same page with different fragments
                fetchApifyDocsCache.set(urlWithoutFragment, markdown);
            } catch (error) {
                logHttpError(error, 'Failed to fetch the documentation page', { url });
                // Network/fetch errors are typically user errors (bad URL, connectivity issues)
                return buildMCPResponse({
                    texts: [`Failed to fetch the documentation page at "${url}".
Error: ${error instanceof Error ? error.message : String(error)}.
Please verify the URL is correct and accessible. You can search for available documentation pages using the ${HelperTools.DOCS_SEARCH} tool.`],
                    isError: true,
                    toolStatus: TOOL_STATUS.SOFT_FAIL,
                });
            }
        }

        return buildMCPResponse({ texts: [`Fetched content from ${url}:\n\n${markdown}`], structuredContent: { url, content: markdown } });
    },
} as const;
/** Input accepted by the "get key-value store" tool: a non-empty store identifier. */
const getKeyValueStoreArgs = z.object({
    storeId: z.string()
        .min(1)
        .describe('Key-value store ID or username~store-name'),
});

/**
 * Internal tool that fetches a key-value store through the Apify client and returns the
 * result serialized as compact JSON inside a fenced code block.
 *
 * https://docs.apify.com/api/v2/key-value-store-get
 */
export const getKeyValueStore: ToolEntry = {
    type: 'internal',
    name: HelperTools.KEY_VALUE_STORE_GET,
    description: `Get details about a key-value store by ID or username~store-name.
The results will include store metadata (ID, name, owner, access settings) and usage statistics.

USAGE:
- Use when you need to inspect a store to locate records or understand its properties.

USAGE EXAMPLES:
- user_input: Show info for key-value store username~my-store
- user_input: Get details for store adb123`,
    inputSchema: z.toJSONSchema(getKeyValueStoreArgs) as ToolInputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(getKeyValueStoreArgs)),
    annotations: {
        title: 'Get key-value store',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async ({ args, apifyToken }: InternalToolArgs) => {
        const { storeId } = getKeyValueStoreArgs.parse(args);
        const apifyClient = new ApifyClient({ token: apifyToken });
        const storeInfo = await apifyClient.keyValueStore(storeId).get();
        // Concatenation keeps the exact text a template literal would produce for any serialized value.
        const text = '```json\n' + JSON.stringify(storeInfo) + '\n```';
        return { content: [{ type: 'text', text }] };
    },
} as const;
/**
 * Largest `limit` this tool accepts per call. It drives the schema bound, the parameter
 * description and the usage example, so the three cannot drift apart.
 * NOTE(review): the previous description advertised 1000 while the schema enforced 10;
 * the enforced value is kept here - raise this constant if larger pages are intended.
 */
const MAX_KEYS_PER_CALL = 10;

/** Input accepted by the "list key-value store keys" tool. */
const getKeyValueStoreKeysArgs = z.object({
    storeId: z.string()
        .min(1)
        .describe('Key-value store ID or username~store-name'),
    exclusiveStartKey: z.string()
        .optional()
        .describe('All keys up to this one (including) are skipped from the result.'),
    limit: z.number()
        .max(MAX_KEYS_PER_CALL)
        .optional()
        .describe(`Number of keys to be returned. Maximum value is ${MAX_KEYS_PER_CALL}.`),
});

/**
 * Internal tool that lists keys of a key-value store, optionally paginated with
 * `exclusiveStartKey` and `limit`, and returns the client's result as compact JSON
 * inside a fenced code block.
 *
 * https://docs.apify.com/api/v2/key-value-store-keys-get
 */
export const getKeyValueStoreKeys: ToolEntry = {
    type: 'internal',
    name: HelperTools.KEY_VALUE_STORE_KEYS_GET,
    description: `List keys in a key-value store with optional pagination.
The results will include keys and basic info about stored values (e.g., size).
Use exclusiveStartKey and limit to paginate.

USAGE:
- Use when you need to discover what records exist in a store.

USAGE EXAMPLES:
- user_input: List first ${MAX_KEYS_PER_CALL} keys in store username~my-store
- user_input: Continue listing keys in store a123 from key data.json`,
    inputSchema: z.toJSONSchema(getKeyValueStoreKeysArgs) as ToolInputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(getKeyValueStoreKeysArgs)),
    annotations: {
        title: 'Get key-value store keys',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async (toolArgs: InternalToolArgs) => {
        const { args, apifyToken } = toolArgs;
        const parsed = getKeyValueStoreKeysArgs.parse(args);
        const client = new ApifyClient({ token: apifyToken });
        const keys = await client.keyValueStore(parsed.storeId).listKeys({
            exclusiveStartKey: parsed.exclusiveStartKey,
            limit: parsed.limit,
        });
        return { content: [{ type: 'text', text: `\`\`\`json\n${JSON.stringify(keys)}\n\`\`\`` }] };
    },
} as const;
/** Input accepted by the "get key-value store record" tool: store identifier and record key, both non-empty. */
const getKeyValueStoreRecordArgs = z.object({
    storeId: z.string()
        .min(1)
        .describe('Key-value store ID or username~store-name'),
    recordKey: z.string()
        .min(1)
        .describe('Key of the record to retrieve.'),
});

/**
 * Internal tool that reads one record from a key-value store and returns whatever the
 * client yields, serialized as compact JSON inside a fenced code block.
 *
 * https://docs.apify.com/api/v2/key-value-store-record-get
 */
export const getKeyValueStoreRecord: ToolEntry = {
    type: 'internal',
    name: HelperTools.KEY_VALUE_STORE_RECORD_GET,
    description: `Get a value stored in a key-value store under a specific key.
The response preserves the original Content-Encoding; most clients handle decompression automatically.

USAGE:
- Use when you need to retrieve a specific record (JSON, text, or binary) from a store.

USAGE EXAMPLES:
- user_input: Get record INPUT from store abc123
- user_input: Get record data.json from store username~my-store`,
    inputSchema: z.toJSONSchema(getKeyValueStoreRecordArgs) as ToolInputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(getKeyValueStoreRecordArgs)),
    annotations: {
        title: 'Get key-value store record',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async ({ args, apifyToken }: InternalToolArgs) => {
        const { storeId, recordKey } = getKeyValueStoreRecordArgs.parse(args);
        const apifyClient = new ApifyClient({ token: apifyToken });
        const storedRecord = await apifyClient.keyValueStore(storeId).getRecord(recordKey);
        // Concatenation keeps the exact text a template literal would produce for any serialized value.
        const text = '```json\n' + JSON.stringify(storedRecord) + '\n```';
        return { content: [{ type: 'text', text }] };
    },
} as const;
/**
 * Builds the description of the `docSource` parameter from DOCS_SOURCES:
 * a fixed intro sentence followed by one bullet per source (`id` and `label`).
 */
function buildDocSourceDescription(): string {
    const bulletLines: string[] = [];
    for (const source of DOCS_SOURCES) {
        bulletLines.push(`• "${source.id}" - ${source.label}`);
    }
    const intro = 'Documentation source to search. Defaults to "apify".';
    return `${intro}\n${bulletLines.join('\n')}`;
}
/**
 * Builds the description of the docs-search tool from DOCS_SOURCES:
 * one section per source (id, label and description) embedded in a fixed explanatory text.
 */
function buildToolDescription(): string {
    const sourceSections: string[] = [];
    for (const source of DOCS_SOURCES) {
        sourceSections.push(`• docSource="${source.id}" - ${source.label}:\n ${source.description}`);
    }
    const sources = sourceSections.join('\n\n');

    return `Search Apify and Crawlee documentation using full-text search.

You must explicitly select which documentation source to search using the docSource parameter:

${sources}

The results will include the URL of the documentation page (which may include an anchor),
and a limited piece of content that matches the search query.

Fetch the full content of the document using the ${HelperTools.DOCS_FETCH} tool by providing the URL.`;
}

// IDs of all configured documentation sources, typed as a non-empty tuple as z.enum requires.
const docSourceIds = DOCS_SOURCES.map((source) => source.id) as [string, ...string[]];

const searchQueryDescription = `Algolia full-text search query to find relevant documentation pages.
Use only keywords, do not use full sentences or questions.
For example, "standby actor" will return documentation pages that contain the words "standby" and "actor".`;

const searchLimitDescription = `Maximum number of search results to return. Defaults to 5. Maximum is 20.
You can increase this limit if you need more results, but keep in mind that the search results are limited to the most relevant pages.`;

const searchOffsetDescription = `Offset for the search results. Defaults to 0.
Use this to paginate through the search results. For example, if you want to get the next 5 results, set the offset to 5 and limit to 5.`;

/** Input accepted by the docs-search tool: source selector, keyword query and pagination. */
const searchApifyDocsToolArgsSchema = z.object({
    docSource: z.enum(docSourceIds)
        .optional()
        .default('apify')
        .describe(buildDocSourceDescription()),
    query: z.string()
        .min(1)
        .describe(searchQueryDescription),
    limit: z.number()
        .min(1)
        .max(20) // Algolia does not return more than 20 results anyway
        .optional()
        .default(5)
        .describe(searchLimitDescription),
    offset: z.number()
        .optional()
        .default(0)
        .describe(searchOffsetDescription),
});
/**
 * Internal tool that searches the selected documentation source and returns one page of
 * results (URL plus optional content snippet) as text and as structured content.
 * When the page is empty, the response carries guidance on how to refine the search.
 */
export const searchApifyDocsTool: ToolEntry = {
    type: 'internal',
    name: HelperTools.DOCS_SEARCH,
    description: buildToolDescription(),
    inputSchema: z.toJSONSchema(searchApifyDocsToolArgsSchema) as ToolInputSchema,
    outputSchema: searchApifyDocsToolOutputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(searchApifyDocsToolArgsSchema)),
    annotations: {
        title: 'Search Apify docs',
        readOnlyHint: true,
        openWorldHint: false,
    },
    call: async (toolArgs: InternalToolArgs) => {
        const { args } = toolArgs;

        const parsed = searchApifyDocsToolArgsSchema.parse(args);

        const query = parsed.query.trim();
        const resultsRaw = await searchDocsBySourceCached(parsed.docSource, query);

        // The schema puts no lower bound on `offset`. A negative value would make `slice`
        // count from the end of the array and return an empty or wrong page, so treat it as 0.
        // `Math.trunc` mirrors the integer conversion `slice` already applies to its arguments.
        const offset = Math.max(0, Math.trunc(parsed.offset));
        const results = resultsRaw.slice(offset, offset + parsed.limit);

        if (results.length === 0) {
            const instructions = `No results found for the query "${query}" in the "${parsed.docSource}" documentation source.
Please try a different query with different keywords, or adjust the limit and offset parameters.
You can also try using more specific or alternative keywords related to your search topic.`;
            const structuredContent = {
                results: [],
                query,
                count: 0,
                instructions,
            };
            return buildMCPResponse({ texts: [instructions], structuredContent });
        }

        // Instructions for LLM to use the docs fetch tool when retrieving full document content
        const instructions = 'You can use the Apify docs fetch tool to retrieve the full content of a document by its URL.';
        // Actual unstructured text result
        const textResult = `Search results for "${query}" in ${parsed.docSource}:

${results.map((result) => {
        let line = `- Document URL: ${result.url}`;
        if (result.content) {
            line += `\n Content: ${result.content}`;
        }
        return line;
    }).join('\n\n')}`;

        const structuredContent = {
            results: results.map((result) => ({
                url: result.url,
                // `content` is omitted entirely when the hit has none.
                ...(result.content ? { content: result.content } : {}),
            })),
            query,
            count: results.length,
            instructions,
        };
        // We put the instructions at the end so that they are more likely to be acknowledged by the LLM
        return buildMCPResponse({ texts: [textResult, instructions], structuredContent });
    },
} as const;
/** Subset of a dataset item produced by the scraping Actor that this tool reads. */
type ScrapedPageItem = {
    crawl: {
        httpStatusCode: number;
        httpStatusMessage: string;
    };
    metadata: {
        url: string;
    };
    query: string;
    html?: string;
}

/** Input accepted by the HTML-skeleton tool. `chunk` is 1-based and defaults to 1. */
const getHtmlSkeletonArgs = z.object({
    url: z.string()
        .min(1)
        .describe('URL of the webpage to retrieve HTML skeleton from.'),
    enableJavascript: z.boolean()
        .optional()
        .default(false)
        .describe('Whether to enable JavaScript rendering. Enabling this may increase the time taken to retrieve the HTML skeleton.'),
    chunk: z.number()
        .optional()
        .default(1)
        .describe('Chunk number to retrieve when getting the content. The content is split into chunks to prevent exceeding the maximum tool output length.'),
});

/**
 * Internal tool that scrapes a page with the RAG_WEB_BROWSER Actor, strips the HTML down
 * to its skeleton and returns one chunk of at most TOOL_MAX_OUTPUT_CHARS characters.
 *
 * Stripped skeletons are cached. Results obtained with JavaScript rendering are cached
 * separately from raw-HTTP results so that one mode is never served in place of the other.
 * Invalid URLs and invalid or out-of-range chunk numbers produce soft-fail error responses.
 */
export const getHtmlSkeleton: ToolEntry = {
    type: 'internal',
    name: HelperTools.GET_HTML_SKELETON,
    description: `Retrieve the HTML skeleton (clean structure) of a webpage by stripping scripts, styles, and non-essential attributes.
This keeps the core HTML structure, links, images, and data attributes for analysis. Supports optional JavaScript rendering for dynamic pages.

The results will include a chunked HTML skeleton if the content is large. Use the chunk parameter to paginate through the output.

USAGE:
- Use when you need a clean HTML structure to design selectors or parsers for scraping.

USAGE EXAMPLES:
- user_input: Get HTML skeleton for https://example.com
- user_input: Get next chunk of HTML skeleton for https://example.com (chunk=2)`,
    inputSchema: z.toJSONSchema(getHtmlSkeletonArgs) as ToolInputSchema,
    ajvValidate: compileSchema(z.toJSONSchema(getHtmlSkeletonArgs)),
    annotations: {
        title: 'Get HTML skeleton',
        readOnlyHint: true,
        openWorldHint: true,
    },
    call: async (toolArgs: InternalToolArgs) => {
        const { args, apifyToken } = toolArgs;
        const parsed = getHtmlSkeletonArgs.parse(args);

        if (!isValidHttpUrl(parsed.url)) {
            return buildMCPResponse({
                texts: [`The provided URL is not a valid HTTP or HTTPS URL: ${parsed.url}`],
                isError: true,
                toolStatus: TOOL_STATUS.SOFT_FAIL,
            });
        }

        // Reject unusable chunk numbers before paying for a scrape: 0, negative or fractional
        // values would otherwise turn into negative/fractional slice indices below.
        if (!Number.isInteger(parsed.chunk) || parsed.chunk < 1) {
            return buildMCPResponse({
                texts: [`The chunk parameter must be a positive integer, got: ${parsed.chunk}`],
                isError: true,
                toolStatus: TOOL_STATUS.SOFT_FAIL,
            });
        }

        // Raw-HTTP results keep the plain URL as their key; JS-rendered results get a prefix
        // that can never be a valid http(s) URL, so the two modes do not overwrite each other.
        const cacheKey = parsed.enableJavascript ? `js:${parsed.url}` : parsed.url;

        // Try to get from cache first
        let strippedHtml = getHtmlSkeletonCache.get(cacheKey);
        if (!strippedHtml) {
            // Not in cache, call the Actor for scraping
            const client = new ApifyClient({ token: apifyToken });

            const run = await client.actor(RAG_WEB_BROWSER).call({
                query: parsed.url,
                outputFormats: [
                    'html',
                ],
                scrapingTool: parsed.enableJavascript ? 'browser-playwright' : 'raw-http',
            });

            const datasetItems = await client.dataset(run.defaultDatasetId).listItems();
            if (datasetItems.items.length === 0) {
                return buildMCPResponse({
                    texts: [`The scraping Actor (${RAG_WEB_BROWSER}) did not return any output for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`],
                    isError: true,
                });
            }

            const firstItem = datasetItems.items[0] as unknown as ScrapedPageItem;
            // NOTE(review): success is detected from the status message text, not the status code - confirm
            // the Actor always reports "OK" for successful loads.
            if (firstItem.crawl.httpStatusMessage.toLocaleLowerCase() !== 'ok') {
                return buildMCPResponse({
                    texts: [`The scraping Actor (${RAG_WEB_BROWSER}) returned an HTTP status ${firstItem.crawl.httpStatusCode} (${firstItem.crawl.httpStatusMessage}) for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`],
                    isError: true,
                });
            }

            if (!firstItem.html) {
                return buildMCPResponse({
                    texts: [`The scraping Actor (${RAG_WEB_BROWSER}) did not return any HTML content for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`],
                    isError: true,
                });
            }

            strippedHtml = stripHtml(firstItem.html);
            getHtmlSkeletonCache.set(cacheKey, strippedHtml);
        }

        // Pagination logic
        const totalLength = strippedHtml.length;
        const chunkSize = TOOL_MAX_OUTPUT_CHARS;
        // At least one (possibly empty) chunk always exists, so chunk 1 is always valid.
        const totalChunks = Math.max(1, Math.ceil(totalLength / chunkSize));
        if (parsed.chunk > totalChunks) {
            return buildMCPResponse({
                texts: [`Chunk ${parsed.chunk} does not exist. The HTML skeleton for ${parsed.url} has ${totalChunks} chunk(s).`],
                isError: true,
                toolStatus: TOOL_STATUS.SOFT_FAIL,
            });
        }
        const startIndex = (parsed.chunk - 1) * chunkSize;
        const endIndex = Math.min(startIndex + chunkSize, totalLength);
        const chunkContent = strippedHtml.slice(startIndex, endIndex);
        const hasNextChunk = parsed.chunk < totalChunks;

        const chunkInfo = `\n\n--- Chunk ${parsed.chunk} of ${totalChunks} ---\n${hasNextChunk ? `Next chunk: ${parsed.chunk + 1}` : 'End of content'}`;

        return buildMCPResponse({ texts: [chunkContent + chunkInfo] });
    },
} as const;
// Silence logging for the whole suite.
beforeAll(() => {
    log.setLevel(log.LEVELS.OFF);
});

/**
 * Integration tests for ActorsMcpServer tool bookkeeping: saving/restoring the tool set by
 * name and the tools-changed notification handler. They use a real ApifyClient created
 * from the APIFY_TOKEN environment variable.
 */
describe('MCP server internals integration tests', () => {
    it('should load and restore tools from a tool list', async () => {
        const actorsMcpServer = new ActorsMcpServer({ setupSigintHandler: false, taskStore: new InMemoryTaskStore() });
        const apifyClient = new ApifyClient({ token: process.env.APIFY_TOKEN });
        const initialTools = await loadToolsFromInput({
            enableAddingActors: true,
        } as Input, apifyClient);
        actorsMcpServer.upsertTools(initialTools);

        // Load new tool
        const newTool = await getActorsAsTools([ACTOR_PYTHON_EXAMPLE], apifyClient);
        actorsMcpServer.upsertTools(newTool);

        // Store the tool name list
        const names = actorsMcpServer.listAllToolNames();
        // Expected at this point: add-actor, the Actor added above, and 'get-actor-output'
        const expectedToolNames = [
            addTool.name,
            ACTOR_PYTHON_EXAMPLE,
            'get-actor-output',
        ];
        expectArrayWeakEquals(expectedToolNames, names);

        // Remove all tools
        actorsMcpServer.tools.clear();
        expect(actorsMcpServer.listAllToolNames()).toEqual([]);

        // Load the tool state from the tool name list
        await actorsMcpServer.loadToolsByName(names, apifyClient);

        // Check if the tool name list is restored
        expectArrayWeakEquals(actorsMcpServer.listAllToolNames(), expectedToolNames);
    });

    it('should notify tools changed handler on tool modifications', async () => {
        let latestTools: string[] = [];
        // Number of tools expected in the seeded set (enableAddingActors=true, no Actors) before
        // anything is added. NOTE(review): presumably add-actor plus 'get-actor-output', matching
        // the expectation in the first test - confirm.
        const numberOfTools = 2;

        let toolNotificationCount = 0;
        const onToolsChanged = (tools: string[]) => {
            latestTools = tools;
            toolNotificationCount++;
        };

        const actorsMCPServer = new ActorsMcpServer({ setupSigintHandler: false, taskStore: new InMemoryTaskStore() });
        const apifyClient = new ApifyClient({ token: process.env.APIFY_TOKEN });
        const seeded = await loadToolsFromInput({ enableAddingActors: true } as Input, apifyClient);
        actorsMCPServer.upsertTools(seeded);
        // The handler is registered after seeding, so seeding itself is not counted.
        actorsMCPServer.registerToolsChangedHandler(onToolsChanged);

        // Add a new Actor
        const actor = ACTOR_PYTHON_EXAMPLE;
        const newTool = await getActorsAsTools([actor], apifyClient);
        actorsMCPServer.upsertTools(newTool, true);

        // Check if the notification was received with the correct tools
        expect(toolNotificationCount).toBe(1);
        expect(latestTools.length).toBe(numberOfTools + 1);
        expect(latestTools).toContain(actor);
        expect(latestTools).toContain(addTool.name);
        // No default actors are present when only add-actor is enabled by default

        // Remove the Actor
        actorsMCPServer.removeToolsByName([actorNameToToolName(actor)], true);

        // Check if the notification was received with the correct tools
        expect(toolNotificationCount).toBe(2);
        expect(latestTools.length).toBe(numberOfTools);
        expect(latestTools).not.toContain(actor);
        expect(latestTools).toContain(addTool.name);
        // No default actors are present by default in this mode
    });

    it('should stop notifying after unregistering tools changed handler', async () => {
        let latestTools: string[] = [];
        let notificationCount = 0;
        // Size of the seeded tool set before the Actor is added (same value as in the previous test).
        const numberOfTools = 2;
        const onToolsChanged = (tools: string[]) => {
            latestTools = tools;
            notificationCount++;
        };

        const actorsMCPServer = new ActorsMcpServer({ setupSigintHandler: false, taskStore: new InMemoryTaskStore() });
        const apifyClient = new ApifyClient({ token: process.env.APIFY_TOKEN });
        const seeded = await loadToolsFromInput({ enableAddingActors: true } as Input, apifyClient);
        actorsMCPServer.upsertTools(seeded);
        actorsMCPServer.registerToolsChangedHandler(onToolsChanged);

        // Add a new Actor
        const actor = ACTOR_PYTHON_EXAMPLE;
        const newTool = await getActorsAsTools([actor], apifyClient);
        actorsMCPServer.upsertTools(newTool, true);

        // Check if the notification was received
        expect(notificationCount).toBe(1);
        expect(latestTools.length).toBe(numberOfTools + 1);
        expect(latestTools).toContain(actor);

        actorsMCPServer.unregisterToolsChangedHandler();

        // Remove the Actor
        actorsMCPServer.removeToolsByName([actorNameToToolName(actor)], true);

        // Check if the notification was NOT received
        expect(notificationCount).toBe(1);
    });
});
#!/usr/bin/env tsx
/**
 * One-time script to create Phoenix dataset from test cases.
 * Run this once to upload test cases to Phoenix platform and receive a dataset ID.
 */

import { createClient } from '@arizeai/phoenix-client';
// eslint-disable-next-line import/extensions
import { createDataset } from '@arizeai/phoenix-client/datasets';
import dotenv from 'dotenv';
import yargs from 'yargs';
// eslint-disable-next-line import/extensions
import { hideBin } from 'yargs/helpers';

import log from '@apify/log';

import { sanitizeHeaderValue, validateEnvVars } from './config.js';
import { loadTestCases, filterByCategory, filterById, type TestCase } from './evaluation-utils.js';

// Log at INFO level
log.setLevel(log.LEVELS.INFO);

/**
 * Type for command line arguments
 */
type CliArgs = {
    testCases?: string;
    category?: string;
    id?: string;
    datasetName?: string;
};

// Load environment variables from .env file if present
dotenv.config({ path: '.env' });

// Parse command line arguments using yargs
const argv = yargs(hideBin(process.argv))
    .wrap(null) // Disable automatic wrapping to avoid issues with long lines
    .usage('Usage: $0 [options]')
    .env()
    .option('test-cases', {
        type: 'string',
        describe: 'Path to test cases JSON file',
        default: 'test-cases.json',
        example: 'custom-test-cases.json',
    })
    .option('category', {
        type: 'string',
        describe: 'Filter test cases by category. Supports wildcards with * (e.g., search-actors, search-actors-*)',
        example: 'search-actors',
    })
    .option('id', {
        type: 'string',
        describe: 'Filter test cases by ID using regex pattern',
        example: 'instagram.*',
    })
    .option('dataset-name', {
        type: 'string',
        describe: 'Custom dataset name (overrides auto-generated name)',
        example: 'my_custom_dataset',
    })
    .help('help')
    .alias('h', 'help')
    .version(false)
    .epilogue('Examples:')
    .epilogue(' $0 # Use defaults')
    .epilogue(' $0 --test-cases custom.json # Use custom test cases file')
    .epilogue(' $0 --category search-actors # Filter by exact category')
    .epilogue(' $0 --category search-actors-* # Filter by wildcard pattern')
    .epilogue(' $0 --id instagram.* # Filter by ID regex pattern')
    .epilogue(' $0 --dataset-name my_dataset # Custom dataset name')
    .epilogue(' $0 --test-cases custom.json --category search-actors')
    .parseSync() as CliArgs;

/**
 * Uploads the given test cases to Phoenix as a new dataset.
 *
 * Exits the process with code 1 when the environment validation fails or the upload throws.
 * An error whose message contains '409' is reported as "dataset already exists".
 *
 * @param testCases - Test cases to convert into Phoenix examples
 * @param datasetName - Name of the dataset to create
 * @param version - Test-case version, used in the dataset description
 */
async function createDatasetFromTestCases(
    testCases: TestCase[],
    datasetName: string,
    version: string,
): Promise<void> {
    log.info('Creating Phoenix dataset from test cases...');

    // Validate environment variables
    if (!validateEnvVars()) {
        process.exit(1);
    }

    log.info(`Loaded ${testCases.length} test cases`);

    // Convert to format expected by Phoenix
    const examples = testCases.map((testCase) => ({
        input: { query: testCase.query, context: testCase.context || '' },
        output: { expectedTools: testCase.expectedTools?.join(', '), reference: testCase.reference || '' },
        metadata: { category: testCase.category },
    }));

    // Initialize Phoenix client
    const client = createClient({
        options: {
            baseUrl: process.env.PHOENIX_BASE_URL!,
            headers: { Authorization: `Bearer ${sanitizeHeaderValue(process.env.PHOENIX_API_KEY)}` },
        },
    });

    log.info(`Uploading dataset '${datasetName}' to Phoenix...`);

    try {
        const { datasetId } = await createDataset({
            client,
            name: datasetName,
            description: `MCP server dataset: version ${version}`,
            examples,
        });

        log.info(`Dataset '${datasetName}' created with ID: ${datasetId}`);
    } catch (error) {
        if (error instanceof Error && error.message.includes('409')) {
            log.error(`❌ Dataset '${datasetName}' already exists in Phoenix!`);
            log.error('');
            log.error('💡 Solutions:');
            log.error(' 1. Use --dataset-name to specify a different name:');
            log.error(` tsx create-dataset.ts --dataset-name ${datasetName}_v2`);
            log.error(` npm run evals:create-dataset -- --dataset-name ${datasetName}_v2`);
            log.error(' 2. Delete the existing dataset from Phoenix dashboard first');
            log.error('');
            log.error(`📋 Technical details: ${error.message}`);
        } else {
            log.error(`Error creating dataset: ${error}`);
        }
        process.exit(1);
    }
}

/**
 * Script entry point: loads the test cases, applies the optional category and ID filters,
 * resolves the dataset name and creates the dataset.
 *
 * Any error thrown along the way is logged and the process exits with code 1.
 */
async function main(): Promise<void> {
    try {
        // Load test cases from specified file

        const testData = loadTestCases(argv.testCases || 'test-cases.json');
        let { testCases } = testData;

        // Apply category filter if specified
        if (argv.category) {
            testCases = filterByCategory(testCases, argv.category);
            log.info(`Filtered to ${testCases.length} test cases in category '${argv.category}'`);
        }

        // Apply ID filter if specified
        if (argv.id) {
            testCases = filterById(testCases, argv.id);
            log.info(`Filtered to ${testCases.length} test cases matching ID pattern '${argv.id}'`);
        }

        // Determine dataset name
        const datasetName = argv.datasetName || `mcp_server_dataset_v${testData.version}`;

        // Create dataset
        await createDatasetFromTestCases(testCases, datasetName, testData.version);
    } catch (error) {
        log.error('Unexpected error:', { error });
        process.exit(1);
    }
}

// Run
main()
    .then(() => process.exit())
    .catch((err) => {
        log.error('Unexpected error:', err);
        process.exit(1);
    });

--------------------------------------------------------------------------------
/src/utils/apify-docs.ts:
--------------------------------------------------------------------------------
/**
 * Utilities for searching Apify documentation using Algolia.
 *
 * Provides a function to query the Apify docs via Algolia's search API and return structured results.
 *
 * @module utils/apify-docs
 */
import { algoliasearch } from 'algoliasearch';

import log from '@apify/log';

import { DOCS_SOURCES } from '../const.js';
import { searchApifyDocsCache } from '../state.js';
import type { ApifyDocsSearchResult } from '../types.js';

/**
 * Pool of Algolia search clients, keyed by app ID to handle multiple Algolia accounts.
 */
const clientPool: Record<string, ReturnType<typeof algoliasearch>> = {};

/**
 * Returns the pooled Algolia client for the given app ID, creating it on first use.
 *
 * The pool is keyed by `appId` only, so `apiKey` is used solely when the client
 * for that app ID is first created.
 */
function getAlgoliaClient(appId: string, apiKey: string) {
    if (!clientPool[appId]) {
        clientPool[appId] = algoliasearch(appId, apiKey);
    }
    return clientPool[appId];
}

/**
 * Represents a single search hit from Algolia's response.
 */
type AlgoliaResultHit = {
    url_without_anchor?: string;
    anchor?: string;
    content?: string | null;
    type?: string;
    // NOTE(review): the type arguments were missing here; `string | null` values are assumed - confirm.
    hierarchy?: Record<string, string | null>;
};

/**
 * Represents a single Algolia search result containing hits.
 */
type AlgoliaResult = {
    hits?: AlgoliaResultHit[];
};

/**
 * Builds an Algolia search request with conditional filters based on documentation source configuration.
/**
 * Builds an Algolia search request with conditional filters based on documentation source configuration.
 *
 * @param indexConfig - The documentation source configuration from DOCS_SOURCES
 * @param query - The search query string; surrounding whitespace is trimmed
 * @returns Algolia search request object with index name, query, and conditional filters
 */
function prepareAlgoliaRequest(
    indexConfig: (typeof DOCS_SOURCES)[number],
    query: string,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
): any {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const searchRequest: any = {
        indexName: indexConfig.indexName,
        query: query.trim(),
    };

    // Apply filters if configured
    if ('filters' in indexConfig && indexConfig.filters) {
        searchRequest.filters = indexConfig.filters;
    }

    // Apply type filter if configured (e.g., for Crawlee to filter to lvl1 pages only)
    if ('typeFilter' in indexConfig && indexConfig.typeFilter) {
        const typeFilter = `type:${indexConfig.typeFilter}`;
        // Combine with existing filters using AND, otherwise use the type filter alone
        searchRequest.filters = searchRequest.filters
            ? `${searchRequest.filters} AND ${typeFilter}`
            : typeFilter;
    }

    // Apply facet filters if configured
    if ('facetFilters' in indexConfig && indexConfig.facetFilters) {
        searchRequest.facetFilters = indexConfig.facetFilters;
    }

    return searchRequest;
}

/**
 * Processes Algolia search response and transforms hits into ApifyDocsSearchResult array.
 *
 * Hits without `url_without_anchor` are skipped. A non-blank anchor is trimmed and appended
 * as `#anchor`, so stray whitespace never ends up in the URL. `content` is included only
 * when it is truthy.
 *
 * @param results - Raw Algolia search results
 * @returns Processed search results with URL (may include anchor) and optional content
 */
function processAlgoliaResponse(results: AlgoliaResult[]): ApifyDocsSearchResult[] {
    const searchResults: ApifyDocsSearchResult[] = [];

    for (const result of results) {
        for (const hit of result.hits ?? []) {
            if (!hit.url_without_anchor) {
                continue;
            }

            // Build URL with the trimmed anchor if one is present
            const anchor = hit.anchor?.trim();
            const url = anchor ? `${hit.url_without_anchor}#${anchor}` : hit.url_without_anchor;

            searchResults.push({
                url,
                ...(hit.content ? { content: hit.content } : {}),
            });
        }
    }

    return searchResults;
}

/**
 * Searches a specific documentation source by ID using Algolia.
 *
 * @param docSource - The documentation source ID ('apify', 'crawlee-js', or 'crawlee-py').
 * @param query - The search query string.
 * @returns Array of search results with URL (may include anchor) and optional content.
 * @throws Error when `docSource` does not match any entry in DOCS_SOURCES.
 */
export async function searchDocsBySource(
    docSource: string,
    query: string,
): Promise<ApifyDocsSearchResult[]> {
    const indexConfig = DOCS_SOURCES.find((idx) => idx.id === docSource);

    if (!indexConfig) {
        throw new Error(`Unknown documentation source: ${docSource}`);
    }

    const client = getAlgoliaClient(indexConfig.appId, indexConfig.apiKey);

    const searchRequest = prepareAlgoliaRequest(indexConfig, query);
    const response = await client.search({
        requests: [searchRequest],
    });

    // Only `hits` is read from each result, so the response is viewed through the local AlgoliaResult type
    const results = response.results as unknown as AlgoliaResult[];
    const searchResults = processAlgoliaResponse(results);

    log.info(`[Algolia] Search completed successfully. Found ${searchResults.length} results for "${docSource}"`);
    return searchResults;
}

/**
 * Searches a documentation source with caching.
 *
 * The cache key is the source ID plus the trimmed, lower-cased query.
 *
 * @param docSource - The documentation source ID ('apify', 'crawlee-js', or 'crawlee-py').
 * @param query - The search query string.
 * @returns Array of search results with URL (may include anchor) and optional content.
 */
export async function searchDocsBySourceCached(
    docSource: string,
    query: string,
): Promise<ApifyDocsSearchResult[]> {
    const cacheKey = `${docSource}::${query.trim().toLowerCase()}`;
    const cachedResults = searchApifyDocsCache.get(cacheKey);
    if (cachedResults) {
        log.debug(`[Algolia] Cache hit for key: "${cacheKey}". Returning ${cachedResults.length} cached results`);
        return cachedResults;
    }

    log.debug(`[Algolia] Cache miss for key: "${cacheKey}". Executing search...`);
    const results = await searchDocsBySource(docSource, query);
    searchApifyDocsCache.set(cacheKey, results);
    return results;
}