├── .eslintrc.json ├── .github └── workflows │ ├── ci.yml │ ├── image.yml │ └── publish.yml ├── .gitignore ├── .prettierrc ├── CHANGELOG.md ├── Dockerfile ├── Dockerfile.service ├── LICENSE ├── README.md ├── jest.config.js ├── jest.setup.ts ├── package-lock.json ├── package.json ├── pnpm-lock.yaml ├── smithery.yaml ├── src ├── index.test.ts └── index.ts └── tsconfig.json /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "parser": "@typescript-eslint/parser", 3 | "plugins": ["@typescript-eslint"], 4 | "extends": [ 5 | "eslint:recommended", 6 | "plugin:@typescript-eslint/recommended", 7 | "prettier" 8 | ], 9 | "env": { 10 | "node": true, 11 | "es2022": true 12 | }, 13 | "parserOptions": { 14 | "ecmaVersion": 2022, 15 | "sourceType": "module", 16 | "project": "./tsconfig.json" 17 | }, 18 | "rules": { 19 | "@typescript-eslint/explicit-function-return-type": "off", 20 | "@typescript-eslint/no-explicit-any": "off", 21 | "@typescript-eslint/no-unused-vars": [ 22 | "error", 23 | { "argsIgnorePattern": "^_" } 24 | ] 25 | }, 26 | "overrides": [ 27 | { 28 | "files": ["**/*.test.ts"], 29 | "rules": { 30 | "@typescript-eslint/no-unused-vars": "off", 31 | "@typescript-eslint/no-explicit-any": "off" 32 | } 33 | } 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Use Node.js 17 | uses: actions/setup-node@v3 18 | with: 19 | node-version: '20.x' 20 | cache: 'npm' 21 | 22 | - name: Install dependencies 23 | run: npm ci 24 | 25 | - name: Build 26 | run: npm run build 27 | 28 | - name: Lint 29 | run: npm run lint 30 | 31 | - name: Test 32 | run: npm test 33 | -------------------------------------------------------------------------------- /.github/workflows/image.yml: -------------------------------------------------------------------------------- 1 | name: Deploy Images to GHCR 2 | 3 | env: 4 | DOTNET_VERSION: '6.0.x' 5 | 6 | on: 7 | push: 8 | branches: 9 | - main 10 | workflow_dispatch: 11 | 12 | jobs: 13 | push-image: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: 'Checkout GitHub Action' 17 | uses: actions/checkout@main 18 | 19 | - name: 'Login to GitHub Container Registry' 20 | uses: docker/login-action@v1 21 | with: 22 | registry: ghcr.io 23 | username: ${{github.actor}} 24 | password: ${{secrets.GITHUB_TOKEN}} 25 | 26 | - name: 'Set up Docker Buildx' 27 | uses: docker/setup-buildx-action@v1 28 | 29 | - name: 'Build Service Image' 30 | uses: docker/build-push-action@v2 31 | with: 32 | context: . 
33 | file: ./Dockerfile 34 | push: true 35 | tags: ghcr.io/mendableai/firecrawl-mcp-server:latest 36 | cache-from: type=registry,ref=ghcr.io/mendableai/firecrawl-mcp-server:latest 37 | cache-to: type=inline 38 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | publish: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | 15 | - name: Use Node.js 16 | uses: actions/setup-node@v3 17 | with: 18 | node-version: '20.x' 19 | registry-url: 'https://registry.npmjs.org' 20 | 21 | - name: Install dependencies 22 | run: npm ci 23 | 24 | - name: Build 25 | run: npm run build 26 | 27 | - name: Publish to NPM 28 | run: npm publish 29 | env: 30 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | 4 | # Build 5 | dist/ 6 | 7 | # Logs 8 | logs 9 | *.log 10 | npm-debug.log* 11 | 12 | # Environment 13 | .env 14 | .env.local 15 | .env.*.local 16 | claude_desktop_config.json 17 | 18 | # IDE 19 | .idea/ 20 | .vscode/ 21 | *.swp 22 | *.swo 23 | .cursorrules.md 24 | IMPLEMENTATION.md 25 | v1.2.md 26 | 27 | # OS 28 | .DS_Store 29 | Thumbs.db -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "trailingComma": "es5", 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2, 7 | "useTabs": false 8 | } 9 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [1.7.0] - 2025-03-18 4 | 5 | ### Fixed 6 | 7 | - Critical bugfix for stdio transport hanging issues with Python clients 8 | - Implemented transport-aware logging that directs logs to stderr when using stdio transport 9 | - Resolves issue #22 where Python clients would hang during initialization or tool execution 10 | - Improves compatibility with non-JavaScript MCP clients 11 | 12 | ## [1.2.4] - 2024-02-05 13 | 14 | ### Added 15 | 16 | - Environment variable support for all configuration options 17 | - Detailed configuration documentation in README 18 | 19 | ### Changed 20 | 21 | - Made retry and credit monitoring settings configurable via environment variables: 22 | - `FIRECRAWL_RETRY_MAX_ATTEMPTS` 23 | - `FIRECRAWL_RETRY_INITIAL_DELAY` 24 | - `FIRECRAWL_RETRY_MAX_DELAY` 25 | - `FIRECRAWL_RETRY_BACKOFF_FACTOR` 26 | - `FIRECRAWL_CREDIT_WARNING_THRESHOLD` 27 | - `FIRECRAWL_CREDIT_CRITICAL_THRESHOLD` 28 | - Enhanced configuration examples with detailed comments and use cases 29 | - Improved documentation for retry behavior and credit monitoring 30 | 31 | ### Documentation 32 | 33 | - Added comprehensive configuration examples for both cloud and self-hosted setups 34 | - Added detailed explanations of retry behavior with timing examples 35 | - Added credit monitoring threshold explanations 36 | - Updated Claude Desktop configuration documentation 37 | 38 | ## [1.2.3] - 2024-02-05 39 | 40 | ### Changed 41 | 42 | - Removed redundant batch configuration to rely on Firecrawl 
library's built-in functionality 43 | - Simplified batch processing logic by leveraging library's native implementation 44 | - Optimized parallel processing and rate limiting handling 45 | - Reduced code complexity and potential configuration conflicts 46 | 47 | ### Technical 48 | 49 | - Removed custom `CONFIG.batch` settings (`maxParallelOperations` and `delayBetweenRequests`) 50 | - Simplified batch operation processing to use library's built-in batch handling 51 | - Updated server startup logging to remove batch configuration references 52 | - Maintained credit usage tracking and error handling functionality 53 | 54 | ## [1.2.2] - 2025-02-05 55 | 56 | ### Fixed 57 | 58 | - Resolved unused interface warnings for ExtractParams and ExtractResponse 59 | - Improved type safety in extract operations 60 | - Fixed type casting issues in API responses 61 | 62 | ### Changed 63 | 64 | - Improved type guards for better type inference 65 | - Enhanced error messages for configuration validation 66 | 67 | ## [1.2.0] - 2024-01-03 68 | 69 | ### Added 70 | 71 | - Implemented automatic retries with exponential backoff for rate limits 72 | - Added queue system for batch operations with parallel processing 73 | - Integrated credit usage monitoring with warning thresholds 74 | - Enhanced content validation with configurable criteria 75 | - Added comprehensive logging system for operations and errors 76 | - New search tool (`firecrawl_search`) for web search with content extraction 77 | - Support for self-hosted Firecrawl instances via optional API URL configuration 78 | - New `FIRECRAWL_API_URL` environment variable 79 | - Automatic fallback to cloud API 80 | - Improved error messages for self-hosted instances 81 | 82 | ### Changed 83 | 84 | - Improved error handling for HTTP errors including 404s 85 | - Enhanced URL validation before scraping 86 | - Updated configuration with new retry and batch processing options 87 | - Optimized rate limiting with automatic backoff strategy 88 | - Improved documentation with new features and examples 89 | - Added detailed self-hosted configuration guide 90 | 91 | ### Fixed 92 | 93 | - Rate limit handling in batch operations 94 | - Error response formatting 95 | - Type definitions for response handlers 96 | - Test suite mock responses 97 | - Error handling for invalid search queries 98 | - API configuration validation 99 | 100 | ## [1.0.1] - 2023-12-03 101 | 102 | ### Added 103 | 104 | - Initial release with basic scraping functionality 105 | - Support for batch scraping 106 | - URL discovery and crawling capabilities 107 | - Rate limiting implementation 108 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile 2 | # Use a Node.js image as the base for building the application 3 | FROM node:22-alpine AS builder 4 | 5 | # Set the working directory inside the container 6 | WORKDIR /app 7 | 8 | # Copy package.json and package-lock.json to install dependencies 9 | COPY package.json package-lock.json ./ 10 | 11 | # Install dependencies (ignoring scripts to prevent running the prepare script) 12 | RUN npm install --ignore-scripts 13 | 14 | # Copy the rest of the application source code 15 | COPY . . 
16 | 17 | # Build the application using TypeScript 18 | RUN npm run build 19 | 20 | # Use a smaller Node.js image for the final image 21 | FROM node:22-slim AS release 22 | 23 | # Set the working directory inside the container 24 | WORKDIR /app 25 | 26 | # Copy the built application from the builder stage 27 | COPY --from=builder /app/dist /app/dist 28 | COPY --from=builder /app/package.json /app/package.json 29 | COPY --from=builder /app/package-lock.json /app/package-lock.json 30 | 31 | # Install only production dependencies 32 | RUN npm ci --omit=dev --ignore-scripts 33 | 34 | # Set environment variables for API key and custom API URL if needed 35 | 36 | 37 | # Specify the command to run the application 38 | ENTRYPOINT ["node", "dist/index.js"] 39 | -------------------------------------------------------------------------------- /Dockerfile.service: -------------------------------------------------------------------------------- 1 | FROM node:22-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY package.json package-lock.json* ./ 6 | 7 | 8 | COPY tsconfig.json ./ 9 | COPY src ./src 10 | RUN npm run build 11 | 12 | ENV CLOUD_SERVICE=true 13 | ENV PORT=3000 14 | 15 | # Expose the port 16 | EXPOSE 3000 17 | 18 | # Run the server 19 | CMD ["node", "dist/index.js"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 vrknetha 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Firecrawl MCP Server 2 | 3 | A Model Context Protocol (MCP) server implementation that integrates with [Firecrawl](https://github.com/mendableai/firecrawl) for web scraping capabilities. 4 | 5 | > Big thanks to [@vrknetha](https://github.com/vrknetha), [@knacklabs](https://www.knacklabs.ai) for the initial implementation! 6 | 7 | ## Features 8 | 9 | - Web scraping, crawling, and discovery 10 | - Search and content extraction 11 | - Deep research and batch scraping 12 | - Automatic retries and rate limiting 13 | - Cloud and self-hosted support 14 | - SSE support 15 | 16 | > Play around with [our MCP Server on MCP.so's playground](https://mcp.so/playground?server=firecrawl-mcp-server) or on [Klavis AI](https://www.klavis.ai/mcp-servers). 
17 | 18 | ## Installation 19 | 20 | ### Running with npx 21 | 22 | ```bash 23 | env FIRECRAWL_API_KEY=fc-YOUR_API_KEY npx -y firecrawl-mcp 24 | ``` 25 | 26 | ### Manual Installation 27 | 28 | ```bash 29 | npm install -g firecrawl-mcp 30 | ``` 31 | 32 | ### Running on Cursor 33 | 34 | Configuring Cursor 🖥️ 35 | Note: Requires Cursor version 0.45.6+ 36 | For the most up-to-date configuration instructions, please refer to the official Cursor documentation on configuring MCP servers: 37 | [Cursor MCP Server Configuration Guide](https://docs.cursor.com/context/model-context-protocol#configuring-mcp-servers) 38 | 39 | To configure Firecrawl MCP in Cursor **v0.48.6** 40 | 41 | 1. Open Cursor Settings 42 | 2. Go to Features > MCP Servers 43 | 3. Click "+ Add new global MCP server" 44 | 4. Enter the following code: 45 | ```json 46 | { 47 | "mcpServers": { 48 | "firecrawl-mcp": { 49 | "command": "npx", 50 | "args": ["-y", "firecrawl-mcp"], 51 | "env": { 52 | "FIRECRAWL_API_KEY": "YOUR-API-KEY" 53 | } 54 | } 55 | } 56 | } 57 | ``` 58 | 59 | To configure Firecrawl MCP in Cursor **v0.45.6** 60 | 61 | 1. Open Cursor Settings 62 | 2. Go to Features > MCP Servers 63 | 3. Click "+ Add New MCP Server" 64 | 4. Enter the following: 65 | - Name: "firecrawl-mcp" (or your preferred name) 66 | - Type: "command" 67 | - Command: `env FIRECRAWL_API_KEY=your-api-key npx -y firecrawl-mcp` 68 | 69 | 70 | 71 | > If you are using Windows and are running into issues, try `cmd /c "set FIRECRAWL_API_KEY=your-api-key && npx -y firecrawl-mcp"` 72 | 73 | Replace `your-api-key` with your Firecrawl API key. If you don't have one yet, you can create an account and get it from https://www.firecrawl.dev/app/api-keys 74 | 75 | After adding, refresh the MCP server list to see the new tools. The Composer Agent will automatically use Firecrawl MCP when appropriate, but you can explicitly request it by describing your web scraping needs. Access the Composer via Command+L (Mac), select "Agent" next to the submit button, and enter your query. 76 | 77 | ### Running on Windsurf 78 | 79 | Add this to your `./codeium/windsurf/model_config.json`: 80 | 81 | ```json 82 | { 83 | "mcpServers": { 84 | "mcp-server-firecrawl": { 85 | "command": "npx", 86 | "args": ["-y", "firecrawl-mcp"], 87 | "env": { 88 | "FIRECRAWL_API_KEY": "YOUR_API_KEY" 89 | } 90 | } 91 | } 92 | } 93 | ``` 94 | 95 | ### Running with SSE Local Mode 96 | 97 | To run the server using Server-Sent Events (SSE) locally instead of the default stdio transport: 98 | 99 | ```bash 100 | env SSE_LOCAL=true FIRECRAWL_API_KEY=fc-YOUR_API_KEY npx -y firecrawl-mcp 101 | ``` 102 | 103 | Use the url: http://localhost:3000/sse 104 | 105 | ### Installing via Smithery (Legacy) 106 | 107 | To install Firecrawl for Claude Desktop automatically via [Smithery](https://smithery.ai/server/@mendableai/mcp-server-firecrawl): 108 | 109 | ```bash 110 | npx -y @smithery/cli install @mendableai/mcp-server-firecrawl --client claude 111 | ``` 112 | 113 | ### Running on VS Code 114 | 115 | For one-click installation, click one of the install buttons below... 
116 | 117 | [![Install with NPX in VS Code](https://img.shields.io/badge/VS_Code-NPM-0098FF?style=flat-square&logo=visualstudiocode&logoColor=white)](https://insiders.vscode.dev/redirect/mcp/install?name=firecrawl&inputs=%5B%7B%22type%22%3A%22promptString%22%2C%22id%22%3A%22apiKey%22%2C%22description%22%3A%22Firecrawl%20API%20Key%22%2C%22password%22%3Atrue%7D%5D&config=%7B%22command%22%3A%22npx%22%2C%22args%22%3A%5B%22-y%22%2C%22firecrawl-mcp%22%5D%2C%22env%22%3A%7B%22FIRECRAWL_API_KEY%22%3A%22%24%7Binput%3AapiKey%7D%22%7D%7D) [![Install with NPX in VS Code Insiders](https://img.shields.io/badge/VS_Code_Insiders-NPM-24bfa5?style=flat-square&logo=visualstudiocode&logoColor=white)](https://insiders.vscode.dev/redirect/mcp/install?name=firecrawl&inputs=%5B%7B%22type%22%3A%22promptString%22%2C%22id%22%3A%22apiKey%22%2C%22description%22%3A%22Firecrawl%20API%20Key%22%2C%22password%22%3Atrue%7D%5D&config=%7B%22command%22%3A%22npx%22%2C%22args%22%3A%5B%22-y%22%2C%22firecrawl-mcp%22%5D%2C%22env%22%3A%7B%22FIRECRAWL_API_KEY%22%3A%22%24%7Binput%3AapiKey%7D%22%7D%7D&quality=insiders) 118 | 119 | For manual installation, add the following JSON block to your User Settings (JSON) file in VS Code. You can do this by pressing `Ctrl + Shift + P` and typing `Preferences: Open User Settings (JSON)`. 120 | 121 | ```json 122 | { 123 | "mcp": { 124 | "inputs": [ 125 | { 126 | "type": "promptString", 127 | "id": "apiKey", 128 | "description": "Firecrawl API Key", 129 | "password": true 130 | } 131 | ], 132 | "servers": { 133 | "firecrawl": { 134 | "command": "npx", 135 | "args": ["-y", "firecrawl-mcp"], 136 | "env": { 137 | "FIRECRAWL_API_KEY": "${input:apiKey}" 138 | } 139 | } 140 | } 141 | } 142 | } 143 | ``` 144 | 145 | Optionally, you can add it to a file called `.vscode/mcp.json` in your workspace. 
This will allow you to share the configuration with others: 146 | 147 | ```json 148 | { 149 | "inputs": [ 150 | { 151 | "type": "promptString", 152 | "id": "apiKey", 153 | "description": "Firecrawl API Key", 154 | "password": true 155 | } 156 | ], 157 | "servers": { 158 | "firecrawl": { 159 | "command": "npx", 160 | "args": ["-y", "firecrawl-mcp"], 161 | "env": { 162 | "FIRECRAWL_API_KEY": "${input:apiKey}" 163 | } 164 | } 165 | } 166 | } 167 | ``` 168 | 169 | ## Configuration 170 | 171 | ### Environment Variables 172 | 173 | #### Required for Cloud API 174 | 175 | - `FIRECRAWL_API_KEY`: Your Firecrawl API key 176 | - Required when using cloud API (default) 177 | - Optional when using self-hosted instance with `FIRECRAWL_API_URL` 178 | - `FIRECRAWL_API_URL` (Optional): Custom API endpoint for self-hosted instances 179 | - Example: `https://firecrawl.your-domain.com` 180 | - If not provided, the cloud API will be used (requires API key) 181 | 182 | #### Optional Configuration 183 | 184 | ##### Retry Configuration 185 | 186 | - `FIRECRAWL_RETRY_MAX_ATTEMPTS`: Maximum number of retry attempts (default: 3) 187 | - `FIRECRAWL_RETRY_INITIAL_DELAY`: Initial delay in milliseconds before first retry (default: 1000) 188 | - `FIRECRAWL_RETRY_MAX_DELAY`: Maximum delay in milliseconds between retries (default: 10000) 189 | - `FIRECRAWL_RETRY_BACKOFF_FACTOR`: Exponential backoff multiplier (default: 2) 190 | 191 | ##### Credit Usage Monitoring 192 | 193 | - `FIRECRAWL_CREDIT_WARNING_THRESHOLD`: Credit usage warning threshold (default: 1000) 194 | - `FIRECRAWL_CREDIT_CRITICAL_THRESHOLD`: Credit usage critical threshold (default: 100) 195 | 196 | ### Configuration Examples 197 | 198 | For cloud API usage with custom retry and credit monitoring: 199 | 200 | ```bash 201 | # Required for cloud API 202 | export FIRECRAWL_API_KEY=your-api-key 203 | 204 | # Optional retry configuration 205 | export FIRECRAWL_RETRY_MAX_ATTEMPTS=5 # Increase max retry attempts 206 | export FIRECRAWL_RETRY_INITIAL_DELAY=2000 # Start with 2s delay 207 | export FIRECRAWL_RETRY_MAX_DELAY=30000 # Maximum 30s delay 208 | export FIRECRAWL_RETRY_BACKOFF_FACTOR=3 # More aggressive backoff 209 | 210 | # Optional credit monitoring 211 | export FIRECRAWL_CREDIT_WARNING_THRESHOLD=2000 # Warning at 2000 credits 212 | export FIRECRAWL_CREDIT_CRITICAL_THRESHOLD=500 # Critical at 500 credits 213 | ``` 214 | 215 | For self-hosted instance: 216 | 217 | ```bash 218 | # Required for self-hosted 219 | export FIRECRAWL_API_URL=https://firecrawl.your-domain.com 220 | 221 | # Optional authentication for self-hosted 222 | export FIRECRAWL_API_KEY=your-api-key # If your instance requires auth 223 | 224 | # Custom retry configuration 225 | export FIRECRAWL_RETRY_MAX_ATTEMPTS=10 226 | export FIRECRAWL_RETRY_INITIAL_DELAY=500 # Start with faster retries 227 | ``` 228 | 229 | ### Usage with Claude Desktop 230 | 231 | Add this to your `claude_desktop_config.json`: 232 | 233 | ```json 234 | { 235 | "mcpServers": { 236 | "mcp-server-firecrawl": { 237 | "command": "npx", 238 | "args": ["-y", "firecrawl-mcp"], 239 | "env": { 240 | "FIRECRAWL_API_KEY": "YOUR_API_KEY_HERE", 241 | 242 | "FIRECRAWL_RETRY_MAX_ATTEMPTS": "5", 243 | "FIRECRAWL_RETRY_INITIAL_DELAY": "2000", 244 | "FIRECRAWL_RETRY_MAX_DELAY": "30000", 245 | "FIRECRAWL_RETRY_BACKOFF_FACTOR": "3", 246 | 247 | "FIRECRAWL_CREDIT_WARNING_THRESHOLD": "2000", 248 | "FIRECRAWL_CREDIT_CRITICAL_THRESHOLD": "500" 249 | } 250 | } 251 | } 252 | } 253 | ``` 254 | 255 | ### System Configuration 256 | 257 | The server 
includes several configurable parameters that can be set via environment variables. Here are the default values if not configured: 258 | 259 | ```typescript 260 | const CONFIG = { 261 | retry: { 262 | maxAttempts: 3, // Number of retry attempts for rate-limited requests 263 | initialDelay: 1000, // Initial delay before first retry (in milliseconds) 264 | maxDelay: 10000, // Maximum delay between retries (in milliseconds) 265 | backoffFactor: 2, // Multiplier for exponential backoff 266 | }, 267 | credit: { 268 | warningThreshold: 1000, // Warn when credit usage reaches this level 269 | criticalThreshold: 100, // Critical alert when credit usage reaches this level 270 | }, 271 | }; 272 | ``` 273 | 274 | These configurations control: 275 | 276 | 1. **Retry Behavior** 277 | 278 | - Automatically retries failed requests due to rate limits 279 | - Uses exponential backoff to avoid overwhelming the API 280 | - Example: With default settings, retries will be attempted at: 281 | - 1st retry: 1 second delay 282 | - 2nd retry: 2 seconds delay 283 | - 3rd retry: 4 seconds delay (capped at maxDelay) 284 | 285 | 2. **Credit Usage Monitoring** 286 | - Tracks API credit consumption for cloud API usage 287 | - Provides warnings at specified thresholds 288 | - Helps prevent unexpected service interruption 289 | - Example: With default settings: 290 | - Warning at 1000 credits remaining 291 | - Critical alert at 100 credits remaining 292 | 293 | ### Rate Limiting and Batch Processing 294 | 295 | The server utilizes Firecrawl's built-in rate limiting and batch processing capabilities: 296 | 297 | - Automatic rate limit handling with exponential backoff 298 | - Efficient parallel processing for batch operations 299 | - Smart request queuing and throttling 300 | - Automatic retries for transient errors 301 | 302 | ## How to Choose a Tool 303 | 304 | Use this guide to select the right tool for your task: 305 | 306 | - **If you know the exact URL(s) you want:** 307 | - For one: use **scrape** 308 | - For many: use **batch_scrape** 309 | - **If you need to discover URLs on a site:** use **map** 310 | - **If you want to search the web for info:** use **search** 311 | - **If you want to extract structured data:** use **extract** 312 | - **If you want to analyze a whole site or section:** use **crawl** (with limits!) 313 | - **If you want to do in-depth research:** use **deep_research** 314 | - **If you want to generate LLMs.txt:** use **generate_llmstxt** 315 | 316 | ### Quick Reference Table 317 | 318 | | Tool | Best for | Returns | 319 | |---------------------|------------------------------------------|-----------------| 320 | | scrape | Single page content | markdown/html | 321 | | batch_scrape | Multiple known URLs | markdown/html[] | 322 | | map | Discovering URLs on a site | URL[] | 323 | | crawl | Multi-page extraction (with limits) | markdown/html[] | 324 | | search | Web search for info | results[] | 325 | | extract | Structured data from pages | JSON | 326 | | deep_research | In-depth, multi-source research | summary, sources| 327 | | generate_llmstxt | LLMs.txt for a domain | text | 328 | 329 | ## Available Tools 330 | 331 | ### 1. Scrape Tool (`firecrawl_scrape`) 332 | 333 | Scrape content from a single URL with advanced options. 334 | 335 | **Best for:** 336 | - Single page content extraction, when you know exactly which page contains the information. 
337 | 338 | **Not recommended for:** 339 | - Extracting content from multiple pages (use batch_scrape for known URLs, or map + batch_scrape to discover URLs first, or crawl for full page content) 340 | - When you're unsure which page contains the information (use search) 341 | - When you need structured data (use extract) 342 | 343 | **Common mistakes:** 344 | - Using scrape for a list of URLs (use batch_scrape instead). 345 | 346 | **Prompt Example:** 347 | > "Get the content of the page at https://example.com." 348 | 349 | **Usage Example:** 350 | ```json 351 | { 352 | "name": "firecrawl_scrape", 353 | "arguments": { 354 | "url": "https://example.com", 355 | "formats": ["markdown"], 356 | "onlyMainContent": true, 357 | "waitFor": 1000, 358 | "timeout": 30000, 359 | "mobile": false, 360 | "includeTags": ["article", "main"], 361 | "excludeTags": ["nav", "footer"], 362 | "skipTlsVerification": false 363 | } 364 | } 365 | ``` 366 | 367 | **Returns:** 368 | - Markdown, HTML, or other formats as specified. 369 | 370 | ### 2. Batch Scrape Tool (`firecrawl_batch_scrape`) 371 | 372 | Scrape multiple URLs efficiently with built-in rate limiting and parallel processing. 373 | 374 | **Best for:** 375 | - Retrieving content from multiple pages, when you know exactly which pages to scrape. 376 | 377 | **Not recommended for:** 378 | - Discovering URLs (use map first if you don't know the URLs) 379 | - Scraping a single page (use scrape) 380 | 381 | **Common mistakes:** 382 | - Using batch_scrape with too many URLs at once (may hit rate limits or token overflow) 383 | 384 | **Prompt Example:** 385 | > "Get the content of these three blog posts: [url1, url2, url3]." 386 | 387 | **Usage Example:** 388 | ```json 389 | { 390 | "name": "firecrawl_batch_scrape", 391 | "arguments": { 392 | "urls": ["https://example1.com", "https://example2.com"], 393 | "options": { 394 | "formats": ["markdown"], 395 | "onlyMainContent": true 396 | } 397 | } 398 | } 399 | ``` 400 | 401 | **Returns:** 402 | - Response includes operation ID for status checking: 403 | 404 | ```json 405 | { 406 | "content": [ 407 | { 408 | "type": "text", 409 | "text": "Batch operation queued with ID: batch_1. Use firecrawl_check_batch_status to check progress." 410 | } 411 | ], 412 | "isError": false 413 | } 414 | ``` 415 | 416 | ### 3. Check Batch Status (`firecrawl_check_batch_status`) 417 | 418 | Check the status of a batch operation. 419 | 420 | ```json 421 | { 422 | "name": "firecrawl_check_batch_status", 423 | "arguments": { 424 | "id": "batch_1" 425 | } 426 | } 427 | ``` 428 | 429 | ### 4. Map Tool (`firecrawl_map`) 430 | 431 | Map a website to discover all indexed URLs on the site. 432 | 433 | **Best for:** 434 | - Discovering URLs on a website before deciding what to scrape 435 | - Finding specific sections of a website 436 | 437 | **Not recommended for:** 438 | - When you already know which specific URL you need (use scrape or batch_scrape) 439 | - When you need the content of the pages (use scrape after mapping) 440 | 441 | **Common mistakes:** 442 | - Using crawl to discover URLs instead of map 443 | 444 | **Prompt Example:** 445 | > "List all URLs on example.com." 446 | 447 | **Usage Example:** 448 | ```json 449 | { 450 | "name": "firecrawl_map", 451 | "arguments": { 452 | "url": "https://example.com" 453 | } 454 | } 455 | ``` 456 | 457 | **Returns:** 458 | - Array of URLs found on the site 459 | 460 | ### 5. Search Tool (`firecrawl_search`) 461 | 462 | Search the web and optionally extract content from search results. 
463 | 464 | **Best for:** 465 | - Finding specific information across multiple websites, when you don't know which website has the information. 466 | - When you need the most relevant content for a query 467 | 468 | **Not recommended for:** 469 | - When you already know which website to scrape (use scrape) 470 | - When you need comprehensive coverage of a single website (use map or crawl) 471 | 472 | **Common mistakes:** 473 | - Using crawl or map for open-ended questions (use search instead) 474 | 475 | **Usage Example:** 476 | ```json 477 | { 478 | "name": "firecrawl_search", 479 | "arguments": { 480 | "query": "latest AI research papers 2023", 481 | "limit": 5, 482 | "lang": "en", 483 | "country": "us", 484 | "scrapeOptions": { 485 | "formats": ["markdown"], 486 | "onlyMainContent": true 487 | } 488 | } 489 | } 490 | ``` 491 | 492 | **Returns:** 493 | - Array of search results (with optional scraped content) 494 | 495 | **Prompt Example:** 496 | > "Find the latest research papers on AI published in 2023." 497 | 498 | ### 6. Crawl Tool (`firecrawl_crawl`) 499 | 500 | Starts an asynchronous crawl job on a website and extract content from all pages. 501 | 502 | **Best for:** 503 | - Extracting content from multiple related pages, when you need comprehensive coverage. 504 | 505 | **Not recommended for:** 506 | - Extracting content from a single page (use scrape) 507 | - When token limits are a concern (use map + batch_scrape) 508 | - When you need fast results (crawling can be slow) 509 | 510 | **Warning:** Crawl responses can be very large and may exceed token limits. Limit the crawl depth and number of pages, or use map + batch_scrape for better control. 511 | 512 | **Common mistakes:** 513 | - Setting limit or maxDepth too high (causes token overflow) 514 | - Using crawl for a single page (use scrape instead) 515 | 516 | **Prompt Example:** 517 | > "Get all blog posts from the first two levels of example.com/blog." 518 | 519 | **Usage Example:** 520 | ```json 521 | { 522 | "name": "firecrawl_crawl", 523 | "arguments": { 524 | "url": "https://example.com/blog/*", 525 | "maxDepth": 2, 526 | "limit": 100, 527 | "allowExternalLinks": false, 528 | "deduplicateSimilarURLs": true 529 | } 530 | } 531 | ``` 532 | 533 | **Returns:** 534 | - Response includes operation ID for status checking: 535 | 536 | ```json 537 | { 538 | "content": [ 539 | { 540 | "type": "text", 541 | "text": "Started crawl for: https://example.com/* with job ID: 550e8400-e29b-41d4-a716-446655440000. Use firecrawl_check_crawl_status to check progress." 542 | } 543 | ], 544 | "isError": false 545 | } 546 | ``` 547 | 548 | ### 7. Check Crawl Status (`firecrawl_check_crawl_status`) 549 | 550 | Check the status of a crawl job. 551 | 552 | ```json 553 | { 554 | "name": "firecrawl_check_crawl_status", 555 | "arguments": { 556 | "id": "550e8400-e29b-41d4-a716-446655440000" 557 | } 558 | } 559 | ``` 560 | 561 | **Returns:** 562 | - Response includes the status of the crawl job: 563 | 564 | ### 8. Extract Tool (`firecrawl_extract`) 565 | 566 | Extract structured information from web pages using LLM capabilities. Supports both cloud AI and self-hosted LLM extraction. 567 | 568 | **Best for:** 569 | - Extracting specific structured data like prices, names, details. 
570 | 571 | **Not recommended for:** 572 | - When you need the full content of a page (use scrape) 573 | - When you're not looking for specific structured data 574 | 575 | **Arguments:** 576 | - `urls`: Array of URLs to extract information from 577 | - `prompt`: Custom prompt for the LLM extraction 578 | - `systemPrompt`: System prompt to guide the LLM 579 | - `schema`: JSON schema for structured data extraction 580 | - `allowExternalLinks`: Allow extraction from external links 581 | - `enableWebSearch`: Enable web search for additional context 582 | - `includeSubdomains`: Include subdomains in extraction 583 | 584 | When using a self-hosted instance, the extraction will use your configured LLM. For cloud API, it uses Firecrawl's managed LLM service. 585 | **Prompt Example:** 586 | > "Extract the product name, price, and description from these product pages." 587 | 588 | **Usage Example:** 589 | ```json 590 | { 591 | "name": "firecrawl_extract", 592 | "arguments": { 593 | "urls": ["https://example.com/page1", "https://example.com/page2"], 594 | "prompt": "Extract product information including name, price, and description", 595 | "systemPrompt": "You are a helpful assistant that extracts product information", 596 | "schema": { 597 | "type": "object", 598 | "properties": { 599 | "name": { "type": "string" }, 600 | "price": { "type": "number" }, 601 | "description": { "type": "string" } 602 | }, 603 | "required": ["name", "price"] 604 | }, 605 | "allowExternalLinks": false, 606 | "enableWebSearch": false, 607 | "includeSubdomains": false 608 | } 609 | } 610 | ``` 611 | 612 | **Returns:** 613 | - Extracted structured data as defined by your schema 614 | 615 | ```json 616 | { 617 | "content": [ 618 | { 619 | "type": "text", 620 | "text": { 621 | "name": "Example Product", 622 | "price": 99.99, 623 | "description": "This is an example product description" 624 | } 625 | } 626 | ], 627 | "isError": false 628 | } 629 | ``` 630 | 631 | ### 9. Deep Research Tool (`firecrawl_deep_research`) 632 | 633 | Conduct deep web research on a query using intelligent crawling, search, and LLM analysis. 634 | 635 | **Best for:** 636 | - Complex research questions requiring multiple sources, in-depth analysis. 637 | 638 | **Not recommended for:** 639 | - Simple questions that can be answered with a single search 640 | - When you need very specific information from a known page (use scrape) 641 | - When you need results quickly (deep research can take time) 642 | 643 | **Arguments:** 644 | - query (string, required): The research question or topic to explore. 645 | - maxDepth (number, optional): Maximum recursive depth for crawling/search (default: 3). 646 | - timeLimit (number, optional): Time limit in seconds for the research session (default: 120). 647 | - maxUrls (number, optional): Maximum number of URLs to analyze (default: 50). 648 | 649 | **Prompt Example:** 650 | > "Research the environmental impact of electric vehicles versus gasoline vehicles." 651 | 652 | **Usage Example:** 653 | ```json 654 | { 655 | "name": "firecrawl_deep_research", 656 | "arguments": { 657 | "query": "What are the environmental impacts of electric vehicles compared to gasoline vehicles?", 658 | "maxDepth": 3, 659 | "timeLimit": 120, 660 | "maxUrls": 50 661 | } 662 | } 663 | ``` 664 | 665 | **Returns:** 666 | - Final analysis generated by an LLM based on research. (data.finalAnalysis) 667 | - May also include structured activities and sources used in the research process. 668 | 669 | ### 10. 
Generate LLMs.txt Tool (`firecrawl_generate_llmstxt`) 670 | 671 | Generate a standardized llms.txt (and optionally llms-full.txt) file for a given domain. This file defines how large language models should interact 672 | with the site. 673 | 674 | **Best for:** 675 | - Creating machine-readable permission guidelines for AI models. 676 | 677 | **Not recommended for:** 678 | - General content extraction or research 679 | 680 | **Arguments:** 681 | - url (string, required): The base URL of the website to analyze. 682 | - maxUrls (number, optional): Max number of URLs to include (default: 10). 683 | - showFullText (boolean, optional): Whether to include llms-full.txt contents in the response. 684 | 685 | **Prompt Example:** 686 | > "Generate an LLMs.txt file for example.com." 687 | 688 | **Usage Example:** 689 | ```json 690 | { 691 | "name": "firecrawl_generate_llmstxt", 692 | "arguments": { 693 | "url": "https://example.com", 694 | "maxUrls": 20, 695 | "showFullText": true 696 | } 697 | } 698 | ``` 699 | 700 | **Returns:** 701 | - LLMs.txt file contents (and optionally llms-full.txt) 702 | 703 | ## Logging System 704 | 705 | The server includes comprehensive logging: 706 | 707 | - Operation status and progress 708 | - Performance metrics 709 | - Credit usage monitoring 710 | - Rate limit tracking 711 | - Error conditions 712 | 713 | Example log messages: 714 | 715 | ``` 716 | [INFO] Firecrawl MCP Server initialized successfully 717 | [INFO] Starting scrape for URL: https://example.com 718 | [INFO] Batch operation queued with ID: batch_1 719 | [WARNING] Credit usage has reached warning threshold 720 | [ERROR] Rate limit exceeded, retrying in 2s... 721 | ``` 722 | 723 | ## Error Handling 724 | 725 | The server provides robust error handling: 726 | 727 | - Automatic retries for transient errors 728 | - Rate limit handling with backoff 729 | - Detailed error messages 730 | - Credit usage warnings 731 | - Network resilience 732 | 733 | Example error response: 734 | 735 | ```json 736 | { 737 | "content": [ 738 | { 739 | "type": "text", 740 | "text": "Error: Rate limit exceeded. Retrying in 2 seconds..." 741 | } 742 | ], 743 | "isError": true 744 | } 745 | ``` 746 | 747 | ## Development 748 | 749 | ```bash 750 | # Install dependencies 751 | npm install 752 | 753 | # Build 754 | npm run build 755 | 756 | # Run tests 757 | npm test 758 | ``` 759 | 760 | ### Contributing 761 | 762 | 1. Fork the repository 763 | 2. Create your feature branch 764 | 3. Run tests: `npm test` 765 | 4. Submit a pull request 766 | 767 | ### Thanks to contributors 768 | 769 | Thanks to [@vrknetha](https://github.com/vrknetha), [@cawstudios](https://caw.tech) for the initial implementation! 770 | 771 | Thanks to MCP.so and Klavis AI for hosting and [@gstarwd](https://github.com/gstarwd), [@xiangkaiz](https://github.com/xiangkaiz) and [@zihaolin96](https://github.com/zihaolin96) for integrating our server. 
772 | 773 | ## License 774 | 775 | MIT License - see LICENSE file for details 776 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | export default { 2 | preset: 'ts-jest/presets/default-esm', 3 | testEnvironment: 'node', 4 | extensionsToTreatAsEsm: ['.ts'], 5 | transform: { 6 | '^.+\\.tsx?$': [ 7 | 'ts-jest', 8 | { 9 | useESM: true, 10 | }, 11 | ], 12 | }, 13 | moduleNameMapper: { 14 | '^(\\.{1,2}/.*)\\.js$': '$1', 15 | }, 16 | testMatch: ['**/*.test.ts'], 17 | setupFilesAfterEnv: ['<rootDir>/jest.setup.ts'], 18 | }; 19 | -------------------------------------------------------------------------------- /jest.setup.ts: -------------------------------------------------------------------------------- 1 | import { jest } from '@jest/globals'; 2 | import FirecrawlApp from '@mendable/firecrawl-js'; 3 | import type { 4 | SearchResponse, 5 | BatchScrapeResponse, 6 | BatchScrapeStatusResponse, 7 | FirecrawlDocument, 8 | } from '@mendable/firecrawl-js'; 9 | 10 | // Set test timeout 11 | jest.setTimeout(30000); 12 | 13 | // Create mock responses 14 | const mockSearchResponse: SearchResponse = { 15 | success: true, 16 | data: [ 17 | { 18 | url: 'https://example.com', 19 | title: 'Test Page', 20 | description: 'Test Description', 21 | markdown: '# Test Content', 22 | actions: null as never, 23 | }, 24 | ] as FirecrawlDocument[], 25 | }; 26 | 27 | const mockBatchScrapeResponse: BatchScrapeResponse = { 28 | success: true, 29 | id: 'test-batch-id', 30 | }; 31 | 32 | const mockBatchStatusResponse: BatchScrapeStatusResponse = { 33 | success: true, 34 | status: 'completed', 35 | completed: 1, 36 | total: 1, 37 | creditsUsed: 1, 38 | expiresAt: new Date(), 39 | data: [ 40 | { 41 | url: 'https://example.com', 42 | title: 'Test Page', 43 | description: 'Test Description', 44 | markdown: '# Test Content', 45 | actions: null as never, 46 | }, 47 | ] as FirecrawlDocument[], 48 | }; 49 | 50 | // Create mock instance methods 51 | const mockSearch = jest.fn().mockImplementation(async () => mockSearchResponse); 52 | const mockAsyncBatchScrapeUrls = jest 53 | .fn() 54 | .mockImplementation(async () => mockBatchScrapeResponse); 55 | const mockCheckBatchScrapeStatus = jest 56 | .fn() 57 | .mockImplementation(async () => mockBatchStatusResponse); 58 | 59 | // Create mock instance 60 | const mockInstance = { 61 | apiKey: 'test-api-key', 62 | apiUrl: 'test-api-url', 63 | search: mockSearch, 64 | asyncBatchScrapeUrls: mockAsyncBatchScrapeUrls, 65 | checkBatchScrapeStatus: mockCheckBatchScrapeStatus, 66 | }; 67 | 68 | // Mock the module 69 | jest.mock('@mendable/firecrawl-js', () => ({ 70 | __esModule: true, 71 | default: jest.fn().mockImplementation(() => mockInstance), 72 | })); 73 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "firecrawl-mcp", 3 | "version": "1.9.0", 4 | "description": "MCP server for Firecrawl web scraping integration. Supports both cloud and self-hosted instances.
Features include web scraping, batch processing, structured data extraction, and LLM-powered content analysis.", 5 | "type": "module", 6 | "bin": { 7 | "firecrawl-mcp": "dist/index.js" 8 | }, 9 | "files": [ 10 | "dist" 11 | ], 12 | "publishConfig": { 13 | "access": "public" 14 | }, 15 | "scripts": { 16 | "build": "tsc && node -e \"require('fs').chmodSync('dist/index.js', '755')\"", 17 | "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js", 18 | "start": "node dist/index.js", 19 | "lint": "eslint src/**/*.ts", 20 | "lint:fix": "eslint src/**/*.ts --fix", 21 | "format": "prettier --write .", 22 | "prepare": "npm run build", 23 | "publish": "npm run build && npm publish" 24 | }, 25 | "license": "MIT", 26 | "dependencies": { 27 | "@mendable/firecrawl-js": "^1.19.0", 28 | "@modelcontextprotocol/sdk": "^1.4.1", 29 | "dotenv": "^16.4.7", 30 | "express": "^5.1.0", 31 | "shx": "^0.3.4", 32 | "ws": "^8.18.1" 33 | }, 34 | "devDependencies": { 35 | "@jest/globals": "^29.7.0", 36 | "@types/express": "^5.0.1", 37 | "@types/jest": "^29.5.14", 38 | "@types/node": "^20.10.5", 39 | "@typescript-eslint/eslint-plugin": "^7.0.0", 40 | "@typescript-eslint/parser": "^7.0.0", 41 | "eslint": "^8.56.0", 42 | "eslint-config-prettier": "^9.1.0", 43 | "jest": "^29.7.0", 44 | "jest-mock-extended": "^4.0.0-beta1", 45 | "prettier": "^3.1.1", 46 | "ts-jest": "^29.1.1", 47 | "typescript": "^5.3.3" 48 | }, 49 | "engines": { 50 | "node": ">=18.0.0" 51 | }, 52 | "keywords": [ 53 | "mcp", 54 | "firecrawl", 55 | "web-scraping", 56 | "crawler", 57 | "content-extraction" 58 | ], 59 | "repository": { 60 | "type": "git", 61 | "url": "git+https://github.com/mendableai/firecrawl-mcp-server.git" 62 | }, 63 | "author": "vrknetha", 64 | "bugs": { 65 | "url": "https://github.com/mendableai/firecrawl-mcp-server/issues" 66 | }, 67 | "homepage": "https://github.com/mendableai/firecrawl-mcp-server#readme" 68 | } 69 | -------------------------------------------------------------------------------- /smithery.yaml: -------------------------------------------------------------------------------- 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml 2 | 3 | startCommand: 4 | type: stdio 5 | configSchema: 6 | # JSON Schema defining the configuration options for the MCP. 7 | type: object 8 | required: 9 | - fireCrawlApiKey 10 | properties: 11 | fireCrawlApiKey: 12 | type: string 13 | description: Your Firecrawl API key. Required for cloud API usage. 14 | fireCrawlApiUrl: 15 | type: string 16 | description: 17 | Custom API endpoint for self-hosted instances. If provided, API key 18 | becomes optional. 19 | commandFunction: 20 | # A function that produces the CLI command to start the MCP on stdio. 
21 | |- 22 | (config) => ({ command: 'node', args: ['dist/index.js'], env: { FIRECRAWL_API_KEY: config.fireCrawlApiKey, FIRECRAWL_API_URL: config.fireCrawlApiUrl || '' } }) 23 | -------------------------------------------------------------------------------- /src/index.test.ts: -------------------------------------------------------------------------------- 1 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 2 | import { CallToolRequestSchema } from '@modelcontextprotocol/sdk/types.js'; 3 | import FirecrawlApp from '@mendable/firecrawl-js'; 4 | import type { 5 | SearchResponse, 6 | BatchScrapeResponse, 7 | BatchScrapeStatusResponse, 8 | CrawlResponse, 9 | CrawlStatusResponse, 10 | ScrapeResponse, 11 | FirecrawlDocument, 12 | SearchParams, 13 | } from '@mendable/firecrawl-js'; 14 | import { 15 | describe, 16 | expect, 17 | jest, 18 | test, 19 | beforeEach, 20 | afterEach, 21 | } from '@jest/globals'; 22 | import { mock, MockProxy } from 'jest-mock-extended'; 23 | 24 | // Mock FirecrawlApp 25 | jest.mock('@mendable/firecrawl-js'); 26 | 27 | // Test interfaces 28 | interface RequestParams { 29 | method: string; 30 | params: { 31 | name: string; 32 | arguments?: Record<string, unknown>; 33 | }; 34 | } 35 | 36 | interface BatchScrapeArgs { 37 | urls: string[]; 38 | options?: { 39 | formats?: string[]; 40 | [key: string]: any; 41 | }; 42 | } 43 | 44 | interface StatusCheckArgs { 45 | id: string; 46 | } 47 | 48 | interface SearchArgs { 49 | query: string; 50 | scrapeOptions?: { 51 | formats?: string[]; 52 | onlyMainContent?: boolean; 53 | }; 54 | } 55 | 56 | interface ScrapeArgs { 57 | url: string; 58 | formats?: string[]; 59 | onlyMainContent?: boolean; 60 | } 61 | 62 | interface CrawlArgs { 63 | url: string; 64 | maxDepth?: number; 65 | limit?: number; 66 | } 67 | 68 | // Mock client interface 69 | interface MockFirecrawlClient { 70 | scrapeUrl(url: string, options?: any): Promise<ScrapeResponse>; 71 | search(query: string, params?: SearchParams): Promise<SearchResponse>; 72 | asyncBatchScrapeUrls( 73 | urls: string[], 74 | options?: any 75 | ): Promise<BatchScrapeResponse>; 76 | checkBatchScrapeStatus(id: string): Promise<BatchScrapeStatusResponse>; 77 | asyncCrawlUrl(url: string, options?: any): Promise<CrawlResponse>; 78 | checkCrawlStatus(id: string): Promise<CrawlStatusResponse>; 79 | mapUrl(url: string, options?: any): Promise<{ links: string[] }>; 80 | } 81 | 82 | describe('Firecrawl Tool Tests', () => { 83 | let mockClient: MockProxy<MockFirecrawlClient>; 84 | let requestHandler: (request: RequestParams) => Promise<any>; 85 | 86 | beforeEach(() => { 87 | jest.clearAllMocks(); 88 | mockClient = mock<MockFirecrawlClient>(); 89 | 90 | // Set up mock implementations 91 | const mockInstance = new FirecrawlApp({ apiKey: 'test' }); 92 | Object.assign(mockInstance, mockClient); 93 | 94 | // Create request handler 95 | requestHandler = async (request: RequestParams) => { 96 | const { name, arguments: args } = request.params; 97 | if (!args) { 98 | throw new Error('No arguments provided'); 99 | } 100 | return handleRequest(name, args, mockClient); 101 | }; 102 | }); 103 | 104 | afterEach(() => { 105 | jest.clearAllMocks(); 106 | }); 107 | 108 | // Test scrape functionality 109 | test('should handle scrape request', async () => { 110 | const url = 'https://example.com'; 111 | const options = { formats: ['markdown'] }; 112 | 113 | const mockResponse: ScrapeResponse = { 114 | success: true, 115 | markdown: '# Test Content', 116 | html: undefined, 117 | rawHtml: undefined, 118 | url: 'https://example.com', 119 | actions: undefined as never, 120 | }; 121 | 122 | mockClient.scrapeUrl.mockResolvedValueOnce(mockResponse); 123 | 124 | const response = await
requestHandler({ 125 | method: 'call_tool', 126 | params: { 127 | name: 'firecrawl_scrape', 128 | arguments: { url, ...options }, 129 | }, 130 | }); 131 | 132 | expect(response).toEqual({ 133 | content: [{ type: 'text', text: '# Test Content' }], 134 | isError: false, 135 | }); 136 | expect(mockClient.scrapeUrl).toHaveBeenCalledWith(url, { 137 | formats: ['markdown'], 138 | url, 139 | }); 140 | }); 141 | 142 | // Test batch scrape functionality 143 | test('should handle batch scrape request', async () => { 144 | const urls = ['https://example.com']; 145 | const options = { formats: ['markdown'] }; 146 | 147 | mockClient.asyncBatchScrapeUrls.mockResolvedValueOnce({ 148 | success: true, 149 | id: 'test-batch-id', 150 | }); 151 | 152 | const response = await requestHandler({ 153 | method: 'call_tool', 154 | params: { 155 | name: 'firecrawl_batch_scrape', 156 | arguments: { urls, options }, 157 | }, 158 | }); 159 | 160 | expect(response.content[0].text).toContain( 161 | 'Batch operation queued with ID: batch_' 162 | ); 163 | expect(mockClient.asyncBatchScrapeUrls).toHaveBeenCalledWith(urls, options); 164 | }); 165 | 166 | // Test search functionality 167 | test('should handle search request', async () => { 168 | const query = 'test query'; 169 | const scrapeOptions = { formats: ['markdown'] }; 170 | 171 | const mockSearchResponse: SearchResponse = { 172 | success: true, 173 | data: [ 174 | { 175 | url: 'https://example.com', 176 | title: 'Test Page', 177 | description: 'Test Description', 178 | markdown: '# Test Content', 179 | actions: undefined as never, 180 | }, 181 | ], 182 | }; 183 | 184 | mockClient.search.mockResolvedValueOnce(mockSearchResponse); 185 | 186 | const response = await requestHandler({ 187 | method: 'call_tool', 188 | params: { 189 | name: 'firecrawl_search', 190 | arguments: { query, scrapeOptions }, 191 | }, 192 | }); 193 | 194 | expect(response.isError).toBe(false); 195 | expect(response.content[0].text).toContain('Test Page'); 196 | expect(mockClient.search).toHaveBeenCalledWith(query, scrapeOptions); 197 | }); 198 | 199 | // Test crawl functionality 200 | test('should handle crawl request', async () => { 201 | const url = 'https://example.com'; 202 | const options = { maxDepth: 2 }; 203 | 204 | mockClient.asyncCrawlUrl.mockResolvedValueOnce({ 205 | success: true, 206 | id: 'test-crawl-id', 207 | }); 208 | 209 | const response = await requestHandler({ 210 | method: 'call_tool', 211 | params: { 212 | name: 'firecrawl_crawl', 213 | arguments: { url, ...options }, 214 | }, 215 | }); 216 | 217 | expect(response.isError).toBe(false); 218 | expect(response.content[0].text).toContain('test-crawl-id'); 219 | expect(mockClient.asyncCrawlUrl).toHaveBeenCalledWith(url, { 220 | maxDepth: 2, 221 | url, 222 | }); 223 | }); 224 | 225 | // Test error handling 226 | test('should handle API errors', async () => { 227 | const url = 'https://example.com'; 228 | 229 | mockClient.scrapeUrl.mockRejectedValueOnce(new Error('API Error')); 230 | 231 | const response = await requestHandler({ 232 | method: 'call_tool', 233 | params: { 234 | name: 'firecrawl_scrape', 235 | arguments: { url }, 236 | }, 237 | }); 238 | 239 | expect(response.isError).toBe(true); 240 | expect(response.content[0].text).toContain('API Error'); 241 | }); 242 | 243 | // Test rate limiting 244 | test('should handle rate limits', async () => { 245 | const url = 'https://example.com'; 246 | 247 | // Mock rate limit error 248 | mockClient.scrapeUrl.mockRejectedValueOnce( 249 | new Error('rate limit exceeded') 250 | ); 251 | 
252 | const response = await requestHandler({ 253 | method: 'call_tool', 254 | params: { 255 | name: 'firecrawl_scrape', 256 | arguments: { url }, 257 | }, 258 | }); 259 | 260 | expect(response.isError).toBe(true); 261 | expect(response.content[0].text).toContain('rate limit exceeded'); 262 | }); 263 | }); 264 | 265 | // Helper function to simulate request handling 266 | async function handleRequest( 267 | name: string, 268 | args: any, 269 | client: MockFirecrawlClient 270 | ) { 271 | try { 272 | switch (name) { 273 | case 'firecrawl_scrape': { 274 | const response = await client.scrapeUrl(args.url, args); 275 | if (!response.success) { 276 | throw new Error(response.error || 'Scraping failed'); 277 | } 278 | return { 279 | content: [ 280 | { type: 'text', text: response.markdown || 'No content available' }, 281 | ], 282 | isError: false, 283 | }; 284 | } 285 | 286 | case 'firecrawl_batch_scrape': { 287 | const response = await client.asyncBatchScrapeUrls( 288 | args.urls, 289 | args.options 290 | ); 291 | return { 292 | content: [ 293 | { 294 | type: 'text', 295 | text: `Batch operation queued with ID: batch_1. Use firecrawl_check_batch_status to check progress.`, 296 | }, 297 | ], 298 | isError: false, 299 | }; 300 | } 301 | 302 | case 'firecrawl_search': { 303 | const response = await client.search(args.query, args.scrapeOptions); 304 | if (!response.success) { 305 | throw new Error(response.error || 'Search failed'); 306 | } 307 | const results = response.data 308 | .map( 309 | (result) => 310 | `URL: ${result.url}\nTitle: ${ 311 | result.title || 'No title' 312 | }\nDescription: ${result.description || 'No description'}\n${ 313 | result.markdown ? `\nContent:\n${result.markdown}` : '' 314 | }` 315 | ) 316 | .join('\n\n'); 317 | return { 318 | content: [{ type: 'text', text: results }], 319 | isError: false, 320 | }; 321 | } 322 | 323 | case 'firecrawl_crawl': { 324 | const response = await client.asyncCrawlUrl(args.url, args); 325 | if (!response.success) { 326 | throw new Error(response.error); 327 | } 328 | return { 329 | content: [ 330 | { 331 | type: 'text', 332 | text: `Started crawl for ${args.url} with job ID: ${response.id}`, 333 | }, 334 | ], 335 | isError: false, 336 | }; 337 | } 338 | 339 | default: 340 | throw new Error(`Unknown tool: ${name}`); 341 | } 342 | } catch (error) { 343 | return { 344 | content: [ 345 | { 346 | type: 'text', 347 | text: error instanceof Error ? 
error.message : String(error), 348 | }, 349 | ], 350 | isError: true, 351 | }; 352 | } 353 | } 354 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { Server } from '@modelcontextprotocol/sdk/server/index.js'; 4 | import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; 5 | import { SSEServerTransport } from '@modelcontextprotocol/sdk/server/sse.js'; 6 | import { 7 | Tool, 8 | CallToolRequestSchema, 9 | ListToolsRequestSchema, 10 | } from '@modelcontextprotocol/sdk/types.js'; 11 | import FirecrawlApp, { 12 | type ScrapeParams, 13 | type MapParams, 14 | type CrawlParams, 15 | type FirecrawlDocument, 16 | } from '@mendable/firecrawl-js'; 17 | 18 | import express, { Request, Response } from 'express'; 19 | import dotenv from 'dotenv'; 20 | 21 | dotenv.config(); 22 | 23 | // Tool definitions 24 | const SCRAPE_TOOL: Tool = { 25 | name: 'firecrawl_scrape', 26 | description: ` 27 | Scrape content from a single URL with advanced options. 28 | 29 | **Best for:** Single page content extraction, when you know exactly which page contains the information. 30 | **Not recommended for:** Multiple pages (use batch_scrape), unknown page (use search), structured data (use extract). 31 | **Common mistakes:** Using scrape for a list of URLs (use batch_scrape instead). 32 | **Prompt Example:** "Get the content of the page at https://example.com." 33 | **Usage Example:** 34 | \`\`\`json 35 | { 36 | "name": "firecrawl_scrape", 37 | "arguments": { 38 | "url": "https://example.com", 39 | "formats": ["markdown"] 40 | } 41 | } 42 | \`\`\` 43 | **Returns:** Markdown, HTML, or other formats as specified. 
44 | `, 45 | inputSchema: { 46 | type: 'object', 47 | properties: { 48 | url: { 49 | type: 'string', 50 | description: 'The URL to scrape', 51 | }, 52 | formats: { 53 | type: 'array', 54 | items: { 55 | type: 'string', 56 | enum: [ 57 | 'markdown', 58 | 'html', 59 | 'rawHtml', 60 | 'screenshot', 61 | 'links', 62 | 'screenshot@fullPage', 63 | 'extract', 64 | ], 65 | }, 66 | default: ['markdown'], 67 | description: "Content formats to extract (default: ['markdown'])", 68 | }, 69 | onlyMainContent: { 70 | type: 'boolean', 71 | description: 72 | 'Extract only the main content, filtering out navigation, footers, etc.', 73 | }, 74 | includeTags: { 75 | type: 'array', 76 | items: { type: 'string' }, 77 | description: 'HTML tags to specifically include in extraction', 78 | }, 79 | excludeTags: { 80 | type: 'array', 81 | items: { type: 'string' }, 82 | description: 'HTML tags to exclude from extraction', 83 | }, 84 | waitFor: { 85 | type: 'number', 86 | description: 'Time in milliseconds to wait for dynamic content to load', 87 | }, 88 | timeout: { 89 | type: 'number', 90 | description: 91 | 'Maximum time in milliseconds to wait for the page to load', 92 | }, 93 | actions: { 94 | type: 'array', 95 | items: { 96 | type: 'object', 97 | properties: { 98 | type: { 99 | type: 'string', 100 | enum: [ 101 | 'wait', 102 | 'click', 103 | 'screenshot', 104 | 'write', 105 | 'press', 106 | 'scroll', 107 | 'scrape', 108 | 'executeJavascript', 109 | ], 110 | description: 'Type of action to perform', 111 | }, 112 | selector: { 113 | type: 'string', 114 | description: 'CSS selector for the target element', 115 | }, 116 | milliseconds: { 117 | type: 'number', 118 | description: 'Time to wait in milliseconds (for wait action)', 119 | }, 120 | text: { 121 | type: 'string', 122 | description: 'Text to write (for write action)', 123 | }, 124 | key: { 125 | type: 'string', 126 | description: 'Key to press (for press action)', 127 | }, 128 | direction: { 129 | type: 'string', 130 | enum: ['up', 'down'], 131 | description: 'Scroll direction', 132 | }, 133 | script: { 134 | type: 'string', 135 | description: 'JavaScript code to execute', 136 | }, 137 | fullPage: { 138 | type: 'boolean', 139 | description: 'Take full page screenshot', 140 | }, 141 | }, 142 | required: ['type'], 143 | }, 144 | description: 'List of actions to perform before scraping', 145 | }, 146 | extract: { 147 | type: 'object', 148 | properties: { 149 | schema: { 150 | type: 'object', 151 | description: 'Schema for structured data extraction', 152 | }, 153 | systemPrompt: { 154 | type: 'string', 155 | description: 'System prompt for LLM extraction', 156 | }, 157 | prompt: { 158 | type: 'string', 159 | description: 'User prompt for LLM extraction', 160 | }, 161 | }, 162 | description: 'Configuration for structured data extraction', 163 | }, 164 | mobile: { 165 | type: 'boolean', 166 | description: 'Use mobile viewport', 167 | }, 168 | skipTlsVerification: { 169 | type: 'boolean', 170 | description: 'Skip TLS certificate verification', 171 | }, 172 | removeBase64Images: { 173 | type: 'boolean', 174 | description: 'Remove base64 encoded images from output', 175 | }, 176 | location: { 177 | type: 'object', 178 | properties: { 179 | country: { 180 | type: 'string', 181 | description: 'Country code for geolocation', 182 | }, 183 | languages: { 184 | type: 'array', 185 | items: { type: 'string' }, 186 | description: 'Language codes for content', 187 | }, 188 | }, 189 | description: 'Location settings for scraping', 190 | }, 191 | }, 192 | required: ['url'], 193 
| }, 194 | }; 195 | 196 | const MAP_TOOL: Tool = { 197 | name: 'firecrawl_map', 198 | description: ` 199 | Map a website to discover all indexed URLs on the site. 200 | 201 | **Best for:** Discovering URLs on a website before deciding what to scrape; finding specific sections of a website. 202 | **Not recommended for:** When you already know which specific URL you need (use scrape or batch_scrape); when you need the content of the pages (use scrape after mapping). 203 | **Common mistakes:** Using crawl to discover URLs instead of map. 204 | **Prompt Example:** "List all URLs on example.com." 205 | **Usage Example:** 206 | \`\`\`json 207 | { 208 | "name": "firecrawl_map", 209 | "arguments": { 210 | "url": "https://example.com" 211 | } 212 | } 213 | \`\`\` 214 | **Returns:** Array of URLs found on the site. 215 | `, 216 | inputSchema: { 217 | type: 'object', 218 | properties: { 219 | url: { 220 | type: 'string', 221 | description: 'Starting URL for URL discovery', 222 | }, 223 | search: { 224 | type: 'string', 225 | description: 'Optional search term to filter URLs', 226 | }, 227 | ignoreSitemap: { 228 | type: 'boolean', 229 | description: 'Skip sitemap.xml discovery and only use HTML links', 230 | }, 231 | sitemapOnly: { 232 | type: 'boolean', 233 | description: 'Only use sitemap.xml for discovery, ignore HTML links', 234 | }, 235 | includeSubdomains: { 236 | type: 'boolean', 237 | description: 'Include URLs from subdomains in results', 238 | }, 239 | limit: { 240 | type: 'number', 241 | description: 'Maximum number of URLs to return', 242 | }, 243 | }, 244 | required: ['url'], 245 | }, 246 | }; 247 | 248 | const CRAWL_TOOL: Tool = { 249 | name: 'firecrawl_crawl', 250 | description: ` 251 | Starts an asynchronous crawl job on a website and extracts content from all pages. 252 | 253 | **Best for:** Extracting content from multiple related pages, when you need comprehensive coverage. 254 | **Not recommended for:** Extracting content from a single page (use scrape); when token limits are a concern (use map + batch_scrape); when you need fast results (crawling can be slow). 255 | **Warning:** Crawl responses can be very large and may exceed token limits. Limit the crawl depth and number of pages, or use map + batch_scrape for better control. 256 | **Common mistakes:** Setting limit or maxDepth too high (causes token overflow); using crawl for a single page (use scrape instead). 257 | **Prompt Example:** "Get all blog posts from the first two levels of example.com/blog." 258 | **Usage Example:** 259 | \`\`\`json 260 | { 261 | "name": "firecrawl_crawl", 262 | "arguments": { 263 | "url": "https://example.com/blog/*", 264 | "maxDepth": 2, 265 | "limit": 100, 266 | "allowExternalLinks": false, 267 | "deduplicateSimilarURLs": true 268 | } 269 | } 270 | \`\`\` 271 | **Returns:** Operation ID for status checking; use firecrawl_check_crawl_status to check progress. 
272 | `, 273 | inputSchema: { 274 | type: 'object', 275 | properties: { 276 | url: { 277 | type: 'string', 278 | description: 'Starting URL for the crawl', 279 | }, 280 | excludePaths: { 281 | type: 'array', 282 | items: { type: 'string' }, 283 | description: 'URL paths to exclude from crawling', 284 | }, 285 | includePaths: { 286 | type: 'array', 287 | items: { type: 'string' }, 288 | description: 'Only crawl these URL paths', 289 | }, 290 | maxDepth: { 291 | type: 'number', 292 | description: 'Maximum link depth to crawl', 293 | }, 294 | ignoreSitemap: { 295 | type: 'boolean', 296 | description: 'Skip sitemap.xml discovery', 297 | }, 298 | limit: { 299 | type: 'number', 300 | description: 'Maximum number of pages to crawl', 301 | }, 302 | allowBackwardLinks: { 303 | type: 'boolean', 304 | description: 'Allow crawling links that point to parent directories', 305 | }, 306 | allowExternalLinks: { 307 | type: 'boolean', 308 | description: 'Allow crawling links to external domains', 309 | }, 310 | webhook: { 311 | oneOf: [ 312 | { 313 | type: 'string', 314 | description: 'Webhook URL to notify when crawl is complete', 315 | }, 316 | { 317 | type: 'object', 318 | properties: { 319 | url: { 320 | type: 'string', 321 | description: 'Webhook URL', 322 | }, 323 | headers: { 324 | type: 'object', 325 | description: 'Custom headers for webhook requests', 326 | }, 327 | }, 328 | required: ['url'], 329 | }, 330 | ], 331 | }, 332 | deduplicateSimilarURLs: { 333 | type: 'boolean', 334 | description: 'Remove similar URLs during crawl', 335 | }, 336 | ignoreQueryParameters: { 337 | type: 'boolean', 338 | description: 'Ignore query parameters when comparing URLs', 339 | }, 340 | scrapeOptions: { 341 | type: 'object', 342 | properties: { 343 | formats: { 344 | type: 'array', 345 | items: { 346 | type: 'string', 347 | enum: [ 348 | 'markdown', 349 | 'html', 350 | 'rawHtml', 351 | 'screenshot', 352 | 'links', 353 | 'screenshot@fullPage', 354 | 'extract', 355 | ], 356 | }, 357 | }, 358 | onlyMainContent: { 359 | type: 'boolean', 360 | }, 361 | includeTags: { 362 | type: 'array', 363 | items: { type: 'string' }, 364 | }, 365 | excludeTags: { 366 | type: 'array', 367 | items: { type: 'string' }, 368 | }, 369 | waitFor: { 370 | type: 'number', 371 | }, 372 | }, 373 | description: 'Options for scraping each page', 374 | }, 375 | }, 376 | required: ['url'], 377 | }, 378 | }; 379 | 380 | const CHECK_CRAWL_STATUS_TOOL: Tool = { 381 | name: 'firecrawl_check_crawl_status', 382 | description: ` 383 | Check the status of a crawl job. 384 | 385 | **Usage Example:** 386 | \`\`\`json 387 | { 388 | "name": "firecrawl_check_crawl_status", 389 | "arguments": { 390 | "id": "550e8400-e29b-41d4-a716-446655440000" 391 | } 392 | } 393 | \`\`\` 394 | **Returns:** Status and progress of the crawl job, including results if available. 395 | `, 396 | inputSchema: { 397 | type: 'object', 398 | properties: { 399 | id: { 400 | type: 'string', 401 | description: 'Crawl job ID to check', 402 | }, 403 | }, 404 | required: ['id'], 405 | }, 406 | }; 407 | 408 | const SEARCH_TOOL: Tool = { 409 | name: 'firecrawl_search', 410 | description: ` 411 | Search the web and optionally extract content from search results. 412 | 413 | **Best for:** Finding specific information across multiple websites, when you don't know which website has the information; when you need the most relevant content for a query. 
414 | **Not recommended for:** When you already know which website to scrape (use scrape); when you need comprehensive coverage of a single website (use map or crawl). 415 | **Common mistakes:** Using crawl or map for open-ended questions (use search instead). 416 | **Prompt Example:** "Find the latest research papers on AI published in 2023." 417 | **Usage Example:** 418 | \`\`\`json 419 | { 420 | "name": "firecrawl_search", 421 | "arguments": { 422 | "query": "latest AI research papers 2023", 423 | "limit": 5, 424 | "lang": "en", 425 | "country": "us", 426 | "scrapeOptions": { 427 | "formats": ["markdown"], 428 | "onlyMainContent": true 429 | } 430 | } 431 | } 432 | \`\`\` 433 | **Returns:** Array of search results (with optional scraped content). 434 | `, 435 | inputSchema: { 436 | type: 'object', 437 | properties: { 438 | query: { 439 | type: 'string', 440 | description: 'Search query string', 441 | }, 442 | limit: { 443 | type: 'number', 444 | description: 'Maximum number of results to return (default: 5)', 445 | }, 446 | lang: { 447 | type: 'string', 448 | description: 'Language code for search results (default: en)', 449 | }, 450 | country: { 451 | type: 'string', 452 | description: 'Country code for search results (default: us)', 453 | }, 454 | tbs: { 455 | type: 'string', 456 | description: 'Time-based search filter', 457 | }, 458 | filter: { 459 | type: 'string', 460 | description: 'Search filter', 461 | }, 462 | location: { 463 | type: 'object', 464 | properties: { 465 | country: { 466 | type: 'string', 467 | description: 'Country code for geolocation', 468 | }, 469 | languages: { 470 | type: 'array', 471 | items: { type: 'string' }, 472 | description: 'Language codes for content', 473 | }, 474 | }, 475 | description: 'Location settings for search', 476 | }, 477 | scrapeOptions: { 478 | type: 'object', 479 | properties: { 480 | formats: { 481 | type: 'array', 482 | items: { 483 | type: 'string', 484 | enum: ['markdown', 'html', 'rawHtml'], 485 | }, 486 | description: 'Content formats to extract from search results', 487 | }, 488 | onlyMainContent: { 489 | type: 'boolean', 490 | description: 'Extract only the main content from results', 491 | }, 492 | waitFor: { 493 | type: 'number', 494 | description: 'Time in milliseconds to wait for dynamic content', 495 | }, 496 | }, 497 | description: 'Options for scraping search results', 498 | }, 499 | }, 500 | required: ['query'], 501 | }, 502 | }; 503 | 504 | const EXTRACT_TOOL: Tool = { 505 | name: 'firecrawl_extract', 506 | description: ` 507 | Extract structured information from web pages using LLM capabilities. Supports both cloud AI and self-hosted LLM extraction. 508 | 509 | **Best for:** Extracting specific structured data like prices, names, details. 510 | **Not recommended for:** When you need the full content of a page (use scrape); when you're not looking for specific structured data. 511 | **Arguments:** 512 | - urls: Array of URLs to extract information from 513 | - prompt: Custom prompt for the LLM extraction 514 | - systemPrompt: System prompt to guide the LLM 515 | - schema: JSON schema for structured data extraction 516 | - allowExternalLinks: Allow extraction from external links 517 | - enableWebSearch: Enable web search for additional context 518 | - includeSubdomains: Include subdomains in extraction 519 | **Prompt Example:** "Extract the product name, price, and description from these product pages." 
520 | **Usage Example:** 521 | \`\`\`json 522 | { 523 | "name": "firecrawl_extract", 524 | "arguments": { 525 | "urls": ["https://example.com/page1", "https://example.com/page2"], 526 | "prompt": "Extract product information including name, price, and description", 527 | "systemPrompt": "You are a helpful assistant that extracts product information", 528 | "schema": { 529 | "type": "object", 530 | "properties": { 531 | "name": { "type": "string" }, 532 | "price": { "type": "number" }, 533 | "description": { "type": "string" } 534 | }, 535 | "required": ["name", "price"] 536 | }, 537 | "allowExternalLinks": false, 538 | "enableWebSearch": false, 539 | "includeSubdomains": false 540 | } 541 | } 542 | \`\`\` 543 | **Returns:** Extracted structured data as defined by your schema. 544 | `, 545 | inputSchema: { 546 | type: 'object', 547 | properties: { 548 | urls: { 549 | type: 'array', 550 | items: { type: 'string' }, 551 | description: 'List of URLs to extract information from', 552 | }, 553 | prompt: { 554 | type: 'string', 555 | description: 'Prompt for the LLM extraction', 556 | }, 557 | systemPrompt: { 558 | type: 'string', 559 | description: 'System prompt for LLM extraction', 560 | }, 561 | schema: { 562 | type: 'object', 563 | description: 'JSON schema for structured data extraction', 564 | }, 565 | allowExternalLinks: { 566 | type: 'boolean', 567 | description: 'Allow extraction from external links', 568 | }, 569 | enableWebSearch: { 570 | type: 'boolean', 571 | description: 'Enable web search for additional context', 572 | }, 573 | includeSubdomains: { 574 | type: 'boolean', 575 | description: 'Include subdomains in extraction', 576 | }, 577 | }, 578 | required: ['urls'], 579 | }, 580 | }; 581 | 582 | const DEEP_RESEARCH_TOOL: Tool = { 583 | name: 'firecrawl_deep_research', 584 | description: ` 585 | Conduct deep web research on a query using intelligent crawling, search, and LLM analysis. 586 | 587 | **Best for:** Complex research questions requiring multiple sources, in-depth analysis. 588 | **Not recommended for:** Simple questions that can be answered with a single search; when you need very specific information from a known page (use scrape); when you need results quickly (deep research can take time). 589 | **Arguments:** 590 | - query (string, required): The research question or topic to explore. 591 | - maxDepth (number, optional): Maximum recursive depth for crawling/search (default: 3). 592 | - timeLimit (number, optional): Time limit in seconds for the research session (default: 120). 593 | - maxUrls (number, optional): Maximum number of URLs to analyze (default: 50). 594 | **Prompt Example:** "Research the environmental impact of electric vehicles versus gasoline vehicles." 595 | **Usage Example:** 596 | \`\`\`json 597 | { 598 | "name": "firecrawl_deep_research", 599 | "arguments": { 600 | "query": "What are the environmental impacts of electric vehicles compared to gasoline vehicles?", 601 | "maxDepth": 3, 602 | "timeLimit": 120, 603 | "maxUrls": 50 604 | } 605 | } 606 | \`\`\` 607 | **Returns:** Final analysis generated by an LLM based on research. (data.finalAnalysis); may also include structured activities and sources used in the research process. 
608 | `, 609 | inputSchema: { 610 | type: 'object', 611 | properties: { 612 | query: { 613 | type: 'string', 614 | description: 'The query to research', 615 | }, 616 | maxDepth: { 617 | type: 'number', 618 | description: 'Maximum depth of research iterations (1-10)', 619 | }, 620 | timeLimit: { 621 | type: 'number', 622 | description: 'Time limit in seconds (30-300)', 623 | }, 624 | maxUrls: { 625 | type: 'number', 626 | description: 'Maximum number of URLs to analyze (1-1000)', 627 | }, 628 | }, 629 | required: ['query'], 630 | }, 631 | }; 632 | 633 | const GENERATE_LLMSTXT_TOOL: Tool = { 634 | name: 'firecrawl_generate_llmstxt', 635 | description: ` 636 | Generate a standardized llms.txt (and optionally llms-full.txt) file for a given domain. This file defines how large language models should interact with the site. 637 | 638 | **Best for:** Creating machine-readable permission guidelines for AI models. 639 | **Not recommended for:** General content extraction or research. 640 | **Arguments:** 641 | - url (string, required): The base URL of the website to analyze. 642 | - maxUrls (number, optional): Max number of URLs to include (default: 10). 643 | - showFullText (boolean, optional): Whether to include llms-full.txt contents in the response. 644 | **Prompt Example:** "Generate an LLMs.txt file for example.com." 645 | **Usage Example:** 646 | \`\`\`json 647 | { 648 | "name": "firecrawl_generate_llmstxt", 649 | "arguments": { 650 | "url": "https://example.com", 651 | "maxUrls": 20, 652 | "showFullText": true 653 | } 654 | } 655 | \`\`\` 656 | **Returns:** LLMs.txt file contents (and optionally llms-full.txt). 657 | `, 658 | inputSchema: { 659 | type: 'object', 660 | properties: { 661 | url: { 662 | type: 'string', 663 | description: 'The URL to generate LLMs.txt from', 664 | }, 665 | maxUrls: { 666 | type: 'number', 667 | description: 'Maximum number of URLs to process (1-100, default: 10)', 668 | }, 669 | showFullText: { 670 | type: 'boolean', 671 | description: 'Whether to show the full LLMs-full.txt in the response', 672 | }, 673 | }, 674 | required: ['url'], 675 | }, 676 | }; 677 | 678 | /** 679 | * Parameters for LLMs.txt generation operations. 680 | */ 681 | interface GenerateLLMsTextParams { 682 | /** 683 | * Maximum number of URLs to process (1-100) 684 | * @default 10 685 | */ 686 | maxUrls?: number; 687 | /** 688 | * Whether to show the full LLMs-full.txt in the response 689 | * @default false 690 | */ 691 | showFullText?: boolean; 692 | /** 693 | * Experimental flag for streaming 694 | */ 695 | __experimental_stream?: boolean; 696 | } 697 | 698 | /** 699 | * Response interface for LLMs.txt generation operations. 700 | */ 701 | // interface GenerateLLMsTextResponse { 702 | // success: boolean; 703 | // id: string; 704 | // } 705 | 706 | /** 707 | * Status response interface for LLMs.txt generation operations. 
708 | */ 709 | // interface GenerateLLMsTextStatusResponse { 710 | // success: boolean; 711 | // data: { 712 | // llmstxt: string; 713 | // llmsfulltxt?: string; 714 | // }; 715 | // status: 'processing' | 'completed' | 'failed'; 716 | // error?: string; 717 | // expiresAt: string; 718 | // } 719 | 720 | interface StatusCheckOptions { 721 | id: string; 722 | } 723 | 724 | interface SearchOptions { 725 | query: string; 726 | limit?: number; 727 | lang?: string; 728 | country?: string; 729 | tbs?: string; 730 | filter?: string; 731 | location?: { 732 | country?: string; 733 | languages?: string[]; 734 | }; 735 | scrapeOptions?: { 736 | formats?: string[]; 737 | onlyMainContent?: boolean; 738 | waitFor?: number; 739 | includeTags?: string[]; 740 | excludeTags?: string[]; 741 | timeout?: number; 742 | }; 743 | } 744 | 745 | // Add after other interfaces 746 | interface ExtractParams<T = any> { 747 | prompt?: string; 748 | systemPrompt?: string; 749 | schema?: T | object; 750 | allowExternalLinks?: boolean; 751 | enableWebSearch?: boolean; 752 | includeSubdomains?: boolean; 753 | origin?: string; 754 | } 755 | 756 | interface ExtractArgs { 757 | urls: string[]; 758 | prompt?: string; 759 | systemPrompt?: string; 760 | schema?: object; 761 | allowExternalLinks?: boolean; 762 | enableWebSearch?: boolean; 763 | includeSubdomains?: boolean; 764 | origin?: string; 765 | } 766 | 767 | interface ExtractResponse<T = any> { 768 | success: boolean; 769 | data: T; 770 | error?: string; 771 | warning?: string; 772 | creditsUsed?: number; 773 | } 774 | 775 | // Type guards 776 | function isScrapeOptions( 777 | args: unknown 778 | ): args is ScrapeParams & { url: string } { 779 | return ( 780 | typeof args === 'object' && 781 | args !== null && 782 | 'url' in args && 783 | typeof (args as { url: unknown }).url === 'string' 784 | ); 785 | } 786 | 787 | function isMapOptions(args: unknown): args is MapParams & { url: string } { 788 | return ( 789 | typeof args === 'object' && 790 | args !== null && 791 | 'url' in args && 792 | typeof (args as { url: unknown }).url === 'string' 793 | ); 794 | } 795 | 796 | function isCrawlOptions(args: unknown): args is CrawlParams & { url: string } { 797 | return ( 798 | typeof args === 'object' && 799 | args !== null && 800 | 'url' in args && 801 | typeof (args as { url: unknown }).url === 'string' 802 | ); 803 | } 804 | 805 | function isStatusCheckOptions(args: unknown): args is StatusCheckOptions { 806 | return ( 807 | typeof args === 'object' && 808 | args !== null && 809 | 'id' in args && 810 | typeof (args as { id: unknown }).id === 'string' 811 | ); 812 | } 813 | 814 | function isSearchOptions(args: unknown): args is SearchOptions { 815 | return ( 816 | typeof args === 'object' && 817 | args !== null && 818 | 'query' in args && 819 | typeof (args as { query: unknown }).query === 'string' 820 | ); 821 | } 822 | 823 | function isExtractOptions(args: unknown): args is ExtractArgs { 824 | if (typeof args !== 'object' || args === null) return false; 825 | const { urls } = args as { urls?: unknown }; 826 | return ( 827 | Array.isArray(urls) && 828 | urls.every((url): url is string => typeof url === 'string') 829 | ); 830 | } 831 | 832 | function isGenerateLLMsTextOptions( 833 | args: unknown 834 | ): args is { url: string } & Partial<GenerateLLMsTextParams> { 835 | return ( 836 | typeof args === 'object' && 837 | args !== null && 838 | 'url' in args && 839 | typeof (args as { url: unknown }).url === 'string' 840 | ); 841 | } 842 | 843 | // Server implementation 844 | const server = new Server( 845 | { 846 | name: 
'firecrawl-mcp', 847 | version: '1.7.0', 848 | }, 849 | { 850 | capabilities: { 851 | tools: {}, 852 | logging: {}, 853 | }, 854 | } 855 | ); 856 | 857 | // Get optional API URL 858 | const FIRECRAWL_API_URL = process.env.FIRECRAWL_API_URL; 859 | const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY; 860 | 861 | // Check if API key is required (only for cloud service) 862 | if ( 863 | process.env.CLOUD_SERVICE !== 'true' && 864 | !FIRECRAWL_API_URL && 865 | !FIRECRAWL_API_KEY 866 | ) { 867 | console.error( 868 | 'Error: FIRECRAWL_API_KEY environment variable is required when using the cloud service' 869 | ); 870 | process.exit(1); 871 | } 872 | 873 | // Initialize Firecrawl client with optional API URL 874 | 875 | // Configuration for retries and monitoring 876 | const CONFIG = { 877 | retry: { 878 | maxAttempts: Number(process.env.FIRECRAWL_RETRY_MAX_ATTEMPTS) || 3, 879 | initialDelay: Number(process.env.FIRECRAWL_RETRY_INITIAL_DELAY) || 1000, 880 | maxDelay: Number(process.env.FIRECRAWL_RETRY_MAX_DELAY) || 10000, 881 | backoffFactor: Number(process.env.FIRECRAWL_RETRY_BACKOFF_FACTOR) || 2, 882 | }, 883 | credit: { 884 | warningThreshold: 885 | Number(process.env.FIRECRAWL_CREDIT_WARNING_THRESHOLD) || 1000, 886 | criticalThreshold: 887 | Number(process.env.FIRECRAWL_CREDIT_CRITICAL_THRESHOLD) || 100, 888 | }, 889 | }; 890 | 891 | // Add utility function for delay 892 | function delay(ms: number): Promise<void> { 893 | return new Promise((resolve) => setTimeout(resolve, ms)); 894 | } 895 | 896 | let isStdioTransport = false; 897 | 898 | function safeLog( 899 | level: 900 | | 'error' 901 | | 'debug' 902 | | 'info' 903 | | 'notice' 904 | | 'warning' 905 | | 'critical' 906 | | 'alert' 907 | | 'emergency', 908 | data: any 909 | ): void { 910 | if (isStdioTransport) { 911 | // For stdio transport, log to stderr to avoid protocol interference 912 | console.error( 913 | `[${level}] ${typeof data === 'object' ? JSON.stringify(data) : data}` 914 | ); 915 | } else { 916 | // For other transport types, use the normal logging mechanism 917 | server.sendLoggingMessage({ level, data }); 918 | } 919 | } 920 | 921 | // Add retry logic with exponential backoff 922 | async function withRetry<T>( 923 | operation: () => Promise<T>, 924 | context: string, 925 | attempt = 1 926 | ): Promise<T> { 927 | try { 928 | return await operation(); 929 | } catch (error) { 930 | const isRateLimit = 931 | error instanceof Error && 932 | (error.message.includes('rate limit') || error.message.includes('429')); 933 | 934 | if (isRateLimit && attempt < CONFIG.retry.maxAttempts) { 935 | const delayMs = Math.min( 936 | CONFIG.retry.initialDelay * 937 | Math.pow(CONFIG.retry.backoffFactor, attempt - 1), 938 | CONFIG.retry.maxDelay 939 | ); 940 | 941 | safeLog( 942 | 'warning', 943 | `Rate limit hit for ${context}. Attempt ${attempt}/${CONFIG.retry.maxAttempts}. 
Retrying in ${delayMs}ms` 944 | ); 945 | 946 | await delay(delayMs); 947 | return withRetry(operation, context, attempt + 1); 948 | } 949 | 950 | throw error; 951 | } 952 | } 953 | 954 | // Tool handlers 955 | server.setRequestHandler(ListToolsRequestSchema, async () => ({ 956 | tools: [ 957 | SCRAPE_TOOL, 958 | MAP_TOOL, 959 | CRAWL_TOOL, 960 | CHECK_CRAWL_STATUS_TOOL, 961 | SEARCH_TOOL, 962 | EXTRACT_TOOL, 963 | DEEP_RESEARCH_TOOL, 964 | GENERATE_LLMSTXT_TOOL, 965 | ], 966 | })); 967 | 968 | server.setRequestHandler(CallToolRequestSchema, async (request) => { 969 | const startTime = Date.now(); 970 | try { 971 | const { name, arguments: args } = request.params; 972 | 973 | const apiKey = process.env.CLOUD_SERVICE 974 | ? (request.params._meta?.apiKey as string) 975 | : FIRECRAWL_API_KEY; 976 | if (process.env.CLOUD_SERVICE && !apiKey) { 977 | throw new Error('No API key provided'); 978 | } 979 | 980 | const client = new FirecrawlApp({ 981 | apiKey, 982 | ...(FIRECRAWL_API_URL ? { apiUrl: FIRECRAWL_API_URL } : {}), 983 | }); 984 | // Log incoming request with timestamp 985 | safeLog( 986 | 'info', 987 | `[${new Date().toISOString()}] Received request for tool: ${name}` 988 | ); 989 | 990 | if (!args) { 991 | throw new Error('No arguments provided'); 992 | } 993 | 994 | switch (name) { 995 | case 'firecrawl_scrape': { 996 | if (!isScrapeOptions(args)) { 997 | throw new Error('Invalid arguments for firecrawl_scrape'); 998 | } 999 | const { url, ...options } = args; 1000 | try { 1001 | const scrapeStartTime = Date.now(); 1002 | safeLog( 1003 | 'info', 1004 | `Starting scrape for URL: ${url} with options: ${JSON.stringify(options)}` 1005 | ); 1006 | 1007 | const response = await client.scrapeUrl(url, { 1008 | ...options, 1009 | // @ts-expect-error Extended API options including origin 1010 | origin: 'mcp-server', 1011 | }); 1012 | 1013 | // Log performance metrics 1014 | safeLog( 1015 | 'info', 1016 | `Scrape completed in ${Date.now() - scrapeStartTime}ms` 1017 | ); 1018 | 1019 | if ('success' in response && !response.success) { 1020 | throw new Error(response.error || 'Scraping failed'); 1021 | } 1022 | 1023 | // Format content based on requested formats 1024 | const contentParts = []; 1025 | 1026 | if (options.formats?.includes('markdown') && response.markdown) { 1027 | contentParts.push(response.markdown); 1028 | } 1029 | if (options.formats?.includes('html') && response.html) { 1030 | contentParts.push(response.html); 1031 | } 1032 | if (options.formats?.includes('rawHtml') && response.rawHtml) { 1033 | contentParts.push(response.rawHtml); 1034 | } 1035 | if (options.formats?.includes('links') && response.links) { 1036 | contentParts.push(response.links.join('\n')); 1037 | } 1038 | if (options.formats?.includes('screenshot') && response.screenshot) { 1039 | contentParts.push(response.screenshot); 1040 | } 1041 | if (options.formats?.includes('extract') && response.extract) { 1042 | contentParts.push(JSON.stringify(response.extract, null, 2)); 1043 | } 1044 | 1045 | // If options.formats is empty, default to markdown 1046 | if (!options.formats || options.formats.length === 0) { 1047 | options.formats = ['markdown']; 1048 | } 1049 | 1050 | // Add warning to response if present 1051 | if (response.warning) { 1052 | safeLog('warning', response.warning); 1053 | } 1054 | 1055 | return { 1056 | content: [ 1057 | { 1058 | type: 'text', 1059 | text: trimResponseText( 1060 | contentParts.join('\n\n') || 'No content available' 1061 | ), 1062 | }, 1063 | ], 1064 | isError: false, 1065 | }; 1066 | } 
catch (error) { 1067 | const errorMessage = 1068 | error instanceof Error ? error.message : String(error); 1069 | return { 1070 | content: [{ type: 'text', text: trimResponseText(errorMessage) }], 1071 | isError: true, 1072 | }; 1073 | } 1074 | } 1075 | 1076 | case 'firecrawl_map': { 1077 | if (!isMapOptions(args)) { 1078 | throw new Error('Invalid arguments for firecrawl_map'); 1079 | } 1080 | const { url, ...options } = args; 1081 | const response = await client.mapUrl(url, { 1082 | ...options, 1083 | // @ts-expect-error Extended API options including origin 1084 | origin: 'mcp-server', 1085 | }); 1086 | if ('error' in response) { 1087 | throw new Error(response.error); 1088 | } 1089 | if (!response.links) { 1090 | throw new Error('No links received from Firecrawl API'); 1091 | } 1092 | return { 1093 | content: [ 1094 | { type: 'text', text: trimResponseText(response.links.join('\n')) }, 1095 | ], 1096 | isError: false, 1097 | }; 1098 | } 1099 | 1100 | case 'firecrawl_crawl': { 1101 | if (!isCrawlOptions(args)) { 1102 | throw new Error('Invalid arguments for firecrawl_crawl'); 1103 | } 1104 | const { url, ...options } = args; 1105 | const response = await withRetry( 1106 | async () => 1107 | // @ts-expect-error Extended API options including origin 1108 | client.asyncCrawlUrl(url, { ...options, origin: 'mcp-server' }), 1109 | 'crawl operation' 1110 | ); 1111 | 1112 | if (!response.success) { 1113 | throw new Error(response.error); 1114 | } 1115 | 1116 | return { 1117 | content: [ 1118 | { 1119 | type: 'text', 1120 | text: trimResponseText( 1121 | `Started crawl for ${url} with job ID: ${response.id}. Use firecrawl_check_crawl_status to check progress.` 1122 | ), 1123 | }, 1124 | ], 1125 | isError: false, 1126 | }; 1127 | } 1128 | 1129 | case 'firecrawl_check_crawl_status': { 1130 | if (!isStatusCheckOptions(args)) { 1131 | throw new Error('Invalid arguments for firecrawl_check_crawl_status'); 1132 | } 1133 | const response = await client.checkCrawlStatus(args.id); 1134 | if (!response.success) { 1135 | throw new Error(response.error); 1136 | } 1137 | const status = `Crawl Status: 1138 | Status: ${response.status} 1139 | Progress: ${response.completed}/${response.total} 1140 | Credits Used: ${response.creditsUsed} 1141 | Expires At: ${response.expiresAt} 1142 | ${ 1143 | response.data.length > 0 ? '\nResults:\n' + formatResults(response.data) : '' 1144 | }`; 1145 | return { 1146 | content: [{ type: 'text', text: trimResponseText(status) }], 1147 | isError: false, 1148 | }; 1149 | } 1150 | 1151 | case 'firecrawl_search': { 1152 | if (!isSearchOptions(args)) { 1153 | throw new Error('Invalid arguments for firecrawl_search'); 1154 | } 1155 | try { 1156 | const response = await withRetry( 1157 | async () => 1158 | client.search(args.query, { ...args, origin: 'mcp-server' }), 1159 | 'search operation' 1160 | ); 1161 | 1162 | if (!response.success) { 1163 | throw new Error( 1164 | `Search failed: ${response.error || 'Unknown error'}` 1165 | ); 1166 | } 1167 | 1168 | // Format the results 1169 | const results = response.data 1170 | .map( 1171 | (result) => 1172 | `URL: ${result.url} 1173 | Title: ${result.title || 'No title'} 1174 | Description: ${result.description || 'No description'} 1175 | ${result.markdown ? `\nContent:\n${result.markdown}` : ''}` 1176 | ) 1177 | .join('\n\n'); 1178 | 1179 | return { 1180 | content: [{ type: 'text', text: trimResponseText(results) }], 1181 | isError: false, 1182 | }; 1183 | } catch (error) { 1184 | const errorMessage = 1185 | error instanceof Error 1186 | ? 
error.message 1187 | : `Search failed: ${JSON.stringify(error)}`; 1188 | return { 1189 | content: [{ type: 'text', text: trimResponseText(errorMessage) }], 1190 | isError: true, 1191 | }; 1192 | } 1193 | } 1194 | 1195 | case 'firecrawl_extract': { 1196 | if (!isExtractOptions(args)) { 1197 | throw new Error('Invalid arguments for firecrawl_extract'); 1198 | } 1199 | 1200 | try { 1201 | const extractStartTime = Date.now(); 1202 | 1203 | safeLog( 1204 | 'info', 1205 | `Starting extraction for URLs: ${args.urls.join(', ')}` 1206 | ); 1207 | 1208 | // Log if using self-hosted instance 1209 | if (FIRECRAWL_API_URL) { 1210 | safeLog('info', 'Using self-hosted instance for extraction'); 1211 | } 1212 | 1213 | const extractResponse = await withRetry( 1214 | async () => 1215 | client.extract(args.urls, { 1216 | prompt: args.prompt, 1217 | systemPrompt: args.systemPrompt, 1218 | schema: args.schema, 1219 | allowExternalLinks: args.allowExternalLinks, 1220 | enableWebSearch: args.enableWebSearch, 1221 | includeSubdomains: args.includeSubdomains, 1222 | origin: 'mcp-server', 1223 | } as ExtractParams), 1224 | 'extract operation' 1225 | ); 1226 | 1227 | // Type guard for successful response 1228 | if (!('success' in extractResponse) || !extractResponse.success) { 1229 | throw new Error(extractResponse.error || 'Extraction failed'); 1230 | } 1231 | 1232 | const response = extractResponse as ExtractResponse; 1233 | 1234 | // Log performance metrics 1235 | safeLog( 1236 | 'info', 1237 | `Extraction completed in ${Date.now() - extractStartTime}ms` 1238 | ); 1239 | 1240 | // Add warning to response if present 1241 | const result = { 1242 | content: [ 1243 | { 1244 | type: 'text', 1245 | text: trimResponseText(JSON.stringify(response.data, null, 2)), 1246 | }, 1247 | ], 1248 | isError: false, 1249 | }; 1250 | 1251 | if (response.warning) { 1252 | safeLog('warning', response.warning); 1253 | } 1254 | 1255 | return result; 1256 | } catch (error) { 1257 | const errorMessage = 1258 | error instanceof Error ? error.message : String(error); 1259 | 1260 | // Special handling for self-hosted instance errors 1261 | if ( 1262 | FIRECRAWL_API_URL && 1263 | errorMessage.toLowerCase().includes('not supported') 1264 | ) { 1265 | safeLog( 1266 | 'error', 1267 | 'Extraction is not supported by this self-hosted instance' 1268 | ); 1269 | return { 1270 | content: [ 1271 | { 1272 | type: 'text', 1273 | text: trimResponseText( 1274 | 'Extraction is not supported by this self-hosted instance. Please ensure LLM support is configured.' 
1275 | ), 1276 | }, 1277 | ], 1278 | isError: true, 1279 | }; 1280 | } 1281 | 1282 | return { 1283 | content: [{ type: 'text', text: trimResponseText(errorMessage) }], 1284 | isError: true, 1285 | }; 1286 | } 1287 | } 1288 | 1289 | case 'firecrawl_deep_research': { 1290 | if (!args || typeof args !== 'object' || !('query' in args)) { 1291 | throw new Error('Invalid arguments for firecrawl_deep_research'); 1292 | } 1293 | 1294 | try { 1295 | const researchStartTime = Date.now(); 1296 | safeLog('info', `Starting deep research for query: ${args.query}`); 1297 | 1298 | const response = await client.deepResearch( 1299 | args.query as string, 1300 | { 1301 | maxDepth: args.maxDepth as number, 1302 | timeLimit: args.timeLimit as number, 1303 | maxUrls: args.maxUrls as number, 1304 | // @ts-expect-error Extended API options including origin 1305 | origin: 'mcp-server', 1306 | }, 1307 | // Activity callback 1308 | (activity) => { 1309 | safeLog( 1310 | 'info', 1311 | `Research activity: ${activity.message} (Depth: ${activity.depth})` 1312 | ); 1313 | }, 1314 | // Source callback 1315 | (source) => { 1316 | safeLog( 1317 | 'info', 1318 | `Research source found: ${source.url}${source.title ? ` - ${source.title}` : ''}` 1319 | ); 1320 | } 1321 | ); 1322 | 1323 | // Log performance metrics 1324 | safeLog( 1325 | 'info', 1326 | `Deep research completed in ${Date.now() - researchStartTime}ms` 1327 | ); 1328 | 1329 | if (!response.success) { 1330 | throw new Error(response.error || 'Deep research failed'); 1331 | } 1332 | 1333 | // Format the results 1334 | const formattedResponse = { 1335 | finalAnalysis: response.data.finalAnalysis, 1336 | activities: response.data.activities, 1337 | sources: response.data.sources, 1338 | }; 1339 | 1340 | return { 1341 | content: [ 1342 | { 1343 | type: 'text', 1344 | text: trimResponseText(formattedResponse.finalAnalysis), 1345 | }, 1346 | ], 1347 | isError: false, 1348 | }; 1349 | } catch (error) { 1350 | const errorMessage = 1351 | error instanceof Error ? 
error.message : String(error); 1352 | return { 1353 | content: [{ type: 'text', text: trimResponseText(errorMessage) }], 1354 | isError: true, 1355 | }; 1356 | } 1357 | } 1358 | 1359 | case 'firecrawl_generate_llmstxt': { 1360 | if (!isGenerateLLMsTextOptions(args)) { 1361 | throw new Error('Invalid arguments for firecrawl_generate_llmstxt'); 1362 | } 1363 | 1364 | try { 1365 | const { url, ...params } = args; 1366 | const generateStartTime = Date.now(); 1367 | 1368 | safeLog('info', `Starting LLMs.txt generation for URL: ${url}`); 1369 | 1370 | // Start the generation process 1371 | const response = await withRetry( 1372 | async () => 1373 | // @ts-expect-error Extended API options including origin 1374 | client.generateLLMsText(url, { ...params, origin: 'mcp-server' }), 1375 | 'LLMs.txt generation' 1376 | ); 1377 | 1378 | if (!response.success) { 1379 | throw new Error(response.error || 'LLMs.txt generation failed'); 1380 | } 1381 | 1382 | // Log performance metrics 1383 | safeLog( 1384 | 'info', 1385 | `LLMs.txt generation completed in ${Date.now() - generateStartTime}ms` 1386 | ); 1387 | 1388 | // Format the response 1389 | let resultText = ''; 1390 | 1391 | if ('data' in response) { 1392 | resultText = `LLMs.txt content:\n\n${response.data.llmstxt}`; 1393 | 1394 | if (args.showFullText && response.data.llmsfulltxt) { 1395 | resultText += `\n\nLLMs-full.txt content:\n\n${response.data.llmsfulltxt}`; 1396 | } 1397 | } 1398 | 1399 | return { 1400 | content: [{ type: 'text', text: trimResponseText(resultText) }], 1401 | isError: false, 1402 | }; 1403 | } catch (error) { 1404 | const errorMessage = 1405 | error instanceof Error ? error.message : String(error); 1406 | return { 1407 | content: [{ type: 'text', text: trimResponseText(errorMessage) }], 1408 | isError: true, 1409 | }; 1410 | } 1411 | } 1412 | 1413 | default: 1414 | return { 1415 | content: [ 1416 | { type: 'text', text: trimResponseText(`Unknown tool: ${name}`) }, 1417 | ], 1418 | isError: true, 1419 | }; 1420 | } 1421 | } catch (error) { 1422 | // Log detailed error information 1423 | safeLog('error', { 1424 | message: `Request failed: ${ 1425 | error instanceof Error ? error.message : String(error) 1426 | }`, 1427 | tool: request.params.name, 1428 | arguments: request.params.arguments, 1429 | timestamp: new Date().toISOString(), 1430 | duration: Date.now() - startTime, 1431 | }); 1432 | return { 1433 | content: [ 1434 | { 1435 | type: 'text', 1436 | text: trimResponseText( 1437 | `Error: ${error instanceof Error ? error.message : String(error)}` 1438 | ), 1439 | }, 1440 | ], 1441 | isError: true, 1442 | }; 1443 | } finally { 1444 | // Log request completion with performance metrics 1445 | safeLog('info', `Request completed in ${Date.now() - startTime}ms`); 1446 | } 1447 | }); 1448 | 1449 | // Helper function to format results 1450 | function formatResults(data: FirecrawlDocument[]): string { 1451 | return data 1452 | .map((doc) => { 1453 | const content = doc.markdown || doc.html || doc.rawHtml || 'No content'; 1454 | return `URL: ${doc.url || 'Unknown URL'} 1455 | Content: ${content.substring(0, 100)}${content.length > 100 ? '...' : ''} 1456 | ${doc.metadata?.title ? 
`Title: ${doc.metadata.title}` : ''}`; 1457 | }) 1458 | .join('\n\n'); 1459 | } 1460 | 1461 | // Utility function to trim trailing whitespace from text responses 1462 | // This prevents Claude API errors with "final assistant content cannot end with trailing whitespace" 1463 | function trimResponseText(text: string): string { 1464 | return text.trim(); 1465 | } 1466 | 1467 | // Server startup 1468 | async function runLocalServer() { 1469 | try { 1470 | console.error('Initializing Firecrawl MCP Server...'); 1471 | 1472 | const transport = new StdioServerTransport(); 1473 | 1474 | // Detect if we're using stdio transport 1475 | isStdioTransport = transport instanceof StdioServerTransport; 1476 | if (isStdioTransport) { 1477 | console.error( 1478 | 'Running in stdio mode, logging will be directed to stderr' 1479 | ); 1480 | } 1481 | 1482 | await server.connect(transport); 1483 | 1484 | // Now that we're connected, we can send logging messages 1485 | safeLog('info', 'Firecrawl MCP Server initialized successfully'); 1486 | safeLog( 1487 | 'info', 1488 | `Configuration: API URL: ${FIRECRAWL_API_URL || 'default'}` 1489 | ); 1490 | 1491 | console.error('Firecrawl MCP Server running on stdio'); 1492 | } catch (error) { 1493 | console.error('Fatal error running server:', error); 1494 | process.exit(1); 1495 | } 1496 | } 1497 | async function runSSELocalServer() { 1498 | let transport: SSEServerTransport | null = null; 1499 | const app = express(); 1500 | 1501 | app.get('/sse', async (req, res) => { 1502 | transport = new SSEServerTransport(`/messages`, res); 1503 | res.on('close', () => { 1504 | transport = null; 1505 | }); 1506 | await server.connect(transport); 1507 | }); 1508 | 1509 | // Endpoint for the client to POST messages 1510 | // Remove express.json() middleware - let the transport handle the body 1511 | app.post('/messages', (req, res) => { 1512 | if (transport) { 1513 | transport.handlePostMessage(req, res); 1514 | } 1515 | }); 1516 | 1517 | const PORT = process.env.PORT || 3000; 1518 | console.log('Starting server on port', PORT); 1519 | try { 1520 | app.listen(PORT, () => { 1521 | console.log(`MCP SSE Server listening on http://localhost:${PORT}`); 1522 | console.log(`SSE endpoint: http://localhost:${PORT}/sse`); 1523 | console.log(`Message endpoint: http://localhost:${PORT}/messages`); 1524 | }); 1525 | } catch (error) { 1526 | console.error('Error starting server:', error); 1527 | } 1528 | } 1529 | 1530 | async function runSSECloudServer() { 1531 | const transports: { [sessionId: string]: SSEServerTransport } = {}; 1532 | const app = express(); 1533 | 1534 | app.get('/health', (req, res) => { 1535 | res.status(200).send('OK'); 1536 | }); 1537 | 1538 | app.get('/:apiKey/sse', async (req, res) => { 1539 | const apiKey = req.params.apiKey; 1540 | const transport = new SSEServerTransport(`/${apiKey}/messages`, res); 1541 | 1542 | //todo: validate api key, close if invalid 1543 | const compositeKey = `${apiKey}-${transport.sessionId}`; 1544 | transports[compositeKey] = transport; 1545 | res.on('close', () => { 1546 | delete transports[compositeKey]; 1547 | }); 1548 | await server.connect(transport); 1549 | }); 1550 | 1551 | // Endpoint for the client to POST messages 1552 | // Remove express.json() middleware - let the transport handle the body 1553 | app.post( 1554 | '/:apiKey/messages', 1555 | express.json(), 1556 | async (req: Request, res: Response) => { 1557 | const apiKey = req.params.apiKey; 1558 | const body = req.body; 1559 | const enrichedBody = { 1560 | ...body, 1561 | }; 1562 
| 1563 | if (enrichedBody && enrichedBody.params && !enrichedBody.params._meta) { 1564 | enrichedBody.params._meta = { apiKey }; 1565 | } else if ( 1566 | enrichedBody && 1567 | enrichedBody.params && 1568 | enrichedBody.params._meta 1569 | ) { 1570 | enrichedBody.params._meta.apiKey = apiKey; 1571 | } 1572 | 1573 | console.log('enrichedBody', enrichedBody); 1574 | 1575 | const sessionId = req.query.sessionId as string; 1576 | const compositeKey = `${apiKey}-${sessionId}`; 1577 | const transport = transports[compositeKey]; 1578 | if (transport) { 1579 | await transport.handlePostMessage(req, res, enrichedBody); 1580 | } else { 1581 | res.status(400).send('No transport found for sessionId'); 1582 | } 1583 | } 1584 | ); 1585 | 1586 | const PORT = 3000; 1587 | app.listen(PORT, () => { 1588 | console.log(`MCP SSE Server listening on http://localhost:${PORT}`); 1589 | console.log(`SSE endpoint: http://localhost:${PORT}/sse`); 1590 | console.log(`Message endpoint: http://localhost:${PORT}/messages`); 1591 | }); 1592 | } 1593 | 1594 | if (process.env.CLOUD_SERVICE === 'true') { 1595 | runSSECloudServer().catch((error: any) => { 1596 | console.error('Fatal error running server:', error); 1597 | process.exit(1); 1598 | }); 1599 | } else if (process.env.SSE_LOCAL === 'true') { 1600 | runSSELocalServer().catch((error: any) => { 1601 | console.error('Fatal error running server:', error); 1602 | process.exit(1); 1603 | }); 1604 | } else { 1605 | runLocalServer().catch((error: any) => { 1606 | console.error('Fatal error running server:', error); 1607 | process.exit(1); 1608 | }); 1609 | } 1610 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "module": "NodeNext", 5 | "moduleResolution": "NodeNext", 6 | "outDir": "./dist", 7 | "rootDir": "./src", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true 12 | }, 13 | "include": ["src/**/*"], 14 | "exclude": ["node_modules", "dist", "tests"] 15 | } 16 | --------------------------------------------------------------------------------