├── requirements.txt
├── Dockerfile
├── crawl_server.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
# Discord Bot with LangChain, LangMem, and MCP Server Tools
discord.py>=2.0.0
python-dotenv>=0.21.0  # For loading the .env file (API keys)
langchain>=0.1.0
langchain-community>=0.0.13
langchain-mistralai>=0.0.1
langchain-google-genai>=0.0.3
langchain-mcp-adapters>=0.0.1
langmem>=0.0.1
langgraph>=0.0.20
chromadb>=0.4.22  # Added for persistent memory
langgraph-checkpoint-sqlite
aiosqlite>=0.20.0

# Core MCP server library (includes FastAPI and Uvicorn for SSE)
mcp[cli]

# Web scraping and extraction library
crawl4ai

# Potentially needed by crawl4ai or underlying libraries, good to include
# httpx>=0.27.0          # Usually installed by crawl4ai/mcp
# beautifulsoup4>=4.12.2 # Usually installed by crawl4ai
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# syntax=docker/dockerfile:1
#
# MCP Server container
# - Works on Smithery.ai or any Docker runtime
# - Installs system deps needed by langchain/chromadb/lxml/etc.
# - Lets you override the start command with the START_CMD env var (Smithery sets $PORT)

FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

# System dependencies often required by the stack in requirements.txt
# (chromadb -> hnswlib, lxml; google/genai/grpc; sqlite for aiosqlite/langgraph checkpoints)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        git \
        curl \
        ca-certificates \
        libffi-dev \
        libssl-dev \
        pkg-config \
        libxml2-dev \
        libxslt1-dev \
        sqlite3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first to leverage the Docker build cache.
# Ensure your repo contains requirements.txt with the contents shown above.
COPY requirements.txt /app/requirements.txt

# Upgrade pip & install deps
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r /app/requirements.txt

# NOTE: crawl4ai drives a headless browser; if scraping fails at runtime you may
# also need to install browser binaries here (e.g. `playwright install --with-deps chromium`).

# Copy the rest of your server code (e.g., crawl_server.py, README.md, etc.)
COPY . /app

# Create unprivileged user
RUN useradd -u 10001 -ms /bin/bash appuser
USER appuser

# Smithery typically injects PORT; default to 8000 if not set.
# Note: the default entrypoint (crawl_server.py) currently hard-codes port 8002,
# so that is the port to publish for local/docker runs.
ENV PORT=8000

# Expose the server port for local/docker runs (no effect on some PaaS)
EXPOSE 8002

# Start command:
# By default we run your Python entrypoint.
# If your server exposes a FastAPI/Starlette app as `app`, you can override at runtime with:
#   -e START_CMD='uvicorn crawl_server:app --host 0.0.0.0 --port ${PORT:-8000}'
CMD bash -lc "${START_CMD:-python /app/crawl_server.py}"
--------------------------------------------------------------------------------
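
The `CMD` above defaults to running `crawl_server.py` directly, which starts its own SSE server via `mcp.run(transport="sse")`. The `uvicorn` override mentioned in the final comment only works if the module exposes an ASGI application named `app`. A minimal sketch of that export, assuming the installed `mcp` version provides FastMCP's `sse_app()` helper (this line is not in the current `crawl_server.py`):

```python
# Hypothetical addition near the bottom of crawl_server.py:
# expose the SSE transport as an ASGI app so `uvicorn crawl_server:app` works.
app = mcp.sse_app()  # assumes FastMCP.sse_app() exists in your mcp version
```
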
/crawl_server.py:
--------------------------------------------------------------------------------
from mcp.server.fastmcp import FastMCP
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
import json
import os
from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

print(f"[INIT].... → GOOGLE_API_KEY available: {GOOGLE_API_KEY is not None}")
print(f"[INIT].... → OPENAI_API_KEY available: {OPENAI_API_KEY is not None and OPENAI_API_KEY != ''}")
print(f"[INIT].... → MISTRAL_API_KEY available: {MISTRAL_API_KEY is not None and MISTRAL_API_KEY != ''}")

mcp = FastMCP("webcrawl")

mcp.settings.port = 8002


@mcp.tool()
async def scrape_url(url: str) -> str:
    """
    Scrape a webpage and return its content.

    Args:
        url: The URL of the webpage to scrape

    Returns:
        The webpage content in markdown format
    """
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)
            return result.markdown.raw_markdown if result.markdown else "No content found"
    except Exception as e:
        return f"Error scraping URL: {str(e)}"


@mcp.tool()
async def extract_text_by_query(url: str, query: str, context_size: int = 300) -> str:
    """
    Extract relevant text from a webpage based on a search query.

    Args:
        url: The URL of the webpage to search
        query: The search query to look for in the content
        context_size: Number of characters around the matching text to include (default: 300)

    Returns:
        The relevant text snippets containing the query
    """
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)

            # Get the text content from the markdown result
            if not result.markdown or not result.markdown.raw_markdown:
                return f"No text content found for URL: {url}"

            full_text = result.markdown.raw_markdown

            # Case-insensitive substring search
            query = query.lower()
            matches = []

            if query in full_text.lower():
                # Collect the position of every occurrence of the query
                positions = []
                current_pos = 0
                lower_text = full_text.lower()

                while True:
                    pos = lower_text.find(query, current_pos)
                    if pos == -1:
                        break
                    positions.append(pos)
                    current_pos = pos + len(query)

                # Build a window of `context_size` characters around each occurrence
                for pos in positions:
                    start = max(0, pos - context_size)
                    end = min(len(full_text), pos + len(query) + context_size)
                    context = full_text[start:end]
                    matches.append(context)

            if matches:
                # Return at most the first 5 matches
                matches = matches[:5]
                result_text = "\n\n---\n\n".join([f"Match {i+1}:\n{match}"
                                                  for i, match in enumerate(matches)])
                return f"Found {len(matches)} matches for '{query}' on the page. Here are the relevant sections:\n\n{result_text}"

            return f"No matches found for '{query}' on the page."
    except Exception as e:
        return f"Error searching page: {str(e)}"


@mcp.tool()
async def smart_extract(url: str, instruction: str) -> str:
    """
    Intelligently extract specific information from a webpage using LLM-based extraction.

    Args:
        url: The URL of the webpage to analyze
        instruction: Natural language instruction specifying what information to extract
                     (e.g., "Extract all mentions of machine learning and its applications")

    Returns:
        The extracted information based on the instruction
    """
    try:
        if GOOGLE_API_KEY:
            print("[EXTRACT] Using Google Gemini API directly")

            # LLM-based extraction via crawl4ai, driven by Gemini
            extraction_strategy = LLMExtractionStrategy(
                llm_config=LLMConfig(
                    provider="gemini/gemini-2.0-flash",
                    api_token=GOOGLE_API_KEY
                ),
                extraction_type="natural",
                instruction=instruction,
                extra_args={"temperature": 0.2}
            )

            # Configure the crawler run
            config = CrawlerRunConfig(
                extraction_strategy=extraction_strategy
            )

            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url, config=config)

                if result.extracted_content:
                    # Clean up the extracted content (pretty-print if it is a JSON string)
                    content = result.extracted_content
                    try:
                        parsed = json.loads(content)
                        content = json.dumps(parsed, indent=2)
                    except json.JSONDecodeError:
                        # Not JSON; return it as-is
                        pass

                    return f"Successfully extracted information based on your instruction:\n\n{content}"
                else:
                    return f"No relevant information found for your instruction: '{instruction}'"
        else:
            return "Error: Google API key not found. Please set GOOGLE_API_KEY in your environment."
    except Exception as e:
        return f"Error during intelligent extraction: {str(e)}"


if __name__ == "__main__":
    mcp.run(transport="sse")
--------------------------------------------------------------------------------
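
`crawl_server.py` pins the SSE port to 8002 via `mcp.settings.port`, which is why the Dockerfile and the README both publish port 8002 even though the container also defines a `PORT` variable. If you would rather have the server honor `$PORT`, a small tweak (not in the file as shipped) could look like this:

```python
# Hypothetical change to crawl_server.py: read the port from the environment,
# falling back to the current hard-coded default of 8002.
import os

mcp.settings.port = int(os.environ.get("PORT", "8002"))
```
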
/README.md:
--------------------------------------------------------------------------------
# Crawl4AI Web Scraper MCP Server

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

This project provides an MCP (Model Context Protocol) server that uses the **[crawl4ai](https://github.com/unclecode/crawl4ai)** library to perform web scraping and intelligent content extraction tasks. It allows AI agents (like Claude, or agents built with LangChain/LangGraph) to interact with web pages, retrieve content, search for specific text, and perform LLM-based extraction guided by natural language instructions.

This server uses:

* **[FastMCP](https://github.com/model-context-protocol/mcp-py/blob/main/docs/fastmcp.md):** For creating the MCP server endpoint.
* **[crawl4ai](https://github.com/unclecode/crawl4ai):** For the core web crawling and extraction logic.
* **[dotenv](https://github.com/theskumar/python-dotenv):** For managing API keys via a `.env` file.
* **(Optional) Docker:** For containerized deployment, bundling Python and dependencies.

## Features

* Exposes MCP tools for web interaction:
    * `scrape_url`: Get the full content of a webpage in Markdown format.
    * `extract_text_by_query`: Find specific text snippets on a page based on a query.
    * `smart_extract`: Use an LLM (currently Google Gemini) to extract structured information based on instructions.
* Configurable via environment variables (API keys).
* Includes Docker configuration (`Dockerfile`) for easy, self-contained deployment.
* Communicates over Server-Sent Events (SSE) on port 8002 by default.
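
Since `requirements.txt` already lists `langchain-mcp-adapters`, `langgraph`, and `langchain-google-genai`, a rough sketch of loading these tools into a LangGraph agent over SSE might look like the following. Treat it as a sketch only: the adapter's client API has changed across releases (newer versions drop the context-manager usage), and the Gemini model choice is just one option from the requirements list.

```python
# Sketch: wiring the webcrawl MCP server into a LangGraph ReAct agent.
# Assumes the MultiServerMCPClient interface from langchain-mcp-adapters 0.0.x.
import asyncio

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_mcp_adapters.client import MultiServerMCPClient
from langgraph.prebuilt import create_react_agent


async def main() -> None:
    async with MultiServerMCPClient(
        {"webcrawl": {"url": "http://127.0.0.1:8002/sse", "transport": "sse"}}
    ) as client:
        model = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
        agent = create_react_agent(model, client.get_tools())
        reply = await agent.ainvoke(
            {"messages": [("user", "Scrape https://example.com and summarize it.")]}
        )
        print(reply["messages"][-1].content)


asyncio.run(main())
```
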
## Exposed MCP Tools

### `scrape_url`

Scrape a webpage and return its content in Markdown format.

**Arguments:**

* `url` (str, **required**): The URL of the webpage to scrape.

**Returns:**

* (str): The webpage content in Markdown format, or an error message.

### `extract_text_by_query`

Extract relevant text snippets from a webpage that contain a specific search query. Returns up to the first 5 matches found.

**Arguments:**

* `url` (str, **required**): The URL of the webpage to search within.
* `query` (str, **required**): The text query to search for (case-insensitive).
* `context_size` (int, *optional*): The number of characters to include before and after the matched query text in each snippet. Defaults to `300`.

**Returns:**

* (str): A formatted string containing the found text snippets, a message indicating no matches were found, or an error message.

### `smart_extract`

Intelligently extract specific information from a webpage using the configured LLM (currently requires a Google Gemini API key) based on a natural language instruction.

**Arguments:**

* `url` (str, **required**): The URL of the webpage to analyze and extract from.
* `instruction` (str, **required**): Natural language instruction specifying what information to extract (e.g., "List all the speakers mentioned on this page", "Extract the main contact email address", "Summarize the key findings").

**Returns:**

* (str): The extracted information (often formatted as JSON or structured text, depending on the instruction), a message indicating no relevant information was found, or an error message (including when the required API key is missing).

## Setup and Running

You can run this server either locally or using the provided Docker configuration.

### Option 1: Running with Docker (Recommended for Deployment)

This method bundles Python and all necessary libraries. You only need Docker installed on the host machine.

1. **Install Docker:** Download and install [Docker Desktop](https://www.docker.com/products/docker-desktop/) for your OS. Start Docker Desktop.
2. **Clone Repository:**
   ```bash
   git clone https://github.com/your-username/your-repo-name.git # Replace with your repo URL
   cd your-repo-name
   ```
3. **Create `.env` File:** Create a file named `.env` in the project root directory and add your API keys:
   ```.env
   # Required for the smart_extract tool
   GOOGLE_API_KEY=your_google_ai_api_key_here

   # Optional, checked by the server but not currently used by any tool
   # OPENAI_API_KEY=your_openai_key_here
   # MISTRAL_API_KEY=your_mistral_key_here
   ```
4. **Build the Docker Image:**
   ```bash
   docker build -t crawl4ai-mcp-server .
   ```
5. **Run the Container:** This starts the server, making port 8002 available on your host machine. It uses `--env-file` to securely pass the API keys from your local `.env` file into the container's environment.
   ```bash
   docker run -it --rm -p 8002:8002 --env-file .env crawl4ai-mcp-server
   ```
   * `-it`: Runs interactively.
   * `--rm`: Removes the container on exit.
   * `-p 8002:8002`: Maps host port 8002 to container port 8002.
   * `--env-file .env`: Loads environment variables from your local `.env` file into the container. **Crucial for API keys.**
   * `crawl4ai-mcp-server`: The name of the image you built.
6. **Server is Running:** Logs will appear, indicating the server is listening for SSE connections on `http://0.0.0.0:8002`.
7. **Connecting Client:** Configure your MCP client (e.g., a LangChain agent) to connect to `http://127.0.0.1:8002/sse` with `transport: "sse"`. A quick connectivity check is sketched below.

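As a quick connectivity check, the sketch below uses the low-level `mcp` client SDK (installed via `mcp[cli]` in `requirements.txt`) to list the server's tools and call `scrape_url`; exact method names may vary slightly between SDK versions.

```python
# Minimal smoke test: connect to the SSE endpoint, list tools, call scrape_url.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client


async def main() -> None:
    async with sse_client("http://127.0.0.1:8002/sse") as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print("Tools:", [tool.name for tool in tools.tools])
            result = await session.call_tool("scrape_url", {"url": "https://example.com"})
            print(result.content)


asyncio.run(main())
```
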
### Option 2: Running Locally

This requires Python and manual installation of dependencies on your host machine.

1. **Install Python:** Ensure Python >= 3.9 is installed (check the `crawl4ai` requirements if needed; 3.10+ is recommended).
2. **Clone Repository:**
   ```bash
   git clone https://github.com/your-username/your-repo-name.git # Replace with your repo URL
   cd your-repo-name
   ```
3. **Create Virtual Environment (Recommended):**
   ```bash
   python -m venv venv
   source venv/bin/activate # Linux/macOS
   # venv\Scripts\activate # Windows
   ```
   *(Or use Conda: `conda create --name crawl4ai-env python=3.11 -y && conda activate crawl4ai-env`)*
4. **Install Dependencies:**
   ```bash
   pip install -r requirements.txt
   ```
5. **Create `.env` File:** Create a file named `.env` in the project root directory and add your API keys (same content as in step 3 of the Docker setup).
6. **Run the Server:**
   ```bash
   python crawl_server.py
   ```
7. **Server is Running:** It will listen on `http://127.0.0.1:8002/sse`.
8. **Connecting Client:** Configure your MCP client to connect to `http://127.0.0.1:8002/sse`.

## Environment Variables

The server uses the following environment variables, typically loaded from a `.env` file:

* `GOOGLE_API_KEY`: **Required** for the `smart_extract` tool to function (uses Google Gemini). Get one from [Google AI Studio](https://aistudio.google.com/app/apikey).
* `OPENAI_API_KEY`: Checked for existence but **not currently used** by any tool in this version.
* `MISTRAL_API_KEY`: Checked for existence but **not currently used** by any tool in this version.

## Example Agent Interaction

```
# Example using the agent CLI from the previous setup

You: scrape_url https://example.com
Agent: Thinking...
[Agent calls scrape_url tool]
Agent: [Markdown content of example.com]
------------------------------
You: extract text from https://en.wikipedia.org/wiki/Web_scraping using the query "ethical considerations"
Agent: Thinking...
[Agent calls extract_text_by_query tool]
Agent: Found X matches for 'ethical considerations' on the page. Here are the relevant sections:
Match 1:
... text snippet ...
---
Match 2:
... text snippet ...
------------------------------
You: Use smart_extract on https://blog.google/technology/ai/google-gemini-ai/ to get the main points about Gemini models
Agent: Thinking...
[Agent calls smart_extract tool with the Google API key]
Agent: Successfully extracted information based on your instruction:
{
  "main_points": [
    "Gemini is Google's most capable AI model family (Ultra, Pro, Nano).",
    "Designed to be multimodal, understanding text, code, audio, image, video.",
    "Outperforms previous models on various benchmarks.",
    "Being integrated into Google products like Bard and Pixel."
  ]
}
```

## Files

* `crawl_server.py`: The main Python script for the MCP server.
* `Dockerfile`: Instructions for building the Docker container image.
* `requirements.txt`: Python dependencies.
* `.env.example`: (Recommended) An example environment file showing the needed keys. **Do not commit your actual `.env` file.**
* `.gitignore`: Specifies intentionally untracked files for Git (should include `.env`).
* `README.md`: This file.

## Contributing

(Add contribution guidelines if desired)

## License

(Specify your license, e.g., MIT License)
--------------------------------------------------------------------------------