├── requirements.txt
├── Dockerfile
├── crawl_server.py
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
# Discord Bot with LangChain, LangMem, and MCP Server Tools
discord.py>=2.0.0
python-dotenv>=0.21.0  # For loading the .env file (API keys)
langchain>=0.1.0
langchain-community>=0.0.13
langchain-mistralai>=0.0.1
langchain-google-genai>=0.0.3
langchain-mcp-adapters>=0.0.1
langmem>=0.0.1
langgraph>=0.0.20
chromadb>=0.4.22  # Added for persistent memory
langgraph-checkpoint-sqlite
aiosqlite>=0.20.0

# Core MCP server library (includes FastAPI and Uvicorn for SSE)
mcp[cli]

# Web scraping and extraction library
crawl4ai

# Potentially needed by crawl4ai or underlying libraries, good to include
# httpx>=0.27.0          # Usually installed by crawl4ai/mcp
# beautifulsoup4>=4.12.2 # Usually installed by crawl4ai
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# syntax=docker/dockerfile:1
#
# MCP Server container
# - Works on Smithery.ai or any Docker runtime
# - Installs system deps needed by langchain/chromadb/lxml/etc.
# - Lets you override the start command with the START_CMD env var (Smithery sets $PORT)

FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1

# System dependencies often required by the stack in requirements.txt
# (chromadb -> hnswlib, lxml; google/genai/grpc; sqlite for aiosqlite/langgraph checkpoints)
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        git \
        curl \
        ca-certificates \
        libffi-dev \
        libssl-dev \
        pkg-config \
        libxml2-dev \
        libxslt1-dev \
        sqlite3 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first to leverage the Docker build cache.
# Ensure your repo contains requirements.txt with the contents shown above.
COPY requirements.txt /app/requirements.txt

# Upgrade pip & install deps
RUN python -m pip install --upgrade pip setuptools wheel && \
    pip install --no-cache-dir -r /app/requirements.txt

# NOTE: crawl4ai drives a headless browser; if scraping fails at runtime you may
# also need to install browser binaries here (e.g. `playwright install --with-deps chromium`).

# Copy the rest of your server code (e.g., crawl_server.py, README.md, etc.)
COPY . /app

# Create unprivileged user
RUN useradd -u 10001 -ms /bin/bash appuser
USER appuser

# Smithery typically injects PORT; default to 8000 if not set.
# Note: the default entrypoint (crawl_server.py) currently hard-codes port 8002,
# so that is the port to publish for local/docker runs.
ENV PORT=8000

# Expose the server port for local/docker runs (no effect on some PaaS)
EXPOSE 8002

# Start command:
# By default we run your Python entrypoint.
# If your server exposes a FastAPI/Starlette app as `app`, you can override at runtime with:
#   -e START_CMD='uvicorn crawl_server:app --host 0.0.0.0 --port ${PORT:-8000}'
CMD bash -lc "${START_CMD:-python /app/crawl_server.py}"
--------------------------------------------------------------------------------
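
The `CMD` above defaults to running `crawl_server.py` directly, which starts its own SSE server via `mcp.run(transport="sse")`. The `uvicorn` override mentioned in the final comment only works if the module exposes an ASGI application named `app`. A minimal sketch of that export, assuming the installed `mcp` version provides FastMCP's `sse_app()` helper (this line is not in the current `crawl_server.py`):

```python
# Hypothetical addition near the bottom of crawl_server.py:
# expose the SSE transport as an ASGI app so `uvicorn crawl_server:app` works.
app = mcp.sse_app()  # assumes FastMCP.sse_app() exists in your mcp version
```
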
/crawl_server.py:
--------------------------------------------------------------------------------
from mcp.server.fastmcp import FastMCP
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
import json
import os
from dotenv import load_dotenv

load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

print(f"[INIT].... → GOOGLE_API_KEY available: {GOOGLE_API_KEY is not None}")
print(f"[INIT].... → OPENAI_API_KEY available: {OPENAI_API_KEY is not None and OPENAI_API_KEY != ''}")
print(f"[INIT].... → MISTRAL_API_KEY available: {MISTRAL_API_KEY is not None and MISTRAL_API_KEY != ''}")

mcp = FastMCP("webcrawl")

mcp.settings.port = 8002


@mcp.tool()
async def scrape_url(url: str) -> str:
    """
    Scrape a webpage and return its content.

    Args:
        url: The URL of the webpage to scrape

    Returns:
        The webpage content in markdown format
    """
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)
            return result.markdown.raw_markdown if result.markdown else "No content found"
    except Exception as e:
        return f"Error scraping URL: {str(e)}"


@mcp.tool()
async def extract_text_by_query(url: str, query: str, context_size: int = 300) -> str:
    """
    Extract relevant text from a webpage based on a search query.

    Args:
        url: The URL of the webpage to search
        query: The search query to look for in the content
        context_size: Number of characters around the matching text to include (default: 300)

    Returns:
        The relevant text snippets containing the query
    """
    try:
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(url=url)

            # Get the text content from the markdown result
            if not result.markdown or not result.markdown.raw_markdown:
                return f"No text content found for URL: {url}"

            full_text = result.markdown.raw_markdown

            # Case-insensitive substring search
            query = query.lower()
            matches = []

            if query in full_text.lower():
                # Collect the position of every occurrence of the query
                positions = []
                current_pos = 0
                lower_text = full_text.lower()

                while True:
                    pos = lower_text.find(query, current_pos)
                    if pos == -1:
                        break
                    positions.append(pos)
                    current_pos = pos + len(query)

                # Build a window of `context_size` characters around each occurrence
                for pos in positions:
                    start = max(0, pos - context_size)
                    end = min(len(full_text), pos + len(query) + context_size)
                    context = full_text[start:end]
                    matches.append(context)

            if matches:
                # Return at most the first 5 matches
                matches = matches[:5]
                result_text = "\n\n---\n\n".join([f"Match {i+1}:\n{match}"
                                                  for i, match in enumerate(matches)])
                return f"Found {len(matches)} matches for '{query}' on the page. Here are the relevant sections:\n\n{result_text}"

            return f"No matches found for '{query}' on the page."
    except Exception as e:
        return f"Error searching page: {str(e)}"


@mcp.tool()
async def smart_extract(url: str, instruction: str) -> str:
    """
    Intelligently extract specific information from a webpage using LLM-based extraction.

    Args:
        url: The URL of the webpage to analyze
        instruction: Natural language instruction specifying what information to extract
                     (e.g., "Extract all mentions of machine learning and its applications")

    Returns:
        The extracted information based on the instruction
    """
    try:
        if GOOGLE_API_KEY:
            print("[EXTRACT] Using Google Gemini API directly")

            # LLM-based extraction via crawl4ai, driven by Gemini
            extraction_strategy = LLMExtractionStrategy(
                llm_config=LLMConfig(
                    provider="gemini/gemini-2.0-flash",
                    api_token=GOOGLE_API_KEY
                ),
                extraction_type="natural",
                instruction=instruction,
                extra_args={"temperature": 0.2}
            )

            # Configure the crawler run
            config = CrawlerRunConfig(
                extraction_strategy=extraction_strategy
            )

            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url=url, config=config)

                if result.extracted_content:
                    # Clean up the extracted content (pretty-print if it is a JSON string)
                    content = result.extracted_content
                    try:
                        parsed = json.loads(content)
                        content = json.dumps(parsed, indent=2)
                    except json.JSONDecodeError:
                        # Not JSON; return it as-is
                        pass

                    return f"Successfully extracted information based on your instruction:\n\n{content}"
                else:
                    return f"No relevant information found for your instruction: '{instruction}'"
        else:
            return "Error: Google API key not found. Please set GOOGLE_API_KEY in your environment."
    except Exception as e:
        return f"Error during intelligent extraction: {str(e)}"


if __name__ == "__main__":
    mcp.run(transport="sse")
--------------------------------------------------------------------------------
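
`crawl_server.py` pins the SSE port to 8002 via `mcp.settings.port`, which is why the Dockerfile and the README both publish port 8002 even though the container also defines a `PORT` variable. If you would rather have the server honor `$PORT`, a small tweak (not in the file as shipped) could look like this:

```python
# Hypothetical change to crawl_server.py: read the port from the environment,
# falling back to the current hard-coded default of 8002.
import os

mcp.settings.port = int(os.environ.get("PORT", "8002"))
```
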
/README.md:
--------------------------------------------------------------------------------
# Crawl4AI Web Scraper MCP Server

[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

This project provides an MCP (Model Context Protocol) server that uses the **[crawl4ai](https://github.com/unclecode/crawl4ai)** library to perform web scraping and intelligent content extraction tasks. It allows AI agents (like Claude, or agents built with LangChain/LangGraph) to interact with web pages, retrieve content, search for specific text, and perform LLM-based extraction guided by natural language instructions.

This server uses:

* **[FastMCP](https://github.com/model-context-protocol/mcp-py/blob/main/docs/fastmcp.md):** For creating the MCP server endpoint.
* **[crawl4ai](https://github.com/unclecode/crawl4ai):** For the core web crawling and extraction logic.
* **[dotenv](https://github.com/theskumar/python-dotenv):** For managing API keys via a `.env` file.
* **(Optional) Docker:** For containerized deployment, bundling Python and dependencies.

## Features

* Exposes MCP tools for web interaction:
    * `scrape_url`: Get the full content of a webpage in Markdown format.
    * `extract_text_by_query`: Find specific text snippets on a page based on a query.
    * `smart_extract`: Use an LLM (currently Google Gemini) to extract structured information based on instructions.
* Configurable via environment variables (API keys).
* Includes Docker configuration (`Dockerfile`) for easy, self-contained deployment.
* Communicates over Server-Sent Events (SSE) on port 8002 by default.
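
Since `requirements.txt` already lists `langchain-mcp-adapters`, `langgraph`, and `langchain-google-genai`, a rough sketch of loading these tools into a LangGraph agent over SSE might look like the following. Treat it as a sketch only: the adapter's client API has changed across releases (newer versions drop the context-manager usage), and the Gemini model choice is just one option from the requirements list.

```python
# Sketch: wiring the webcrawl MCP server into a LangGraph ReAct agent.
# Assumes the MultiServerMCPClient interface from langchain-mcp-adapters 0.0.x.
import asyncio

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_mcp_adapters.client import MultiServerMCPClient
from langgraph.prebuilt import create_react_agent


async def main() -> None:
    async with MultiServerMCPClient(
        {"webcrawl": {"url": "http://127.0.0.1:8002/sse", "transport": "sse"}}
    ) as client:
        model = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
        agent = create_react_agent(model, client.get_tools())
        reply = await agent.ainvoke(
            {"messages": [("user", "Scrape https://example.com and summarize it.")]}
        )
        print(reply["messages"][-1].content)


asyncio.run(main())
```
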
## Exposed MCP Tools

### `scrape_url`

Scrape a webpage and return its content in Markdown format.

**Arguments:**

* `url` (str, **required**): The URL of the webpage to scrape.

**Returns:**

* (str): The webpage content in Markdown format, or an error message.

### `extract_text_by_query`

Extract relevant text snippets from a webpage that contain a specific search query. Returns up to the first 5 matches found.

**Arguments:**

* `url` (str, **required**): The URL of the webpage to search within.
* `query` (str, **required**): The text query to search for (case-insensitive).
* `context_size` (int, *optional*): The number of characters to include before and after the matched query text in each snippet. Defaults to `300`.

**Returns:**

* (str): A formatted string containing the found text snippets, a message indicating no matches were found, or an error message.

### `smart_extract`

Intelligently extract specific information from a webpage using the configured LLM (currently requires a Google Gemini API key) based on a natural language instruction.

**Arguments:**

* `url` (str, **required**): The URL of the webpage to analyze and extract from.
* `instruction` (str, **required**): Natural language instruction specifying what information to extract (e.g., "List all the speakers mentioned on this page", "Extract the main contact email address", "Summarize the key findings").

**Returns:**

* (str): The extracted information (often formatted as JSON or structured text, depending on the instruction), a message indicating no relevant information was found, or an error message (including when the required API key is missing).

## Setup and Running

You can run this server either locally or using the provided Docker configuration.

### Option 1: Running with Docker (Recommended for Deployment)

This method bundles Python and all necessary libraries. You only need Docker installed on the host machine.

1. **Install Docker:** Download and install [Docker Desktop](https://www.docker.com/products/docker-desktop/) for your OS. Start Docker Desktop.
2. **Clone Repository:**
   ```bash
   git clone https://github.com/your-username/your-repo-name.git # Replace with your repo URL
   cd your-repo-name
   ```
3. **Create `.env` File:** Create a file named `.env` in the project root directory and add your API keys:
   ```.env
   # Required for the smart_extract tool
   GOOGLE_API_KEY=your_google_ai_api_key_here

   # Optional, checked by the server but not currently used by any tool
   # OPENAI_API_KEY=your_openai_key_here
   # MISTRAL_API_KEY=your_mistral_key_here
   ```
4. **Build the Docker Image:**
   ```bash
   docker build -t crawl4ai-mcp-server .
   ```
5. **Run the Container:** This starts the server, making port 8002 available on your host machine. It uses `--env-file` to securely pass the API keys from your local `.env` file into the container's environment.
   ```bash
   docker run -it --rm -p 8002:8002 --env-file .env crawl4ai-mcp-server
   ```
   * `-it`: Runs interactively.
   * `--rm`: Removes the container on exit.
   * `-p 8002:8002`: Maps host port 8002 to container port 8002.
   * `--env-file .env`: Loads environment variables from your local `.env` file into the container. **Crucial for API keys.**
   * `crawl4ai-mcp-server`: The name of the image you built.
6. **Server is Running:** Logs will appear, indicating the server is listening for SSE connections on `http://0.0.0.0:8002`.
7. **Connecting Client:** Configure your MCP client (e.g., a LangChain agent) to connect to `http://127.0.0.1:8002/sse` with `transport: "sse"`. A quick connectivity check is sketched below.

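As a quick connectivity check, the sketch below uses the low-level `mcp` client SDK (installed via `mcp[cli]` in `requirements.txt`) to list the server's tools and call `scrape_url`; exact method names may vary slightly between SDK versions.

```python
# Minimal smoke test: connect to the SSE endpoint, list tools, call scrape_url.
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client


async def main() -> None:
    async with sse_client("http://127.0.0.1:8002/sse") as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print("Tools:", [tool.name for tool in tools.tools])
            result = await session.call_tool("scrape_url", {"url": "https://example.com"})
            print(result.content)


asyncio.run(main())
```
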
### Option 2: Running Locally

This requires Python and manual installation of dependencies on your host machine.

1. **Install Python:** Ensure Python >= 3.9 is installed (check the `crawl4ai` requirements if needed; 3.10+ is recommended).
2. **Clone Repository:**
   ```bash
   git clone https://github.com/your-username/your-repo-name.git # Replace with your repo URL
   cd your-repo-name
   ```
3. **Create Virtual Environment (Recommended):**
   ```bash
   python -m venv venv
   source venv/bin/activate # Linux/macOS
   # venv\Scripts\activate # Windows
   ```
   *(Or use Conda: `conda create --name crawl4ai-env python=3.11 -y && conda activate crawl4ai-env`)*
4. **Install Dependencies:**
   ```bash
   pip install -r requirements.txt
   ```
5. **Create `.env` File:** Create a file named `.env` in the project root directory and add your API keys (same content as in step 3 of the Docker setup).
6. **Run the Server:**
   ```bash
   python crawl_server.py
   ```
7. **Server is Running:** It will listen on `http://127.0.0.1:8002/sse`.
8. **Connecting Client:** Configure your MCP client to connect to `http://127.0.0.1:8002/sse`.

## Environment Variables

The server uses the following environment variables, typically loaded from a `.env` file:

* `GOOGLE_API_KEY`: **Required** for the `smart_extract` tool to function (uses Google Gemini). Get one from [Google AI Studio](https://aistudio.google.com/app/apikey).
* `OPENAI_API_KEY`: Checked for existence but **not currently used** by any tool in this version.
* `MISTRAL_API_KEY`: Checked for existence but **not currently used** by any tool in this version.

## Example Agent Interaction

```
# Example using the agent CLI from the previous setup

You: scrape_url https://example.com
Agent: Thinking...
[Agent calls scrape_url tool]
Agent: [Markdown content of example.com]
------------------------------
You: extract text from https://en.wikipedia.org/wiki/Web_scraping using the query "ethical considerations"
Agent: Thinking...
[Agent calls extract_text_by_query tool]
Agent: Found X matches for 'ethical considerations' on the page. Here are the relevant sections:
Match 1:
... text snippet ...
---
Match 2:
... text snippet ...
------------------------------
You: Use smart_extract on https://blog.google/technology/ai/google-gemini-ai/ to get the main points about Gemini models
Agent: Thinking...
[Agent calls smart_extract tool with the Google API key]
Agent: Successfully extracted information based on your instruction:
{
  "main_points": [
    "Gemini is Google's most capable AI model family (Ultra, Pro, Nano).",
    "Designed to be multimodal, understanding text, code, audio, image, video.",
    "Outperforms previous models on various benchmarks.",
    "Being integrated into Google products like Bard and Pixel."
  ]
}
```

## Files

* `crawl_server.py`: The main Python script for the MCP server.
* `Dockerfile`: Instructions for building the Docker container image.
* `requirements.txt`: Python dependencies.
* `.env.example`: (Recommended) An example environment file showing the needed keys. **Do not commit your actual `.env` file.**
* `.gitignore`: Specifies intentionally untracked files for Git (should include `.env`).
* `README.md`: This file.

## Contributing

(Add contribution guidelines if desired)

## License

(Specify your license, e.g., MIT License)
--------------------------------------------------------------------------------