├── .gitignore ├── LICENSE ├── README.md ├── app_architecture.png ├── core ├── .env.example ├── LICENSE ├── README.md ├── __init__.py ├── agents │ ├── __init__.py │ └── base │ │ ├── __init__.py │ │ └── react.py ├── clients │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── mcp.py │ │ └── utils.py │ └── langgraph │ │ ├── __init__.py │ │ └── client.py ├── mcp_architecture.png ├── pyproject.toml └── uv.lock └── servers ├── .env.example ├── LICENSE ├── README.md ├── pyproject.toml ├── uv.lock ├── web_firecrawl_service_v0.1.0.png └── website_firecrawl_service ├── __init__.py ├── crawler.py ├── openai.py ├── prompt.py ├── server.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 lgesuellip 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | An application built on the Model Context Protocol (MCP) that transforms any website into highly relevant content based on your queries. The app seamlessly integrates with platforms like X, Slack, and others through Arcade. 2 | 3 | ### Perfect For 4 | - LLM-Ready File Creation: Generate .txt files optimized for use with large language models. 5 | - Documentation Indexing: Organize and structure documentation effortlessly. 
6 | - Research Automation: Save time by automating repetitive research tasks.
7 | 
8 | ### Tech Stack
9 | - LangGraph as the MCP Client
10 | - Firecrawl for web research (site mapping, intelligent selection, and scraping)
11 | - Arcade for seamless platform integration (X, Slack, etc.)
12 | - Tracing powered by LangChain's LangSmith
13 | - Utilizes OpenAI's structured outputs, async processing, exponential backoff, and Pydantic for reliability
14 | 
15 | ### Architecture Diagram
16 | 
17 | ![Architecture Diagram](app_architecture.png)
18 | 
--------------------------------------------------------------------------------
/app_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/app_architecture.png
--------------------------------------------------------------------------------
/core/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY =
2 | 
3 | # If you want to use Arcade, you can set these variables
4 | ARCADE_API_KEY=
5 | ARCADE_USER_ID=
6 | 
7 | # Optional
8 | LANGSMITH_API_KEY =
9 | LANGCHAIN_TRACING_V2 =
10 | LANGCHAIN_PROJECT =
11 | 
--------------------------------------------------------------------------------
/core/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 lgesuellip
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/core/README.md:
--------------------------------------------------------------------------------
1 | # LangGraph MCP Integration
2 | 
3 | This repository demonstrates how to integrate LangGraph with MCP servers, allowing you to build powerful agent-based applications that can interact with both MCP servers and external services.
4 | 
5 | ## Features
6 | 
7 | - Seamless integration of LangGraph with MCP servers
8 | - Support for ReAct-style agents using MCP tools
9 | - Integration with Arcade and custom services (like Firecrawl)
10 | - Asynchronous tool execution
11 | - Type-safe tool wrapping with Pydantic models
12 | 
13 | Make sure to set up your environment variables in a `.env` file:
14 | 
15 | ```env
16 | OPENAI_API_KEY=your_openai_api_key
17 | ARCADE_API_KEY=your_arcade_api_key
18 | ARCADE_USER_ID=your_arcade_user_id
19 | ```
20 | 
21 | ### Development Commands
22 | 
23 | ```bash
24 | uv run agents/base/react.py
25 | ```
26 | 
27 | ## Architecture
28 | 
29 | ![MCP Architecture](mcp_architecture.png)
30 | 
31 | ### Components
32 | 
33 | 1. **LangGraph Agent**
34 |    - React Agent: Implements the ReAct pattern for reasoning and action
35 |    - LangGraph MCP Client: Bridges LangGraph with MCP servers
36 | 
37 | 2. **MCP Servers**
38 |    - Firecrawl Server: Provides web crawling capabilities
39 | 
40 | 3. **Arcade Server**: Offers integration with various services; its tool interface feels very familiar coming from MCP.
41 | 
42 | ## Usage
43 | 
44 | ### Basic LangGraph MCP Client
45 | 
46 | The `LanggraphMCPClient` class provides a bridge between LangGraph and MCP servers:
47 | 
48 | ```python
49 | from clients.langgraph.client import LanggraphMCPClient
50 | from mcp import StdioServerParameters
51 | 
52 | # Configure your MCP server
53 | server_params = StdioServerParameters(
54 |     command="uv",
55 |     args=[
56 |         "--directory",
57 |         "/path/to/your/service",
58 |         "run",
59 |         "website-firecrawl-service"
60 |     ]
61 | )
62 | 
63 | # Use the client to get tools from the MCP server
64 | async with LanggraphMCPClient(server_params=server_params) as mcp_client:
65 |     tools = await mcp_client.get_tools()
66 | ```
67 | 
68 | ### Creating a React Agent with MCP Tools
69 | 
70 | Here's an example of creating a React agent that combines MCP tools with Arcade services:
71 | 
72 | ```python
73 | from langchain_core.messages import HumanMessage
74 | from langchain_openai import ChatOpenAI
75 | from langgraph.prebuilt import create_react_agent
76 | from langchain_arcade import ArcadeToolManager
77 | import os
78 | async def create_agent():
79 |     tools = []
80 | 
81 |     # Get tools from MCP server
82 |     async with LanggraphMCPClient(server_params=server_params) as mcp_client:
83 |         tools.extend(await mcp_client.get_tools())
84 | 
85 |     # Add Arcade tools
86 |     tool_arcade_manager = ArcadeToolManager(api_key=os.getenv("ARCADE_API_KEY"))
87 |     tools.extend(tool_arcade_manager.get_tools(toolkits=["slack"]))
88 | 
89 |     # Create the agent with combined tools
90 |     model = ChatOpenAI(model="gpt-4")
91 |     graph = create_react_agent(model, tools=tools)
92 | 
93 |     return graph
94 | 
95 | graph = await create_agent()  # build the agent, then invoke it
96 | inputs = {
97 |     "messages": [HumanMessage(content="Analyze pampa.ai and send a summary to Slack")],
98 | }
99 | 
100 | config = {
101 |     "configurable": {
102 |         "thread_id": "1",
103 |         "user_id": os.getenv("ARCADE_USER_ID"),
104 |     }
105 | }
106 | 
107 | result = await graph.ainvoke(inputs, config=config)
108 | ```
109 | 
110 | ### Using the Firecrawl Service
111 | 
112 | The Firecrawl service is implemented as an MCP server, allowing your agents to crawl and analyze websites:
113 | 
114 | ```python
115 | # Example configuration in your MCP server setup:
116 | server_params = StdioServerParameters(
117 |     command="uv",
118 |     args=[
119 |         "--directory",
120 |         "/path/to/firecrawl",
121 |         "run",
122 |         "website-firecrawl-service"
123 |     ]
124 | )
125 | 
126 | # The Firecrawl service tools become available through the MCP client
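# Once wrapped, the tools can also be invoked directly, without an agent.
# A sketch (argument names follow the server's WebsiteCrawlArgs schema;
# the concrete values here are placeholders):
#
#   async with LanggraphMCPClient(server_params=server_params) as mcp_client:
#       firecrawl_tool = (await mcp_client.get_tools())[0]
#       result = await firecrawl_tool.ainvoke(
#           {"query": "docs", "base_url": "example.com", "max_links": 20}
#       )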
127 | # Example usage in your agent's prompt:
128 | "Analyze the website example.com and provide a summary"
129 | ```
130 | 
131 | ## Resources
132 | 
133 | - [MCP Official Documentation](https://modelcontextprotocol.io/docs)
134 | - [MCP GitHub Repository](https://github.com/modelcontextprotocol)
135 | - [LangGraph Documentation](https://python.langchain.com/docs/langgraph)
136 | 
137 | 
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/__init__.py
--------------------------------------------------------------------------------
/core/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/agents/__init__.py
--------------------------------------------------------------------------------
/core/agents/base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/agents/base/__init__.py
--------------------------------------------------------------------------------
/core/agents/base/react.py:
--------------------------------------------------------------------------------
1 | from langchain_core.messages import HumanMessage
2 | from langchain_openai import ChatOpenAI
3 | from langgraph.prebuilt import create_react_agent
4 | 
5 | from mcp import StdioServerParameters
6 | from clients.langgraph.client import LanggraphMCPClient
7 | 
8 | from langchain_arcade import ArcadeToolManager
9 | 
10 | import asyncio
11 | import logging
12 | 
13 | import os
14 | from dotenv import load_dotenv
15 | 
16 | load_dotenv()
17 | 
18 | logging.basicConfig(level=logging.INFO)  # configure a handler so the log below is actually emitted
19 | logger = logging.getLogger(__name__)
20 | 
21 | logger.info("Starting the example client script.")
22 | 
23 | server_params = StdioServerParameters(
24 |     command="uv",
25 |     args=[
26 |         "--directory",
27 |         "/Users/lgesuellip/Desktop/mcp_firecrawl/researcher_service/servers",
28 |         "run",
29 |         "website-firecrawl-service"
30 |     ]
31 | )
32 | 
33 | async def main():
34 | 
35 |     tools = []
36 | 
37 |     # Get tools from MCP
38 |     async with LanggraphMCPClient(server_params=server_params) as mcp_client:
39 |         tools.extend(await mcp_client.get_tools())
40 | 
41 |     # Get tools from Arcade
42 |     tool_arcade_manager = ArcadeToolManager(api_key=os.getenv("ARCADE_API_KEY"))
43 |     tools.extend(tool_arcade_manager.get_tools(toolkits=["slack"]))
44 | 
45 |     model = ChatOpenAI(model="gpt-4")
46 | 
47 |     graph = create_react_agent(model, tools=tools)
48 | 
49 |     inputs = {
50 |         "messages": [HumanMessage(content="A summary of pampa.ai website, and send it to 'lautaro'")],
51 |     }
52 | 
53 |     config = {
54 |         "configurable": {
55 |             "thread_id": "1",
56 |             "user_id": os.getenv("ARCADE_USER_ID"),
57 |         }
58 |     }
59 |     result = await graph.ainvoke(inputs, config=config)
60 |     print(result)
61 | 
62 | if __name__ == "__main__":
63 |     asyncio.run(main())
64 | 
--------------------------------------------------------------------------------
/core/clients/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/clients/__init__.py
--------------------------------------------------------------------------------
/core/clients/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/clients/common/__init__.py
--------------------------------------------------------------------------------
/core/clients/common/mcp.py:
--------------------------------------------------------------------------------
1 | import os
2 | from abc import ABC, abstractmethod
3 | from typing import Any, Optional
4 | from mcp import ClientSession
5 | from mcp.client.stdio import stdio_client
6 | 
7 | import logging
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | class BaseMCPClient(ABC):
12 |     """Abstract base class for MCP clients providing session management and tool handling."""
13 | 
14 |     def __init__(
15 |         self,
16 |         server_params: Optional[Any] = None,
17 |         **kwargs: Any,
18 |     ) -> None:
19 |         """
20 |         Initialize the BaseMCPClient with optional server parameters.
21 | 
22 |         Args:
23 |             server_params: Optional parameters for server configuration.
24 |             **kwargs: Additional keyword arguments.
25 |         """
26 |         self.session: Optional[ClientSession] = None
27 |         self._tools: list[Any] = []
28 |         self.server_params = server_params
29 |         self.read: Optional[Any] = None
30 |         self.write: Optional[Any] = None
31 | 
32 |     @property
33 |     def tools(self) -> list[Any]:
34 |         """List of tools available to the client."""
35 |         return self._tools
36 | 
37 |     async def __aenter__(self):
38 |         """
39 |         Asynchronous context manager entry.
40 | 
41 |         Initializes the stdio client and session, preparing the client for use.
42 |         """
43 |         self._stdio_ctx = stdio_client(self.server_params)
44 |         self.read, self.write = await self._stdio_ctx.__aenter__()
45 | 
46 |         self.session = ClientSession(self.read, self.write)
47 |         await self.session.__aenter__()
48 |         await self.session.initialize()
49 |         return self
50 | 
51 |     async def __aexit__(self, exc_type, exc_val, exc_tb):
52 |         """
53 |         Asynchronous context manager exit.
54 | 
55 |         Cleans up the session and stdio context.
56 |         """
57 |         if self.session:
58 |             await self.session.__aexit__(exc_type, exc_val, exc_tb)
59 |         if hasattr(self, '_stdio_ctx'):
60 |             await self._stdio_ctx.__aexit__(exc_type, exc_val, exc_tb)
61 |         return False  # do not suppress exceptions raised inside the context
62 | 
63 |     async def init_tools(
64 |         self,
65 |     ) -> None:
66 |         """
67 |         Initialize the list of tools by querying the session.
68 | 
69 |         Populates the internal tools list with available tools from the session.
70 |         """
71 |         response = await self.session.list_tools()
72 |         self._tools = response.tools
73 | 
74 |     @abstractmethod
75 |     def wrap_tool(self, tool: Any, **kwargs: Any) -> Any:
76 |         """
77 |         Abstract method to wrap a tool definition.
78 | 
79 |         Args:
80 |             tool: The tool definition object, as returned by the MCP
81 |                 server's `list_tools()` response.
82 |             **kwargs: Additional keyword arguments for tool configuration.
83 | 
84 |         Returns:
85 |             A wrapped tool object.
86 |         """
87 |         pass
88 | 
89 |     async def get_tools(
90 |         self, **kwargs: Any
91 |     ) -> list[Any]:
92 |         """
93 |         Retrieve and wrap available tools.
94 | 
95 |         Args:
96 |             **kwargs: Additional keyword arguments for tool wrapping.
97 | 
98 |         Returns:
99 |             A list of wrapped tool objects.
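        Example, using a concrete subclass such as ``LanggraphMCPClient``
        (a sketch; ``params`` is an ``StdioServerParameters`` instance):

            async with LanggraphMCPClient(server_params=params) as client:
                tools = await client.get_tools()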
100 |         """
101 |         if not self._tools:
102 |             await self.init_tools()
103 |         return [self.wrap_tool(tool, **kwargs) for tool in self._tools]
104 | 
--------------------------------------------------------------------------------
/core/clients/common/utils.py:
--------------------------------------------------------------------------------
1 | from pydantic import create_model, Field
2 | 
3 | BASIC_TYPE_MAP = {
4 |     "string": str,
5 |     "number": float,
6 |     "integer": int,
7 |     "boolean": bool,
8 |     "array": list,
9 |     "object": dict,
10 | }
11 | 
12 | def create_pydantic_model_from_json_schema(klass, schema):
13 |     """
14 |     Creates a Pydantic model from a JSON schema.
15 |     """
16 |     fields = {}
17 |     for prop_name, prop_info in schema['properties'].items():
18 |         field_type = prop_info.get('type', 'default')  # entries without a type are schema metadata and skipped below
19 |         py_type = None
20 |         if field_type == 'default' or prop_name in ['properties', 'required', 'default', 'additionalProperties']:
21 |             continue
22 |         if field_type == 'array':
23 |             item_type = prop_info['items']['type']
24 |             if item_type == 'object':
25 |                 py_type = list[create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", prop_info['items'])]
26 |             else:
27 |                 py_type = list[BASIC_TYPE_MAP.get(item_type, None)]
28 |         elif field_type == 'object':
29 |             if prop_info.get('properties', None):
30 |                 py_type = create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", prop_info)
31 |             elif prop_info.get('$ref'):
32 |                 # NOTE: We probably need to make this more robust
33 |                 ref_info = schema['properties'].get(prop_info['$ref'].split("/")[-1])
34 |                 py_type = create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", ref_info)
35 |             elif prop_info.get('additionalProperties', {}).get('$ref', None):
36 |                 ref_info = schema['properties'].get(prop_info['additionalProperties']['$ref'].split("/")[-1])
37 |                 py_type = dict[str, create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", ref_info)]
38 |             else:
39 |                 raise Exception(f"Unsupported object schema for property '{prop_name}'")
40 |         elif BASIC_TYPE_MAP.get(field_type):
41 |             py_type = BASIC_TYPE_MAP[field_type]
42 | 
43 |         if py_type is None:
44 |             raise Exception(f"Unsupported field type '{field_type}' for property '{prop_name}'")
45 | 
46 |         default = ... if prop_name in schema.get('required', []) else prop_info.get('default', None)  # required fields stay required; optional ones take the schema default
47 |         description = prop_info.get('description', '')
48 |         fields[prop_name] = (py_type, Field(default, description=description))
49 | 
50 |     return create_model(klass, **fields)
--------------------------------------------------------------------------------
/core/clients/langgraph/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/clients/langgraph/__init__.py
--------------------------------------------------------------------------------
/core/clients/langgraph/client.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | import logging
3 | from langchain_core.tools import StructuredTool
4 | from ..common.mcp import BaseMCPClient
5 | from ..common.utils import create_pydantic_model_from_json_schema
6 | 
7 | logger = logging.getLogger(__name__)
8 | 
9 | class LanggraphMCPClient(BaseMCPClient):
10 | 
11 |     def tool_call(self, tool_name: str) -> Any:
12 |         """Create an asynchronous function to call a tool by its name.
13 | 
14 |         Args:
15 |             tool_name: The name of the tool to be called.
16 | 
17 |         Returns:
18 |             An asynchronous function that executes the tool with the provided arguments.
19 |         """
20 | 
21 |         async def tool_function(*args: Any, **kwargs: Any) -> Any:
22 |             logger.info(f"Executing tool {tool_name} with args={args} and kwargs={kwargs}")
23 |             result = await self.session.call_tool(tool_name, arguments=kwargs)
24 |             return result
25 | 
26 |         return tool_function
27 | 
28 |     def wrap_tool(
29 |         self, tool: Any, **kwargs: Any
30 |     ) -> StructuredTool:
31 |         """Wrap a tool as a StructuredTool instance.
32 | 
33 |         Args:
34 |             tool: The tool object to wrap.
35 |             **kwargs: Additional keyword arguments for tool configuration.
36 | 
37 |         Returns:
38 |             A StructuredTool instance configured with the provided tool and arguments.
39 |         """
40 | 
41 |         return StructuredTool.from_function(
42 |             coroutine=self.tool_call(tool.name),
43 |             name=tool.name,
44 |             description=tool.description or "No description provided.",
45 |             args_schema=create_pydantic_model_from_json_schema(tool.name, tool.inputSchema),
46 |         )
47 | 
--------------------------------------------------------------------------------
/core/mcp_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/mcp_architecture.png
--------------------------------------------------------------------------------
/core/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 | 
5 | [project]
6 | name = "mcp-client"
7 | version = "0.1.0"
8 | description = "MCP Client"
9 | readme = "README.md"
10 | requires-python = ">=3.11"
11 | dependencies = [
12 |     "mcp>=1.1.0",
13 |     "openai>=1.43.0",
14 |     "pydantic>=2.9.2",
15 |     "langchain_community>=0.3.1",
16 |     "langsmith>=0.1.129",
17 |     "jinja2>=3.1.2",
18 |     "langchain_openai>=0.2.14,<0.3",
19 |     "langgraph>=0.2.60,<0.3",
20 |     "langchain_arcade>=0.1.2,<0.2.0",
21 |     "arcadepy >= 0.1.0, <0.2.0",
22 |     "httpx >= 0.27.1, <0.28.0",
23 | ]
24 | 
25 | [[project.authors]]
26 | name = "lgesuellip"
27 | email = "lautaro@pampa.ai"
28 | 
29 | [tool.hatch.build.targets.wheel]
30 | packages = ["agents", "clients"]
31 | 
32 | [project.scripts]
33 | react-agent = "agents.base.react:main"
--------------------------------------------------------------------------------
/servers/.env.example:
--------------------------------------------------------------------------------
1 | FIRECRAWL_API_KEY =
2 | OPENAI_API_KEY =
3 | 
4 | # Optional
5 | LANGSMITH_API_KEY =
6 | LANGCHAIN_TRACING_V2 =
7 | LANGCHAIN_PROJECT =
8 | 
--------------------------------------------------------------------------------
/servers/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 lgesuellip
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/servers/README.md:
--------------------------------------------------------------------------------
1 | # website_firecrawl_service - MCP Server
2 | 
3 | 🔍 Internet research just got smarter! Built an MCP server that turns any website into structured, relevant content based on your queries!
4 | 
5 | Using @Firecrawl's powerful features (mapping, selection, scraping), combined with GPT-4o-mini for smart URL filtering, it's like having an AI research assistant that knows exactly what you're looking for! Works seamlessly with Claude, or any MCP-compatible client!
6 | 
7 | ![Agentic Web Scraping Architecture](web_firecrawl_service_v0.1.0.png)
8 | 
9 | *An agentic web scraping system powered by Firecrawl: Map → Select → Scrape → Extract*
10 | 
11 | ---
12 | 
13 | ## Features
14 | 
15 | - **Efficient Web Crawling**: Crawls websites using the Firecrawl API with customizable link limits and intelligent URL selection
16 | - **Intelligent URL Selection**: Uses GPT-4o-mini to select the most relevant URLs based on user queries
17 | - **Smart Content Processing**: Extracts and cleans HTML content, providing readable text output
18 | 
19 | ## Setup
20 | 
21 | 1. Create a `.env` file with the following required environment variables:
22 |    ```
23 |    FIRECRAWL_API_KEY=your_firecrawl_api_key
24 |    OPENAI_API_KEY=your_openai_api_key
25 |    ```
26 | 
27 | ## Usage
28 | 
29 | The server exposes a single tool:
30 | 
31 | ### website_firecrawl
32 | 
33 | **Description**: Crawls a website and returns relevant content based on a query.
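For example, an MCP client might call the tool with arguments like the following (a sketch; the fields mirror the `WebsiteCrawlArgs` model in `server.py`, and the values are placeholders):

```json
{
  "query": "pricing and enterprise plans",
  "base_url": "example.com",
  "max_links": 50
}
```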
34 | 
35 | **Parameters**:
36 | - `query` (string): The search query to filter relevant content
37 | - `base_url` (string): The target website URL to crawl
38 | - `max_links` (integer, optional): Maximum number of links to process (default: 100)
39 | 
40 | ## Technical Details
41 | 
42 | - Built using the MCP (Model Context Protocol) server framework
43 | - Implements retry logic with exponential backoff for API calls
44 | - Integrates with LangSmith for tracing and monitoring
45 | - Implements singleton patterns for API clients to manage resources efficiently
46 | - Uses Pydantic for robust data validation and serialization:
47 |   - `WebsiteCrawlArgs`: Validates input parameters for the crawling service
48 |   - `CrawlerModel`: Handles URL selection and justification
49 |   - `Page`: Structures metadata and content from crawled pages
50 | - Structured OpenAI Integration:
51 |   - Uses OpenAI's beta chat completions with parsing
52 |   - Automatically validates and converts JSON responses to Pydantic models
53 |   - Ensures type safety and data validation for AI-generated content
54 | - Jinja2 Template System:
55 |   - Modular prompt management with reusable Jinja2 templates
56 |   - Dynamic prompt generation based on user queries and context
57 |   - Separate system and user prompt templates for clear separation of concerns
58 |   - Easy maintenance and updates of prompt structures
59 | 
--------------------------------------------------------------------------------
/servers/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "website-firecrawl-service"
3 | version = "0.1.0"
4 | description = "This tool leverages Firecrawl to generate concise summaries of web pages directly from their URLs. Firecrawl processes the content of the provided website, extracting key insights and metadata to deliver a brief, focused summary."
5 | readme = "README.md"
6 | requires-python = ">=3.11"
7 | dependencies = [
8 |     "mcp>=1.1.0",
9 |     "openai>=1.43.0",
10 |     "pydantic>=2.9.2",
11 |     "langchain_community>=0.3.1",
12 |     "langsmith>=0.1.129",
13 |     "tenacity>=8.5.0",
14 |     "firecrawl>=1.0.0",
15 |     "jinja2>=3.1.2",
16 |     "beautifulsoup4>=4.12.3"
17 | ]
18 | 
19 | [[project.authors]]
20 | name = "lgesuellip"
21 | email = "lautaro@pampa.ai"
22 | 
23 | [build-system]
24 | requires = [ "hatchling",]
25 | build-backend = "hatchling.build"
26 | 
27 | [project.scripts]
28 | website-firecrawl-service = "website_firecrawl_service:main"
--------------------------------------------------------------------------------
/servers/web_firecrawl_service_v0.1.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/servers/web_firecrawl_service_v0.1.0.png
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/__init__.py:
--------------------------------------------------------------------------------
1 | from .
import server
2 | import asyncio
3 | 
4 | def main():
5 |     """Main entry point for the package."""
6 |     asyncio.run(server.main())
7 | 
8 | __all__ = ['main', 'server']
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/crawler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from typing import List, Dict, Optional
4 | from urllib.parse import urlparse
5 | from pydantic import BaseModel, Field
6 | from dotenv import load_dotenv
7 | 
8 | from firecrawl import FirecrawlApp
9 | 
10 | from langsmith import traceable
11 | 
12 | from website_firecrawl_service.openai import Inference
13 | from website_firecrawl_service.prompt import SYSTEM_CRAWLER_PROMPT, USER_CRAWLER_PROMPT
14 | 
15 | logging.basicConfig(level=logging.INFO)
16 | logger = logging.getLogger(__name__)
17 | 
18 | load_dotenv()
19 | 
20 | inference = Inference()
21 | 
22 | model_args = {
23 |     "model": "gpt-4o-mini",
24 |     "temperature": 0,
25 | }
26 | 
27 | class CrawlerModel(BaseModel):
28 |     target_urls: List[str] = Field(description="The domain URLs to be considered")
29 |     justification: str = Field(description="The reason for selecting these target URLs")
30 | 
31 | class Page(BaseModel):
32 |     metadata: Dict
33 |     body: str
34 | 
35 | class FirecrawlClient:
36 |     _instance = None
37 | 
38 |     def __new__(cls):
39 |         if cls._instance is None:
40 |             cls._instance = super().__new__(cls)
41 |             cls._instance._initialize()
42 |         return cls._instance
43 | 
44 |     def _initialize(self):
45 | 
46 |         self.client = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
47 | 
48 | class WebsiteCrawler:
49 |     def __init__(self):
50 |         self._pages: List[Page] = []
51 |         self.firecrawl = FirecrawlClient()
52 | 
53 |     def _normalize_url(self, url: str) -> str:
54 |         """Normalize the URL and handle GitHub repositories specially"""
55 |         if not url.startswith(('http://', 'https://')):
56 |             if url.startswith(('http:/', 'https:/')):
57 |                 url = url.replace(':/', '://', 1)  # repair a single missing slash, e.g. "http:/x" -> "http://x"
58 |             else:
59 |                 url = f'http://{url}'
60 | 
61 |         url_obj = urlparse(url)
62 |         stem_url = url_obj.netloc
63 | 
64 |         # Special handling for GitHub URLs
65 |         if 'github.com' in stem_url:
66 |             path_segments = [seg for seg in url_obj.path.split('/') if seg]
67 |             if len(path_segments) >= 2:
68 |                 owner, repo = path_segments[0:2]
69 |                 stem_url = f"{stem_url}/{owner}/{repo}"
70 | 
71 |         return stem_url
72 | 
73 | 
74 |     @traceable(name="select_crawler_urls")
75 |     async def _select_links(self, query: str, links: List[Dict]):
76 |         messages = [
77 |             {"role": "system", "content": SYSTEM_CRAWLER_PROMPT.render()},
78 |             {"role": "user", "content": USER_CRAWLER_PROMPT.render(query=query, links=links)}
79 |         ]
80 |         result = await inference.predict_with_parse_async(model_args, CrawlerModel, messages)
81 |         return result.target_urls
82 | 
83 |     async def crawl(self, query: str, base_url: str, max_links: int = 100, llm_predict: bool = True) -> Optional[List[Page]]:
84 |         """Fetch multiple pages using Firecrawl API and create Page objects"""
85 |         try:
86 |             logger.info(f"Searching for URLs in {base_url}")
87 | 
88 |             self.base_url = self._normalize_url(base_url)
89 | 
90 |             # Map the URL to get a list of links
91 |             map_result = self.firecrawl.client.map_url(
92 |                 self.base_url,
93 |                 params={'limit': max_links}
94 |             )
95 | 
96 |             if not map_result.get('success'):
97 |                 logger.error(f"Failed to map URL: {base_url}")
98 |                 return None
99 | 
100 |             # Select links based on relevance, given the user query
101 |             links = await
self._select_links(query, map_result['links']) if llm_predict else map_result['links'] 102 | 103 | # Scrape the selected links 104 | for link in links[:]: 105 | try: 106 | logger.info(f"Starting scrape for link: {link}") 107 | 108 | page = self.firecrawl.client.scrape_url( 109 | link, 110 | params={ 111 | 'formats': ['rawHtml'], 112 | }, 113 | ) 114 | 115 | self._pages.append(Page( 116 | body=page['rawHtml'], 117 | metadata={ 118 | 'url': page['metadata']['url'], 119 | 'title': page['metadata'].get('title', ''), 120 | 'description': page['metadata'].get('description', ''), 121 | 'language': page['metadata'].get('language', ''), 122 | }, 123 | )) 124 | except Exception as e: 125 | logger.error(f"Error scraping {link}: {str(e)}") 126 | continue 127 | 128 | return self._pages 129 | 130 | except Exception as e: 131 | logger.error(f"Error during crawl: {str(e)}") 132 | return None 133 | 134 | -------------------------------------------------------------------------------- /servers/website_firecrawl_service/openai.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from openai import AsyncOpenAI 3 | from tenacity import ( 4 | retry, 5 | stop_after_attempt, 6 | wait_random_exponential, 7 | ) 8 | import os 9 | from langsmith.wrappers import wrap_openai 10 | import logging 11 | from langsmith import traceable 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class OpenAIClientSingleton: 18 | _instance = None 19 | 20 | @classmethod 21 | def get_instance(cls): 22 | if cls._instance is None: 23 | api_key = os.getenv("OPENAI_API_KEY") 24 | if not api_key: 25 | raise ValueError("OPENAI_API_KEY not found in environment variables") 26 | 27 | cls._instance = wrap_openai(AsyncOpenAI(api_key=api_key)) 28 | return cls._instance 29 | 30 | class Inference: 31 | def __init__(self): 32 | self.client = OpenAIClientSingleton.get_instance() 33 | 34 | @traceable 35 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3)) 36 | async def predict_with_parse_async(self, model_args: Dict, response_format, messages: List[Dict]): 37 | 38 | response = await self.client.beta.chat.completions.parse( 39 | **model_args, 40 | messages=messages, 41 | response_format=response_format, 42 | ) 43 | 44 | return response.choices[0].message.parsed 45 | -------------------------------------------------------------------------------- /servers/website_firecrawl_service/prompt.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Template 2 | 3 | SYSTEM_CRAWLER_PROMPT = Template(""" 4 | 5 | As an expert in web crawling, data extraction, and content relevance identification, your goal is to select URLs based on their relevance to the user's query. 6 | 7 | 8 | 9 | - PRIMARY FOCUS: Identify URLs that best address the user's information needs as expressed in the query. 10 | - EXCLUDE: Omit URLs that are not relevant or do not contribute meaningful information related to the query. 11 | - DEDUPLICATE: Ensure all URLs in the final output are unique. 12 | 13 | """) 14 | 15 | USER_CRAWLER_PROMPT = Template(""" 16 | The user is seeking information related to: "{{query}}" 17 | 18 | Below is a list of URLs. Based on the user's query, select the URLs most likely to contain relevant and helpful information. 
19 | 
20 | <links>
21 | {{links}}
22 | </links>
23 | """)
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/server.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List, Optional
3 | from pydantic import BaseModel
4 | from dotenv import load_dotenv
5 | 
6 | import mcp.types as types
7 | from mcp.server import NotificationOptions, Server
8 | from mcp.server.models import InitializationOptions
9 | import mcp.server.stdio
10 | 
11 | 
12 | from .crawler import WebsiteCrawler
13 | from website_firecrawl_service.utils import clean_html_text
14 | 
15 | 
16 | load_dotenv()
17 | 
18 | server = Server("website_firecrawl_service")
19 | 
20 | class WebsiteCrawlArgs(BaseModel):
21 |     query: str
22 |     base_url: str
23 |     max_links: int = 100
24 | 
25 |     model_config = {
26 |         "json_schema_extra": {
27 |             "description": "Arguments for crawling a website"
28 |         }
29 |     }
30 | 
31 | @server.list_tools()
32 | async def handle_list_tools() -> list[types.Tool]:
33 |     """
34 |     List available tools.
35 |     Each tool specifies its arguments using JSON Schema validation.
36 |     """
37 |     return [
38 |         types.Tool(
39 |             name="website_firecrawl",
40 |             description="Crawl a website",
41 |             inputSchema=WebsiteCrawlArgs.model_json_schema(),
42 |         )
43 |     ]
44 | 
45 | @server.call_tool()
46 | async def handle_call_tool(
47 |     name: str, arguments: Optional[dict]
48 | ) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:
49 |     """
50 |     Handle tool execution requests.
51 | 
52 |     Args:
53 |         name: The name of the tool to execute
54 |         arguments: Dictionary of tool arguments
55 | 
56 |     Returns:
57 |         List of content items produced by the tool
58 | 
59 |     Raises:
60 |         ValueError: If tool name is invalid or arguments are missing
61 |     """
62 |     try:
63 |         if name != "website_firecrawl":
64 |             raise ValueError(f"Unknown tool: {name}")
65 | 
66 |         if not arguments:
67 |             raise ValueError("Missing arguments")
68 | 
69 |         args = WebsiteCrawlArgs.model_validate(arguments)
70 | 
71 |         logging.info(f"Starting crawl of query={args.query} and base_url={args.base_url} with max_links={args.max_links}")
72 | 
73 |         crawler = WebsiteCrawler()
74 |         pages = await crawler.crawl(args.query, args.base_url, args.max_links, llm_predict=True)
75 | 
76 |         return [
77 |             types.TextContent(
78 |                 type="text",
79 |                 text="\n".join(clean_html_text(str(page)) for page in pages or []),  # crawl() may return None on failure
80 |             )
81 |         ]
82 |     except Exception as e:
83 |         logging.error(f"Error during crawl: {str(e)}")
84 |         raise
85 | 
86 | async def main():
87 |     logging.basicConfig(level=logging.INFO)
88 | 
89 |     async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
90 |         try:
91 |             await server.run(
92 |                 read_stream,
93 |                 write_stream,
94 |                 InitializationOptions(
95 |                     server_name=server.name,
96 |                     server_version="0.1.0",
97 |                     capabilities=server.get_capabilities(notification_options=NotificationOptions(), experimental_capabilities={})
98 |                 )
99 |             )
100 |         except Exception as e:
101 |             logging.error(f"Server error: {str(e)}")
102 |             raise
103 | 
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/utils.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import re
3 | 
4 | def clean_html_text(text: str) -> str:
5 |     """Remove HTML tags and clean up text."""
6 |     soup = BeautifulSoup(text, 'html.parser')
7 |     clean_text = soup.get_text()
8 |     clean_text = re.sub(r'\s+', ' ', clean_text).strip()
9 |     return clean_text
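# A quick illustration of the cleaning behavior (hypothetical input):
#
#   clean_html_text("<div><p>Hello\n   world</p></div>")
#   # -> "Hello world"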
--------------------------------------------------------------------------------