├── .gitignore ├── LICENSE ├── README.md ├── app_architecture.png ├── core ├── .env.example ├── LICENSE ├── README.md ├── __init__.py ├── agents │ ├── __init__.py │ └── base │ │ ├── __init__.py │ │ └── react.py ├── clients │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── mcp.py │ │ └── utils.py │ └── langgraph │ │ ├── __init__.py │ │ └── client.py ├── mcp_architecture.png ├── pyproject.toml └── uv.lock └── servers ├── .env.example ├── LICENSE ├── README.md ├── pyproject.toml ├── uv.lock ├── web_firecrawl_service_v0.1.0.png └── website_firecrawl_service ├── __init__.py ├── crawler.py ├── openai.py ├── prompt.py ├── server.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # PyPI configuration file 171 | .pypirc 172 | 173 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 lgesuellip 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | An application built on the Model Context Protocol (MCP) that transforms any website into highly relevant content based on your queries. The app seamlessly integrates with platforms like X, Slack, and others through Arcade. 2 | 3 | ### Perfect For 4 | - LLM-Ready File Creation: Generate .txt files optimized for use with large language models. 5 | - Documentation Indexing: Organize and structure documentation effortlessly. 
6 | - Research Automation: Save time by automating repetitive research tasks.
7 | 
8 | ### Tech Stack
9 | - LangGraph as the MCP Client
10 | - Firecrawl for web research (site mapping, intelligent selection, and scraping)
11 | - Arcade for seamless platform integration (X, Slack, etc.)
12 | - Tracing powered by LangChain's LangSmith
13 | - Utilizes OpenAI's structured outputs, async processing, exponential backoff, and Pydantic for reliability
14 | 
15 | ### Architecture Diagram
16 | 
17 | ![Architecture Diagram](app_architecture.png)
18 | 
--------------------------------------------------------------------------------
/app_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/app_architecture.png
--------------------------------------------------------------------------------
/core/.env.example:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY =
2 | 
3 | # If you want to use Arcade, you can set these variables
4 | ARCADE_API_KEY=
5 | ARCADE_USER_ID=
6 | 
7 | # Optional
8 | LANGSMITH_API_KEY =
9 | LANGCHAIN_TRACING_V2 =
10 | LANGCHAIN_PROJECT =
11 | 
--------------------------------------------------------------------------------
/core/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 lgesuellip
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/core/README.md:
--------------------------------------------------------------------------------
1 | # LangGraph MCP Integration
2 | 
3 | This repository demonstrates how to integrate LangGraph with MCP servers, allowing you to build powerful agent-based applications that can interact with both MCP servers and external services.
4 | 
5 | ## Features
6 | 
7 | - Seamless integration of LangGraph with MCP servers
8 | - Support for ReAct-style agents using MCP tools
9 | - Integration with Arcade and custom services (like Firecrawl)
10 | - Asynchronous tool execution
11 | - Type-safe tool wrapping with Pydantic models
12 | 
13 | Make sure to set up your environment variables in a `.env` file:
14 | 
15 | ```env
16 | OPENAI_API_KEY=your_openai_api_key
17 | ARCADE_API_KEY=your_arcade_api_key
18 | ARCADE_USER_ID=your_arcade_user_id
19 | ```
20 | 
21 | ### Development Commands
22 | 
23 | ```bash
24 | uv run agents/base/react.py
25 | ```
26 | 
27 | ## Architecture
28 | 
29 | ![MCP Architecture](mcp_architecture.png)
30 | 
31 | ### Components
32 | 
33 | 1. **LangGraph Agent**
34 |    - React Agent: Implements the ReAct pattern for reasoning and action
35 |    - LangGraph MCP Client: Bridges LangGraph with MCP servers
36 | 
37 | 2. **MCP Servers**
38 |    - Firecrawl Server: Provides web crawling capabilities
39 | 
40 | 3. **Arcade Server**: Offers integration with various services; its tool interface feels very familiar coming from MCP.
41 | 
42 | ## Usage
43 | 
44 | ### Basic LangGraph MCP Client
45 | 
46 | The `LanggraphMCPClient` class provides a bridge between LangGraph and MCP servers:
47 | 
48 | ```python
49 | from clients.langgraph.client import LanggraphMCPClient
50 | from mcp import StdioServerParameters
51 | 
52 | # Configure your MCP server
53 | server_params = StdioServerParameters(
54 |     command="uv",
55 |     args=[
56 |         "--directory",
57 |         "/path/to/your/service",
58 |         "run",
59 |         "website-firecrawl-service"
60 |     ]
61 | )
62 | 
63 | # Use the client to get tools from the MCP server
64 | async with LanggraphMCPClient(server_params=server_params) as mcp_client:
65 |     tools = await mcp_client.get_tools()
66 | ```
67 | 
68 | ### Creating a React Agent with MCP Tools
69 | 
70 | Here's an example of creating a React agent that combines MCP tools with Arcade services:
71 | 
72 | ```python
73 | from langchain_core.messages import HumanMessage
74 | from langchain_openai import ChatOpenAI
75 | from langgraph.prebuilt import create_react_agent
76 | from langchain_arcade import ArcadeToolManager
77 | import os
78 | async def create_agent():
79 |     tools = []
80 | 
81 |     # Get tools from MCP server
82 |     async with LanggraphMCPClient(server_params=server_params) as mcp_client:
83 |         tools.extend(await mcp_client.get_tools())
84 | 
85 |     # Add Arcade tools
86 |     tool_arcade_manager = ArcadeToolManager(api_key=os.getenv("ARCADE_API_KEY"))
87 |     tools.extend(tool_arcade_manager.get_tools(toolkits=["slack"]))
88 | 
89 |     # Create the agent with combined tools
90 |     model = ChatOpenAI(model="gpt-4")
91 |     graph = create_react_agent(model, tools=tools)
92 | 
93 |     return graph
94 | 
95 | graph = await create_agent()  # build the agent, then invoke it
96 | inputs = {
97 |     "messages": [HumanMessage(content="Analyze pampa.ai and send a summary to Slack")],
98 | }
99 | 
100 | config = {
101 |     "configurable": {
102 |         "thread_id": "1",
103 |         "user_id": os.getenv("ARCADE_USER_ID"),
104 |     }
105 | }
106 | 
107 | result = await graph.ainvoke(inputs, config=config)
108 | ```
109 | 
110 | ### Using the Firecrawl Service
111 | 
112 | The Firecrawl service is implemented as an MCP server, allowing your agents to crawl and analyze websites:
113 | 
114 | ```python
115 | # Example configuration in your MCP server setup:
116 | server_params = StdioServerParameters(
117 |     command="uv",
118 |     args=[
119 |         "--directory",
120 |         "/path/to/firecrawl",
121 |         "run",
122 |         "website-firecrawl-service"
123 |     ]
124 | )
125 | 
126 | # The Firecrawl service tools become available through the MCP client
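# Once wrapped, the tools can also be invoked directly, without an agent.
# A sketch (argument names follow the server's WebsiteCrawlArgs schema;
# the concrete values here are placeholders):
#
#   async with LanggraphMCPClient(server_params=server_params) as mcp_client:
#       firecrawl_tool = (await mcp_client.get_tools())[0]
#       result = await firecrawl_tool.ainvoke(
#           {"query": "docs", "base_url": "example.com", "max_links": 20}
#       )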
127 | # Example usage in your agent's prompt:
128 | "Analyze the website example.com and provide a summary"
129 | ```
130 | 
131 | ## Resources
132 | 
133 | - [MCP Official Documentation](https://modelcontextprotocol.io/docs)
134 | - [MCP GitHub Repository](https://github.com/modelcontextprotocol)
135 | - [LangGraph Documentation](https://python.langchain.com/docs/langgraph)
136 | 
137 | 
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/__init__.py
--------------------------------------------------------------------------------
/core/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/agents/__init__.py
--------------------------------------------------------------------------------
/core/agents/base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/agents/base/__init__.py
--------------------------------------------------------------------------------
/core/agents/base/react.py:
--------------------------------------------------------------------------------
1 | from langchain_core.messages import HumanMessage
2 | from langchain_openai import ChatOpenAI
3 | from langgraph.prebuilt import create_react_agent
4 | 
5 | from mcp import StdioServerParameters
6 | from clients.langgraph.client import LanggraphMCPClient
7 | 
8 | from langchain_arcade import ArcadeToolManager
9 | 
10 | import asyncio
11 | import logging
12 | 
13 | import os
14 | from dotenv import load_dotenv
15 | 
16 | load_dotenv()
17 | 
18 | logging.basicConfig(level=logging.INFO)  # configure a handler so the log below is actually emitted
19 | logger = logging.getLogger(__name__)
20 | 
21 | logger.info("Starting the example client script.")
22 | 
23 | server_params = StdioServerParameters(
24 |     command="uv",
25 |     args=[
26 |         "--directory",
27 |         "/Users/lgesuellip/Desktop/mcp_firecrawl/researcher_service/servers",
28 |         "run",
29 |         "website-firecrawl-service"
30 |     ]
31 | )
32 | 
33 | async def main():
34 | 
35 |     tools = []
36 | 
37 |     # Get tools from MCP
38 |     async with LanggraphMCPClient(server_params=server_params) as mcp_client:
39 |         tools.extend(await mcp_client.get_tools())
40 | 
41 |     # Get tools from Arcade
42 |     tool_arcade_manager = ArcadeToolManager(api_key=os.getenv("ARCADE_API_KEY"))
43 |     tools.extend(tool_arcade_manager.get_tools(toolkits=["slack"]))
44 | 
45 |     model = ChatOpenAI(model="gpt-4")
46 | 
47 |     graph = create_react_agent(model, tools=tools)
48 | 
49 |     inputs = {
50 |         "messages": [HumanMessage(content="A summary of pampa.ai website, and send it to 'lautaro'")],
51 |     }
52 | 
53 |     config = {
54 |         "configurable": {
55 |             "thread_id": "1",
56 |             "user_id": os.getenv("ARCADE_USER_ID"),
57 |         }
58 |     }
59 |     result = await graph.ainvoke(inputs, config=config)
60 |     print(result)
61 | 
62 | if __name__ == "__main__":
63 |     asyncio.run(main())
64 | 
--------------------------------------------------------------------------------
/core/clients/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/clients/__init__.py
--------------------------------------------------------------------------------
/core/clients/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/clients/common/__init__.py
--------------------------------------------------------------------------------
/core/clients/common/mcp.py:
--------------------------------------------------------------------------------
1 | import os
2 | from abc import ABC, abstractmethod
3 | from typing import Any, Optional
4 | from mcp import ClientSession
5 | from mcp.client.stdio import stdio_client
6 | 
7 | import logging
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | class BaseMCPClient(ABC):
12 |     """Abstract base class for MCP clients providing session management and tool handling."""
13 | 
14 |     def __init__(
15 |         self,
16 |         server_params: Optional[Any] = None,
17 |         **kwargs: Any,
18 |     ) -> None:
19 |         """
20 |         Initialize the BaseMCPClient with optional server parameters.
21 | 
22 |         Args:
23 |             server_params: Optional parameters for server configuration.
24 |             **kwargs: Additional keyword arguments.
25 |         """
26 |         self.session: Optional[ClientSession] = None
27 |         self._tools: list[Any] = []
28 |         self.server_params = server_params
29 |         self.read: Optional[Any] = None
30 |         self.write: Optional[Any] = None
31 | 
32 |     @property
33 |     def tools(self) -> list[Any]:
34 |         """List of tools available to the client."""
35 |         return self._tools
36 | 
37 |     async def __aenter__(self):
38 |         """
39 |         Asynchronous context manager entry.
40 | 
41 |         Initializes the stdio client and session, preparing the client for use.
42 |         """
43 |         self._stdio_ctx = stdio_client(self.server_params)
44 |         self.read, self.write = await self._stdio_ctx.__aenter__()
45 | 
46 |         self.session = ClientSession(self.read, self.write)
47 |         await self.session.__aenter__()
48 |         await self.session.initialize()
49 |         return self
50 | 
51 |     async def __aexit__(self, exc_type, exc_val, exc_tb):
52 |         """
53 |         Asynchronous context manager exit.
54 | 
55 |         Cleans up the session and stdio context.
56 |         """
57 |         if self.session:
58 |             await self.session.__aexit__(exc_type, exc_val, exc_tb)
59 |         if hasattr(self, '_stdio_ctx'):
60 |             await self._stdio_ctx.__aexit__(exc_type, exc_val, exc_tb)
61 |         return False  # do not suppress exceptions raised inside the context
62 | 
63 |     async def init_tools(
64 |         self,
65 |     ) -> None:
66 |         """
67 |         Initialize the list of tools by querying the session.
68 | 
69 |         Populates the internal tools list with available tools from the session.
70 |         """
71 |         response = await self.session.list_tools()
72 |         self._tools = response.tools
73 | 
74 |     @abstractmethod
75 |     def wrap_tool(self, tool: Any, **kwargs: Any) -> Any:
76 |         """
77 |         Abstract method to wrap a tool definition.
78 | 
79 |         Args:
80 |             tool: The tool definition object, as returned by the MCP
81 |                 server's `list_tools()` response.
82 |             **kwargs: Additional keyword arguments for tool configuration.
83 | 
84 |         Returns:
85 |             A wrapped tool object.
86 |         """
87 |         pass
88 | 
89 |     async def get_tools(
90 |         self, **kwargs: Any
91 |     ) -> list[Any]:
92 |         """
93 |         Retrieve and wrap available tools.
94 | 
95 |         Args:
96 |             **kwargs: Additional keyword arguments for tool wrapping.
97 | 
98 |         Returns:
99 |             A list of wrapped tool objects.
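        Example, using a concrete subclass such as ``LanggraphMCPClient``
        (a sketch; ``params`` is an ``StdioServerParameters`` instance):

            async with LanggraphMCPClient(server_params=params) as client:
                tools = await client.get_tools()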
100 |         """
101 |         if not self._tools:
102 |             await self.init_tools()
103 |         return [self.wrap_tool(tool, **kwargs) for tool in self._tools]
104 | 
--------------------------------------------------------------------------------
/core/clients/common/utils.py:
--------------------------------------------------------------------------------
1 | from pydantic import create_model, Field
2 | 
3 | BASIC_TYPE_MAP = {
4 |     "string": str,
5 |     "number": float,
6 |     "integer": int,
7 |     "boolean": bool,
8 |     "array": list,
9 |     "object": dict,
10 | }
11 | 
12 | def create_pydantic_model_from_json_schema(klass, schema):
13 |     """
14 |     Creates a Pydantic model from a JSON schema.
15 |     """
16 |     fields = {}
17 |     for prop_name, prop_info in schema['properties'].items():
18 |         field_type = prop_info.get('type', 'default')  # entries without a type are schema metadata and skipped below
19 |         py_type = None
20 |         if field_type == 'default' or prop_name in ['properties', 'required', 'default', 'additionalProperties']:
21 |             continue
22 |         if field_type == 'array':
23 |             item_type = prop_info['items']['type']
24 |             if item_type == 'object':
25 |                 py_type = list[create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", prop_info['items'])]
26 |             else:
27 |                 py_type = list[BASIC_TYPE_MAP.get(item_type, None)]
28 |         elif field_type == 'object':
29 |             if prop_info.get('properties', None):
30 |                 py_type = create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", prop_info)
31 |             elif prop_info.get('$ref'):
32 |                 # NOTE: We probably need to make this more robust
33 |                 ref_info = schema['properties'].get(prop_info['$ref'].split("/")[-1])
34 |                 py_type = create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", ref_info)
35 |             elif prop_info.get('additionalProperties', {}).get('$ref', None):
36 |                 ref_info = schema['properties'].get(prop_info['additionalProperties']['$ref'].split("/")[-1])
37 |                 py_type = dict[str, create_pydantic_model_from_json_schema(f"{klass}_{prop_name}", ref_info)]
38 |             else:
39 |                 raise Exception(f"Unsupported object schema for property '{prop_name}'")
40 |         elif BASIC_TYPE_MAP.get(field_type):
41 |             py_type = BASIC_TYPE_MAP[field_type]
42 | 
43 |         if py_type is None:
44 |             raise Exception(f"Unsupported field type '{field_type}' for property '{prop_name}'")
45 | 
46 |         default = ... if prop_name in schema.get('required', []) else prop_info.get('default', None)  # required fields stay required; optional ones take the schema default
47 |         description = prop_info.get('description', '')
48 |         fields[prop_name] = (py_type, Field(default, description=description))
49 | 
50 |     return create_model(klass, **fields)
--------------------------------------------------------------------------------
/core/clients/langgraph/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/clients/langgraph/__init__.py
--------------------------------------------------------------------------------
/core/clients/langgraph/client.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | import logging
3 | from langchain_core.tools import StructuredTool
4 | from ..common.mcp import BaseMCPClient
5 | from ..common.utils import create_pydantic_model_from_json_schema
6 | 
7 | logger = logging.getLogger(__name__)
8 | 
9 | class LanggraphMCPClient(BaseMCPClient):
10 | 
11 |     def tool_call(self, tool_name: str) -> Any:
12 |         """Create an asynchronous function to call a tool by its name.
13 | 
14 |         Args:
15 |             tool_name: The name of the tool to be called.
16 | 
17 |         Returns:
18 |             An asynchronous function that executes the tool with the provided arguments.
19 |         """
20 | 
21 |         async def tool_function(*args: Any, **kwargs: Any) -> Any:
22 |             logger.info(f"Executing tool {tool_name} with args={args} and kwargs={kwargs}")
23 |             result = await self.session.call_tool(tool_name, arguments=kwargs)
24 |             return result
25 | 
26 |         return tool_function
27 | 
28 |     def wrap_tool(
29 |         self, tool: Any, **kwargs: Any
30 |     ) -> StructuredTool:
31 |         """Wrap a tool as a StructuredTool instance.
32 | 
33 |         Args:
34 |             tool: The tool object to wrap.
35 |             **kwargs: Additional keyword arguments for tool configuration.
36 | 
37 |         Returns:
38 |             A StructuredTool instance configured with the provided tool and arguments.
39 |         """
40 | 
41 |         return StructuredTool.from_function(
42 |             coroutine=self.tool_call(tool.name),
43 |             name=tool.name,
44 |             description=tool.description or "No description provided.",
45 |             args_schema=create_pydantic_model_from_json_schema(tool.name, tool.inputSchema),
46 |         )
47 | 
--------------------------------------------------------------------------------
/core/mcp_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/core/mcp_architecture.png
--------------------------------------------------------------------------------
/core/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 | 
5 | [project]
6 | name = "mcp-client"
7 | version = "0.1.0"
8 | description = "MCP Client"
9 | readme = "README.md"
10 | requires-python = ">=3.11"
11 | dependencies = [
12 |     "mcp>=1.1.0",
13 |     "openai>=1.43.0",
14 |     "pydantic>=2.9.2",
15 |     "langchain_community>=0.3.1",
16 |     "langsmith>=0.1.129",
17 |     "jinja2>=3.1.2",
18 |     "langchain_openai>=0.2.14,<0.3",
19 |     "langgraph>=0.2.60,<0.3",
20 |     "langchain_arcade>=0.1.2,<0.2.0",
21 |     "arcadepy >= 0.1.0, <0.2.0",
22 |     "httpx >= 0.27.1, <0.28.0",
23 | ]
24 | 
25 | [[project.authors]]
26 | name = "lgesuellip"
27 | email = "lautaro@pampa.ai"
28 | 
29 | [tool.hatch.build.targets.wheel]
30 | packages = ["agents", "clients"]
31 | 
32 | [project.scripts]
33 | react-agent = "agents.base.react:main"
--------------------------------------------------------------------------------
/servers/.env.example:
--------------------------------------------------------------------------------
1 | FIRECRAWL_API_KEY =
2 | OPENAI_API_KEY =
3 | 
4 | # Optional
5 | LANGSMITH_API_KEY =
6 | LANGCHAIN_TRACING_V2 =
7 | LANGCHAIN_PROJECT =
8 | 
--------------------------------------------------------------------------------
/servers/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 lgesuellip
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/servers/README.md:
--------------------------------------------------------------------------------
1 | # website_firecrawl_service - MCP Server
2 | 
3 | 🔍 Internet research just got smarter! Built an MCP server that turns any website into structured, relevant content based on your queries!
4 | 
5 | Using @Firecrawl's powerful features (mapping, selection, scraping), combined with GPT-4o-mini for smart URL filtering, it's like having an AI research assistant that knows exactly what you're looking for! Works seamlessly with Claude, or any MCP-compatible client!
6 | 
7 | ![Agentic Web Scraping Architecture](web_firecrawl_service_v0.1.0.png)
8 | 
9 | *An agentic web scraping system powered by Firecrawl: Map → Select → Scrape → Extract*
10 | 
11 | ---
12 | 
13 | ## Features
14 | 
15 | - **Efficient Web Crawling**: Crawls websites using the Firecrawl API with customizable link limits and intelligent URL selection
16 | - **Intelligent URL Selection**: Uses GPT-4o-mini to select the most relevant URLs based on user queries
17 | - **Smart Content Processing**: Extracts and cleans HTML content, providing readable text output
18 | 
19 | ## Setup
20 | 
21 | 1. Create a `.env` file with the following required environment variables:
22 |    ```
23 |    FIRECRAWL_API_KEY=your_firecrawl_api_key
24 |    OPENAI_API_KEY=your_openai_api_key
25 |    ```
26 | 
27 | ## Usage
28 | 
29 | The server exposes a single tool:
30 | 
31 | ### website_firecrawl
32 | 
33 | **Description**: Crawls a website and returns relevant content based on a query.
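For example, an MCP client might call the tool with arguments like the following (a sketch; the fields mirror the `WebsiteCrawlArgs` model in `server.py`, and the values are placeholders):

```json
{
  "query": "pricing and enterprise plans",
  "base_url": "example.com",
  "max_links": 50
}
```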
34 | 
35 | **Parameters**:
36 | - `query` (string): The search query to filter relevant content
37 | - `base_url` (string): The target website URL to crawl
38 | - `max_links` (integer, optional): Maximum number of links to process (default: 100)
39 | 
40 | ## Technical Details
41 | 
42 | - Built using the MCP (Model Context Protocol) server framework
43 | - Implements retry logic with exponential backoff for API calls
44 | - Integrates with LangSmith for tracing and monitoring
45 | - Implements singleton patterns for API clients to manage resources efficiently
46 | - Uses Pydantic for robust data validation and serialization:
47 |   - `WebsiteCrawlArgs`: Validates input parameters for the crawling service
48 |   - `CrawlerModel`: Handles URL selection and justification
49 |   - `Page`: Structures metadata and content from crawled pages
50 | - Structured OpenAI Integration:
51 |   - Uses OpenAI's beta chat completions with parsing
52 |   - Automatically validates and converts JSON responses to Pydantic models
53 |   - Ensures type safety and data validation for AI-generated content
54 | - Jinja2 Template System:
55 |   - Modular prompt management with reusable Jinja2 templates
56 |   - Dynamic prompt generation based on user queries and context
57 |   - Separate system and user prompt templates for clear separation of concerns
58 |   - Easy maintenance and updates of prompt structures
59 | 
--------------------------------------------------------------------------------
/servers/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "website-firecrawl-service"
3 | version = "0.1.0"
4 | description = "This tool leverages Firecrawl to generate concise summaries of web pages directly from their URLs. Firecrawl processes the content of the provided website, extracting key insights and metadata to deliver a brief, focused summary."
5 | readme = "README.md"
6 | requires-python = ">=3.11"
7 | dependencies = [
8 |     "mcp>=1.1.0",
9 |     "openai>=1.43.0",
10 |     "pydantic>=2.9.2",
11 |     "langchain_community>=0.3.1",
12 |     "langsmith>=0.1.129",
13 |     "tenacity>=8.5.0",
14 |     "firecrawl>=1.0.0",
15 |     "jinja2>=3.1.2",
16 |     "beautifulsoup4>=4.12.3"
17 | ]
18 | 
19 | [[project.authors]]
20 | name = "lgesuellip"
21 | email = "lautaro@pampa.ai"
22 | 
23 | [build-system]
24 | requires = [ "hatchling",]
25 | build-backend = "hatchling.build"
26 | 
27 | [project.scripts]
28 | website-firecrawl-service = "website_firecrawl_service:main"
--------------------------------------------------------------------------------
/servers/web_firecrawl_service_v0.1.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgesuellip/researcher_agent/d95548b6e63db7f46a94df432ebccc703fb8ec6b/servers/web_firecrawl_service_v0.1.0.png
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/__init__.py:
--------------------------------------------------------------------------------
1 | from .
import server
2 | import asyncio
3 | 
4 | def main():
5 |     """Main entry point for the package."""
6 |     asyncio.run(server.main())
7 | 
8 | __all__ = ['main', 'server']
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/crawler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from typing import List, Dict, Optional
4 | from urllib.parse import urlparse
5 | from pydantic import BaseModel, Field
6 | from dotenv import load_dotenv
7 | 
8 | from firecrawl import FirecrawlApp
9 | 
10 | from langsmith import traceable
11 | 
12 | from website_firecrawl_service.openai import Inference
13 | from website_firecrawl_service.prompt import SYSTEM_CRAWLER_PROMPT, USER_CRAWLER_PROMPT
14 | 
15 | logging.basicConfig(level=logging.INFO)
16 | logger = logging.getLogger(__name__)
17 | 
18 | load_dotenv()
19 | 
20 | inference = Inference()
21 | 
22 | model_args = {
23 |     "model": "gpt-4o-mini",
24 |     "temperature": 0,
25 | }
26 | 
27 | class CrawlerModel(BaseModel):
28 |     target_urls: List[str] = Field(description="The domain URLs to be considered")
29 |     justification: str = Field(description="The reason for selecting these target URLs")
30 | 
31 | class Page(BaseModel):
32 |     metadata: Dict
33 |     body: str
34 | 
35 | class FirecrawlClient:
36 |     _instance = None
37 | 
38 |     def __new__(cls):
39 |         if cls._instance is None:
40 |             cls._instance = super().__new__(cls)
41 |             cls._instance._initialize()
42 |         return cls._instance
43 | 
44 |     def _initialize(self):
45 | 
46 |         self.client = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
47 | 
48 | class WebsiteCrawler:
49 |     def __init__(self):
50 |         self._pages: List[Page] = []
51 |         self.firecrawl = FirecrawlClient()
52 | 
53 |     def _normalize_url(self, url: str) -> str:
54 |         """Normalize the URL and handle GitHub repositories specially"""
55 |         if not url.startswith(('http://', 'https://')):
56 |             if url.startswith(('http:/', 'https:/')):
57 |                 url = url.replace(':/', '://', 1)  # repair a single missing slash, e.g. "http:/x" -> "http://x"
58 |             else:
59 |                 url = f'http://{url}'
60 | 
61 |         url_obj = urlparse(url)
62 |         stem_url = url_obj.netloc
63 | 
64 |         # Special handling for GitHub URLs
65 |         if 'github.com' in stem_url:
66 |             path_segments = [seg for seg in url_obj.path.split('/') if seg]
67 |             if len(path_segments) >= 2:
68 |                 owner, repo = path_segments[0:2]
69 |                 stem_url = f"{stem_url}/{owner}/{repo}"
70 | 
71 |         return stem_url
72 | 
73 | 
74 |     @traceable(name="select_crawler_urls")
75 |     async def _select_links(self, query: str, links: List[Dict]):
76 |         messages = [
77 |             {"role": "system", "content": SYSTEM_CRAWLER_PROMPT.render()},
78 |             {"role": "user", "content": USER_CRAWLER_PROMPT.render(query=query, links=links)}
79 |         ]
80 |         result = await inference.predict_with_parse_async(model_args, CrawlerModel, messages)
81 |         return result.target_urls
82 | 
83 |     async def crawl(self, query: str, base_url: str, max_links: int = 100, llm_predict: bool = True) -> Optional[List[Page]]:
84 |         """Fetch multiple pages using Firecrawl API and create Page objects"""
85 |         try:
86 |             logger.info(f"Searching for URLs in {base_url}")
87 | 
88 |             self.base_url = self._normalize_url(base_url)
89 | 
90 |             # Map the URL to get a list of links
91 |             map_result = self.firecrawl.client.map_url(
92 |                 self.base_url,
93 |                 params={'limit': max_links}
94 |             )
95 | 
96 |             if not map_result.get('success'):
97 |                 logger.error(f"Failed to map URL: {base_url}")
98 |                 return None
99 | 
100 |             # Select links based on relevance, given the user query
101 |             links = await
self._select_links(query, map_result['links']) if llm_predict else map_result['links'] 102 | 103 | # Scrape the selected links 104 | for link in links[:]: 105 | try: 106 | logger.info(f"Starting scrape for link: {link}") 107 | 108 | page = self.firecrawl.client.scrape_url( 109 | link, 110 | params={ 111 | 'formats': ['rawHtml'], 112 | }, 113 | ) 114 | 115 | self._pages.append(Page( 116 | body=page['rawHtml'], 117 | metadata={ 118 | 'url': page['metadata']['url'], 119 | 'title': page['metadata'].get('title', ''), 120 | 'description': page['metadata'].get('description', ''), 121 | 'language': page['metadata'].get('language', ''), 122 | }, 123 | )) 124 | except Exception as e: 125 | logger.error(f"Error scraping {link}: {str(e)}") 126 | continue 127 | 128 | return self._pages 129 | 130 | except Exception as e: 131 | logger.error(f"Error during crawl: {str(e)}") 132 | return None 133 | 134 | -------------------------------------------------------------------------------- /servers/website_firecrawl_service/openai.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from openai import AsyncOpenAI 3 | from tenacity import ( 4 | retry, 5 | stop_after_attempt, 6 | wait_random_exponential, 7 | ) 8 | import os 9 | from langsmith.wrappers import wrap_openai 10 | import logging 11 | from langsmith import traceable 12 | 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class OpenAIClientSingleton: 18 | _instance = None 19 | 20 | @classmethod 21 | def get_instance(cls): 22 | if cls._instance is None: 23 | api_key = os.getenv("OPENAI_API_KEY") 24 | if not api_key: 25 | raise ValueError("OPENAI_API_KEY not found in environment variables") 26 | 27 | cls._instance = wrap_openai(AsyncOpenAI(api_key=api_key)) 28 | return cls._instance 29 | 30 | class Inference: 31 | def __init__(self): 32 | self.client = OpenAIClientSingleton.get_instance() 33 | 34 | @traceable 35 | @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(3)) 36 | async def predict_with_parse_async(self, model_args: Dict, response_format, messages: List[Dict]): 37 | 38 | response = await self.client.beta.chat.completions.parse( 39 | **model_args, 40 | messages=messages, 41 | response_format=response_format, 42 | ) 43 | 44 | return response.choices[0].message.parsed 45 | -------------------------------------------------------------------------------- /servers/website_firecrawl_service/prompt.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Template 2 | 3 | SYSTEM_CRAWLER_PROMPT = Template(""" 4 | 5 | As an expert in web crawling, data extraction, and content relevance identification, your goal is to select URLs based on their relevance to the user's query. 6 | 7 | 8 | 9 | - PRIMARY FOCUS: Identify URLs that best address the user's information needs as expressed in the query. 10 | - EXCLUDE: Omit URLs that are not relevant or do not contribute meaningful information related to the query. 11 | - DEDUPLICATE: Ensure all URLs in the final output are unique. 12 | 13 | """) 14 | 15 | USER_CRAWLER_PROMPT = Template(""" 16 | The user is seeking information related to: "{{query}}" 17 | 18 | Below is a list of URLs. Based on the user's query, select the URLs most likely to contain relevant and helpful information. 
19 | 
20 | <links>
21 | {{links}}
22 | </links>
23 | """)
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/server.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List, Optional
3 | from pydantic import BaseModel
4 | from dotenv import load_dotenv
5 | 
6 | import mcp.types as types
7 | from mcp.server import NotificationOptions, Server
8 | from mcp.server.models import InitializationOptions
9 | import mcp.server.stdio
10 | 
11 | 
12 | from .crawler import WebsiteCrawler
13 | from website_firecrawl_service.utils import clean_html_text
14 | 
15 | 
16 | load_dotenv()
17 | 
18 | server = Server("website_firecrawl_service")
19 | 
20 | class WebsiteCrawlArgs(BaseModel):
21 |     query: str
22 |     base_url: str
23 |     max_links: int = 100
24 | 
25 |     model_config = {
26 |         "json_schema_extra": {
27 |             "description": "Arguments for crawling a website"
28 |         }
29 |     }
30 | 
31 | @server.list_tools()
32 | async def handle_list_tools() -> list[types.Tool]:
33 |     """
34 |     List available tools.
35 |     Each tool specifies its arguments using JSON Schema validation.
36 |     """
37 |     return [
38 |         types.Tool(
39 |             name="website_firecrawl",
40 |             description="Crawl a website",
41 |             inputSchema=WebsiteCrawlArgs.model_json_schema(),
42 |         )
43 |     ]
44 | 
45 | @server.call_tool()
46 | async def handle_call_tool(
47 |     name: str, arguments: Optional[dict]
48 | ) -> List[types.TextContent | types.ImageContent | types.EmbeddedResource]:
49 |     """
50 |     Handle tool execution requests.
51 | 
52 |     Args:
53 |         name: The name of the tool to execute
54 |         arguments: Dictionary of tool arguments
55 | 
56 |     Returns:
57 |         List of content items produced by the tool
58 | 
59 |     Raises:
60 |         ValueError: If tool name is invalid or arguments are missing
61 |     """
62 |     try:
63 |         if name != "website_firecrawl":
64 |             raise ValueError(f"Unknown tool: {name}")
65 | 
66 |         if not arguments:
67 |             raise ValueError("Missing arguments")
68 | 
69 |         args = WebsiteCrawlArgs.model_validate(arguments)
70 | 
71 |         logging.info(f"Starting crawl of query={args.query} and base_url={args.base_url} with max_links={args.max_links}")
72 | 
73 |         crawler = WebsiteCrawler()
74 |         pages = await crawler.crawl(args.query, args.base_url, args.max_links, llm_predict=True)
75 | 
76 |         return [
77 |             types.TextContent(
78 |                 type="text",
79 |                 text="\n".join(clean_html_text(str(page)) for page in pages or []),  # crawl() may return None on failure
80 |             )
81 |         ]
82 |     except Exception as e:
83 |         logging.error(f"Error during crawl: {str(e)}")
84 |         raise
85 | 
86 | async def main():
87 |     logging.basicConfig(level=logging.INFO)
88 | 
89 |     async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
90 |         try:
91 |             await server.run(
92 |                 read_stream,
93 |                 write_stream,
94 |                 InitializationOptions(
95 |                     server_name=server.name,
96 |                     server_version="0.1.0",
97 |                     capabilities=server.get_capabilities(notification_options=NotificationOptions(), experimental_capabilities={})
98 |                 )
99 |             )
100 |         except Exception as e:
101 |             logging.error(f"Server error: {str(e)}")
102 |             raise
103 | 
--------------------------------------------------------------------------------
/servers/website_firecrawl_service/utils.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import re
3 | 
4 | def clean_html_text(text: str) -> str:
5 |     """Remove HTML tags and clean up text."""
6 |     soup = BeautifulSoup(text, 'html.parser')
7 |     clean_text = soup.get_text()
8 |     clean_text = re.sub(r'\s+', ' ', clean_text).strip()
9 |     return clean_text
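# A quick illustration of the cleaning behavior (hypothetical input):
#
#   clean_html_text("<div><p>Hello\n   world</p></div>")
#   # -> "Hello world"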
--------------------------------------------------------------------------------