├── graph.png ├── src └── company_researcher │ ├── __init__.py │ ├── utils │ ├── all.py │ └── tavily_utils.py │ ├── nodes │ ├── __init__.py │ ├── enrich.py │ ├── ground.py │ ├── rerank.py │ ├── research.py │ ├── cluster.py │ └── write.py │ ├── config.py │ ├── router.py │ ├── graph.py │ └── state.py ├── .env.example ├── langgraph.json ├── README.md ├── pyproject.toml └── .gitignore /graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tavily-ai/tavily_company_researcher/HEAD/graph.png -------------------------------------------------------------------------------- /src/company_researcher/__init__.py: -------------------------------------------------------------------------------- 1 | from company_researcher.graph import graph 2 | 3 | __all__ = ["graph"] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="your-API-key" 2 | TAVILY_API_KEY="your-API-key" 3 | COHERE_API_KEY="your-API-key" -------------------------------------------------------------------------------- /src/company_researcher/utils/all.py: -------------------------------------------------------------------------------- 1 | from .tavily_utils import Tavily 2 | 3 | class Utils: 4 | def __init__(self): 5 | self.tavily = Tavily() -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": ["."], 3 | "graphs": { 4 | "agent": "./src/company_researcher/graph.py:graph" 5 | }, 6 | "env": ".env" 7 | } 8 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | from .ground import GroundAgent 2 | from .research import ResearchAgent 3 | from .cluster import ClusterAgent 4 | from .rerank import RerankAgent 5 | from .enrich import EnrichAgent 6 | from .write import WriteAgent -------------------------------------------------------------------------------- /src/company_researcher/config.py: -------------------------------------------------------------------------------- 1 | from langchain_openai import ChatOpenAI 2 | 3 | # Description: Configuration file 4 | class Config: 5 | def __init__(self): 6 | """ 7 | Initializes the configuration for the agent 8 | """ 9 | self.MAX_SEARCH_QUERIES = 6 10 | self.DEFAULT_CLUSTER_SIZE = 10 11 | self.RERANK_TIMEOUT = 3 12 | self.MAX_PROMPT_LENGTH = 350000 13 | self.MAX_DOC_LENGTH = 8000 14 | self.BASE_LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=2000) 15 | self.FACTUAL_LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0.0, max_tokens=2000) 16 | self.DEBUG = False -------------------------------------------------------------------------------- /src/company_researcher/router.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | def cluster_router(state) -> Literal["enrich", "rerank"]: 4 | """Routes the workflow after the 'cluster' step. 5 | 6 | If no clusters are formed, it falls back to 'rerank' for reevaluation.""" 7 | if state.clusters: 8 | return "enrich" 9 | else: 10 | return "rerank" 11 | 12 | def rerank_router(state) -> Literal["enrich", "write"]: 13 | """Routes the workflow after the 'rerank' step. 
14 | 15 | If no clusters are formed even after reranking, it skips to the 'write' step without enriching the documents.""" 16 | if state.clusters: 17 | return "enrich" 18 | else: 19 | return "write" 20 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/enrich.py: -------------------------------------------------------------------------------- 1 | class EnrichAgent: 2 | def __init__(self, cfg, utils): 3 | self.cfg = cfg 4 | self.utils = utils 5 | 6 | async def run(self, state): 7 | clusters = state.clusters 8 | chosen_cluster = clusters[state.chosen_cluster] 9 | msg = f"🚀 Enriching documents for selected cluster '{chosen_cluster.company_name}'...\n" 10 | if self.cfg.DEBUG: 11 | print(msg) 12 | research_data, extract_msg = await self.utils.tavily.extract(chosen_cluster.urls, state.research_data) 13 | if self.cfg.DEBUG: 14 | print(extract_msg) 15 | return {"research_data": research_data, "messages": msg + extract_msg} 16 | 17 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/ground.py: -------------------------------------------------------------------------------- 1 | class GroundAgent: 2 | def __init__(self, cfg, utils): 3 | self.cfg = cfg 4 | self.utils = utils 5 | 6 | async def run(self, state): 7 | msg = f"🔗 Initiating grounding for company '{state.company}'...\n" 8 | if self.cfg.DEBUG: 9 | print(msg) 10 | grounding_data, extract_msg = await self.utils.tavily.extract([state.company_url], state.grounding_data) 11 | if self.cfg.DEBUG: 12 | print(extract_msg) 13 | if not grounding_data: 14 | grounding_data, extract_msg = await self.utils.tavily.extract([state.company_url], state.grounding_data, 15 | extract_depth="advanced") 16 | if self.cfg.DEBUG: 17 | print("Used advanced grounding") 18 | return {"grounding_data": grounding_data, "messages": msg + extract_msg} 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tavily Company Research Agent 2 | 3 | This agent automates company research by leveraging Tavily to retrieve accurate, up-to-date data. You can also use the "include" argument to customize the report by providing a list of specific details you want to include. For example, you can request details such as "Company's CEO", "Location of Headquarters", or other specific information. 4 | 5 | ## Key Steps 6 | 7 | 1. **🔗 Grounding**: Establishes the website URL as a trusted baseline for all research efforts. 8 | 2. **🔎 Searching**: Collects a wide range of relevant data from various online sources, including **trusted sources like LinkedIn**, to ensure accuracy and reliability. 9 | 3. **📊 Clustering**: Organizes the collected data into clusters, picking the most relevant one. This is especially handy for companies with similar names or limited online visibility. 10 | 4. **🚀 Extraction**: Enriches documents in the chosen cluster. 11 | 5. **📝 Generation**: Creates a detailed company report. 12 | 13 | Our goal is to provide you with a practical tool that helps you effortlessly and efficiently gather meaningful insights on any company. 
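## ⚙️ Quickstart

The snippet below is a minimal usage sketch, not an official entrypoint: the `graph` export and the input fields come from `src/company_researcher/__init__.py` and `state.py`, while the company values are placeholders. It assumes the package is installed (e.g. `pip install -e .`) and that the API keys from `.env.example` are exported in your environment:

```python
import asyncio

from company_researcher import graph


async def main():
    # InputState fields: company (required), company_url (required), include (optional)
    result = await graph.ainvoke({
        "company": "Tavily",
        "company_url": "https://tavily.com/",
        "include": ["Name of the CEO", "Number of employees"],
    })
    print(result["report"])


asyncio.run(main())
```

Since `langgraph.json` registers the compiled graph under the name `agent`, the same workflow can also be served via the LangGraph CLI or Studio without extra wiring.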
14 | 15 | ## 🔍 Workflow Overview 16 | 17 | ![Workflow Graph](graph.png) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tavily_company_researcher" 3 | version = "0.0.1" 4 | description = "Starter template for a company research agent using Tavily and LangGraph." 5 | authors = [ 6 | { name = "Tavily", email = "support@tavily.com" }, 7 | ] 8 | readme = "README.md" 9 | license = { text = "MIT" } 10 | requires-python = ">=3.11" 11 | dependencies = [ 12 | "langchain-core", 13 | "langchain-openai", 14 | "langgraph", 15 | "tavily-python", 16 | "pydantic", 17 | "cohere" 18 | ] 19 | 20 | [project.optional-dependencies] 21 | dev = ["mypy>=1.11.1", "ruff>=0.6.1"] 22 | 23 | [build-system] 24 | requires = ["setuptools>=73.0.0", "wheel"] 25 | build-backend = "setuptools.build_meta" 26 | 27 | [tool.setuptools] 28 | packages = ["tavily.templates.company_researcher", "company_researcher"] 29 | [tool.setuptools.package-dir] 30 | "tavily.templates.company_researcher" = "src/company_researcher" 31 | "company_researcher" = "src/company_researcher" 32 | 33 | 34 | [tool.setuptools.package-data] 35 | "*" = ["py.typed"] 36 | 37 | [tool.ruff] 38 | lint.select = [ 39 | "E", # pycodestyle 40 | "F", # pyflakes 41 | "I", # isort 42 | "D", # pydocstyle 43 | "D401", # First line should be in imperative mood 44 | "T201", 45 | "UP", 46 | ] 47 | lint.ignore = [ 48 | "UP006", 49 | "UP007", 50 | # We actually do want to import from typing_extensions 51 | "UP035", 52 | # Relax the convention by _not_ requiring documentation for every function parameter. 53 | "D417", 54 | "E501", 55 | ] 56 | [tool.ruff.lint.per-file-ignores] 57 | "tests/*" = ["D", "UP"] 58 | [tool.ruff.lint.pydocstyle] 59 | convention = "google" 60 | 61 | -------------------------------------------------------------------------------- /src/company_researcher/graph.py: -------------------------------------------------------------------------------- 1 | from langgraph.graph import StateGraph, END 2 | 3 | from company_researcher.config import Config 4 | from company_researcher.state import InputState, OutputState, ResearchState 5 | from company_researcher.nodes import GroundAgent, ResearchAgent, ClusterAgent, RerankAgent, EnrichAgent, WriteAgent 6 | from company_researcher.utils.all import Utils 7 | from company_researcher.router import cluster_router, rerank_router 8 | 9 | 10 | cfg = Config() 11 | utils = Utils() 12 | 13 | # Initialize agents 14 | ground_agent = GroundAgent(cfg, utils) 15 | research_agent = ResearchAgent(cfg, utils) 16 | cluster_agent = ClusterAgent(cfg, utils) 17 | rerank_agent = RerankAgent(cfg, utils) 18 | enrich_agent = EnrichAgent(cfg, utils) 19 | write_agent = WriteAgent(cfg, utils) 20 | 21 | # Define the LangGraph workflow 22 | workflow = StateGraph(ResearchState, input=InputState, output=OutputState) 23 | 24 | # Add a node for each agent 25 | workflow.add_node('ground', ground_agent.run) 26 | workflow.add_node('research', research_agent.run) 27 | workflow.add_node('cluster', cluster_agent.run) 28 | workflow.add_node('rerank', rerank_agent.run) 29 | workflow.add_node('enrich', enrich_agent.run) 30 | workflow.add_node('write', write_agent.run) 31 | 32 | # Set up edges 33 | workflow.add_edge('ground', 'research') 34 | workflow.add_edge('research', 'cluster') 35 | workflow.add_conditional_edges('cluster', cluster_router) 36 | workflow.add_conditional_edges('rerank', rerank_router) 37 | 
workflow.add_edge('enrich', 'write') 38 | workflow.add_edge('write', END) 39 | 40 | # Set up start node 41 | workflow.set_entry_point('ground') 42 | 43 | graph = workflow.compile() 44 | graph.name = "Tavily Company Researcher" 45 | -------------------------------------------------------------------------------- /src/company_researcher/state.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Dict, Union, List, Annotated 3 | from langchain_core.messages import AnyMessage 4 | from langgraph.graph import add_messages 5 | 6 | from company_researcher.nodes.cluster import Cluster 7 | from company_researcher.utils.tavily_utils import TavilySearchInput, TavilyQuery 8 | 9 | class InputState(BaseModel): 10 | company: str = Field( 11 | description="The name of the company to research", 12 | examples=["Tavily"], 13 | ) 14 | company_url: str = Field( 15 | description="The official website URL of the company.", 16 | examples=["https://tavily.com/"], 17 | ) 18 | include: list[str] = Field( 19 | description=( 20 | "Optional list specifying information to include in the company research report, " 21 | "such as the company's official website URL, LinkedIn profile URL, headquarters location, " 22 | "number of employees, CEO's name, and more." 23 | ), 24 | examples=[ 25 | "Company's official website URL", 26 | "Company's LinkedIn profile URL", 27 | "Location of headquarters formatted as City, State (e.g. San Francisco, CA)", 28 | "Number of employees", 29 | "Name of the CEO" 30 | ], 31 | default_factory=list 32 | ) 33 | 34 | 35 | class OutputState(BaseModel): 36 | report: str = "" 37 | 38 | class ResearchState(InputState, OutputState): 39 | grounding_data: Dict[str, Dict[str, Union[str, None]]] = Field(default_factory=dict) 40 | research_data: Dict[str, Dict[str, Union[str, float, None]]] = Field(default_factory=dict) 41 | clusters: List[Cluster] = Field(default_factory=list) 42 | chosen_cluster: int = Field(default=0) 43 | search_queries: List[TavilyQuery] = Field(default_factory=list) 44 | messages: Annotated[List[AnyMessage], add_messages] = Field(default_factory=list) 45 | 46 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/rerank.py: -------------------------------------------------------------------------------- 1 | import cohere 2 | import asyncio 3 | 4 | from company_researcher.nodes.cluster import Cluster 5 | 6 | 7 | class RerankAgent: 8 | def __init__(self, cfg, utils): 9 | self.cfg = cfg 10 | self.utils = utils 11 | self.co = cohere.AsyncClient() 12 | 13 | async def rerank_documents(self, query, documents, top_n, timeout): 14 | """Performs reranking of documents using Cohere.""" 15 | try: 16 | response = await asyncio.wait_for( 17 | self.co.rerank( # NOTE: depending on your cohere SDK version, an explicit model="..." argument may be required here 18 | query=query, 19 | documents=documents, 20 | top_n=top_n, 21 | return_documents=False, 22 | ), 23 | timeout=timeout, 24 | ) 25 | return response.results 26 | except asyncio.TimeoutError: 27 | raise TimeoutError("Timeout occurred during reranking") 28 | except Exception as e: 29 | raise RuntimeError(f"Unexpected error during reranking: {e}") 30 | 31 | def create_cluster(self, company_name, urls): 32 | """Creates a Cluster object from company name and URLs.""" 33 | return Cluster( 34 | company_name=company_name, 35 | urls=urls, 36 | ) 37 | 38 | async def run(self, state): 39 | """Main method to rerank research data and create clusters.""" 40 | msg = "🔄 Reranking research data...\n" 41 | data 
= list(state.research_data.values()) 42 | 43 | try: 44 | # Perform reranking 45 | query = f"Company {state.company}" 46 | if state.include: 47 | query += " " + " ".join(state.include) 48 | if self.cfg.DEBUG: 49 | print("Reranking query: ", query) 50 | documents = [result["content"] for result in data] 51 | top_n = self.cfg.DEFAULT_CLUSTER_SIZE 52 | timeout = self.cfg.RERANK_TIMEOUT 53 | 54 | rerank_results = await self.rerank_documents( 55 | query=query, 56 | documents=documents, 57 | top_n=top_n, 58 | timeout=timeout, 59 | ) 60 | 61 | # Process results 62 | urls = [] 63 | msg += "Top documents selected:\n" 64 | for r in rerank_results: 65 | original_result = data[r.index] 66 | msg += f"{original_result['url']}\n" 67 | urls.append(original_result["url"]) 68 | 69 | # Create and return cluster 70 | cluster = self.create_cluster(state.company, urls) 71 | return {"clusters": [cluster], "messages": msg} 72 | 73 | except TimeoutError: 74 | return {"messages": "🚫 Timeout occurred while reranking research data"} 75 | except RuntimeError as e: 76 | return {"messages": f"🚫 Error during reranking: {e}"} 77 | except Exception as e: 78 | return {"messages": f"🚫 Unexpected error during reranking: {e}"} 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | .idea/ 163 | 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /src/company_researcher/utils/tavily_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import datetime 3 | from tavily import AsyncTavilyClient 4 | from typing import List, Optional 5 | from pydantic import BaseModel, Field 6 | from company_researcher.config import Config 7 | cfg = Config() 8 | 9 | # Define Tavily's arguments to tailor the search results 10 | class TavilyQuery(BaseModel): 11 | query: str = Field(description="Web search query") 12 | search_depth: str = Field(description="The depth of the search, available options: 'basic', 'advanced'") 13 | time_range: Optional[str] = Field(default=None, description="(Optional) Filters results by time range, useful when looking for sources like magazines and articles. Available options: 'day', 'week', 'month', 'year'.") 14 | include_domains: Optional[List[str]] = Field(default=None, 15 | description="List of domains to include in the research. 
Useful when trying to gather information from trusted and relevant domains") 16 | 17 | class TavilySearchInput(BaseModel): 18 | sub_queries: List[TavilyQuery] = Field(description="Set of web search queries that can be answered in isolation") 19 | 20 | 21 | class Tavily: 22 | def __init__(self): 23 | self.client = AsyncTavilyClient() 24 | 25 | async def extract(self, urls: list[str], sources_dict: dict, extract_depth="basic"): 26 | msg = "" 27 | 28 | async def process_batch(url_batch): 29 | batch_msg = "" 30 | try: 31 | response = await self.client.extract(urls=url_batch, extract_depth=extract_depth) 32 | for itm in response['results']: 33 | url = itm['url'] 34 | raw_content = itm['raw_content'] 35 | if len(raw_content) > cfg.MAX_DOC_LENGTH: 36 | raw_content = raw_content[:cfg.MAX_DOC_LENGTH] + " [...]" 37 | if cfg.DEBUG: 38 | print(f"Content from {url} was truncated to the maximum allowed length ({cfg.MAX_DOC_LENGTH} characters). Current length: {len(raw_content)}\nPreview:\n{raw_content}") 39 | if url in sources_dict: 40 | sources_dict[url]['raw_content'] = raw_content 41 | else: 42 | sources_dict[url] = {'raw_content': raw_content} 43 | batch_msg += f"{url}\n" 44 | return batch_msg 45 | except Exception as e: 46 | return f"Error occurred during Tavily Extract request for batch: {e}\n" 47 | 48 | # Split URLs into batches of 20 49 | url_batches = [urls[i:i + 20] for i in range(0, len(urls), 20)] 50 | 51 | # Process all batches in parallel 52 | results = await asyncio.gather(*[process_batch(batch) for batch in url_batches]) 53 | 54 | # Collect messages from all batches 55 | if results: 56 | msg += "Extracted raw content for:\n" + "".join(results) 57 | 58 | return sources_dict, msg 59 | 60 | async def search(self, sub_queries: List[TavilyQuery], sources_dict: dict): 61 | """ 62 | Perform searches for each sub-query using the Tavily Search concurrently. 63 | 64 | :param sub_queries: List of search queries. 65 | :param sources_dict: Dictionary to store unique search results, keyed by URL. 
66 | """ 67 | 68 | # Define a coroutine function to perform a single search with error handling 69 | async def perform_search(query): 70 | try: 71 | print(query) 72 | # Add date to the query as we need the most recent results 73 | # query_with_date = f"{query.query} {datetime.now().strftime('%m-%Y')}" 74 | tavily_response = await self.client.search(query=query.query, topic="general", search_depth=query.search_depth, time_range=query.time_range, include_domains=query.include_domains, max_results=10) 75 | return tavily_response['results'] 76 | except Exception as e: 77 | # Handle any exceptions, log them, and return an empty list 78 | if cfg.DEBUG: 79 | print(f"Error occurred during search for query '{query}': {str(e)}") 80 | return [] 81 | 82 | # Run all the search tasks in parallel 83 | search_tasks = [perform_search(itm) for itm in sub_queries] 84 | search_responses = await asyncio.gather(*search_tasks) 85 | 86 | # Combine the results from all the responses and update the sources_dict 87 | for response in search_responses: 88 | for result in response: 89 | url = result.get("url") 90 | if url and url not in sources_dict: 91 | # Add the result to sources_dict if the URL is not already present 92 | sources_dict[url] = result 93 | 94 | return sources_dict -------------------------------------------------------------------------------- /src/company_researcher/nodes/research.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Optional 3 | from langchain_core.messages import AnyMessage, AIMessage, SystemMessage, HumanMessage, ToolMessage 4 | from company_researcher.utils.tavily_utils import TavilySearchInput, TavilyQuery 5 | 6 | class ResearchAgent: 7 | def __init__(self, cfg, utils): 8 | self.cfg = cfg 9 | self.utils = utils 10 | 11 | async def generate_queries(self, state): 12 | try: 13 | msg = f"🤔 Generating search queries based on grounding data...\n" 14 | if self.cfg.DEBUG: 15 | print(msg) 16 | prompt = ( 17 | f"You are an expert company researcher specializing in generating company analysis reports.\n" 18 | f"Your task is to generate up to {self.cfg.MAX_SEARCH_QUERIES} precise **web search queries** to thoroughly understand the company: '{state.company}'.\n\n" 19 | f"### Key Areas to Explore:\n" 20 | f"- **Company Background**: Focus on keywords such as history, mission, headquarters, CEO, leadership team, and number of employees.\n" 21 | f"- **Products and Services**: Search for offerings like main products, unique features, customer segments, and market differentiation.\n" 22 | f"- **Market Position**: Use terms like market competition, industry ranking, competitive landscape, market reach, and impact.\n" 23 | f"- **Financials**: Look for information on funding rounds, revenue, financial growth, recent investments, and performance metrics.\n\n" 24 | ) 25 | 26 | if state.include: 27 | prompt += ( 28 | f"### Required Information to Include:\n" 29 | f"- You are tasked with ensuring the following specific types of information are covered in the report, as specified by the user:\n" 30 | f"{', '.join(state.include)}\n" 31 | # f"- Prioritize missing information: Check the grounding data and identify any missing elements from the required information to include.\n" 32 | f"- Generate a search query only for the information that is missing from the provided grounding data.\n" 33 | ) 34 | 35 | prompt += ( 36 | f"### Grounding Data:\n" 37 | f"Use the grounding data provided from the company's 
website below to ensure queries are closely tied to **{state.company}** and reflect its latest context:\n" 38 | f"{state.grounding_data}\n\n" 39 | f"### Additional Guidance:\n" 40 | ) 41 | 42 | # if state.include: 43 | # prompt += ( 44 | # f"- Prioritize missing information: Check the grounding data and identify any missing elements from the required information to include.\n" 45 | # ) 46 | 47 | prompt += ( 48 | f"- Ensure each query incorporates **specific keywords** derived from the grounding data, such as the company's name, key products or services, leadership titles, geographical locations, and other unique identifiers, to allow the search engine to retrieve the most relevant sources specific to the company you are researching.\n" 49 | f"- **Limit each query to 100 characters or fewer** to ensure clarity and search engine compatibility.\n" 50 | f"- Structure queries to focus on specific aspects of the company, such as \"{state.company} number of employees\" or \"{state.company} market competition.\"\n" 51 | f"- Avoid redundancy: Each query should focus on unique information to retrieve relevant details efficiently. For example, there should be only one query to search for the company's CEO name.\n" 52 | 53 | ) 54 | if self.cfg.DEBUG: 55 | print(prompt) 56 | messages = [SystemMessage(content=prompt)] 57 | response = await self.cfg.BASE_LLM.with_structured_output(TavilySearchInput).ainvoke(messages) 58 | return response.sub_queries, msg 59 | except Exception as e: 60 | msg = f"🚫 An error occurred during search query generation: {str(e)}" 61 | return [TavilyQuery(query=f"Company {state.company}", search_depth="advanced")], msg 62 | 63 | async def run(self, state): 64 | sub_queries, msg = await self.generate_queries(state) 65 | sub_queries.append(TavilyQuery(query=f'{state.company} company', search_depth="advanced", include_domains=['linkedin.com/company'])) 66 | if self.cfg.DEBUG: print(sub_queries) 67 | msg += "🔎 Tavily Searching ...\n" + "\n".join(f'"{query.query}"' for query in sub_queries) 68 | if self.cfg.DEBUG: 69 | print(msg) 70 | research_data = await self.utils.tavily.search(sub_queries, state.research_data) 71 | return {"messages": msg, "search_queries": sub_queries, "research_data": research_data} 72 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/cluster.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | from langchain_core.messages import AnyMessage, AIMessage, SystemMessage, HumanMessage, ToolMessage 5 | 6 | class Cluster(BaseModel): 7 | company_name: str = Field( 8 | ..., 9 | description="The name or identifier of the company these documents belong to." 10 | ) 11 | urls: List[str] = Field( 12 | ..., 13 | description="A list of URLs relevant to the identified company." 
14 | ) 15 | 16 | class Clusters(BaseModel): 17 | clusters: List[Cluster] = Field(default_factory=list, description="List of clusters") 18 | 19 | class ClusterAgent: 20 | def __init__(self, cfg, utils): 21 | self.cfg = cfg 22 | self.utils = utils 23 | 24 | async def cluster(self, state): 25 | target_domain = state.company_url.split("//")[-1].split("/")[0] 26 | 27 | prompt = ( 28 | f"We conducted a search for a company called '{state.company}', but the results may include documents from other companies with similar names or domains.\n" 29 | f"Your task is to accurately categorize these retrieved documents based on which specific company they pertain to, using the initial company information as 'ground truth.'\n\n" 30 | f"### Target Company Information\n" 31 | f"- **Company Name**: '{state.company}'\n" 32 | f"- **Primary Domain**: '{target_domain}'\n" 33 | f"- **Initial Context (Ground Truth)**: Information below should act as a verification baseline. Use it to confirm that the document content aligns directly with {state.company}.\n" 34 | f"- **{json.dumps(state.grounding_data)}**\n\n" 35 | f"### Retrieved Documents for Clustering\n" 36 | f"Below are the retrieved documents, including URLs and brief content snippets:\n" 37 | f"{[{'url': doc['url'], 'snippet': doc['content']} for doc in state.research_data.values()]}\n\n" 38 | f"### Clustering Instructions\n" 39 | f"- **Primary Domain Priority**: Documents with URLs containing '{target_domain}' should be prioritized for the main cluster for '{state.company}'.\n" 40 | f"- **Include Relevant Third-Party Sources**: Documents from third-party domains (e.g., news sites, industry reports) should also be included in the '{state.company}' cluster if they provide specific information about '{state.company}', reference '{target_domain}', or closely match the initial company context.\n" 41 | ) 42 | 43 | if state.include: 44 | prompt += ( 45 | f"- **Trusted Sources Inclusion**: If possible, trusted sources that include the following information should be added to the main cluster:\n" 46 | f"{', '.join(state.include)}.\n" 47 | ) 48 | 49 | prompt += ( 50 | f"- **Separate Similar But Distinct Domains**: Documents from similar but distinct domains (e.g., '{target_domain.replace('.com', '.io')}') should be placed in separate clusters unless they explicitly reference the target domain and align with the company's context.\n" 51 | f"- **Handle Ambiguities Separately**: Documents that lack clear alignment with '{state.company}' should be placed in an 'Ambiguous' cluster for further review.\n\n" 52 | f"### Example Output Format\n" 53 | f"{{\n" 54 | f" 'clusters': [\n" 55 | f" {{\n" 56 | f" 'company_name': 'Name of Company A',\n" 57 | f" 'urls': [\n" 58 | f" 'http://example.com/doc1',\n" 59 | f" 'http://example.com/doc2'\n" 60 | f" ]\n" 61 | f" }},\n" 62 | f" {{\n" 63 | f" 'company_name': 'Name of Company B',\n" 64 | f" 'urls': [\n" 65 | f" 'http://example.com/doc3'\n" 66 | f" ]\n" 67 | f" }},\n" 68 | f" {{\n" 69 | f" 'company_name': 'Ambiguous',\n" 70 | f" 'urls': [\n" 71 | f" 'http://example.com/doc4'\n" 72 | f" ]\n" 73 | f" }}\n" 74 | f" ]\n" 75 | f"}}\n\n" 76 | f"### Key Points\n" 77 | f"- **Focus on Relevant Content**: Documents that contain relevant references to '{state.company}' (even from third-party domains) should be clustered with '{state.company}' if they align well with the initial information and context provided.\n" 78 | f"- **Identify Ambiguities**: Any documents without clear relevance to '{state.company}' should be placed in the 'Ambiguous' 
cluster for manual review.\n" 79 | ) 80 | prompt = prompt[:self.cfg.MAX_PROMPT_LENGTH] 81 | if self.cfg.DEBUG: 82 | print(prompt) 83 | try: 84 | messages = [SystemMessage(content=prompt)] 85 | response = await self.cfg.BASE_LLM.with_structured_output(Clusters).ainvoke(messages) 86 | clusters = response.clusters # Access the structured clusters directly 87 | return clusters, "" 88 | except Exception as e: 89 | msg = f"🚫 Error occurred during clustering: {str(e)}\n" 90 | clusters = [] 91 | return clusters, msg 92 | 93 | # Automatically choose the correct cluster; manual selection support can be added in the future 94 | async def choose_cluster(self, company_url, clusters): 95 | chosen_cluster = 0 96 | msg = "" 97 | for index, cluster in enumerate(clusters): 98 | # Check if any URL in the cluster contains the company URL 99 | if any(company_url in url for url in cluster.urls): 100 | chosen_cluster = index 101 | break 102 | if clusters: 103 | cluster = clusters[chosen_cluster] 104 | msg = f"Automatically selected cluster: {cluster.company_name} with the following URLs: {cluster.urls}\n" 105 | return chosen_cluster, msg 106 | 107 | 108 | async def run(self, state): 109 | msg = "📊 Beginning clustering process...\n" 110 | if self.cfg.DEBUG: 111 | print(msg) 112 | clusters, cluster_msg = await self.cluster(state) 113 | if self.cfg.DEBUG: 114 | print(cluster_msg) 115 | chosen_cluster, choose_msg = await self.choose_cluster(state.company_url, clusters) 116 | if self.cfg.DEBUG: 117 | print(choose_msg) 118 | return {"clusters": clusters, "chosen_cluster": chosen_cluster, "messages": msg + cluster_msg + choose_msg} 119 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/write.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from langchain_core.messages import AnyMessage, AIMessage, SystemMessage, HumanMessage, ToolMessage 3 | 4 | 5 | class WriteAgent: 6 | def __init__(self, cfg, utils): 7 | self.cfg = cfg 8 | self.utils = utils 9 | 10 | async def run(self, state): 11 | report_title = f"{state.company} Company Report" 12 | report_date = datetime.now().strftime('%B %d, %Y') 13 | 14 | prompt = ( 15 | f"You are an expert company researcher tasked with writing a fact-based report on recent developments for the company **{state.company}**. " 16 | f"Write the report in Markdown format. DO NOT change the titles. 
Each section must be written in well-structured paragraphs, not lists or bullet points.\n" 17 | f"Ensure the report includes:\n" 18 | f"- **Inline citations** as Markdown hyperlinks directly in the main sections (e.g., Company X is an innovative leader in AI ([LinkedIn](https://linkedin.com))).\n" 19 | f"- A **Citations Section** at the end that lists all URLs used.\n\n" 20 | f"### Strict Guidelines:\n" 21 | f"- You must only use the information provided in the documents listed below.\n" 22 | f"- Do not make up or infer any details that are not explicitly stated in the provided sources.\n" 23 | f"- If a required data point (e.g., employee count, financial figures) is not available in the provided documents, state that it is unavailable.\n" 24 | f"- As of today, **{report_date}**, prioritize the most recent and updated source in cases where conflicting data points or metrics are found.\n" 25 | ) 26 | 27 | if state.include: 28 | prompt += ( 29 | f"- Ensure the report includes the following user-requested information, if available: " 30 | f"{', '.join(state.include)}.\n" 31 | ) 32 | 33 | prompt += ( 34 | "- Make sure to support specific data points and metrics included in the report with in-text Markdown hyperlink citations.\n\n" 35 | f"### Report Structure:\n" 36 | f"Title: {report_title}\n" 37 | f"Date: {report_date}\n" 38 | f"1. **Executive Summary**:\n" 39 | f" - High-level overview of the company, its services, location, employee count, and achievements.\n" 40 | f" - Make sure to include the general information necessary to understand the company well, including any notable achievements.\n\n" 41 | f"2. **Leadership and Vision**:\n" 42 | f" - Details on the CEO and key team members, their experience, and alignment with company goals.\n" 43 | f" - Any personnel changes and their strategic impact.\n\n" 44 | f"3. **Product and Service Overview**:\n" 45 | f" - Summary of current products/services, features, updates, and market fit.\n" 46 | f" - Include details from the company's website, tools, or new integrations.\n\n" 47 | f"4. **Financial Performance**:\n" 48 | f" - For public companies: key metrics (e.g., revenue, market cap).\n" 49 | f" - For startups: funding rounds, investors, and milestones.\n\n" 50 | f"5. **Recent Developments**:\n" 51 | f" - New product enhancements, partnerships, competitive moves, or market entries.\n\n" 52 | f"6. **Competitive Landscape**:\n" 53 | f" - Overview of major competitors and their positioning in the market.\n" 54 | f" - Compare key differentiators, market share, pricing, and product/service features.\n" 55 | f" - Include relevant competitor developments that impact the company’s strategy.\n\n" 56 | ) 57 | if state.include: 58 | prompt += ( 59 | f"7. (Optional) **Additional Information**:\n" 60 | f" - Attempt to fit the user-requested information into the predefined sections above, where relevant.\n" 61 | f" - ONLY if the information does not fit into ANY section, include that remaining information here.\n" 62 | f" - AVOID including user-requested information in multiple sections. For example, if the user requests that the report include the company CEO's name, it should be mentioned ONLY in the **Leadership and Vision** section and not repeated here.\n" 63 | f" - Present the information in well-structured paragraphs, not lists or bullet points.\n\n" 64 | ) 65 | 66 | prompt += ( 67 | f"{'8' if state.include else '7'}. 
**Citations**:\n" 68 | f" - Ensure every source cited in the report is listed in the text as Markdown hyperlinks.\n" 69 | f" - Also include a list of all URLs as Markdown hyperlinks in this section.\n\n" 70 | ) 71 | 72 | # Dynamically generate the "Documents to Base the Report On" section 73 | if state.clusters: 74 | # Use cluster-specific research data 75 | documents = "\n".join( 76 | f"- {state.research_data[key]}" 77 | for key in state.clusters[state.chosen_cluster].urls 78 | if key in state.research_data 79 | ) 80 | prompt += ( 81 | f"### Documents to Base the Report On:\n" 82 | f"Use the following cluster-specific documents to write the report:\n" 83 | f"{documents}" 84 | ) 85 | else: 86 | # Use all available research data 87 | grounding_data_content = "\n".join(f"- {item}" for item in state.grounding_data.values()) 88 | research_data_content = "\n".join(f"- {item}" for item in state.research_data.values()) 89 | prompt += ( 90 | f"### Documents to Base the Report On:\n" 91 | f"#### Official Grounding Data:\n" 92 | f"The following is official data sourced from the company's website and should be used as a primary reference:\n" 93 | f"{grounding_data_content}\n\n" 94 | f"#### Additional Research Data:\n" 95 | f"Select and prioritize the most relevant sources to ensure alignment with the target company.\n" 96 | f"{research_data_content}" 97 | ) 98 | 99 | prompt = prompt[:self.cfg.MAX_PROMPT_LENGTH] 100 | if self.cfg.DEBUG: 101 | print(prompt) 102 | 103 | try: 104 | messages = [SystemMessage(content=prompt)] 105 | response = await self.cfg.FACTUAL_LLM.ainvoke(messages) 106 | report = response.content 107 | return {"report": report} 108 | except Exception as e: 109 | msg = f"🚫 Error generating report: {str(e)}" 110 | if self.cfg.DEBUG: 111 | print(msg) 112 | return {"messages": msg} 113 | --------------------------------------------------------------------------------
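To sanity-check the conditional routing without running the full pipeline, the routers in `router.py` can be exercised in isolation. The script below is a hypothetical example (not a file in this repository); `SimpleNamespace` stands in for the real `ResearchState`, since the routers only read the `clusters` attribute:

/examples/router_smoke_test.py (hypothetical):
--------------------------------------------------------------------------------
from types import SimpleNamespace

from company_researcher.nodes.cluster import Cluster
from company_researcher.router import cluster_router, rerank_router

# A state with at least one cluster proceeds to document enrichment.
clustered = SimpleNamespace(clusters=[Cluster(company_name="Tavily", urls=["https://tavily.com/"])])
assert cluster_router(clustered) == "enrich"
assert rerank_router(clustered) == "enrich"

# With no clusters, 'cluster' falls back to 'rerank' for reevaluation,
# while 'rerank' skips enrichment and goes straight to 'write'.
empty = SimpleNamespace(clusters=[])
assert cluster_router(empty) == "rerank"
assert rerank_router(empty) == "write"
--------------------------------------------------------------------------------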