├── graph.png ├── src └── company_researcher │ ├── __init__.py │ ├── utils │ ├── all.py │ └── tavily_utils.py │ ├── nodes │ ├── __init__.py │ ├── enrich.py │ ├── ground.py │ ├── rerank.py │ ├── research.py │ ├── cluster.py │ └── write.py │ ├── config.py │ ├── router.py │ ├── graph.py │ └── state.py ├── .env.example ├── langgraph.json ├── README.md ├── pyproject.toml └── .gitignore /graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tavily-ai/tavily_company_researcher/HEAD/graph.png -------------------------------------------------------------------------------- /src/company_researcher/__init__.py: -------------------------------------------------------------------------------- 1 | from company_researcher.graph import graph 2 | 3 | __all__ = ["graph"] -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY="your-API-key" 2 | TAVILY_API_KEY="your-API-key" 3 | COHERE_API_KEY="your-API-key" -------------------------------------------------------------------------------- /src/company_researcher/utils/all.py: -------------------------------------------------------------------------------- 1 | from .tavily_utils import Tavily 2 | 3 | class Utils: 4 | def __init__(self): 5 | self.tavily = Tavily() -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": ["."], 3 | "graphs": { 4 | "agent": "./src/company_researcher/graph.py:graph" 5 | }, 6 | "env": ".env" 7 | } 8 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | from .ground import GroundAgent 2 | from .research import ResearchAgent 3 | from .cluster import ClusterAgent 4 | from .rerank import RerankAgent 5 | from .enrich import EnrichAgent 6 | from .write import WriteAgent -------------------------------------------------------------------------------- /src/company_researcher/config.py: -------------------------------------------------------------------------------- 1 | from langchain_openai import ChatOpenAI 2 | 3 | # Description: Configuration file 4 | class Config: 5 | def __init__(self): 6 | """ 7 | Initializes the configuration for the agent 8 | """ 9 | self.MAX_SEARCH_QUERIES = 6 10 | self.DEFAULT_CLUSTER_SIZE = 10 11 | self.RERANK_TIMEOUT = 3 12 | self.MAX_PROMPT_LENGTH = 350000 13 | self.MAX_DOC_LENGTH = 8000 14 | self.BASE_LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=2000) 15 | self.FACTUAL_LLM = ChatOpenAI(model="gpt-4o-mini", temperature=0.0, max_tokens=2000) 16 | self.DEBUG = False -------------------------------------------------------------------------------- /src/company_researcher/router.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | def cluster_router(state) -> Literal["enrich", "rerank"]: 4 | """Routes the workflow after the 'cluster' step. 5 | 6 | If no clusters are formed, it falls back to 'rerank' for reevaluation.""" 7 | if state.clusters: 8 | return "enrich" 9 | else: 10 | return "rerank" 11 | 12 | def rerank_router(state) -> Literal["enrich", "write"]: 13 | """Routes the workflow after the 'rerank' step. 
14 | 15 | If no clusters are formed even after reranking, it skips to the 'write' step without enriching the documents.""" 16 | if state.clusters: 17 | return "enrich" 18 | else: 19 | return "write" 20 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/enrich.py: -------------------------------------------------------------------------------- 1 | class EnrichAgent: 2 | def __init__(self, cfg, utils): 3 | self.cfg = cfg 4 | self.utils = utils 5 | 6 | async def run(self, state): 7 | clusters = state.clusters 8 | chosen_cluster = clusters[state.chosen_cluster] 9 | msg = f"🚀 Enriching documents for selected cluster '{chosen_cluster.company_name}'...\n" 10 | if self.cfg.DEBUG: 11 | print(msg) 12 | research_data, extract_msg = await self.utils.tavily.extract(chosen_cluster.urls, state.research_data) 13 | if self.cfg.DEBUG: 14 | print(extract_msg) 15 | return {"research_data": research_data, "messages": msg + extract_msg} 16 | 17 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/ground.py: -------------------------------------------------------------------------------- 1 | class GroundAgent: 2 | def __init__(self, cfg, utils): 3 | self.cfg = cfg 4 | self.utils = utils 5 | 6 | async def run(self, state): 7 | msg = f"🔗 Initiating grounding for company '{state.company}'...\n" 8 | if self.cfg.DEBUG: 9 | print(msg) 10 | grounding_data, extract_msg = await self.utils.tavily.extract([state.company_url], state.grounding_data) 11 | if self.cfg.DEBUG: 12 | print(extract_msg) 13 | if not grounding_data: 14 | grounding_data, extract_msg = await self.utils.tavily.extract([state.company_url], state.grounding_data, 15 | extract_depth="advanced") 16 | if self.cfg.DEBUG: 17 | print("Used advanced grounding") 18 | return {"grounding_data": grounding_data, "messages": msg + extract_msg} 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tavily Company Research Agent 2 | 3 | This agent automates company research by leveraging Tavily to retrieve accurate, up-to-date data. You can also use the "include" argument to customize the report by providing a list of specific details you want to include. For example, you can request details such as "Company's CEO", "Location of Headquarters", or other specific information. 4 | 5 | ## Key Steps 6 | 7 | 1. **🔗 Grounding**: Establishes the website URL as a trusted baseline for all research efforts. 8 | 2. **🔎 Searching**: Collects a wide range of relevant data from various online sources, including **trusted sources like LinkedIn**, to ensure accuracy and reliability. 9 | 3. **📊 Clustering**: Organizes the collected data into clusters, picking the most relevant one. This is especially handy for companies with similar names or limited online visibility. 10 | 4. **🚀 Extraction**: Enriches documents in the chosen cluster. 11 | 5. **📝 Generation**: Creates a detailed company report. 12 | 13 | Our goal is to provide you with a practical tool that helps you effortlessly and efficiently gather meaningful insights on any company. 
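## ⚙️ Quickstart

The snippet below is a minimal usage sketch, not an official entrypoint: the `graph` export and the input fields come from `src/company_researcher/__init__.py` and `state.py`, while the company values are placeholders. It assumes the package is installed (e.g. `pip install -e .`) and that the API keys from `.env.example` are exported in your environment:

```python
import asyncio

from company_researcher import graph


async def main():
    # InputState fields: company (required), company_url (required), include (optional)
    result = await graph.ainvoke({
        "company": "Tavily",
        "company_url": "https://tavily.com/",
        "include": ["Name of the CEO", "Number of employees"],
    })
    print(result["report"])


asyncio.run(main())
```

Since `langgraph.json` registers the compiled graph under the name `agent`, the same workflow can also be served via the LangGraph CLI or Studio without extra wiring.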
14 | 15 | ## 🔍 Workflow Overview 16 | 17 | ![Workflow Graph](graph.png) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "tavily_company_researcher" 3 | version = "0.0.1" 4 | description = "Starter template for a company research agent using Tavily and LangGraph." 5 | authors = [ 6 | { name = "Tavily", email = "support@tavily.com" }, 7 | ] 8 | readme = "README.md" 9 | license = { text = "MIT" } 10 | requires-python = ">=3.11" 11 | dependencies = [ 12 | "langchain-core", 13 | "langchain-openai", 14 | "langgraph", 15 | "tavily-python", 16 | "pydantic", 17 | "cohere" 18 | ] 19 | 20 | [project.optional-dependencies] 21 | dev = ["mypy>=1.11.1", "ruff>=0.6.1"] 22 | 23 | [build-system] 24 | requires = ["setuptools>=73.0.0", "wheel"] 25 | build-backend = "setuptools.build_meta" 26 | 27 | [tool.setuptools] 28 | packages = ["tavily.templates.company_researcher", "company_researcher"] 29 | [tool.setuptools.package-dir] 30 | "tavily.templates.company_researcher" = "src/company_researcher" 31 | "company_researcher" = "src/company_researcher" 32 | 33 | 34 | [tool.setuptools.package-data] 35 | "*" = ["py.typed"] 36 | 37 | [tool.ruff] 38 | lint.select = [ 39 | "E", # pycodestyle 40 | "F", # pyflakes 41 | "I", # isort 42 | "D", # pydocstyle 43 | "D401", # First line should be in imperative mood 44 | "T201", 45 | "UP", 46 | ] 47 | lint.ignore = [ 48 | "UP006", 49 | "UP007", 50 | # We actually do want to import from typing_extensions 51 | "UP035", 52 | # Relax the convention by _not_ requiring documentation for every function parameter. 53 | "D417", 54 | "E501", 55 | ] 56 | [tool.ruff.lint.per-file-ignores] 57 | "tests/*" = ["D", "UP"] 58 | [tool.ruff.lint.pydocstyle] 59 | convention = "google" 60 | 61 | -------------------------------------------------------------------------------- /src/company_researcher/graph.py: -------------------------------------------------------------------------------- 1 | from langgraph.graph import StateGraph, END 2 | 3 | from company_researcher.config import Config 4 | from company_researcher.state import InputState, OutputState, ResearchState 5 | from company_researcher.nodes import GroundAgent, ResearchAgent, ClusterAgent, RerankAgent, EnrichAgent, WriteAgent 6 | from company_researcher.utils.all import Utils 7 | from company_researcher.router import cluster_router, rerank_router 8 | 9 | 10 | cfg = Config() 11 | utils = Utils() 12 | 13 | # Initialize agents 14 | ground_agent = GroundAgent(cfg, utils) 15 | research_agent = ResearchAgent(cfg, utils) 16 | cluster_agent = ClusterAgent(cfg, utils) 17 | rerank_agent = RerankAgent(cfg, utils) 18 | enrich_agent = EnrichAgent(cfg, utils) 19 | write_agent = WriteAgent(cfg, utils) 20 | 21 | # Define the LangGraph workflow 22 | workflow = StateGraph(ResearchState, input=InputState, output=OutputState) 23 | 24 | # Add a node for each agent 25 | workflow.add_node('ground', ground_agent.run) 26 | workflow.add_node('research', research_agent.run) 27 | workflow.add_node('cluster', cluster_agent.run) 28 | workflow.add_node('rerank', rerank_agent.run) 29 | workflow.add_node('enrich', enrich_agent.run) 30 | workflow.add_node('write', write_agent.run) 31 | 32 | # Set up edges 33 | workflow.add_edge('ground', 'research') 34 | workflow.add_edge('research', 'cluster') 35 | workflow.add_conditional_edges('cluster', cluster_router) 36 | workflow.add_conditional_edges('rerank', rerank_router) 37 | 
workflow.add_edge('enrich', 'write') 38 | workflow.add_edge('write', END) 39 | 40 | # Set up start node 41 | workflow.set_entry_point('ground') 42 | 43 | graph = workflow.compile() 44 | graph.name = "Tavily Company Researcher" 45 | -------------------------------------------------------------------------------- /src/company_researcher/state.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import Dict, Union, List, Annotated 3 | from langchain_core.messages import AnyMessage 4 | from langgraph.graph import add_messages 5 | 6 | from company_researcher.nodes.cluster import Cluster 7 | from company_researcher.utils.tavily_utils import TavilySearchInput, TavilyQuery 8 | 9 | class InputState(BaseModel): 10 | company: str = Field( 11 | description="The name of the company to research", 12 | examples=["Tavily"], 13 | ) 14 | company_url: str = Field( 15 | description="The official website URL of the company.", 16 | examples=["https://tavily.com/"], 17 | ) 18 | include: list[str] = Field( 19 | description=( 20 | "Optional list specifying information to include in the company research report, " 21 | "such as the company's official website URL, LinkedIn profile URL, headquarters location, " 22 | "number of employees, CEO's name, and more." 23 | ), 24 | examples=[ 25 | "Company's official website URL", 26 | "Company's LinkedIn profile URL", 27 | "Location of headquarters formatted as City, State (e.g. San Francisco, CA)", 28 | "Number of employees", 29 | "Name of the CEO" 30 | ], 31 | default_factory=list 32 | ) 33 | 34 | 35 | class OutputState(BaseModel): 36 | report: str = "" 37 | 38 | class ResearchState(InputState, OutputState): 39 | grounding_data: Dict[str, Dict[str, Union[str, None]]] = Field(default_factory=dict) 40 | research_data: Dict[str, Dict[str, Union[str, float, None]]] = Field(default_factory=dict) 41 | clusters: List[Cluster] = Field(default_factory=list) 42 | chosen_cluster: int = Field(default=0) 43 | search_queries: List[TavilyQuery] = Field(default_factory=list) 44 | messages: Annotated[List[AnyMessage], add_messages] = Field(default_factory=list) 45 | 46 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/rerank.py: -------------------------------------------------------------------------------- 1 | import cohere 2 | import asyncio 3 | 4 | from company_researcher.nodes.cluster import Cluster 5 | 6 | 7 | class RerankAgent: 8 | def __init__(self, cfg, utils): 9 | self.cfg = cfg 10 | self.utils = utils 11 | self.co = cohere.AsyncClient() 12 | 13 | async def rerank_documents(self, query, documents, top_n, timeout): 14 | """Performs reranking of documents using Cohere.""" 15 | try: 16 | response = await asyncio.wait_for( 17 | self.co.rerank( # NOTE: depending on your cohere SDK version, an explicit model="..." argument may be required here 18 | query=query, 19 | documents=documents, 20 | top_n=top_n, 21 | return_documents=False, 22 | ), 23 | timeout=timeout, 24 | ) 25 | return response.results 26 | except asyncio.TimeoutError: 27 | raise TimeoutError("Timeout occurred during reranking") 28 | except Exception as e: 29 | raise RuntimeError(f"Unexpected error during reranking: {e}") 30 | 31 | def create_cluster(self, company_name, urls): 32 | """Creates a Cluster object from company name and URLs.""" 33 | return Cluster( 34 | company_name=company_name, 35 | urls=urls, 36 | ) 37 | 38 | async def run(self, state): 39 | """Main method to rerank research data and create clusters.""" 40 | msg = "🔄 Reranking research data...\n" 41 | data 
= list(state.research_data.values()) 42 | 43 | try: 44 | # Perform reranking 45 | query = f"Company {state.company}" 46 | if state.include: 47 | query += " " + " ".join(state.include) 48 | if self.cfg.DEBUG: 49 | print("Reranking query: ", query) 50 | documents = [result["content"] for result in data] 51 | top_n = self.cfg.DEFAULT_CLUSTER_SIZE 52 | timeout = self.cfg.RERANK_TIMEOUT 53 | 54 | rerank_results = await self.rerank_documents( 55 | query=query, 56 | documents=documents, 57 | top_n=top_n, 58 | timeout=timeout, 59 | ) 60 | 61 | # Process results 62 | urls = [] 63 | msg += "Top documents selected:\n" 64 | for r in rerank_results: 65 | original_result = data[r.index] 66 | msg += f"{original_result['url']}\n" 67 | urls.append(original_result["url"]) 68 | 69 | # Create and return cluster 70 | cluster = self.create_cluster(state.company, urls) 71 | return {"clusters": [cluster], "messages": msg} 72 | 73 | except TimeoutError: 74 | return {"messages": "🚫 Timeout occurred while reranking research data"} 75 | except RuntimeError as e: 76 | return {"messages": f"🚫 Error during reranking: {e}"} 77 | except Exception as e: 78 | return {"messages": f"🚫 Unexpected error during reranking: {e}"} 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | .idea/ 163 | 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /src/company_researcher/utils/tavily_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from datetime import datetime 3 | from tavily import AsyncTavilyClient 4 | from typing import List, Optional 5 | from pydantic import BaseModel, Field 6 | from company_researcher.config import Config 7 | cfg = Config() 8 | 9 | # Define Tavily's arguments to tailor the search results 10 | class TavilyQuery(BaseModel): 11 | query: str = Field(description="Web search query") 12 | search_depth: str = Field(description="The depth of the search, available options: 'basic', 'advanced'") 13 | time_range: Optional[str] = Field(default=None, description="(Optional) Filters results by time range, useful when looking for sources like magazines and articles. Available options: 'day', 'week', 'month', 'year'.") 14 | include_domains: Optional[List[str]] = Field(default=None, 15 | description="List of domains to include in the research. 
Useful when trying to gather information from trusted and relevant domains") 16 | 17 | class TavilySearchInput(BaseModel): 18 | sub_queries: List[TavilyQuery] = Field(description="Set of web search queries that can be answered in isolation") 19 | 20 | 21 | class Tavily: 22 | def __init__(self): 23 | self.client = AsyncTavilyClient() 24 | 25 | async def extract(self, urls: list[str], sources_dict: dict, extract_depth="basic"): 26 | msg = "" 27 | 28 | async def process_batch(url_batch): 29 | batch_msg = "" 30 | try: 31 | response = await self.client.extract(urls=url_batch, extract_depth=extract_depth) 32 | for itm in response['results']: 33 | url = itm['url'] 34 | raw_content = itm['raw_content'] 35 | if len(raw_content) > cfg.MAX_DOC_LENGTH: 36 | raw_content = raw_content[:cfg.MAX_DOC_LENGTH] + " [...]" 37 | if cfg.DEBUG: 38 | print(f"Content from {url} was truncated to the maximum allowed length ({cfg.MAX_DOC_LENGTH} characters). Current length: {len(raw_content)}\nPreview:\n{raw_content}") 39 | if url in sources_dict: 40 | sources_dict[url]['raw_content'] = raw_content 41 | else: 42 | sources_dict[url] = {'raw_content': raw_content} 43 | batch_msg += f"{url}\n" 44 | return batch_msg 45 | except Exception as e: 46 | return f"Error occurred during Tavily Extract request for batch: {e}\n" 47 | 48 | # Split URLs into batches of 20 49 | url_batches = [urls[i:i + 20] for i in range(0, len(urls), 20)] 50 | 51 | # Process all batches in parallel 52 | results = await asyncio.gather(*[process_batch(batch) for batch in url_batches]) 53 | 54 | # Collect messages from all batches 55 | if results: 56 | msg += "Extracted raw content for:\n" + "".join(results) 57 | 58 | return sources_dict, msg 59 | 60 | async def search(self, sub_queries: List[TavilyQuery], sources_dict: dict): 61 | """ 62 | Perform searches for each sub-query using the Tavily Search concurrently. 63 | 64 | :param sub_queries: List of search queries. 65 | :param sources_dict: Dictionary to store unique search results, keyed by URL. 
66 | """ 67 | 68 | # Define a coroutine function to perform a single search with error handling 69 | async def perform_search(query): 70 | try: 71 | print(query) 72 | # Add date to the query as we need the most recent results 73 | # query_with_date = f"{query.query} {datetime.now().strftime('%m-%Y')}" 74 | tavily_response = await self.client.search(query=query.query, topic="general", search_depth=query.search_depth, time_range=query.time_range, include_domains=query.include_domains, max_results=10) 75 | return tavily_response['results'] 76 | except Exception as e: 77 | # Handle any exceptions, log them, and return an empty list 78 | if cfg.DEBUG: 79 | print(f"Error occurred during search for query '{query}': {str(e)}") 80 | return [] 81 | 82 | # Run all the search tasks in parallel 83 | search_tasks = [perform_search(itm) for itm in sub_queries] 84 | search_responses = await asyncio.gather(*search_tasks) 85 | 86 | # Combine the results from all the responses and update the sources_dict 87 | for response in search_responses: 88 | for result in response: 89 | url = result.get("url") 90 | if url and url not in sources_dict: 91 | # Add the result to sources_dict if the URL is not already present 92 | sources_dict[url] = result 93 | 94 | return sources_dict -------------------------------------------------------------------------------- /src/company_researcher/nodes/research.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, Field 2 | from typing import List, Optional 3 | from langchain_core.messages import AnyMessage, AIMessage, SystemMessage, HumanMessage, ToolMessage 4 | from company_researcher.utils.tavily_utils import TavilySearchInput, TavilyQuery 5 | 6 | class ResearchAgent: 7 | def __init__(self, cfg, utils): 8 | self.cfg = cfg 9 | self.utils = utils 10 | 11 | async def generate_queries(self, state): 12 | try: 13 | msg = f"🤔 Generating search queries based on grounding data...\n" 14 | if self.cfg.DEBUG: 15 | print(msg) 16 | prompt = ( 17 | f"You are an expert company researcher specializing in generating company analysis reports.\n" 18 | f"Your task is to generate up to {self.cfg.MAX_SEARCH_QUERIES} precise **web search queries** to thoroughly understand the company: '{state.company}'.\n\n" 19 | f"### Key Areas to Explore:\n" 20 | f"- **Company Background**: Focus on keywords such as history, mission, headquarters, CEO, leadership team, and number of employees.\n" 21 | f"- **Products and Services**: Search for offerings like main products, unique features, customer segments, and market differentiation.\n" 22 | f"- **Market Position**: Use terms like market competition, industry ranking, competitive landscape, market reach, and impact.\n" 23 | f"- **Financials**: Look for information on funding rounds, revenue, financial growth, recent investments, and performance metrics.\n\n" 24 | ) 25 | 26 | if state.include: 27 | prompt += ( 28 | f"### Required Information to Include:\n" 29 | f"- You are tasked with ensuring the following specific types of information are covered in the report, as specified by the user:\n" 30 | f"{', '.join(state.include)}\n" 31 | # f"- Prioritize missing information: Check the grounding data and identify any missing elements from the required information to include.\n" 32 | f"- Generate a search query only for the information that is missing from the provided grounding data.\n" 33 | ) 34 | 35 | prompt += ( 36 | f"### Grounding Data:\n" 37 | f"Use the grounding data provided from the company's 
website below to ensure queries are closely tied to **{state.company}** and reflect its latest context:\n" 38 | f"{state.grounding_data}\n\n" 39 | f"### Additional Guidance:\n" 40 | ) 41 | 42 | # if state.include: 43 | # prompt += ( 44 | # f"- Prioritize missing information: Check the grounding data and identify any missing elements from the required information to include.\n" 45 | # ) 46 | 47 | prompt += ( 48 | f"- Ensure each query incorporates **specific keywords** derived from the grounding data, such as the company's name, key products or services, leadership titles, geographical locations, and other unique identifiers, to allow the search engine to retrieve the most relevant sources specific to the company you are researching.\n" 49 | f"- **Limit each query to 100 characters or fewer** to ensure clarity and search engine compatibility.\n" 50 | f"- Structure queries to focus on specific aspects of the company, such as \"{state.company} number of employees\" or \"{state.company} market competition.\"\n" 51 | f"- Avoid redundancy: Each query should focus on unique information to retrieve relevant details efficiently. For example, there should be only one query to search for the company's CEO name.\n" 52 | 53 | ) 54 | if self.cfg.DEBUG: 55 | print(prompt) 56 | messages = [SystemMessage(content=prompt)] 57 | response = await self.cfg.BASE_LLM.with_structured_output(TavilySearchInput).ainvoke(messages) 58 | return response.sub_queries, msg 59 | except Exception as e: 60 | msg = f"🚫 An error occurred during search query generation: {str(e)}" 61 | return [TavilyQuery(query=f"Company {state.company}", search_depth="advanced")], msg 62 | 63 | async def run(self, state): 64 | sub_queries, msg = await self.generate_queries(state) 65 | sub_queries.append(TavilyQuery(query=f'{state.company} company', search_depth="advanced", include_domains=['linkedin.com/company'])) 66 | if self.cfg.DEBUG: print(sub_queries) 67 | msg += "🔎 Tavily Searching ...\n" + "\n".join(f'"{query.query}"' for query in sub_queries) 68 | if self.cfg.DEBUG: 69 | print(msg) 70 | research_data = await self.utils.tavily.search(sub_queries, state.research_data) 71 | return {"messages": msg, "search_queries": sub_queries, "research_data": research_data} 72 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/cluster.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | from langchain_core.messages import AnyMessage, AIMessage, SystemMessage, HumanMessage, ToolMessage 5 | 6 | class Cluster(BaseModel): 7 | company_name: str = Field( 8 | ..., 9 | description="The name or identifier of the company these documents belong to." 10 | ) 11 | urls: List[str] = Field( 12 | ..., 13 | description="A list of URLs relevant to the identified company." 
14 | ) 15 | 16 | class Clusters(BaseModel): 17 | clusters: List[Cluster] = Field(default_factory=list, description="List of clusters") 18 | 19 | class ClusterAgent: 20 | def __init__(self, cfg, utils): 21 | self.cfg = cfg 22 | self.utils = utils 23 | 24 | async def cluster(self, state): 25 | target_domain = state.company_url.split("//")[-1].split("/")[0] 26 | 27 | prompt = ( 28 | f"We conducted a search for a company called '{state.company}', but the results may include documents from other companies with similar names or domains.\n" 29 | f"Your task is to accurately categorize these retrieved documents based on which specific company they pertain to, using the initial company information as 'ground truth.'\n\n" 30 | f"### Target Company Information\n" 31 | f"- **Company Name**: '{state.company}'\n" 32 | f"- **Primary Domain**: '{target_domain}'\n" 33 | f"- **Initial Context (Ground Truth)**: Information below should act as a verification baseline. Use it to confirm that the document content aligns directly with {state.company}.\n" 34 | f"- **{json.dumps(state.grounding_data)}**\n\n" 35 | f"### Retrieved Documents for Clustering\n" 36 | f"Below are the retrieved documents, including URLs and brief content snippets:\n" 37 | f"{[{'url': doc['url'], 'snippet': doc['content']} for doc in state.research_data.values()]}\n\n" 38 | f"### Clustering Instructions\n" 39 | f"- **Primary Domain Priority**: Documents with URLs containing '{target_domain}' should be prioritized for the main cluster for '{state.company}'.\n" 40 | f"- **Include Relevant Third-Party Sources**: Documents from third-party domains (e.g., news sites, industry reports) should also be included in the '{state.company}' cluster if they provide specific information about '{state.company}', reference '{target_domain}', or closely match the initial company context.\n" 41 | ) 42 | 43 | if state.include: 44 | prompt += ( 45 | f"- **Trusted Sources Inclusion**: If possible, trusted sources that include the following information should be added to the main cluster:\n" 46 | f"{', '.join(state.include)}.\n" 47 | ) 48 | 49 | prompt += ( 50 | f"- **Separate Similar But Distinct Domains**: Documents from similar but distinct domains (e.g., '{target_domain.replace('.com', '.io')}') should be placed in separate clusters unless they explicitly reference the target domain and align with the company's context.\n" 51 | f"- **Handle Ambiguities Separately**: Documents that lack clear alignment with '{state.company}' should be placed in an 'Ambiguous' cluster for further review.\n\n" 52 | f"### Example Output Format\n" 53 | f"{{\n" 54 | f" 'clusters': [\n" 55 | f" {{\n" 56 | f" 'company_name': 'Name of Company A',\n" 57 | f" 'urls': [\n" 58 | f" 'http://example.com/doc1',\n" 59 | f" 'http://example.com/doc2'\n" 60 | f" ]\n" 61 | f" }},\n" 62 | f" {{\n" 63 | f" 'company_name': 'Name of Company B',\n" 64 | f" 'urls': [\n" 65 | f" 'http://example.com/doc3'\n" 66 | f" ]\n" 67 | f" }},\n" 68 | f" {{\n" 69 | f" 'company_name': 'Ambiguous',\n" 70 | f" 'urls': [\n" 71 | f" 'http://example.com/doc4'\n" 72 | f" ]\n" 73 | f" }}\n" 74 | f" ]\n" 75 | f"}}\n\n" 76 | f"### Key Points\n" 77 | f"- **Focus on Relevant Content**: Documents that contain relevant references to '{state.company}' (even from third-party domains) should be clustered with '{state.company}' if they align well with the initial information and context provided.\n" 78 | f"- **Identify Ambiguities**: Any documents without clear relevance to '{state.company}' should be placed in the 'Ambiguous' 
cluster for manual review.\n" 79 | ) 80 | prompt = prompt[:self.cfg.MAX_PROMPT_LENGTH] 81 | if self.cfg.DEBUG: 82 | print(prompt) 83 | try: 84 | messages = [SystemMessage(content=prompt)] 85 | response = await self.cfg.BASE_LLM.with_structured_output(Clusters).ainvoke(messages) 86 | clusters = response.clusters # Access the structured clusters directly 87 | return clusters, "" 88 | except Exception as e: 89 | msg = f"🚫 Error occurred during clustering: {str(e)}\n" 90 | clusters = [] 91 | return clusters, msg 92 | 93 | # Automatically choose the correct cluster; manual selection support can be added in the future 94 | async def choose_cluster(self, company_url, clusters): 95 | chosen_cluster = 0 96 | msg = "" 97 | for index, cluster in enumerate(clusters): 98 | # Check if any URL in the cluster contains the company URL 99 | if any(company_url in url for url in cluster.urls): 100 | chosen_cluster = index 101 | break 102 | if clusters: 103 | cluster = clusters[chosen_cluster] 104 | msg = f"Automatically selected cluster: {cluster.company_name} with the following URLs: {cluster.urls}\n" 105 | return chosen_cluster, msg 106 | 107 | 108 | async def run(self, state): 109 | msg = "📊 Beginning clustering process...\n" 110 | if self.cfg.DEBUG: 111 | print(msg) 112 | clusters, cluster_msg = await self.cluster(state) 113 | if self.cfg.DEBUG: 114 | print(cluster_msg) 115 | chosen_cluster, choose_msg = await self.choose_cluster(state.company_url, clusters) 116 | if self.cfg.DEBUG: 117 | print(choose_msg) 118 | return {"clusters": clusters, "chosen_cluster": chosen_cluster, "messages": msg + cluster_msg + choose_msg} 119 | -------------------------------------------------------------------------------- /src/company_researcher/nodes/write.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from langchain_core.messages import AnyMessage, AIMessage, SystemMessage, HumanMessage, ToolMessage 3 | 4 | 5 | class WriteAgent: 6 | def __init__(self, cfg, utils): 7 | self.cfg = cfg 8 | self.utils = utils 9 | 10 | async def run(self, state): 11 | report_title = f"{state.company} Company Report" 12 | report_date = datetime.now().strftime('%B %d, %Y') 13 | 14 | prompt = ( 15 | f"You are an expert company researcher tasked with writing a fact-based report on recent developments for the company **{state.company}**. " 16 | f"Write the report in Markdown format. DO NOT change the titles. 
Each section must be written in well-structured paragraphs, not lists or bullet points.\n" 17 | f"Ensure the report includes:\n" 18 | f"- **Inline citations** as Markdown hyperlinks directly in the main sections (e.g., Company X is an innovative leader in AI ([LinkedIn](https://linkedin.com))).\n" 19 | f"- A **Citations Section** at the end that lists all URLs used.\n\n" 20 | f"### Strict Guidelines:\n" 21 | f"- You must only use the information provided in the documents listed below.\n" 22 | f"- Do not make up or infer any details that are not explicitly stated in the provided sources.\n" 23 | f"- If a required data point (e.g., employee count, financial figures) is not available in the provided documents, state that it is unavailable.\n" 24 | f"- As of today, **{report_date}**, prioritize the most recent and updated source in cases where conflicting data points or metrics are found.\n" 25 | ) 26 | 27 | if state.include: 28 | prompt += ( 29 | f"- Ensure the report includes the following user-requested information, if available: " 30 | f"{', '.join(state.include)}.\n" 31 | ) 32 | 33 | prompt += ( 34 | "- Make sure to support specific data points and metrics included in the report with in-text Markdown hyperlink citations.\n\n" 35 | f"### Report Structure:\n" 36 | f"Title: {report_title}\n" 37 | f"Date: {report_date}\n" 38 | f"1. **Executive Summary**:\n" 39 | f" - High-level overview of the company, its services, location, employee count, and achievements.\n" 40 | f" - Make sure to include the general information necessary to understand the company well, including any notable achievements.\n\n" 41 | f"2. **Leadership and Vision**:\n" 42 | f" - Details on the CEO and key team members, their experience, and alignment with company goals.\n" 43 | f" - Any personnel changes and their strategic impact.\n\n" 44 | f"3. **Product and Service Overview**:\n" 45 | f" - Summary of current products/services, features, updates, and market fit.\n" 46 | f" - Include details from the company's website, tools, or new integrations.\n\n" 47 | f"4. **Financial Performance**:\n" 48 | f" - For public companies: key metrics (e.g., revenue, market cap).\n" 49 | f" - For startups: funding rounds, investors, and milestones.\n\n" 50 | f"5. **Recent Developments**:\n" 51 | f" - New product enhancements, partnerships, competitive moves, or market entries.\n\n" 52 | f"6. **Competitive Landscape**:\n" 53 | f" - Overview of major competitors and their positioning in the market.\n" 54 | f" - Compare key differentiators, market share, pricing, and product/service features.\n" 55 | f" - Include relevant competitor developments that impact the company’s strategy.\n\n" 56 | ) 57 | if state.include: 58 | prompt += ( 59 | f"7. (Optional) **Additional Information**:\n" 60 | f" - Attempt to fit the user-requested information into the predefined sections above, where relevant.\n" 61 | f" - ONLY if the information does not fit into ANY section, include that remaining information here.\n" 62 | f" - AVOID including user-requested information in multiple sections. For example, if the user requests that the report include the company CEO's name, it should be mentioned ONLY in the **Leadership and Vision** section and not repeated here.\n" 63 | f" - Present the information in well-structured paragraphs, not lists or bullet points.\n\n" 64 | ) 65 | 66 | prompt += ( 67 | f"{'8' if state.include else '7'}. 
**Citations**:\n" 68 | f" - Ensure every source cited in the report is listed in the text as Markdown hyperlinks.\n" 69 | f" - Also include a list of all URLs as Markdown hyperlinks in this section.\n\n" 70 | ) 71 | 72 | # Dynamically generate the "Documents to Base the Report On" section 73 | if state.clusters: 74 | # Use cluster-specific research data 75 | documents = "\n".join( 76 | f"- {state.research_data[key]}" 77 | for key in state.clusters[state.chosen_cluster].urls 78 | if key in state.research_data 79 | ) 80 | prompt += ( 81 | f"### Documents to Base the Report On:\n" 82 | f"Use the following cluster-specific documents to write the report:\n" 83 | f"{documents}" 84 | ) 85 | else: 86 | # Use all available research data 87 | grounding_data_content = "\n".join(f"- {item}" for item in state.grounding_data.values()) 88 | research_data_content = "\n".join(f"- {item}" for item in state.research_data.values()) 89 | prompt += ( 90 | f"### Documents to Base the Report On:\n" 91 | f"#### Official Grounding Data:\n" 92 | f"The following is official data sourced from the company's website and should be used as a primary reference:\n" 93 | f"{grounding_data_content}\n\n" 94 | f"#### Additional Research Data:\n" 95 | f"Select and prioritize the most relevant sources to ensure alignment with the target company.\n" 96 | f"{research_data_content}" 97 | ) 98 | 99 | prompt = prompt[:self.cfg.MAX_PROMPT_LENGTH] 100 | if self.cfg.DEBUG: 101 | print(prompt) 102 | 103 | try: 104 | messages = [SystemMessage(content=prompt)] 105 | response = await self.cfg.FACTUAL_LLM.ainvoke(messages) 106 | report = response.content 107 | return {"report": report} 108 | except Exception as e: 109 | msg = f"🚫 Error generating report: {str(e)}" 110 | if self.cfg.DEBUG: 111 | print(msg) 112 | return {"messages": msg} 113 | --------------------------------------------------------------------------------
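To sanity-check the conditional routing without running the full pipeline, the routers in `router.py` can be exercised in isolation. The script below is a hypothetical example (not a file in this repository); `SimpleNamespace` stands in for the real `ResearchState`, since the routers only read the `clusters` attribute:

/examples/router_smoke_test.py (hypothetical):
--------------------------------------------------------------------------------
from types import SimpleNamespace

from company_researcher.nodes.cluster import Cluster
from company_researcher.router import cluster_router, rerank_router

# A state with at least one cluster proceeds to document enrichment.
clustered = SimpleNamespace(clusters=[Cluster(company_name="Tavily", urls=["https://tavily.com/"])])
assert cluster_router(clustered) == "enrich"
assert rerank_router(clustered) == "enrich"

# With no clusters, 'cluster' falls back to 'rerank' for reevaluation,
# while 'rerank' skips enrichment and goes straight to 'write'.
empty = SimpleNamespace(clusters=[])
assert cluster_router(empty) == "rerank"
assert rerank_router(empty) == "write"
--------------------------------------------------------------------------------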