├── eval
│   ├── __init__.py
│   ├── run_eval.py
│   └── create_dataset.py
├── src
│   └── agent
│       ├── __init__.py
│       ├── configuration.py
│       ├── prompts.py
│       ├── utils.py
│       ├── state.py
│       └── graph.py
├── .env.example
├── langgraph.json
├── pyproject.toml
├── .gitignore
└── README.md
/eval/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/agent/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | ANTHROPIC_API_KEY=sk-xxx
2 | TAVILY_API_KEY=xxx
--------------------------------------------------------------------------------
/langgraph.json:
--------------------------------------------------------------------------------
1 | {
2 | "dockerfile_lines": [],
3 | "graphs": {
4 | "company_researcher": "./src/agent/graph.py:graph"
5 | },
6 | "python_version": "3.11",
7 | "env": "./.env",
8 | "dependencies": [
9 | "."
10 | ]
11 | }
--------------------------------------------------------------------------------
/src/agent/configuration.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass, fields
3 | from typing import Any, Optional
4 |
5 | from langchain_core.runnables import RunnableConfig
6 |
7 |
8 | @dataclass(kw_only=True)
9 | class Configuration:
10 | """The configurable fields for the chatbot."""
11 |
12 | max_search_queries: int = 3 # Max search queries per company
13 | max_search_results: int = 3 # Max search results per query
14 | max_reflection_steps: int = 0 # Max reflection steps
15 | include_search_results: bool = (
16 | False # Whether to include search results in the output
17 | )
18 |
19 | @classmethod
20 | def from_runnable_config(
21 | cls, config: Optional[RunnableConfig] = None
22 | ) -> "Configuration":
23 | """Create a Configuration instance from a RunnableConfig."""
24 | configurable = (
25 | config["configurable"] if config and "configurable" in config else {}
26 | )
27 | values: dict[str, Any] = {
28 | f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
29 | for f in fields(cls)
30 | if f.init
31 | }
32 |         return cls(**{k: v for k, v in values.items() if v is not None})
33 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "company-researcher"
3 | version = "0.0.1"
4 | description = "Researcher agent that searches information about a company and returns it in a structured format."
5 | authors = [
6 | { name = "Vadym Barda" },
7 | { name = "Lance Martin" }
8 | ]
9 | readme = "README.md"
10 | license = { text = "MIT" }
11 | requires-python = ">=3.10"  # src/agent/utils.py uses PEP 604 unions (dict | list) at runtime
12 | dependencies = [
13 | "langgraph>=0.2.52",
14 | "langsmith>=0.1.147",
15 | "langchain-community>=0.3.8",
16 | "tavily-python>=0.5.0",
17 | "langchain_anthropic>=0.3.0",
18 | ]
19 |
20 | [project.optional-dependencies]
21 | dev = ["mypy>=1.11.1", "ruff>=0.6.1"]
22 |
23 | [build-system]
24 | requires = ["setuptools>=73.0.0", "wheel"]
25 | build-backend = "setuptools.build_meta"
26 |
27 | [tool.setuptools]
28 | packages = ["agent"]
29 | [tool.setuptools.package-dir]
30 | "agent" = "src/agent"
31 |
32 |
33 | [tool.setuptools.package-data]
34 | "*" = ["py.typed"]
35 |
36 | [tool.ruff]
37 | lint.select = [
38 | "E", # pycodestyle
39 | "F", # pyflakes
40 | "I", # isort
41 | "D", # pydocstyle
42 | "D401", # First line should be in imperative mood
43 | "T201",
44 | "UP",
45 | ]
46 | lint.ignore = [
47 | "UP006",
48 | "UP007",
49 | # We actually do want to import from typing_extensions
50 | "UP035",
51 | # Relax the convention by _not_ requiring documentation for every function parameter.
52 | "D417",
53 | "E501",
54 | ]
55 | [tool.ruff.lint.per-file-ignores]
56 | "tests/*" = ["D", "UP"]
57 | [tool.ruff.lint.pydocstyle]
58 | convention = "google"
59 |
60 | [dependency-groups]
61 | dev = [
62 | "langgraph-cli[inmem]>=0.1.61",
63 | ]
64 |
--------------------------------------------------------------------------------
/src/agent/prompts.py:
--------------------------------------------------------------------------------
1 | EXTRACTION_PROMPT = """Your task is to take notes gathered from web research and extract them into the following schema.
2 |
3 | <schema>
4 | {info}
5 | </schema>
6 |
7 | Here are all the notes from research:
8 |
9 | <web_research_notes>
10 | {notes}
11 | </web_research_notes>
12 | """
13 |
14 | QUERY_WRITER_PROMPT = """You are a search query generator tasked with creating targeted search queries to gather specific company information.
15 |
16 | Here is the company you are researching: {company}
17 |
18 | Generate at most {max_search_queries} search queries that will help gather the following information:
19 |
20 | <schema>
21 | {info}
22 | </schema>
23 | 
24 | <user_notes>
25 | {user_notes}
26 | </user_notes>
27 |
28 | Your queries should:
29 | 1. Focus on finding factual, up-to-date company information
30 | 2. Target official sources, news, and reliable business databases
31 | 3. Prioritize finding information that matches the schema requirements
32 | 4. Include the company name and relevant business terms
33 | 5. Be specific enough to avoid irrelevant results
34 |
35 | Create focused queries that will maximize the chances of finding schema-relevant information."""
36 |
37 | INFO_PROMPT = """You are doing web research on a company, {company}.
38 |
39 | The following schema shows the type of information we're interested in:
40 |
41 | <schema>
42 | {info}
43 | </schema>
44 |
45 | You have just scraped website content. Your task is to take clear, organized notes about the company, focusing on topics relevant to our interests.
46 |
47 | <website_contents>
48 | {content}
49 | </website_contents>
50 |
51 | Here are any additional notes from the user:
52 | <user_notes>
53 | {user_notes}
54 | </user_notes>
55 |
56 | Please provide detailed research notes that:
57 | 1. Are well-organized and easy to read
58 | 2. Focus on topics mentioned in the schema
59 | 3. Include specific facts, dates, and figures when available
60 | 4. Maintain accuracy of the original content
61 | 5. Note when important information appears to be missing or unclear
62 |
63 | Remember: Don't try to format the output to match the schema - just take clear notes that capture all relevant information."""
64 |
65 | REFLECTION_PROMPT = """You are a research analyst tasked with reviewing the quality and completeness of extracted company information.
66 |
67 | Compare the extracted information with the required schema:
68 |
69 | <schema>
70 | {schema}
71 | </schema>
72 |
73 | Here is the extracted information:
74 | <extracted_info>
75 | {info}
76 | </extracted_info>
77 |
78 | Analyze if all required fields are present and sufficiently populated. Consider:
79 | 1. Are any required fields missing?
80 | 2. Are any fields incomplete or containing uncertain information?
81 | 3. Are there fields with placeholder values or "unknown" markers?
82 | """
83 |
--------------------------------------------------------------------------------
/src/agent/utils.py:
--------------------------------------------------------------------------------
1 | def deduplicate_sources(search_response: dict | list[dict]) -> list[dict]:
2 | """
3 | Takes either a single search response or list of responses from Tavily API and de-duplicates them based on the URL.
4 |
5 | Args:
6 | search_response: Either:
7 | - A dict with a 'results' key containing a list of search results
8 | - A list of dicts, each containing search results
9 |
10 | Returns:
11 |         list[dict]: Deduplicated list of search results
12 | """
13 | # Convert input to list of results
14 | if isinstance(search_response, dict):
15 | sources_list = search_response["results"]
16 | elif isinstance(search_response, list):
17 | sources_list = []
18 | for response in search_response:
19 | if isinstance(response, dict) and "results" in response:
20 | sources_list.extend(response["results"])
21 | else:
22 | sources_list.extend(response)
23 | else:
24 | raise ValueError(
25 | "Input must be either a dict with 'results' or a list of search results"
26 | )
27 |
28 | # Deduplicate by URL
29 | unique_urls = set()
30 | unique_sources_list = []
31 | for source in sources_list:
32 | if source["url"] not in unique_urls:
33 | unique_urls.add(source["url"])
34 | unique_sources_list.append(source)
35 |
36 | return unique_sources_list
37 |
38 |
39 | def format_sources(
40 | sources_list: list[dict],
41 | include_raw_content: bool = True,
42 | max_tokens_per_source: int = 1000,
43 | ) -> str:
44 | """
45 | Takes a list of unique results from Tavily API and formats them.
46 | Limits the raw_content to approximately max_tokens_per_source.
47 | include_raw_content specifies whether to include the raw_content from Tavily in the formatted string.
48 |
49 | Args:
50 | sources_list: list of unique results from Tavily API
51 | max_tokens_per_source: int, maximum number of tokens per each search result to include in the formatted string
52 | include_raw_content: bool, whether to include the raw_content from Tavily in the formatted string
53 |
54 | Returns:
55 | str: Formatted string with deduplicated sources
56 | """
57 | # Format output
58 | formatted_text = "Sources:\n\n"
59 | for source in sources_list:
60 | formatted_text += f"Source {source['title']}:\n===\n"
61 | formatted_text += f"URL: {source['url']}\n===\n"
62 | formatted_text += (
63 | f"Most relevant content from source: {source['content']}\n===\n"
64 | )
65 | if include_raw_content:
66 | # Using rough estimate of 4 characters per token
67 | char_limit = max_tokens_per_source * 4
68 | # Handle None raw_content
69 | raw_content = source.get("raw_content", "")
70 | if raw_content is None:
71 | raw_content = ""
72 | print(f"Warning: No raw_content found for source {source['url']}")
73 | if len(raw_content) > char_limit:
74 | raw_content = raw_content[:char_limit] + "... [truncated]"
75 | formatted_text += f"Full source content limited to {max_tokens_per_source} tokens: {raw_content}\n\n"
76 |
77 | return formatted_text.strip()
78 |
79 |
80 | def format_all_notes(completed_notes: list[str]) -> str:
81 | """Format a list of notes into a string"""
82 | formatted_str = ""
83 | for idx, company_notes in enumerate(completed_notes, 1):
84 | formatted_str += f"""
85 | {'='*60}
86 | Note {idx}:
87 | {'='*60}
88 | Notes from research:
89 | {company_notes}"""
90 | return formatted_str
91 |
--------------------------------------------------------------------------------
/src/agent/state.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Any, Optional, Annotated
3 | import operator
4 |
5 |
6 | DEFAULT_EXTRACTION_SCHEMA = {
7 | "title": "CompanyInfo",
8 | "description": "Basic information about a company",
9 | "type": "object",
10 | "properties": {
11 | "company_name": {
12 | "type": "string",
13 | "description": "Official name of the company",
14 | },
15 | "founding_year": {
16 | "type": "integer",
17 | "description": "Year the company was founded",
18 | },
19 | "founder_names": {
20 | "type": "array",
21 | "items": {"type": "string"},
22 | "description": "Names of the founding team members",
23 | },
24 | "product_description": {
25 | "type": "string",
26 | "description": "Brief description of the company's main product or service",
27 | },
28 | "funding_summary": {
29 | "type": "string",
30 | "description": "Summary of the company's funding history",
31 | },
32 | },
33 | "required": ["company_name"],
34 | }
35 |
36 |
37 | @dataclass(kw_only=True)
38 | class InputState:
39 | """Input state defines the interface between the graph and the user (external API)."""
40 |
41 | company: str
42 | "Company to research provided by the user."
43 |
44 | extraction_schema: dict[str, Any] = field(
45 | default_factory=lambda: DEFAULT_EXTRACTION_SCHEMA
46 | )
47 | "The json schema defines the information the agent is tasked with filling out."
48 |
49 | user_notes: Optional[dict[str, Any]] = field(default=None)
50 | "Any notes from the user to start the research process."
51 |
52 |
53 | @dataclass(kw_only=True)
54 | class OverallState:
55 | """Input state defines the interface between the graph and the user (external API)."""
56 |
57 | company: str
58 | "Company to research provided by the user."
59 |
60 | extraction_schema: dict[str, Any] = field(
61 | default_factory=lambda: DEFAULT_EXTRACTION_SCHEMA
62 | )
63 | "The json schema defines the information the agent is tasked with filling out."
64 |
65 |     user_notes: Optional[str] = field(default=None)
66 |     "Any notes from the user to start the research process."
67 | 
68 |     search_queries: Optional[list[str]] = field(default=None)
69 |     "List of generated search queries to find relevant information."
70 | 
71 |     search_results: Optional[list[dict]] = field(default=None)
72 |     "List of search results."
73 |
74 | completed_notes: Annotated[list, operator.add] = field(default_factory=list)
75 | "Notes from completed research related to the schema"
76 |
77 |     info: Optional[dict[str, Any]] = field(default=None)
78 | """
79 | A dictionary containing the extracted and processed information
80 | based on the user's query and the graph's execution.
81 | This is the primary output of the enrichment process.
82 | """
83 |
84 |     is_satisfactory: Optional[bool] = field(default=None)
85 | "True if all required fields are well populated, False otherwise"
86 |
87 | reflection_steps_taken: int = field(default=0)
88 | "Number of times the reflection node has been executed"
89 |
90 |
91 | @dataclass(kw_only=True)
92 | class OutputState:
93 | """The response object for the end user.
94 |
95 | This class defines the structure of the output that will be provided
96 | to the user after the graph's execution is complete.
97 | """
98 |
99 | info: dict[str, Any]
100 | """
101 | A dictionary containing the extracted and processed information
102 | based on the user's query and the graph's execution.
103 | This is the primary output of the enrichment process.
104 | """
105 |
106 |     search_results: Optional[list[dict]] = field(default=None)
107 |     "List of search results."
108 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | .DS_Store
164 | uv.lock
165 | .langgraph_api
166 |
--------------------------------------------------------------------------------
/eval/run_eval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import Optional
3 |
4 | from langchain_anthropic import ChatAnthropic
5 | from langsmith import Client, evaluate
6 | from langsmith.evaluation import EvaluationResults
7 | from pydantic import BaseModel, Field
8 |
9 | from langgraph.pregel.remote import RemoteGraph
10 |
11 |
12 | client = Client()
13 |
14 | NUMERIC_FIELDS = (
15 | "total_funding_mm_usd",
16 | "latest_round_amount_mm_usd",
17 | )
18 | EXACT_MATCH_FIELDS = (
19 | "website",
20 | "crunchbase_profile",
21 | "headquarters",
22 | "year_founded",
23 | "latest_round",
24 | "latest_round_date",
25 | )
26 | FUZZY_MATCH_FIELDS = ("name", "ceo", "description")
27 |
28 | DEFAULT_DATASET_NAME = "Startup Data Enrichment"
29 | DEFAULT_GRAPH_ID = "company_researcher"
30 | DEFAULT_AGENT_URL = "http://localhost:2024"
31 |
32 | judge_llm = ChatAnthropic(model="claude-3-5-sonnet-latest", temperature=0)
33 |
34 | EVALUATION_PROMPT = f"""You are an evaluator tasked with assessing the accuracy of an agent's output compared to the expected output. Follow these instructions:
35 |
36 | 1. **Numeric Fields Evaluation**: For fields {NUMERIC_FIELDS}, check if the agent's output is within 10% of the expected value. Score 1 if yes, 0 if no.
37 | 2. **Exact Match Evaluation**: For fields {EXACT_MATCH_FIELDS}, check if the agent's output matches the expected output EXACTLY. Score 1 if yes, 0 if no.
38 | 3. **Fuzzy Match Evaluation**: For fields {FUZZY_MATCH_FIELDS}, check if the agent's output matches the expected output APPROXIMATELY. Score 1 if yes, 0 if no.
39 | 4. **Overall Evaluation**: Return final score that is a fraction of fields that have score of 1. For example, if 1/5 fields has score of 1, the final score is 0.2."""
40 |
41 |
42 | def evaluate_agent(outputs: dict, reference_outputs: dict):
43 | if "info" not in outputs:
44 | raise ValueError("Agent output must contain 'info' key")
45 |
46 | class Score(BaseModel):
47 | """Evaluate the agent's output against the expected output."""
48 |
49 | score: float = Field(
50 | description="A score between 0 and 1 indicating the accuracy of the agent's output compared to the expected output. 1 is a perfect match."
51 | )
52 | reason: str = Field(
53 | description="A brief explanation for why you scored the agent's output as you did."
54 | )
55 |
56 | score = judge_llm.with_structured_output(Score).invoke(
57 | [
58 | {
59 | "role": "system",
60 | "content": EVALUATION_PROMPT,
61 | },
62 | {
63 | "role": "user",
64 | "content": f'Actual output: {outputs["info"]}\nExpected output: {reference_outputs["info"]}',
65 | },
66 | ]
67 | )
68 | return score.score
69 |
70 |
71 | # PUBLIC API
72 |
73 |
74 | def transform_dataset_inputs(inputs: dict) -> dict:
75 | """Transform LangSmith dataset inputs to match the agent's input schema before invoking the agent."""
76 | # see the `Example input` in the README for reference on what `inputs` dict should look like
77 | # the dataset inputs already match the agent's input schema, but you can add any additional processing here
78 | return inputs
79 |
80 |
81 | def transform_agent_outputs(outputs: dict) -> dict:
82 | """Transform agent outputs to match the LangSmith dataset output schema."""
83 | # see the `Example output` in the README for reference on what the output should look like
84 | return {"info": outputs["info"]}
85 |
86 |
87 | def make_agent_runner(graph_id: str, agent_url: str):
88 | """Wrapper that transforms inputs/outputs to match the expected eval schema and invokes the agent."""
89 | agent_graph = RemoteGraph(graph_id, url=agent_url)
90 |
91 | def run_agent(inputs: dict) -> dict:
92 | """Run the agent on the inputs from the LangSmith dataset record, return outputs conforming to the LangSmith dataset output schema."""
93 | transformed_inputs = transform_dataset_inputs(inputs)
94 | response = agent_graph.invoke(transformed_inputs)
95 | return transform_agent_outputs(response)
96 |
97 | return run_agent
98 |
99 |
100 | def run_eval(
101 | *,
102 | dataset_name: str,
103 | graph_id: str = DEFAULT_GRAPH_ID,
104 | agent_url: str = DEFAULT_AGENT_URL,
105 | experiment_prefix: Optional[str] = None,
106 | ) -> EvaluationResults:
107 | dataset = client.read_dataset(dataset_name=dataset_name)
108 | run_agent = make_agent_runner(graph_id, agent_url)
109 | results = evaluate(
110 | run_agent,
111 | data=dataset,
112 | evaluators=[evaluate_agent],
113 | experiment_prefix=experiment_prefix,
114 | )
115 | return results
116 |
117 |
118 | if __name__ == "__main__":
119 | 
120 |
121 | parser = argparse.ArgumentParser()
122 | parser.add_argument(
123 | "--dataset-name",
124 | type=str,
125 | default=DEFAULT_DATASET_NAME,
126 | help="Name of the dataset to evaluate against",
127 | )
128 | parser.add_argument(
129 | "--graph-id",
130 | type=str,
131 | default=DEFAULT_GRAPH_ID,
132 | help="ID of the graph to evaluate",
133 | )
134 | parser.add_argument(
135 | "--agent-url",
136 | type=str,
137 | default=DEFAULT_AGENT_URL,
138 | help="URL of the deployed agent to evaluate",
139 | )
140 | parser.add_argument(
141 | "--experiment-prefix",
142 | type=str,
143 | help="Experiment prefix for the evaluation",
144 | )
145 | args = parser.parse_args()
146 |
147 | run_eval(
148 | dataset_name=args.dataset_name,
149 | graph_id=args.graph_id,
150 | agent_url=args.agent_url,
151 | experiment_prefix=args.experiment_prefix,
152 | )
153 |
--------------------------------------------------------------------------------
/src/agent/graph.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import cast, Any, Literal
3 | import json
4 |
5 | from tavily import AsyncTavilyClient
6 | from langchain_anthropic import ChatAnthropic
7 | from langchain_core.rate_limiters import InMemoryRateLimiter
8 | from langchain_core.runnables import RunnableConfig
9 | from langgraph.graph import START, END, StateGraph
10 | from pydantic import BaseModel, Field
11 |
12 | from agent.configuration import Configuration
13 | from agent.state import InputState, OutputState, OverallState
14 | from agent.utils import deduplicate_sources, format_sources, format_all_notes
15 | from agent.prompts import (
16 | EXTRACTION_PROMPT,
17 | REFLECTION_PROMPT,
18 | INFO_PROMPT,
19 | QUERY_WRITER_PROMPT,
20 | )
21 |
22 | # LLMs
23 |
24 | rate_limiter = InMemoryRateLimiter(
25 | requests_per_second=4,
26 | check_every_n_seconds=0.1,
27 | max_bucket_size=10, # Controls the maximum burst size.
28 | )
29 | claude_3_5_sonnet = ChatAnthropic(
30 | model="claude-3-5-sonnet-latest", temperature=0, rate_limiter=rate_limiter
31 | )
32 |
33 | # Search
34 |
35 | tavily_async_client = AsyncTavilyClient()
36 |
37 |
38 | class Queries(BaseModel):
39 | queries: list[str] = Field(
40 | description="List of search queries.",
41 | )
42 |
43 |
44 | class ReflectionOutput(BaseModel):
45 | is_satisfactory: bool = Field(
46 | description="True if all required fields are well populated, False otherwise"
47 | )
48 | missing_fields: list[str] = Field(
49 | description="List of field names that are missing or incomplete"
50 | )
51 | search_queries: list[str] = Field(
52 | description="If is_satisfactory is False, provide 1-3 targeted search queries to find the missing information"
53 | )
54 | reasoning: str = Field(description="Brief explanation of the assessment")
55 |
56 |
57 | def generate_queries(state: OverallState, config: RunnableConfig) -> dict[str, Any]:
58 | """Generate search queries based on the user input and extraction schema."""
59 | # Get configuration
60 | configurable = Configuration.from_runnable_config(config)
61 | max_search_queries = configurable.max_search_queries
62 |
63 | # Generate search queries
64 | structured_llm = claude_3_5_sonnet.with_structured_output(Queries)
65 |
66 | # Format system instructions
67 | query_instructions = QUERY_WRITER_PROMPT.format(
68 | company=state.company,
69 | info=json.dumps(state.extraction_schema, indent=2),
70 | user_notes=state.user_notes,
71 | max_search_queries=max_search_queries,
72 | )
73 |
74 | # Generate queries
75 | results = cast(
76 | Queries,
77 | structured_llm.invoke(
78 | [
79 | {"role": "system", "content": query_instructions},
80 | {
81 | "role": "user",
82 | "content": "Please generate a list of search queries related to the schema that you want to populate.",
83 | },
84 | ]
85 | ),
86 | )
87 |
88 |     # Return the generated queries
89 |     query_list = results.queries
90 |     return {"search_queries": query_list}
91 |
92 |
93 | async def research_company(
94 | state: OverallState, config: RunnableConfig
95 | ) -> dict[str, Any]:
96 | """Execute a multi-step web search and information extraction process.
97 |
98 | This function performs the following steps:
99 | 1. Executes concurrent web searches using the Tavily API
100 | 2. Deduplicates and formats the search results
101 | """
102 |
103 | # Get configuration
104 | configurable = Configuration.from_runnable_config(config)
105 | max_search_results = configurable.max_search_results
106 |
107 | # Search tasks
108 | search_tasks = []
109 | for query in state.search_queries:
110 | search_tasks.append(
111 | tavily_async_client.search(
112 | query,
113 | max_results=max_search_results,
114 | include_raw_content=True,
115 | topic="general",
116 | )
117 | )
118 |
119 | # Execute all searches concurrently
120 | search_docs = await asyncio.gather(*search_tasks)
121 |
122 | # Deduplicate and format sources
123 | deduplicated_search_docs = deduplicate_sources(search_docs)
124 | source_str = format_sources(
125 | deduplicated_search_docs, max_tokens_per_source=1000, include_raw_content=True
126 | )
127 |
128 | # Generate structured notes relevant to the extraction schema
129 | p = INFO_PROMPT.format(
130 | info=json.dumps(state.extraction_schema, indent=2),
131 | content=source_str,
132 | company=state.company,
133 | user_notes=state.user_notes,
134 | )
135 | result = await claude_3_5_sonnet.ainvoke(p)
136 | state_update = {
137 | "completed_notes": [str(result.content)],
138 | }
139 | if configurable.include_search_results:
140 | state_update["search_results"] = deduplicated_search_docs
141 |
142 | return state_update
143 |
144 |
145 | def gather_notes_extract_schema(state: OverallState) -> dict[str, Any]:
146 | """Gather notes from the web search and extract the schema fields."""
147 |
148 | # Format all notes
149 | notes = format_all_notes(state.completed_notes)
150 |
151 | # Extract schema fields
152 | system_prompt = EXTRACTION_PROMPT.format(
153 | info=json.dumps(state.extraction_schema, indent=2), notes=notes
154 | )
155 | structured_llm = claude_3_5_sonnet.with_structured_output(state.extraction_schema)
156 | result = structured_llm.invoke(
157 | [
158 | {"role": "system", "content": system_prompt},
159 | {
160 | "role": "user",
161 | "content": "Produce a structured output from these notes.",
162 | },
163 | ]
164 | )
165 | return {"info": result}
166 |
167 |
168 | def reflection(state: OverallState) -> dict[str, Any]:
169 | """Reflect on the extracted information and generate search queries to find missing information."""
170 | structured_llm = claude_3_5_sonnet.with_structured_output(ReflectionOutput)
171 |
172 | # Format reflection prompt
173 | system_prompt = REFLECTION_PROMPT.format(
174 | schema=json.dumps(state.extraction_schema, indent=2),
175 | info=state.info,
176 | )
177 |
178 | # Invoke
179 | result = cast(
180 | ReflectionOutput,
181 | structured_llm.invoke(
182 | [
183 | {"role": "system", "content": system_prompt},
184 | {"role": "user", "content": "Produce a structured reflection output."},
185 | ]
186 | ),
187 | )
188 |
189 | if result.is_satisfactory:
190 | return {"is_satisfactory": result.is_satisfactory}
191 | else:
192 | return {
193 | "is_satisfactory": result.is_satisfactory,
194 | "search_queries": result.search_queries,
195 | "reflection_steps_taken": state.reflection_steps_taken + 1,
196 | }
197 |
198 |
199 | def route_from_reflection(
200 | state: OverallState, config: RunnableConfig
201 | ) -> Literal[END, "research_company"]: # type: ignore
202 | """Route the graph based on the reflection output."""
203 | # Get configuration
204 | configurable = Configuration.from_runnable_config(config)
205 |
206 | # If we have satisfactory results, end the process
207 | if state.is_satisfactory:
208 | return END
209 |
210 | # If results aren't satisfactory but we haven't hit max steps, continue research
211 | if state.reflection_steps_taken <= configurable.max_reflection_steps:
212 | return "research_company"
213 |
214 | # If we've exceeded max steps, end even if not satisfactory
215 | return END
216 |
217 |
218 | # Add nodes and edges
219 | builder = StateGraph(
220 | OverallState,
221 | input=InputState,
222 | output=OutputState,
223 | config_schema=Configuration,
224 | )
225 | builder.add_node("gather_notes_extract_schema", gather_notes_extract_schema)
226 | builder.add_node("generate_queries", generate_queries)
227 | builder.add_node("research_company", research_company)
228 | builder.add_node("reflection", reflection)
229 |
230 | builder.add_edge(START, "generate_queries")
231 | builder.add_edge("generate_queries", "research_company")
232 | builder.add_edge("research_company", "gather_notes_extract_schema")
233 | builder.add_edge("gather_notes_extract_schema", "reflection")
234 | builder.add_conditional_edges("reflection", route_from_reflection)
235 |
236 | # Compile
237 | graph = builder.compile()
238 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Company Researcher Agent

Company Researcher Agent searches the web for information about a user-supplied company and returns it in a structured format defined by a user-supplied JSON schema.

## 🚀 Quickstart with LangGraph server

Clone the repository and set API keys for the LLM of choice (Anthropic is set by default in `src/agent/graph.py`) and the [Tavily API](https://tavily.com/):

```bash
git clone https://github.com/langchain-ai/company-researcher.git
cd company-researcher
cp .env.example .env
# then edit .env to add your ANTHROPIC_API_KEY and TAVILY_API_KEY
```

Launch the assistant [using the LangGraph server](https://langchain-ai.github.io/langgraph/cloud/reference/cli/#dev):

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uvx --refresh --from "langgraph-cli[inmem]" --with-editable . --python 3.11 langgraph dev
```

## How it works

Company Researcher Agent follows a multi-step research and extraction workflow that separates web research from schema extraction, allowing for better resource management and comprehensive data collection; a minimal invocation sketch follows the list:

- **Research Phase**: The system performs intelligent web research on the input company:
  - Uses an LLM to generate targeted search queries based on the schema requirements (up to `max_search_queries`)
  - Executes concurrent web searches via the [Tavily API](https://tavily.com/), retrieving up to `max_search_results` results per query
  - Takes structured research notes focused on schema-relevant information
- **Extraction Phase**: After research is complete, the system:
  - Consolidates all research notes
  - Uses an LLM to extract and format the information according to the user-defined schema
  - Returns the structured data in the exact format requested
- **Reflection Phase**: The system evaluates the quality of the extracted information:
  - Analyzes the completeness of required fields
  - Identifies any missing or incomplete information
  - Generates targeted follow-up search queries if needed
  - Continues research until the information is satisfactory or the maximum number of reflection steps is reached

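Once the server is running, the graph can also be invoked programmatically. A minimal sketch, assuming `langgraph dev` is serving on its default `http://localhost:2024` (this mirrors the `RemoteGraph` pattern used in `eval/run_eval.py`):

```python
# Minimal sketch: call the locally served "company_researcher" graph
# (the graph id registered in langgraph.json).
from langgraph.pregel.remote import RemoteGraph

agent = RemoteGraph("company_researcher", url="http://localhost:2024")
result = agent.invoke({"company": "LangChain"})  # no schema: DEFAULT_EXTRACTION_SCHEMA is used
print(result["info"])
```
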
## Configuration

The configuration for Company Researcher Agent is defined in `src/agent/configuration.py`:

* `max_search_queries`: int = 3 # Max search queries per company
* `max_search_results`: int = 3 # Max search results per query
* `max_reflection_steps`: int = 0 # Max reflection steps
* `include_search_results`: bool = False # Whether to include raw search results in the output

Each value can also be overridden per run, as sketched below.

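A sketch of a per-run override, passed through the standard LangGraph `configurable` dict that `Configuration.from_runnable_config` reads (note that environment variables such as `MAX_REFLECTION_STEPS` take precedence over these values):

```python
from langgraph.pregel.remote import RemoteGraph

agent = RemoteGraph("company_researcher", url="http://localhost:2024")

# Per-run configuration overrides; field names match src/agent/configuration.py.
config = {
    "configurable": {
        "max_search_queries": 5,
        "max_reflection_steps": 1,
        "include_search_results": True,
    }
}
result = agent.invoke({"company": "LangChain"}, config=config)
```
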
## Inputs

The user inputs are:

```
* company: str - A company to research
* extraction_schema: Optional[dict] - A JSON schema for the output
* user_notes: Optional[str] - Any additional notes about the company from the user
```

If a schema is not provided, the system will use a default schema (`DEFAULT_EXTRACTION_SCHEMA`) defined in `src/agent/state.py`. An example input with a custom schema is sketched below.

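For instance, a run that supplies its own schema and steering notes might look like this; the schema here is a hypothetical two-field example that respects the warnings in the next section:

```python
from langgraph.pregel.remote import RemoteGraph

agent = RemoteGraph("company_researcher", url="http://localhost:2024")

# Hypothetical input: a flat custom schema (title/description are required)
# plus free-form user notes; omitted fields fall back to the defaults in state.py.
custom_input = {
    "company": "LangChain",
    "extraction_schema": {
        "title": "CompanyBasics",
        "description": "Minimal facts about a company",
        "type": "object",
        "properties": {
            "company_name": {"type": "string", "description": "Official company name"},
            "ceo": {"type": "string", "description": "Name of the current CEO"},
        },
        "required": ["company_name"],
    },
    "user_notes": "Focus on the LLM framework company, not similarly named businesses.",
}
result = agent.invoke(custom_input)
```
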
### Schemas

> ⚠️ **WARNING:** JSON schemas require `title` and `description` fields for [extraction](https://python.langchain.com/docs/how_to/structured_output/#typeddict-or-json-schema).

> ⚠️ **WARNING:** Avoid nested JSON objects; LLMs have trouble performing structured extraction from nested objects. See the tested examples below.

Here is an example schema that can be supplied to research a company:

* See the trace [here](https://smith.langchain.com/public/9f51fb8b-9486-4cd2-90ed-895f7932304e/r).

<details>
<summary>Example schema</summary>

```
{
    "title": "CompanyInfo",
    "description": "Basic information about a company",
    "type": "object",
    "properties": {
        "company_name": {
            "type": "string",
            "description": "Official name of the company"
        },
        "founding_year": {
            "type": "integer",
            "description": "Year the company was founded"
        },
        "founder_names": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Names of the founding team members"
        },
        "product_description": {
            "type": "string",
            "description": "Brief description of the company's main product or service"
        },
        "funding_summary": {
            "type": "string",
            "description": "Summary of the company's funding history"
        }
    },
    "required": ["company_name"]
}
```

</details>

Here is an example of a more complex schema:

* See the reflection steps in the trace [here](https://smith.langchain.com/public/36f0d917-4edd-4d55-8dbf-6d6ec8a25754/r).

<details>
<summary>Example complex schema</summary>

```
HARD_EXTRACTION_SCHEMA = {
    "title": "CompanyInfo",
    "description": "Comprehensive information about a company with confidence tracking",
    "type": "object",
    "properties": {
        "company_name": {
            "type": "string",
            "description": "Official name of the company"
        },
        "verified_company": {
            "type": "boolean",
            "description": "Confirmation this is the intended company, not a similarly named one"
        },
        "similar_companies": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of similarly named companies that could be confused with the target"
        },
        "distinguishing_features": {
            "type": "string",
            "description": "Key features that distinguish this company from similarly named ones"
        },
        "key_executives": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "title": {"type": "string"},
                    "verification_date": {"type": "string"},
                    "confidence_level": {
                        "type": "string",
                        "enum": ["high", "medium", "low", "uncertain"]
                    },
                    "source": {"type": "string"}
                }
            }
        },
        "org_chart_summary": {
            "type": "string",
            "description": "Brief description of organizational structure"
        },
        "leadership_caveats": {
            "type": "string",
            "description": "Any uncertainties or caveats about leadership information"
        },
        "main_products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "description": {"type": "string"},
                    "launch_date": {"type": "string"},
                    "current_status": {"type": "string"}
                }
            }
        },
        "services": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "description": {"type": "string"},
                    "target_market": {"type": "string"}
                }
            }
        },
        "recent_developments": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "date": {"type": "string"},
                    "title": {"type": "string"},
                    "summary": {"type": "string"},
                    "source_url": {"type": "string"},
                    "significance": {"type": "string"}
                }
            },
            "description": "Major news and developments from the last 6 months"
        },
        "historical_challenges": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "issue_type": {"type": "string"},
                    "description": {"type": "string"},
                    "date_period": {"type": "string"},
                    "resolution": {"type": "string"},
                    "current_status": {"type": "string"}
                }
            },
            "description": "Past challenges, issues, or controversies"
        },
        "sources": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "url": {"type": "string"},
                    "title": {"type": "string"},
                    "date_accessed": {"type": "string"},
                    "information_type": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Types of information sourced from this link (e.g., leadership, products, news)"
                    }
                }
            }
        },
        "company_summary": {
            "type": "string",
            "description": "Concise, dense summary of the most important company information (max 250 words)"
        }
    },
    "required": [
        "company_name",
        "verified_company",
        "company_summary",
        "key_executives",
        "main_products",
        "sources"
    ]
}
```

</details>

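Under the hood, whichever schema you supply is passed directly to LangChain's structured-output interface during the extraction phase. A minimal sketch of that mechanism, mirroring `gather_notes_extract_schema` in `src/agent/graph.py` (assumes `ANTHROPIC_API_KEY` is set and the package is installed, e.g. with `pip install -e .`):

```python
# Sketch: a JSON schema driving structured extraction, as in src/agent/graph.py.
from langchain_anthropic import ChatAnthropic

from agent.state import DEFAULT_EXTRACTION_SCHEMA

llm = ChatAnthropic(model="claude-3-5-sonnet-latest", temperature=0)
structured_llm = llm.with_structured_output(DEFAULT_EXTRACTION_SCHEMA)
# Returns a dict conforming to the schema (title/description fields are required).
info = structured_llm.invoke("Notes: LangChain, Inc. was founded in 2022 by Harrison Chase ...")
```
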
## Evaluation

Before engaging in any optimization, it is important to establish baseline performance. This repository includes:

1. A dataset consisting of a list of companies and the expected structured information to be extracted for each company.
2. An evaluation script that can be used to evaluate the agent on this dataset.

### Set up

Make sure you have the LangSmith SDK installed:

```shell
pip install langsmith
```

And set your API keys:

```shell
export LANGSMITH_API_KEY=
export ANTHROPIC_API_KEY=
```

### Evaluation metric

An LLM acting as a judge assigns each extraction result a score between 0 and 1, based on how closely the extracted information matches the expected information: numeric fields must fall within 10% of the expected value, some fields require an exact match, and the rest a fuzzy match (see `EVALUATION_PROMPT` in `eval/run_eval.py`). The final score is the fraction of fields that pass; a toy illustration follows.

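The following is only a toy illustration of that scoring rule (the actual judge is an LLM, not this code); the field values are made up:

```python
# Per-field checks roughly matching the rubric in eval/run_eval.py's EVALUATION_PROMPT.
expected = {"year_founded": 2022, "total_funding_mm_usd": 35.0, "ceo": "Harrison Chase"}
actual = {"year_founded": 2022, "total_funding_mm_usd": 36.0, "ceo": "Harrison M. Chase"}

checks = [
    actual["year_founded"] == expected["year_founded"],  # exact-match field
    abs(actual["total_funding_mm_usd"] - expected["total_funding_mm_usd"])
    <= 0.1 * expected["total_funding_mm_usd"],           # numeric field: within 10%
    expected["ceo"].split()[-1] in actual["ceo"],        # crude stand-in for fuzzy match
]
score = sum(checks) / len(checks)  # fraction of fields scoring 1
print(score)  # 1.0
```
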
### Get the dataset

Create a new dataset in LangSmith using the code in the `eval` folder:

```shell
python eval/create_dataset.py
```

Each record pairs an input (the company and the eval extraction schema) with the expected `info` output; an example record is sketched below.

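For reference, this is the shape of one dataset record, with values copied from `eval/create_dataset.py`:

```python
# Example dataset record; assumes the repo root is on sys.path.
from eval.create_dataset import EXTRACTION_SCHEMA

example_input = {
    "company": "LangChain",
    "extraction_schema": EXTRACTION_SCHEMA,  # the eval schema defined in create_dataset.py
}
example_output = {
    "info": {
        "name": "LangChain, Inc.",
        "website": "https://www.langchain.com",
        "year_founded": 2022,
        "ceo": "Harrison Chase",
        "total_funding_mm_usd": 35.0,
        "latest_round": "Series A",
        "latest_round_date": "2024-02-15",
        "latest_round_amount_mm_usd": 25.0,
        # description and crunchbase_profile omitted here for brevity
    }
}
```
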
### Run the evaluation

To run the evaluation, use the `run_eval.py` script in the `eval` folder. This will create a new experiment in LangSmith for the dataset you created in the previous step.

```shell
python eval/run_eval.py --experiment-prefix "My custom prefix" --agent-url http://localhost:2024
```

--------------------------------------------------------------------------------
/eval/create_dataset.py:
--------------------------------------------------------------------------------
1 | EXAMPLES = [
2 | {
3 | "company": "LangChain",
4 | "info": {
5 | "name": "LangChain, Inc.",
6 | "description": "LangChain helps developers to build applications powered by large language models (LLMs). It provides tools and frameworks to integrate LLMs with external data sources and APIs, facilitating the creation of advanced AI applications.",
7 | "website": "https://www.langchain.com",
8 | "crunchbase_profile": "https://www.crunchbase.com/organization/langchain",
9 | "year_founded": 2022,
10 | "ceo": "Harrison Chase",
11 | "total_funding_mm_usd": 35.0,
12 | "latest_round": "Series A",
13 | "latest_round_date": "2024-02-15",
14 | "latest_round_amount_mm_usd": 25.0,
15 | },
16 | },
17 | {
18 | "company": "Kensho",
19 | "info": {
20 | "name": "Kensho Technologies, LLC.",
21 | "description": "Kensho Technologies, a subsidiary of S&P Global, specializes in developing advanced analytics and machine learning solutions for the financial industry. Their products include tools for natural language processing, data extraction, and linking, enabling clients to derive actionable insights from complex data sets.",
22 | "website": "https://kensho.com/",
23 | "crunchbase_profile": "https://www.crunchbase.com/organization/kensho",
24 | "year_founded": 2013,
25 | "ceo": "Bhavesh Dayalji",
26 | "total_funding_mm_usd": 81.1,
27 | "latest_round": "Series B",
28 | "latest_round_date": "2017-02-28",
29 | "latest_round_amount_mm_usd": 50.0,
30 | },
31 | },
32 | {
33 | "company": "Robust Intelligence",
34 | "info": {
35 | "name": "Robust Intelligence, Inc.",
36 | "description": "Robust Intelligence offers an AI application security platform designed to protect machine learning models from various threats, including data poisoning and adversarial attacks. Their solutions ensure the integrity and reliability of AI systems across diverse industries.",
37 | "website": "https://www.robustintelligence.com/",
38 | "crunchbase_profile": "https://www.crunchbase.com/organization/robust-intelligence",
39 | "year_founded": 2019,
40 | "ceo": "Yaron Singer",
41 | "total_funding_mm_usd": 44.0,
42 | "latest_round": "Series B",
43 | "latest_round_date": "2021-12-09",
44 | "latest_round_amount_mm_usd": 30.0,
45 | },
46 | },
47 | {
48 | "company": "Perplexity.ai",
49 | "info": {
50 | "name": "Perplexity AI, Inc.",
51 | "description": "Perplexity.ai is an AI-powered search engine that delivers concise and accurate answers to user queries. It leverages advanced natural language processing to provide direct responses, enhancing the search experience.",
52 | "website": "https://www.perplexity.ai",
53 | "crunchbase_profile": "https://www.crunchbase.com/organization/perplexity-ai",
54 | "year_founded": 2022,
55 | "ceo": "Aravind Srinivas",
56 | "total_funding_mm_usd": 165.0,
57 | "latest_round": "Series B",
58 | "latest_round_date": "2024-04-23",
59 | "latest_round_amount_mm_usd": 62.7,
60 | },
61 | },
62 | {
63 | "company": "Physical Intelligence.ai",
64 | "info": {
65 | "name": "Physical Intelligence.ai",
66 | "description": "Physical Intelligence.ai specializes in developing AI solutions that enhance human physical capabilities. Their technologies focus on improving physical performance and health through intelligent systems.",
67 | "website": "https://www.physicalintelligence.company/",
68 | "crunchbase_profile": "https://www.crunchbase.com/organization/physical-intelligence-834b",
69 | "year_founded": 2023,
70 | "ceo": "Karol Hausman",
71 | "total_funding_mm_usd": 470.0,
72 | "latest_round": "Series A",
73 | "latest_round_date": "2024-11-04",
74 | "latest_round_amount_mm_usd": 400.0,
75 | },
76 | },
77 | {
78 | "company": "Galileo.ai",
79 | "info": {
80 | "name": "Galileo AI, Inc.",
81 | "description": "Galileo.ai offers AI-driven design tools that assist in creating user interfaces and experiences. Their platform automates design processes, enabling rapid prototyping and iteration for designers and developers.",
82 | "website": "https://www.usegalileo.ai/",
83 | "crunchbase_profile": "https://www.crunchbase.com/organization/galileo-ai",
84 | "year_founded": 2022,
85 | "ceo": "Arnaud Benard",
86 | "total_funding_mm_usd": 4.8,
87 | "latest_round": "Seed",
88 | "latest_round_date": "2024-02-06",
89 | "latest_round_amount_mm_usd": 4.4,
90 | },
91 | },
92 | {
93 | "company": "Sierra.ai",
94 | "info": {
95 | "name": "Sierra Technologies, Inc.",
96 | "description": "Sierra.ai develops AI-powered safety and compliance solutions for the trucking industry. Their technology aims to enhance driver safety, ensure regulatory compliance, and improve operational efficiency.",
97 | "website": "https://sierra.ai/",
98 | "crunchbase_profile": "https://www.crunchbase.com/organization/sierra-1124",
99 | "year_founded": 2023,
100 | "ceo": "Clay Bavor",
101 | "total_funding_mm_usd": 285.0,
102 | "latest_round": "Series B",
103 | "latest_round_date": "2024-10-28",
104 | "latest_round_amount_mm_usd": 175.0,
105 | },
106 | },
107 | {
108 | "company": "Rad AI",
109 | "info": {
110 | "name": "Rad AI, Inc.",
111 | "description": "Rad AI provides artificial intelligence solutions for radiology, aiming to improve diagnostic accuracy and efficiency. Their platform assists radiologists by automating routine tasks and enhancing image analysis.",
112 | "website": "https://www.radai.com",
113 | "crunchbase_profile": "https://www.crunchbase.com/organization/radai",
114 | "year_founded": 2018,
115 | "ceo": "Doktor Gurson",
116 | "total_funding_mm_usd": 83.0,
117 | "latest_round": "Series B",
118 | "latest_round_date": "2024-05-07",
119 | "latest_round_amount_mm_usd": 50.0,
120 | },
121 | },
122 | {
123 | "company": "Together AI",
124 | "info": {
125 | "name": "Together, Inc.",
126 | "description": "Together AI focuses on building open-source models and tools for natural language processing. They aim to make advanced AI technologies accessible and collaborative for researchers and developers.",
127 | "website": "https://www.together.ai/",
128 | "crunchbase_profile": "https://www.crunchbase.com/organization/together-ai",
129 | "year_founded": 2022,
130 | "ceo": "Vipul Ved Prakash",
131 | "total_funding_mm_usd": 228.5,
132 | "latest_round": "Series A",
133 | "latest_round_date": "2024-03-13",
134 | "latest_round_amount_mm_usd": 106.0,
135 | },
136 | },
137 | {
138 | "company": "Omneky",
139 | "info": {
140 | "name": "Omneky Inc.",
141 | "description": "Omneky utilizes AI to create personalized advertising content across digital platforms. Their platform analyzes data to generate targeted ads, optimizing marketing strategies for businesses.",
142 | "website": "https://www.omneky.com",
143 | "crunchbase_profile": "https://www.crunchbase.com/organization/omneky",
144 | "year_founded": 2018,
145 | "ceo": "Hikari Senju",
146 | "total_funding_mm_usd": 13.0,
147 | "latest_round": "Seed",
148 | "latest_round_date": "2022-11-15",
149 | "latest_round_amount_mm_usd": 10.0,
150 | },
151 | },
152 | {
153 | "company": "Curai Health",
154 | "info": {
155 | "name": "Curai, Inc.",
156 | "description": "Curai Health offers AI-assisted primary care services, combining artificial intelligence with medical expertise to provide accessible and affordable healthcare solutions.",
157 | "website": "https://www.curaihealth.com",
158 | "crunchbase_profile": "https://www.crunchbase.com/organization/curai",
159 | "year_founded": 2017,
160 | "ceo": "Neal Khosla",
161 | "total_funding_mm_usd": 38.2,
162 | "latest_round": "Series B",
163 | "latest_round_date": "2020-12-16",
164 | "latest_round_amount_mm_usd": 27.5,
165 | },
166 | },
167 | {
168 | "company": "Decagon.ai",
169 | "info": {
170 | "name": "Decagon AI, Inc.",
171 | "description": "Decagon.ai develops enterprise-grade generative AI agents for customer support, enabling businesses to provide efficient and personalized customer service experiences.",
172 | "website": "https://decagon.ai",
173 | "crunchbase_profile": "https://www.crunchbase.com/organization/decagon-485e",
174 | "year_founded": 2023,
175 | "ceo": "Jesse Zhang",
176 | "total_funding_mm_usd": 100.0,
177 | "latest_round": "Series B",
178 | "latest_round_date": "2024-10-15",
179 | "latest_round_amount_mm_usd": 65.0,
180 | },
181 | },
182 | {
183 | "company": "Xaira Therapeutics",
184 | "info": {
185 | "name": "Xaira Therapeutics",
186 | "description": "Xaira Therapeutics is a biotechnology company leveraging artificial intelligence for drug discovery and development, aiming to deliver transformative medicines.",
187 | "website": "https://xaira.com/",
188 | "crunchbase_profile": "https://www.crunchbase.com/organization/xaira-therapeutics",
189 | "year_founded": 2023,
190 | "ceo": "Marc Tessier-Lavigne",
191 | "total_funding_mm_usd": 1000.0,
192 | "latest_round": "Series A",
193 | "latest_round_date": "2024-04-23",
194 | "latest_round_amount_mm_usd": 1000.0,
195 | },
196 | },
197 | {
198 | "company": "Regie.ai",
199 | "info": {
200 | "name": "Regie.ai",
201 | "description": "Regie.ai provides generative AI tools for sales teams, automating content creation and streamlining communication processes to enhance sales efficiency.",
202 | "website": "https://www.regie.ai/",
203 | "crunchbase_profile": "https://www.crunchbase.com/organization/regie-da23",
204 | "year_founded": 2020,
205 | "ceo": "Srinath Sridhar",
206 | "total_funding_mm_usd": 20.8,
207 | "latest_round": "Series A",
208 | "latest_round_date": "2023-02-09",
209 | "latest_round_amount_mm_usd": 6.0,
210 | },
211 | },
212 | {
213 | "company": "Bifrost AI",
214 | "info": {
215 | "name": "Bifrost AI, Inc.",
216 | "description": "Bifrost AI specializes in generating synthetic data for AI and robotics, enabling faster training and validation of models without the need for real-world data.",
217 | "website": "https://www.bifrost.ai",
218 | "crunchbase_profile": "https://www.crunchbase.com/organization/bifrost",
219 | "year_founded": 2020,
220 | "ceo": "Charles Wong",
221 | "total_funding_mm_usd": 13.1,
222 | "latest_round": "Series A",
223 | "latest_round_date": "2024-10-30",
224 | "latest_round_amount_mm_usd": 8.0,
225 | },
226 | },
227 | {
228 | "company": "Recraft",
229 | "info": {
230 | "name": "Recraft, Inc",
231 | "description": "Recraft offers an AI-powered design tool for creating and editing images, providing features like image generation, vectorization, and mockup creation for professional designers.",
232 | "website": "https://www.recraft.ai",
233 | "crunchbase_profile": "https://www.crunchbase.com/organization/recraft",
234 | "year_founded": 2022,
235 | "ceo": "Anna Veronika Dorogush",
236 | "total_funding_mm_usd": 12.0,
237 | "latest_round": "Series A",
238 | "latest_round_date": "2024-01-18",
239 | "latest_round_amount_mm_usd": 12.0,
240 | },
241 | },
242 | {
243 | "company": "Brightseed",
244 | "info": {
245 | "name": "Brightseed, Inc",
246 | "description": "Brightseed utilizes artificial intelligence to discover bioactive compounds in nature that can restore human health, focusing on the intersection of nature, science, and humanity.",
247 | "website": "https://www.brightseedbio.com",
248 | "crunchbase_profile": "https://www.crunchbase.com/organization/brightseed",
249 | "year_founded": 2017,
250 | "ceo": "Jim Flatt",
251 | "total_funding_mm_usd": 120.8,
252 | "latest_round": "Series B",
253 | "latest_round_date": "2022-05-09",
254 | "latest_round_amount_mm_usd": 68.0,
255 | },
256 | },
257 | {
258 | "company": "Etched.ai",
259 | "info": {
260 | "name": "Etched.ai, Inc.",
261 | "description": "Etched.ai is developing the world's first transformer ASIC, a specialized chip designed to run AI models faster and more efficiently than traditional GPUs.",
262 | "website": "https://www.etched.com",
263 | "crunchbase_profile": "https://www.crunchbase.com/organization/etched-ai",
264 | "year_founded": 2022,
265 | "ceo": "Gavin Uberti",
266 | "total_funding_mm_usd": 125.4,
267 | "latest_round": "Series A",
268 | "latest_round_date": "2024-06-25",
269 | "latest_round_amount_mm_usd": 120.0,
270 | },
271 | },
272 | {
273 | "company": "World Labs",
274 | "info": {
275 | "name": "World Labs Technologies",
276 | "description": "World Labs is an AI-focused company dedicated to advancing artificial intelligence technologies and applications across various sectors.",
277 | "website": "https://www.worldlabs.ai",
278 | "crunchbase_profile": "https://www.crunchbase.com/organization/world-labs",
279 | "year_founded": 2024,
280 | "ceo": "Fei-Fei Li",
281 | "total_funding_mm_usd": 230.0,
282 | "latest_round": "Series A",
283 | "latest_round_date": "2024-09-13",
284 | "latest_round_amount_mm_usd": 230.0,
285 | },
286 | },
287 | {
288 | "company": "Sight Machine",
289 | "info": {
290 | "name": "Sight Machine Inc.",
291 | "description": "Sight Machine provides manufacturing analytics powered by AI, offering real-time insights to improve production efficiency and quality.",
292 | "website": "https://sightmachine.com",
293 | "crunchbase_profile": "https://www.crunchbase.com/organization/sight-machine",
294 | "year_founded": 2011,
295 | "ceo": "Jon Sobel",
296 | "total_funding_mm_usd": 80.4,
297 | "latest_round": "Series C",
298 | "latest_round_date": "2019-04-23",
299 | "latest_round_amount_mm_usd": 29.4,
300 | },
301 | },
302 | {
303 | "company": "Ambience Healthcare",
304 | "info": {
305 | "name": "Ambience Healthcare, Inc.",
306 | "description": "Ambience Healthcare offers AI-powered scribe solutions for healthcare providers, automating clinical documentation to reduce clinician burnout and improve care quality.",
307 | "website": "https://www.ambiencehealthcare.com",
308 | "crunchbase_profile": "https://www.crunchbase.com/organization/ambience-healthcare",
309 | "year_founded": 2020,
310 | "ceo": "Mike Ng",
311 | "total_funding_mm_usd": 76.3,
312 | "latest_round": "Series B",
313 | "latest_round_date": "2024-02-06",
314 | "latest_round_amount_mm_usd": 70.0,
315 | },
316 | },
317 | {
318 | "company": "Safely You",
319 | "info": {
320 | "name": "SafelyYou, Inc.",
321 | "description": "Safely You utilizes AI technology to reduce falls and associated risks in senior living communities, enhancing resident safety and care.",
322 | "website": "https://www.safely-you.com",
323 | "crunchbase_profile": "https://www.crunchbase.com/organization/safely-you",
324 | "year_founded": 2016,
325 | "ceo": "George Netscher",
326 | "total_funding_mm_usd": 71.3,
327 | "latest_round": "Debt",
328 | "latest_round_date": "2023-05-25",
329 | "latest_round_amount_mm_usd": 10.0,
330 | },
331 | },
332 | {
333 | "company": "Kintsugi.AI",
334 | "info": {
335 | "name": "KintsugiAI, Inc.",
336 | "description": "Kintsugi.AI provides sales tax automation solutions for companies globally, streamlining compliance processes and reducing errors.",
337 | "website": "trykintsugi.com",
338 | "crunchbase_profile": "https://www.crunchbase.com/organization/kintsugi-0524",
339 | "year_founded": 2022,
340 | "ceo": "Pujun Bhatnagar",
341 | "total_funding_mm_usd": 12.2,
342 | "latest_round": "Series A",
343 | "latest_round_date": "2024-11-19",
344 | "latest_round_amount_mm_usd": 4.0,
345 | },
346 | },
347 | ]
348 |
349 | EXTRACTION_SCHEMA = {
350 | "type": "object",
351 | "title": "company_info",
352 | "properties": {
353 | "name": {"type": "string", "description": "Official company name"},
354 | "description": {
355 | "type": "string",
356 | "description": "Brief description of the company and its activities",
357 | },
358 | "website": {
359 | "type": "string",
360 | "format": "uri",
361 | "description": "Company's official website URL",
362 | },
363 | "crunchbase_profile": {
364 | "type": "string",
365 | "format": "uri",
366 | "description": "Company's Crunchbase profile URL",
367 | },
368 | "year_founded": {
369 | "type": "integer",
370 | "minimum": 1800,
371 | "description": "Year when the company was founded",
372 | },
373 | "ceo": {"type": "string", "description": "Name of the company's CEO"},
374 | "total_funding_mm_usd": {
375 | "type": "number",
376 | "minimum": 0,
377 | "description": "Total funding raised in millions of USD",
378 | },
379 | "latest_round": {
380 | "type": "string",
381 | "description": "Type of the most recent funding round (e.g., Series A, Seed, etc.)",
382 | },
383 | "latest_round_date": {
384 | "type": "string",
385 | "format": "date",
386 | "description": "Date of the most recent funding round (YYYY-MM-DD)",
387 | },
388 | "latest_round_amount_mm_usd": {
389 | "type": "number",
390 | "minimum": 0,
391 | "description": "Amount raised in the most recent funding round in millions of USD",
392 | },
393 | },
394 | "required": [
395 | "name",
396 | "description",
397 | "website",
398 | "crunchbase_profile",
399 | "year_founded",
400 | "ceo",
401 | "total_funding_mm_usd",
402 | "latest_round",
403 | "latest_round_date",
404 | "latest_round_amount_mm_usd",
405 | ],
406 | "description": "Company information",
407 | }
408 |
409 | if __name__ == "__main__":
410 | from langsmith import Client
411 | from langsmith.utils import LangSmithNotFoundError
412 |
413 | client = Client()
414 | dataset_name = "Startup Data Enrichment"
415 |
416 | # Storing inputs in a dataset lets us
417 | # run chains and LLMs over a shared set of examples.
418 | try:
419 | exists_dataset = client.read_dataset(dataset_name=dataset_name)
420 | print(f"Dataset '{dataset_name}' already exists.")
421 | print("You can access the dataset via the URL: ", exists_dataset.url)
422 | exit(1)
423 | except LangSmithNotFoundError:
424 | # Then let's create the dataset if it doesn't exist
425 | pass
426 |
427 | dataset = client.create_dataset(
428 | dataset_name=dataset_name,
429 | description="Evaluate ability to research information about startups (e.g., latest round, total funding, year founded etc.)",
430 | )
431 |
432 | # Prepare inputs, outputs, and metadata for bulk creation
433 | inputs = [
434 | {"company": record["company"], "extraction_schema": EXTRACTION_SCHEMA}
435 | for record in EXAMPLES
436 | ]
437 | outputs = [{"info": record["info"]} for record in EXAMPLES]
438 |
439 | client.create_examples(
440 | inputs=inputs,
441 | outputs=outputs,
442 | dataset_id=dataset.id,
443 | )
444 | print(f"Dataset '{dataset_name}' created with {len(EXAMPLES)} examples.")
445 | print("You can access the dataset via the URL: ", dataset.url)
446 |
--------------------------------------------------------------------------------