├── eval ├── __init__.py ├── run_eval.py └── create_dataset.py ├── src └── agent │ ├── __init__.py │ ├── configuration.py │ ├── prompts.py │ ├── utils.py │ ├── state.py │ └── graph.py ├── .env.example ├── langgraph.json ├── pyproject.toml ├── .gitignore └── README.md /eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/agent/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | ANTHROPIC_API_KEY=sk-xxx 2 | TAVILY_API_KEY=xxx -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | { 2 | "dockerfile_lines": [], 3 | "graphs": { 4 | "company_researcher": "./src/agent/graph.py:graph" 5 | }, 6 | "python_version": "3.11", 7 | "env": "./.env", 8 | "dependencies": [ 9 | "." 10 | ] 11 | } -------------------------------------------------------------------------------- /src/agent/configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass, fields 3 | from typing import Any, Optional 4 | 5 | from langchain_core.runnables import RunnableConfig 6 | 7 | 8 | @dataclass(kw_only=True) 9 | class Configuration: 10 | """The configurable fields for the chatbot.""" 11 | 12 | max_search_queries: int = 3 # Max search queries per company 13 | max_search_results: int = 3 # Max search results per query 14 | max_reflection_steps: int = 0 # Max reflection steps 15 | include_search_results: bool = ( 16 | False # Whether to include search results in the output 17 | ) 18 | 19 | @classmethod 20 | def from_runnable_config( 21 | cls, config: Optional[RunnableConfig] = None 22 | ) -> "Configuration": 23 | """Create a Configuration instance from a RunnableConfig.""" 24 | configurable = ( 25 | config["configurable"] if config and "configurable" in config else {} 26 | ) 27 | values: dict[str, Any] = { 28 | f.name: os.environ.get(f.name.upper(), configurable.get(f.name)) 29 | for f in fields(cls) 30 | if f.init 31 | } 32 | return cls(**{k: v for k, v in values.items() if v}) 33 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "company-researcher" 3 | version = "0.0.1" 4 | description = "Researcher agent that searches information about a company and returns it in a structured format." 
5 | authors = [ 6 | { name = "Vadym Barda" }, 7 | { name = "Lance Martin" } 8 | ] 9 | readme = "README.md" 10 | license = { text = "MIT" } 11 | requires-python = ">=3.9" 12 | dependencies = [ 13 | "langgraph>=0.2.52", 14 | "langsmith>=0.1.147", 15 | "langchain-community>=0.3.8", 16 | "tavily-python>=0.5.0", 17 | "langchain_anthropic>=0.3.0", 18 | ] 19 | 20 | [project.optional-dependencies] 21 | dev = ["mypy>=1.11.1", "ruff>=0.6.1"] 22 | 23 | [build-system] 24 | requires = ["setuptools>=73.0.0", "wheel"] 25 | build-backend = "setuptools.build_meta" 26 | 27 | [tool.setuptools] 28 | packages = ["agent"] 29 | [tool.setuptools.package-dir] 30 | "agent" = "src/agent" 31 | 32 | 33 | [tool.setuptools.package-data] 34 | "*" = ["py.typed"] 35 | 36 | [tool.ruff] 37 | lint.select = [ 38 | "E", # pycodestyle 39 | "F", # pyflakes 40 | "I", # isort 41 | "D", # pydocstyle 42 | "D401", # First line should be in imperative mood 43 | "T201", 44 | "UP", 45 | ] 46 | lint.ignore = [ 47 | "UP006", 48 | "UP007", 49 | # We actually do want to import from typing_extensions 50 | "UP035", 51 | # Relax the convention by _not_ requiring documentation for every function parameter. 52 | "D417", 53 | "E501", 54 | ] 55 | [tool.ruff.lint.per-file-ignores] 56 | "tests/*" = ["D", "UP"] 57 | [tool.ruff.lint.pydocstyle] 58 | convention = "google" 59 | 60 | [dependency-groups] 61 | dev = [ 62 | "langgraph-cli[inmem]>=0.1.61", 63 | ] 64 | -------------------------------------------------------------------------------- /src/agent/prompts.py: -------------------------------------------------------------------------------- 1 | EXTRACTION_PROMPT = """Your task is to take notes gathered from web research and extract them into the following schema. 2 | 3 | 4 | {info} 5 | 6 | 7 | Here are all the notes from research: 8 | 9 | 10 | {notes} 11 | 12 | """ 13 | 14 | QUERY_WRITER_PROMPT = """You are a search query generator tasked with creating targeted search queries to gather specific company information. 15 | 16 | Here is the company you are researching: {company} 17 | 18 | Generate at most {max_search_queries} search queries that will help gather the following information: 19 | 20 | 21 | {info} 22 | 23 | 24 | 25 | {user_notes} 26 | 27 | 28 | Your query should: 29 | 1. Focus on finding factual, up-to-date company information 30 | 2. Target official sources, news, and reliable business databases 31 | 3. Prioritize finding information that matches the schema requirements 32 | 4. Include the company name and relevant business terms 33 | 5. Be specific enough to avoid irrelevant results 34 | 35 | Create a focused query that will maximize the chances of finding schema-relevant information.""" 36 | 37 | INFO_PROMPT = """You are doing web research on a company, {company}. 38 | 39 | The following schema shows the type of information we're interested in: 40 | 41 | 42 | {info} 43 | 44 | 45 | You have just scraped website content. Your task is to take clear, organized notes about the company, focusing on topics relevant to our interests. 46 | 47 | 48 | {content} 49 | 50 | 51 | Here are any additional notes from the user: 52 | 53 | {user_notes} 54 | 55 | 56 | Please provide detailed research notes that: 57 | 1. Are well-organized and easy to read 58 | 2. Focus on topics mentioned in the schema 59 | 3. Include specific facts, dates, and figures when available 60 | 4. Maintain accuracy of the original content 61 | 5. 
Note when important information appears to be missing or unclear 62 | 63 | Remember: Don't try to format the output to match the schema - just take clear notes that capture all relevant information.""" 64 | 65 | REFLECTION_PROMPT = """You are a research analyst tasked with reviewing the quality and completeness of extracted company information. 66 | 67 | Compare the extracted information with the required schema: 68 | 69 | 70 | {schema} 71 | 72 | 73 | Here is the extracted information: 74 | 75 | {info} 76 | 77 | 78 | Analyze if all required fields are present and sufficiently populated. Consider: 79 | 1. Are any required fields missing? 80 | 2. Are any fields incomplete or containing uncertain information? 81 | 3. Are there fields with placeholder values or "unknown" markers? 82 | """ 83 | -------------------------------------------------------------------------------- /src/agent/utils.py: -------------------------------------------------------------------------------- 1 | def deduplicate_sources(search_response: dict | list[dict]) -> list[dict]: 2 | """ 3 | Takes either a single search response or list of responses from Tavily API and de-duplicates them based on the URL. 4 | 5 | Args: 6 | search_response: Either: 7 | - A dict with a 'results' key containing a list of search results 8 | - A list of dicts, each containing search results 9 | 10 | Returns: 11 | str: Formatted string with deduplicated sources 12 | """ 13 | # Convert input to list of results 14 | if isinstance(search_response, dict): 15 | sources_list = search_response["results"] 16 | elif isinstance(search_response, list): 17 | sources_list = [] 18 | for response in search_response: 19 | if isinstance(response, dict) and "results" in response: 20 | sources_list.extend(response["results"]) 21 | else: 22 | sources_list.extend(response) 23 | else: 24 | raise ValueError( 25 | "Input must be either a dict with 'results' or a list of search results" 26 | ) 27 | 28 | # Deduplicate by URL 29 | unique_urls = set() 30 | unique_sources_list = [] 31 | for source in sources_list: 32 | if source["url"] not in unique_urls: 33 | unique_urls.add(source["url"]) 34 | unique_sources_list.append(source) 35 | 36 | return unique_sources_list 37 | 38 | 39 | def format_sources( 40 | sources_list: list[dict], 41 | include_raw_content: bool = True, 42 | max_tokens_per_source: int = 1000, 43 | ) -> str: 44 | """ 45 | Takes a list of unique results from Tavily API and formats them. 46 | Limits the raw_content to approximately max_tokens_per_source. 47 | include_raw_content specifies whether to include the raw_content from Tavily in the formatted string. 
48 | 49 | Args: 50 | sources_list: list of unique results from Tavily API 51 | max_tokens_per_source: int, maximum number of tokens per each search result to include in the formatted string 52 | include_raw_content: bool, whether to include the raw_content from Tavily in the formatted string 53 | 54 | Returns: 55 | str: Formatted string with deduplicated sources 56 | """ 57 | # Format output 58 | formatted_text = "Sources:\n\n" 59 | for source in sources_list: 60 | formatted_text += f"Source {source['title']}:\n===\n" 61 | formatted_text += f"URL: {source['url']}\n===\n" 62 | formatted_text += ( 63 | f"Most relevant content from source: {source['content']}\n===\n" 64 | ) 65 | if include_raw_content: 66 | # Using rough estimate of 4 characters per token 67 | char_limit = max_tokens_per_source * 4 68 | # Handle None raw_content 69 | raw_content = source.get("raw_content", "") 70 | if raw_content is None: 71 | raw_content = "" 72 | print(f"Warning: No raw_content found for source {source['url']}") 73 | if len(raw_content) > char_limit: 74 | raw_content = raw_content[:char_limit] + "... [truncated]" 75 | formatted_text += f"Full source content limited to {max_tokens_per_source} tokens: {raw_content}\n\n" 76 | 77 | return formatted_text.strip() 78 | 79 | 80 | def format_all_notes(completed_notes: list[str]) -> str: 81 | """Format a list of notes into a string""" 82 | formatted_str = "" 83 | for idx, company_notes in enumerate(completed_notes, 1): 84 | formatted_str += f""" 85 | {'='*60} 86 | Note: {idx}: 87 | {'='*60} 88 | Notes from research: 89 | {company_notes}""" 90 | return formatted_str 91 | -------------------------------------------------------------------------------- /src/agent/state.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Any, Optional, Annotated 3 | import operator 4 | 5 | 6 | DEFAULT_EXTRACTION_SCHEMA = { 7 | "title": "CompanyInfo", 8 | "description": "Basic information about a company", 9 | "type": "object", 10 | "properties": { 11 | "company_name": { 12 | "type": "string", 13 | "description": "Official name of the company", 14 | }, 15 | "founding_year": { 16 | "type": "integer", 17 | "description": "Year the company was founded", 18 | }, 19 | "founder_names": { 20 | "type": "array", 21 | "items": {"type": "string"}, 22 | "description": "Names of the founding team members", 23 | }, 24 | "product_description": { 25 | "type": "string", 26 | "description": "Brief description of the company's main product or service", 27 | }, 28 | "funding_summary": { 29 | "type": "string", 30 | "description": "Summary of the company's funding history", 31 | }, 32 | }, 33 | "required": ["company_name"], 34 | } 35 | 36 | 37 | @dataclass(kw_only=True) 38 | class InputState: 39 | """Input state defines the interface between the graph and the user (external API).""" 40 | 41 | company: str 42 | "Company to research provided by the user." 43 | 44 | extraction_schema: dict[str, Any] = field( 45 | default_factory=lambda: DEFAULT_EXTRACTION_SCHEMA 46 | ) 47 | "The json schema defines the information the agent is tasked with filling out." 48 | 49 | user_notes: Optional[dict[str, Any]] = field(default=None) 50 | "Any notes from the user to start the research process." 51 | 52 | 53 | @dataclass(kw_only=True) 54 | class OverallState: 55 | """Input state defines the interface between the graph and the user (external API).""" 56 | 57 | company: str 58 | "Company to research provided by the user." 
59 | 60 | extraction_schema: dict[str, Any] = field( 61 | default_factory=lambda: DEFAULT_EXTRACTION_SCHEMA 62 | ) 63 | "The json schema defines the information the agent is tasked with filling out." 64 | 65 | user_notes: str = field(default=None) 66 | "Any notes from the user to start the research process." 67 | 68 | search_queries: list[str] = field(default=None) 69 | "List of generated search queries to find relevant information" 70 | 71 | search_results: list[dict] = field(default=None) 72 | "List of search results" 73 | 74 | completed_notes: Annotated[list, operator.add] = field(default_factory=list) 75 | "Notes from completed research related to the schema" 76 | 77 | info: dict[str, Any] = field(default=None) 78 | """ 79 | A dictionary containing the extracted and processed information 80 | based on the user's query and the graph's execution. 81 | This is the primary output of the enrichment process. 82 | """ 83 | 84 | is_satisfactory: bool = field(default=None) 85 | "True if all required fields are well populated, False otherwise" 86 | 87 | reflection_steps_taken: int = field(default=0) 88 | "Number of times the reflection node has been executed" 89 | 90 | 91 | @dataclass(kw_only=True) 92 | class OutputState: 93 | """The response object for the end user. 94 | 95 | This class defines the structure of the output that will be provided 96 | to the user after the graph's execution is complete. 97 | """ 98 | 99 | info: dict[str, Any] 100 | """ 101 | A dictionary containing the extracted and processed information 102 | based on the user's query and the graph's execution. 103 | This is the primary output of the enrichment process. 104 | """ 105 | 106 | search_results: list[dict] = field(default=None) 107 | "List of search results" 108 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | #.idea/ 163 | .DS_Store 164 | uv.lock 165 | .langgraph_api 166 | -------------------------------------------------------------------------------- /eval/run_eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Optional 3 | 4 | from langchain_anthropic import ChatAnthropic 5 | from langsmith import Client, evaluate 6 | from langsmith.evaluation import EvaluationResults 7 | from pydantic import BaseModel, Field 8 | 9 | from langgraph.pregel.remote import RemoteGraph 10 | 11 | 12 | client = Client() 13 | 14 | NUMERIC_FIELDS = ( 15 | "total_funding_mm_usd", 16 | "latest_round_amount_mm_usd", 17 | ) 18 | EXACT_MATCH_FIELDS = ( 19 | "website", 20 | "crunchbase_profile", 21 | "headquarters", 22 | "year_founded", 23 | "latest_round", 24 | "latest_round_date", 25 | ) 26 | FUZZY_MATCH_FIELDS = ("name", "ceo", "description") 27 | 28 | DEFAULT_DATASET_NAME = "Startup Data Enrichment" 29 | DEFAULT_GRAPH_ID = "company_researcher" 30 | DEFAULT_AGENT_URL = "http://localhost:2024" 31 | 32 | judge_llm = ChatAnthropic(model="claude-3-5-sonnet-latest", temperature=0) 33 | 34 | EVALUATION_PROMPT = f"""You are an evaluator tasked with assessing the accuracy of an agent's output compared to the expected output. Follow these instructions: 35 | 36 | 1. **Numeric Fields Evaluation**: For fields {NUMERIC_FIELDS}, check if the agent's output is within 10% of the expected value. Score 1 if yes, 0 if no. 37 | 2. **Exact Match Evaluation**: For fields {EXACT_MATCH_FIELDS}, check if the agent's output matches the expected output EXACTLY. Score 1 if yes, 0 if no. 38 | 3. **Fuzzy Match Evaluation**: For fields {FUZZY_MATCH_FIELDS}, check if the agent's output matches the expected output APPROXIMATELY. Score 1 if yes, 0 if no. 39 | 4. **Overall Evaluation**: Return final score that is a fraction of fields that have score of 1. For example, if 1/5 fields has score of 1, the final score is 0.2.""" 40 | 41 | 42 | def evaluate_agent(outputs: dict, reference_outputs: dict): 43 | if "info" not in outputs: 44 | raise ValueError("Agent output must contain 'info' key") 45 | 46 | class Score(BaseModel): 47 | """Evaluate the agent's output against the expected output.""" 48 | 49 | score: float = Field( 50 | description="A score between 0 and 1 indicating the accuracy of the agent's output compared to the expected output. 1 is a perfect match." 51 | ) 52 | reason: str = Field( 53 | description="A brief explanation for why you scored the agent's output as you did." 
54 | ) 55 | 56 | score = judge_llm.with_structured_output(Score).invoke( 57 | [ 58 | { 59 | "role": "system", 60 | "content": EVALUATION_PROMPT, 61 | }, 62 | { 63 | "role": "user", 64 | "content": f'Actual output: {outputs["info"]}\nExpected output: {reference_outputs["info"]}', 65 | }, 66 | ] 67 | ) 68 | return score.score 69 | 70 | 71 | # PUBLIC API 72 | 73 | 74 | def transform_dataset_inputs(inputs: dict) -> dict: 75 | """Transform LangSmith dataset inputs to match the agent's input schema before invoking the agent.""" 76 | # see the `Example input` in the README for reference on what `inputs` dict should look like 77 | # the dataset inputs already match the agent's input schema, but you can add any additional processing here 78 | return inputs 79 | 80 | 81 | def transform_agent_outputs(outputs: dict) -> dict: 82 | """Transform agent outputs to match the LangSmith dataset output schema.""" 83 | # see the `Example output` in the README for reference on what the output should look like 84 | return {"info": outputs["info"]} 85 | 86 | 87 | def make_agent_runner(graph_id: str, agent_url: str): 88 | """Wrapper that transforms inputs/outputs to match the expected eval schema and invokes the agent.""" 89 | agent_graph = RemoteGraph(graph_id, url=agent_url) 90 | 91 | def run_agent(inputs: dict) -> dict: 92 | """Run the agent on the inputs from the LangSmith dataset record, return outputs conforming to the LangSmith dataset output schema.""" 93 | transformed_inputs = transform_dataset_inputs(inputs) 94 | response = agent_graph.invoke(transformed_inputs) 95 | return transform_agent_outputs(response) 96 | 97 | return run_agent 98 | 99 | 100 | def run_eval( 101 | *, 102 | dataset_name: str, 103 | graph_id: str = DEFAULT_GRAPH_ID, 104 | agent_url: str = DEFAULT_AGENT_URL, 105 | experiment_prefix: Optional[str] = None, 106 | ) -> EvaluationResults: 107 | dataset = client.read_dataset(dataset_name=dataset_name) 108 | run_agent = make_agent_runner(graph_id, agent_url) 109 | results = evaluate( 110 | run_agent, 111 | data=dataset, 112 | evaluators=[evaluate_agent], 113 | experiment_prefix=experiment_prefix, 114 | ) 115 | return results 116 | 117 | 118 | if __name__ == "__main__": 119 | import argparse 120 | 121 | parser = argparse.ArgumentParser() 122 | parser.add_argument( 123 | "--dataset-name", 124 | type=str, 125 | default=DEFAULT_DATASET_NAME, 126 | help="Name of the dataset to evaluate against", 127 | ) 128 | parser.add_argument( 129 | "--graph-id", 130 | type=str, 131 | default=DEFAULT_GRAPH_ID, 132 | help="ID of the graph to evaluate", 133 | ) 134 | parser.add_argument( 135 | "--agent-url", 136 | type=str, 137 | default=DEFAULT_AGENT_URL, 138 | help="URL of the deployed agent to evaluate", 139 | ) 140 | parser.add_argument( 141 | "--experiment-prefix", 142 | type=str, 143 | help="Experiment prefix for the evaluation", 144 | ) 145 | args = parser.parse_args() 146 | 147 | run_eval( 148 | dataset_name=args.dataset_name, 149 | graph_id=args.graph_id, 150 | agent_url=args.agent_url, 151 | experiment_prefix=args.experiment_prefix, 152 | ) 153 | -------------------------------------------------------------------------------- /src/agent/graph.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import cast, Any, Literal 3 | import json 4 | 5 | from tavily import AsyncTavilyClient 6 | from langchain_anthropic import ChatAnthropic 7 | from langchain_core.rate_limiters import InMemoryRateLimiter 8 | from langchain_core.runnables import 
RunnableConfig 9 | from langgraph.graph import START, END, StateGraph 10 | from pydantic import BaseModel, Field 11 | 12 | from agent.configuration import Configuration 13 | from agent.state import InputState, OutputState, OverallState 14 | from agent.utils import deduplicate_sources, format_sources, format_all_notes 15 | from agent.prompts import ( 16 | EXTRACTION_PROMPT, 17 | REFLECTION_PROMPT, 18 | INFO_PROMPT, 19 | QUERY_WRITER_PROMPT, 20 | ) 21 | 22 | # LLMs 23 | 24 | rate_limiter = InMemoryRateLimiter( 25 | requests_per_second=4, 26 | check_every_n_seconds=0.1, 27 | max_bucket_size=10, # Controls the maximum burst size. 28 | ) 29 | claude_3_5_sonnet = ChatAnthropic( 30 | model="claude-3-5-sonnet-latest", temperature=0, rate_limiter=rate_limiter 31 | ) 32 | 33 | # Search 34 | 35 | tavily_async_client = AsyncTavilyClient() 36 | 37 | 38 | class Queries(BaseModel): 39 | queries: list[str] = Field( 40 | description="List of search queries.", 41 | ) 42 | 43 | 44 | class ReflectionOutput(BaseModel): 45 | is_satisfactory: bool = Field( 46 | description="True if all required fields are well populated, False otherwise" 47 | ) 48 | missing_fields: list[str] = Field( 49 | description="List of field names that are missing or incomplete" 50 | ) 51 | search_queries: list[str] = Field( 52 | description="If is_satisfactory is False, provide 1-3 targeted search queries to find the missing information" 53 | ) 54 | reasoning: str = Field(description="Brief explanation of the assessment") 55 | 56 | 57 | def generate_queries(state: OverallState, config: RunnableConfig) -> dict[str, Any]: 58 | """Generate search queries based on the user input and extraction schema.""" 59 | # Get configuration 60 | configurable = Configuration.from_runnable_config(config) 61 | max_search_queries = configurable.max_search_queries 62 | 63 | # Generate search queries 64 | structured_llm = claude_3_5_sonnet.with_structured_output(Queries) 65 | 66 | # Format system instructions 67 | query_instructions = QUERY_WRITER_PROMPT.format( 68 | company=state.company, 69 | info=json.dumps(state.extraction_schema, indent=2), 70 | user_notes=state.user_notes, 71 | max_search_queries=max_search_queries, 72 | ) 73 | 74 | # Generate queries 75 | results = cast( 76 | Queries, 77 | structured_llm.invoke( 78 | [ 79 | {"role": "system", "content": query_instructions}, 80 | { 81 | "role": "user", 82 | "content": "Please generate a list of search queries related to the schema that you want to populate.", 83 | }, 84 | ] 85 | ), 86 | ) 87 | 88 | # Queries 89 | query_list = [query for query in results.queries] 90 | return {"search_queries": query_list} 91 | 92 | 93 | async def research_company( 94 | state: OverallState, config: RunnableConfig 95 | ) -> dict[str, Any]: 96 | """Execute a multi-step web search and information extraction process. 97 | 98 | This function performs the following steps: 99 | 1. Executes concurrent web searches using the Tavily API 100 | 2. 
Deduplicates and formats the search results 101 | """ 102 | 103 | # Get configuration 104 | configurable = Configuration.from_runnable_config(config) 105 | max_search_results = configurable.max_search_results 106 | 107 | # Search tasks 108 | search_tasks = [] 109 | for query in state.search_queries: 110 | search_tasks.append( 111 | tavily_async_client.search( 112 | query, 113 | max_results=max_search_results, 114 | include_raw_content=True, 115 | topic="general", 116 | ) 117 | ) 118 | 119 | # Execute all searches concurrently 120 | search_docs = await asyncio.gather(*search_tasks) 121 | 122 | # Deduplicate and format sources 123 | deduplicated_search_docs = deduplicate_sources(search_docs) 124 | source_str = format_sources( 125 | deduplicated_search_docs, max_tokens_per_source=1000, include_raw_content=True 126 | ) 127 | 128 | # Generate structured notes relevant to the extraction schema 129 | p = INFO_PROMPT.format( 130 | info=json.dumps(state.extraction_schema, indent=2), 131 | content=source_str, 132 | company=state.company, 133 | user_notes=state.user_notes, 134 | ) 135 | result = await claude_3_5_sonnet.ainvoke(p) 136 | state_update = { 137 | "completed_notes": [str(result.content)], 138 | } 139 | if configurable.include_search_results: 140 | state_update["search_results"] = deduplicated_search_docs 141 | 142 | return state_update 143 | 144 | 145 | def gather_notes_extract_schema(state: OverallState) -> dict[str, Any]: 146 | """Gather notes from the web search and extract the schema fields.""" 147 | 148 | # Format all notes 149 | notes = format_all_notes(state.completed_notes) 150 | 151 | # Extract schema fields 152 | system_prompt = EXTRACTION_PROMPT.format( 153 | info=json.dumps(state.extraction_schema, indent=2), notes=notes 154 | ) 155 | structured_llm = claude_3_5_sonnet.with_structured_output(state.extraction_schema) 156 | result = structured_llm.invoke( 157 | [ 158 | {"role": "system", "content": system_prompt}, 159 | { 160 | "role": "user", 161 | "content": "Produce a structured output from these notes.", 162 | }, 163 | ] 164 | ) 165 | return {"info": result} 166 | 167 | 168 | def reflection(state: OverallState) -> dict[str, Any]: 169 | """Reflect on the extracted information and generate search queries to find missing information.""" 170 | structured_llm = claude_3_5_sonnet.with_structured_output(ReflectionOutput) 171 | 172 | # Format reflection prompt 173 | system_prompt = REFLECTION_PROMPT.format( 174 | schema=json.dumps(state.extraction_schema, indent=2), 175 | info=state.info, 176 | ) 177 | 178 | # Invoke 179 | result = cast( 180 | ReflectionOutput, 181 | structured_llm.invoke( 182 | [ 183 | {"role": "system", "content": system_prompt}, 184 | {"role": "user", "content": "Produce a structured reflection output."}, 185 | ] 186 | ), 187 | ) 188 | 189 | if result.is_satisfactory: 190 | return {"is_satisfactory": result.is_satisfactory} 191 | else: 192 | return { 193 | "is_satisfactory": result.is_satisfactory, 194 | "search_queries": result.search_queries, 195 | "reflection_steps_taken": state.reflection_steps_taken + 1, 196 | } 197 | 198 | 199 | def route_from_reflection( 200 | state: OverallState, config: RunnableConfig 201 | ) -> Literal[END, "research_company"]: # type: ignore 202 | """Route the graph based on the reflection output.""" 203 | # Get configuration 204 | configurable = Configuration.from_runnable_config(config) 205 | 206 | # If we have satisfactory results, end the process 207 | if state.is_satisfactory: 208 | return END 209 | 210 | # If results aren't 
satisfactory but we haven't hit max steps, continue research 211 | if state.reflection_steps_taken <= configurable.max_reflection_steps: 212 | return "research_company" 213 | 214 | # If we've exceeded max steps, end even if not satisfactory 215 | return END 216 | 217 | 218 | # Add nodes and edges 219 | builder = StateGraph( 220 | OverallState, 221 | input=InputState, 222 | output=OutputState, 223 | config_schema=Configuration, 224 | ) 225 | builder.add_node("gather_notes_extract_schema", gather_notes_extract_schema) 226 | builder.add_node("generate_queries", generate_queries) 227 | builder.add_node("research_company", research_company) 228 | builder.add_node("reflection", reflection) 229 | 230 | builder.add_edge(START, "generate_queries") 231 | builder.add_edge("generate_queries", "research_company") 232 | builder.add_edge("research_company", "gather_notes_extract_schema") 233 | builder.add_edge("gather_notes_extract_schema", "reflection") 234 | builder.add_conditional_edges("reflection", route_from_reflection) 235 | 236 | # Compile 237 | graph = builder.compile() 238 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Company Researcher Agent 2 | 3 | Company Researcher Agent searches the web for information about a user-supplied company and returns it in a structured format defined by user-supplied JSON schema. 4 | 5 | ## 🚀 Quickstart with LangGraph server 6 | 7 | Set API keys for the LLM of choice (Anthropic is set by default in `src/agent/graph.py`) and [Tavily API](https://tavily.com/): 8 | ``` 9 | cp .env.example .env 10 | ``` 11 | 12 | Clone the repository and launch the assistant [using the LangGraph server](https://langchain-ai.github.io/langgraph/cloud/reference/cli/#dev): 13 | ```bash 14 | curl -LsSf https://astral.sh/uv/install.sh | sh 15 | git clone https://github.com/langchain-ai/company-researcher.git 16 | cd company-researcher 17 | uvx --refresh --from "langgraph-cli[inmem]" --with-editable . 
--python 3.11 langgraph dev 18 | ``` 19 | 20 | ![company_people_researcher](https://github.com/user-attachments/assets/f651d18c-8cf8-4dde-87cb-3daed59c7fa0) 21 | 22 | ## How it works 23 | 24 | Company Researcher Agent follows a multi-step research and extraction workflow that separates web research from schema extraction, allowing for better resource management and comprehensive data collection: 25 | 26 | - **Research Phase**: The system performs intelligent web research on the input company: 27 | - Uses an LLM to generate targeted search queries based on the schema requirements (up to `max_search_queries`) 28 | - Executes concurrent web searches via [Tavily API](https://tavily.com/), retrieving up to `max_search_results` results per query 29 | - Takes structured research notes focused on schema-relevant information 30 | - **Extraction Phase**: After research is complete, the system: 31 | - Consolidates all research notes 32 | - Uses an LLM to extract and format the information according to the user-defined schema 33 | - Returns the structured data in the exact format requested 34 | - **Reflection Phase**: The system evaluates the quality of extracted information: 35 | - Analyzes completeness of required fields 36 | - Identifies any missing or incomplete information 37 | - Generates targeted follow-up search queries if needed 38 | - Continues research until information is satisfactory or max reflection steps reached 39 | 40 | ## Configuration 41 | 42 | The configuration for Company Researcher Agent is defined in the `src/agent/configuration.py` file: 43 | * `max_search_queries`: int = 3 # Max search queries per company 44 | * `max_search_results`: int = 3 # Max search results per query 45 | * `max_reflection_steps`: int = 0 # Max reflection steps 46 | 47 | ## Inputs 48 | 49 | The user inputs are: 50 | 51 | ``` 52 | * company: str - A company to research 53 | * extraction_schema: Optional[dict] - A JSON schema for the output 54 | * user_notes: Optional[str] - Any additional notes about the company from the user 55 | ``` 56 | 57 | If a schema is not provided, the system will use a default schema (`DEFAULT_EXTRACTION_SCHEMA`) defined in `src/agent/state.py`. 58 | 59 | ### Schemas 60 | 61 | > ⚠️ **WARNING:** JSON schemas require `title` and `description` fields for [extraction](https://python.langchain.com/docs/how_to/structured_output/#typeddict-or-json-schema). 62 | > ⚠️ **WARNING:** Avoid JSON objects with nesting; LLMs have challenges performing structured extraction from nested objects. See examples below that we have tested. 63 | 64 | Here is an example schema that can be supplied to research a company: 65 | 66 | * See the trace [here](https://smith.langchain.com/public/9f51fb8b-9486-4cd2-90ed-895f7932304e/r). 67 | 68 |
69 | Example schema 70 | 71 | ``` 72 | { 73 | "title": "CompanyInfo", 74 | "description": "Basic information about a company", 75 | "type": "object", 76 | "properties": { 77 | "company_name": { 78 | "type": "string", 79 | "description": "Official name of the company" 80 | }, 81 | "founding_year": { 82 | "type": "integer", 83 | "description": "Year the company was founded" 84 | }, 85 | "founder_names": { 86 | "type": "array", 87 | "items": {"type": "string"}, 88 | "description": "Names of the founding team members" 89 | }, 90 | "product_description": { 91 | "type": "string", 92 | "description": "Brief description of the company's main product or service" 93 | }, 94 | "funding_summary": { 95 | "type": "string", 96 | "description": "Summary of the company's funding history" 97 | } 98 | }, 99 | "required": ["company_name"] 100 | } 101 | ``` 102 |
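To try a schema like this programmatically, pass it as `extraction_schema` when invoking the graph. Below is a minimal sketch that calls the compiled graph directly in Python; it assumes the package is installed locally (e.g. `pip install -e .`), that `ANTHROPIC_API_KEY` and `TAVILY_API_KEY` are set in the environment, and that the company name and configuration values are illustrative placeholders.

```python
# Minimal sketch: invoke the compiled graph directly with a custom schema.
# Assumes ANTHROPIC_API_KEY and TAVILY_API_KEY are set and the package is
# installed locally (pip install -e .).
import asyncio

from agent.graph import graph

# Abbreviated version of the example schema shown above.
example_schema = {
    "title": "CompanyInfo",
    "description": "Basic information about a company",
    "type": "object",
    "properties": {
        "company_name": {"type": "string", "description": "Official name of the company"},
        "founding_year": {"type": "integer", "description": "Year the company was founded"},
    },
    "required": ["company_name"],
}


async def main() -> None:
    # `company` is required; `extraction_schema` and `user_notes` are optional inputs.
    # The research node is async, so the graph is invoked with `ainvoke`.
    result = await graph.ainvoke(
        {"company": "LangChain", "extraction_schema": example_schema},
        config={"configurable": {"max_search_queries": 3, "max_reflection_steps": 1}},
    )
    print(result["info"])  # structured data matching the schema


asyncio.run(main())
```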
103 | 104 | Here is an example of a more complex schema: 105 | 106 | * See the reflections steps in the trace [here](https://smith.langchain.com/public/36f0d917-4edd-4d55-8dbf-6d6ec8a25754/r). 107 | 108 |
109 | Example complex schema 110 | 111 | ``` 112 | HARD_EXTRACTION_SCHEMA = { 113 | "title": "CompanyInfo", 114 | "description": "Comprehensive information about a company with confidence tracking", 115 | "type": "object", 116 | "properties": { 117 | "company_name": { 118 | "type": "string", 119 | "description": "Official name of the company" 120 | }, 121 | "verified_company": { 122 | "type": "boolean", 123 | "description": "Confirmation this is the intended company, not a similarly named one" 124 | }, 125 | "similar_companies": { 126 | "type": "array", 127 | "items": {"type": "string"}, 128 | "description": "List of similarly named companies that could be confused with the target" 129 | }, 130 | "distinguishing_features": { 131 | "type": "string", 132 | "description": "Key features that distinguish this company from similarly named ones" 133 | }, 134 | "key_executives": { 135 | "type": "array", 136 | "items": { 137 | "type": "object", 138 | "properties": { 139 | "name": {"type": "string"}, 140 | "title": {"type": "string"}, 141 | "verification_date": {"type": "string"}, 142 | "confidence_level": { 143 | "type": "string", 144 | "enum": ["high", "medium", "low", "uncertain"] 145 | }, 146 | "source": {"type": "string"} 147 | } 148 | } 149 | }, 150 | "org_chart_summary": { 151 | "type": "string", 152 | "description": "Brief description of organizational structure" 153 | }, 154 | "leadership_caveats": { 155 | "type": "string", 156 | "description": "Any uncertainties or caveats about leadership information" 157 | }, 158 | "main_products": { 159 | "type": "array", 160 | "items": { 161 | "type": "object", 162 | "properties": { 163 | "name": {"type": "string"}, 164 | "description": {"type": "string"}, 165 | "launch_date": {"type": "string"}, 166 | "current_status": {"type": "string"} 167 | } 168 | } 169 | }, 170 | "services": { 171 | "type": "array", 172 | "items": { 173 | "type": "object", 174 | "properties": { 175 | "name": {"type": "string"}, 176 | "description": {"type": "string"}, 177 | "target_market": {"type": "string"} 178 | } 179 | } 180 | }, 181 | "recent_developments": { 182 | "type": "array", 183 | "items": { 184 | "type": "object", 185 | "properties": { 186 | "date": {"type": "string"}, 187 | "title": {"type": "string"}, 188 | "summary": {"type": "string"}, 189 | "source_url": {"type": "string"}, 190 | "significance": {"type": "string"} 191 | } 192 | }, 193 | "description": "Major news and developments from the last 6 months" 194 | }, 195 | "historical_challenges": { 196 | "type": "array", 197 | "items": { 198 | "type": "object", 199 | "properties": { 200 | "issue_type": {"type": "string"}, 201 | "description": {"type": "string"}, 202 | "date_period": {"type": "string"}, 203 | "resolution": {"type": "string"}, 204 | "current_status": {"type": "string"} 205 | } 206 | }, 207 | "description": "Past challenges, issues, or controversies" 208 | }, 209 | "sources": { 210 | "type": "array", 211 | "items": { 212 | "type": "object", 213 | "properties": { 214 | "url": {"type": "string"}, 215 | "title": {"type": "string"}, 216 | "date_accessed": {"type": "string"}, 217 | "information_type": { 218 | "type": "array", 219 | "items": {"type": "string"}, 220 | "description": "Types of information sourced from this link (e.g., leadership, products, news)" 221 | } 222 | } 223 | } 224 | }, 225 | "company_summary": { 226 | "type": "string", 227 | "description": "Concise, dense summary of the most important company information (max 250 words)" 228 | } 229 | }, 230 | "required": [ 231 | "company_name", 232 
| "verified_company", 233 | "company_summary", 234 | "key_executives", 235 | "main_products", 236 | "sources" 237 | ] 238 | } 239 | ``` 240 |
241 | 242 | 243 | ## Evaluation 244 | 245 | Prior to engaging in any optimization, it is important to establish a baseline performance. This repository includes: 246 | 247 | 1. A dataset consisting of a list of companies and the expected structured information to be extracted for each company. 248 | 2. An evaluation script that can be used to evaluate the agent on this dataset. 249 | 250 | ### Set up 251 | 252 | Make sure you have the LangSmith CLI installed: 253 | 254 | ```shell 255 | pip install langsmith 256 | ``` 257 | 258 | And set your API key: 259 | 260 | ```shell 261 | export LANGSMITH_API_KEY= 262 | export ANTHROPIC_API_KEY= 263 | ``` 264 | 265 | ### Evaluation metric 266 | 267 | A score between 0 and 1 is assigned to each extraction result by an LLM model that acts 268 | as a judge. 269 | 270 | The model assigns the score based on how closely the extracted information matches the expected information. 271 | 272 | ### Get the dataset 273 | 274 | Create a new dataset in LangSmith using the code in the `eval` folder: 275 | 276 | ```shell 277 | python eval/create_dataset.py 278 | ``` 279 | 280 | ### Run the evaluation 281 | 282 | To run the evaluation, you can use the `run_eval.py` script in the `eval` folder. This will create a new experiment in LangSmith for the dataset you created in the previous step. 283 | 284 | ```shell 285 | python eval/run_eval.py --experiment-prefix "My custom prefix" --agent-url http://localhost:2024 286 | ``` 287 | -------------------------------------------------------------------------------- /eval/create_dataset.py: -------------------------------------------------------------------------------- 1 | EXAMPLES = [ 2 | { 3 | "company": "LangChain", 4 | "info": { 5 | "name": "LangChain, Inc.", 6 | "description": "LangChain helps developers to build applications powered by large language models (LLMs). It provides tools and frameworks to integrate LLMs with external data sources and APIs, facilitating the creation of advanced AI applications.", 7 | "website": "https://www.langchain.com", 8 | "crunchbase_profile": "https://www.crunchbase.com/organization/langchain", 9 | "year_founded": 2022, 10 | "ceo": "Harrison Chase", 11 | "total_funding_mm_usd": 35.0, 12 | "latest_round": "Series A", 13 | "latest_round_date": "2024-02-15", 14 | "latest_round_amount_mm_usd": 25.0, 15 | }, 16 | }, 17 | { 18 | "company": "Kensho", 19 | "info": { 20 | "name": "Kensho Technologies, LLC.", 21 | "description": "Kensho Technologies, a subsidiary of S&P Global, specializes in developing advanced analytics and machine learning solutions for the financial industry. Their products include tools for natural language processing, data extraction, and linking, enabling clients to derive actionable insights from complex data sets.", 22 | "website": "https://kensho.com/", 23 | "crunchbase_profile": "https://www.crunchbase.com/organization/kensho", 24 | "year_founded": 2013, 25 | "ceo": "Bhavesh Dayalji", 26 | "total_funding_mm_usd": 81.1, 27 | "latest_round": "Series B", 28 | "latest_round_date": "2017-02-28", 29 | "latest_round_amount_mm_usd": 50.0, 30 | }, 31 | }, 32 | { 33 | "company": "Robust Intelligence", 34 | "info": { 35 | "name": "Robust Intelligence, Inc.", 36 | "description": "Robust Intelligence offers an AI application security platform designed to protect machine learning models from various threats, including data poisoning and adversarial attacks. 
Their solutions ensure the integrity and reliability of AI systems across diverse industries.", 37 | "website": "https://www.robustintelligence.com/", 38 | "crunchbase_profile": "https://www.crunchbase.com/organization/robust-intelligence", 39 | "year_founded": 2019, 40 | "ceo": "Yaron Singer", 41 | "total_funding_mm_usd": 44.0, 42 | "latest_round": "Series B", 43 | "latest_round_date": "2021-12-09", 44 | "latest_round_amount_mm_usd": 30.0, 45 | }, 46 | }, 47 | { 48 | "company": "Perplexity.ai", 49 | "info": { 50 | "name": "Perplexity AI, Inc.", 51 | "description": "Perplexity.ai is an AI-powered search engine that delivers concise and accurate answers to user queries. It leverages advanced natural language processing to provide direct responses, enhancing the search experience.", 52 | "website": "https://www.perplexity.ai", 53 | "crunchbase_profile": "https://www.crunchbase.com/organization/perplexity-ai", 54 | "year_founded": 2022, 55 | "ceo": "Aravind Srinivas", 56 | "total_funding_mm_usd": 165.0, 57 | "latest_round": "Series B", 58 | "latest_round_date": "2024-04-23", 59 | "latest_round_amount_mm_usd": 62.7, 60 | }, 61 | }, 62 | { 63 | "company": "Physical Intelligence.ai", 64 | "info": { 65 | "name": "Physical Intelligence.ai", 66 | "description": "Physical Intelligence.ai specializes in developing AI solutions that enhance human physical capabilities. Their technologies focus on improving physical performance and health through intelligent systems.", 67 | "website": "https://www.physicalintelligence.company/", 68 | "crunchbase_profile": "https://www.crunchbase.com/organization/physical-intelligence-834b", 69 | "year_founded": 2023, 70 | "ceo": "Karol Hausman", 71 | "total_funding_mm_usd": 470.0, 72 | "latest_round": "Series A", 73 | "latest_round_date": "2024-11-04", 74 | "latest_round_amount_mm_usd": 400.0, 75 | }, 76 | }, 77 | { 78 | "company": "Galileo.ai", 79 | "info": { 80 | "name": "Galileo AI, Inc.", 81 | "description": "Galileo.ai offers AI-driven design tools that assist in creating user interfaces and experiences. Their platform automates design processes, enabling rapid prototyping and iteration for designers and developers.", 82 | "website": "https://www.usegalileo.ai/", 83 | "crunchbase_profile": "https://www.crunchbase.com/organization/galileo-ai", 84 | "year_founded": 2022, 85 | "ceo": "Arnaud Benard", 86 | "total_funding_mm_usd": 4.8, 87 | "latest_round": "Seed", 88 | "latest_round_date": "2024-02-06", 89 | "latest_round_amount_mm_usd": 4.4, 90 | }, 91 | }, 92 | { 93 | "company": "Sierra.ai", 94 | "info": { 95 | "name": "Sierra Technologies, Inc.", 96 | "description": "Sierra.ai develops AI-powered safety and compliance solutions for the trucking industry. Their technology aims to enhance driver safety, ensure regulatory compliance, and improve operational efficiency.", 97 | "website": "https://sierra.ai/", 98 | "crunchbase_profile": "https://www.crunchbase.com/organization/sierra-1124", 99 | "year_founded": 2023, 100 | "ceo": "Clay Bavor", 101 | "total_funding_mm_usd": 285.0, 102 | "latest_round": "Series B", 103 | "latest_round_date": "2024-10-28", 104 | "latest_round_amount_mm_usd": 175.0, 105 | }, 106 | }, 107 | { 108 | "company": "Rad AI", 109 | "info": { 110 | "name": "Rad AI, Inc.", 111 | "description": "Rad AI provides artificial intelligence solutions for radiology, aiming to improve diagnostic accuracy and efficiency. 
Their platform assists radiologists by automating routine tasks and enhancing image analysis.", 112 | "website": "https://www.radai.com", 113 | "crunchbase_profile": "https://www.crunchbase.com/organization/radai", 114 | "year_founded": 2018, 115 | "ceo": "Doktor Gurson", 116 | "total_funding_mm_usd": 83.0, 117 | "latest_round": "Series B", 118 | "latest_round_date": "2024-05-07", 119 | "latest_round_amount_mm_usd": 50.0, 120 | }, 121 | }, 122 | { 123 | "company": "Together AI", 124 | "info": { 125 | "name": "Together, Inc.", 126 | "description": "Together AI focuses on building open-source models and tools for natural language processing. They aim to make advanced AI technologies accessible and collaborative for researchers and developers.", 127 | "website": "https://www.together.ai/", 128 | "crunchbase_profile": "https://www.crunchbase.com/organization/together-ai", 129 | "year_founded": 2022, 130 | "ceo": "Vipul Ved Prakash", 131 | "total_funding_mm_usd": 228.5, 132 | "latest_round": "Series A", 133 | "latest_round_date": "2024-03-13", 134 | "latest_round_amount_mm_usd": 106.0, 135 | }, 136 | }, 137 | { 138 | "company": "Omneky", 139 | "info": { 140 | "name": "Omneky Inc.", 141 | "description": "Omneky utilizes AI to create personalized advertising content across digital platforms. Their platform analyzes data to generate targeted ads, optimizing marketing strategies for businesses.", 142 | "website": "https://www.omneky.com", 143 | "crunchbase_profile": "https://www.crunchbase.com/organization/omneky", 144 | "year_founded": 2018, 145 | "ceo": "Hikari Senju", 146 | "total_funding_mm_usd": 13.0, 147 | "latest_round": "Seed", 148 | "latest_round_date": "2022-11-15", 149 | "latest_round_amount_mm_usd": 10.0, 150 | }, 151 | }, 152 | { 153 | "company": "Curai Health", 154 | "info": { 155 | "name": "Curai, Inc.", 156 | "description": "Curai Health offers AI-assisted primary care services, combining artificial intelligence with medical expertise to provide accessible and affordable healthcare solutions.", 157 | "website": "https://www.curaihealth.com", 158 | "crunchbase_profile": "https://www.crunchbase.com/organization/curai", 159 | "year_founded": 2017, 160 | "ceo": "Neal Khosla", 161 | "total_funding_mm_usd": 38.2, 162 | "latest_round": "Series B", 163 | "latest_round_date": "2020-12-16", 164 | "latest_round_amount_mm_usd": 27.5, 165 | }, 166 | }, 167 | { 168 | "company": "Decagon.ai", 169 | "info": { 170 | "name": "Decagon AI, Inc.", 171 | "description": "Decagon.ai develops enterprise-grade generative AI agents for customer support, enabling businesses to provide efficient and personalized customer service experiences.", 172 | "website": "https://decagon.ai", 173 | "crunchbase_profile": "https://www.crunchbase.com/organization/decagon-485e", 174 | "year_founded": 2023, 175 | "ceo": "Jesse Zhang", 176 | "total_funding_mm_usd": 100.0, 177 | "latest_round": "Series B", 178 | "latest_round_date": "2024-10-15", 179 | "latest_round_amount_mm_usd": 65.0, 180 | }, 181 | }, 182 | { 183 | "company": "Xaira Therapeutics", 184 | "info": { 185 | "name": "Xaira Therapeutics", 186 | "description": "Xaira Therapeutics is a biotechnology company leveraging artificial intelligence for drug discovery and development, aiming to deliver transformative medicines.", 187 | "website": "https://xaira.com/", 188 | "crunchbase_profile": "https://www.crunchbase.com/organization/xaira-therapeutics", 189 | "year_founded": 2023, 190 | "ceo": "Marc Tessier-Lavigne", 191 | "total_funding_mm_usd": 1000.0, 192 | 
"latest_round": "Series A", 193 | "latest_round_date": "2024-04-23", 194 | "latest_round_amount_mm_usd": 1000.0, 195 | }, 196 | }, 197 | { 198 | "company": "Regie.ai", 199 | "info": { 200 | "name": "Regie.ai", 201 | "description": "Regie.ai provides generative AI tools for sales teams, automating content creation and streamlining communication processes to enhance sales efficiency.", 202 | "website": "https://www.regie.ai/", 203 | "crunchbase_profile": "https://www.crunchbase.com/organization/regie-da23", 204 | "year_founded": 2020, 205 | "ceo": "Srinath Sridhar", 206 | "total_funding_mm_usd": 20.8, 207 | "latest_round": "Series A", 208 | "latest_round_date": "2023-02-09", 209 | "latest_round_amount_mm_usd": 6.0, 210 | }, 211 | }, 212 | { 213 | "company": "Bifrost AI", 214 | "info": { 215 | "name": "Bifrost AI, Inc.", 216 | "description": "Bifrost AI specializes in generating synthetic data for AI and robotics, enabling faster training and validation of models without the need for real-world data.", 217 | "website": "https://www.bifrost.ai", 218 | "crunchbase_profile": "https://www.crunchbase.com/organization/bifrost", 219 | "year_founded": 2020, 220 | "ceo": "Charles Wong", 221 | "total_funding_mm_usd": 13.1, 222 | "latest_round": "Series A", 223 | "latest_round_date": "2024-10-30", 224 | "latest_round_amount_mm_usd": 8.0, 225 | }, 226 | }, 227 | { 228 | "company": "Recraft", 229 | "info": { 230 | "name": "Recraft, Inc", 231 | "description": "Recraft offers an AI-powered design tool for creating and editing images, providing features like image generation, vectorization, and mockup creation for professional designers.", 232 | "website": "https://www.recraft.ai", 233 | "crunchbase_profile": "https://www.crunchbase.com/organization/recraft", 234 | "year_founded": 2022, 235 | "ceo": "Anna Veronika Dorogush", 236 | "total_funding_mm_usd": 12.0, 237 | "latest_round": "Series A", 238 | "latest_round_date": "2024-01-18", 239 | "latest_round_amount_mm_usd": 12.0, 240 | }, 241 | }, 242 | { 243 | "company": "Brightseed", 244 | "info": { 245 | "name": "Brightseed, Inc", 246 | "description": "Brightseed utilizes artificial intelligence to discover bioactive compounds in nature that can restore human health, focusing on the intersection of nature, science, and humanity.", 247 | "website": "https://www.brightseedbio.com", 248 | "crunchbase_profile": "https://www.crunchbase.com/organization/brightseed", 249 | "year_founded": 2017, 250 | "ceo": "Jim Flatt", 251 | "total_funding_mm_usd": 120.8, 252 | "latest_round": "Series B", 253 | "latest_round_date": "2022-05-09", 254 | "latest_round_amount_mm_usd": 68.0, 255 | }, 256 | }, 257 | { 258 | "company": "Etched.ai", 259 | "info": { 260 | "name": "Etched.ai, Inc.", 261 | "description": "Etched.ai is developing the world's first transformer ASIC, a specialized chip designed to run AI models faster and more efficiently than traditional GPUs.", 262 | "website": "https://www.etched.com", 263 | "crunchbase_profile": "https://www.crunchbase.com/organization/etched-ai", 264 | "year_founded": 2022, 265 | "ceo": "Gavin Uberti", 266 | "total_funding_mm_usd": 125.4, 267 | "latest_round": "Series A", 268 | "latest_round_date": "2024-06-25", 269 | "latest_round_amount_mm_usd": 120.0, 270 | }, 271 | }, 272 | { 273 | "company": "World Labs", 274 | "info": { 275 | "name": "World Labs Technologies", 276 | "description": "World Labs is an AI-focused company dedicated to advancing artificial intelligence technologies and applications across various sectors.", 277 | "website": 
"https://www.worldlabs.ai", 278 | "crunchbase_profile": "https://www.crunchbase.com/organization/world-labs", 279 | "year_founded": 2024, 280 | "ceo": "Fei-Fei Li", 281 | "total_funding_mm_usd": 230.0, 282 | "latest_round": "Series A", 283 | "latest_round_date": "2024-09-13", 284 | "latest_round_amount_mm_usd": 230.0, 285 | }, 286 | }, 287 | { 288 | "company": "Sight Machine", 289 | "info": { 290 | "name": "Sight Machine Inc.", 291 | "description": "Sight Machine provides manufacturing analytics powered by AI, offering real-time insights to improve production efficiency and quality.", 292 | "website": "https://sightmachine.com", 293 | "crunchbase_profile": "https://www.crunchbase.com/organization/sight-machine", 294 | "year_founded": 2011, 295 | "ceo": "Jon Sobel", 296 | "total_funding_mm_usd": 80.4, 297 | "latest_round": "Series C", 298 | "latest_round_date": "2019-04-23", 299 | "latest_round_amount_mm_usd": 29.4, 300 | }, 301 | }, 302 | { 303 | "company": "Ambience Healthcare", 304 | "info": { 305 | "name": "Ambience Healthcare, Inc.", 306 | "description": "Ambience Healthcare offers AI-powered scribe solutions for healthcare providers, automating clinical documentation to reduce clinician burnout and improve care quality.", 307 | "website": "https://www.ambiencehealthcare.com", 308 | "crunchbase_profile": "https://www.crunchbase.com/organization/ambience-healthcare", 309 | "year_founded": 2020, 310 | "ceo": "Mike Ng", 311 | "total_funding_mm_usd": 76.3, 312 | "latest_round": "Series B", 313 | "latest_round_date": "2024-02-06", 314 | "latest_round_amount_mm_usd": 70.0, 315 | }, 316 | }, 317 | { 318 | "company": "Safely You", 319 | "info": { 320 | "name": "SafelyYou, Inc.", 321 | "description": "Safely You utilizes AI technology to reduce falls and associated risks in senior living communities, enhancing resident safety and care.", 322 | "website": "https://www.safely-you.com", 323 | "crunchbase_profile": "https://www.crunchbase.com/organization/safely-you", 324 | "year_founded": 2016, 325 | "ceo": "George Netscher", 326 | "total_funding_mm_usd": 71.3, 327 | "latest_round": "Debt", 328 | "latest_round_date": "2023-05-25", 329 | "latest_round_amount_mm_usd": 10.0, 330 | }, 331 | }, 332 | { 333 | "company": "Kintsugi.AI", 334 | "info": { 335 | "name": "KintsugiAI, Inc.", 336 | "description": "Kintsugi.AI provides sales tax automation solutions for companies globally, streamlining compliance processes and reducing errors.", 337 | "website": "trykintsugi.com", 338 | "crunchbase_profile": "https://www.crunchbase.com/organization/kintsugi-0524", 339 | "year_founded": 2022, 340 | "ceo": "Pujun Bhatnagar", 341 | "total_funding_mm_usd": 12.2, 342 | "latest_round": "Series A", 343 | "latest_round_date": "2024-11-19", 344 | "latest_round_amount_mm_usd": 4.0, 345 | }, 346 | }, 347 | ] 348 | 349 | EXTRACTION_SCHEMA = { 350 | "type": "object", 351 | "title": "company_info", 352 | "properties": { 353 | "name": {"type": "string", "description": "Official company name"}, 354 | "description": { 355 | "type": "string", 356 | "description": "Brief description of the company and its activities", 357 | }, 358 | "website": { 359 | "type": "string", 360 | "format": "uri", 361 | "description": "Company's official website URL", 362 | }, 363 | "crunchbase_profile": { 364 | "type": "string", 365 | "format": "uri", 366 | "description": "Company's Crunchbase profile URL", 367 | }, 368 | "year_founded": { 369 | "type": "integer", 370 | "minimum": 1800, 371 | "description": "Year when the company was founded", 372 | }, 
373 | "ceo": {"type": "string", "description": "Name of the company's CEO"}, 374 | "total_funding_mm_usd": { 375 | "type": "number", 376 | "minimum": 0, 377 | "description": "Total funding raised in millions of USD", 378 | }, 379 | "latest_round": { 380 | "type": "string", 381 | "description": "Type of the most recent funding round (e.g., Series A, Seed, etc.)", 382 | }, 383 | "latest_round_date": { 384 | "type": "string", 385 | "format": "date", 386 | "description": "Date of the most recent funding round (YYYY-MM-DD)", 387 | }, 388 | "latest_round_amount_mm_usd": { 389 | "type": "number", 390 | "minimum": 0, 391 | "description": "Amount raised in the most recent funding round in millions of USD", 392 | }, 393 | }, 394 | "required": [ 395 | "name", 396 | "description", 397 | "website", 398 | "crunchbase_profile", 399 | "year_founded", 400 | "ceo", 401 | "total_funding_mm_usd", 402 | "latest_round", 403 | "latest_round_date", 404 | "latest_round_amount_mm_usd", 405 | ], 406 | "description": "Company information", 407 | } 408 | 409 | if __name__ == "__main__": 410 | from langsmith import Client 411 | from langsmith.utils import LangSmithNotFoundError 412 | 413 | client = Client() 414 | dataset_name = "Startup Data Enrichment" 415 | 416 | # Storing inputs in a dataset lets us 417 | # run chains and LLMs over a shared set of examples. 418 | try: 419 | exists_dataset = client.read_dataset(dataset_name=dataset_name) 420 | print(f"Dataset '{dataset_name}' already exists.") 421 | print("You can access the dataset via the URL: ", exists_dataset.url) 422 | exit(1) 423 | except LangSmithNotFoundError: 424 | # Then let's create the dataset if it doesn't exist 425 | pass 426 | 427 | dataset = client.create_dataset( 428 | dataset_name=dataset_name, 429 | description="Evaluate ability to research information about startups (e.g., latest round, total funding, year founded etc.)", 430 | ) 431 | 432 | # Prepare inputs, outputs, and metadata for bulk creation 433 | inputs = [ 434 | {"company": record["company"], "extraction_schema": EXTRACTION_SCHEMA} 435 | for record in EXAMPLES 436 | ] 437 | outputs = [{"info": record["info"]} for record in EXAMPLES] 438 | 439 | client.create_examples( 440 | inputs=inputs, 441 | outputs=outputs, 442 | dataset_id=dataset.id, 443 | ) 444 | print(f"Dataset '{dataset_name}' created with {len(EXAMPLES)} examples.") 445 | print("You can access the dataset via the URL: ", dataset.url) 446 | --------------------------------------------------------------------------------