├── agent
│   ├── __init__.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── web_search.py
│   │   ├── file_writer.py
│   │   └── web_scraper.py
│   ├── prompts
│   │   └── __init__.py
│   ├── planner.py
│   ├── validator.py
│   └── executor.py
├── requirements.txt
├── .gitignore
├── LICENSE
├── CONTRIBUTING.md
├── README.md
└── main.py

/agent/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VC Research AI Agent - Core package
 3 | """
--------------------------------------------------------------------------------
/agent/tools/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VC Research AI Agent - Tools package
 3 | """
--------------------------------------------------------------------------------
/agent/prompts/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VC Research AI Agent - Prompts package
 3 | """
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi==0.109.2
 2 | uvicorn==0.27.1
 3 | python-dotenv==1.0.1
 4 | openai==1.55.3
 5 | httpx==0.27.2
 6 | beautifulsoup4==4.12.3
 7 | requests==2.31.0
 8 | pydantic==2.6.1
 9 | python-multipart==0.0.9
10 | duckduckgo-search==4.1.1
11 | markdown==3.5.2
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | venv/
25 | ENV/
26 | env/
27 | myenv/
28 | 
29 | # Environment Variables
30 | .env
31 | 
32 | # IDE
33 | .idea/
34 | .vscode/
35 | *.swp
36 | *.swo
37 | 
38 | # OS
39 | .DS_Store
40 | Thumbs.db
41 | 
42 | # Project specific
43 | research_output/
44 | *.log
45 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Naman Bhalla
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to VC Research AI Agent
 2 | 
 3 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
 4 | 
 5 | - Reporting a bug
 6 | - Discussing the current state of the code
 7 | - Submitting a fix
 8 | - Proposing new features
 9 | - Becoming a maintainer
10 | 
11 | ## We Develop with GitHub
12 | We use GitHub to host code, track issues and feature requests, and accept pull requests.
13 | 
14 | ## Pull Request Process
15 | 
16 | 1. Fork the repo and create your branch from `main`.
17 | 2. If you've added code that should be tested, add tests.
18 | 3. If you've changed APIs, update the documentation.
19 | 4. Ensure the test suite passes.
20 | 5. Make sure your code lints.
21 | 6. Issue that pull request!
22 | 
23 | ## Any contributions you make will be under the MIT Software License
24 | When you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project.
25 | 
26 | ## Report bugs using GitHub's issue tracker
27 | We use GitHub issues to track public bugs. Report a bug by opening a new issue in this repository.
28 | 
29 | ## Write bug reports with detail, background, and sample code
30 | 
31 | **Great Bug Reports** tend to have:
32 | 
33 | - A quick summary and/or background
34 | - Steps to reproduce
35 |   - Be specific!
36 |   - Give sample code if you can.
37 | - What you expected would happen
38 | - What actually happens
39 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
40 | 
41 | ## License
42 | By contributing, you agree that your contributions will be licensed under its MIT License.
43 | 
44 | ## References
45 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # VC Research AI Agent
 2 | 
 3 | An intelligent AI agent designed for Venture Capitalists to conduct comprehensive market research and company analysis.
 4 | 
 5 | ## Features
 6 | 
 7 | - Domain-specific research capabilities
 8 | - Automated company discovery and analysis
 9 | - Product line investigation
10 | - Detailed company profiles in markdown format
11 | - Domain summary with potential opportunities
12 | - Multi-step LLM planning and validation
13 | - Web scraping and information extraction
14 | 
15 | ## Setup
16 | 
17 | 1. Clone the repository
18 | 2. Create a virtual environment:
19 |    ```bash
20 |    python -m venv venv
21 |    source venv/bin/activate  # On Windows: venv\Scripts\activate
22 |    ```
23 | 3. Install dependencies:
24 |    ```bash
25 |    pip install -r requirements.txt
26 |    ```
27 | 4. Create a `.env` file with your OpenAI API key:
28 |    ```
29 |    OPENAI_API_KEY=your_api_key_here
30 |    ```
31 | 
32 | ## Running the Application
33 | 
34 | 1. Start the FastAPI server:
35 |    ```bash
36 |    uvicorn main:app --reload
37 |    ```
38 | 2. Access the API documentation at `http://localhost:8000/docs`
39 | 
40 | ## Project Structure
41 | 
42 | ```
43 | .
44 | ├── README.md
45 | ├── requirements.txt
46 | ├── main.py              # FastAPI application
47 | ├── agent/
48 | │   ├── __init__.py
49 | │   ├── planner.py       # LLM planning module
50 | │   ├── validator.py     # Plan validation module
51 | │   ├── executor.py      # Plan execution module
52 | │   ├── tools/           # Function calling tools
53 | │   │   ├── __init__.py
54 | │   │   ├── web_search.py
55 | │   │   ├── web_scraper.py
56 | │   │   └── file_writer.py
57 | │   └── prompts/         # LLM prompt templates
58 | │       └── __init__.py
59 | └── .env                 # Environment variables
60 | ```
61 | 
62 | ## API Endpoints
63 | 
64 | - `POST /research`: Start a new research task
65 |   - Input: Domain to research
66 |   - Output: Job ID for tracking progress
67 | 
68 | - `GET /research/{job_id}`: Get research status and results
69 |   - Output: Current status and available results
70 | 
71 | ## License
72 | 
73 | MIT License
--------------------------------------------------------------------------------
/agent/planner.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | import os
 3 | from typing import Dict
 4 | import json
 5 | from dotenv import load_dotenv
 6 | 
 7 | load_dotenv()
 8 | 
 9 | client = OpenAI()
10 | 
11 | PLANNING_PROMPT = """You are an AI research planner for a Venture Capitalist. Create a detailed step-by-step plan to research the following domain: {domain}
12 | 
13 | The plan should include:
14 | 1. Initial domain research and market analysis
15 | 2. Company identification and filtering
16 | 3. Detailed company analysis for each identified company
17 | 4. Product line investigation
18 | 5. Financial metrics gathering
19 | 6. Summary and opportunity analysis
20 | 
21 | For each step, specify:
22 | - The objective
23 | - Required tools/APIs
24 | - Expected output
25 | - Success criteria
26 | 
27 | Format the response as a JSON object with the following structure:
28 | {{
29 |     "steps": [
30 |         {{
31 |             "name": "step_name",
32 |             "objective": "step_objective",
33 |             "tools": ["tool1", "tool2"],
34 |             "expected_output": "output_description",
35 |             "success_criteria": "criteria_description"
36 |         }}
37 |     ]
38 | }}"""
39 | 
40 | async def create_research_plan(domain: str) -> Dict:
41 |     """
42 |     Create a detailed research plan using OpenAI's GPT model.
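
    A hypothetical call (the domain string is illustrative; assumes
    OPENAI_API_KEY is set in the environment):

        plan = await create_research_plan("generative AI infrastructure")
        for step in plan["steps"]:
            print(step["name"], "->", step["objective"])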
43 | 
44 |     Args:
45 |         domain (str): The domain to research
46 | 
47 |     Returns:
48 |         Dict: A structured research plan
49 |     """
50 | 
51 |     try:
52 |         # Ask the model for a structured plan as a JSON object
53 |         response = client.chat.completions.create(
54 |             model="gpt-4o-mini",
55 |             messages=[
56 |                 {"role": "system", "content": "You are an expert research planner for venture capital analysis."},
57 |                 {"role": "user", "content": PLANNING_PROMPT.format(domain=domain)}
58 |             ],
59 |             response_format={"type": "json_object"}
60 |         )
61 | 
62 |         # Parse the response into a Python dictionary
63 |         plan = json.loads(response.choices[0].message.content)
64 | 
65 |         # Validate plan structure
66 |         if not isinstance(plan, dict) or "steps" not in plan:
67 |             raise ValueError("Invalid plan structure received from OpenAI")
68 | 
69 |         return plan
70 | 
71 |     except Exception as e:
72 |         print(f"Error creating research plan: {str(e)}")
73 |         raise Exception(f"Error creating research plan: {str(e)}")
74 | 
75 | # Available research tools
76 | AVAILABLE_TOOLS = {
77 |     "web_search": "Search the internet for relevant information",
78 |     "company_scraper": "Extract information from company websites",
79 |     "financial_data": "Gather financial metrics and valuations",
80 |     "market_analysis": "Analyze market trends and opportunities",
81 |     "document_writer": "Create formatted markdown documents"
82 | }
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from fastapi import FastAPI, HTTPException, BackgroundTasks
 2 | from pydantic import BaseModel
 3 | from typing import Dict, Optional
 4 | import uuid
 5 | from agent.planner import create_research_plan
 6 | from agent.validator import validate_plan
 7 | from agent.executor import execute_plan
 8 | import os
 9 | from dotenv import load_dotenv
10 | 
11 | # Load environment variables
12 | load_dotenv()
13 | 
14 | # Initialize FastAPI app
15 | app = FastAPI(
16 |     title="VC Research AI Agent",
17 |     description="An AI-powered research assistant for Venture Capitalists",
18 |     version="1.0.0"
19 | )
20 | 
21 | # Store active research jobs
22 | research_jobs: Dict[str, dict] = {}
23 | 
24 | class ResearchRequest(BaseModel):
25 |     domain: str
26 | 
27 | class ResearchResponse(BaseModel):
28 |     job_id: str
29 |     message: str
30 | 
31 | class ResearchStatus(BaseModel):
32 |     status: str
33 |     progress: float
34 |     results: Optional[dict] = None
35 |     error: Optional[str] = None
36 | 
37 | @app.post("/research", response_model=ResearchResponse)
38 | async def start_research(request: ResearchRequest, background_tasks: BackgroundTasks):
39 | 
40 |     # Generate unique job ID
41 |     job_id = str(uuid.uuid4())
42 | 
43 |     # Initialize job status
44 |     research_jobs[job_id] = {
45 |         "status": "planning",
46 |         "progress": 0.0,
47 |         "results": None,
48 |         "error": None
49 |     }
50 | 
51 |     # Add research task to background tasks
52 |     background_tasks.add_task(
53 |         process_research,
54 |         job_id=job_id,
55 |         domain=request.domain
56 |     )
57 | 
58 |     return ResearchResponse(
59 |         job_id=job_id,
60 |         message="Research task started successfully"
61 |     )
62 | 
63 | @app.get("/research/{job_id}", response_model=ResearchStatus)
64 | async def get_research_status(job_id: str):
65 |     if job_id not in research_jobs:
66 |         raise HTTPException(status_code=404, detail="Research job not found")
67 | 
68 |     return ResearchStatus(**research_jobs[job_id])
69 | 
70 | async def process_research(job_id: str, domain:
str): 71 | try: 72 | # Step 1: Create research plan 73 | research_jobs[job_id]["status"] = "creating_plan" 74 | plan = await create_research_plan(domain) 75 | research_jobs[job_id]["progress"] = 0.2 76 | 77 | # Step 2: Validate plan 78 | research_jobs[job_id]["status"] = "validating_plan" 79 | validated_plan = await validate_plan(plan) 80 | research_jobs[job_id]["progress"] = 0.4 81 | 82 | # Step 3: Execute plan 83 | research_jobs[job_id]["status"] = "executing_plan" 84 | results = await execute_plan(validated_plan) 85 | 86 | # Update job status with results 87 | research_jobs[job_id].update({ 88 | "status": "completed", 89 | "progress": 1.0, 90 | "results": results 91 | }) 92 | 93 | except Exception as e: 94 | research_jobs[job_id].update({ 95 | "status": "failed", 96 | "error": str(e) 97 | }) 98 | 99 | if __name__ == "__main__": 100 | import uvicorn 101 | uvicorn.run(app, host="0.0.0.0", port=8008) -------------------------------------------------------------------------------- /agent/tools/web_search.py: -------------------------------------------------------------------------------- 1 | from duckduckgo_search import DDGS 2 | from typing import List, Dict 3 | import re 4 | from urllib.parse import urlparse 5 | 6 | async def search_companies(domain: str) -> List[Dict]: 7 | """ 8 | Search for companies in a specific domain using DuckDuckGo. 9 | 10 | Args: 11 | domain (str): The domain to search for companies in 12 | 13 | Returns: 14 | List[Dict]: List of company information 15 | """ 16 | companies = [] 17 | search_queries = [ 18 | f"top companies in {domain}", 19 | f"startups in {domain}", 20 | f"leading {domain} companies", 21 | f"{domain} technology companies" 22 | ] 23 | 24 | try: 25 | with DDGS() as ddgs: 26 | for query in search_queries: 27 | results = ddgs.text(query, max_results=10) 28 | for result in results: 29 | # Extract company information 30 | company = { 31 | "name": extract_company_name(result["title"]), 32 | "description": result["body"], 33 | "url": clean_url(result["link"]) 34 | } 35 | 36 | # Only add if we got a valid company name and it's not a duplicate 37 | if (company["name"] and 38 | company["url"] and 39 | not any(c["name"] == company["name"] for c in companies)): 40 | companies.append(company) 41 | 42 | return companies[:20] # Return top 20 unique companies 43 | 44 | except Exception as e: 45 | raise Exception(f"Error searching for companies: {str(e)}") 46 | 47 | def extract_company_name(title: str) -> str: 48 | """ 49 | Extract company name from search result title. 50 | 51 | Args: 52 | title (str): The search result title 53 | 54 | Returns: 55 | str: Extracted company name or None if not found 56 | """ 57 | # Common patterns to clean up titles 58 | patterns = [ 59 | r"^(.*?)\s*\|", # Remove everything after | 60 | r"^(.*?)\s*-", # Remove everything after - 61 | r"^(.*?)\s*:", # Remove everything after : 62 | r"(.*?)'s\s*.*" # Keep only the part before 's 63 | ] 64 | 65 | for pattern in patterns: 66 | match = re.match(pattern, title) 67 | if match: 68 | return match.group(1).strip() 69 | 70 | return title.strip() 71 | 72 | def clean_url(url: str) -> str: 73 | """ 74 | Clean and validate company URL. 75 | 76 | Args: 77 | url (str): The URL to clean 78 | 79 | Returns: 80 | str: Cleaned URL or None if invalid 81 | """ 82 | try: 83 | # Parse the URL 84 | parsed = urlparse(url) 85 | 86 | # Ensure it's a company website (not a news article, etc.) 
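        # Note: this is a substring check against the hostname, so it also
        # filters subdomains such as "en.wikipedia.org". It is a heuristic
        # blocklist, not a guarantee the surviving URL is a company homepage.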
87 | if any(domain in parsed.netloc for domain in [ 88 | "wikipedia.org", "linkedin.com", "facebook.com", "twitter.com", 89 | "youtube.com", "medium.com", "github.com", "crunchbase.com" 90 | ]): 91 | return None 92 | 93 | # Return base domain 94 | return f"{parsed.scheme}://{parsed.netloc}" 95 | 96 | except Exception: 97 | return None -------------------------------------------------------------------------------- /agent/validator.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import os 3 | from typing import Dict 4 | import json 5 | from .planner import AVAILABLE_TOOLS 6 | 7 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 8 | 9 | VALIDATION_PROMPT = """You are an AI research plan validator. Review the following research plan and validate its completeness, feasibility, and alignment with VC research goals. 10 | 11 | Research Plan: 12 | {plan} 13 | 14 | Available Tools: 15 | {tools} 16 | 17 | Validate the plan based on: 18 | 1. Completeness - Are all necessary steps included? 19 | 2. Tool Usage - Are the specified tools appropriate and available? 20 | 3. Dependencies - Are step dependencies properly ordered? 21 | 4. Output Quality - Will the expected outputs be sufficient for VC decision-making? 22 | 5. Feasibility - Can the plan be executed with the available tools? 23 | 24 | If the plan is valid, return it unchanged. If modifications are needed, return the modified plan in the same JSON format with explanations for changes. 25 | 26 | Your response should be a JSON object with: 27 | {{ 28 | "is_valid": boolean, 29 | "modifications": ["list of modifications made"], 30 | "modified_plan": original_or_modified_plan_object 31 | }}""" 32 | 33 | async def validate_plan(plan: Dict) -> Dict: 34 | """ 35 | Validate and potentially modify a research plan using OpenAI's GPT model. 36 | 37 | Args: 38 | plan (Dict): The research plan to validate 39 | 40 | Returns: 41 | Dict: The validated and potentially modified plan 42 | """ 43 | try: 44 | # Convert tools to a formatted string 45 | tools_str = "\n".join([f"- {name}: {desc}" for name, desc in AVAILABLE_TOOLS.items()]) 46 | 47 | response = client.chat.completions.create( 48 | model="gpt-4-turbo-preview", 49 | messages=[ 50 | {"role": "system", "content": "You are an expert research plan validator for venture capital analysis."}, 51 | {"role": "user", "content": VALIDATION_PROMPT.format( 52 | plan=json.dumps(plan, indent=2), 53 | tools=tools_str 54 | )} 55 | ], 56 | response_format={"type": "json_object"} 57 | ) 58 | 59 | # Parse the validation response 60 | validation_result = json.loads(response.choices[0].message.content) 61 | 62 | # Validate response structure 63 | required_keys = {"is_valid", "modifications", "modified_plan"} 64 | if not all(key in validation_result for key in required_keys): 65 | raise ValueError("Invalid validation result structure") 66 | 67 | # If the plan is valid, return the modified plan 68 | return validation_result["modified_plan"] 69 | 70 | except Exception as e: 71 | raise Exception(f"Error validating research plan: {str(e)}") 72 | 73 | def validate_step_dependencies(steps: list) -> bool: 74 | """ 75 | Validate that step dependencies are properly ordered. 
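
    A minimal sketch of the step shape this check expects (the field names
    follow the loop below; the values are illustrative):

        steps = [
            {"name": "find_companies", "expected_output": "company_list"},
            {"name": "profile_companies", "required_outputs": ["company_list"]},
        ]
        assert validate_step_dependencies(steps) is True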
76 | 77 | Args: 78 | steps (list): List of steps in the plan 79 | 80 | Returns: 81 | bool: True if dependencies are valid 82 | """ 83 | required_outputs = set() 84 | 85 | for step in steps: 86 | # Check if current step's requirements are met by previous steps 87 | if "required_outputs" in step: 88 | for req in step["required_outputs"]: 89 | if req not in required_outputs: 90 | return False 91 | 92 | # Add current step's outputs to the set 93 | if "expected_output" in step: 94 | required_outputs.add(step["expected_output"]) 95 | 96 | return True -------------------------------------------------------------------------------- /agent/tools/file_writer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import os 3 | import json 4 | from datetime import datetime 5 | 6 | COMPANY_PROFILE_TEMPLATE = """# {company_name} 7 | 8 | ## Company Overview 9 | {company_overview} 10 | 11 | ## Product Lines 12 | {product_lines} 13 | 14 | ## Market Position 15 | {market_position} 16 | 17 | ## Technology Stack 18 | {tech_stack} 19 | 20 | ## Key Differentiators 21 | {differentiators} 22 | 23 | ## Target Customers 24 | {target_customers} 25 | 26 | ## Financial Information 27 | {financials} 28 | 29 | --- 30 | *Generated on {date}* 31 | """ 32 | 33 | async def write_company_profile(company_name: str, company_info: Dict) -> None: 34 | """ 35 | Write company information to a markdown file. 36 | 37 | Args: 38 | company_name (str): Name of the company 39 | company_info (Dict): Company information to write 40 | """ 41 | try: 42 | # Create output directory if it doesn't exist 43 | os.makedirs("research_output", exist_ok=True) 44 | 45 | # Clean company name for filename 46 | clean_name = "".join(c if c.isalnum() else "_" for c in company_name) 47 | filename = f"research_output/{clean_name}.md" 48 | 49 | # Format company information 50 | content = COMPANY_PROFILE_TEMPLATE.format( 51 | company_name=company_name, 52 | company_overview=company_info.get("company_overview", "Information not available"), 53 | product_lines=format_product_lines(company_info.get("product_lines", [])), 54 | market_position=company_info.get("market_positioning", "Information not available"), 55 | tech_stack=format_tech_stack(company_info.get("technology_stack", [])), 56 | differentiators=format_list(company_info.get("key_differentiators", [])), 57 | target_customers=company_info.get("target_customers", "Information not available"), 58 | financials=format_financials(company_info.get("revenue_valuation", {})), 59 | date=datetime.now().strftime("%Y-%m-%d %H:%M:%S") 60 | ) 61 | 62 | # Write to file 63 | with open(filename, "w") as f: 64 | f.write(content) 65 | 66 | except Exception as e: 67 | raise Exception(f"Error writing company profile: {str(e)}") 68 | 69 | async def write_domain_summary(summary: str) -> None: 70 | """ 71 | Write domain summary to a markdown file. 
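
    A hypothetical call (the summary text is illustrative; output is written
    to research_output/domain_summary.md):

        await write_domain_summary("# Fintech Infrastructure Market Overview")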
72 | 
73 |     Args:
74 |         summary (str): The domain summary to write
75 |     """
76 |     try:
77 |         # Create output directory if it doesn't exist
78 |         os.makedirs("research_output", exist_ok=True)
79 | 
80 |         filename = "research_output/domain_summary.md"
81 | 
82 |         # Add timestamp to summary
83 |         content = f"{summary}\n\n---\n*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*"
84 | 
85 |         # Write to file
86 |         with open(filename, "w") as f:
87 |             f.write(content)
88 | 
89 |     except Exception as e:
90 |         raise Exception(f"Error writing domain summary: {str(e)}")
91 | 
92 | def format_product_lines(product_lines: list) -> str:
93 |     """Format product lines as markdown."""
94 |     if not product_lines:
95 |         return "Information not available"
96 | 
97 |     result = ""
98 |     for product in product_lines:
99 |         if isinstance(product, dict):
100 |             result += f"### {product.get('name', 'Unnamed Product')}\n"
101 |             result += f"{product.get('description', '')}\n\n"
102 |             if 'features' in product:
103 |                 result += "**Features:**\n"
104 |                 for feature in product['features']:
105 |                     result += f"- {feature}\n"
106 |                 result += "\n"
107 |         else:
108 |             result += f"- {product}\n"
109 | 
110 |     return result
111 | 
112 | def format_tech_stack(tech_stack: list) -> str:
113 |     """Format technology stack as markdown."""
114 |     if not tech_stack:
115 |         return "Information not available"
116 | 
117 |     return "\n".join(f"- {tech}" for tech in tech_stack)
118 | 
119 | def format_list(items: list) -> str:
120 |     """Format a list as markdown bullet points."""
121 |     if not items:
122 |         return "Information not available"
123 | 
124 |     return "\n".join(f"- {item}" for item in items)
125 | 
126 | def format_financials(financials: Dict) -> str:
127 |     """Format financial information as markdown."""
128 |     if not financials:
129 |         return "Information not available"
130 | 
131 |     result = ""
132 |     if "revenue" in financials:
133 |         result += f"**Revenue:** {financials['revenue']}\n"
134 |     if "valuation" in financials:
135 |         result += f"**Valuation:** {financials['valuation']}\n"
136 |     if "funding" in financials:
137 |         result += f"**Funding:** {financials['funding']}\n"
138 | 
139 |     return result or "Information not available"
--------------------------------------------------------------------------------
/agent/executor.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | import os
 3 | from typing import Dict
 4 | import json
 5 | 
 6 | from .tools.web_search import search_companies
 7 | from .tools.web_scraper import scrape_company_info
 8 | from .tools.file_writer import write_company_profile, write_domain_summary
 9 | 
10 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11 | 
12 | EXECUTION_PROMPT = """You are an AI research executor. Execute the following step in the research plan and provide the results:
13 | 
14 | Step:
15 | {step}
16 | 
17 | Context:
18 | {context}
19 | 
20 | Previous Results:
21 | {previous_results}
22 | 
23 | Analyze the information and provide a structured response that includes:
24 | 1. Key findings
25 | 2. Relevant metrics
26 | 3. Important insights
27 | 4. Next steps or recommendations
28 | 
29 | Format the response as a JSON object."""
30 | 
31 | async def execute_plan(plan: Dict) -> Dict:
32 |     """
33 |     Execute a validated research plan step by step.
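
    A minimal sketch of a plan this executor accepts. Step names other than
    "company_identification" and "company_analysis" fall through to the
    generic LLM execution path; the field values here are illustrative:

        plan = {"steps": [{
            "name": "market_overview",
            "objective": "Summarize the market landscape",
            "tools": ["web_search"],
            "expected_output": "market summary",
            "success_criteria": "covers key players and trends"
        }]}
        results = await execute_plan(plan)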
34 | 35 | Args: 36 | plan (Dict): The validated research plan 37 | 38 | Returns: 39 | Dict: The research results 40 | """ 41 | results = { 42 | "companies": {}, 43 | "domain_summary": None, 44 | "execution_log": [] 45 | } 46 | 47 | context = {} 48 | 49 | try: 50 | for step in plan["steps"]: 51 | step_result = await execute_step(step, context, results) 52 | results["execution_log"].append({ 53 | "step": step["name"], 54 | "status": "completed", 55 | "result": step_result 56 | }) 57 | 58 | # Update context with step results 59 | context[step["name"]] = step_result 60 | 61 | # Generate final domain summary 62 | await generate_domain_summary(results) 63 | 64 | return results 65 | 66 | except Exception as e: 67 | results["execution_log"].append({ 68 | "step": step["name"] if "step" in locals() else "unknown", 69 | "status": "failed", 70 | "error": str(e) 71 | }) 72 | raise 73 | 74 | async def execute_step(step: Dict, context: Dict, results: Dict) -> Dict: 75 | """ 76 | Execute a single step of the research plan. 77 | 78 | Args: 79 | step (Dict): The step to execute 80 | context (Dict): The current execution context 81 | results (Dict): The current results 82 | 83 | Returns: 84 | Dict: The step execution results 85 | """ 86 | if step["name"] == "company_identification": 87 | companies = await search_companies(context.get("domain_research", {}).get("domain")) 88 | return {"companies": companies} 89 | 90 | elif step["name"] == "company_analysis": 91 | for company in context.get("company_identification", {}).get("companies", []): 92 | company_info = await scrape_company_info(company["url"]) 93 | results["companies"][company["name"]] = company_info 94 | await write_company_profile(company["name"], company_info) 95 | return {"analyzed_companies": list(results["companies"].keys())} 96 | 97 | else: 98 | # Use OpenAI to analyze and synthesize information for other steps 99 | response = client.chat.completions.create( 100 | model="gpt-4-turbo-preview", 101 | messages=[ 102 | {"role": "system", "content": "You are an expert venture capital researcher."}, 103 | {"role": "user", "content": EXECUTION_PROMPT.format( 104 | step=json.dumps(step, indent=2), 105 | context=json.dumps(context, indent=2), 106 | previous_results=json.dumps(results, indent=2) 107 | )} 108 | ], 109 | response_format={"type": "json_object"} 110 | ) 111 | 112 | return json.loads(response.choices[0].message.content) 113 | 114 | async def generate_domain_summary(results: Dict) -> None: 115 | """ 116 | Generate a comprehensive domain summary based on all research results. 117 | 118 | Args: 119 | results (Dict): The complete research results 120 | """ 121 | summary_prompt = """Based on the following research results, create a comprehensive domain summary that includes: 122 | 1. Market overview 123 | 2. Key players and their positions 124 | 3. Product trends and innovations 125 | 4. Market opportunities 126 | 5. 
Potential investment thesis
127 | 
128 | Research Results:
129 | {results}
130 | 
131 | Format the response as a markdown document."""
132 | 
133 |     response = client.chat.completions.create(
134 |         model="gpt-4-turbo-preview",
135 |         messages=[
136 |             {"role": "system", "content": "You are an expert venture capital analyst."},
137 |             {"role": "user", "content": summary_prompt.format(
138 |                 results=json.dumps(results, indent=2)
139 |             )}
140 |         ]
141 |     )
142 | 
143 |     summary = response.choices[0].message.content
144 |     await write_domain_summary(summary)
145 |     results["domain_summary"] = summary
--------------------------------------------------------------------------------
/agent/tools/web_scraper.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | from typing import Dict, Optional
 4 | import re
 5 | from openai import OpenAI
 6 | import os
 7 | import json
 8 | import asyncio
 9 | 
10 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11 | 
12 | ANALYSIS_PROMPT = """Analyze the following website content and extract key information about the company:
13 | 
14 | Content:
15 | {content}
16 | 
17 | Please extract and structure the following information:
18 | 1. Product lines and their features
19 | 2. Company overview
20 | 3. Market positioning
21 | 4. Technology stack (if available)
22 | 5. Key differentiators
23 | 6. Target customers
24 | 7. Revenue/valuation information (if available)
25 | 
26 | Format the response as a JSON object with these fields."""
27 | 
28 | async def scrape_company_info(url: str) -> Dict:
29 |     """
30 |     Scrape and analyze company information from their website.
31 | 
32 |     Args:
33 |         url (str): The company's website URL
34 | 
35 |     Returns:
36 |         Dict: Structured company information
37 |     """
38 |     try:
39 |         # Fetch the raw HTML first: the page finders below parse anchor tags,
40 |         # which fetch_page_content strips out when it extracts plain text
41 |         main_html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10).text
42 |         main_content = await fetch_page_content(url)
43 |         about_url = find_about_page(main_html, url)
44 |         products_url = find_products_page(main_html, url)
45 | 
46 |         # Fetch additional pages concurrently
47 |         additional_contents = await asyncio.gather(
48 |             fetch_page_content(about_url) if about_url else asyncio.sleep(0),
49 |             fetch_page_content(products_url) if products_url else asyncio.sleep(0)
50 |         )
51 | 
52 |         # Combine all content
53 |         all_content = main_content
54 |         if additional_contents[0]:
55 |             all_content += "\n\n" + additional_contents[0]
56 |         if additional_contents[1]:
57 |             all_content += "\n\n" + additional_contents[1]
58 | 
59 |         # Analyze content using OpenAI
60 |         return await analyze_content(all_content)
61 | 
62 |     except Exception as e:
63 |         raise Exception(f"Error scraping company info: {str(e)}")
64 | 
65 | async def fetch_page_content(url: Optional[str]) -> Optional[str]:
66 |     """
67 |     Fetch and extract text content from a webpage.
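
    A hypothetical call (returns None on any fetch or parse failure instead
    of raising, so callers can simply skip missing pages):

        text = await fetch_page_content("https://example.com")
        if text:
            print(text[:200])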
68 | 69 | Args: 70 | url (Optional[str]): The URL to fetch 71 | 72 | Returns: 73 | Optional[str]: Extracted text content 74 | """ 75 | if not url: 76 | return None 77 | 78 | try: 79 | headers = { 80 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 81 | } 82 | 83 | response = requests.get(url, headers=headers, timeout=10) 84 | response.raise_for_status() 85 | 86 | soup = BeautifulSoup(response.text, 'html.parser') 87 | 88 | # Remove script and style elements 89 | for script in soup(["script", "style"]): 90 | script.decompose() 91 | 92 | # Extract text content 93 | text = soup.get_text() 94 | 95 | # Clean up text 96 | lines = (line.strip() for line in text.splitlines()) 97 | chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) 98 | text = ' '.join(chunk for chunk in chunks if chunk) 99 | 100 | return text 101 | 102 | except Exception: 103 | return None 104 | 105 | def find_about_page(content: str, base_url: str) -> Optional[str]: 106 | """ 107 | Find the URL of the company's about page. 108 | 109 | Args: 110 | content (str): The main page content 111 | base_url (str): The base URL of the company website 112 | 113 | Returns: 114 | Optional[str]: URL of the about page if found 115 | """ 116 | soup = BeautifulSoup(content, 'html.parser') 117 | 118 | # Common patterns for about pages 119 | patterns = [ 120 | r'about\b', 121 | r'about-us\b', 122 | r'company\b', 123 | r'who-we-are\b' 124 | ] 125 | 126 | for link in soup.find_all('a', href=True): 127 | href = link['href'] 128 | text = link.text.lower() 129 | 130 | if any(re.search(pattern, text) or re.search(pattern, href) for pattern in patterns): 131 | return make_absolute_url(href, base_url) 132 | 133 | return None 134 | 135 | def find_products_page(content: str, base_url: str) -> Optional[str]: 136 | """ 137 | Find the URL of the company's products page. 138 | 139 | Args: 140 | content (str): The main page content 141 | base_url (str): The base URL of the company website 142 | 143 | Returns: 144 | Optional[str]: URL of the products page if found 145 | """ 146 | soup = BeautifulSoup(content, 'html.parser') 147 | 148 | # Common patterns for product pages 149 | patterns = [ 150 | r'products?\b', 151 | r'solutions\b', 152 | r'services\b', 153 | r'platform\b' 154 | ] 155 | 156 | for link in soup.find_all('a', href=True): 157 | href = link['href'] 158 | text = link.text.lower() 159 | 160 | if any(re.search(pattern, text) or re.search(pattern, href) for pattern in patterns): 161 | return make_absolute_url(href, base_url) 162 | 163 | return None 164 | 165 | def make_absolute_url(href: str, base_url: str) -> str: 166 | """ 167 | Convert a relative URL to an absolute URL. 168 | 169 | Args: 170 | href (str): The relative or absolute URL 171 | base_url (str): The base URL of the website 172 | 173 | Returns: 174 | str: The absolute URL 175 | """ 176 | if href.startswith('http'): 177 | return href 178 | elif href.startswith('//'): 179 | return f"https:{href}" 180 | elif href.startswith('/'): 181 | return f"{base_url.rstrip('/')}{href}" 182 | else: 183 | return f"{base_url.rstrip('/')}/{href.lstrip('/')}" 184 | 185 | async def analyze_content(content: str) -> Dict: 186 | """ 187 | Analyze website content using OpenAI's GPT model. 
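
    A hypothetical call (page_text is illustrative; content is truncated to
    10,000 characters before being sent to the model):

        info = await analyze_content(page_text)
        print(info.get("company_overview", "n/a"))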
188 | 189 | Args: 190 | content (str): The website content to analyze 191 | 192 | Returns: 193 | Dict: Structured analysis of the company 194 | """ 195 | try: 196 | response = client.chat.completions.create( 197 | model="gpt-4-turbo-preview", 198 | messages=[ 199 | {"role": "system", "content": "You are an expert business analyst."}, 200 | {"role": "user", "content": ANALYSIS_PROMPT.format(content=content[:10000])} # Limit content length 201 | ], 202 | response_format={"type": "json_object"} 203 | ) 204 | 205 | return json.loads(response.choices[0].message.content) 206 | 207 | except Exception as e: 208 | raise Exception(f"Error analyzing content: {str(e)}") --------------------------------------------------------------------------------