├── agent
│   ├── __init__.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── web_search.py
│   │   ├── file_writer.py
│   │   └── web_scraper.py
│   ├── prompts
│   │   └── __init__.py
│   ├── planner.py
│   ├── validator.py
│   └── executor.py
├── requirements.txt
├── .gitignore
├── LICENSE
├── CONTRIBUTING.md
├── README.md
└── main.py

/agent/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VC Research AI Agent - Core package
 3 | """
--------------------------------------------------------------------------------
/agent/tools/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VC Research AI Agent - Tools package
 3 | """
--------------------------------------------------------------------------------
/agent/prompts/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VC Research AI Agent - Prompts package
 3 | """
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | fastapi==0.109.2
 2 | uvicorn==0.27.1
 3 | python-dotenv==1.0.1
 4 | openai==1.55.3
 5 | httpx==0.27.2
 6 | beautifulsoup4==4.12.3
 7 | requests==2.31.0
 8 | pydantic==2.6.1
 9 | python-multipart==0.0.9
10 | duckduckgo-search==4.1.1
11 | markdown==3.5.2
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | venv/
25 | ENV/
26 | env/
27 | myenv/
28 | 
29 | # Environment Variables
30 | .env
31 | 
32 | # IDE
33 | .idea/
34 | .vscode/
35 | *.swp
36 | *.swo
37 | 
38 | # OS
39 | .DS_Store
40 | Thumbs.db
41 | 
42 | # Project specific
43 | research_output/
44 | *.log
45 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Naman Bhalla
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to VC Research AI Agent
 2 | 
 3 | We love your input! We want to make contributing to this project as easy and transparent as possible, whether it's:
 4 | 
 5 | - Reporting a bug
 6 | - Discussing the current state of the code
 7 | - Submitting a fix
 8 | - Proposing new features
 9 | - Becoming a maintainer
10 | 
11 | ## We Develop with GitHub
12 | We use GitHub to host code, track issues and feature requests, and accept pull requests.
13 | 
14 | ## Pull Request Process
15 | 
16 | 1. Fork the repo and create your branch from `main`.
17 | 2. If you've added code that should be tested, add tests.
18 | 3. If you've changed APIs, update the documentation.
19 | 4. Ensure the test suite passes.
20 | 5. Make sure your code lints.
21 | 6. Issue that pull request!
22 | 
23 | ## Any contributions you make will be under the MIT Software License
24 | When you submit code changes, your submissions are understood to be under the same [MIT License](http://choosealicense.com/licenses/mit/) that covers the project.
25 | 
26 | ## Report bugs using GitHub's issue tracker
27 | We use GitHub issues to track public bugs. Report a bug by opening a new issue in this repository.
28 | 
29 | ## Write bug reports with detail, background, and sample code
30 | 
31 | **Great Bug Reports** tend to have:
32 | 
33 | - A quick summary and/or background
34 | - Steps to reproduce
35 |   - Be specific!
36 |   - Give sample code if you can.
37 | - What you expected would happen
38 | - What actually happens
39 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
40 | 
41 | ## License
42 | By contributing, you agree that your contributions will be licensed under its MIT License.
43 | 
44 | ## References
45 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # VC Research AI Agent
 2 | 
 3 | An intelligent AI agent designed for Venture Capitalists to conduct comprehensive market research and company analysis.
 4 | 
 5 | ## Features
 6 | 
 7 | - Domain-specific research capabilities
 8 | - Automated company discovery and analysis
 9 | - Product line investigation
10 | - Detailed company profiles in markdown format
11 | - Domain summary with potential opportunities
12 | - Multi-step LLM planning and validation
13 | - Web scraping and information extraction
14 | 
15 | ## Setup
16 | 
17 | 1. Clone the repository
18 | 2. Create a virtual environment:
19 |    ```bash
20 |    python -m venv venv
21 |    source venv/bin/activate  # On Windows: venv\Scripts\activate
22 |    ```
23 | 3. Install dependencies:
24 |    ```bash
25 |    pip install -r requirements.txt
26 |    ```
27 | 4. Create a `.env` file with your OpenAI API key:
28 |    ```
29 |    OPENAI_API_KEY=your_api_key_here
30 |    ```
31 | 
32 | ## Running the Application
33 | 
34 | 1. Start the FastAPI server:
35 |    ```bash
36 |    uvicorn main:app --reload
37 |    ```
38 | 2. Access the API documentation at `http://localhost:8000/docs`
39 | 
40 | ## Project Structure
41 | 
42 | ```
43 | .
44 | ├── README.md
45 | ├── requirements.txt
46 | ├── main.py              # FastAPI application
47 | ├── agent/
48 | │   ├── __init__.py
49 | │   ├── planner.py       # LLM planning module
50 | │   ├── validator.py     # Plan validation module
51 | │   ├── executor.py      # Plan execution module
52 | │   ├── tools/           # Function calling tools
53 | │   │   ├── __init__.py
54 | │   │   ├── web_search.py
55 | │   │   ├── web_scraper.py
56 | │   │   └── file_writer.py
57 | │   └── prompts/         # LLM prompt templates
58 | │       └── __init__.py
59 | └── .env                 # Environment variables
60 | ```
61 | 
62 | ## API Endpoints
63 | 
64 | - `POST /research`: Start a new research task
65 |   - Input: Domain to research
66 |   - Output: Job ID for tracking progress
67 | 
68 | - `GET /research/{job_id}`: Get research status and results
69 |   - Output: Current status and available results
70 | 
71 | ## License
72 | 
73 | MIT License
--------------------------------------------------------------------------------
/agent/planner.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | import os
 3 | from typing import Dict
 4 | import json
 5 | from dotenv import load_dotenv
 6 | 
 7 | load_dotenv()
 8 | 
 9 | client = OpenAI()
10 | 
11 | PLANNING_PROMPT = """You are an AI research planner for a Venture Capitalist. Create a detailed step-by-step plan to research the following domain: {domain}
12 | 
13 | The plan should include:
14 | 1. Initial domain research and market analysis
15 | 2. Company identification and filtering
16 | 3. Detailed company analysis for each identified company
17 | 4. Product line investigation
18 | 5. Financial metrics gathering
19 | 6. Summary and opportunity analysis
20 | 
21 | For each step, specify:
22 | - The objective
23 | - Required tools/APIs
24 | - Expected output
25 | - Success criteria
26 | 
27 | Format the response as a JSON object with the following structure:
28 | {{
29 |     "steps": [
30 |         {{
31 |             "name": "step_name",
32 |             "objective": "step_objective",
33 |             "tools": ["tool1", "tool2"],
34 |             "expected_output": "output_description",
35 |             "success_criteria": "criteria_description"
36 |         }}
37 |     ]
38 | }}"""
39 | 
40 | async def create_research_plan(domain: str) -> Dict:
41 |     """
42 |     Create a detailed research plan using OpenAI's GPT model.
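
    A hypothetical call (the domain string is illustrative; assumes
    OPENAI_API_KEY is set in the environment):

        plan = await create_research_plan("generative AI infrastructure")
        for step in plan["steps"]:
            print(step["name"], "->", step["objective"])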
43 | 
44 |     Args:
45 |         domain (str): The domain to research
46 | 
47 |     Returns:
48 |         Dict: A structured research plan
49 |     """
50 | 
51 |     try:
52 |         # Ask the model for a structured plan as a JSON object
53 |         response = client.chat.completions.create(
54 |             model="gpt-4o-mini",
55 |             messages=[
56 |                 {"role": "system", "content": "You are an expert research planner for venture capital analysis."},
57 |                 {"role": "user", "content": PLANNING_PROMPT.format(domain=domain)}
58 |             ],
59 |             response_format={"type": "json_object"}
60 |         )
61 | 
62 |         # Parse the response into a Python dictionary
63 |         plan = json.loads(response.choices[0].message.content)
64 | 
65 |         # Validate plan structure
66 |         if not isinstance(plan, dict) or "steps" not in plan:
67 |             raise ValueError("Invalid plan structure received from OpenAI")
68 | 
69 |         return plan
70 | 
71 |     except Exception as e:
72 |         print(f"Error creating research plan: {str(e)}")
73 |         raise Exception(f"Error creating research plan: {str(e)}")
74 | 
75 | # Available research tools
76 | AVAILABLE_TOOLS = {
77 |     "web_search": "Search the internet for relevant information",
78 |     "company_scraper": "Extract information from company websites",
79 |     "financial_data": "Gather financial metrics and valuations",
80 |     "market_analysis": "Analyze market trends and opportunities",
81 |     "document_writer": "Create formatted markdown documents"
82 | }
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from fastapi import FastAPI, HTTPException, BackgroundTasks
 2 | from pydantic import BaseModel
 3 | from typing import Dict, Optional
 4 | import uuid
 5 | from agent.planner import create_research_plan
 6 | from agent.validator import validate_plan
 7 | from agent.executor import execute_plan
 8 | import os
 9 | from dotenv import load_dotenv
10 | 
11 | # Load environment variables
12 | load_dotenv()
13 | 
14 | # Initialize FastAPI app
15 | app = FastAPI(
16 |     title="VC Research AI Agent",
17 |     description="An AI-powered research assistant for Venture Capitalists",
18 |     version="1.0.0"
19 | )
20 | 
21 | # Store active research jobs
22 | research_jobs: Dict[str, dict] = {}
23 | 
24 | class ResearchRequest(BaseModel):
25 |     domain: str
26 | 
27 | class ResearchResponse(BaseModel):
28 |     job_id: str
29 |     message: str
30 | 
31 | class ResearchStatus(BaseModel):
32 |     status: str
33 |     progress: float
34 |     results: Optional[dict] = None
35 |     error: Optional[str] = None
36 | 
37 | @app.post("/research", response_model=ResearchResponse)
38 | async def start_research(request: ResearchRequest, background_tasks: BackgroundTasks):
39 | 
40 |     # Generate unique job ID
41 |     job_id = str(uuid.uuid4())
42 | 
43 |     # Initialize job status
44 |     research_jobs[job_id] = {
45 |         "status": "planning",
46 |         "progress": 0.0,
47 |         "results": None,
48 |         "error": None
49 |     }
50 | 
51 |     # Add research task to background tasks
52 |     background_tasks.add_task(
53 |         process_research,
54 |         job_id=job_id,
55 |         domain=request.domain
56 |     )
57 | 
58 |     return ResearchResponse(
59 |         job_id=job_id,
60 |         message="Research task started successfully"
61 |     )
62 | 
63 | @app.get("/research/{job_id}", response_model=ResearchStatus)
64 | async def get_research_status(job_id: str):
65 |     if job_id not in research_jobs:
66 |         raise HTTPException(status_code=404, detail="Research job not found")
67 | 
68 |     return ResearchStatus(**research_jobs[job_id])
69 | 
70 | async def process_research(job_id: str, domain:
str): 71 | try: 72 | # Step 1: Create research plan 73 | research_jobs[job_id]["status"] = "creating_plan" 74 | plan = await create_research_plan(domain) 75 | research_jobs[job_id]["progress"] = 0.2 76 | 77 | # Step 2: Validate plan 78 | research_jobs[job_id]["status"] = "validating_plan" 79 | validated_plan = await validate_plan(plan) 80 | research_jobs[job_id]["progress"] = 0.4 81 | 82 | # Step 3: Execute plan 83 | research_jobs[job_id]["status"] = "executing_plan" 84 | results = await execute_plan(validated_plan) 85 | 86 | # Update job status with results 87 | research_jobs[job_id].update({ 88 | "status": "completed", 89 | "progress": 1.0, 90 | "results": results 91 | }) 92 | 93 | except Exception as e: 94 | research_jobs[job_id].update({ 95 | "status": "failed", 96 | "error": str(e) 97 | }) 98 | 99 | if __name__ == "__main__": 100 | import uvicorn 101 | uvicorn.run(app, host="0.0.0.0", port=8008) -------------------------------------------------------------------------------- /agent/tools/web_search.py: -------------------------------------------------------------------------------- 1 | from duckduckgo_search import DDGS 2 | from typing import List, Dict 3 | import re 4 | from urllib.parse import urlparse 5 | 6 | async def search_companies(domain: str) -> List[Dict]: 7 | """ 8 | Search for companies in a specific domain using DuckDuckGo. 9 | 10 | Args: 11 | domain (str): The domain to search for companies in 12 | 13 | Returns: 14 | List[Dict]: List of company information 15 | """ 16 | companies = [] 17 | search_queries = [ 18 | f"top companies in {domain}", 19 | f"startups in {domain}", 20 | f"leading {domain} companies", 21 | f"{domain} technology companies" 22 | ] 23 | 24 | try: 25 | with DDGS() as ddgs: 26 | for query in search_queries: 27 | results = ddgs.text(query, max_results=10) 28 | for result in results: 29 | # Extract company information 30 | company = { 31 | "name": extract_company_name(result["title"]), 32 | "description": result["body"], 33 | "url": clean_url(result["link"]) 34 | } 35 | 36 | # Only add if we got a valid company name and it's not a duplicate 37 | if (company["name"] and 38 | company["url"] and 39 | not any(c["name"] == company["name"] for c in companies)): 40 | companies.append(company) 41 | 42 | return companies[:20] # Return top 20 unique companies 43 | 44 | except Exception as e: 45 | raise Exception(f"Error searching for companies: {str(e)}") 46 | 47 | def extract_company_name(title: str) -> str: 48 | """ 49 | Extract company name from search result title. 50 | 51 | Args: 52 | title (str): The search result title 53 | 54 | Returns: 55 | str: Extracted company name or None if not found 56 | """ 57 | # Common patterns to clean up titles 58 | patterns = [ 59 | r"^(.*?)\s*\|", # Remove everything after | 60 | r"^(.*?)\s*-", # Remove everything after - 61 | r"^(.*?)\s*:", # Remove everything after : 62 | r"(.*?)'s\s*.*" # Keep only the part before 's 63 | ] 64 | 65 | for pattern in patterns: 66 | match = re.match(pattern, title) 67 | if match: 68 | return match.group(1).strip() 69 | 70 | return title.strip() 71 | 72 | def clean_url(url: str) -> str: 73 | """ 74 | Clean and validate company URL. 75 | 76 | Args: 77 | url (str): The URL to clean 78 | 79 | Returns: 80 | str: Cleaned URL or None if invalid 81 | """ 82 | try: 83 | # Parse the URL 84 | parsed = urlparse(url) 85 | 86 | # Ensure it's a company website (not a news article, etc.) 
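        # Note: this is a substring check against the hostname, so it also
        # filters subdomains such as "en.wikipedia.org". It is a heuristic
        # blocklist, not a guarantee the surviving URL is a company homepage.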
87 | if any(domain in parsed.netloc for domain in [ 88 | "wikipedia.org", "linkedin.com", "facebook.com", "twitter.com", 89 | "youtube.com", "medium.com", "github.com", "crunchbase.com" 90 | ]): 91 | return None 92 | 93 | # Return base domain 94 | return f"{parsed.scheme}://{parsed.netloc}" 95 | 96 | except Exception: 97 | return None -------------------------------------------------------------------------------- /agent/validator.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | import os 3 | from typing import Dict 4 | import json 5 | from .planner import AVAILABLE_TOOLS 6 | 7 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) 8 | 9 | VALIDATION_PROMPT = """You are an AI research plan validator. Review the following research plan and validate its completeness, feasibility, and alignment with VC research goals. 10 | 11 | Research Plan: 12 | {plan} 13 | 14 | Available Tools: 15 | {tools} 16 | 17 | Validate the plan based on: 18 | 1. Completeness - Are all necessary steps included? 19 | 2. Tool Usage - Are the specified tools appropriate and available? 20 | 3. Dependencies - Are step dependencies properly ordered? 21 | 4. Output Quality - Will the expected outputs be sufficient for VC decision-making? 22 | 5. Feasibility - Can the plan be executed with the available tools? 23 | 24 | If the plan is valid, return it unchanged. If modifications are needed, return the modified plan in the same JSON format with explanations for changes. 25 | 26 | Your response should be a JSON object with: 27 | {{ 28 | "is_valid": boolean, 29 | "modifications": ["list of modifications made"], 30 | "modified_plan": original_or_modified_plan_object 31 | }}""" 32 | 33 | async def validate_plan(plan: Dict) -> Dict: 34 | """ 35 | Validate and potentially modify a research plan using OpenAI's GPT model. 36 | 37 | Args: 38 | plan (Dict): The research plan to validate 39 | 40 | Returns: 41 | Dict: The validated and potentially modified plan 42 | """ 43 | try: 44 | # Convert tools to a formatted string 45 | tools_str = "\n".join([f"- {name}: {desc}" for name, desc in AVAILABLE_TOOLS.items()]) 46 | 47 | response = client.chat.completions.create( 48 | model="gpt-4-turbo-preview", 49 | messages=[ 50 | {"role": "system", "content": "You are an expert research plan validator for venture capital analysis."}, 51 | {"role": "user", "content": VALIDATION_PROMPT.format( 52 | plan=json.dumps(plan, indent=2), 53 | tools=tools_str 54 | )} 55 | ], 56 | response_format={"type": "json_object"} 57 | ) 58 | 59 | # Parse the validation response 60 | validation_result = json.loads(response.choices[0].message.content) 61 | 62 | # Validate response structure 63 | required_keys = {"is_valid", "modifications", "modified_plan"} 64 | if not all(key in validation_result for key in required_keys): 65 | raise ValueError("Invalid validation result structure") 66 | 67 | # If the plan is valid, return the modified plan 68 | return validation_result["modified_plan"] 69 | 70 | except Exception as e: 71 | raise Exception(f"Error validating research plan: {str(e)}") 72 | 73 | def validate_step_dependencies(steps: list) -> bool: 74 | """ 75 | Validate that step dependencies are properly ordered. 
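
    A minimal sketch of the step shape this check expects (the field names
    follow the loop below; the values are illustrative):

        steps = [
            {"name": "find_companies", "expected_output": "company_list"},
            {"name": "profile_companies", "required_outputs": ["company_list"]},
        ]
        assert validate_step_dependencies(steps) is True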
76 | 77 | Args: 78 | steps (list): List of steps in the plan 79 | 80 | Returns: 81 | bool: True if dependencies are valid 82 | """ 83 | required_outputs = set() 84 | 85 | for step in steps: 86 | # Check if current step's requirements are met by previous steps 87 | if "required_outputs" in step: 88 | for req in step["required_outputs"]: 89 | if req not in required_outputs: 90 | return False 91 | 92 | # Add current step's outputs to the set 93 | if "expected_output" in step: 94 | required_outputs.add(step["expected_output"]) 95 | 96 | return True -------------------------------------------------------------------------------- /agent/tools/file_writer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | import os 3 | import json 4 | from datetime import datetime 5 | 6 | COMPANY_PROFILE_TEMPLATE = """# {company_name} 7 | 8 | ## Company Overview 9 | {company_overview} 10 | 11 | ## Product Lines 12 | {product_lines} 13 | 14 | ## Market Position 15 | {market_position} 16 | 17 | ## Technology Stack 18 | {tech_stack} 19 | 20 | ## Key Differentiators 21 | {differentiators} 22 | 23 | ## Target Customers 24 | {target_customers} 25 | 26 | ## Financial Information 27 | {financials} 28 | 29 | --- 30 | *Generated on {date}* 31 | """ 32 | 33 | async def write_company_profile(company_name: str, company_info: Dict) -> None: 34 | """ 35 | Write company information to a markdown file. 36 | 37 | Args: 38 | company_name (str): Name of the company 39 | company_info (Dict): Company information to write 40 | """ 41 | try: 42 | # Create output directory if it doesn't exist 43 | os.makedirs("research_output", exist_ok=True) 44 | 45 | # Clean company name for filename 46 | clean_name = "".join(c if c.isalnum() else "_" for c in company_name) 47 | filename = f"research_output/{clean_name}.md" 48 | 49 | # Format company information 50 | content = COMPANY_PROFILE_TEMPLATE.format( 51 | company_name=company_name, 52 | company_overview=company_info.get("company_overview", "Information not available"), 53 | product_lines=format_product_lines(company_info.get("product_lines", [])), 54 | market_position=company_info.get("market_positioning", "Information not available"), 55 | tech_stack=format_tech_stack(company_info.get("technology_stack", [])), 56 | differentiators=format_list(company_info.get("key_differentiators", [])), 57 | target_customers=company_info.get("target_customers", "Information not available"), 58 | financials=format_financials(company_info.get("revenue_valuation", {})), 59 | date=datetime.now().strftime("%Y-%m-%d %H:%M:%S") 60 | ) 61 | 62 | # Write to file 63 | with open(filename, "w") as f: 64 | f.write(content) 65 | 66 | except Exception as e: 67 | raise Exception(f"Error writing company profile: {str(e)}") 68 | 69 | async def write_domain_summary(summary: str) -> None: 70 | """ 71 | Write domain summary to a markdown file. 
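
    A hypothetical call (the summary text is illustrative; output is written
    to research_output/domain_summary.md):

        await write_domain_summary("# Fintech Infrastructure Market Overview")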
72 | 
73 |     Args:
74 |         summary (str): The domain summary to write
75 |     """
76 |     try:
77 |         # Create output directory if it doesn't exist
78 |         os.makedirs("research_output", exist_ok=True)
79 | 
80 |         filename = "research_output/domain_summary.md"
81 | 
82 |         # Add timestamp to summary
83 |         content = f"{summary}\n\n---\n*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*"
84 | 
85 |         # Write to file
86 |         with open(filename, "w") as f:
87 |             f.write(content)
88 | 
89 |     except Exception as e:
90 |         raise Exception(f"Error writing domain summary: {str(e)}")
91 | 
92 | def format_product_lines(product_lines: list) -> str:
93 |     """Format product lines as markdown."""
94 |     if not product_lines:
95 |         return "Information not available"
96 | 
97 |     result = ""
98 |     for product in product_lines:
99 |         if isinstance(product, dict):
100 |             result += f"### {product.get('name', 'Unnamed Product')}\n"
101 |             result += f"{product.get('description', '')}\n\n"
102 |             if 'features' in product:
103 |                 result += "**Features:**\n"
104 |                 for feature in product['features']:
105 |                     result += f"- {feature}\n"
106 |                 result += "\n"
107 |         else:
108 |             result += f"- {product}\n"
109 | 
110 |     return result
111 | 
112 | def format_tech_stack(tech_stack: list) -> str:
113 |     """Format technology stack as markdown."""
114 |     if not tech_stack:
115 |         return "Information not available"
116 | 
117 |     return "\n".join(f"- {tech}" for tech in tech_stack)
118 | 
119 | def format_list(items: list) -> str:
120 |     """Format a list as markdown bullet points."""
121 |     if not items:
122 |         return "Information not available"
123 | 
124 |     return "\n".join(f"- {item}" for item in items)
125 | 
126 | def format_financials(financials: Dict) -> str:
127 |     """Format financial information as markdown."""
128 |     if not financials:
129 |         return "Information not available"
130 | 
131 |     result = ""
132 |     if "revenue" in financials:
133 |         result += f"**Revenue:** {financials['revenue']}\n"
134 |     if "valuation" in financials:
135 |         result += f"**Valuation:** {financials['valuation']}\n"
136 |     if "funding" in financials:
137 |         result += f"**Funding:** {financials['funding']}\n"
138 | 
139 |     return result or "Information not available"
--------------------------------------------------------------------------------
/agent/executor.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | import os
 3 | from typing import Dict
 4 | import json
 5 | 
 6 | from .tools.web_search import search_companies
 7 | from .tools.web_scraper import scrape_company_info
 8 | from .tools.file_writer import write_company_profile, write_domain_summary
 9 | 
10 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11 | 
12 | EXECUTION_PROMPT = """You are an AI research executor. Execute the following step in the research plan and provide the results:
13 | 
14 | Step:
15 | {step}
16 | 
17 | Context:
18 | {context}
19 | 
20 | Previous Results:
21 | {previous_results}
22 | 
23 | Analyze the information and provide a structured response that includes:
24 | 1. Key findings
25 | 2. Relevant metrics
26 | 3. Important insights
27 | 4. Next steps or recommendations
28 | 
29 | Format the response as a JSON object."""
30 | 
31 | async def execute_plan(plan: Dict) -> Dict:
32 |     """
33 |     Execute a validated research plan step by step.
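
    A minimal sketch of a plan this executor accepts. Step names other than
    "company_identification" and "company_analysis" fall through to the
    generic LLM execution path; the field values here are illustrative:

        plan = {"steps": [{
            "name": "market_overview",
            "objective": "Summarize the market landscape",
            "tools": ["web_search"],
            "expected_output": "market summary",
            "success_criteria": "covers key players and trends"
        }]}
        results = await execute_plan(plan)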
34 | 35 | Args: 36 | plan (Dict): The validated research plan 37 | 38 | Returns: 39 | Dict: The research results 40 | """ 41 | results = { 42 | "companies": {}, 43 | "domain_summary": None, 44 | "execution_log": [] 45 | } 46 | 47 | context = {} 48 | 49 | try: 50 | for step in plan["steps"]: 51 | step_result = await execute_step(step, context, results) 52 | results["execution_log"].append({ 53 | "step": step["name"], 54 | "status": "completed", 55 | "result": step_result 56 | }) 57 | 58 | # Update context with step results 59 | context[step["name"]] = step_result 60 | 61 | # Generate final domain summary 62 | await generate_domain_summary(results) 63 | 64 | return results 65 | 66 | except Exception as e: 67 | results["execution_log"].append({ 68 | "step": step["name"] if "step" in locals() else "unknown", 69 | "status": "failed", 70 | "error": str(e) 71 | }) 72 | raise 73 | 74 | async def execute_step(step: Dict, context: Dict, results: Dict) -> Dict: 75 | """ 76 | Execute a single step of the research plan. 77 | 78 | Args: 79 | step (Dict): The step to execute 80 | context (Dict): The current execution context 81 | results (Dict): The current results 82 | 83 | Returns: 84 | Dict: The step execution results 85 | """ 86 | if step["name"] == "company_identification": 87 | companies = await search_companies(context.get("domain_research", {}).get("domain")) 88 | return {"companies": companies} 89 | 90 | elif step["name"] == "company_analysis": 91 | for company in context.get("company_identification", {}).get("companies", []): 92 | company_info = await scrape_company_info(company["url"]) 93 | results["companies"][company["name"]] = company_info 94 | await write_company_profile(company["name"], company_info) 95 | return {"analyzed_companies": list(results["companies"].keys())} 96 | 97 | else: 98 | # Use OpenAI to analyze and synthesize information for other steps 99 | response = client.chat.completions.create( 100 | model="gpt-4-turbo-preview", 101 | messages=[ 102 | {"role": "system", "content": "You are an expert venture capital researcher."}, 103 | {"role": "user", "content": EXECUTION_PROMPT.format( 104 | step=json.dumps(step, indent=2), 105 | context=json.dumps(context, indent=2), 106 | previous_results=json.dumps(results, indent=2) 107 | )} 108 | ], 109 | response_format={"type": "json_object"} 110 | ) 111 | 112 | return json.loads(response.choices[0].message.content) 113 | 114 | async def generate_domain_summary(results: Dict) -> None: 115 | """ 116 | Generate a comprehensive domain summary based on all research results. 117 | 118 | Args: 119 | results (Dict): The complete research results 120 | """ 121 | summary_prompt = """Based on the following research results, create a comprehensive domain summary that includes: 122 | 1. Market overview 123 | 2. Key players and their positions 124 | 3. Product trends and innovations 125 | 4. Market opportunities 126 | 5. 
Potential investment thesis
127 | 
128 | Research Results:
129 | {results}
130 | 
131 | Format the response as a markdown document."""
132 | 
133 |     response = client.chat.completions.create(
134 |         model="gpt-4-turbo-preview",
135 |         messages=[
136 |             {"role": "system", "content": "You are an expert venture capital analyst."},
137 |             {"role": "user", "content": summary_prompt.format(
138 |                 results=json.dumps(results, indent=2)
139 |             )}
140 |         ]
141 |     )
142 | 
143 |     summary = response.choices[0].message.content
144 |     await write_domain_summary(summary)
145 |     results["domain_summary"] = summary
--------------------------------------------------------------------------------
/agent/tools/web_scraper.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | from typing import Dict, Optional
 4 | import re
 5 | from openai import OpenAI
 6 | import os
 7 | import json
 8 | import asyncio
 9 | 
10 | client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11 | 
12 | ANALYSIS_PROMPT = """Analyze the following website content and extract key information about the company:
13 | 
14 | Content:
15 | {content}
16 | 
17 | Please extract and structure the following information:
18 | 1. Product lines and their features
19 | 2. Company overview
20 | 3. Market positioning
21 | 4. Technology stack (if available)
22 | 5. Key differentiators
23 | 6. Target customers
24 | 7. Revenue/valuation information (if available)
25 | 
26 | Format the response as a JSON object with these fields."""
27 | 
28 | async def scrape_company_info(url: str) -> Dict:
29 |     """
30 |     Scrape and analyze company information from their website.
31 | 
32 |     Args:
33 |         url (str): The company's website URL
34 | 
35 |     Returns:
36 |         Dict: Structured company information
37 |     """
38 |     try:
39 |         # Fetch the raw HTML first: the page finders below parse anchor tags,
40 |         # which fetch_page_content strips out when it extracts plain text
41 |         main_html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10).text
42 |         main_content = await fetch_page_content(url)
43 |         about_url = find_about_page(main_html, url)
44 |         products_url = find_products_page(main_html, url)
45 | 
46 |         # Fetch additional pages concurrently
47 |         additional_contents = await asyncio.gather(
48 |             fetch_page_content(about_url) if about_url else asyncio.sleep(0),
49 |             fetch_page_content(products_url) if products_url else asyncio.sleep(0)
50 |         )
51 | 
52 |         # Combine all content
53 |         all_content = main_content
54 |         if additional_contents[0]:
55 |             all_content += "\n\n" + additional_contents[0]
56 |         if additional_contents[1]:
57 |             all_content += "\n\n" + additional_contents[1]
58 | 
59 |         # Analyze content using OpenAI
60 |         return await analyze_content(all_content)
61 | 
62 |     except Exception as e:
63 |         raise Exception(f"Error scraping company info: {str(e)}")
64 | 
65 | async def fetch_page_content(url: Optional[str]) -> Optional[str]:
66 |     """
67 |     Fetch and extract text content from a webpage.
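
    A hypothetical call (returns None on any fetch or parse failure instead
    of raising, so callers can simply skip missing pages):

        text = await fetch_page_content("https://example.com")
        if text:
            print(text[:200])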
68 | 69 | Args: 70 | url (Optional[str]): The URL to fetch 71 | 72 | Returns: 73 | Optional[str]: Extracted text content 74 | """ 75 | if not url: 76 | return None 77 | 78 | try: 79 | headers = { 80 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 81 | } 82 | 83 | response = requests.get(url, headers=headers, timeout=10) 84 | response.raise_for_status() 85 | 86 | soup = BeautifulSoup(response.text, 'html.parser') 87 | 88 | # Remove script and style elements 89 | for script in soup(["script", "style"]): 90 | script.decompose() 91 | 92 | # Extract text content 93 | text = soup.get_text() 94 | 95 | # Clean up text 96 | lines = (line.strip() for line in text.splitlines()) 97 | chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) 98 | text = ' '.join(chunk for chunk in chunks if chunk) 99 | 100 | return text 101 | 102 | except Exception: 103 | return None 104 | 105 | def find_about_page(content: str, base_url: str) -> Optional[str]: 106 | """ 107 | Find the URL of the company's about page. 108 | 109 | Args: 110 | content (str): The main page content 111 | base_url (str): The base URL of the company website 112 | 113 | Returns: 114 | Optional[str]: URL of the about page if found 115 | """ 116 | soup = BeautifulSoup(content, 'html.parser') 117 | 118 | # Common patterns for about pages 119 | patterns = [ 120 | r'about\b', 121 | r'about-us\b', 122 | r'company\b', 123 | r'who-we-are\b' 124 | ] 125 | 126 | for link in soup.find_all('a', href=True): 127 | href = link['href'] 128 | text = link.text.lower() 129 | 130 | if any(re.search(pattern, text) or re.search(pattern, href) for pattern in patterns): 131 | return make_absolute_url(href, base_url) 132 | 133 | return None 134 | 135 | def find_products_page(content: str, base_url: str) -> Optional[str]: 136 | """ 137 | Find the URL of the company's products page. 138 | 139 | Args: 140 | content (str): The main page content 141 | base_url (str): The base URL of the company website 142 | 143 | Returns: 144 | Optional[str]: URL of the products page if found 145 | """ 146 | soup = BeautifulSoup(content, 'html.parser') 147 | 148 | # Common patterns for product pages 149 | patterns = [ 150 | r'products?\b', 151 | r'solutions\b', 152 | r'services\b', 153 | r'platform\b' 154 | ] 155 | 156 | for link in soup.find_all('a', href=True): 157 | href = link['href'] 158 | text = link.text.lower() 159 | 160 | if any(re.search(pattern, text) or re.search(pattern, href) for pattern in patterns): 161 | return make_absolute_url(href, base_url) 162 | 163 | return None 164 | 165 | def make_absolute_url(href: str, base_url: str) -> str: 166 | """ 167 | Convert a relative URL to an absolute URL. 168 | 169 | Args: 170 | href (str): The relative or absolute URL 171 | base_url (str): The base URL of the website 172 | 173 | Returns: 174 | str: The absolute URL 175 | """ 176 | if href.startswith('http'): 177 | return href 178 | elif href.startswith('//'): 179 | return f"https:{href}" 180 | elif href.startswith('/'): 181 | return f"{base_url.rstrip('/')}{href}" 182 | else: 183 | return f"{base_url.rstrip('/')}/{href.lstrip('/')}" 184 | 185 | async def analyze_content(content: str) -> Dict: 186 | """ 187 | Analyze website content using OpenAI's GPT model. 
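
    A hypothetical call (page_text is illustrative; content is truncated to
    10,000 characters before being sent to the model):

        info = await analyze_content(page_text)
        print(info.get("company_overview", "n/a"))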
188 | 189 | Args: 190 | content (str): The website content to analyze 191 | 192 | Returns: 193 | Dict: Structured analysis of the company 194 | """ 195 | try: 196 | response = client.chat.completions.create( 197 | model="gpt-4-turbo-preview", 198 | messages=[ 199 | {"role": "system", "content": "You are an expert business analyst."}, 200 | {"role": "user", "content": ANALYSIS_PROMPT.format(content=content[:10000])} # Limit content length 201 | ], 202 | response_format={"type": "json_object"} 203 | ) 204 | 205 | return json.loads(response.choices[0].message.content) 206 | 207 | except Exception as e: 208 | raise Exception(f"Error analyzing content: {str(e)}") --------------------------------------------------------------------------------