├── assets
│   └── demo.mp4
├── src
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── cache.py
│   │   ├── history.py
│   │   ├── citations.py
│   │   ├── exports.py
│   │   ├── credibility.py
│   │   ├── web_utils.py
│   │   └── tools.py
│   ├── state.py
│   ├── llm_tracker.py
│   ├── config.py
│   ├── graph.py
│   ├── callbacks.py
│   └── agents.py
├── .gitignore
├── requirements.txt
├── pyproject.toml
├── LICENSE
├── main.py
├── outputs
│   ├── small language models_20251113_230645.txt
│   └── small language models_20251113_230645.md
├── app.py
└── README.md

/assets/demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tarun7r/deep-research-agent/HEAD/assets/demo.mp4
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | """Deep Research Agent - An efficient research agent using LangGraph."""
2 | 
3 | __version__ = "0.1.0"
4 | 
5 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | 
5 | # Virtual environments
6 | .venv/
7 | venv/
8 | 
9 | # Environment variables
10 | .env
11 | 
12 | # IDE
13 | .vscode/
14 | .idea/
15 | 
16 | # OS
17 | .DS_Store
18 | 
19 | # Project
20 | .cache/
21 | .chainlit/
22 | .files/
23 | outputs/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | langgraph>=0.2.57
2 | langchain>=0.3.13
3 | langchain-core>=0.3.13
4 | langchain-google-genai>=2.0.8
5 | langchain-ollama>=0.2.0
6 | langchain-openai>=0.2.0
7 | langchain-community>=0.3.13
8 | ddgs>=1.0.0
9 | python-dotenv>=1.0.1
10 | beautifulsoup4>=4.12.3
11 | requests>=2.32.3
12 | markdown>=3.7
13 | pydantic>=2.10.6
14 | aiohttp>=3.11.11
15 | chainlit>=1.0.0
16 | 
17 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "deep-research-agent"
3 | version = "0.1.0"
4 | description = "An efficient deep research agent using LangGraph with free tools"
5 | requires-python = ">=3.11"
6 | dependencies = [
7 |     "langgraph>=0.2.57",
8 |     "langchain>=0.3.13",
9 |     "langchain-google-genai>=2.0.8",
10 |     "langchain-community>=0.3.13",
11 |     "duckduckgo-search>=7.0.0",
12 |     "python-dotenv>=1.0.1",
13 |     "beautifulsoup4>=4.12.3",
14 |     "requests>=2.32.3",
15 |     "markdown>=3.7",
16 |     "pydantic>=2.10.6",
17 |     "aiohttp>=3.11.11",
18 | ]
19 | 
20 | [project.optional-dependencies]
21 | dev = [
22 |     "pytest>=8.3.4",
23 |     "black>=24.10.0",
24 |     "ruff>=0.8.4",
25 | ]
26 | 
27 | [build-system]
28 | requires = ["setuptools>=61.0"]
29 | build-backend = "setuptools.build_meta"
30 | 
31 | [tool.black]
32 | line-length = 100
33 | target-version = ['py311']
34 | 
35 | [tool.ruff]
36 | line-length = 100
37 | target-version = "py311"
38 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Deep Research Agent
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Utility modules for the Deep Research Agent."""
2 | 
3 | # LLM-invokable tools
4 | from src.utils.tools import (
5 |     get_research_tools,
6 |     web_search,
7 |     extract_webpage_content,
8 |     analyze_research_topic,
9 |     extract_insights_from_text,
10 |     format_citation,
11 |     validate_section_quality,
12 |     all_research_tools
13 | )
14 | 
15 | # Web utilities (for internal use)
16 | from src.utils.web_utils import WebSearchTool, ContentExtractor, is_valid_url
17 | 
18 | # Other utilities
19 | from src.utils.cache import ResearchCache
20 | from src.utils.exports import ReportExporter
21 | from src.utils.credibility import CredibilityScorer
22 | from src.utils.citations import CitationFormatter
23 | from src.utils.history import ResearchHistory
24 | 
25 | __all__ = [
26 |     # LLM Tools
27 |     'all_research_tools',
28 |     'get_research_tools',
29 |     'web_search',
30 |     'extract_webpage_content',
31 |     # Web Utils
32 |     'WebSearchTool',
33 |     'ContentExtractor',
34 |     'is_valid_url',
35 |     # Other Utils
36 |     'ResearchCache',
37 |     'ReportExporter',
38 |     'CredibilityScorer',
39 |     'CitationFormatter',
40 |     'ResearchHistory',
41 | ]
42 | 
--------------------------------------------------------------------------------
/src/utils/cache.py:
--------------------------------------------------------------------------------
1 | """Caching layer for research results to avoid redundant searches."""
2 | 
3 | import json
4 | import hashlib
5 | from pathlib import Path
6 | from typing import Optional, Dict, Any
7 | from datetime import datetime, timedelta
8 | import logging
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class ResearchCache:
14 |     """Simple file-based cache for research results."""
15 | 
16 |     def __init__(self, cache_dir: Path = Path(".cache/research")):
17 |         self.cache_dir = cache_dir
18 |         self.cache_dir.mkdir(parents=True, exist_ok=True)
19 |         self.cache_file = self.cache_dir / "cache.json"
20 |         self.cache_ttl_days = 7  # Cache expires after 7 days
21 | 
22 |         # Load existing cache
23 |         self._cache: Dict[str, Dict[str, Any]] = self._load_cache()
24 | 
25 |     def _load_cache(self) -> Dict[str, Dict[str, Any]]:
26 |         """Load cache from disk."""
27 |         if self.cache_file.exists():
28 |             try:
29 |                 with open(self.cache_file, 'r', encoding='utf-8') as f:
30 |                     cache = json.load(f)
31 |                 # Filter expired entries
32 |                 now = datetime.now()
33 |                 valid_cache = {}
34 |                 for key, value in cache.items():
35 |                     cached_time =
datetime.fromisoformat(value.get('timestamp', '2000-01-01')) 36 | if (now - cached_time).days < self.cache_ttl_days: 37 | valid_cache[key] = value 38 | return valid_cache 39 | except Exception as e: 40 | logger.warning(f"Failed to load cache: {e}") 41 | return {} 42 | return {} 43 | 44 | def _save_cache(self): 45 | """Save cache to disk.""" 46 | try: 47 | with open(self.cache_file, 'w', encoding='utf-8') as f: 48 | json.dump(self._cache, f, indent=2, default=str) 49 | except Exception as e: 50 | logger.warning(f"Failed to save cache: {e}") 51 | 52 | def _get_key(self, topic: str) -> str: 53 | """Generate cache key from topic.""" 54 | # Normalize topic (lowercase, strip whitespace) 55 | normalized = topic.lower().strip() 56 | return hashlib.md5(normalized.encode()).hexdigest() 57 | 58 | def get(self, topic: str) -> Optional[Dict[str, Any]]: 59 | """Get cached research result for a topic.""" 60 | key = self._get_key(topic) 61 | if key in self._cache: 62 | logger.info(f"Cache hit for topic: {topic}") 63 | return self._cache[key].get('data') 64 | logger.info(f"Cache miss for topic: {topic}") 65 | return None 66 | 67 | def set(self, topic: str, data: Dict[str, Any]): 68 | """Cache research result for a topic.""" 69 | key = self._get_key(topic) 70 | self._cache[key] = { 71 | 'topic': topic, 72 | 'data': data, 73 | 'timestamp': datetime.now().isoformat() 74 | } 75 | self._save_cache() 76 | logger.info(f"Cached research result for topic: {topic}") 77 | 78 | def clear(self): 79 | """Clear all cached entries.""" 80 | self._cache = {} 81 | self._save_cache() 82 | logger.info("Cache cleared") 83 | 84 | def get_stats(self) -> Dict[str, Any]: 85 | """Get cache statistics.""" 86 | return { 87 | 'total_entries': len(self._cache), 88 | 'cache_dir': str(self.cache_dir), 89 | 'cache_file': str(self.cache_file) 90 | } 91 | 92 | -------------------------------------------------------------------------------- /src/state.py: -------------------------------------------------------------------------------- 1 | """State management for the Deep Research Agent.""" 2 | 3 | from typing import Annotated, List, Dict, Optional, Literal 4 | from pydantic import BaseModel, Field 5 | from langgraph.graph import MessagesState 6 | from langchain_core.messages import BaseMessage 7 | 8 | 9 | class SearchQuery(BaseModel): 10 | """A search query with metadata.""" 11 | query: str = Field(description="The search query text") 12 | purpose: str = Field(description="Why this query is being made") 13 | completed: bool = Field(default=False) 14 | 15 | 16 | class SearchResult(BaseModel): 17 | """A search result with content.""" 18 | query: str = Field(description="The original query") 19 | title: str = Field(description="Result title") 20 | url: str = Field(description="Result URL") 21 | snippet: str = Field(description="Result snippet/summary") 22 | content: Optional[str] = Field(default=None, description="Full scraped content if available") 23 | 24 | 25 | class ReportSection(BaseModel): 26 | """A section of the research report.""" 27 | title: str = Field(description="Section title") 28 | content: str = Field(description="Section content in markdown") 29 | sources: List[str] = Field(default_factory=list, description="Source URLs used") 30 | 31 | 32 | class ResearchPlan(BaseModel): 33 | """Research plan with queries and outline.""" 34 | topic: str = Field(description="The research topic") 35 | objectives: List[str] = Field(description="Research objectives") 36 | search_queries: List[SearchQuery] = Field(description="Search queries to 
execute") 37 | report_outline: List[str] = Field(description="Outline of report sections") 38 | 39 | 40 | class ResearchState(BaseModel): 41 | """State for the research workflow.""" 42 | 43 | # User input 44 | research_topic: str = Field(description="The topic to research") 45 | 46 | # Planning phase 47 | plan: Optional[ResearchPlan] = Field(default=None, description="Research plan") 48 | 49 | # Search phase 50 | search_results: List[SearchResult] = Field( 51 | default_factory=list, 52 | description="All search results collected" 53 | ) 54 | 55 | # Synthesis phase 56 | key_findings: List[str] = Field( 57 | default_factory=list, 58 | description="Key findings extracted from search results" 59 | ) 60 | 61 | # Report generation phase 62 | report_sections: List[ReportSection] = Field( 63 | default_factory=list, 64 | description="Generated report sections" 65 | ) 66 | 67 | final_report: Optional[str] = Field( 68 | default=None, 69 | description="Complete final report in markdown" 70 | ) 71 | 72 | # Workflow control 73 | current_stage: Literal[ 74 | "planning", "searching", "synthesizing", "reporting", "complete" 75 | ] = Field(default="planning") 76 | 77 | error: Optional[str] = Field(default=None, description="Error message if any") 78 | 79 | # Metadata 80 | iterations: int = Field(default=0, description="Number of iterations") 81 | 82 | # Quality and metrics 83 | quality_score: Optional[Dict] = Field(default=None, description="Report quality metrics") 84 | credibility_scores: List[Dict] = Field(default_factory=list, description="Source credibility scores") 85 | 86 | # LLM tracking 87 | llm_calls: int = Field(default=0, description="Total number of LLM API calls") 88 | total_input_tokens: int = Field(default=0, description="Total input tokens used") 89 | total_output_tokens: int = Field(default=0, description="Total output tokens generated") 90 | llm_call_details: List[Dict] = Field(default_factory=list, description="Details of each LLM call") 91 | 92 | class Config: 93 | arbitrary_types_allowed = True 94 | 95 | -------------------------------------------------------------------------------- /src/utils/history.py: -------------------------------------------------------------------------------- 1 | """Research history tracking and persistence.""" 2 | 3 | import json 4 | from pathlib import Path 5 | from typing import List, Dict, Optional 6 | from datetime import datetime 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ResearchHistory: 13 | """Track and manage research history.""" 14 | 15 | def __init__(self, history_file: Path = Path(".cache/research_history.json")): 16 | self.history_file = history_file 17 | self.history_file.parent.mkdir(parents=True, exist_ok=True) 18 | self._history: List[Dict] = self._load_history() 19 | 20 | def _load_history(self) -> List[Dict]: 21 | """Load history from disk.""" 22 | if self.history_file.exists(): 23 | try: 24 | with open(self.history_file, 'r', encoding='utf-8') as f: 25 | return json.load(f) 26 | except Exception as e: 27 | logger.warning(f"Failed to load history: {e}") 28 | return [] 29 | return [] 30 | 31 | def _save_history(self): 32 | """Save history to disk.""" 33 | try: 34 | with open(self.history_file, 'w', encoding='utf-8') as f: 35 | json.dump(self._history, f, indent=2, default=str) 36 | except Exception as e: 37 | logger.warning(f"Failed to save history: {e}") 38 | 39 | def add_research( 40 | self, 41 | topic: str, 42 | output_file: Optional[Path] = None, 43 | quality_score: Optional[Dict] = None, 44 | 
metadata: Optional[Dict] = None 45 | ): 46 | """Add a research entry to history.""" 47 | entry = { 48 | 'topic': topic, 49 | 'timestamp': datetime.now().isoformat(), 50 | 'output_file': str(output_file) if output_file else None, 51 | 'quality_score': quality_score, 52 | 'metadata': metadata or {} 53 | } 54 | 55 | # Add to beginning of list (most recent first) 56 | self._history.insert(0, entry) 57 | 58 | # Keep only last 100 entries 59 | if len(self._history) > 100: 60 | self._history = self._history[:100] 61 | 62 | self._save_history() 63 | logger.info(f"Added research to history: {topic}") 64 | 65 | def get_recent(self, limit: int = 10) -> List[Dict]: 66 | """Get recent research entries.""" 67 | return self._history[:limit] 68 | 69 | def search_history(self, query: str) -> List[Dict]: 70 | """Search history by topic.""" 71 | query_lower = query.lower() 72 | return [ 73 | entry for entry in self._history 74 | if query_lower in entry.get('topic', '').lower() 75 | ] 76 | 77 | def get_by_topic(self, topic: str) -> Optional[Dict]: 78 | """Get most recent research for a topic.""" 79 | for entry in self._history: 80 | if entry.get('topic', '').lower() == topic.lower(): 81 | return entry 82 | return None 83 | 84 | def clear_history(self): 85 | """Clear all history.""" 86 | self._history = [] 87 | self._save_history() 88 | logger.info("History cleared") 89 | 90 | def get_stats(self) -> Dict: 91 | """Get history statistics.""" 92 | if not self._history: 93 | return { 94 | 'total_researches': 0, 95 | 'oldest': None, 96 | 'newest': None 97 | } 98 | 99 | timestamps = [datetime.fromisoformat(e['timestamp']) for e in self._history if 'timestamp' in e] 100 | 101 | return { 102 | 'total_researches': len(self._history), 103 | 'oldest': min(timestamps).isoformat() if timestamps else None, 104 | 'newest': max(timestamps).isoformat() if timestamps else None 105 | } 106 | 107 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """Main entry point for the Deep Research Agent.""" 2 | 3 | import asyncio 4 | import sys 5 | from pathlib import Path 6 | import logging 7 | 8 | from src.config import config 9 | from src.graph import run_research 10 | 11 | logging.basicConfig( 12 | level=logging.INFO, 13 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 14 | ) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | async def main(): 19 | """Main function to run the research agent.""" 20 | 21 | # Validate configuration 22 | try: 23 | config.validate_config() 24 | except ValueError as e: 25 | logger.error(f"Configuration error: {e}") 26 | sys.exit(1) 27 | 28 | # Get research topic 29 | if len(sys.argv) > 1: 30 | topic = " ".join(sys.argv[1:]) 31 | else: 32 | print("\nDeep Research Agent") 33 | print("=" * 50) 34 | topic = input("\nEnter your research topic: ").strip() 35 | 36 | if not topic: 37 | logger.error("No research topic provided") 38 | sys.exit(1) 39 | 40 | print(f"\n[INFO] Starting deep research on: {topic}\n") 41 | print("This may take several minutes. 
Please wait...\n") 42 | 43 | try: 44 | # Run the research workflow 45 | final_state = await run_research(topic, verbose=True) 46 | 47 | # LangGraph returns dict with state - access fields directly 48 | # Check for errors 49 | if final_state.get("error"): 50 | logger.error(f"Research failed: {final_state.get('error')}") 51 | sys.exit(1) 52 | 53 | # Display results 54 | print("\n" + "=" * 80) 55 | print("RESEARCH COMPLETE") 56 | print("=" * 80) 57 | 58 | if final_state.get("plan"): 59 | plan = final_state["plan"] 60 | print(f"\nResearch Plan Summary:") 61 | print(f" - Objectives: {len(plan.objectives)}") 62 | print(f" - Search Queries: {len(plan.search_queries)}") 63 | print(f" - Report Sections: {len(plan.report_outline)}") 64 | 65 | print(f"\nResearch Data Summary:") 66 | print(f" - Search Results: {len(final_state.get('search_results', []))}") 67 | print(f" - Key Findings: {len(final_state.get('key_findings', []))}") 68 | print(f" - Report Sections: {len(final_state.get('report_sections', []))}") 69 | print(f" - Iterations: {final_state.get('iterations', 0)}") 70 | 71 | # Save the report 72 | if final_state.get("final_report"): 73 | output_dir = Path("outputs") 74 | output_dir.mkdir(exist_ok=True) 75 | 76 | # Create safe filename 77 | safe_topic = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in topic) 78 | safe_topic = safe_topic[:50].strip() 79 | 80 | output_file = output_dir / f"{safe_topic}.md" 81 | final_report = final_state["final_report"] 82 | output_file.write_text(final_report, encoding='utf-8') 83 | 84 | print(f"\n[SUCCESS] Report saved to: {output_file}") 85 | print(f" Report length: {len(final_report)} characters") 86 | 87 | # Display a preview 88 | print("\n" + "=" * 80) 89 | print("REPORT PREVIEW") 90 | print("=" * 80) 91 | print(final_report[:1500]) 92 | if len(final_report) > 1500: 93 | print(f"\n... (showing first 1500 of {len(final_report)} characters)") 94 | print("\n" + "=" * 80) 95 | 96 | else: 97 | logger.warning("No report was generated") 98 | 99 | except KeyboardInterrupt: 100 | print("\n\n[WARNING] Research interrupted by user") 101 | sys.exit(0) 102 | except Exception as e: 103 | logger.error(f"[ERROR] Unexpected error: {e}", exc_info=True) 104 | sys.exit(1) 105 | 106 | 107 | if __name__ == "__main__": 108 | asyncio.run(main()) 109 | 110 | -------------------------------------------------------------------------------- /src/utils/citations.py: -------------------------------------------------------------------------------- 1 | """Citation formatting utilities for different citation styles.""" 2 | 3 | from typing import List, Dict 4 | from datetime import datetime 5 | import re 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CitationFormatter: 12 | """Format citations in different academic styles.""" 13 | 14 | def __init__(self): 15 | self.styles = ['apa', 'mla', 'chicago', 'ieee'] 16 | 17 | def format_apa(self, url: str, title: str = "", author: str = "", date: str = "") -> str: 18 | """Format citation in APA style.""" 19 | if author and date: 20 | return f"{author} ({date}). {title}. Retrieved from {url}" 21 | elif title: 22 | return f"{title}. (n.d.). 
Retrieved from {url}"
23 |         else:
24 |             return f"Retrieved from {url}"
25 | 
26 |     def format_mla(self, url: str, title: str = "", author: str = "", date: str = "") -> str:
27 |         """Format citation in MLA style."""
28 |         parts = []
29 |         if author:
30 |             parts.append(author)
31 |         if title:
32 |             parts.append(f'"{title}"')
33 |         if date:
34 |             parts.append(date)
35 |         parts.append(f"Web. {datetime.now().strftime('%d %b. %Y')}")
36 |         parts.append(f"<{url}>")
37 |         return ". ".join(parts)
38 | 
39 |     def format_chicago(self, url: str, title: str = "", author: str = "", date: str = "") -> str:
40 |         """Format citation in Chicago style."""
41 |         if author:
42 |             return f"{author}. \"{title}.\" Accessed {datetime.now().strftime('%B %d, %Y')}. {url}."
43 |         else:
44 |             return f"\"{title}.\" Accessed {datetime.now().strftime('%B %d, %Y')}. {url}."
45 | 
46 |     def format_ieee(self, url: str, title: str = "", author: str = "", date: str = "") -> str:
47 |         """Format citation in IEEE style."""
48 |         if author:
49 |             return f"{author}, \"{title},\" {url}, accessed {datetime.now().strftime('%B %d, %Y')}."
50 |         else:
51 |             return f"\"{title},\" {url}, accessed {datetime.now().strftime('%B %d, %Y')}."
52 | 
53 |     def format_references_section(
54 |         self,
55 |         urls: List[str],
56 |         style: str = 'apa',
57 |         search_results: List = None
58 |     ) -> str:
59 |         """Format a references section in the specified style.
60 | 
61 |         Args:
62 |             urls: List of URLs to cite
63 |             style: Citation style ('apa', 'mla', 'chicago', 'ieee')
64 |             search_results: Optional search results to extract metadata
65 | 
66 |         Returns:
67 |             Formatted references section
68 |         """
69 |         style = style.lower()
70 |         if style not in self.styles:
71 |             logger.warning(f"Unknown citation style '{style}', defaulting to APA")
72 |             style = 'apa'
73 | 
74 |         # Create URL to metadata mapping
75 |         url_metadata = {}
76 |         if search_results:
77 |             for result in search_results:
78 |                 if hasattr(result, 'url') and result.url:
79 |                     url_metadata[result.url] = {
80 |                         'title': getattr(result, 'title', ''),
81 |                         'snippet': getattr(result, 'snippet', '')
82 |                     }
83 | 
84 |         references = []
85 |         for i, url in enumerate(urls, 1):
86 |             metadata = url_metadata.get(url, {})
87 |             title = metadata.get('title', '')
88 | 
89 |             if style == 'apa':
90 |                 citation = self.format_apa(url, title)
91 |             elif style == 'mla':
92 |                 citation = self.format_mla(url, title)
93 |             elif style == 'chicago':
94 |                 citation = self.format_chicago(url, title)
95 |             elif style == 'ieee':
96 |                 citation = self.format_ieee(url, title)
97 |             else:
98 |                 citation = url
99 | 
100 |             references.append(f"{i}. {citation}")
101 | 
102 |         return "\n".join(references)
103 | 
104 |     def update_report_citations(
105 |         self,
106 |         report_content: str,
107 |         style: str = 'apa',
108 |         search_results: List = None
109 |     ) -> str:
110 |         """Update citations in a report to use specified style.
111 | 
112 |         This updates the references section but keeps inline citations as [1], [2], etc.
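
        Example (illustrative; ``report_md`` stands in for your own report text):

            formatter = CitationFormatter()
            updated = formatter.update_report_citations(report_md, style='ieee')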
113 | """ 114 | # Extract URLs from references section 115 | references_match = re.search( 116 | r'## References\n\n(.*?)(?=\n##|\Z)', 117 | report_content, 118 | re.DOTALL 119 | ) 120 | 121 | if not references_match: 122 | return report_content 123 | 124 | # Extract URLs from existing references 125 | url_pattern = r'https?://[^\s\)]+' 126 | existing_refs = references_match.group(1) 127 | urls = re.findall(url_pattern, existing_refs) 128 | 129 | if not urls: 130 | return report_content 131 | 132 | # Format new references section 133 | new_references = f"## References\n\n{self.format_references_section(urls, style, search_results)}" 134 | 135 | # Replace references section 136 | updated_report = re.sub( 137 | r'## References\n\n.*?(?=\n##|\Z)', 138 | new_references, 139 | report_content, 140 | flags=re.DOTALL 141 | ) 142 | 143 | return updated_report 144 | 145 | -------------------------------------------------------------------------------- /src/llm_tracker.py: -------------------------------------------------------------------------------- 1 | """LLM call tracking and token usage monitoring.""" 2 | 3 | from typing import Dict, Optional, Any 4 | import logging 5 | from functools import wraps 6 | import time 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class LLMCallTracker: 12 | """Track LLM calls and token usage.""" 13 | 14 | def __init__(self): 15 | self.calls = [] 16 | self.total_input_tokens = 0 17 | self.total_output_tokens = 0 18 | 19 | def track_call( 20 | self, 21 | agent_name: str, 22 | operation: str, 23 | input_tokens: int = 0, 24 | output_tokens: int = 0, 25 | duration: float = 0.0, 26 | model: str = "", 27 | success: bool = True, 28 | error: Optional[str] = None 29 | ) -> Dict[str, Any]: 30 | """Track an LLM call.""" 31 | call_info = { 32 | 'agent': agent_name, 33 | 'operation': operation, 34 | 'model': model, 35 | 'input_tokens': input_tokens, 36 | 'output_tokens': output_tokens, 37 | 'total_tokens': input_tokens + output_tokens, 38 | 'duration': round(duration, 2), 39 | 'success': success, 40 | 'error': error, 41 | 'timestamp': time.time() 42 | } 43 | 44 | self.calls.append(call_info) 45 | self.total_input_tokens += input_tokens 46 | self.total_output_tokens += output_tokens 47 | 48 | logger.info( 49 | f"LLM Call [{agent_name}/{operation}]: " 50 | f"{input_tokens} in + {output_tokens} out = {input_tokens + output_tokens} tokens " 51 | f"({duration:.2f}s)" 52 | ) 53 | 54 | return call_info 55 | 56 | def get_summary(self) -> Dict[str, Any]: 57 | """Get summary of all LLM calls.""" 58 | total_tokens = self.total_input_tokens + self.total_output_tokens 59 | total_duration = sum(call['duration'] for call in self.calls) 60 | 61 | # Group by agent 62 | by_agent = {} 63 | for call in self.calls: 64 | agent = call['agent'] 65 | if agent not in by_agent: 66 | by_agent[agent] = { 67 | 'calls': 0, 68 | 'input_tokens': 0, 69 | 'output_tokens': 0, 70 | 'total_tokens': 0, 71 | 'duration': 0.0 72 | } 73 | by_agent[agent]['calls'] += 1 74 | by_agent[agent]['input_tokens'] += call['input_tokens'] 75 | by_agent[agent]['output_tokens'] += call['output_tokens'] 76 | by_agent[agent]['total_tokens'] += call['total_tokens'] 77 | by_agent[agent]['duration'] += call['duration'] 78 | 79 | return { 80 | 'total_calls': len(self.calls), 81 | 'total_input_tokens': self.total_input_tokens, 82 | 'total_output_tokens': self.total_output_tokens, 83 | 'total_tokens': total_tokens, 84 | 'total_duration': round(total_duration, 2), 85 | 'by_agent': by_agent, 86 | 'successful_calls': sum(1 for c in 
self.calls if c['success']), 87 | 'failed_calls': sum(1 for c in self.calls if not c['success']) 88 | } 89 | 90 | def get_calls(self) -> list: 91 | """Get all tracked calls.""" 92 | return self.calls 93 | 94 | 95 | def estimate_tokens(text: str) -> int: 96 | """Estimate token count for text (rough approximation: 1 token ≈ 4 chars).""" 97 | return max(1, len(text) // 4) 98 | 99 | 100 | def track_llm_call(agent_name: str, operation: str, model: str = ""): 101 | """Decorator to track LLM calls.""" 102 | def decorator(func): 103 | @wraps(func) 104 | async def async_wrapper(*args, **kwargs): 105 | start_time = time.time() 106 | try: 107 | result = await func(*args, **kwargs) 108 | duration = time.time() - start_time 109 | 110 | # Try to extract token info from result if available 111 | input_tokens = kwargs.get('_input_tokens', 0) 112 | output_tokens = kwargs.get('_output_tokens', 0) 113 | 114 | # If not provided, estimate based on result 115 | if output_tokens == 0 and isinstance(result, str): 116 | output_tokens = estimate_tokens(result) 117 | 118 | return result, { 119 | 'agent': agent_name, 120 | 'operation': operation, 121 | 'model': model, 122 | 'input_tokens': input_tokens, 123 | 'output_tokens': output_tokens, 124 | 'duration': duration, 125 | 'success': True 126 | } 127 | except Exception as e: 128 | duration = time.time() - start_time 129 | logger.error(f"LLM call failed: {e}") 130 | raise 131 | 132 | @wraps(func) 133 | def sync_wrapper(*args, **kwargs): 134 | start_time = time.time() 135 | try: 136 | result = func(*args, **kwargs) 137 | duration = time.time() - start_time 138 | return result 139 | except Exception as e: 140 | duration = time.time() - start_time 141 | logger.error(f"LLM call failed: {e}") 142 | raise 143 | 144 | # Return appropriate wrapper based on function type 145 | import inspect 146 | if inspect.iscoroutinefunction(func): 147 | return async_wrapper 148 | else: 149 | return sync_wrapper 150 | 151 | return decorator 152 | 153 | -------------------------------------------------------------------------------- /src/utils/exports.py: -------------------------------------------------------------------------------- 1 | """Export research reports to various formats.""" 2 | 3 | from pathlib import Path 4 | from typing import Optional 5 | import logging 6 | import markdown 7 | from datetime import datetime 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ReportExporter: 13 | """Export reports to various formats.""" 14 | 15 | def __init__(self): 16 | self.supported_formats = ['markdown', 'html', 'txt'] 17 | 18 | def export_markdown(self, content: str, output_path: Path) -> Path: 19 | """Export as markdown (already in markdown format).""" 20 | output_path.write_text(content, encoding='utf-8') 21 | logger.info(f"Exported markdown to {output_path}") 22 | return output_path 23 | 24 | def export_html(self, content: str, output_path: Path) -> Path: 25 | """Export as HTML with styling.""" 26 | # Convert markdown to HTML 27 | html_content = markdown.markdown( 28 | content, 29 | extensions=['extra', 'codehilite', 'tables'] 30 | ) 31 | 32 | # Wrap in styled HTML template 33 | html_template = f""" 34 | 35 | 36 | 37 | 38 | Research Report 39 | 111 | 112 | 113 | {html_content} 114 | 117 | 118 | """ 119 | 120 | output_path.write_text(html_template, encoding='utf-8') 121 | logger.info(f"Exported HTML to {output_path}") 122 | return output_path 123 | 124 | def export_txt(self, content: str, output_path: Path) -> Path: 125 | """Export as plain text (strip markdown).""" 126 | 
import re 127 | # Remove markdown formatting 128 | text = content 129 | # Remove headers 130 | text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE) 131 | # Remove bold/italic 132 | text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text) 133 | text = re.sub(r'\*([^*]+)\*', r'\1', text) 134 | # Remove links but keep text 135 | text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text) 136 | # Remove code blocks 137 | text = re.sub(r'```[^`]+```', '', text, flags=re.DOTALL) 138 | text = re.sub(r'`([^`]+)`', r'\1', text) 139 | 140 | output_path.write_text(text, encoding='utf-8') 141 | logger.info(f"Exported text to {output_path}") 142 | return output_path 143 | 144 | def export(self, content: str, output_path: Path, format: str = 'markdown') -> Path: 145 | """Export content to specified format. 146 | 147 | Args: 148 | content: Report content (markdown) 149 | output_path: Output file path 150 | format: Export format ('markdown', 'html', 'txt') 151 | 152 | Returns: 153 | Path to exported file 154 | """ 155 | format = format.lower() 156 | 157 | if format not in self.supported_formats: 158 | raise ValueError(f"Unsupported format: {format}. Supported: {self.supported_formats}") 159 | 160 | # Adjust file extension if needed 161 | if format == 'html' and not output_path.suffix == '.html': 162 | output_path = output_path.with_suffix('.html') 163 | elif format == 'txt' and not output_path.suffix == '.txt': 164 | output_path = output_path.with_suffix('.txt') 165 | elif format == 'markdown' and not output_path.suffix in ['.md', '.markdown']: 166 | output_path = output_path.with_suffix('.md') 167 | 168 | if format == 'markdown': 169 | return self.export_markdown(content, output_path) 170 | elif format == 'html': 171 | return self.export_html(content, output_path) 172 | elif format == 'txt': 173 | return self.export_txt(content, output_path) 174 | else: 175 | raise ValueError(f"Export not implemented for format: {format}") 176 | 177 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | """Configuration management for the Deep Research Agent.""" 2 | 3 | import os 4 | from typing import Optional 5 | from pathlib import Path 6 | from pydantic import BaseModel, Field 7 | from dotenv import load_dotenv 8 | 9 | # Load environment variables from .env file 10 | env_path = Path(__file__).parent.parent / ".env" 11 | load_dotenv(dotenv_path=env_path) 12 | 13 | 14 | class ResearchConfig(BaseModel): 15 | """Configuration for the research agent.""" 16 | 17 | # Model Provider Configuration 18 | model_provider: str = Field( 19 | default=os.getenv("MODEL_PROVIDER", "gemini"), 20 | description="Model provider: 'gemini', 'ollama', 'openai', or 'llamacpp'" 21 | ) 22 | 23 | # API Keys 24 | google_api_key: str = Field( 25 | default_factory=lambda: os.getenv("GEMINI_API_KEY", ""), 26 | description="Google/Gemini API key (required if using Gemini)" 27 | ) 28 | 29 | openai_api_key: str = Field( 30 | default_factory=lambda: os.getenv("OPENAI_API_KEY", ""), 31 | description="OpenAI API key (required if using OpenAI)" 32 | ) 33 | 34 | # Ollama Configuration 35 | ollama_base_url: str = Field( 36 | default=os.getenv("OLLAMA_BASE_URL", "http://localhost:11434"), 37 | description="Ollama server URL" 38 | ) 39 | 40 | # llama.cpp Server Configuration 41 | llamacpp_base_url: str = Field( 42 | default=os.getenv("LLAMACPP_BASE_URL", "http://localhost:8080"), 43 | description="llama.cpp server URL (OpenAI-compatible API)" 44 | 
) 45 | 46 | # Model Configuration 47 | model_name: str = Field( 48 | default=os.getenv("MODEL_NAME", "gemini-2.5-flash"), 49 | description="Model to use for research and generation" 50 | ) 51 | 52 | summarization_model: str = Field( 53 | default=os.getenv("SUMMARIZATION_MODEL", "gemini-2.5-flash"), 54 | description="Model for summarizing search results (faster/cheaper)" 55 | ) 56 | 57 | # Search Configuration 58 | max_search_queries: int = Field( 59 | default=int(os.getenv("MAX_SEARCH_QUERIES", "3")), 60 | description="Maximum number of search queries to generate" 61 | ) 62 | 63 | max_search_results_per_query: int = Field( 64 | default=int(os.getenv("MAX_SEARCH_RESULTS_PER_QUERY", "3")), 65 | description="Maximum results to fetch per search query" 66 | ) 67 | 68 | max_parallel_searches: int = Field( 69 | default=int(os.getenv("MAX_PARALLEL_SEARCHES", "3")), 70 | description="Maximum number of parallel search operations" 71 | ) 72 | 73 | # Credibility Configuration 74 | min_credibility_score: int = Field( 75 | default=int(os.getenv("MIN_CREDIBILITY_SCORE", "40")), 76 | description="Minimum credibility score (0-100) to filter low-quality sources" 77 | ) 78 | 79 | # Report Configuration 80 | max_report_sections: int = Field( 81 | default=int(os.getenv("MAX_REPORT_SECTIONS", "8")), 82 | description="Maximum number of sections in the final report" 83 | ) 84 | 85 | min_section_words: int = Field( 86 | default=200, 87 | description="Minimum words per section" 88 | ) 89 | 90 | # Citation Configuration 91 | citation_style: str = Field( 92 | default=os.getenv("CITATION_STYLE", "apa"), 93 | description="Citation style (apa, mla, chicago, ieee)" 94 | ) 95 | 96 | # LangSmith Configuration 97 | langsmith_tracing: bool = Field( 98 | default=os.getenv("LANGCHAIN_TRACING_V2", "false").lower() == "true", 99 | description="Enable LangSmith tracing" 100 | ) 101 | 102 | langsmith_project: str = Field( 103 | default=os.getenv("LANGCHAIN_PROJECT", "deep-research-agent"), 104 | description="LangSmith project name" 105 | ) 106 | 107 | def validate_config(self) -> bool: 108 | """Validate that required configuration is present.""" 109 | if self.model_provider == "gemini": 110 | if not self.google_api_key: 111 | raise ValueError( 112 | "GEMINI_API_KEY is required when using Gemini. Get one from https://makersuite.google.com/app/apikey" 113 | ) 114 | elif self.model_provider == "ollama": 115 | # Validate Ollama is accessible 116 | try: 117 | import requests 118 | response = requests.get(f"{self.ollama_base_url}/api/tags", timeout=5) 119 | if response.status_code != 200: 120 | raise ValueError(f"Ollama server not accessible at {self.ollama_base_url}") 121 | except requests.exceptions.RequestException as e: 122 | raise ValueError(f"Cannot connect to Ollama server at {self.ollama_base_url}: {e}") 123 | elif self.model_provider == "openai": 124 | if not self.openai_api_key: 125 | raise ValueError( 126 | "OPENAI_API_KEY is required when using OpenAI. 
Get one from https://platform.openai.com/api-keys" 127 | ) 128 | elif self.model_provider == "llamacpp": 129 | # Validate llama.cpp server is accessible 130 | try: 131 | import requests 132 | response = requests.get(f"{self.llamacpp_base_url}/health", timeout=5) 133 | if response.status_code not in [200, 404]: # 404 is ok, means server is running but no health endpoint 134 | raise ValueError(f"llama.cpp server not accessible at {self.llamacpp_base_url}") 135 | except requests.exceptions.RequestException as e: 136 | raise ValueError(f"Cannot connect to llama.cpp server at {self.llamacpp_base_url}: {e}") 137 | else: 138 | raise ValueError(f"Invalid MODEL_PROVIDER: {self.model_provider}. Must be 'gemini', 'ollama', 'openai', or 'llamacpp'") 139 | 140 | return True 141 | 142 | 143 | # Global configuration instance 144 | config = ResearchConfig() 145 | 146 | # Log configuration for debugging 147 | import logging 148 | logging.basicConfig(level=logging.INFO) 149 | logger = logging.getLogger(__name__) 150 | logger.info(f"Configuration loaded - MAX_SEARCH_QUERIES: {config.max_search_queries}, " 151 | f"MAX_SEARCH_RESULTS_PER_QUERY: {config.max_search_results_per_query}, " 152 | f"MAX_REPORT_SECTIONS: {config.max_report_sections}") 153 | 154 | -------------------------------------------------------------------------------- /src/graph.py: -------------------------------------------------------------------------------- 1 | """LangGraph workflow for deep research following best practices. 2 | 3 | Nodes return dict updates that LangGraph automatically merges into state. 4 | This is the recommended pattern per LangGraph documentation. 5 | """ 6 | 7 | from langgraph.graph import StateGraph, START, END 8 | from src.state import ResearchState 9 | from src.agents import ResearchPlanner, ResearchSearcher, ResearchSynthesizer, ReportWriter 10 | from src.utils.cache import ResearchCache 11 | from src.config import config 12 | import logging 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def create_research_graph(): 19 | """Create the research workflow graph with enhanced routing and error handling.""" 20 | 21 | # Initialize agents 22 | planner = ResearchPlanner() 23 | searcher = ResearchSearcher() 24 | synthesizer = ResearchSynthesizer() 25 | writer = ReportWriter(citation_style=config.citation_style) 26 | 27 | # Define the graph 28 | workflow = StateGraph(ResearchState) 29 | 30 | # Add nodes - functions return dicts that LangGraph merges into state 31 | workflow.add_node("plan", planner.plan) 32 | workflow.add_node("search", searcher.search) 33 | workflow.add_node("synthesize", synthesizer.synthesize) 34 | workflow.add_node("write_report", writer.write_report) 35 | 36 | # Define entry point using START constant (v1.0 best practice) 37 | workflow.add_edge(START, "plan") 38 | 39 | def should_continue_after_plan(state: ResearchState) -> str: 40 | """Validate planning output and route appropriately.""" 41 | if state.error: 42 | logger.error(f"Planning failed: {state.error}") 43 | return END 44 | 45 | if not state.plan or not state.plan.search_queries: 46 | logger.error("No search queries generated in plan") 47 | state.error = "Failed to generate valid research plan" 48 | return END 49 | 50 | logger.info(f"Plan validated: {len(state.plan.search_queries)} queries") 51 | return "search" 52 | 53 | def should_continue_after_search(state: ResearchState) -> str: 54 | """Validate search results and route appropriately.""" 55 | if state.error: 56 | 
logger.error(f"Search failed: {state.error}") 57 | return END 58 | 59 | if not state.search_results: 60 | logger.warning("No search results found") 61 | state.error = "No search results available for synthesis" 62 | return END 63 | 64 | # Check minimum threshold 65 | if len(state.search_results) < 2: 66 | logger.warning(f"Insufficient search results: {len(state.search_results)}") 67 | state.error = "Insufficient data for comprehensive research" 68 | return END 69 | 70 | logger.info(f"Search validated: {len(state.search_results)} results") 71 | return "synthesize" 72 | 73 | def should_continue_after_synthesize(state: ResearchState) -> str: 74 | """Validate synthesis output and route appropriately.""" 75 | if state.error: 76 | logger.error(f"Synthesis failed: {state.error}") 77 | return END 78 | 79 | if not state.key_findings: 80 | logger.warning("No key findings extracted") 81 | state.error = "Failed to extract findings from search results" 82 | return END 83 | 84 | logger.info(f"Synthesis validated: {len(state.key_findings)} findings") 85 | return "write_report" 86 | 87 | def should_continue_after_report(state: ResearchState) -> str: 88 | """Validate final report and complete workflow.""" 89 | if state.error: 90 | logger.error(f"Report generation failed: {state.error}") 91 | elif not state.final_report: 92 | logger.error("No report generated") 93 | state.error = "Report generation produced no output" 94 | else: 95 | logger.info("Report generation complete") 96 | 97 | return END 98 | 99 | # Add conditional edges with validation 100 | workflow.add_conditional_edges( 101 | "plan", 102 | should_continue_after_plan, 103 | { 104 | "search": "search", 105 | END: END 106 | } 107 | ) 108 | 109 | workflow.add_conditional_edges( 110 | "search", 111 | should_continue_after_search, 112 | { 113 | "synthesize": "synthesize", 114 | END: END 115 | } 116 | ) 117 | 118 | workflow.add_conditional_edges( 119 | "synthesize", 120 | should_continue_after_synthesize, 121 | { 122 | "write_report": "write_report", 123 | END: END 124 | } 125 | ) 126 | 127 | workflow.add_conditional_edges( 128 | "write_report", 129 | should_continue_after_report, 130 | { 131 | END: END 132 | } 133 | ) 134 | 135 | # Compile the graph 136 | return workflow.compile() 137 | 138 | 139 | async def run_research(topic: str, verbose: bool = True, use_cache: bool = True) -> dict: 140 | """Run the research workflow for a given topic. 141 | 142 | Args: 143 | topic: Research topic 144 | verbose: Enable verbose logging 145 | use_cache: Use cached results if available 146 | 147 | Returns the complete accumulated state as a dict. 
148 | """ 149 | logger.info(f"Starting research on: {topic}") 150 | 151 | # Check cache first 152 | cache = ResearchCache() 153 | if use_cache: 154 | cached_result = cache.get(topic) 155 | if cached_result: 156 | logger.info("Using cached research result") 157 | return cached_result 158 | 159 | # Initialize state 160 | initial_state = ResearchState(research_topic=topic) 161 | 162 | # Create and run the graph 163 | graph = create_research_graph() 164 | 165 | # Execute the workflow using invoke to get complete final state 166 | # Note: invoke runs once and returns the complete accumulated state 167 | final_state = await graph.ainvoke(initial_state) 168 | 169 | # Cache the result 170 | if use_cache and not final_state.get("error"): 171 | cache.set(topic, final_state) 172 | 173 | if verbose: 174 | logger.info("Workflow completed") 175 | if final_state.get("final_report"): 176 | logger.info(f"Report generated: {len(final_state['final_report'])} characters") 177 | 178 | return final_state 179 | 180 | -------------------------------------------------------------------------------- /src/utils/credibility.py: -------------------------------------------------------------------------------- 1 | """Source credibility scoring based on domain authority and other factors.""" 2 | 3 | import re 4 | from typing import List, Dict, Any 5 | from urllib.parse import urlparse 6 | import logging 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class CredibilityScorer: 12 | """Score sources based on domain authority and other credibility factors.""" 13 | 14 | # Trusted domains 15 | TRUSTED_DOMAINS = { 16 | # Academic institutions (global) 17 | '.edu', '.ac.uk', '.ac.in', '.edu.in', '.edu.au', '.ac.jp', 18 | 19 | # Government (global) 20 | '.gov', '.gov.uk', '.gov.au', '.gov.ca', '.gov.in', '.europa.eu', 21 | 22 | # International news organizations 23 | 'bbc.com', 'bbc.co.uk', 'reuters.com', 'ap.org', 'npr.org', 24 | 'theguardian.com', 'nytimes.com', 'washingtonpost.com', 'wsj.com', 25 | 'ft.com', 'economist.com', 'bloomberg.com', 'cnbc.com', 26 | 'cnn.com', 'aljazeera.com', 'france24.com', 'dw.com', 27 | 28 | # Indian news organizations 29 | 'thehindu.com', 'indianexpress.com', 'timesofindia.com', 'indiatimes.com', 30 | 'economictimes.com', 'financialexpress.com', 'livemint.com', 31 | 'business-standard.com', 'moneycontrol.com', 'businessline.in', 32 | 'businesstoday.in', 'businessinsider.in', 33 | 34 | # Academic & Research platforms 35 | 'arxiv.org', 'scholar.google.com', 'researchgate.net', 'semanticscholar.org', 36 | 'pubmed.ncbi.nlm.nih.gov', 'ncbi.nlm.nih.gov', 'nih.gov', 'nature.com', 37 | 'sciencedirect.com', 'springer.com', 'wiley.com', 'ieee.org', 38 | 'jstor.org', 'plos.org', 'sciencemag.org', 'cell.com', 39 | 40 | # Medical & Health organizations 41 | 'who.int', 'cdc.gov', 'mayoclinic.org', 'nih.gov', 'webmd.com', 42 | 43 | # International organizations 44 | 'un.org', 'worldbank.org', 'imf.org', 'wto.org', 'oecd.org', 45 | 46 | # Tech & Science publications 47 | 'nature.com', 'scientificamerican.com', 'newscientist.com', 48 | 'technologyreview.com', 'spectrum.ieee.org', 'arstechnica.com', 49 | 'wired.com', 'techcrunch.com', 'theverge.com', 50 | 51 | # Wikipedia & educational resources 52 | 'wikipedia.org', 'britannica.com', 'khanacademy.org', 53 | 54 | # Legal & policy 55 | 'supremecourt.gov', 'congress.gov', 'loc.gov', 56 | 57 | # Statistics & data 58 | 'census.gov', 'bls.gov', 'data.gov', 'worldbank.org', 59 | 'statista.com', 'pewresearch.org', 'gallup.com' 60 | } 61 | 62 | # Suspicious 
patterns 63 | SUSPICIOUS_PATTERNS = [ 64 | r'\.(xyz|tk|ml|ga|cf|gq)$', # Suspicious TLDs 65 | r'bit\.ly|tinyurl|t\.co', # URL shorteners 66 | r'blogspot|wordpress\.com', # Personal blogs (lower credibility) 67 | ] 68 | 69 | def score_url(self, url: str) -> Dict[str, Any]: 70 | """Score a URL's credibility. 71 | 72 | Returns: 73 | Dict with 'score' (0-100), 'factors', and 'level' (low/medium/high) 74 | """ 75 | if not url: 76 | return {'score': 0, 'factors': ['No URL'], 'level': 'low'} 77 | 78 | score = 50 # Base score 79 | factors = [] 80 | 81 | try: 82 | parsed = urlparse(url) 83 | domain = parsed.netloc.lower() 84 | 85 | # Check for trusted domains 86 | is_trusted = False 87 | for trusted in self.TRUSTED_DOMAINS: 88 | if trusted in domain: 89 | score += 30 90 | factors.append(f'Trusted domain: {trusted}') 91 | is_trusted = True 92 | break 93 | 94 | # Check for suspicious patterns 95 | is_suspicious = False 96 | for pattern in self.SUSPICIOUS_PATTERNS: 97 | if re.search(pattern, domain): 98 | score -= 20 99 | factors.append(f'Suspicious pattern: {pattern}') 100 | is_suspicious = True 101 | break 102 | 103 | # HTTPS bonus 104 | if parsed.scheme == 'https': 105 | score += 5 106 | factors.append('HTTPS enabled') 107 | else: 108 | score -= 10 109 | factors.append('No HTTPS') 110 | 111 | # Domain age indicators (heuristic based on domain structure) 112 | if not is_trusted and not is_suspicious: 113 | # Longer domains might be less credible (often spam) 114 | if len(domain.split('.')) > 3: 115 | score -= 5 116 | factors.append('Complex domain structure') 117 | 118 | # Academic paths 119 | if '/papers/' in parsed.path or '/research/' in parsed.path or '/publications/' in parsed.path: 120 | score += 10 121 | factors.append('Academic/research path') 122 | 123 | # Normalize score to 0-100 124 | score = max(0, min(100, score)) 125 | 126 | # Determine level 127 | if score >= 70: 128 | level = 'high' 129 | elif score >= 40: 130 | level = 'medium' 131 | else: 132 | level = 'low' 133 | 134 | return { 135 | 'score': score, 136 | 'factors': factors if factors else ['Standard domain'], 137 | 'level': level, 138 | 'domain': domain 139 | } 140 | 141 | except Exception as e: 142 | logger.warning(f"Error scoring URL {url}: {e}") 143 | return {'score': 30, 'factors': ['Scoring error'], 'level': 'low'} 144 | 145 | def score_search_results(self, results: List) -> List[Dict]: 146 | """Score a list of search results.""" 147 | scored = [] 148 | for result in results: 149 | if hasattr(result, 'url'): 150 | url = result.url 151 | elif isinstance(result, dict): 152 | url = result.get('url', '') 153 | else: 154 | url = str(result) 155 | 156 | credibility = self.score_url(url) 157 | scored.append({ 158 | 'result': result, 159 | 'credibility': credibility 160 | }) 161 | 162 | # Sort by credibility score (highest first) 163 | scored.sort(key=lambda x: x['credibility']['score'], reverse=True) 164 | return scored 165 | 166 | def filter_by_credibility(self, results: List, min_score: int = 40) -> List: 167 | """Filter results by minimum credibility score.""" 168 | scored = self.score_search_results(results) 169 | filtered = [ 170 | item['result'] for item in scored 171 | if item['credibility']['score'] >= min_score 172 | ] 173 | logger.info(f"Filtered {len(results)} -> {len(filtered)} results (min_score={min_score})") 174 | return filtered 175 | 176 | -------------------------------------------------------------------------------- /outputs/small language models_20251113_230645.txt: 
-------------------------------------------------------------------------------- 1 | small language models 2 | Deep Research Report 3 | 4 | Executive Summary 5 | This report provides a comprehensive analysis of small language models. The research was conducted across 7 sources and synthesized into 2 key sections. 6 | 7 | Research Objectives 8 | 1. To define small language models (SLMs), identify their key characteristics, and differentiate them from large language models (LLMs). 9 | 2. To analyze the primary advantages (e.g., cost-efficiency, speed, local deployment) and disadvantages (e.g., performance limitations, data requirements) of SLMs. 10 | 3. To explore current and emerging applications of SLMs across various industries and use cases, including edge computing and specialized tasks. 11 | 4. To investigate the technological advancements, optimization techniques (e.g., fine-tuning, quantization), and research trends driving the development and adoption of SLMs. 12 | 5. To assess the future outlook, potential societal impact, and strategic role of SLMs in the broader AI landscape. 13 | 14 | --- 15 | 16 | 1. Introduction to Small Language Models (SLMs) 17 | 18 | 1. Introduction to Small Language Models (SLMs) 19 | 20 | Language models are sophisticated computational frameworks designed to comprehend and generate human language, representing a cornerstone in the field of natural language processing (NLP) [1]. Over recent years, large language models (LLMs) have garnered significant attention due to their remarkable effectiveness and versatility across a myriad of domains. These models, characterized by their immense scale, vast knowledge bases, and deep reasoning capabilities, have demonstrated breakthroughs in applications ranging from complex table processing to advanced medical diagnostics, often leveraging cutting-edge generative techniques [2, 3, 4]. The success of LLMs in numerous domains has solidified their position as a leading technology in artificial intelligence [4]. 21 | 22 | However, the rapid evolution of AI research and application has also brought forth the emergence of small language models (SLMs). SLMs are a class of AI models specifically engineered to process and generate human language, akin to LLMs, but with a critical emphasis on efficiency, specialized tasks, and a reduced computational footprint [5]. Unlike their larger counterparts, which are typically designed for broad, general-purpose applications and possess extensive knowledge, SLMs are often more constrained in their scope. This allows for significant optimization in terms of model size, the volume of training data required, and the operational resources necessary for deployment and inference [6]. 23 | 24 | The distinction between LLMs and SLMs is increasingly pertinent as the practical deployment of language models expands across various industries and environments. While LLMs are widely recognized for their "vast knowledge and deep reasoning" capabilities, SLMs distinguish themselves through their efficiency and targeted specialization [7]. This fundamental difference highlights that while LLMs excel in handling broad, knowledge-intensive tasks, SLMs are strategically developed to address specific requirements with enhanced agility and frequently with substantially reduced infrastructure demands [7]. 
The growing interest in SLMs signifies a strategic shift towards more accessible, cost-effective, and deployable AI solutions, particularly advantageous for scenarios involving edge computing, resource-constrained environments, or highly specialized applications. This introductory section lays the groundwork for a comprehensive exploration of SLMs, delving into their architectural nuances, diverse applications, and their evolving role within the broader ecosystem of language AI. 25 | 26 | 2. Strategic Importance and Future Landscape of SLMs 27 | 28 | 2. Strategic Importance and Future Landscape of SLMs 29 | 30 | While Large Language Models (LLMs) have garnered significant attention for their extensive knowledge and deep reasoning capabilities across diverse applications, including table processing and medical advancements [4], [6], Small Language Models (SLMs) are emerging as strategically important due to their specialized nature and efficiency [1], [2], [3]. LLMs, characterized by their vast computational models and capacity to comprehend and generate human language, have demonstrated effectiveness in numerous domains, from general language understanding to complex memory mechanisms [5], [7]. However, their substantial resource requirements for training and deployment present challenges for certain applications and environments. 31 | 32 | SLMs are AI models specifically designed for processing and generating human language, much like LLMs, but with a focus on smaller scales and specialized tasks [2]. This specialization allows SLMs to offer distinct advantages. For instance, SLMs are often quicker to train and deploy, requiring fewer computational resources compared to their larger counterparts [1], [3]. This makes them particularly suitable for edge computing, mobile devices, and applications where latency and resource consumption are critical factors. Their smaller footprint also translates to reduced energy consumption, addressing growing concerns about the environmental impact of large-scale AI models. They excel in specific tasks where their targeted architecture can outperform a generalized LLM, offering a balance between performance and resource efficiency. 33 | 34 | The future landscape of SLMs is poised for significant growth, driven by the increasing demand for efficient, specialized, and accessible AI solutions. While LLMs excel in broad, general-purpose tasks, SLMs are well-suited for niche applications that require high performance within constrained environments. This includes tasks such as localized content moderation, personalized customer support, efficient data summarization on devices, and specialized translation services. The development of SLMs will likely focus on optimizing performance for specific tasks, potentially leading to highly accurate and reliable models for particular domains. The comparison between LLMs and SLMs highlights a complementary relationship rather than a purely competitive one, where each model type serves different strategic purposes within the evolving AI ecosystem [1], [2], [3]. As AI integration becomes more pervasive across various industries, the strategic importance of SLMs will continue to grow, offering scalable and sustainable solutions for a wider range of real-world applications. 35 | 36 | --- 37 | 38 | References 39 | 40 | 1. Large Language Model vs Small Language Model - ML Journey. (n.d.). Retrieved from https://mljourney.com/large-language-model-vs-small-language-model/ 41 | 2. 
Large Language Model for Table Processing: A Survey. (n.d.). Retrieved from https://arxiv.org/html/2402.05121v3 42 | 3. A Survey on the Memory Mechanism of Large Language Model based. (n.d.). Retrieved from https://arxiv.org/html/2404.13501v1 43 | 4. Differences and Comparisons: Small LLMs vs Large Language Models. (n.d.). Retrieved from https://www.ema.co/additional-blogs/addition-blogs/small-llm-vs-large-language-models 44 | 5. Deconfusing ‘AI’ and ‘evolution’ - LessWrong 2.0 viewer. (n.d.). Retrieved from https://www.greaterwrong.com/posts/qvgEbZDcxwTSEBdwD/implicit-and-explicit-learning 45 | 6. Advances in Large Language Models for Medicine. (n.d.). Retrieved from https://arxiv.org/html/2509.18690v1 46 | 7. Fairness in Large Language Models: A Taxonomic Survey. (n.d.). Retrieved from https://arxiv.org/html/2404.01349v2 -------------------------------------------------------------------------------- /outputs/small language models_20251113_230645.md: -------------------------------------------------------------------------------- 1 | # small language models 2 | **Deep Research Report** 3 | 4 | ## Executive Summary 5 | This report provides a comprehensive analysis of small language models. The research was conducted across **7 sources** and synthesized into **2 key sections**. 6 | 7 | ## Research Objectives 8 | 1. To define small language models (SLMs), identify their key characteristics, and differentiate them from large language models (LLMs). 9 | 2. To analyze the primary advantages (e.g., cost-efficiency, speed, local deployment) and disadvantages (e.g., performance limitations, data requirements) of SLMs. 10 | 3. To explore current and emerging applications of SLMs across various industries and use cases, including edge computing and specialized tasks. 11 | 4. To investigate the technological advancements, optimization techniques (e.g., fine-tuning, quantization), and research trends driving the development and adoption of SLMs. 12 | 5. To assess the future outlook, potential societal impact, and strategic role of SLMs in the broader AI landscape. 13 | 14 | --- 15 | 16 | ## 1. Introduction to Small Language Models (SLMs) 17 | 18 | ### 1. Introduction to Small Language Models (SLMs) 19 | 20 | Language models are sophisticated computational frameworks designed to comprehend and generate human language, representing a cornerstone in the field of natural language processing (NLP) [1]. Over recent years, large language models (LLMs) have garnered significant attention due to their remarkable effectiveness and versatility across a myriad of domains. These models, characterized by their immense scale, vast knowledge bases, and deep reasoning capabilities, have demonstrated breakthroughs in applications ranging from complex table processing to advanced medical diagnostics, often leveraging cutting-edge generative techniques [2, 3, 4]. The success of LLMs in numerous domains has solidified their position as a leading technology in artificial intelligence [4]. 21 | 22 | However, the rapid evolution of AI research and application has also brought forth the emergence of small language models (SLMs). SLMs are a class of AI models specifically engineered to process and generate human language, akin to LLMs, but with a critical emphasis on efficiency, specialized tasks, and a reduced computational footprint [5]. Unlike their larger counterparts, which are typically designed for broad, general-purpose applications and possess extensive knowledge, SLMs are often more constrained in their scope. 
This allows for significant optimization in terms of model size, the volume of training data required, and the operational resources necessary for deployment and inference [6]. 23 | 24 | The distinction between LLMs and SLMs is increasingly pertinent as the practical deployment of language models expands across various industries and environments. While LLMs are widely recognized for their "vast knowledge and deep reasoning" capabilities, SLMs distinguish themselves through their efficiency and targeted specialization [7]. This fundamental difference highlights that while LLMs excel in handling broad, knowledge-intensive tasks, SLMs are strategically developed to address specific requirements with enhanced agility and frequently with substantially reduced infrastructure demands [7]. The growing interest in SLMs signifies a strategic shift towards more accessible, cost-effective, and deployable AI solutions, particularly advantageous for scenarios involving edge computing, resource-constrained environments, or highly specialized applications. This introductory section lays the groundwork for a comprehensive exploration of SLMs, delving into their architectural nuances, diverse applications, and their evolving role within the broader ecosystem of language AI. 25 | 26 | ## 2. Strategic Importance and Future Landscape of SLMs 27 | 28 | ### 2. Strategic Importance and Future Landscape of SLMs 29 | 30 | While Large Language Models (LLMs) have garnered significant attention for their extensive knowledge and deep reasoning capabilities across diverse applications, including table processing and medical advancements [4], [6], Small Language Models (SLMs) are emerging as strategically important due to their specialized nature and efficiency [1], [2], [3]. LLMs, characterized by their vast computational models and capacity to comprehend and generate human language, have demonstrated effectiveness in numerous domains, from general language understanding to complex memory mechanisms [5], [7]. However, their substantial resource requirements for training and deployment present challenges for certain applications and environments. 31 | 32 | SLMs are AI models specifically designed for processing and generating human language, much like LLMs, but with a focus on smaller scales and specialized tasks [2]. This specialization allows SLMs to offer distinct advantages. For instance, SLMs are often quicker to train and deploy, requiring fewer computational resources compared to their larger counterparts [1], [3]. This makes them particularly suitable for edge computing, mobile devices, and applications where latency and resource consumption are critical factors. Their smaller footprint also translates to reduced energy consumption, addressing growing concerns about the environmental impact of large-scale AI models. They excel in specific tasks where their targeted architecture can outperform a generalized LLM, offering a balance between performance and resource efficiency. 33 | 34 | The future landscape of SLMs is poised for significant growth, driven by the increasing demand for efficient, specialized, and accessible AI solutions. While LLMs excel in broad, general-purpose tasks, SLMs are well-suited for niche applications that require high performance within constrained environments. This includes tasks such as localized content moderation, personalized customer support, efficient data summarization on devices, and specialized translation services. 
The development of SLMs will likely focus on optimizing performance for specific tasks, potentially leading to highly accurate and reliable models for particular domains. The comparison between LLMs and SLMs highlights a complementary relationship rather than a purely competitive one, where each model type serves different strategic purposes within the evolving AI ecosystem [1], [2], [3]. As AI integration becomes more pervasive across various industries, the strategic importance of SLMs will continue to grow, offering scalable and sustainable solutions for a wider range of real-world applications. 35 | 36 | --- 37 | 38 | ## References 39 | 40 | 1. Large Language Model vs Small Language Model - ML Journey. (n.d.). Retrieved from https://mljourney.com/large-language-model-vs-small-language-model/ 41 | 2. Large Language Model for Table Processing: A Survey. (n.d.). Retrieved from https://arxiv.org/html/2402.05121v3 42 | 3. A Survey on the Memory Mechanism of Large Language Model based. (n.d.). Retrieved from https://arxiv.org/html/2404.13501v1 43 | 4. Differences and Comparisons: Small LLMs vs Large Language Models. (n.d.). Retrieved from https://www.ema.co/additional-blogs/addition-blogs/small-llm-vs-large-language-models 44 | 5. Deconfusing ‘AI’ and ‘evolution’ - LessWrong 2.0 viewer. (n.d.). Retrieved from https://www.greaterwrong.com/posts/qvgEbZDcxwTSEBdwD/implicit-and-explicit-learning 45 | 6. Advances in Large Language Models for Medicine. (n.d.). Retrieved from https://arxiv.org/html/2509.18690v1 46 | 7. Fairness in Large Language Models: A Taxonomic Survey. (n.d.). Retrieved from https://arxiv.org/html/2404.01349v2 -------------------------------------------------------------------------------- /src/utils/web_utils.py: -------------------------------------------------------------------------------- 1 | """Web search and content extraction utilities.""" 2 | 3 | import asyncio 4 | import re 5 | import time 6 | from typing import List, Optional 7 | from ddgs import DDGS 8 | import requests 9 | from bs4 import BeautifulSoup 10 | from urllib.parse import urlparse 11 | import logging 12 | 13 | from src.state import SearchResult 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def is_valid_url(url: str) -> bool: 20 | """Check if a URL is valid. 21 | 22 | Args: 23 | url: URL string to validate 24 | 25 | Returns: 26 | bool: True if URL is valid, False otherwise 27 | """ 28 | try: 29 | result = urlparse(url) 30 | return all([result.scheme, result.netloc]) 31 | except: 32 | return False 33 | 34 | 35 | class WebSearchTool: 36 | """DuckDuckGo web search tool with rate limiting.""" 37 | 38 | def __init__(self, max_results: int = 5): 39 | self.max_results = max_results 40 | self.last_search_time = 0 41 | self.min_delay = 2.0 # Minimum 2 seconds between searches 42 | 43 | def search(self, query: str) -> List[SearchResult]: 44 | """Perform a web search using DuckDuckGo with rate limiting. 
45 | 46 | Args: 47 | query: Search query string 48 | 49 | Returns: 50 | List[SearchResult]: List of search results 51 | """ 52 | try: 53 | # Rate limiting: wait if needed 54 | elapsed = time.time() - self.last_search_time 55 | if elapsed < self.min_delay: 56 | wait_time = self.min_delay - elapsed 57 | logger.info(f"Rate limiting: waiting {wait_time:.1f}s") 58 | time.sleep(wait_time) 59 | 60 | logger.info(f"Searching for: {query}") 61 | results = [] 62 | 63 | # Use DDGS with retry logic for rate limits 64 | max_retries = 3 65 | for attempt in range(max_retries): 66 | try: 67 | ddgs = DDGS() 68 | search_results = list(ddgs.text( 69 | query, 70 | max_results=self.max_results 71 | )) 72 | 73 | for result in search_results: 74 | results.append(SearchResult( 75 | query=query, 76 | title=result.get("title", ""), 77 | url=result.get("href", ""), 78 | snippet=result.get("body", "") 79 | )) 80 | 81 | self.last_search_time = time.time() 82 | logger.info(f"Found {len(results)} results for: {query}") 83 | return results 84 | 85 | except Exception as retry_error: 86 | error_str = str(retry_error).lower() 87 | if ("ratelimit" in error_str or "202" in error_str) and attempt < max_retries - 1: 88 | wait_time = (attempt + 1) * 5 # 5, 10, 15 seconds 89 | logger.warning(f"Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}") 90 | time.sleep(wait_time) 91 | else: 92 | raise 93 | 94 | return results 95 | 96 | except Exception as e: 97 | logger.error(f"Search error for '{query}': {str(e)}") 98 | self.last_search_time = time.time() 99 | return [] 100 | 101 | async def search_async(self, query: str) -> List[SearchResult]: 102 | """Async version of search. 103 | 104 | Args: 105 | query: Search query string 106 | 107 | Returns: 108 | List[SearchResult]: List of search results 109 | """ 110 | return await asyncio.to_thread(self.search, query) 111 | 112 | 113 | class ContentExtractor: 114 | """Extract and clean content from web pages.""" 115 | 116 | def __init__(self, timeout: int = 10): 117 | self.timeout = timeout 118 | self.headers = { 119 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' 120 | } 121 | 122 | def extract_content(self, url: str) -> Optional[str]: 123 | """Extract main content from a URL. 
124 | 125 | Args: 126 | url: URL to extract content from 127 | 128 | Returns: 129 | Optional[str]: Extracted content or None if extraction fails 130 | """ 131 | try: 132 | logger.info(f"Extracting content from: {url}") 133 | 134 | response = requests.get( 135 | url, 136 | headers=self.headers, 137 | timeout=self.timeout, 138 | allow_redirects=True 139 | ) 140 | response.raise_for_status() 141 | 142 | soup = BeautifulSoup(response.content, 'html.parser') 143 | 144 | # Remove unwanted elements 145 | for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']): 146 | element.decompose() 147 | 148 | # Try to find main content 149 | main_content = None 150 | for selector in ['article', 'main', '[role="main"]', '.content', '#content']: 151 | main_content = soup.select_one(selector) 152 | if main_content: 153 | break 154 | 155 | if not main_content: 156 | main_content = soup.body 157 | 158 | if main_content: 159 | text = main_content.get_text(separator='\n', strip=True) 160 | # Clean up excessive whitespace 161 | text = re.sub(r'\n\s*\n', '\n\n', text) 162 | text = re.sub(r' +', ' ', text) 163 | 164 | # Limit to reasonable length (first 5000 chars) 165 | text = text[:5000] if len(text) > 5000 else text 166 | 167 | logger.info(f"Extracted {len(text)} characters from {url}") 168 | return text 169 | 170 | return None 171 | 172 | except Exception as e: 173 | logger.warning(f"Failed to extract content from {url}: {str(e)}") 174 | return None 175 | 176 | async def extract_content_async(self, url: str) -> Optional[str]: 177 | """Async version of content extraction. 178 | 179 | Args: 180 | url: URL to extract content from 181 | 182 | Returns: 183 | Optional[str]: Extracted content or None if extraction fails 184 | """ 185 | return await asyncio.to_thread(self.extract_content, url) 186 | 187 | async def enhance_search_results_async( 188 | self, 189 | results: List[SearchResult], 190 | max_concurrent: int = 3 191 | ) -> List[SearchResult]: 192 | """Enhance search results with full content extraction (async). 
193 | 194 | Args: 195 | results: List of search results to enhance 196 | max_concurrent: Maximum concurrent extraction tasks 197 | 198 | Returns: 199 | List[SearchResult]: Enhanced search results with content 200 | """ 201 | semaphore = asyncio.Semaphore(max_concurrent) 202 | 203 | async def enhance_one(result: SearchResult) -> SearchResult: 204 | async with semaphore: 205 | if not result.content: 206 | try: 207 | content = await self.extract_content_async(result.url) 208 | if content: 209 | result.content = content 210 | except Exception as e: 211 | logger.warning(f"Failed to enhance {result.url}: {str(e)}") 212 | return result 213 | 214 | try: 215 | tasks = [enhance_one(result) for result in results] 216 | return await asyncio.gather(*tasks) 217 | except Exception as e: 218 | logger.error(f"Error enhancing results: {str(e)}") 219 | return results 220 | 221 | -------------------------------------------------------------------------------- /src/callbacks.py: -------------------------------------------------------------------------------- 1 | """Callback system for real-time progress updates in the research workflow.""" 2 | 3 | import asyncio 4 | from typing import Callable, Optional, Dict, Any, List 5 | from enum import Enum 6 | from dataclasses import dataclass, field 7 | from datetime import datetime 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class ResearchStage(Enum): 14 | """Research workflow stages.""" 15 | INITIALIZING = "initializing" 16 | PLANNING = "planning" 17 | SEARCHING = "searching" 18 | EXTRACTING = "extracting" 19 | SYNTHESIZING = "synthesizing" 20 | WRITING = "writing" 21 | COMPLETE = "complete" 22 | ERROR = "error" 23 | 24 | 25 | @dataclass 26 | class ProgressUpdate: 27 | """A progress update event.""" 28 | stage: ResearchStage 29 | message: str 30 | details: Optional[str] = None 31 | progress_pct: Optional[float] = None # 0-100 32 | metadata: Dict[str, Any] = field(default_factory=dict) 33 | timestamp: datetime = field(default_factory=datetime.now) 34 | 35 | 36 | class ProgressCallback: 37 | """Manages progress callbacks for research workflow.""" 38 | 39 | _instance: Optional['ProgressCallback'] = None 40 | _callbacks: List[Callable[[ProgressUpdate], None]] = [] 41 | _async_callbacks: List[Callable[[ProgressUpdate], Any]] = [] 42 | _updates: List[ProgressUpdate] = [] 43 | _current_stage: ResearchStage = ResearchStage.INITIALIZING 44 | 45 | def __new__(cls): 46 | """Singleton pattern to ensure one global callback manager.""" 47 | if cls._instance is None: 48 | cls._instance = super().__new__(cls) 49 | cls._instance._callbacks = [] 50 | cls._instance._async_callbacks = [] 51 | cls._instance._updates = [] 52 | cls._instance._current_stage = ResearchStage.INITIALIZING 53 | return cls._instance 54 | 55 | def reset(self): 56 | """Reset state for a new research session.""" 57 | self._updates = [] 58 | self._current_stage = ResearchStage.INITIALIZING 59 | 60 | def register(self, callback: Callable[[ProgressUpdate], None]): 61 | """Register a synchronous callback function.""" 62 | if callback not in self._callbacks: 63 | self._callbacks.append(callback) 64 | 65 | def register_async(self, callback: Callable[[ProgressUpdate], Any]): 66 | """Register an async callback function.""" 67 | if callback not in self._async_callbacks: 68 | self._async_callbacks.append(callback) 69 | 70 | def unregister(self, callback: Callable): 71 | """Unregister a callback function.""" 72 | if callback in self._callbacks: 73 | self._callbacks.remove(callback) 74 | if 
callback in self._async_callbacks: 75 | self._async_callbacks.remove(callback) 76 | 77 | def clear_callbacks(self): 78 | """Clear all registered callbacks.""" 79 | self._callbacks = [] 80 | self._async_callbacks = [] 81 | 82 | async def emit(self, update: ProgressUpdate): 83 | """Emit a progress update to all registered callbacks.""" 84 | self._current_stage = update.stage 85 | self._updates.append(update) 86 | 87 | # Log the update 88 | logger.info(f"[{update.stage.value}] {update.message}" + 89 | (f" - {update.details}" if update.details else "")) 90 | 91 | # Call sync callbacks 92 | for callback in self._callbacks: 93 | try: 94 | callback(update) 95 | except Exception as e: 96 | logger.error(f"Error in sync callback: {e}") 97 | 98 | # Call async callbacks 99 | for callback in self._async_callbacks: 100 | try: 101 | await callback(update) 102 | except Exception as e: 103 | logger.error(f"Error in async callback: {e}") 104 | 105 | @property 106 | def current_stage(self) -> ResearchStage: 107 | return self._current_stage 108 | 109 | @property 110 | def updates(self) -> List[ProgressUpdate]: 111 | return self._updates.copy() 112 | 113 | 114 | # Global progress callback instance 115 | progress_callback = ProgressCallback() 116 | 117 | 118 | # Convenience functions for emitting progress 119 | async def emit_progress( 120 | stage: ResearchStage, 121 | message: str, 122 | details: Optional[str] = None, 123 | progress_pct: Optional[float] = None, 124 | **metadata 125 | ): 126 | """Emit a progress update.""" 127 | update = ProgressUpdate( 128 | stage=stage, 129 | message=message, 130 | details=details, 131 | progress_pct=progress_pct, 132 | metadata=metadata 133 | ) 134 | await progress_callback.emit(update) 135 | 136 | 137 | async def emit_planning_start(topic: str): 138 | """Emit planning stage start.""" 139 | await emit_progress( 140 | ResearchStage.PLANNING, 141 | "Creating research plan", 142 | f"Topic: {topic}", 143 | progress_pct=5 144 | ) 145 | 146 | 147 | async def emit_planning_complete(num_queries: int, num_sections: int): 148 | """Emit planning stage completion.""" 149 | await emit_progress( 150 | ResearchStage.PLANNING, 151 | "Research plan created", 152 | f"{num_queries} search queries, {num_sections} report sections planned", 153 | progress_pct=15 154 | ) 155 | 156 | 157 | async def emit_search_start(query: str, query_num: int, total_queries: int): 158 | """Emit search start.""" 159 | base_progress = 15 160 | search_progress_range = 35 # 15% to 50% 161 | progress = base_progress + (query_num / total_queries) * search_progress_range 162 | 163 | await emit_progress( 164 | ResearchStage.SEARCHING, 165 | f"Searching ({query_num}/{total_queries})", 166 | f"Query: {query[:60]}..." 
if len(query) > 60 else f"Query: {query}", 167 | progress_pct=progress 168 | ) 169 | 170 | 171 | async def emit_search_results(num_results: int, query_num: int, total_queries: int): 172 | """Emit search results found.""" 173 | base_progress = 15 174 | search_progress_range = 35 175 | progress = base_progress + ((query_num + 0.5) / total_queries) * search_progress_range 176 | 177 | await emit_progress( 178 | ResearchStage.SEARCHING, 179 | f"Found {num_results} results", 180 | f"Query {query_num}/{total_queries} complete", 181 | progress_pct=progress 182 | ) 183 | 184 | 185 | async def emit_extraction_start(url: str, current: int, total: int): 186 | """Emit content extraction start.""" 187 | base_progress = 50 188 | extract_progress_range = 15 # 50% to 65% 189 | progress = base_progress + (current / total) * extract_progress_range 190 | 191 | # Extract domain from URL for cleaner display 192 | try: 193 | from urllib.parse import urlparse 194 | domain = urlparse(url).netloc 195 | except: 196 | domain = url[:40] 197 | 198 | await emit_progress( 199 | ResearchStage.EXTRACTING, 200 | f"Extracting content ({current}/{total})", 201 | f"Source: {domain}", 202 | progress_pct=progress 203 | ) 204 | 205 | 206 | async def emit_extraction_complete(num_extracted: int, total_chars: int): 207 | """Emit extraction completion.""" 208 | await emit_progress( 209 | ResearchStage.EXTRACTING, 210 | f"Content extraction complete", 211 | f"{num_extracted} pages, {total_chars:,} characters extracted", 212 | progress_pct=65 213 | ) 214 | 215 | 216 | async def emit_synthesis_start(num_sources: int): 217 | """Emit synthesis stage start.""" 218 | await emit_progress( 219 | ResearchStage.SYNTHESIZING, 220 | "Analyzing sources", 221 | f"Synthesizing {num_sources} sources into key findings", 222 | progress_pct=68 223 | ) 224 | 225 | 226 | async def emit_synthesis_progress(message: str): 227 | """Emit synthesis progress.""" 228 | await emit_progress( 229 | ResearchStage.SYNTHESIZING, 230 | message, 231 | progress_pct=72 232 | ) 233 | 234 | 235 | async def emit_synthesis_complete(num_findings: int): 236 | """Emit synthesis completion.""" 237 | await emit_progress( 238 | ResearchStage.SYNTHESIZING, 239 | "Synthesis complete", 240 | f"Extracted {num_findings} key findings", 241 | progress_pct=78 242 | ) 243 | 244 | 245 | async def emit_writing_start(num_sections: int): 246 | """Emit writing stage start.""" 247 | await emit_progress( 248 | ResearchStage.WRITING, 249 | "Writing report", 250 | f"Generating {num_sections} sections", 251 | progress_pct=80 252 | ) 253 | 254 | 255 | async def emit_writing_section(section_title: str, section_num: int, total_sections: int): 256 | """Emit section writing progress.""" 257 | base_progress = 80 258 | writing_progress_range = 18 # 80% to 98% 259 | progress = base_progress + (section_num / total_sections) * writing_progress_range 260 | 261 | await emit_progress( 262 | ResearchStage.WRITING, 263 | f"Writing section ({section_num}/{total_sections})", 264 | f"Section: {section_title[:50]}..." 
if len(section_title) > 50 else f"Section: {section_title}", 265 | progress_pct=progress 266 | ) 267 | 268 | 269 | async def emit_writing_complete(report_length: int): 270 | """Emit writing completion.""" 271 | await emit_progress( 272 | ResearchStage.WRITING, 273 | "Report writing complete", 274 | f"Generated {report_length:,} character report", 275 | progress_pct=98 276 | ) 277 | 278 | 279 | async def emit_complete(topic: str, sources: int, findings: int): 280 | """Emit research completion.""" 281 | await emit_progress( 282 | ResearchStage.COMPLETE, 283 | "Research complete!", 284 | f"{sources} sources analyzed, {findings} insights extracted", 285 | progress_pct=100 286 | ) 287 | 288 | 289 | async def emit_error(error_message: str): 290 | """Emit error.""" 291 | await emit_progress( 292 | ResearchStage.ERROR, 293 | "Error occurred", 294 | error_message, 295 | progress_pct=None 296 | ) 297 | 298 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """Interactive Chainlit interface for Deep Research Agent with real-time progress updates.""" 2 | 3 | import asyncio 4 | import chainlit as cl 5 | from pathlib import Path 6 | from datetime import datetime 7 | 8 | from src.config import config 9 | from src.state import ResearchState 10 | from src.graph import create_research_graph 11 | from src.utils.exports import ReportExporter 12 | from src.utils.history import ResearchHistory 13 | from src.callbacks import ( 14 | progress_callback, 15 | ProgressUpdate, 16 | ResearchStage, 17 | emit_complete 18 | ) 19 | 20 | 21 | # Stage markers for visual display 22 | STAGE_ICONS = { 23 | ResearchStage.INITIALIZING: "[...]", 24 | ResearchStage.PLANNING: "[1/5]", 25 | ResearchStage.SEARCHING: "[2/5]", 26 | ResearchStage.EXTRACTING: "[3/5]", 27 | ResearchStage.SYNTHESIZING: "[4/5]", 28 | ResearchStage.WRITING: "[5/5]", 29 | ResearchStage.COMPLETE: "[OK]", 30 | ResearchStage.ERROR: "[ERR]" 31 | } 32 | 33 | STAGE_NAMES = { 34 | ResearchStage.INITIALIZING: "Initializing", 35 | ResearchStage.PLANNING: "Planning Research", 36 | ResearchStage.SEARCHING: "Searching Web", 37 | ResearchStage.EXTRACTING: "Extracting Content", 38 | ResearchStage.SYNTHESIZING: "Synthesizing Findings", 39 | ResearchStage.WRITING: "Writing Report", 40 | ResearchStage.COMPLETE: "Complete", 41 | ResearchStage.ERROR: "Error" 42 | } 43 | 44 | 45 | class ProgressDisplay: 46 | """Manages the progress display for a research session.""" 47 | 48 | def __init__(self): 49 | self.message: cl.Message = None 50 | self.updates: list[ProgressUpdate] = [] 51 | self.current_stage: ResearchStage = ResearchStage.INITIALIZING 52 | self.start_time: datetime = None 53 | 54 | async def initialize(self, topic: str): 55 | """Initialize the progress display.""" 56 | self.start_time = datetime.now() 57 | self.updates = [] 58 | self.current_stage = ResearchStage.INITIALIZING 59 | 60 | content = self._render_progress(topic) 61 | self.message = cl.Message(content=content) 62 | await self.message.send() 63 | 64 | async def update(self, progress_update: ProgressUpdate): 65 | """Update the progress display with a new update.""" 66 | self.updates.append(progress_update) 67 | self.current_stage = progress_update.stage 68 | 69 | if self.message: 70 | self.message.content = self._render_progress() 71 | await self.message.update() 72 | 73 | def _render_progress(self, topic: str = None) -> str: 74 | """Render the progress display as markdown.""" 75 | # 
Calculate elapsed time 76 | elapsed = "" 77 | if self.start_time: 78 | delta = datetime.now() - self.start_time 79 | elapsed = f" ({delta.seconds}s)" 80 | 81 | # Build progress bar 82 | stages_order = [ 83 | ResearchStage.PLANNING, 84 | ResearchStage.SEARCHING, 85 | ResearchStage.EXTRACTING, 86 | ResearchStage.SYNTHESIZING, 87 | ResearchStage.WRITING, 88 | ResearchStage.COMPLETE 89 | ] 90 | 91 | # Get current progress percentage 92 | current_pct = 0 93 | if self.updates: 94 | for update in reversed(self.updates): 95 | if update.progress_pct is not None: 96 | current_pct = update.progress_pct 97 | break 98 | 99 | # Build visual progress bar 100 | bar_length = 20 101 | filled = int(bar_length * current_pct / 100) 102 | bar = "#" * filled + "-" * (bar_length - filled) 103 | 104 | content = f"""## Research Progress{elapsed} 105 | 106 | **Progress:** [{bar}] {current_pct:.0f}% 107 | 108 | --- 109 | 110 | """ 111 | 112 | # Show stage status 113 | current_stage_idx = -1 114 | if self.current_stage in stages_order: 115 | current_stage_idx = stages_order.index(self.current_stage) 116 | 117 | for idx, stage in enumerate(stages_order): 118 | icon = STAGE_ICONS.get(stage, "[...]") 119 | name = STAGE_NAMES.get(stage, stage.value) 120 | 121 | if idx < current_stage_idx: 122 | # Completed stage 123 | content += f"[DONE] ~~{name}~~\n" 124 | elif idx == current_stage_idx: 125 | # Current stage 126 | content += f"**{icon} {name}** <- *Current*\n" 127 | else: 128 | # Pending stage 129 | content += f"[ ] {name}\n" 130 | 131 | content += "\n---\n\n" 132 | 133 | # Show recent activity log (last 8 updates) 134 | content += "### Activity Log\n\n" 135 | 136 | if self.updates: 137 | recent_updates = self.updates[-8:] 138 | for update in reversed(recent_updates): 139 | icon = STAGE_ICONS.get(update.stage, "*") 140 | time_str = update.timestamp.strftime("%H:%M:%S") 141 | 142 | msg = f"`{time_str}` {icon} **{update.message}**" 143 | if update.details: 144 | msg += f"\n _{update.details}_" 145 | content += msg + "\n\n" 146 | else: 147 | content += "_Starting research..._\n" 148 | 149 | return content 150 | 151 | 152 | async def run_research_with_updates(topic: str, progress_display: ProgressDisplay): 153 | """Run research with real-time updates to the UI.""" 154 | 155 | # Reset callback state 156 | progress_callback.reset() 157 | 158 | # Register async callback for UI updates 159 | async def on_progress(update: ProgressUpdate): 160 | await progress_display.update(update) 161 | 162 | progress_callback.register_async(on_progress) 163 | 164 | try: 165 | # Initialize state 166 | initial_state = ResearchState(research_topic=topic) 167 | 168 | # Create graph 169 | graph = create_research_graph() 170 | 171 | # Execute workflow and get final state 172 | final_state = await graph.ainvoke(initial_state) 173 | 174 | # Emit completion 175 | search_results = final_state.get('search_results', []) 176 | key_findings = final_state.get('key_findings', []) 177 | await emit_complete(topic, len(search_results), len(key_findings)) 178 | 179 | return final_state 180 | 181 | finally: 182 | # Cleanup callback 183 | progress_callback.unregister(on_progress) 184 | 185 | 186 | @cl.on_chat_start 187 | async def start(): 188 | """Initialize the chat session.""" 189 | await cl.Message( 190 | content="""# Deep Research Agent 191 | 192 | Welcome! I'm your AI research assistant powered by **LangGraph** and **Gemini**. 193 | 194 | ## How it works: 195 | 1. **Tell me** what you want to research 196 | 2. 
I'll **search** the web for authoritative sources 197 | 3. **Synthesize** findings using AI 198 | 4. **Generate** a comprehensive report 199 | 200 | ## Features: 201 | - Real-time web search with DuckDuckGo 202 | - Source credibility scoring 203 | - Multiple export formats (MD, HTML, TXT) 204 | - Live progress tracking 205 | 206 | --- 207 | 208 | **What would you like to research today?** 209 | 210 | _Example topics:_ 211 | - "Future of quantum computing in 2025" 212 | - "How does WebSocket streaming work?" 213 | - "Best practices for microservices architecture" 214 | """, 215 | author="Research Agent" 216 | ).send() 217 | 218 | 219 | @cl.on_message 220 | async def main(message: cl.Message): 221 | """Handle user messages.""" 222 | 223 | topic = message.content.strip() 224 | 225 | if not topic: 226 | await cl.Message( 227 | content="WARNING: Please provide a research topic.", 228 | author="System" 229 | ).send() 230 | return 231 | 232 | # Validate config 233 | try: 234 | config.validate_config() 235 | except ValueError as e: 236 | await cl.Message( 237 | content=f"**Configuration Error:** {str(e)}\n\n" 238 | "Please set your API key in the `.env` file.", 239 | author="System" 240 | ).send() 241 | return 242 | 243 | # Show starting message 244 | await cl.Message( 245 | content=f"""## Starting Research 246 | 247 | **Topic:** _{topic}_ 248 | 249 | **Configuration:** 250 | - Model: `{config.model_name}` 251 | - Max Queries: `{config.max_search_queries}` 252 | - Max Sections: `{config.max_report_sections}` 253 | 254 | _Research will begin shortly..._ 255 | """, 256 | author="Research Agent" 257 | ).send() 258 | 259 | # Initialize progress display 260 | progress_display = ProgressDisplay() 261 | await progress_display.initialize(topic) 262 | 263 | try: 264 | # Run research with updates 265 | final_state = await run_research_with_updates(topic, progress_display) 266 | 267 | # Check for errors 268 | if final_state.get("error"): 269 | await cl.Message( 270 | content=f"## Research Failed\n\n{final_state.get('error')}", 271 | author="System" 272 | ).send() 273 | return 274 | 275 | # Display detailed summary with metrics 276 | search_results = final_state.get('search_results', []) 277 | key_findings = final_state.get('key_findings', []) 278 | report_sections = final_state.get('report_sections', []) 279 | credibility_scores = final_state.get('credibility_scores', []) 280 | 281 | # Count unique sources 282 | unique_sources = set() 283 | for result in search_results: 284 | if hasattr(result, 'url') and result.url: 285 | unique_sources.add(result.url) 286 | 287 | # Count high-credibility sources 288 | high_cred_count = sum(1 for score in credibility_scores if score.get('level') == 'high') 289 | medium_cred_count = sum(1 for score in credibility_scores if score.get('level') == 'medium') 290 | 291 | # Get LLM tracking info 292 | llm_calls = final_state.get('llm_calls', 0) 293 | total_input_tokens = final_state.get('total_input_tokens', 0) 294 | total_output_tokens = final_state.get('total_output_tokens', 0) 295 | total_tokens = total_input_tokens + total_output_tokens 296 | 297 | # Calculate elapsed time 298 | elapsed_seconds = 0 299 | if progress_display.start_time: 300 | elapsed_seconds = (datetime.now() - progress_display.start_time).seconds 301 | 302 | summary_content = f"""## Research Complete! 
303 | 304 | ### Data Collected 305 | | Metric | Value | 306 | |--------|-------| 307 | | Unique Sources | **{len(unique_sources)}** | 308 | | High Credibility | **{high_cred_count}** | 309 | | Medium Credibility | **{medium_cred_count}** | 310 | | Key Insights | **{len(key_findings)}** | 311 | | Report Sections | **{len(report_sections)}** | 312 | 313 | ### Performance 314 | | Metric | Value | 315 | |--------|-------| 316 | | Total Time | **{elapsed_seconds}s** | 317 | | LLM Calls | **{llm_calls}** | 318 | | Input Tokens | **{total_input_tokens:,}** | 319 | | Output Tokens | **{total_output_tokens:,}** | 320 | | Total Tokens | **{total_tokens:,}** | 321 | """ 322 | 323 | await cl.Message( 324 | content=summary_content, 325 | author="Research Agent" 326 | ).send() 327 | 328 | # Save and display report 329 | if final_state.get("final_report"): 330 | report = final_state["final_report"] 331 | 332 | # Save to file 333 | output_dir = Path("outputs") 334 | output_dir.mkdir(exist_ok=True) 335 | 336 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 337 | safe_topic = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in topic) 338 | safe_topic = safe_topic[:30].strip() 339 | filename = f"{safe_topic}_{timestamp}.md" 340 | output_file = output_dir / filename 341 | output_file.write_text(report, encoding='utf-8') 342 | 343 | # Add to history 344 | history = ResearchHistory() 345 | history.add_research( 346 | topic=topic, 347 | output_file=output_file, 348 | metadata={ 349 | 'sources': len(unique_sources), 350 | 'sections': len(report_sections), 351 | 'findings': len(key_findings), 352 | 'elapsed_seconds': elapsed_seconds, 353 | 'total_tokens': total_tokens 354 | } 355 | ) 356 | 357 | report_header = f"""## Final Report 358 | 359 | **Report Statistics:** 360 | - Length: **{len(report):,}** characters 361 | - Saved to: `{output_file}` 362 | 363 | --- 364 | 365 | {report}""" 366 | 367 | await cl.Message( 368 | content=report_header, 369 | author="Research Agent" 370 | ).send() 371 | 372 | # Export to multiple formats 373 | exporter = ReportExporter() 374 | base_path = output_file.with_suffix('') 375 | 376 | # Export HTML 377 | html_file = exporter.export(report, base_path, format='html') 378 | 379 | # Export TXT 380 | txt_file = exporter.export(report, base_path, format='txt') 381 | 382 | # Offer downloads 383 | elements = [ 384 | cl.File( 385 | name=filename, 386 | path=str(output_file), 387 | display="inline" 388 | ), 389 | cl.File( 390 | name=html_file.name, 391 | path=str(html_file), 392 | display="inline" 393 | ), 394 | cl.File( 395 | name=txt_file.name, 396 | path=str(txt_file), 397 | display="inline" 398 | ) 399 | ] 400 | 401 | await cl.Message( 402 | content=f"""## Download Report 403 | 404 | Download your report in multiple formats: 405 | 406 | | Format | File | 407 | |--------|------| 408 | | Markdown | `{filename}` | 409 | | HTML | `{html_file.name}` | 410 | | Plain Text | `{txt_file.name}` | 411 | """, 412 | elements=elements, 413 | author="Research Agent" 414 | ).send() 415 | 416 | # Ask for next research with suggestions 417 | await cl.Message( 418 | content="""--- 419 | 420 | ## Ready for Another Research? 
421 | 422 | Type your next research topic below, or try one of these: 423 | 424 | - *"Future trends in [your industry]"* 425 | - *"Comparative analysis of [topic A] vs [topic B]"* 426 | - *"Best practices for [specific challenge]"* 427 | - *"Impact of [technology/trend] on [domain]"* 428 | 429 | **What would you like to research next?**""", 430 | author="Research Agent" 431 | ).send() 432 | else: 433 | await cl.Message( 434 | content="WARNING: No report was generated. Please try again.", 435 | author="System" 436 | ).send() 437 | 438 | except Exception as e: 439 | import traceback 440 | error_details = traceback.format_exc() 441 | await cl.Message( 442 | content=f"""## Unexpected Error 443 | 444 | **Error:** {str(e)} 445 | 446 |
447 | <details><summary>Technical Details</summary> 448 | 449 | ``` 450 | {error_details} 451 | ``` 452 | 453 | </details>
454 | 455 | Please check the logs and try again. 456 | """, 457 | author="System" 458 | ).send() 459 | 460 | 461 | if __name__ == "__main__": 462 | from chainlit.cli import run_chainlit 463 | run_chainlit(__file__) 464 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Research Agent 2 | 3 | [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/) 4 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 5 | [![LangGraph](https://img.shields.io/badge/LangGraph-0.2.57+-green.svg)](https://github.com/langchain-ai/langgraph) 6 | 7 | A production-ready multi-agent autonomous research system built with LangGraph and LangChain. Four specialized agents work together to conduct comprehensive research on any topic and generate detailed, citation-backed reports with credibility scoring and quality metrics. Supports both local models (Ollama) and cloud APIs (Gemini). 8 | 9 | **Actively seeking opportunities as an ML Engineer II / Data Scientist II / AI Engineer II** 10 | 11 | ## Demo 12 | https://github.com/user-attachments/assets/df8404c6-7423-4a49-864a-bd4d59885c1b 13 | 14 | *Watch the full demo video to see the Deep Research Agent in action, showcasing the multi-agent workflow, real-time progress updates, and comprehensive report generation.* 15 | 16 | ## Features 17 | 18 | ### Core Capabilities 19 | 20 | - **Multi-Agent Architecture**: Four specialized autonomous agents (ResearchPlanner, ResearchSearcher, ResearchSynthesizer, ReportWriter) orchestrated by LangGraph's StateGraph. Each agent operates independently with its own tools and decision-making logic. 21 | 22 | - **Autonomous Research**: The search agent dynamically decides when to search, which queries to execute, and which sources warrant deep content extraction. This adaptive approach ensures quality over quantity, typically targeting 5-8 high-quality sources. 23 | 24 | - **Credibility Scoring**: Automatic source evaluation using domain authority analysis. Sources are scored (0-100) based on trusted domains (.edu, .gov), HTTPS, suspicious patterns, and academic indicators. Low-credibility sources are automatically filtered before synthesis. 25 | 26 | - **Quality Validation**: Section-level validation ensures minimum length requirements (500+ characters) and quality standards. Retry logic with exponential backoff handles failures gracefully, with up to 3 attempts per operation. 27 | 28 | - **Multi-Format Export**: Reports are automatically exported in three formats: Markdown (original), HTML (styled for web), and plain text (markdown stripped). 29 | 30 | - **LLM Usage Tracking**: Real-time monitoring of API calls, input/output tokens, and estimated costs. Per-agent breakdowns help identify optimization opportunities. 31 | 32 | - **Research Caching**: Intelligent file-based caching with 7-day TTL reduces redundant API calls. MD5-based topic hashing ensures accurate cache lookups. 33 | 34 | - **Web Interface**: Interactive Chainlit-based UI provides real-time progress updates, quality metrics, and multiple format downloads. 
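To make the credibility scoring described above concrete, here is a minimal sketch of the domain-heuristic approach. The +30/+5/+10/-20 weights come from the Key Components section below; the neutral baseline, helper names, and pattern list are illustrative assumptions, not the exact implementation in `src/utils/credibility.py`:

```python
from urllib.parse import urlparse

# Illustrative values mirroring the documented scoring rules; the real
# weights and pattern lists live in src/utils/credibility.py.
TRUSTED_SUFFIXES = (".edu", ".gov")
SUSPICIOUS_PATTERNS = ("blogspot.", "-free-")  # assumed examples

def score_source(url: str) -> int:
    """Score a source URL from 0-100 using simple domain heuristics."""
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    score = 50  # assumed neutral baseline

    if domain.endswith(TRUSTED_SUFFIXES):
        score += 30  # trusted domain authority
    if parsed.scheme == "https":
        score += 5   # HTTPS enabled
    if any(seg in parsed.path.lower() for seg in ("/research", "/paper", "/abs")):
        score += 10  # academic/research-style paths
    if any(p in domain for p in SUSPICIOUS_PATTERNS):
        score -= 20  # suspicious patterns

    return max(0, min(100, score))

# Sources below MIN_CREDIBILITY_SCORE (default 40) are filtered before synthesis.
print(score_source("https://arxiv.org/abs/2404.13501"))  # scores above the threshold
```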
35 | 36 | ## Architecture 37 | 38 | The system implements a four-stage pipeline orchestrated by LangGraph's StateGraph: 39 | 40 | ``` 41 | ResearchPlanner → ResearchSearcher → ResearchSynthesizer → ReportWriter 42 | ``` 43 | 44 | ### Agent Responsibilities 45 | 46 | **ResearchPlanner** 47 | - Analyzes research topics and generates 3-5 research objectives 48 | - Creates 3 targeted search queries covering different aspects 49 | - Designs report outline with up to 8 sections 50 | - Provides strategic guidance for the autonomous search agent 51 | 52 | **ResearchSearcher** (Autonomous Agent) 53 | - LangChain-powered autonomous agent using `create_agent()` 54 | - Dynamically decides which queries to execute and when to extract content 55 | - Uses `web_search` and `extract_webpage_content` tools autonomously 56 | - Adapts research strategy based on intermediate findings 57 | - Targets 5-8 high-quality sources with deep content extraction 58 | - All sources are scored for credibility and filtered before synthesis 59 | 60 | **ResearchSynthesizer** 61 | - Analyzes aggregated search results with credibility awareness 62 | - Prioritizes HIGH-credibility sources (score ≥70) in findings 63 | - Resolves contradictions using credibility hierarchy 64 | - Extracts key insights and identifies patterns 65 | - Progressive truncation handles token limit errors gracefully 66 | 67 | **ReportWriter** 68 | - Generates structured report sections with consistent academic tone 69 | - Adds proper citations with configurable styles (APA, MLA, Chicago, IEEE) 70 | - Validates section quality and re-generates on failures 71 | - Compiles final markdown document with reference section 72 | 73 | ### Workflow 74 | 75 | 1. **Planning**: LLM generates research plan with objectives, queries, and outline 76 | 2. **Autonomous Search**: Agent executes searches and extracts content from promising sources 77 | 3. **Credibility Scoring**: All sources scored and filtered (default threshold: 40) 78 | 4. **Synthesis**: Findings extracted with credibility-aware prioritization 79 | 5. **Report Generation**: Structured sections written with citations 80 | 6. **Export**: Reports saved in multiple formats to `outputs/` directory 81 | 82 | ## Installation 83 | 84 | ### Prerequisites 85 | 86 | - Python 3.11+ 87 | - pip or uv package manager 88 | - [Ollama](https://ollama.com/) (for local models) **OR** Google Gemini API key ([Get one free](https://makersuite.google.com/app/apikey)) **OR** OpenAI API key ([Get one here](https://platform.openai.com/api-keys)) 89 | 90 | ### Setup 91 | 92 | ```bash 93 | # Clone the repository 94 | git clone https://github.com/tarun7r/deep-research-agent.git 95 | cd deep-research-agent 96 | 97 | # Create virtual environment 98 | python -m venv .venv 99 | source .venv/bin/activate # Windows: .venv\Scripts\activate 100 | 101 | # Install dependencies 102 | pip install -r requirements.txt 103 | 104 | # Configure (choose one): 105 | # Option A - Ollama: Install Ollama, pull a model (e.g., ollama pull qwen2.5:7b) 106 | # Option B - Gemini: Get API key from https://makersuite.google.com/app/apikey 107 | # Option C - OpenAI: Get API key from https://platform.openai.com/api-keys 108 | 109 | # Create .env file (see Configuration section below) 110 | ``` 111 | 112 | ### Using Ollama (Local Models) 113 | 114 | Ollama allows you to run powerful LLMs locally on your machine without API costs or internet dependency. 
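For context, this is roughly how the agent reaches that local Ollama server at runtime. A minimal sketch using `ChatOllama` from the `langchain-ollama` package (already in `requirements.txt`); the project's actual model wiring lives in `src/config.py` and may differ:

```python
from langchain_ollama import ChatOllama

# Sketch only: model name and base_url mirror the .env settings shown below.
llm = ChatOllama(
    model="qwen2.5:7b",                 # MODEL_NAME
    base_url="http://localhost:11434",  # Ollama's default local endpoint
    temperature=0.2,
)

reply = llm.invoke("In one sentence, what is a small language model?")
print(reply.content)
```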
115 | 116 | **Quick Start:** 117 | 118 | ```bash 119 | # Install Ollama (macOS/Linux) 120 | curl -fsSL https://ollama.com/install.sh | sh 121 | 122 | # Or download from https://ollama.com for other platforms 123 | 124 | # Pull a recommended model 125 | ollama pull qwen2.5:7b 126 | 127 | # Verify it's working 128 | ollama run qwen2.5:7b "Hello, test message" 129 | ``` 130 | 131 | **Configuration:** 132 | 133 | Create a `.env` file: 134 | ```bash 135 | MODEL_PROVIDER=ollama 136 | MODEL_NAME=qwen2.5:7b 137 | SUMMARIZATION_MODEL=qwen2.5:7b 138 | ``` 139 | 140 | > **Tip**: Ollama runs a local server at `http://localhost:11434` by default. The agent will automatically connect to it. 141 | 142 | ### Using llama.cpp 143 | 144 | llama.cpp provides direct control over model execution with maximum performance on Mac M1/M2/M3 with Metal acceleration. 145 | 146 | **Quick Start:** 147 | 148 | ```bash 149 | 150 | # 1. Download a GGUF model (e.g., Qwen2.5 7B, q4_k_m quantization) 151 | mkdir -p ~/models 152 | 153 | # Download from Hugging Face 154 | huggingface-cli download Qwen/Qwen2.5-7B-Instruct-GGUF qwen2.5-7b-instruct-q4_k_m.gguf --local-dir ~/models 155 | 156 | # 2. Start the llama.cpp server with tool-calling support (run from your llama.cpp checkout) 157 | cd llama.cpp/build/bin 158 | ./llama-server -m ~/models/qwen2.5-7b-instruct-q4_k_m.gguf \ 159 | --host 0.0.0.0 \ 160 | --port 8080 \ 161 | -ngl 35 \ 162 | --ctx-size 4096 \ 163 | --jinja 164 | ``` 165 | 166 | **Configuration:** 167 | 168 | Create a `.env` file: 169 | ```bash 170 | MODEL_PROVIDER=llamacpp 171 | MODEL_NAME=qwen2.5-7b-instruct-q4_k_m # Model name (can be anything) 172 | SUMMARIZATION_MODEL=qwen2.5-7b-instruct-q4_k_m 173 | LLAMACPP_BASE_URL=http://localhost:8080 174 | ``` 175 | 176 | **Important Flags:** 177 | - `--jinja` - Required for tool/function calling support (used by research agents) 178 | - `-ngl 35` - Offload 35 layers to GPU (Metal acceleration) 179 | - `--ctx-size 4096` - Context window size 180 | - `--host 0.0.0.0` - Allow connections from any IP 181 | - `--port 8080` - Server port 182 | 183 | **Performance Tips:** 184 | - Metal acceleration provides ~2-3x speedup on M1/M2/M3 185 | - The server exposes an OpenAI-compatible API at `/v1/chat/completions` 186 | - Use `--n-gpu-layers` (or `-ngl`) to maximize GPU usage 187 | 188 | > **Note**: llama.cpp offers more control and can be faster than Ollama, but requires manual setup. Choose Ollama for simplicity or llama.cpp for maximum performance. 
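Once `llama-server` is running, you can smoke-test the OpenAI-compatible endpoint directly before pointing the agent at it. A minimal sketch with `requests` (already a dependency); the `model` value is a placeholder, since a single-model llama-server does not require it to match the file name:

```python
import requests

# Send one chat completion to the local llama.cpp server started above.
resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "qwen2.5-7b-instruct-q4_k_m",  # placeholder name
        "messages": [{"role": "user", "content": "Reply with OK if you are up."}],
        "max_tokens": 16,
    },
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

If this prints a reply, the server is ready for the agent's tool-calling workflow.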
189 | 190 | ## Usage 191 | 192 | ### Command Line 193 | 194 | ```bash 195 | # Interactive mode 196 | python main.py 197 | 198 | # Direct topic 199 | python main.py "Impact of quantum computing on cryptography" 200 | ``` 201 | 202 | ### Programmatic API 203 | 204 | ```python 205 | import asyncio 206 | from src.graph import run_research 207 | 208 | async def research(): 209 | state = await run_research("Topic here", verbose=True, use_cache=True) 210 | 211 | # Access report 212 | print(state["final_report"]) 213 | 214 | # Access LLM metrics 215 | print(f"LLM Calls: {state['llm_calls']}") 216 | print(f"Input Tokens: {state['total_input_tokens']:,}") 217 | print(f"Output Tokens: {state['total_output_tokens']:,}") 218 | print(f"Total Tokens: {state['total_input_tokens'] + state['total_output_tokens']:,}") 219 | 220 | # Access quality score 221 | if state.get("quality_score"): 222 | print(f"Quality: {state['quality_score']['total_score']}/100") 223 | 224 | asyncio.run(research()) 225 | ``` 226 | 227 | ### Web Interface 228 | 229 | ```bash 230 | # Start the web interface 231 | chainlit run app.py --host 127.0.0.1 --port 8000 232 | ``` 233 | 234 | The web interface provides: 235 | - Interactive chat-based research 236 | - Real-time progress updates with stage indicators 237 | - Quality metrics and LLM usage statistics 238 | - Multiple format downloads (Markdown, HTML, TXT) 239 | - Research history tracking 240 | 241 | ## Configuration 242 | 243 | Environment variables in `.env`: 244 | 245 | ```bash 246 | # Model Provider (choose one) 247 | MODEL_PROVIDER=ollama # Options: ollama, llamacpp, gemini, openai 248 | 249 | # For Ollama 250 | MODEL_NAME=qwen2.5:7b # Recommended: qwen2.5:7b, llama3.1:8b, mistral:7b 251 | SUMMARIZATION_MODEL=qwen2.5:7b 252 | OLLAMA_BASE_URL=http://localhost:11434 253 | 254 | # For llama.cpp (alternative - requires --jinja flag on server) 255 | # MODEL_PROVIDER=llamacpp 256 | # MODEL_NAME=qwen2.5-7b-instruct-q4_k_m 257 | # SUMMARIZATION_MODEL=qwen2.5-7b-instruct-q4_k_m 258 | # LLAMACPP_BASE_URL=http://localhost:8080 259 | 260 | # For Gemini (alternative) 261 | # MODEL_PROVIDER=gemini 262 | # GEMINI_API_KEY=your_api_key_here 263 | # MODEL_NAME=gemini-2.5-flash 264 | # SUMMARIZATION_MODEL=gemini-2.5-flash 265 | 266 | # For OpenAI (alternative) 267 | # MODEL_PROVIDER=openai 268 | # OPENAI_API_KEY=your_api_key_here 269 | # MODEL_NAME=gpt-4o-mini # Recommended: gpt-4o-mini, gpt-4o, gpt-4-turbo 270 | # SUMMARIZATION_MODEL=gpt-4o-mini 271 | 272 | # Optional - Search Settings 273 | MAX_SEARCH_QUERIES=3 274 | MAX_SEARCH_RESULTS_PER_QUERY=3 275 | MIN_CREDIBILITY_SCORE=40 276 | 277 | # Optional - Report Settings 278 | MAX_REPORT_SECTIONS=8 279 | CITATION_STYLE=apa # Options: apa, mla, chicago, ieee 280 | ``` 281 | 282 | ### Model Provider Comparison 283 | 284 | **Ollama (Local Models):** 285 | - Free, no API costs 286 | - Works offline, privacy-focused 287 | - Faster response times (no network latency) 288 | - No rate limits 289 | - Easy setup and model management 290 | - Requires ~5-8GB RAM for good models 291 | - Initial model download (~4-5GB per model) 292 | 293 | **llama.cpp (Local Models):** 294 | - Free, no API costs 295 | - Works offline, maximum privacy 296 | - Fastest local inference with Metal acceleration 297 | - No rate limits 298 | - Fine-grained control over model parameters 299 | - Lower memory usage with quantization 300 | - Requires manual setup and compilation 301 | - Requires ~4-8GB RAM depending on quantization 302 | - Best for: Maximum performance on M1/M2/M3 Macs 
303 | 304 | **Gemini (Cloud API):** 305 | - No local resources needed 306 | - Latest cutting-edge models 307 | - Consistently fast across devices 308 | - Requires API key and internet 309 | - API costs (free tier available) 310 | 311 | **OpenAI (Cloud API):** 312 | - No local resources needed 313 | - Industry-leading models (GPT-4, GPT-4o) 314 | - Excellent performance and reliability 315 | - Requires API key and internet 316 | - Pay-per-use pricing (competitive rates) 317 | - Recommended models: `gpt-4o-mini` (cost-effective), `gpt-4o` (best quality) 318 | 319 | ## Output Format 320 | 321 | Generated reports follow this structure: 322 | 323 | ```markdown 324 | # [Research Topic] 325 | 326 | **Deep Research Report** 327 | 328 | ## Research Objectives 329 | 1. [Objective 1] 330 | 2. [Objective 2] 331 | ... 332 | 333 | --- 334 | 335 | ## [Section 1 Title] 336 | [Content with inline citations [1], [2]] 337 | 338 | ## [Section 2 Title] 339 | [Content with inline citations [3], [4]] 340 | 341 | --- 342 | 343 | ## References 344 | 1. [Formatted citation according to selected style] 345 | 2. [Formatted citation according to selected style] 346 | ... 347 | ``` 348 | 349 | Reports are automatically exported in three formats: 350 | - **Markdown** (`.md`) - Original format with full markdown syntax 351 | - **HTML** (`.html`) - Styled web-ready format 352 | - **Plain Text** (`.txt`) - Markdown stripped, plain text version 353 | 354 | All reports are saved to the `outputs/` directory with timestamps. 355 | 356 | ## Project Structure 357 | 358 | ``` 359 | deep-research-agent/ 360 | ├── src/ 361 | │ ├── __init__.py # Package initialization 362 | │ ├── config.py # Configuration management (Pydantic models) 363 | │ ├── state.py # State models (ResearchState, ResearchPlan, etc.) 364 | │ ├── agents.py # Agent implementations (Planner, Searcher, Synthesizer, Writer) 365 | │ ├── graph.py # LangGraph workflow orchestration 366 | │ ├── llm_tracker.py # LLM call and token tracking 367 | │ └── utils/ 368 | │ ├── __init__.py # Utils package 369 | │ ├── tools.py # LangChain tools (@tool decorated for agents) 370 | │ ├── web_utils.py # Search & extraction implementations 371 | │ ├── cache.py # Research result caching (7-day TTL) 372 | │ ├── credibility.py # Source credibility scoring and filtering 373 | │ ├── exports.py # Multi-format export utilities 374 | │ ├── citations.py # Citation formatting (APA, MLA, Chicago, IEEE) 375 | │ └── history.py # Research history tracking 376 | ├── outputs/ # Generated reports (MD, HTML, TXT) 377 | ├── .cache/ # Cache and history storage 378 | │ ├── research/ # Cached research results 379 | │ └── research_history.json # Research history 380 | ├── main.py # CLI entry point 381 | ├── app.py # Chainlit web interface 382 | ├── requirements.txt # Python dependencies 383 | ├── pyproject.toml # Project metadata 384 | ├── LICENSE # MIT License 385 | └── README.md # This file 386 | ``` 387 | 388 | ## Key Components 389 | 390 | ### State Management (`src/state.py`) 391 | 392 | Centralized state using Pydantic models tracks research progress, search results, findings, and LLM usage metrics throughout the workflow. 
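As a rough illustration, the state models look approximately like this. Field names mirror what the workflow reads elsewhere (`app.py`, `src/utils/web_utils.py`), but the actual definitions in `src/state.py` are more complete:

```python
from typing import List, Optional
from pydantic import BaseModel, Field

class SearchResult(BaseModel):
    """One web search hit; `content` is filled in by extraction."""
    query: str
    title: str
    url: str
    snippet: str
    content: Optional[str] = None

class ResearchState(BaseModel):
    """Shared state passed between the four agents by LangGraph."""
    research_topic: str
    search_results: List[SearchResult] = Field(default_factory=list)
    credibility_scores: List[dict] = Field(default_factory=list)
    key_findings: List[str] = Field(default_factory=list)
    report_sections: List[str] = Field(default_factory=list)
    final_report: Optional[str] = None
    error: Optional[str] = None
    # LLM usage metrics
    llm_calls: int = 0
    total_input_tokens: int = 0
    total_output_tokens: int = 0
```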
393 | 394 | ### Tools Layer (`src/utils/tools.py`) 395 | 396 | LangChain tools decorated with `@tool` enable autonomous agent tool-calling: 397 | - `web_search`: DuckDuckGo integration for web searches 398 | - `extract_webpage_content`: BeautifulSoup4-based content extraction 399 | 400 | ### Credibility Scorer (`src/utils/credibility.py`) 401 | 402 | Evaluates sources based on: 403 | - Domain authority (trusted domains: +30 points) 404 | - HTTPS enabled (+5 points) 405 | - Academic/research paths (+10 points) 406 | - Suspicious patterns (-20 points) 407 | 408 | Sources are automatically filtered and sorted by credibility before synthesis. 409 | 410 | ## Development Note 411 | 412 | The core ideation, architecture design, and logic of this project are the result of original research and understanding. While AI tools were used to assist with code restructuring and implementation, the fundamental concepts, agent workflows, credibility scoring methodology, and overall system design reflect independent research and development. 413 | 414 | ## Contact 415 | 416 | For questions, issues, or collaboration: 417 | 418 | - **GitHub**: [tarun7r](https://github.com/tarun7r) 419 | - **LinkedIn**: [Tarun Sai Goddu](https://www.linkedin.com/in/tarunsaigoddu/) 420 | - **Hugging Face**: [tarun7r](https://huggingface.co/tarun7r) 421 | - **Email**: tarunsaiaa@gmail.com 422 | 423 | ## License 424 | 425 | MIT License - See [LICENSE](LICENSE) file for details. 426 | 427 | ## Acknowledgments 428 | 429 | Built with [LangGraph](https://github.com/langchain-ai/langgraph) and [LangChain](https://github.com/langchain-ai/langchain). Supports [Ollama](https://ollama.com/) and [llama.cpp](https://github.com/ggerganov/llama.cpp) for local models, [Google Gemini](https://ai.google.dev/) and [OpenAI](https://openai.com/) APIs. Web search via [DuckDuckGo](https://duckduckgo.com/). 430 | -------------------------------------------------------------------------------- /src/utils/tools.py: -------------------------------------------------------------------------------- 1 | """LLM-invokable tools for research agents.""" 2 | 3 | from typing import List, Optional, Dict 4 | from langchain_core.tools import tool 5 | import logging 6 | import json 7 | 8 | from src.utils.web_utils import WebSearchTool as WebSearchImpl, ContentExtractor as ContentExtractorImpl 9 | from src.state import SearchResult 10 | from src.utils.citations import CitationFormatter 11 | from src.config import config 12 | 13 | logging.basicConfig(level=logging.INFO) 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | # Initialize tool implementations with config values 18 | _search_impl = WebSearchImpl(max_results=config.max_search_results_per_query) 19 | _extractor_impl = ContentExtractorImpl(timeout=10) 20 | _citation_formatter = CitationFormatter() 21 | 22 | 23 | @tool 24 | async def web_search(query: str, max_results: int = None) -> List[dict]: 25 | """Search the web for authoritative information using DuckDuckGo search engine. 26 | 27 | This tool executes web searches to find current, accurate information from diverse sources 28 | including academic papers, official documentation, news articles, and expert analyses. 
29 | 30 | ## When to Use 31 | - Gathering factual information on any topic 32 | - Finding authoritative sources (academic, government, official docs) 33 | - Researching current events or recent developments 34 | - Verifying claims or finding supporting evidence 35 | - Discovering expert opinions and analyses 36 | 37 | ## Query Optimization Strategies 38 | 39 | ### For Maximum Accuracy: 40 | - Add "official" or "documentation" for technical topics 41 | - Include "research" or "study" for scientific topics 42 | - Add year (e.g., "2024") for time-sensitive information 43 | - Use site-specific queries: "site:edu" or "site:gov" for authoritative sources 44 | 45 | ### Query Formulation Best Practices: 46 | - Be specific: "Python async await tutorial" > "Python programming" 47 | - Use technical terms: "WebSocket protocol implementation" > "real-time web" 48 | - Include context: "Azure Speech SDK streaming architecture" > "Azure speech" 49 | - Combine concepts: "machine learning healthcare diagnosis 2024" 50 | 51 | ### Query Types for Comprehensive Research: 52 | - Definitional: "what is [topic]", "[topic] explained" 53 | - Technical: "[topic] architecture", "how [topic] works" 54 | - Comparative: "[topic] vs [alternative]", "[topic] comparison" 55 | - Practical: "[topic] best practices", "[topic] tutorial" 56 | - Current: "[topic] 2024", "latest [topic]" 57 | 58 | Args: 59 | query: A well-crafted search query string. Be specific and include relevant 60 | qualifiers. Maximum ~10 words for best results. 61 | 62 | Good examples: 63 | - "WebSocket vs HTTP streaming performance comparison" 64 | - "Azure cognitive services speech SDK documentation" 65 | - "transformer architecture deep learning explained 2024" 66 | - "site:arxiv.org large language models efficiency" 67 | 68 | Avoid: 69 | - Single words: "AI", "cloud", "programming" 70 | - Overly long queries (>15 words) 71 | - Ambiguous terms without context 72 | 73 | max_results: Maximum results to return (default: from config). Higher values 74 | give more sources but may include less relevant results. 75 | 76 | Returns: 77 | List of dictionaries, each containing: 78 | - query (str): The search query used 79 | - title (str): Page title (indicates content focus) 80 | - url (str): Full URL (check domain for credibility) 81 | - snippet (str): Preview text (~150 chars, helps assess relevance) 82 | 83 | Tips: 84 | - Check URL domains: .edu, .gov, .org often indicate credibility 85 | - Review snippets before extracting full content 86 | - If results are poor, try rephrasing with different terms 87 | """ 88 | try: 89 | # Use config value if not specified 90 | if max_results is None: 91 | max_results = config.max_search_results_per_query 92 | 93 | # Update max_results if different 94 | if _search_impl.max_results != max_results: 95 | _search_impl.max_results = max_results 96 | 97 | results = await _search_impl.search_async(query) 98 | 99 | # Convert SearchResult objects to dicts for LLM consumption 100 | return [ 101 | { 102 | "query": r.query, 103 | "title": r.title, 104 | "url": r.url, 105 | "snippet": r.snippet 106 | } 107 | for r in results 108 | ] 109 | except Exception as e: 110 | logger.error(f"Web search tool error: {str(e)}") 111 | return [] 112 | 113 | 114 | @tool 115 | async def extract_webpage_content(url: str) -> Optional[str]: 116 | """Extract the main textual content from a webpage, removing boilerplate and noise. 
117 | 118 | This tool fetches a webpage and uses intelligent content extraction to isolate the 119 | main article body, removing navigation, ads, sidebars, footers, and other non-content 120 | elements. Essential for getting the full context beyond search snippets. 121 | 122 | ## When to Use 123 | - After web_search identifies promising sources 124 | - To get full article text beyond the snippet preview 125 | - For in-depth analysis of specific sources 126 | - When you need to verify claims with full context 127 | - To extract technical details, examples, or data tables 128 | 129 | ## Source Prioritization Guide 130 | 131 | ### Extract First (High Value): 132 | - Official documentation pages (docs.*, developer.*) 133 | - Academic papers and research (arxiv.org, ieee.org, nature.com) 134 | - Government and institutional reports (.gov, .edu) 135 | - Detailed technical blog posts with code/examples 136 | - Industry whitepapers and case studies 137 | 138 | ### Extract If Needed (Medium Value): 139 | - News articles from reputable sources 140 | - Well-written tutorial and how-to guides 141 | - Expert commentary and analysis pieces 142 | - Wikipedia articles (good overviews) 143 | 144 | ### Usually Skip (Low Value): 145 | - Social media pages (limited extraction success) 146 | - Video-primary sites (YouTube, Vimeo) - no transcript extraction 147 | - Login-protected content 148 | - Heavily JavaScript-rendered single-page apps 149 | - Image galleries or portfolio sites 150 | 151 | ## What Gets Extracted 152 | - Main article/post body text 153 | - Headings and subheadings 154 | - Lists and bullet points 155 | - Code blocks and technical content 156 | - Tables (as text) 157 | 158 | ## What Gets Removed 159 | - Navigation menus and headers 160 | - Sidebar content and widgets 161 | - Footer links and copyright notices 162 | - Advertisements and promotions 163 | - Comment sections 164 | - Related article suggestions 165 | 166 | Args: 167 | url: Complete, valid HTTP/HTTPS URL to extract content from. 168 | Must be publicly accessible (no auth required). 169 | 170 | Good candidates: 171 | - "https://docs.microsoft.com/azure/cognitive-services/speech" 172 | - "https://arxiv.org/abs/2301.xxxxx" 173 | - "https://www.nature.com/articles/article-id" 174 | - "https://techblog.example.com/detailed-guide" 175 | 176 | Poor candidates: 177 | - "https://twitter.com/..." (social media) 178 | - "https://youtube.com/..." (video content) 179 | - URLs requiring login 180 | 181 | Returns: 182 | str: Extracted main text content (up to 5000 characters for efficiency). 183 | Content is cleaned and formatted with preserved paragraph breaks. 184 | 185 | None: If extraction fails due to: 186 | - Network/access errors (timeouts, 403/404) 187 | - Login/authentication requirements 188 | - JavaScript-heavy pages with no static content 189 | - Non-text content (PDFs, images, videos) 190 | 191 | Usage Pattern: 192 | 1. Run web_search to find relevant URLs 193 | 2. Review titles and domains for credibility 194 | 3. Extract content from top 3-5 most promising sources 195 | 4. Cross-reference extracted content for verification 196 | """ 197 | try: 198 | content = await _extractor_impl.extract_content_async(url) 199 | return content 200 | except Exception as e: 201 | logger.error(f"Content extraction tool error: {str(e)}") 202 | return None 203 | 204 | 205 | @tool 206 | def analyze_research_topic(topic: str) -> Dict[str, List[str]]: 207 | """Decompose a research topic into structured dimensions for comprehensive coverage. 
208 | 209 | This tool performs preliminary topic analysis to identify the key aspects, 210 | stakeholder perspectives, and essential questions that should be addressed 211 | in a thorough research investigation. 212 | 213 | ## Purpose 214 | Ensures research planning covers all important dimensions of a topic rather 215 | than focusing too narrowly on one aspect. 216 | 217 | ## Analysis Framework 218 | 219 | ### Aspects (What to cover) 220 | The fundamental dimensions or components of the topic: 221 | - Technical/Functional aspects (how it works) 222 | - Historical context (evolution, origins) 223 | - Current state (adoption, implementations) 224 | - Future outlook (trends, predictions) 225 | - Practical implications (real-world impact) 226 | 227 | ### Perspectives (Whose viewpoint) 228 | Different stakeholder or analytical lenses: 229 | - Technical perspective (engineers, developers) 230 | - Business perspective (costs, ROI, strategy) 231 | - User perspective (experience, benefits) 232 | - Ethical perspective (risks, implications) 233 | - Policy perspective (regulations, standards) 234 | 235 | ### Questions (What to answer) 236 | Core questions that comprehensive research should address: 237 | - Definitional: What is it? 238 | - Mechanistic: How does it work? 239 | - Evaluative: What are the pros/cons? 240 | - Comparative: How does it compare to alternatives? 241 | - Prospective: What's the future outlook? 242 | 243 | ## When to Use 244 | - At the start of research planning 245 | - When unsure how to structure research approach 246 | - To ensure comprehensive topic coverage 247 | - To generate diverse search query ideas 248 | 249 | Args: 250 | topic: The research topic or question to analyze. 251 | Can be a simple topic ("machine learning") 252 | or a complex question ("How does Azure Speech SDK streaming work?") 253 | 254 | Returns: 255 | Dictionary containing: 256 | 257 | - aspects (List[str]): Key dimensions to investigate 258 | Typically 3-5 core aspects of the topic 259 | 260 | - perspectives (List[str]): Stakeholder/analytical viewpoints 261 | Different angles from which to examine the topic 262 | 263 | - questions (List[str]): Essential questions to answer 264 | Core questions that research should address 265 | 266 | Usage: 267 | Use the returned analysis to: 268 | 1. Generate diverse search queries covering all aspects 269 | 2. Structure report outline to address all perspectives 270 | 3. 
Verify final report answers all essential questions 271 | """ 272 | # This is a structured thinking tool for the planning agent 273 | # Returns structured breakdown to help with planning 274 | logger.info(f"Analyzing topic: {topic}") 275 | 276 | # Basic heuristic analysis 277 | aspects = [] 278 | perspectives = [] 279 | questions = [] 280 | 281 | # Extract key concepts 282 | words = topic.lower().split() 283 | if "ai" in words or "artificial" in words or "intelligence" in words: 284 | aspects.extend(["applications", "technology", "impact"]) 285 | perspectives.extend(["technical", "ethical", "societal"]) 286 | 287 | if "healthcare" in words or "medical" in words or "health" in words: 288 | aspects.extend(["patient care", "diagnosis", "treatment"]) 289 | perspectives.extend(["patients", "doctors", "researchers"]) 290 | 291 | # Default structure 292 | if not aspects: 293 | aspects = ["overview", "current state", "future trends", "implications"] 294 | if not perspectives: 295 | perspectives = ["technical", "practical", "societal"] 296 | 297 | questions = [ 298 | f"What is the current state of {topic}?", 299 | f"What are the key benefits and challenges?", 300 | f"What does the future hold for {topic}?" 301 | ] 302 | 303 | return { 304 | "aspects": aspects[:5], 305 | "perspectives": perspectives[:4], 306 | "questions": questions[:5] 307 | } 308 | 309 | 310 | @tool 311 | def extract_insights_from_text(text: str, focus: str = "key findings") -> List[str]: 312 | """Extract specific, targeted insights from text content based on a defined focus area. 313 | 314 | This tool performs focused extraction of relevant information from raw text, 315 | helping to isolate specific types of insights like findings, trends, challenges, 316 | benefits, technical details, or statistics. 317 | 318 | ## When to Use 319 | - Extracting specific categories of information from article content 320 | - Isolating technical details or specifications 321 | - Finding statistics, numbers, or quantitative data 322 | - Identifying challenges, limitations, or criticisms 323 | - Pulling out benefits, advantages, or positive outcomes 324 | - Discovering trends, patterns, or predictions 325 | 326 | ## Effective Focus Parameters 327 | 328 | ### For Technical Research: 329 | - "technical specifications" - Extract specs, requirements, parameters 330 | - "implementation details" - How it works, architecture, components 331 | - "performance metrics" - Speed, accuracy, benchmarks, comparisons 332 | - "limitations" - Constraints, edge cases, known issues 333 | 334 | ### For Analysis: 335 | - "key findings" - Main conclusions and discoveries (default) 336 | - "trends" - Patterns, trajectories, emerging developments 337 | - "challenges" - Problems, obstacles, difficulties 338 | - "benefits" - Advantages, positive outcomes, value propositions 339 | - "comparisons" - How things differ, trade-offs, alternatives 340 | 341 | ### For Practical Use: 342 | - "best practices" - Recommended approaches, guidelines 343 | - "use cases" - Applications, examples, scenarios 344 | - "requirements" - Prerequisites, dependencies, conditions 345 | - "steps" - Procedures, processes, workflows 346 | 347 | Args: 348 | text: The text content to analyze. Can be: 349 | - Extracted webpage content 350 | - Search result snippets 351 | - Combined content from multiple sources 352 | Longer texts (>1000 chars) yield better results. 353 | 354 | focus: The type of insight to extract. Be specific for better results. 
355 | Default: "key findings" 356 | 357 | Examples: 358 | - "technical architecture" 359 | - "performance benchmarks" 360 | - "security considerations" 361 | - "cost implications" 362 | - "user benefits" 363 | 364 | Returns: 365 | List[str]: Extracted insights matching the focus area. 366 | Each insight is a complete, standalone statement. 367 | Returns ["No specific insights found..."] if none match. 368 | 369 | Tips: 370 | - Use specific focus terms for targeted extraction 371 | - Combine with multiple focus areas for comprehensive analysis 372 | - Review extracted insights for accuracy before including in reports 373 | """ 374 | logger.info(f"Extracting insights with focus: {focus}") 375 | 376 | # Simple extraction: split by sentences and filter 377 | insights = [] 378 | sentences = text.split('. ') 379 | 380 | focus_keywords = focus.lower().split() 381 | for sentence in sentences[:20]: # Limit to first 20 sentences 382 | sentence_lower = sentence.lower() 383 | # Check if sentence contains focus keywords 384 | if any(keyword in sentence_lower for keyword in focus_keywords): 385 | if len(sentence) > 20 and len(sentence) < 300: 386 | insights.append(sentence.strip() + '.') 387 | 388 | return insights[:10] if insights else ["No specific insights found for this focus."] 389 | 390 | 391 | @tool 392 | def format_citation(url: str, title: str = "", style: str = "apa") -> str: 393 | """Format a source citation in a standardized academic style. 394 | 395 | This tool generates properly formatted citations for the References section 396 | of research reports. Supports major academic citation styles used in 397 | scholarly writing. 398 | 399 | ## Supported Citation Styles 400 | 401 | ### APA (American Psychological Association) - Default 402 | - Common in: Social sciences, psychology, education, business 403 | - Format: Author. (Year). Title. Retrieved from URL 404 | - Example: Smith, J. (2024). Machine Learning Basics. Retrieved from https://... 405 | 406 | ### MLA (Modern Language Association) 407 | - Common in: Humanities, literature, arts 408 | - Format: Author. "Title." Date. Web. Access Date. 409 | - Example: Smith, John. "Machine Learning Basics." Web. 15 Dec. 2024. 410 | 411 | ### Chicago 412 | - Common in: History, some humanities, publishing 413 | - Format: Author. "Title." Accessed Date. URL. 414 | - Example: Smith, John. "Machine Learning Basics." Accessed December 15, 2024. https://... 415 | 416 | ### IEEE (Institute of Electrical and Electronics Engineers) 417 | - Common in: Engineering, computer science, technical fields 418 | - Format: Author, "Title," URL, accessed Date. 419 | - Example: J. Smith, "Machine Learning Basics," https://..., accessed December 15, 2024. 
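
    Illustrative direct call (hypothetical URL and title; the exact rendered string comes
    from CitationFormatter, with the real access date filled in):

        format_citation.invoke({"url": "https://example.com/ml-basics",
                                "title": "Machine Learning Basics", "style": "ieee"})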
420 | 
421 |     ## When to Use
422 |     - Building the References section of a report
423 |     - Need consistent citation formatting
424 |     - Converting URL + title into proper academic format
425 | 
426 |     Args:
427 |         url: Complete URL of the source (required)
428 |              Must be a valid HTTP/HTTPS URL
429 | 
430 |         title: Title of the article/page (recommended)
431 |                Improves citation quality significantly
432 |                If empty, citation will be URL-only
433 | 
434 |         style: Citation format to use (case-insensitive)
435 |                Options: "apa" (default), "mla", "chicago", "ieee"
436 |                Use the style appropriate for your field/audience
437 | 
438 |     Returns:
439 |         str: Formatted citation string ready for inclusion in References
440 |              Includes current date as access date where required by style
441 | 
442 |     Tips:
443 |         - Always provide title when available for better citations
444 |         - Use consistent style throughout a single report
445 |         - APA is a safe default for most research contexts
446 |     """
447 |     logger.info(f"Formatting citation in {style} style")
448 | 
449 |     try:
450 |         # Use the appropriate formatting method based on style
451 |         if style.lower() == "apa":
452 |             return _citation_formatter.format_apa(url, title)
453 |         elif style.lower() == "mla":
454 |             return _citation_formatter.format_mla(url, title)
455 |         elif style.lower() == "chicago":
456 |             return _citation_formatter.format_chicago(url, title)
457 |         elif style.lower() == "ieee":
458 |             return _citation_formatter.format_ieee(url, title)
459 |         else:
460 |             # Default to APA
461 |             return _citation_formatter.format_apa(url, title)
462 |     except Exception as e:
463 |         logger.error(f"Citation formatting error: {e}")
464 |         # Fallback to simple format
465 |         if title:
466 |             return f"{title}. Retrieved from {url}"
467 |         return url
468 | 
469 | 
470 | @tool
471 | def validate_section_quality(section_text: str, min_words: int = 150) -> Dict[str, Any]:
472 |     """Validate a report section against quality standards before finalizing.
473 | 
474 |     This tool performs comprehensive quality checks on written sections to ensure
475 |     they meet minimum standards for length, citation usage, structure, and
476 |     overall readability. Use BEFORE submitting final section content.
477 | 
478 |     ## Quality Dimensions Checked
479 | 
480 |     ### 1. Length Requirements
481 |     - Minimum word count enforcement
482 |     - Flags sections that are too short for meaningful coverage
483 | 
484 |     ### 2. Citation Analysis
485 |     - Presence of inline citations [1], [2], etc.
486 |     - Academic writing requires citations for factual claims
487 | 
488 |     ### 3. Structural Elements
489 |     - Use of markdown headings for organization
490 |     - Appropriate for sections over 300 words
491 | 
492 |     ## When to Use
493 |     - After drafting any report section
494 |     - Before returning final section content
495 |     - To identify areas needing improvement
496 |     - To ensure minimum quality thresholds are met
497 | 
498 |     ## Interpreting Results
499 | 
500 |     ### is_valid = True
501 |     - Section meets all minimum requirements
502 |     - Safe to include in final report
503 | 
504 |     ### is_valid = False
505 |     - One or more critical issues found
506 |     - Review 'issues' list for specific problems
507 |     - Follow 'suggestions' for improvements
508 |     - Revise section before submitting
509 | 
510 |     Args:
511 |         section_text: The complete section content to validate.
512 |                       Should be the full markdown text you plan to submit.
513 | 
514 |         min_words: Minimum acceptable word count. Default: 150
515 |                    For comprehensive sections, use 200-300
516 |                    For brief overviews, 100-150 may suffice
517 | 
518 |     Returns:
519 |         Dictionary containing:
520 | 
521 |         - is_valid (bool): True if ALL quality checks pass
522 | 
523 |         - word_count (int): Actual word count of the section
524 |           Compare against min_words to see the gap
525 | 
526 |         - has_citations (bool): True if [n] citation format detected
527 |           FALSE = Major issue for factual content
528 | 
529 |         - issues (List[str]): Specific problems found
530 |           Empty list = no issues
531 |           Examples:
532 |           - "Section too short: 89 words (minimum: 150)"
533 |           - "No citations found"
534 | 
535 |         - suggestions (List[str]): Actionable improvement recommendations
536 |           Examples:
537 |           - "Add more detail and supporting information"
538 |           - "Add inline citations [1], [2] to support claims"
539 |           - "Consider adding subheadings for better structure"
540 | 
541 |     Usage Pattern:
542 |         1. Write your section content
543 |         2. Call validate_section_quality(your_content, min_words=200)
544 |         3. If is_valid is False, revise based on issues/suggestions
545 |         4. Repeat until is_valid is True
546 |         5. Submit the validated section
547 |     """
548 |     logger.info("Validating section quality")
549 | 
550 |     word_count = len(section_text.split())
551 |     has_citations = bool(re.search(r'\[\d+\]', section_text))  # match [n]-style inline citations, per the docstring
552 |     has_headers = '#' in section_text
553 | 
554 |     issues = []
555 |     suggestions = []
556 | 
557 |     if word_count < min_words:
558 |         issues.append(f"Section too short: {word_count} words (minimum: {min_words})")
559 |         suggestions.append("Add more detail and supporting information")
560 | 
561 |     if not has_citations:
562 |         issues.append("No citations found")
563 |         suggestions.append("Add inline citations [1], [2] to support claims")
564 | 
565 |     if not has_headers and word_count > 300:
566 |         suggestions.append("Consider adding subheadings for better structure")
567 | 
568 |     is_valid = len(issues) == 0
569 | 
570 |     return {
571 |         "is_valid": is_valid,
572 |         "word_count": word_count,
573 |         "has_citations": has_citations,
574 |         "issues": issues,
575 |         "suggestions": suggestions
576 |     }
577 | 
578 | 
579 | # Tool lists for different agents
580 | research_search_tools = [
581 |     web_search,
582 |     extract_webpage_content
583 | ]
584 | 
585 | synthesis_tools = [
586 |     extract_insights_from_text
587 | ]
588 | 
589 | writing_tools = [
590 |     format_citation,
591 |     validate_section_quality
592 | ]
593 | 
594 | planning_tools = [
595 |     analyze_research_topic
596 | ]
597 | 
598 | # All tools combined
599 | all_research_tools = [
600 |     web_search,
601 |     extract_webpage_content,
602 |     analyze_research_topic,
603 |     extract_insights_from_text,
604 |     format_citation,
605 |     validate_section_quality
606 | ]
607 | 
608 | 
609 | def get_research_tools(agent_type: str = "search") -> List:
610 |     """Get research tools for a specific agent type.
611 | 
612 |     Args:
613 |         agent_type: Type of agent ("search", "synthesis", "writing", "planning", "all")
614 | 
615 |     Returns:
616 |         List of LangChain tool objects for that agent
617 |     """
618 |     tools_map = {
619 |         "search": research_search_tools,
620 |         "synthesis": synthesis_tools,
621 |         "writing": writing_tools,
622 |         "planning": planning_tools,
623 |         "all": all_research_tools
624 |     }
625 |     return tools_map.get(agent_type, research_search_tools)
--------------------------------------------------------------------------------
/src/agents.py:
--------------------------------------------------------------------------------
1 | """Agent nodes for the research workflow."""
2 | 
3 | import asyncio
4 | from typing import List, Optional
5 | import logging
6 | 
7 | from langchain_google_genai import ChatGoogleGenerativeAI
8 | from langchain_ollama import ChatOllama
9 | from langchain_openai import ChatOpenAI
10 | from langchain_core.prompts import ChatPromptTemplate
11 | from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
12 | from langchain.agents import create_agent
13 | 
14 | from src.state import ResearchState, ResearchPlan, SearchQuery, ReportSection
15 | from src.utils.tools import get_research_tools
16 | from src.config import config
17 | from src.utils.credibility import CredibilityScorer
18 | from src.utils.citations import CitationFormatter
19 | from src.llm_tracker import estimate_tokens
20 | from src.callbacks import (
21 |     emit_planning_start, emit_planning_complete,
22 |     emit_search_start, emit_search_results,
23 |     emit_extraction_start, emit_extraction_complete,
24 |     emit_synthesis_start, emit_synthesis_progress, emit_synthesis_complete,
25 |     emit_writing_start, emit_writing_section, emit_writing_complete,
26 |     emit_error
27 | )
28 | import time
29 | 
30 | logging.basicConfig(level=logging.INFO)
31 | logger = logging.getLogger(__name__)
32 | 
33 | 
34 | def get_llm(temperature: float = 0.7, model_override: Optional[str] = None):
35 |     """Get LLM instance based on configuration.
36 | 37 | Args: 38 | temperature: Temperature for the LLM 39 | model_override: Optional model name to override config.model_name 40 | 41 | Returns: 42 | LLM instance (ChatOllama, ChatGoogleGenerativeAI, or ChatOpenAI) 43 | """ 44 | model_name = model_override or config.model_name 45 | 46 | if config.model_provider == "ollama": 47 | logger.info(f"Using Ollama model: {model_name}") 48 | return ChatOllama( 49 | model=model_name, 50 | base_url=config.ollama_base_url, 51 | temperature=temperature, 52 | num_ctx=8192, # Context window 53 | ) 54 | elif config.model_provider == "openai": 55 | logger.info(f"Using OpenAI model: {model_name}") 56 | return ChatOpenAI( 57 | model=model_name, 58 | api_key=config.openai_api_key, 59 | temperature=temperature 60 | ) 61 | elif config.model_provider == "llamacpp": 62 | logger.info(f"Using llama.cpp server model: {model_name}") 63 | # llama.cpp server exposes OpenAI-compatible API 64 | return ChatOpenAI( 65 | model=model_name, 66 | base_url=f"{config.llamacpp_base_url}/v1", # OpenAI-compatible endpoint 67 | api_key="not-needed", # llama.cpp doesn't require API key 68 | temperature=temperature 69 | ) 70 | else: # gemini 71 | logger.info(f"Using Gemini model: {model_name}") 72 | return ChatGoogleGenerativeAI( 73 | model=model_name, 74 | google_api_key=config.google_api_key, 75 | temperature=temperature 76 | ) 77 | 78 | 79 | class ResearchPlanner: 80 | """Autonomous agent responsible for planning research strategy.""" 81 | 82 | def __init__(self): 83 | self.llm = get_llm(temperature=0.7) 84 | # Note: Planning agent uses LLM directly with structured output for reliability 85 | # Tool calling works better for search/extraction tasks 86 | self.max_retries = 3 87 | 88 | async def plan(self, state: ResearchState) -> dict: 89 | """Create a research plan with structured LLM output. 90 | 91 | Returns dict with updates that LangGraph will merge into state. 92 | """ 93 | logger.info(f"Planning research for: {state.research_topic}") 94 | 95 | # Emit progress update 96 | await emit_planning_start(state.research_topic) 97 | 98 | prompt = ChatPromptTemplate.from_messages([ 99 | ("system", """You are an expert research strategist and information architect. Your role is to create comprehensive, methodical research plans that maximize accuracy and depth of coverage. 100 | 101 | ## Your Core Responsibilities 102 | 103 | ### 1. Define SMART Research Objectives (3-5 objectives) 104 | Create objectives that are: 105 | - **Specific**: Target concrete aspects of the topic, not vague generalities 106 | - **Measurable**: Can be verified as addressed in the final report 107 | - **Achievable**: Realistically answerable through web research 108 | - **Relevant**: Directly address the user's query and implied needs 109 | - **Time-aware**: Consider current state, recent developments, and future outlook 110 | 111 | ### 2. 
Design Strategic Search Queries (up to {max_queries} queries) 112 | 113 | **Query Diversity Matrix** - Ensure coverage across: 114 | - **Definitional queries**: "What is [topic]" / "[topic] explained" 115 | - **Mechanism queries**: "How does [topic] work" / "[topic] architecture" 116 | - **Comparison queries**: "[topic] vs alternatives" / "[topic] comparison" 117 | - **Expert/authoritative queries**: "[topic] research paper" / "[topic] official documentation" 118 | - **Practical queries**: "[topic] best practices" / "[topic] implementation guide" 119 | - **Trend queries**: "[topic] 2024" / "latest [topic] developments" 120 | - **Problem/solution queries**: "[topic] challenges" / "[topic] limitations" 121 | 122 | **Query Quality Guidelines**: 123 | - Use specific technical terms when appropriate 124 | - Include year markers for time-sensitive topics (e.g., "2024", "latest") 125 | - Add domain qualifiers for targeted results (e.g., "academic", "enterprise", "tutorial") 126 | - Avoid overly broad single-word queries 127 | - Consider alternative phrasings and synonyms 128 | 129 | ### 3. Structure the Report Outline (up to {max_sections} sections) 130 | 131 | Create a logical flow that: 132 | - Starts with context/background (helps readers understand the landscape) 133 | - Progresses from fundamentals to advanced topics 134 | - Groups related concepts together 135 | - Ends with practical implications, conclusions, or future outlook 136 | - Includes a dedicated section for technical details if applicable 137 | 138 | **Recommended Section Types**: 139 | - Executive Summary / Overview 140 | - Background & Context 141 | - Core Concepts / How It Works 142 | - Key Features / Components / Architecture 143 | - Benefits & Advantages 144 | - Challenges & Limitations 145 | - Use Cases / Applications 146 | - Comparison with Alternatives (if relevant) 147 | - Best Practices / Implementation Guidelines 148 | - Future Outlook / Trends 149 | - Conclusion & Recommendations 150 | 151 | ## Output Quality Standards 152 | - Every search query must have a clear, distinct purpose 153 | - No redundant or overlapping queries 154 | - Report sections should comprehensively cover all objectives 155 | - Consider the user's apparent expertise level when designing the plan"""), 156 | ("human", """Research Topic: {topic} 157 | 158 | Analyze this topic carefully. Consider: 159 | 1. What is the user really trying to understand? 160 | 2. What are the key dimensions of this topic? 161 | 3. What authoritative sources would have the best information? 162 | 4. What technical depth is appropriate? 163 | 164 | Create a detailed research plan in JSON format: 165 | {{ 166 | "topic": "the research topic (refined if needed for clarity)", 167 | "objectives": [ 168 | "Specific, measurable objective 1", 169 | "Specific, measurable objective 2", 170 | ... 171 | ], 172 | "search_queries": [ 173 | {{"query": "well-crafted search query 1", "purpose": "specific reason this query helps achieve objectives"}}, 174 | {{"query": "well-crafted search query 2", "purpose": "specific reason this query helps achieve objectives"}}, 175 | ... 176 | ], 177 | "report_outline": [ 178 | "Section 1: Logical starting point", 179 | "Section 2: Building on Section 1", 180 | ... 
181 | ]
182 | }}
183 | 
184 | Ensure each query targets different aspects and the outline tells a coherent story.""")
185 |         ])
186 | 
187 |         for attempt in range(self.max_retries):
188 |             try:
189 |                 start_time = time.time()
190 |                 chain = prompt | self.llm | JsonOutputParser()
191 | 
192 |                 # Estimate input tokens
193 |                 input_text = f"{state.research_topic} {config.max_search_queries} {config.max_report_sections}"
194 |                 input_tokens = estimate_tokens(input_text)
195 | 
196 |                 result = await chain.ainvoke({
197 |                     "topic": state.research_topic,
198 |                     "max_queries": config.max_search_queries,
199 |                     "max_sections": config.max_report_sections
200 |                 })
201 | 
202 |                 # Track LLM call
203 |                 duration = time.time() - start_time
204 |                 output_tokens = estimate_tokens(str(result))
205 |                 call_detail = {
206 |                     'agent': 'ResearchPlanner',
207 |                     'operation': 'plan',
208 |                     'model': config.model_name,
209 |                     'input_tokens': input_tokens,
210 |                     'output_tokens': output_tokens,
211 |                     'duration': round(duration, 2),
212 |                     'attempt': attempt + 1
213 |                 }
214 | 
215 |                 # Validate result structure once, then build the plan from it
216 |                 if not all(key in result for key in ["topic", "objectives", "search_queries", "report_outline"]):
217 |                     raise ValueError("Invalid plan structure returned")
218 | 
219 |                 if not result["search_queries"]:
220 |                     raise ValueError("No search queries generated")
221 | 
222 |                 plan_data = result
223 | 
232 |                 # Convert to ResearchPlan with HARD LIMITS enforced
233 |                 plan = ResearchPlan(
234 |                     topic=plan_data["topic"],
235 |                     objectives=plan_data["objectives"][:5],  # Max 5 objectives
236 |                     search_queries=[
237 |                         SearchQuery(query=sq["query"], purpose=sq["purpose"])
238 |                         for sq in plan_data["search_queries"][:config.max_search_queries]
239 |                     ],
240 |                     report_outline=plan_data["report_outline"][:config.max_report_sections]
241 |                 )
242 | 
243 |                 logger.info(f"Created plan with {len(plan.search_queries)} queries (enforced max: {config.max_search_queries})")
244 |                 logger.info(f"Report outline has {len(plan.report_outline)} sections (enforced max: {config.max_report_sections})")
245 | 
246 |                 # Emit progress update
247 |                 await emit_planning_complete(len(plan.search_queries), len(plan.report_outline))
248 | 
249 |                 # Return dict updates - LangGraph merges into state
250 |                 return {
251 |                     "plan": plan,
252 |                     "current_stage": "searching",
253 |                     "iterations": state.iterations + 1,
254 |                     "llm_calls": state.llm_calls + 1,
255 |                     "total_input_tokens": state.total_input_tokens + input_tokens,
256 |                     "total_output_tokens": state.total_output_tokens + output_tokens,
257 |                     "llm_call_details": state.llm_call_details + [call_detail]
258 |                 }
259 | 
260 |             except Exception as e:
261 |                 logger.warning(f"Planning attempt {attempt + 1} failed: {str(e)}")
262 |                 if attempt == self.max_retries - 1:
263 |                     logger.error(f"Planning failed after {self.max_retries} attempts")
264 |                     return {
265 |                         "error": f"Planning failed: {str(e)}",
266 |                         "iterations": state.iterations + 1
267 |                     }
268 |                 else:
269 |                     await asyncio.sleep(2 ** attempt)
270 | 
271 |         # Fallback if all retries exhausted
272 |         return {
273 |             "error": "Planning failed: Maximum retries exceeded",
274 |             "iterations": state.iterations + 1
275 |         }
276 | 
277 | 
278 | class ResearchSearcher:
279 |     
"""Autonomous agent responsible for executing research searches.""" 280 | 281 | def __init__(self): 282 | self.llm = get_llm(temperature=0.3) 283 | self.tools = get_research_tools(agent_type="search") 284 | self.credibility_scorer = CredibilityScorer() 285 | self.max_retries = 3 286 | 287 | async def search(self, state: ResearchState) -> dict: 288 | """Autonomously execute research searches using tools. 289 | 290 | The agent will decide which searches to perform, when to extract content, 291 | and how to gather comprehensive information. 292 | 293 | Returns dict with search results that LangGraph will merge into state. 294 | """ 295 | if not state.plan: 296 | await emit_error("No research plan available") 297 | return {"error": "No research plan available"} 298 | 299 | logger.info(f"Autonomous agent researching: {len(state.plan.search_queries)} planned queries") 300 | 301 | # Emit progress for each planned query 302 | total_queries = len(state.plan.search_queries) 303 | for i, query in enumerate(state.plan.search_queries, 1): 304 | await emit_search_start(query.query, i, total_queries) 305 | 306 | # Create system prompt for autonomous agent with config-based limits 307 | max_searches = config.max_search_queries 308 | max_results_per_search = config.max_search_results_per_query 309 | expected_total_results = max_searches * max_results_per_search 310 | 311 | system_prompt = f"""You are an elite research investigator with expertise in finding accurate, authoritative information. Your mission is to gather comprehensive, verified data from the most credible sources available. 312 | 313 | ## Your Available Tools 314 | 1. **web_search(query, max_results)**: Search the web for information 315 | 2. **extract_webpage_content(url)**: Extract full article content from a URL 316 | 317 | ## Research Protocol 318 | 319 | ### Phase 1: Strategic Searching 320 | Execute the planned search queries systematically: 321 | - Limit to **{max_searches} searches maximum** 322 | - Each search returns up to **{max_results_per_search} results** 323 | - If initial queries yield poor results, adapt with refined queries 324 | 325 | ### Phase 2: Source Evaluation & Content Extraction 326 | For each search result, quickly assess source quality: 327 | 328 | **HIGH-PRIORITY Sources (extract immediately):** 329 | - Government sites (.gov, .gov.uk, .europa.eu) 330 | - Academic institutions (.edu, .ac.uk, university domains) 331 | - Peer-reviewed journals (nature.com, sciencedirect.com, ieee.org) 332 | - Official documentation (docs.*, official product sites) 333 | - Established news organizations (reuters.com, bbc.com, nytimes.com) 334 | - Industry-recognized publications 335 | 336 | **MEDIUM-PRIORITY Sources (extract if needed):** 337 | - Well-known tech publications (techcrunch.com, wired.com, arstechnica.com) 338 | - Reputable blogs with author credentials 339 | - Company blogs from established organizations 340 | - Wikipedia (good for overview, verify claims elsewhere) 341 | 342 | **LOW-PRIORITY Sources (use cautiously):** 343 | - Personal blogs without credentials 344 | - User-generated content sites 345 | - Sites with excessive ads or clickbait titles 346 | - Sources without clear authorship 347 | - Outdated content (check publication dates) 348 | 349 | ### Phase 3: Content Gathering 350 | - Extract full content from the **top {expected_total_results} most promising URLs** 351 | - Prioritize sources that directly address the research objectives 352 | - Look for primary sources (original research, official docs) over 
secondary summaries 353 | - Note publication dates - prefer recent content for evolving topics 354 | 355 | ## Quality Checkpoints 356 | Before concluding, verify you have: 357 | [x] Multiple sources confirming key facts (cross-referencing) 358 | [x] At least some high-credibility sources in your collection 359 | [x] Coverage across different aspects of the research objectives 360 | [x] Both overview content and specific technical details 361 | 362 | ## Completion Signal 363 | When you have gathered sufficient high-quality information (aim for {expected_total_results} quality sources with extracted content), respond with: 364 | 365 | RESEARCH_COMPLETE: [Summary of what you found, including: 366 | - Number of sources gathered 367 | - Key themes discovered 368 | - Any notable gaps or areas needing more research 369 | - Confidence level in the gathered information]""" 370 | 371 | # Create autonomous agent using LangChain's create_agent 372 | agent_graph = create_agent( 373 | self.llm, 374 | self.tools, 375 | system_prompt=system_prompt 376 | ) 377 | 378 | for attempt in range(self.max_retries): 379 | try: 380 | start_time = time.time() 381 | 382 | # Prepare input 383 | objectives_text = "\n".join(f"- {obj}" for obj in state.plan.objectives) 384 | queries_text = "\n".join( 385 | f"- {q.query} (Purpose: {q.purpose})" 386 | for q in state.plan.search_queries 387 | ) 388 | 389 | # Estimate input tokens 390 | input_message = f"""## Research Mission Brief 391 | 392 | ### Topic Under Investigation: 393 | {state.research_topic} 394 | 395 | ### Research Objectives (All must be addressed): 396 | {objectives_text} 397 | 398 | ### Planned Search Queries (Execute strategically): 399 | {queries_text} 400 | 401 | --- 402 | 403 | ### Your Mission: 404 | 1. Execute the search queries above using the web_search tool 405 | 2. Evaluate results for credibility and relevance 406 | 3. Extract full content from the most authoritative sources using extract_webpage_content 407 | 4. Ensure you gather information that addresses ALL research objectives 408 | 5. Prioritize recent, authoritative sources over older or less credible ones 409 | 410 | ### Quality Targets: 411 | - Gather from at least {config.max_search_queries * config.max_search_results_per_query} different sources 412 | - Extract full content from the top 5-8 most relevant pages 413 | - Ensure coverage across all research objectives 414 | - Include at least some academic, government, or official documentation sources if available 415 | 416 | Begin your systematic research now. 
Execute searches and extract content until you have comprehensive coverage.""" 417 | 418 | input_tokens = estimate_tokens(input_message) 419 | 420 | # Execute autonomous research 421 | result = await agent_graph.ainvoke({ 422 | "messages": [{"role": "user", "content": input_message}] 423 | }) 424 | 425 | # Track LLM call (approximation - agent may make multiple calls) 426 | duration = time.time() - start_time 427 | 428 | # Extract messages from result 429 | messages = result.get('messages', []) 430 | output_text = "" 431 | if messages: 432 | output_text = str(messages[-1].content if hasattr(messages[-1], 'content') else str(messages[-1])) 433 | 434 | output_tokens = estimate_tokens(output_text) 435 | 436 | # Extract search results from messages 437 | # We need to track tool calls and results within the messages 438 | search_results = [] 439 | from src.state import SearchResult 440 | 441 | for msg in messages: 442 | # Check for tool calls in message 443 | if hasattr(msg, 'tool_calls') and msg.tool_calls: 444 | for tool_call in msg.tool_calls: 445 | if tool_call.get('name') == 'web_search': 446 | # This is a search request, we'll get results in next message 447 | pass 448 | 449 | # Check for tool responses 450 | if hasattr(msg, 'name') and msg.name == 'web_search': 451 | # Parse tool response 452 | try: 453 | content = msg.content 454 | if isinstance(content, str): 455 | import json 456 | tool_results = json.loads(content) 457 | else: 458 | tool_results = content 459 | 460 | if isinstance(tool_results, list): 461 | for item in tool_results: 462 | if isinstance(item, dict): 463 | search_results.append(SearchResult( 464 | query=item.get('query', ''), 465 | title=item.get('title', ''), 466 | url=item.get('url', ''), 467 | snippet=item.get('snippet', ''), 468 | content=None 469 | )) 470 | except Exception as e: 471 | logger.warning(f"Error parsing tool result: {e}") 472 | 473 | # Check for content extraction results 474 | if hasattr(msg, 'name') and msg.name == 'extract_webpage_content': 475 | try: 476 | content = msg.content 477 | # Find the corresponding search result and update it 478 | # Note: This is a simplified approach, might need refinement 479 | if search_results and content: 480 | # Update the most recent search result without content 481 | for sr in reversed(search_results): 482 | if not sr.content: 483 | sr.content = content 484 | break 485 | except Exception as e: 486 | logger.warning(f"Error updating content: {e}") 487 | 488 | logger.info(f"Autonomous agent collected {len(search_results)} results") 489 | 490 | # Calculate total extracted content 491 | total_extracted_chars = sum( 492 | len(r.content) if r.content else 0 493 | for r in search_results 494 | ) 495 | extracted_count = sum(1 for r in search_results if r.content) 496 | 497 | # Emit extraction completion 498 | await emit_extraction_complete(extracted_count, total_extracted_chars) 499 | 500 | if not search_results: 501 | await emit_error("Agent did not collect any search results") 502 | raise ValueError("Agent did not collect any search results") 503 | 504 | # Score all results first 505 | scored_results = self.credibility_scorer.score_search_results(search_results) 506 | 507 | # Filter by minimum credibility score 508 | filtered_scored = [ 509 | item for item in scored_results 510 | if item['credibility']['score'] >= config.min_credibility_score 511 | ] 512 | 513 | # Extract filtered results and scores (already sorted by score, highest first) 514 | credibility_scores = [item['credibility'] for item in filtered_scored] 
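                # NOTE: credibility_scores and sorted_results stay index-aligned;
                # the synthesizer later zips them to label each source.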
515 | sorted_results = [item['result'] for item in filtered_scored] 516 | 517 | logger.info(f"Filtered {len(search_results)} -> {len(sorted_results)} results (min_credibility={config.min_credibility_score})") 518 | 519 | # Mark queries as completed 520 | for q in state.plan.search_queries: 521 | q.completed = True 522 | 523 | call_detail = { 524 | 'agent': 'ResearchSearcher', 525 | 'operation': 'autonomous_search', 526 | 'model': config.model_name, 527 | 'input_tokens': input_tokens, 528 | 'output_tokens': output_tokens, 529 | 'duration': round(duration, 2), 530 | 'results_count': len(sorted_results), 531 | 'original_results_count': len(search_results), 532 | 'min_credibility_score': config.min_credibility_score, 533 | 'attempt': attempt + 1 534 | } 535 | 536 | # Return dict updates - LangGraph merges into state 537 | return { 538 | "search_results": sorted_results, 539 | "credibility_scores": credibility_scores, 540 | "current_stage": "synthesizing", 541 | "iterations": state.iterations + 1, 542 | "llm_calls": state.llm_calls + 1, 543 | "total_input_tokens": state.total_input_tokens + input_tokens, 544 | "total_output_tokens": state.total_output_tokens + output_tokens, 545 | "llm_call_details": state.llm_call_details + [call_detail] 546 | } 547 | 548 | except Exception as e: 549 | logger.warning(f"Search attempt {attempt + 1} failed: {str(e)}") 550 | if attempt == self.max_retries - 1: 551 | logger.error(f"Search failed after {self.max_retries} attempts") 552 | return { 553 | "error": f"Search failed: {str(e)}", 554 | "iterations": state.iterations + 1 555 | } 556 | else: 557 | await asyncio.sleep(2 ** attempt) 558 | 559 | # Fallback if all retries exhausted 560 | return { 561 | "error": "Search failed: Maximum retries exceeded", 562 | "iterations": state.iterations + 1 563 | } 564 | 565 | 566 | class ResearchSynthesizer: 567 | """Autonomous agent responsible for synthesizing research findings.""" 568 | 569 | def __init__(self): 570 | self.llm = get_llm(temperature=0.3, model_override=config.summarization_model) 571 | self.tools = get_research_tools(agent_type="synthesis") 572 | self.max_retries = 3 573 | 574 | async def synthesize(self, state: ResearchState) -> dict: 575 | """Autonomously synthesize key findings using tools and reasoning. 576 | 577 | Returns dict with key findings that LangGraph will merge into state. 578 | """ 579 | logger.info(f"Synthesizing findings from {len(state.search_results)} results") 580 | 581 | if not state.search_results: 582 | await emit_error("No search results to synthesize") 583 | return {"error": "No search results to synthesize"} 584 | 585 | # Emit synthesis start 586 | await emit_synthesis_start(len(state.search_results)) 587 | 588 | # Create system prompt for autonomous synthesis agent 589 | system_prompt = """You are a senior research analyst specializing in synthesizing complex information into accurate, actionable insights. Your task is to analyze search results and extract verified, well-supported findings. 590 | 591 | ## Your Available Tools 592 | - **extract_insights_from_text(text, focus)**: Extract specific insights from text content 593 | 594 | ## Source Credibility Framework 595 | 596 | Each source has a credibility rating. 
Apply this hierarchy strictly: 597 | 598 | ### HIGH Credibility (Score >=70) - Primary Sources 599 | - Government and institutional sources 600 | - Peer-reviewed research and academic papers 601 | - Official documentation and specifications 602 | - Established news organizations with editorial standards 603 | => **TRUST**: Use as primary basis for findings 604 | 605 | ### MEDIUM Credibility (Score 40-69) - Supporting Sources 606 | - Industry publications and tech blogs 607 | - Expert commentary and analysis 608 | - Well-maintained wikis and documentation 609 | => **VERIFY**: Cross-reference with HIGH sources; use to add context 610 | 611 | ### LOW Credibility (Score <40) - Supplementary Only 612 | - Personal blogs, forums, user comments 613 | - Sources without clear authorship 614 | - Outdated or unverified content 615 | => **CAUTION**: Only use if corroborated by higher-credibility sources 616 | 617 | ## Synthesis Methodology 618 | 619 | ### Step 1: Identify Core Facts 620 | - What claims appear in multiple HIGH-credibility sources? 621 | - What are the foundational facts that most sources agree on? 622 | - Extract specific data points: numbers, dates, names, technical specifications 623 | 624 | ### Step 2: Detect and Resolve Conflicts 625 | When sources contradict each other: 626 | 1. Check credibility scores - trust higher-rated sources 627 | 2. Check recency - newer information may supersede older 628 | 3. Check specificity - primary sources trump secondary summaries 629 | 4. If unresolvable, note the disagreement in findings 630 | 631 | ### Step 3: Synthesize Key Findings 632 | For each finding, ensure: 633 | - **Accuracy**: Only include information that appears in the sources 634 | - **Attribution**: Note which source numbers support the finding [1], [2], etc. 635 | - **Specificity**: Include concrete details, not vague generalities 636 | - **Balance**: Present multiple perspectives if sources differ 637 | 638 | ### Step 4: Quality Control 639 | Before finalizing, verify: 640 | [x] No claims are made without source support 641 | [x] HIGH-credibility sources are prioritized 642 | [x] Contradictions are acknowledged, not ignored 643 | [x] Findings directly address research objectives 644 | [x] Technical accuracy is maintained (don't oversimplify incorrectly) 645 | 646 | ## Output Format 647 | 648 | Return findings as a JSON array of strings. 
Each finding should: 649 | - Be a complete, standalone insight 650 | - Include source references where applicable 651 | - Be specific enough to be useful (avoid generic statements) 652 | - Focus on facts over opinions (unless opinion is from recognized experts) 653 | 654 | Example format: 655 | [ 656 | "Finding 1: [Specific fact or insight] - supported by sources [1], [3]", 657 | "Finding 2: [Technical detail with specifics] - per official documentation [2]", 658 | "Finding 3: [Trend or development] - noted across multiple industry sources [4], [5], [6]" 659 | ] 660 | 661 | ## Anti-Hallucination Rules 662 | DO NOT invent statistics, dates, or specifics not in sources 663 | DO NOT make claims beyond what sources support 664 | DO NOT present speculation as fact 665 | DO NOT ignore source credibility ratings 666 | DO say "sources indicate" or "according to [source]" for less certain claims 667 | DO note when information is limited or conflicting""" 668 | 669 | # Create autonomous synthesis agent 670 | agent_graph = create_agent( 671 | self.llm, 672 | self.tools, 673 | system_prompt=system_prompt 674 | ) 675 | 676 | # Progressive truncation strategy 677 | max_results = 20 678 | 679 | for attempt in range(self.max_retries): 680 | try: 681 | start_time = time.time() 682 | 683 | # Adjust result count based on attempt 684 | current_max = max(5, max_results - (attempt * 5)) 685 | 686 | # Prepare search results text with credibility information 687 | results_to_use = state.search_results[:current_max] 688 | credibility_scores_to_use = state.credibility_scores[:current_max] if state.credibility_scores else [] 689 | 690 | results_text = "\n\n".join([ 691 | f"[{i+1}] {r.title}\n" 692 | f"URL: {r.url}\n" 693 | f"Credibility: {cred.get('level', 'unknown').upper()} (Score: {cred.get('score', 'N/A')}/100) - {', '.join(cred.get('factors', []))}\n" 694 | f"Snippet: {r.snippet}\n" + 695 | (f"Content: {r.content[:300]}..." if r.content else "") 696 | for i, (r, cred) in enumerate(zip(results_to_use, credibility_scores_to_use)) 697 | ]) 698 | 699 | # If credibility scores don't match (shouldn't happen, but handle gracefully) 700 | if len(results_to_use) != len(credibility_scores_to_use): 701 | # Fallback: format without credibility if mismatch 702 | results_text = "\n\n".join([ 703 | f"[{i+1}] {r.title}\nURL: {r.url}\nSnippet: {r.snippet}\n" + 704 | (f"Content: {r.content[:300]}..." if r.content else "") 705 | for i, r in enumerate(results_to_use) 706 | ]) 707 | 708 | # Prepare input message for the autonomous agent 709 | input_message = f"""## Research Synthesis Task 710 | 711 | ### Topic: {state.research_topic} 712 | 713 | ### Your Mission: 714 | Analyze the search results below and extract the most important, accurate, and well-supported findings. 715 | 716 | --- 717 | 718 | ### Search Results with Credibility Scores: 719 | {results_text} 720 | 721 | --- 722 | 723 | ### Synthesis Instructions: 724 | 725 | 1. **Extract Key Facts**: Identify the core factual claims across sources 726 | 2. **Cross-Reference**: Note which findings are supported by multiple sources 727 | 3. **Resolve Conflicts**: When sources disagree, trust higher-credibility sources 728 | 4. **Maintain Specificity**: Include specific details, numbers, and technical information 729 | 5. **Note Limitations**: Flag areas where information is sparse or contradictory 730 | 731 | ### Output Requirements: 732 | Return a JSON array of 10-15 key findings. 
Each finding should:
733 | - Be a complete, specific statement (not vague generalizations)
734 | - Reference source numbers when citing facts: "...according to [1]" or "...per [3], [5]"
735 | - Focus on facts that directly address the research topic
736 | - Prioritize findings from HIGH-credibility sources
737 | 
738 | Example format:
739 | [
740 |     "The technology uses [specific mechanism] to achieve [specific outcome], enabling [specific capability] [1]",
741 |     "According to official documentation [2], the key components include: [list specific items]",
742 |     "Industry adoption has grown to [specific metric], with major deployments at [specific examples] [3], [5]",
743 |     "Experts note challenges including [specific challenge 1] and [specific challenge 2] [4]"
744 | ]
745 | 
746 | Analyze the sources now and extract your findings:"""
747 | 
748 |                 # Estimate input tokens
749 |                 input_tokens = estimate_tokens(input_message)
750 | 
751 |                 # Execute autonomous synthesis
752 |                 result = await agent_graph.ainvoke({
753 |                     "messages": [{"role": "user", "content": input_message}]
754 |                 })
755 | 
756 |                 # Track LLM call
757 |                 duration = time.time() - start_time
758 | 
759 |                 # Extract final response
760 |                 messages = result.get('messages', [])
761 |                 output_text = ""
762 |                 if messages:
763 |                     last_msg = messages[-1]
764 |                     output_text = str(last_msg.content if hasattr(last_msg, 'content') else str(last_msg))
765 | 
766 |                 output_tokens = estimate_tokens(output_text)
767 | 
768 |                 call_detail = {
769 |                     'agent': 'ResearchSynthesizer',
770 |                     'operation': 'autonomous_synthesis',
771 |                     'model': config.summarization_model,
772 |                     'input_tokens': input_tokens,
773 |                     'output_tokens': output_tokens,
774 |                     'duration': round(duration, 2),
775 |                     'attempt': attempt + 1
776 |                 }
777 | 
778 |                 # Parse the JSON response
779 |                 import json
780 |                 import re
781 | 
782 |                 # Extract the JSON array greedily so inline [n] citations inside findings don't end the match early
783 |                 json_match = re.search(r'\[.*\]', output_text, re.DOTALL)
784 | 
785 |                 key_findings = []
786 |                 if json_match:
787 |                     try:
788 |                         findings = json.loads(json_match.group(0))
789 |                         if isinstance(findings, list):
790 |                             key_findings = [
791 |                                 str(f)  # Convert all items to strings (handles int, dict, etc.)
792 |                                 for f in findings
793 |                             ]
794 |                         else:
795 |                             key_findings = [str(findings)]
796 |                     except json.JSONDecodeError:
797 |                         pass
798 | 
799 |                 # If JSON parsing failed or empty, use fallback extraction
800 |                 if not key_findings:
801 |                     # Look for bullet points or numbered items
802 |                     lines = output_text.split('\n')
803 |                     for line in lines:
804 |                         line = line.strip().lstrip('-').lstrip('*').lstrip('>').strip()
805 |                         # Remove numbering like "1.", "2.", etc.
806 |                         line = re.sub(r'^\d+\.\s*', '', line)
807 |                         if len(line) > 30 and not line.startswith('[') and not line.startswith(']'):
808 |                             key_findings.append(line)
809 | 
810 |                 # Limit to reasonable number
811 |                 key_findings = key_findings[:15]
812 | 
813 |                 # If still empty, create basic findings from search results
814 |                 if not key_findings and state.search_results:
815 |                     logger.warning("Agent produced no findings, creating basic ones from results")
816 |                     key_findings = [
817 |                         f"{r.title}: {r.snippet[:100]}..."
818 | for r in state.search_results[:10] 819 | if r.snippet 820 | ] 821 | 822 | logger.info(f"Extracted {len(key_findings)} key findings") 823 | 824 | # Emit synthesis completion 825 | await emit_synthesis_complete(len(key_findings)) 826 | 827 | # Return dict updates - LangGraph merges into state 828 | return { 829 | "key_findings": key_findings, 830 | "current_stage": "reporting", 831 | "iterations": state.iterations + 1, 832 | "llm_calls": state.llm_calls + 1, 833 | "total_input_tokens": state.total_input_tokens + input_tokens, 834 | "total_output_tokens": state.total_output_tokens + output_tokens, 835 | "llm_call_details": state.llm_call_details + [call_detail] 836 | } 837 | 838 | except Exception as e: 839 | logger.warning(f"Synthesis attempt {attempt + 1} failed: {str(e)}") 840 | if attempt == self.max_retries - 1: 841 | logger.error(f"Synthesis failed after {self.max_retries} attempts") 842 | return { 843 | "error": f"Synthesis failed: {str(e)}", 844 | "iterations": state.iterations + 1 845 | } 846 | else: 847 | await asyncio.sleep(2 ** attempt) 848 | 849 | # Fallback if all retries exhausted 850 | return { 851 | "error": "Synthesis failed: Maximum retries exceeded", 852 | "iterations": state.iterations + 1 853 | } 854 | 855 | 856 | class ReportWriter: 857 | """Autonomous agent responsible for writing research reports.""" 858 | 859 | def __init__(self, citation_style: str = 'apa'): 860 | self.llm = get_llm(temperature=0.7) 861 | self.tools = get_research_tools(agent_type="writing") 862 | self.max_retries = 3 863 | self.citation_style = citation_style 864 | self.citation_formatter = CitationFormatter() 865 | 866 | async def write_report(self, state: ResearchState) -> dict: 867 | """Write the final research report with validation and retry. 868 | 869 | Returns dict with report data that LangGraph will merge into state. 
870 | """ 871 | logger.info("Writing final report") 872 | 873 | if not state.plan or not state.key_findings: 874 | await emit_error("Insufficient data for report generation") 875 | return {"error": "Insufficient data for report generation"} 876 | 877 | # Emit writing start 878 | await emit_writing_start(len(state.plan.report_outline)) 879 | 880 | # Track total LLM calls for report generation 881 | report_llm_calls = 0 882 | report_input_tokens = 0 883 | report_output_tokens = 0 884 | report_call_details = [] 885 | 886 | for attempt in range(self.max_retries): 887 | try: 888 | # Generate each section with retry 889 | report_sections = [] 890 | total_sections = len(state.plan.report_outline) 891 | 892 | for section_idx, section_title in enumerate(state.plan.report_outline, 1): 893 | # Emit progress for each section 894 | await emit_writing_section(section_title, section_idx, total_sections) 895 | 896 | section, section_tokens = await self._write_section( 897 | state.research_topic, 898 | section_title, 899 | state.key_findings, 900 | state.search_results 901 | ) 902 | if section: 903 | report_sections.append(section) 904 | if section_tokens: 905 | report_llm_calls += 1 906 | report_input_tokens += section_tokens['input_tokens'] 907 | report_output_tokens += section_tokens['output_tokens'] 908 | report_call_details.append(section_tokens) 909 | 910 | # Validate minimum quality 911 | if not report_sections: 912 | raise ValueError("No report sections generated") 913 | 914 | # Create temporary state for compilation 915 | temp_state = ResearchState( 916 | research_topic=state.research_topic, 917 | plan=state.plan, 918 | report_sections=report_sections 919 | ) 920 | 921 | # Compile final report 922 | final_report = self._compile_report(temp_state) 923 | 924 | # Format citations in specified style 925 | if state.search_results: 926 | final_report = self.citation_formatter.update_report_citations( 927 | final_report, 928 | style=self.citation_style, 929 | search_results=state.search_results 930 | ) 931 | 932 | # Add credibility information to report if available 933 | if state.credibility_scores: 934 | high_cred_sources = [ 935 | i+1 for i, score in enumerate(state.credibility_scores) 936 | if score.get('level') == 'high' 937 | ] 938 | if high_cred_sources: 939 | final_report += f"\n\n---\n\n**Note:** {len(high_cred_sources)} high-credibility sources were prioritized in this research." 
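                        # Illustrative output only, e.g.:
                        # "**Note:** 4 high-credibility sources were prioritized in this research."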
940 | 941 | # Validate report length 942 | if len(final_report) < 500: 943 | raise ValueError("Report too short - insufficient content") 944 | 945 | logger.info(f"Report generation complete: {len(final_report)} chars") 946 | 947 | # Emit writing completion 948 | await emit_writing_complete(len(final_report)) 949 | 950 | # Return dict updates - LangGraph merges into state 951 | return { 952 | "report_sections": report_sections, 953 | "final_report": final_report, 954 | "current_stage": "complete", 955 | "iterations": state.iterations + 1, 956 | "llm_calls": state.llm_calls + report_llm_calls, 957 | "total_input_tokens": state.total_input_tokens + report_input_tokens, 958 | "total_output_tokens": state.total_output_tokens + report_output_tokens, 959 | "llm_call_details": state.llm_call_details + report_call_details 960 | } 961 | 962 | except Exception as e: 963 | logger.warning(f"Report attempt {attempt + 1} failed: {str(e)}") 964 | if attempt == self.max_retries - 1: 965 | logger.error(f"Report generation failed after {self.max_retries} attempts") 966 | return { 967 | "error": f"Report writing failed: {str(e)}", 968 | "iterations": state.iterations + 1 969 | } 970 | else: 971 | await asyncio.sleep(2 ** attempt) 972 | 973 | # Fallback if all retries exhausted 974 | return { 975 | "error": "Report generation failed: Maximum retries exceeded", 976 | "iterations": state.iterations + 1 977 | } 978 | 979 | async def _write_section( 980 | self, 981 | topic: str, 982 | section_title: str, 983 | findings: List[str], 984 | search_results: List 985 | ) -> tuple: 986 | """Autonomously write a single report section using tools.""" 987 | logger.info(f"Writing section: {section_title}") 988 | 989 | # Create system prompt for section writing 990 | system_prompt = f"""You are a distinguished research writer and subject matter expert. Your task is to write authoritative, accurate, and well-structured report sections that inform and educate readers. 991 | 992 | ## Your Available Tools 993 | - **format_citation(url, title, style)**: Format citations in academic styles 994 | - **validate_section_quality(section_text, min_words)**: Verify section meets quality standards 995 | 996 | ## Writing Standards 997 | 998 | ### Content Quality Requirements 999 | 1. **Minimum Length**: {config.min_section_words} words (use validate_section_quality to verify) 1000 | 2. **Factual Accuracy**: Every claim must be grounded in the provided findings 1001 | 3. **Proper Citations**: Use inline citations [1], [2], etc. for all factual claims 1002 | 4. **Balanced Perspective**: Present multiple viewpoints when they exist 1003 | 5. 
**Technical Precision**: Use correct terminology; simplify for clarity without sacrificing accuracy
1004 | 
1005 | ### Structure & Formatting (Markdown)
1006 | - Use **bold** for key terms and important concepts
1007 | - Use bullet points or numbered lists for multiple items
1008 | - Use subheadings (### or ####) to organize complex sections
1009 | - Include specific examples, data points, or case studies when available
1010 | - Maintain logical flow from one paragraph to the next
1011 | 
1012 | ### Writing Style Guidelines
1013 | - **Tone**: Professional, authoritative, but accessible
1014 | - **Voice**: Third-person academic style (avoid "I", "we", "you")
1015 | - **Clarity**: Explain complex concepts clearly; define technical terms
1016 | - **Conciseness**: Every sentence should add value; avoid filler
1017 | - **Precision**: Use specific language; avoid vague qualifiers like "very" or "many"
1018 | 
1019 | ## Critical Accuracy Rules
1020 | 
1021 | ### DO
1022 | - Base all claims on the provided key findings
1023 | - Cite sources for factual statements: "According to [1]..." or "Research indicates [2]..."
1024 | - Distinguish between established facts and emerging trends
1025 | - Note limitations or caveats when relevant
1026 | - Use specific numbers, dates, and names from sources
1027 | - Acknowledge when evidence is limited: "Available data suggests..."
1028 | 
1029 | ### DO NOT
1030 | - Invent statistics, percentages, or specific numbers not in findings
1031 | - Make claims that go beyond the provided information
1032 | - Present opinions as facts without attribution
1033 | - Ignore contradictions between sources
1034 | - Use placeholder text or generic filler content
1035 | - Oversimplify to the point of inaccuracy
1036 | 
1037 | ## Section Writing Process
1038 | 
1039 | 1. **Analyze**: Review the findings relevant to this section's topic
1040 | 2. **Outline**: Mentally structure the key points to cover
1041 | 3. **Draft**: Write comprehensive content with proper citations
1042 | 4. **Verify**: Use validate_section_quality to check length and citations
1043 | 5. **Refine**: Ensure logical flow and accuracy
1044 | 
1045 | ## Output Format
1046 | Write the section content directly in markdown format. Start with the content immediately (the section title will be added automatically). Ensure proper spacing between paragraphs.
1047 | 1048 | Example structure: 1049 | ``` 1050 | [Opening paragraph introducing the section topic] 1051 | 1052 | [Main content paragraph with specific details and citations [1]] 1053 | 1054 | ### [Subheading if needed] 1055 | 1056 | [Additional content with more citations [2], [3]] 1057 | 1058 | [Concluding paragraph summarizing key points] 1059 | ```""" 1060 | 1061 | # Create autonomous writing agent 1062 | agent_graph = create_agent( 1063 | self.llm, 1064 | self.tools, 1065 | system_prompt=system_prompt 1066 | ) 1067 | 1068 | try: 1069 | start_time = time.time() 1070 | 1071 | # Prepare input message with source context 1072 | sources_context = "" 1073 | if search_results: 1074 | sources_context = "\n\nAvailable Sources for Citation:\n" + "\n".join( 1075 | f"[{i+1}] {r.title} ({r.url})" 1076 | for i, r in enumerate(search_results[:15]) # Top 15 sources 1077 | ) 1078 | 1079 | input_message = f"""## Assignment: Write Report Section 1080 | 1081 | **Research Topic**: {topic} 1082 | **Section Title**: {section_title} 1083 | **Minimum Word Count**: {config.min_section_words} words 1084 | 1085 | --- 1086 | 1087 | ### Key Findings to Incorporate: 1088 | {chr(10).join(f"- {f}" for f in findings)} 1089 | 1090 | {sources_context} 1091 | 1092 | --- 1093 | 1094 | ### Instructions: 1095 | 1. Write a comprehensive section that covers the topic "{section_title}" thoroughly 1096 | 2. Incorporate the key findings above, adding context and explanation 1097 | 3. Use inline citations [1], [2], etc. when referencing specific facts from sources 1098 | 4. Maintain academic rigor while being accessible to general readers 1099 | 5. Use markdown formatting for structure (bold, lists, subheadings as needed) 1100 | 6. After writing, use validate_section_quality to ensure minimum word count is met 1101 | 1102 | Write the section content now:""" 1103 | 1104 | # Estimate input tokens 1105 | input_tokens = estimate_tokens(input_message) 1106 | 1107 | # Execute autonomous section writing 1108 | result = await agent_graph.ainvoke({ 1109 | "messages": [{"role": "user", "content": input_message}] 1110 | }) 1111 | 1112 | # Extract content from result 1113 | messages = result.get('messages', []) 1114 | content = "" 1115 | if messages: 1116 | last_msg = messages[-1] 1117 | # Handle different content formats 1118 | if hasattr(last_msg, 'content'): 1119 | msg_content = last_msg.content 1120 | # If content is a list (like from tool responses), extract text 1121 | if isinstance(msg_content, list): 1122 | content = "" 1123 | for item in msg_content: 1124 | if isinstance(item, dict) and 'text' in item: 1125 | content += item['text'] 1126 | elif isinstance(item, dict) and 'type' in item and item['type'] == 'text': 1127 | content += item.get('text', '') 1128 | else: 1129 | content += str(item) 1130 | else: 1131 | content = str(msg_content) 1132 | else: 1133 | content = str(last_msg) 1134 | 1135 | # Track LLM call 1136 | duration = time.time() - start_time 1137 | output_tokens = estimate_tokens(content) 1138 | call_detail = { 1139 | 'agent': 'ReportWriter', 1140 | 'operation': f'write_section_{section_title[:30]}', 1141 | 'model': config.model_name, 1142 | 'input_tokens': input_tokens, 1143 | 'output_tokens': output_tokens, 1144 | 'duration': round(duration, 2) 1145 | } 1146 | 1147 | # Extract cited sources 1148 | import re 1149 | citations = re.findall(r'\[(\d+)\]', content) 1150 | source_urls = [] 1151 | for cite_num in set(citations): 1152 | idx = int(cite_num) - 1 1153 | if 0 <= idx < len(search_results): 1154 | 
source_urls.append(search_results[idx].url)
1155 | 
1156 |             section = ReportSection(
1157 |                 title=section_title,
1158 |                 content=content,
1159 |                 sources=source_urls
1160 |             )
1161 | 
1162 |             return section, call_detail
1163 | 
1164 |         except Exception as e:
1165 |             logger.error(f"Error writing section '{section_title}': {str(e)}")
1166 |             return None, None
1167 | 
1168 |     def _compile_report(self, state: ResearchState) -> str:
1169 |         """Compile all sections into final report."""
1170 |         # Count actual sources from search results
1171 |         search_results = getattr(state, 'search_results', []) or []
1172 |         report_sections = getattr(state, 'report_sections', []) or []
1173 | 
1174 |         # Get unique URLs from search results
1175 |         unique_sources = set()
1176 |         for result in search_results:
1177 |             if hasattr(result, 'url') and result.url:
1178 |                 unique_sources.add(result.url)
1179 | 
1180 |         # Also collect from report sections if they have sources
1181 |         for section in report_sections:
1182 |             if hasattr(section, 'sources'):
1183 |                 unique_sources.update(section.sources)
1184 | 
1185 |         source_count = len(unique_sources) if unique_sources else len(search_results)
1186 | 
1187 |         report_parts = [
1188 |             f"# {state.research_topic}\n",
1189 |             "**Deep Research Report**\n",
1190 |             "\n## Executive Summary\n",
1191 |             f"This report provides a comprehensive analysis of {state.research_topic}. ",
1192 |             f"The research was conducted across **{source_count} sources** ",
1193 |             f"and synthesized into **{len(report_sections)} key sections**.\n",
1194 |             "\n## Research Objectives\n"
1195 |         ]
1196 | 
1197 |         if state.plan and hasattr(state.plan, 'objectives'):
1198 |             for i, obj in enumerate(state.plan.objectives, 1):
1199 |                 report_parts.append(f"{i}. {obj}\n")
1200 | 
1201 |         report_parts.append("\n---\n")
1202 | 
1203 |         # Add all sections
1204 |         has_references_section = False
1205 |         for section in report_sections:
1206 |             # Check if content already starts with the title as a heading
1207 |             content = section.content.strip()
1208 | 
1209 |             # Check if this section contains References
1210 |             if "## References" in content or section.title.lower() == "references":
1211 |                 has_references_section = True
1212 | 
1213 |             if content.startswith(f"## {section.title}"):
1214 |                 # Content already has heading, use as-is
1215 |                 report_parts.append(f"\n{content}\n\n")
1216 |             else:
1217 |                 # Add heading before content
1218 |                 report_parts.append(f"\n## {section.title}\n\n")
1219 |                 report_parts.append(content)
1220 |                 report_parts.append("\n")
1221 | 
1222 |         # Only add references if not already present in sections
1223 |         if not has_references_section:
1224 |             # Add references from search results
1225 |             report_parts.append("\n---\n\n## References\n\n")
1226 | 
1227 |             # Build a list of (url, title) tuples from search results
1228 |             source_info = []
1229 |             seen_urls = set()
1230 | 
1231 |             for result in search_results:
1232 |                 if hasattr(result, 'url') and result.url and result.url not in seen_urls:
1233 |                     seen_urls.add(result.url)
1234 |                     title = getattr(result, 'title', '')
1235 |                     source_info.append((result.url, title))
1236 | 
1237 |             # Add sources from sections if available (if not already included)
1238 |             for section in report_sections:
1239 |                 if hasattr(section, 'sources'):
1240 |                     for url in section.sources:
1241 |                         if url not in seen_urls:
1242 |                             seen_urls.add(url)
1243 |                             source_info.append((url, ''))
1244 | 
1245 |             # Add formatted references (the outer check already ensures this runs once)
1246 |             if source_info:
1247 |                 # Reuse the formatter created in __init__; no local import needed
1248 |                 formatter = self.citation_formatter
1249 |                 for i, (url, title) in enumerate(source_info[:30], 1):  # Top 30 sources
1250 |                     # Format citation in APA style
1251 |                     citation = formatter.format_apa(url, title)
1252 |                     report_parts.append(f"{i}. {citation}\n")
1253 |             else:
1254 |                 report_parts.append("*No sources were available for this research.*\n")
1255 | 
1256 |         return "".join(report_parts)
1257 | 
--------------------------------------------------------------------------------