├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── gemini-flash2.5.md ├── gemini.md ├── gpt4.0-mini.md ├── mistral_large.md └── o3-mini-high.md ├── requirements.txt ├── setup.py ├── shandu ├── README.md ├── __init__.py ├── agents │ ├── __init__.py │ ├── agent.py │ ├── graph │ │ ├── __init__.py │ │ ├── builder.py │ │ └── wrapper.py │ ├── langgraph_agent.py │ ├── nodes │ │ ├── __init__.py │ │ ├── citations.py │ │ ├── generate_queries.py │ │ ├── initialize.py │ │ ├── reflect.py │ │ ├── report_generation.py │ │ ├── search.py │ │ └── source_selection.py │ ├── processors │ │ ├── __init__.py │ │ ├── content_processor.py │ │ └── report_generator.py │ └── utils │ │ ├── __init__.py │ │ ├── agent_utils.py │ │ ├── citation_manager.py │ │ └── citation_registry.py ├── cli.py ├── config.py ├── prompts.py ├── research │ ├── __init__.py │ └── researcher.py ├── scraper │ ├── __init__.py │ └── scraper.py ├── search │ ├── __init__.py │ ├── ai_search.py │ └── search.py └── utils │ └── logger.py └── tests ├── test_citation_registry.py └── test_report_generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | dist/ 8 | build/ 9 | *.egg-info/ 10 | 11 | # Virtual environments 12 | venv/ 13 | env/ 14 | ENV/ 15 | 16 | # IDE files 17 | .idea/ 18 | .vscode/ 19 | *.swp 20 | *.swo 21 | 22 | # OS specific files 23 | .DS_Store 24 | Thumbs.db 25 | 26 | # Environment variables 27 | .env 28 | 29 | # Logs 30 | *.log 31 | 32 | # Test files 33 | test_*.md 34 | .pytest_cache/ 35 | .coverage 36 | htmlcov/ 37 | helper.txt 38 | # Jupyter Notebooks 39 | .ipynb_checkpoints 40 | *.ipynb 41 | 42 | # Cache directories 43 | .cache/ 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dušan Jolović 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | recursive-include examples *.md 5 | recursive-include shandu *.md 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Shandu 2.0: Advanced AI Research System with Robust Report Generation 2 | 3 | Shandu is a cutting-edge AI research assistant that performs in-depth, multi-source research on any topic using advanced language models, intelligent web scraping, and iterative exploration to generate comprehensive, well-structured reports with proper citations. 4 | 5 | [![MIT License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) 6 | [![Python 3.9+](https://img.shields.io/badge/Python-3.9+-blue.svg)](https://www.python.org/downloads/) 7 | 8 | ## 🔍 What is Shandu? 9 | 10 | Shandu is an intelligent, LLM-powered research system that automates the comprehensive research process - from initial query clarification to in-depth content analysis and report generation. Built on LangGraph's state-based workflow, it recursively explores topics with sophisticated algorithms for source evaluation, content extraction, and knowledge synthesis. 11 | 12 | ### Key Use Cases 13 | 14 | - **Academic Research**: Generate literature reviews, background information, and complex topic analyses 15 | - **Market Intelligence**: Analyze industry trends, competitor strategies, and market opportunities 16 | - **Content Creation**: Produce well-researched articles, blog posts, and reports with proper citations 17 | - **Technology Exploration**: Track emerging technologies, innovations, and technical developments 18 | - **Policy Analysis**: Research regulations, compliance requirements, and policy implications 19 | - **Competitive Analysis**: Compare products, services, and company strategies across industries 20 | 21 | ## 🚀 What's New in Version 2.0 22 | 23 | Shandu 2.0 introduces a major redesign of the report generation pipeline to produce more coherent, reliable reports: 24 | 25 | - **Modular Report Generation**: Process reports in self-contained sections, enhancing overall system reliability 26 | - **Robust Error Recovery**: Automatic retry mechanisms with intelligent fallbacks prevent the system from getting stuck 27 | - **Section-By-Section Processing**: Each section is processed independently, allowing for better error isolation 28 | - **Progress Tracking**: Detailed progress tracking helps identify exactly where the process is at each stage 29 | - **Enhanced Citation Management**: More reliable citation handling ensures proper attribution throughout reports 30 | - **Intelligent Parallelization**: Key processes run in parallel where possible for improved performance 31 | - **Comprehensive Fallback Mechanisms**: If any step fails, the system gracefully degrades rather than halting 32 | 33 | ## ⚙️ How Shandu Works 34 | 35 | ```mermaid 36 | flowchart TB 37 | subgraph Input 38 | Q[User Query] 39 | B[Breadth Parameter] 40 | D[Depth Parameter] 41 | end 42 | 43 | subgraph Research[Research Phase] 44 | direction TB 45 | DR[Deep Research] 46 | SQ[SERP Queries] 47 | PR[Process Results] 48 | NL[(Sources & Learnings)] 49 | ND[(Directions)] 50 | end 51 | 52 | subgraph Report[Report Generation] 53 | direction TB 54 | TG[Title Generation] 
55 | TE[Theme Extraction] 56 | IR[Initial Report] 57 | ES[Section Enhancement] 58 | EX[Section Expansion] 59 | FR[Final Report] 60 | end 61 | 62 | %% Main Flow 63 | Q & B & D --> DR 64 | DR --> SQ --> PR 65 | PR --> NL 66 | PR --> ND 67 | 68 | DP{depth > 0?} 69 | NL & ND --> DP 70 | 71 | RD["Next Direction: 72 | - Prior Goals 73 | - New Questions 74 | - Learnings"] 75 | 76 | %% Circular Flow 77 | DP -->|Yes| RD 78 | RD -->|New Context| DR 79 | 80 | %% To Report Generation 81 | DP -->|No| TG 82 | TG --> TE --> IR --> ES --> EX --> FR 83 | 84 | %% Styling 85 | classDef input fill:#7bed9f,stroke:#2ed573,color:black 86 | classDef process fill:#70a1ff,stroke:#1e90ff,color:black 87 | classDef recursive fill:#ffa502,stroke:#ff7f50,color:black 88 | classDef output fill:#ff4757,stroke:#ff6b81,color:white 89 | classDef storage fill:#a8e6cf,stroke:#3b7a57,color:black 90 | 91 | class Q,B,D input 92 | class DR,SQ,PR,TG,TE,IR,ES,EX process 93 | class DP,RD recursive 94 | class FR output 95 | class NL,ND storage 96 | ``` 97 | 98 | ## 🌟 Key Features 99 | 100 | - **Intelligent State-based Workflow**: Leverages LangGraph for a structured, step-by-step research process 101 | - **Iterative Deep Exploration**: Recursively explores topics with dynamic depth and breadth parameters 102 | - **Multi-source Information Synthesis**: Analyzes data from search engines, web content, and knowledge bases 103 | - **Enhanced Web Scraping**: Features dynamic JS rendering, content extraction, and ethical scraping practices 104 | - **Smart Source Evaluation**: Automatically assesses source credibility, relevance, and information value 105 | - **Content Analysis Pipeline**: Uses advanced NLP to extract key information, identify patterns, and synthesize findings 106 | - **Sectional Report Generation**: Creates detailed reports by processing individual sections for maximum reliability 107 | - **Parallel Processing Architecture**: Implements concurrent operations for efficient multi-query execution 108 | - **Adaptive Search Strategy**: Dynamically adjusts search queries based on discovered information 109 | - **Full Citation Management**: Properly attributes all sources with formatted citations in multiple styles 110 | 111 | ## 🏁 Quick Start 112 | 113 | ```bash 114 | # Install from PyPI 115 | pip install shandu 116 | 117 | # Install from source 118 | git clone https://github.com/jolovicdev/shandu.git 119 | cd shandu 120 | pip install -e . 121 | 122 | # Configure API settings (supports various LLM providers) 123 | shandu configure 124 | 125 | # Run comprehensive research 126 | shandu research "Your research query" --depth 2 --breadth 4 --output report.md 127 | 128 | # Quick AI-powered search with web scraping 129 | shandu aisearch "Who is the current sitting president of United States?" --detailed 130 | ``` 131 | 132 | ## 📚 Detailed Usage 133 | 134 | ### Research Command 135 | 136 | ```bash 137 | shandu research "Your research query" \ 138 | --depth 3 \ # How deep to explore (1-5, default: 2) 139 | --breadth 5 \ # How many parallel queries (2-10, default: 4) 140 | --output report.md \ # Save to file instead of terminal 141 | --verbose # Show detailed progress 142 | ``` 143 | 144 | ### Example Reports 145 | 146 | You can find example reports in the examples directory: 147 | 148 | 1. 
**The Intersection of Quantum Computing, Synthetic Biology, and Climate Modeling** 149 | ```bash 150 | shandu research "The Intersection of Quantum Computing, Synthetic Biology, and Climate Modeling" --depth 3 --breadth 3 --output examples/o3-mini-high.md 151 | ``` 152 | 153 | ## 💻 Python API 154 | 155 | ```python 156 | from shandu.agents import ResearchGraph 157 | from langchain_openai import ChatOpenAI 158 | 159 | # Initialize with custom LLM if desired 160 | llm = ChatOpenAI(model="gpt-4") 161 | 162 | # Initialize the research graph 163 | researcher = ResearchGraph( 164 | llm=llm, 165 | temperature=0.5 166 | ) 167 | 168 | # Perform deep research 169 | results = researcher.research_sync( 170 | query="Your research query", 171 | depth=3, # How deep to go with recursive research 172 | breadth=4, # How many parallel queries to explore 173 | detail_level="high" 174 | ) 175 | 176 | # Print or save results 177 | print(results.to_markdown()) 178 | ``` 179 | 180 | ## 🧩 Advanced Architecture 181 | 182 | ### Research Pipeline 183 | 184 | Shandu's research pipeline consists of these key stages: 185 | 186 | 1. **Query Clarification**: Interactive questions to understand research needs 187 | 2. **Research Planning**: Strategic planning for comprehensive topic coverage 188 | 3. **Iterative Exploration**: 189 | - Smart query generation based on knowledge gaps 190 | - Multi-engine search with parallelized execution 191 | - Relevance filtering of search results 192 | - Intelligent web scraping with content extraction 193 | - Source credibility assessment 194 | - Information analysis and synthesis 195 | - Reflection on findings to identify gaps 196 | 197 | ### Report Generation Pipeline 198 | 199 | Shandu 2.0 introduces a robust, modular report generation pipeline: 200 | 201 | 1. **Data Preparation**: Registration of all sources and their metadata for proper citation 202 | 2. **Title Generation**: Creating a concise, professional title (with retry mechanisms) 203 | 3. **Theme Extraction**: Identifying key themes to organize the report structure 204 | 4. **Citation Formatting**: Properly formatting all citations for reference 205 | 5. **Initial Report Generation**: Creating a comprehensive draft report 206 | 6. **Section Enhancement**: Individually processing each section to add detail and depth 207 | 7. **Key Section Expansion**: Identifying and expanding the most important sections 208 | 8. 
**Report Finalization**: Final processing and validation of the complete report 209 | 210 | Each step includes: 211 | - Comprehensive error handling 212 | - Automatic retries with exponential backoff 213 | - Intelligent fallbacks when issues occur 214 | - Progress tracking for transparency 215 | - Validation to ensure quality output 216 | 217 | ## 🔌 Supported Search Engines & Sources 218 | 219 | - Google Search 220 | - DuckDuckGo 221 | - Wikipedia 222 | - ArXiv (academic papers) 223 | - Custom search engines can be added 224 | 225 | ## 📊 Technical Capabilities 226 | 227 | - **Dynamic JS Rendering**: Handles JavaScript-heavy websites 228 | - **Content Extraction**: Identifies and extracts main content from web pages 229 | - **Parallel Processing**: Concurrent execution of searches and scraping 230 | - **Caching**: Efficient caching of search results and scraped content 231 | - **Rate Limiting**: Respectful access to web resources 232 | - **Robots.txt Compliance**: Ethical web scraping practices 233 | - **Flexible Output Formats**: Markdown, JSON, plain text 234 | 235 | ## 📜 License 236 | 237 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 238 | -------------------------------------------------------------------------------- /examples/gpt4.0-mini.md: -------------------------------------------------------------------------------- 1 | # Autonomous Vehicles: Safety and Smart City Integration 2 | 3 | ## Executive Summary 4 | 5 | The advent of autonomous vehicles (AVs) represents a transformative shift in urban mobility, promising enhanced safety, efficiency, and environmental sustainability. This report delves into the current state of AV technology, its safety implications, infrastructure requirements for integration into smart cities, and the broader impacts on urban mobility and the environment. Through a comprehensive analysis of technological advancements, safety records, infrastructure needs, and policy frameworks, this report aims to provide actionable recommendations for urban planners and policymakers. The findings indicate that while AVs hold significant potential to reduce greenhouse gas emissions and improve traffic flow, challenges such as public perception, regulatory hurdles, and infrastructure readiness must be addressed to facilitate their widespread adoption. 6 | 7 | ## Introduction 8 | 9 | The integration of autonomous vehicles into urban environments is a critical component of the evolution towards smart cities. As cities grapple with increasing traffic congestion, pollution, and safety concerns, AVs offer a promising solution that could revolutionize urban transportation. This report aims to analyze the multifaceted aspects of AV technology, focusing on safety features, infrastructure requirements, and the implications for urban mobility and environmental sustainability. By synthesizing findings from various sources, this report seeks to provide a comprehensive understanding of the current landscape of AVs and their potential role in shaping the future of urban transportation. 10 | 11 | ## Technological Advancements in Autonomous Vehicles 12 | 13 | ### Sensor Systems 14 | 15 | The backbone of AV technology lies in its sophisticated sensor systems, which include LiDAR, radar, cameras, and ultrasonic sensors. These systems work in tandem to create a comprehensive understanding of the vehicle's surroundings, enabling safe navigation and obstacle detection. 
Recent advancements have led to improved sensor accuracy and reliability, allowing AVs to operate effectively in diverse environmental conditions, including low visibility scenarios such as fog and heavy rain. 16 | 17 | ### Artificial Intelligence Algorithms 18 | 19 | Artificial intelligence (AI) plays a pivotal role in processing the vast amounts of data collected by AV sensors. Machine learning algorithms are employed to enhance decision-making capabilities, enabling vehicles to predict and respond to dynamic traffic conditions. Innovations in deep learning have significantly improved the ability of AVs to recognize and interpret complex scenarios, such as pedestrian movements and unexpected road hazards, thereby enhancing safety and performance metrics. 20 | 21 | ### Vehicle-to-Everything (V2X) Communication 22 | 23 | V2X communication technology facilitates real-time data exchange between vehicles, infrastructure, and other road users. This connectivity is crucial for the effective operation of AVs within smart city frameworks. By sharing information about traffic conditions, road hazards, and other relevant data, V2X communication enhances situational awareness and enables coordinated responses to traffic events, ultimately improving safety and efficiency on urban roadways. 24 | 25 | ## Safety Implications of Autonomous Vehicles 26 | 27 | ### Safety Records Compared to Traditional Vehicles 28 | 29 | The safety implications of AVs are a critical area of investigation. Preliminary data suggests that AVs may have lower accident rates compared to traditional vehicles. For instance, studies indicate that AVs are less prone to human error, which is a leading cause of traffic accidents. However, the transition period, during which AVs share the road with human-driven vehicles, presents unique challenges that must be addressed to ensure overall safety on urban roadways [3][5]. 30 | 31 | ### Risk Factors and Public Perception 32 | 33 | Despite the potential safety benefits, public perception of AVs remains mixed. Concerns about the reliability of AV technology, particularly in complex urban environments, contribute to skepticism regarding their safety. Addressing these concerns through transparent communication, public education, and robust safety testing is essential for fostering public trust in AV technology. 34 | 35 | ## Infrastructure Requirements for AV Integration 36 | 37 | ### Smart Traffic Management Systems 38 | 39 | The successful integration of AVs into urban environments necessitates the development of smart traffic management systems. These systems utilize real-time data analytics to optimize traffic flow, reduce congestion, and enhance safety. Adaptive traffic signals, for example, can adjust their timing based on current traffic conditions, allowing for smoother vehicle movement and reduced wait times at intersections [10][15]. 40 | 41 | ### Dedicated Lanes and Charging Station Networks 42 | 43 | To accommodate the unique operational characteristics of AVs, dedicated lanes may be required in certain urban areas. These lanes can facilitate smoother traffic flow and reduce the likelihood of conflicts with human-driven vehicles. Additionally, the establishment of a comprehensive network of charging stations is essential to support the anticipated increase in electric AVs, further contributing to environmental sustainability [12][14]. 
44 | 45 | ## Role of Smart Traffic Systems 46 | 47 | ### Adaptive Traffic Signals 48 | 49 | Adaptive traffic signals are a key component of smart traffic systems, allowing for real-time adjustments based on traffic conditions. By integrating AVs into these systems, cities can enhance traffic management and improve overall safety. For instance, signals can prioritize AVs during peak hours, reducing congestion and improving travel times for all road users. 50 | 51 | ### Real-Time Data Analytics 52 | 53 | The use of real-time data analytics in traffic management enables cities to respond proactively to changing conditions. By analyzing traffic patterns, cities can implement congestion management strategies that optimize the flow of AVs and traditional vehicles alike. This approach not only enhances safety but also contributes to a more efficient urban transportation network [11][15]. 54 | 55 | ## Urban Mobility and Environmental Impact 56 | 57 | ### Enhancing Urban Mobility 58 | 59 | AVs have the potential to significantly enhance urban mobility by reducing traffic congestion and improving accessibility. By optimizing routing and minimizing delays, AVs can facilitate smoother transportation for all users, including pedestrians and cyclists. Furthermore, the integration of AVs into public transportation systems can provide last-mile connectivity, enhancing overall mobility options within urban areas [2][6]. 60 | 61 | ### Environmental Benefits 62 | 63 | The environmental benefits associated with the adoption of AVs are substantial. As AV technology matures, it is expected that a significant proportion of these vehicles will be electric, leading to a reduction in greenhouse gas emissions from the transportation sector. Additionally, AVs can contribute to more efficient land use by reducing the need for extensive parking infrastructure, allowing for the repurposing of urban spaces for green areas and community development [1][4]. 64 | 65 | ## Policy and Regulatory Frameworks 66 | 67 | ### Existing Policies and Regulations 68 | 69 | The regulatory landscape for AVs is still evolving, with various jurisdictions implementing different policies to govern their operation. Current regulations often focus on safety standards, testing protocols, and liability issues. However, there are significant gaps in the regulatory framework that need to be addressed to facilitate the safe integration of AVs into urban environments [3][10]. 70 | 71 | ### Identifying Gaps and Challenges 72 | 73 | Key challenges include the need for standardized testing protocols, liability frameworks for accidents involving AVs, and guidelines for data privacy and security. Policymakers must work collaboratively with industry stakeholders to develop comprehensive regulations that address these challenges while promoting innovation and public safety [6][12]. 74 | 75 | ## Recommendations for Urban Planners and Policymakers 76 | 77 | ### Infrastructure Development 78 | 79 | Urban planners should prioritize the development of infrastructure that supports AV integration, including dedicated lanes, charging stations, and smart traffic management systems. Investments in these areas will be crucial for facilitating the safe and efficient operation of AVs within urban environments. 80 | 81 | ### Public Engagement and Education 82 | 83 | To address public concerns regarding AV safety, policymakers should implement public engagement initiatives that educate citizens about the benefits and safety features of AV technology. 
Transparent communication and community involvement in the planning process can help build trust and acceptance of AVs in urban settings. 84 | 85 | ### Collaborative Policy Frameworks 86 | 87 | Policymakers should foster collaboration between government agencies, industry stakeholders, and academic institutions to develop comprehensive policy frameworks that address the unique challenges posed by AVs. This collaborative approach will ensure that regulations are informed by the latest technological advancements and best practices in urban mobility. 88 | 89 | ## Conclusion 90 | 91 | The integration of autonomous vehicles into urban environments presents both opportunities and challenges. While AVs have the potential to enhance safety, improve traffic flow, and reduce environmental impacts, significant barriers must be overcome to facilitate their widespread adoption. By focusing on infrastructure development, public engagement, and collaborative policy frameworks, urban planners and policymakers can create a conducive environment for the successful integration of AVs into smart cities. The findings of this report underscore the importance of a holistic approach to AV integration, one that prioritizes safety, efficiency, and sustainability in urban mobility. 92 | 93 | # References 94 | 95 | [1] *www.eesi.org*, "https://www.eesi.org/papers/view/issue-brief-autonomous-vehicles-state-of-the-technology-and-potential-role-as-a-climate-solution", https://www.eesi.org/papers/view/issue-brief-autonomous-vehicles-state-of-the-technology-and-potential-role-as-a-climate-solution 96 | [2] *www.mdpi.com*, "How Autonomous Vehicles Shape Urban Traffic Sustainability - MDPI", https://www.mdpi.com/2071-1050/17/6/2589 97 | [3] *www.nhtsa.gov*, "https://www.nhtsa.gov/vehicle-safety/automated-vehicles-safety", https://www.nhtsa.gov/vehicle-safety/automated-vehicles-safety 98 | [4] *www.nature.com*, "https://www.nature.com/articles/s41467-024-48526-4", https://www.nature.com/articles/s41467-024-48526-4 99 | [5] *arstechnica.com*, "https://arstechnica.com/cars/2023/09/are-self-driving-cars-already-safer-than-human-drivers/", https://arstechnica.com/cars/2023/09/are-self-driving-cars-already-safer-than-human-drivers/ 100 | [6] *www.forbes.com*, "https://www.forbes.com/sites/technology/article/self-driving-cars/", https://www.forbes.com/sites/technology/article/self-driving-cars/ 101 | [7] *www.hashstudioz.com*, "The Role of Autonomous Vehicles and AI in Smart Transportation Systems", https://www.hashstudioz.com/blog/ai-in-transportation-from-self-driving-cars-to-smart-traffic/ 102 | [8] *maddevs.io*, "https://maddevs.io/blog/transportation-and-ai/", https://maddevs.io/blog/transportation-and-ai/ 103 | [9] *arxiv.org*, "https://arxiv.org/html/2410.10929v6", https://arxiv.org/html/2410.10929v6 104 | [10] *www.mckinsey.com*, "https://www.mckinsey.com/industries/infrastructure/our-insights/a-new-look-at-autonomous-vehicle-infrastructure", https://www.mckinsey.com/industries/infrastructure/our-insights/a-new-look-at-autonomous-vehicle-infrastructure 105 | [11] *medium.com*, "https://medium.com/@codebykrishna/the-future-of-transportation-autonomous-vehicles-and-smart-cities-c42f205bd46a", https://medium.com/@codebykrishna/the-future-of-transportation-autonomous-vehicles-and-smart-cities-c42f205bd46a 106 | [12] *www.nae.edu*, "https://www.nae.edu/290948/Smart-Infrastructure-for-Autonomous-Driving-in-Urban-Areas", https://www.nae.edu/290948/Smart-Infrastructure-for-Autonomous-Driving-in-Urban-Areas 107 | [13] 
*www.mdpi.com*, "https://www.mdpi.com/2227-7080/11/5/117", https://www.mdpi.com/2227-7080/11/5/117 108 | [14] *www.saam.swiss*, "https://www.saam.swiss/autonomous-vehicle-safety-future-of-safe-mobility/", https://www.saam.swiss/autonomous-vehicle-safety-future-of-safe-mobility/ 109 | [15] *www.wistronchina.com*, "https://www.wistronchina.com/smart-traffic-the-future-of-urban-mobility/", https://www.wistronchina.com/smart-traffic-the-future-of-urban-mobility/ 110 | [16] *www.weforum.org*, "The Role of Autonomous Vehicles and AI in Smart Transportation Systems", https://www.weforum.org/stories/2024/10/how-will-autonomous-vehicles-shape-urban-mobility/ 111 | 112 | 113 | ## Research Process 114 | 115 | - **Depth**: 3 116 | - **Breadth**: 3 117 | - **Time Taken**: 3m 56s 118 | - **Subqueries Explored**: 9 119 | - **Sources Analyzed**: 85 120 | -------------------------------------------------------------------------------- /examples/mistral_large.md: -------------------------------------------------------------------------------- 1 | # The revival of indigenous agricultural practices, such as agroforestry, seed saving, and polyculture, holds significant potential for creating sustainable food systems. These practices have demonstrated benefits for soil health, water conservation, and carbon sequestration, making them critical components in addressing contemporary environmental challenges. By focusing on these specific methods, the research can provide an in-depth analysis of how they contribute to sustainability and resilience in food production. 2 | 3 | In addition to the ecological advantages, it is essential to explore the barriers that hinder the widespread adoption of these indigenous practices. Key obstacles include issues related to land rights, the need for policy support, and the preservation of cultural knowledge. Understanding these challenges can help in developing strategies to overcome them and promote the integration of indigenous agricultural practices into mainstream farming. 4 | 5 | Given the global relevance of these practices, the research will not be limited to a specific geographic region or indigenous community. Instead, it will aim to draw insights from a diverse range of contexts, thereby enriching the analysis with a broad perspective. This approach will allow for a comprehensive examination of how indigenous agricultural practices can be effectively revived and supported across different settings. 6 | 7 | # Reviving Indigenous Practices for Sustainable Agriculture 8 | 9 | ## Executive Summary 10 | 11 | This report provides a comprehensive analysis of the ecological benefits, barriers, successful implementations, policy frameworks, cultural preservation, and socio-economic impacts of reviving indigenous agricultural practices. By focusing on agroforestry, seed saving, and polyculture, the report aims to highlight the importance of these practices for sustainable food systems and offer actionable recommendations for their broader adoption. The findings are supported by extensive literature review, qualitative research, and case studies from diverse geographic regions. 12 | 13 | ## Introduction 14 | 15 | Indigenous agricultural practices have long been recognized for their ecological and socio-economic benefits. However, their widespread adoption faces numerous challenges. This report investigates the ecological advantages, barriers to adoption, successful case studies, policy frameworks, and cultural preservation strategies related to these practices. 
The report also assesses the broader impacts on local communities and proposes strategies to integrate indigenous agricultural practices into mainstream farming. 16 | 17 | ## Ecological Benefits 18 | 19 | ### Agroforestry 20 | 21 | Agroforestry, which integrates trees with crops and livestock, offers multiple ecological benefits. It enhances soil health by improving nutrient cycling and reducing erosion. Trees in agroforestry systems provide organic matter, which enriches the soil and supports microbial activity. This, in turn, enhances soil fertility and water retention capacity [6]. 22 | 23 | Water conservation is another significant benefit of agroforestry. Trees help to reduce runoff and increase water infiltration, thereby recharging groundwater reserves. This is particularly crucial in regions prone to water scarcity. Agroforestry systems also moderate soil temperatures, which can improve crop productivity and resilience to climate change [6]. 24 | 25 | Carbon sequestration is a critical ecological benefit of agroforestry. Trees act as carbon sinks, absorbing carbon dioxide from the atmosphere and storing it in their biomass. This process helps mitigate greenhouse gas emissions and contributes to climate change mitigation [6]. 26 | 27 | ### Seed Saving 28 | 29 | Seed saving is a traditional practice that involves collecting, storing, and replanting seeds from one harvest to the next. This practice promotes genetic diversity and resilience in crops. By preserving local seed varieties, farmers can ensure that their crops are adapted to local conditions, which enhances their resilience to pests, diseases, and climate variability [1]. 30 | 31 | Seed saving also reduces the need for external inputs such as commercial seeds and pesticides, thereby lowering the environmental footprint of agriculture. It supports biodiversity conservation by preserving rare and heirloom varieties that might otherwise be lost to commercial monocultures [1]. 32 | 33 | ### Polyculture 34 | 35 | Polyculture, the practice of growing multiple crops together, offers numerous ecological benefits. It enhances soil health by promoting nutrient cycling and reducing the need for synthetic fertilizers. Polyculture systems often include leguminous plants that fix nitrogen in the soil, benefiting other crops in the system [2]. 36 | 37 | Water conservation is another advantage of polyculture. By intercropping, farmers can reduce water usage and improve water-use efficiency. Polyculture also supports biodiversity by creating habitats for beneficial insects and wildlife, which can help control pests and diseases naturally [2]. 38 | 39 | Carbon sequestration is another ecological benefit of polyculture. By maintaining diverse and continuous plant cover, polyculture systems can sequester more carbon in the soil compared to monocultures. This helps mitigate greenhouse gas emissions and contributes to climate change mitigation [2]. 40 | 41 | ## Barriers to Adoption 42 | 43 | ### Legal and Policy Challenges 44 | 45 | One of the primary barriers to adopting indigenous agricultural practices is the lack of supportive legal and policy frameworks. Many countries lack policies that recognize and protect indigenous land rights, which are crucial for the continuation of traditional agricultural practices. Without secure land tenure, indigenous communities may face displacement or conversion of their lands to other uses, such as commercial agriculture or urban development [1]. 
46 | 47 | Another policy challenge is the dominance of industrial agriculture, which often receives government subsidies and support. This creates an uneven playing field, making it difficult for small-scale farmers practicing indigenous agriculture to compete [1]. 48 | 49 | ### Cultural and Knowledge Preservation 50 | 51 | The preservation and transmission of indigenous agricultural knowledge are essential for the continuation of these practices. However, many indigenous communities face challenges in maintaining their traditional knowledge due to a lack of documentation, generational gaps, and cultural assimilation [1]. 52 | 53 | Educational systems often prioritize modern agricultural techniques over traditional methods, leading to a loss of indigenous knowledge. Additionally, the migration of younger generations to urban areas can result in a decline in the number of practitioners of traditional agriculture [1]. 54 | 55 | ### Economic and Market Constraints 56 | 57 | Economic constraints also hinder the adoption of indigenous agricultural practices. Many small-scale farmers lack access to markets, credit, and other financial resources necessary to implement these practices. The high initial investment required for agroforestry, for example, can be a significant barrier for resource-poor farmers [1]. 58 | 59 | Market demand for diverse and traditional crops is often limited, making it challenging for farmers to sell their products. The dominance of commercial monocultures in the market further marginalizes indigenous agricultural practices [1]. 60 | 61 | ## Success Stories and Case Studies 62 | 63 | ### Polyculture in Brazilian Drylands 64 | 65 | In the Brazilian drylands, polyculture practices have been successfully implemented to enhance food security and agricultural productivity. By integrating various crops such as beans, corn, and squash, farmers have improved soil health, reduced water usage, and increased crop yields. This approach has also provided diverse and nutritious food sources for local communities, contributing to their food security [2]. 66 | 67 | ### Tiger Prawn and Barramundi Polyculture in Indonesia 68 | 69 | In Indonesia, polyculture practices involving tiger prawn and barramundi have significantly boosted farmers' productivity. By integrating these aquatic species, farmers have improved the efficiency of water and nutrient use, leading to higher yields and income. This practice has also contributed to the conservation of local aquatic biodiversity and provided a sustainable source of protein for local communities [3]. 70 | 71 | ### Banana Polyculture in Uganda 72 | 73 | In Uganda, banana polyculture practices have been successfully implemented to enhance agricultural productivity and resilience. By intercropping bananas with other crops such as beans and coffee, farmers have improved soil health, reduced pest and disease incidence, and increased crop yields. This approach has also contributed to the conservation of local biodiversity and provided diverse food sources for local communities [4]. 74 | 75 | ### Polyculture and Food Security 76 | 77 | Polyculture practices have been identified as a key strategy for enhancing food security. By promoting crop diversity and resilience, polyculture can help ensure a stable and nutritious food supply, even in the face of climate variability and other environmental challenges. This approach has been successfully implemented in various regions, contributing to improved food security and community well-being [5]. 
78 | 79 | ## Policy and Legal Frameworks 80 | 81 | ### Supportive Policies for Indigenous Agriculture 82 | 83 | To promote the adoption of indigenous agricultural practices, supportive policies and legal frameworks are essential. Governments can play a crucial role by recognizing and protecting indigenous land rights, providing financial and technical support, and promoting cultural knowledge preservation. Policies that incentivize sustainable agricultural practices, such as agroforestry and seed saving, can also help overcome economic barriers and encourage farmers to adopt these practices [1]. 84 | 85 | ### International Initiatives 86 | 87 | International initiatives, such as the United Nations Declaration on the Rights of Indigenous Peoples, provide a framework for protecting indigenous land rights and promoting traditional agricultural practices. These initiatives emphasize the importance of cultural preservation and the role of indigenous knowledge in sustainable development [1]. 88 | 89 | ### National and Local Policies 90 | 91 | At the national and local levels, policies that support small-scale farmers and promote agricultural diversity can help overcome barriers to adopting indigenous agricultural practices. For example, policies that provide access to markets, credit, and other financial resources can help farmers implement these practices. Additionally, policies that promote education and training in traditional agricultural methods can help preserve and transmit indigenous knowledge [1]. 92 | 93 | ## Cultural Knowledge Preservation 94 | 95 | ### Strategies for Preserving Indigenous Knowledge 96 | 97 | Preserving and transmitting indigenous agricultural knowledge is crucial for the continuation of these practices. Strategies for cultural knowledge preservation include documentation, education, and community engagement. Documenting traditional agricultural practices through written and oral histories can help preserve this knowledge for future generations [1]. 98 | 99 | Education and training programs that incorporate indigenous agricultural methods can help transmit this knowledge to younger generations. Community engagement initiatives, such as workshops and knowledge-sharing events, can also help preserve and promote indigenous agricultural practices [1]. 100 | 101 | ### Role of Indigenous Leaders and Practitioners 102 | 103 | Indigenous leaders and practitioners play a vital role in preserving and transmitting traditional agricultural knowledge. By sharing their expertise and experiences, they can help educate and inspire others to adopt these practices. Supporting indigenous leaders and practitioners through financial and technical assistance can also help ensure the continuation of traditional agricultural practices [1]. 104 | 105 | ### Cultural Institutions and Organizations 106 | 107 | Cultural institutions and organizations can play a crucial role in preserving and promoting indigenous agricultural knowledge. By providing resources and support for documentation, education, and community engagement, these institutions can help ensure the continuation of traditional agricultural practices. Collaboration between cultural institutions, indigenous communities, and other stakeholders can also help promote the adoption of these practices [1]. 108 | 109 | ## Socio-Economic Impacts 110 | 111 | ### Food Security 112 | 113 | Reviving indigenous agricultural practices can significantly enhance food security. 
By promoting crop diversity and resilience, these practices can help ensure a stable and nutritious food supply, even in the face of environmental challenges. This approach has been successfully implemented in various regions, contributing to improved food security and community well-being [5]. 114 | 115 | ### Economic Stability 116 | 117 | Indigenous agricultural practices can also contribute to economic stability. By providing diverse and sustainable food sources, these practices can help farmers reduce their dependence on external inputs and markets. This can lead to increased self-sufficiency and economic resilience for local communities [5]. 118 | 119 | ### Cultural Preservation 120 | 121 | Preserving and promoting indigenous agricultural knowledge is crucial for cultural preservation. These practices are often deeply rooted in local traditions and values, and their continuation can help maintain cultural identity and heritage. By supporting indigenous agricultural practices, communities can also preserve their cultural landscapes and natural environments [1]. 122 | 123 | ## Strategies and Recommendations 124 | 125 | ### Overcoming Barriers to Adoption 126 | 127 | To overcome the barriers to adopting indigenous agricultural practices, a multi-faceted approach is needed. Supportive policies and legal frameworks that recognize and protect indigenous land rights, provide financial and technical support, and promote cultural knowledge preservation are essential. Additionally, education and training programs that incorporate traditional agricultural methods can help transmit this knowledge to younger generations [1]. 128 | 129 | ### Promoting Indigenous Agriculture 130 | 131 | Promoting indigenous agricultural practices requires collaboration between governments, NGOs, cultural institutions, and indigenous communities. By working together, these stakeholders can develop and implement strategies to overcome barriers and encourage the adoption of these practices. Supporting indigenous leaders and practitioners through financial and technical assistance can also help ensure the continuation of traditional agricultural practices [1]. 132 | 133 | ### Integrating Indigenous Practices into Mainstream Farming 134 | 135 | Integrating indigenous agricultural practices into mainstream farming requires a shift in mindset and approach. Policies that incentivize sustainable agricultural practices, such as agroforestry and seed saving, can help encourage farmers to adopt these practices. Additionally, promoting education and training in traditional agricultural methods can help transmit this knowledge to a broader audience [1]. 136 | 137 | ## Conclusion 138 | 139 | Reviving indigenous agricultural practices offers numerous ecological, socio-economic, and cultural benefits. By promoting soil health, water conservation, carbon sequestration, and biodiversity, these practices contribute to sustainable food systems. However, their widespread adoption faces numerous challenges, including legal and policy barriers, cultural knowledge preservation, and economic constraints. 140 | 141 | Success stories from diverse regions highlight the potential of indigenous agricultural practices to enhance food security, economic stability, and cultural preservation. Supportive policies and legal frameworks, along with strategies for cultural knowledge preservation and education, are essential for overcoming these barriers and promoting the adoption of these practices. 
142 | 143 | By working together, governments, NGOs, cultural institutions, and indigenous communities can develop and implement strategies to integrate indigenous agricultural practices into mainstream farming. This collaborative effort is crucial for ensuring the continuation of these practices and their benefits for future generations. 144 | 145 | # References 146 | 147 | [1] *www.firstnations.org*, "https://www.firstnations.org/wp-content/uploads/publication-attachments/2015-Fact-Sheet-11-Seed-Saving-and-Seed-Sovereignty.pdf", https://www.firstnations.org/wp-content/uploads/publication-attachments/2015-Fact-Sheet-11-Seed-Saving-and-Seed-Sovereignty.pdf 148 | [2] *dry-net.org*, "https://dry-net.org/initiatives/polyculture-in-brazilian-drylands/", https://dry-net.org/initiatives/polyculture-in-brazilian-drylands/ 149 | [3] *thefishsite.com*, "https://thefishsite.com/articles/how-tiger-prawn-and-barramundi-polyculture-helps-boost-farmers-productivity-indonesia", https://thefishsite.com/articles/how-tiger-prawn-and-barramundi-polyculture-helps-boost-farmers-productivity-indonesia 150 | [4] *alliancebioversityciat.org*, "https://alliancebioversityciat.org/stories/embracing-banana-polyculture-uganda", https://alliancebioversityciat.org/stories/embracing-banana-polyculture-uganda 151 | [5] *www.foodunfolded.com*, "https://www.foodunfolded.com/article/is-polyculture-the-key-to-food-security", https://www.foodunfolded.com/article/is-polyculture-the-key-to-food-security 152 | [6] *tracextech.com*, "https://tracextech.com/carbon-sequestration-in-agroforestry/", https://tracextech.com/carbon-sequestration-in-agroforestry/ 153 | 154 | 155 | ## Research Process 156 | 157 | - **Depth**: 3 158 | - **Breadth**: 3 159 | - **Time Taken**: 5m 15s 160 | - **Subqueries Explored**: 9 161 | - **Sources Analyzed**: 52 162 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | langchain>=0.1.0 3 | langchain-core>=0.1.0 4 | langchain-openai>=0.0.5 5 | langchain-community>=0.0.13 6 | langgraph>=0.0.20 7 | langchain-text-splitters>=0.0.1 8 | pydantic>=2.0.0 9 | click>=8.0.0 10 | rich>=13.0.0 11 | aiohttp>=3.8.0 12 | asyncio>=3.4.3 13 | beautifulsoup4>=4.12.0 14 | trafilatura>=1.6.0 15 | fake_useragent>=1.2.0 16 | playwright>=1.40.0 17 | tiktoken>=0.5.0 18 | 19 | # Search engines 20 | googlesearch-python>=1.2.3 21 | wikipedia>=1.4.0 22 | arxiv>=2.0.0 23 | 24 | # Utilities 25 | python-dotenv>=1.0.0 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup script for Shandu deep research system. 
3 | """ 4 | from setuptools import setup, find_packages 5 | import os 6 | 7 | # Read long description from README.md 8 | with open("shandu/README.md", "r", encoding="utf-8") as fh: 9 | long_description = fh.read() 10 | 11 | # Read requirements 12 | with open("requirements.txt") as f: 13 | requirements = [line.strip() for line in f if line.strip() and not line.startswith("#")] 14 | 15 | setup( 16 | name="shandu", 17 | version="2.0.0", 18 | description="Deep research system with LangChain and LangGraph", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | author="Dušan Jolović", 22 | author_email="jolovic@pm.me", 23 | url="https://github.com/jolovicdev/shandu", 24 | packages=find_packages(), 25 | install_requires=requirements, 26 | entry_points={ 27 | "console_scripts": [ 28 | "shandu=shandu.cli:cli", 29 | ], 30 | }, 31 | python_requires=">=3.9", 32 | classifiers=[ 33 | "Development Status :: 3 - Alpha", 34 | "Intended Audience :: Science/Research", 35 | "Intended Audience :: Developers", 36 | "License :: OSI Approved :: MIT License", 37 | "Programming Language :: Python :: 3", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python :: 3.10", 40 | "Programming Language :: Python :: 3.11", 41 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | ], 44 | keywords="research, ai, llm, langchain, langgraph, deepresearch, deepsearch, search", 45 | project_urls={ 46 | "Source": "https://github.com/jolovicdev/shandu", 47 | }, 48 | include_package_data=True, 49 | ) 50 | -------------------------------------------------------------------------------- /shandu/README.md: -------------------------------------------------------------------------------- 1 | # Shandu: Advanced Research System Architecture 2 | 3 | This directory contains the core architecture of the Shandu deep research system. Our modular design separates concerns and enables future extensibility while maintaining clean, testable code. 4 | 5 | ## 📊 System Architecture 6 | 7 | Shandu implements a sophisticated state-based workflow using LangGraph and LangChain to create a robust, extensible research system: 8 | 9 | ``` 10 | shandu/ 11 | ├── __init__.py # Package initialization 12 | ├── cli.py # Command-line interface 13 | ├── config.py # Configuration management 14 | ├── prompts.py # Centralized prompt templates 15 | ├── agents/ # Research agent implementations 16 | │ ├── __init__.py 17 | │ ├── agent.py # LangChain-based agent 18 | │ ├── langgraph_agent.py # LangGraph state-based agent 19 | │ ├── graph/ # Graph workflow components 20 | │ │ ├── __init__.py 21 | │ │ ├── builder.py # Graph construction 22 | │ │ └── wrapper.py # Async function wrappers 23 | │ ├── nodes/ # Graph node implementations 24 | │ │ ├── __init__.py 25 | │ │ ├── initialize.py # Research initialization 26 | │ │ ├── reflect.py # Research reflection 27 | │ │ ├── search.py # Content search and analysis 28 | │ │ └── ... 
# Other node implementations 29 | │ ├── processors/ # Content processing 30 | │ │ ├── __init__.py 31 | │ │ ├── content_processor.py # Content extraction 32 | │ │ └── report_generator.py # Report generation 33 | │ └── utils/ # Agent utilities 34 | │ ├── __init__.py 35 | │ └── agent_utils.py # Helper functions 36 | ├── research/ # Research orchestration 37 | │ ├── __init__.py 38 | │ └── researcher.py # Result management 39 | ├── scraper/ # Web scraping functionality 40 | │ ├── __init__.py 41 | │ └── scraper.py # Ethical web scraper 42 | └── search/ # Search functionality 43 | ├── __init__.py 44 | ├── ai_search.py # AI-powered search 45 | └── search.py # Multi-engine search 46 | ``` 47 | 48 | ## 🔄 LangGraph Research Workflow 49 | 50 | Shandu's research process follows a sophisticated state-based workflow: 51 | 52 | 1. **Initialize**: Define research query, parameters, and create a research plan 53 | 2. **Reflect**: Analyze current findings and identify knowledge gaps 54 | 3. **Generate Queries**: Create targeted search queries based on analysis 55 | 4. **Search**: Execute search queries and collect results 56 | 5. **Smart Source Selection**: Filter and prioritize the most valuable sources 57 | 6. **Format Citations**: Prepare properly formatted citations for all sources 58 | 7. **Generate Initial Report**: Create a first draft of the research report 59 | 8. **Enhance Report**: Add depth, detail, and proper structure 60 | 9. **Expand Key Sections**: Further develop important sections through multi-step synthesis 61 | 10. **Finalize Report**: Apply final formatting and quality checks 62 | 63 | ## 🧠 Advanced Technical Features 64 | 65 | ### State-Based Research With LangGraph 66 | 67 | Our LangGraph implementation provides several key advantages: 68 | 69 | - **Clear State Transitions**: Each research phase has well-defined inputs and outputs 70 | - **Conditional Logic**: Dynamically determines next steps based on current state 71 | - **Circular Flow**: Supports recursive exploration until depth conditions are met 72 | - **Parallel Processing**: Handles concurrent operations for efficiency 73 | - **Error Resilience**: Continues functioning even if individual steps encounter issues 74 | 75 | ### Enhanced Content Processing 76 | 77 | Shandu implements sophisticated content processing: 78 | 79 | - **Content Relevance Filtering**: Uses AI to determine if content is relevant to the research query 80 | - **Source Reliability Assessment**: Evaluates sources for credibility and authority 81 | - **Main Content Extraction**: Identifies and extracts the primary content from web pages 82 | - **Content Analysis Pipeline**: Multi-step analysis for key information extraction 83 | - **Theme Identification**: Automatically discovers and organizes thematic elements 84 | 85 | ### Advanced Report Generation 86 | 87 | Our multi-step report generation process ensures high-quality output: 88 | 89 | 1. **Theme Extraction**: Identifies key themes across all research 90 | 2. **Initial Report Generation**: Creates a structured first draft 91 | 3. **Report Enhancement**: Adds depth, citations, and improved organization 92 | 4. **Key Section Expansion**: Further develops the most important sections 93 | 5. **Citation Management**: Ensures proper attribution of all sources 94 | 6. **Final Cleanup**: Removes artifacts and ensures consistent formatting 95 | 96 | ## 💻 API Details 97 | 98 | ### ResearchGraph Class 99 | 100 | ```python 101 | class ResearchGraph: 102 | """ 103 | State-based research workflow using LangGraph. 
104 | Provides a structured approach to deep research with multiple stages. 105 | """ 106 | def __init__( 107 | self, 108 | llm: Optional[ChatOpenAI] = None, 109 | searcher: Optional[UnifiedSearcher] = None, 110 | scraper: Optional[WebScraper] = None, 111 | temperature: float = 0.5, 112 | date: Optional[str] = None 113 | ) 114 | 115 | async def research( 116 | self, 117 | query: str, 118 | depth: int = 2, 119 | breadth: int = 4, 120 | progress_callback: Optional[Callable] = None, 121 | include_objective: bool = False, 122 | detail_level: str = "high" 123 | ) -> ResearchResult 124 | 125 | def research_sync( 126 | self, 127 | query: str, 128 | depth: int = 2, 129 | breadth: int = 4, 130 | progress_callback: Optional[Callable] = None, 131 | include_objective: bool = False, 132 | detail_level: str = "high" 133 | ) -> ResearchResult 134 | ``` 135 | 136 | ### AISearcher Class 137 | 138 | ```python 139 | class AISearcher: 140 | """ 141 | AI-powered search with content scraping for deeper insights. 142 | """ 143 | def __init__( 144 | self, 145 | llm: Optional[ChatOpenAI] = None, 146 | searcher: Optional[UnifiedSearcher] = None, 147 | scraper: Optional[WebScraper] = None, 148 | max_results: int = 10, 149 | max_pages_to_scrape: int = 3 150 | ) 151 | 152 | async def search( 153 | self, 154 | query: str, 155 | engines: Optional[List[str]] = None, 156 | detailed: bool = False, 157 | enable_scraping: bool = True 158 | ) -> AISearchResult 159 | ``` 160 | 161 | ## 🔌 Integration Points 162 | 163 | Shandu is designed for easy integration: 164 | 165 | - **CLI Interface**: Command-line tools for direct usage 166 | - **Python API**: Clean, well-documented API for integration into other applications 167 | - **Extensible Components**: Easy to add new search engines, scrapers, or processing steps 168 | - **Custom LLM Support**: Works with any LangChain-compatible LLM 169 | - **Callback System**: Progress tracking and event hooks 170 | 171 | ## 🔍 Implementation Details 172 | 173 | ### Prompt Engineering 174 | 175 | Shandu uses carefully crafted prompts for: 176 | 177 | - Query clarification 178 | - Research planning 179 | - Content analysis 180 | - Source evaluation 181 | - Report generation 182 | - Citation formatting 183 | 184 | ### Async Processing 185 | 186 | Extensive use of async/await patterns for: 187 | 188 | - Parallel search execution 189 | - Concurrent web scraping 190 | - Efficient content processing 191 | - Responsive UI updates 192 | 193 | ### Caching System 194 | 195 | Multi-level caching for: 196 | 197 | - Search results 198 | - Scraped content 199 | - Content analysis 200 | - LLM responses 201 | 202 | ## 🔬 Research Algorithm 203 | 204 | Our research algorithm optimizes for: 205 | 206 | 1. **Breadth**: Exploring multiple relevant sub-topics 207 | 2. **Depth**: Drilling down into important details 208 | 3. **Convergence**: Focusing on the most relevant information 209 | 4. **Coverage**: Ensuring comprehensive topic exploration 210 | 5. **Source Quality**: Prioritizing reliable, authoritative sources 211 | 6. **Synthesis**: Creating coherent, well-structured reports 212 | 213 | For more information on using Shandu, see the main [README.md](../README.md) file. -------------------------------------------------------------------------------- /shandu/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shandu Deep Research System 3 | A powerful research tool combining multiple search engines with LangChain integration. 
4 | 5 | Copyright (c) 2025 Dušan Jolović 6 | Licensed under the MIT License. See LICENSE file for details. 7 | """ 8 | 9 | from .search.search import UnifiedSearcher, SearchResult 10 | from .research.researcher import DeepResearcher, ResearchResult 11 | from .agents.agent import ResearchAgent 12 | 13 | __version__ = "2.0.0" 14 | __all__ = [ 15 | "UnifiedSearcher", 16 | "SearchResult", 17 | "DeepResearcher", 18 | "ResearchResult", 19 | "ResearchAgent" 20 | ] 21 | -------------------------------------------------------------------------------- /shandu/agents/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents module for Shandu deep research system. 3 | """ 4 | 5 | from .agent import ResearchAgent 6 | 7 | __all__ = ["ResearchAgent"] 8 | -------------------------------------------------------------------------------- /shandu/agents/agent.py: -------------------------------------------------------------------------------- 1 | """Agent module for Shandu research system.""" 2 | from typing import List, Dict, Optional, Union, Any 3 | from dataclasses import dataclass 4 | from datetime import datetime 5 | import asyncio 6 | import json 7 | import time 8 | 9 | from langchain_core.prompts import ChatPromptTemplate 10 | from langchain_core.output_parsers import StrOutputParser 11 | from langchain_core.runnables import RunnablePassthrough 12 | from langchain_openai import ChatOpenAI 13 | from langchain.agents import AgentType, initialize_agent 14 | from langchain.chains import LLMChain 15 | from langchain.prompts import PromptTemplate 16 | from langchain_community.tools import Tool, DuckDuckGoSearchResults, DuckDuckGoSearchRun 17 | 18 | from ..search.search import UnifiedSearcher, SearchResult 19 | from ..research.researcher import ResearchResult 20 | from ..scraper import WebScraper, ScrapedContent 21 | from ..prompts import SYSTEM_PROMPTS, USER_PROMPTS 22 | from .utils.citation_manager import CitationManager, SourceInfo, Learning 23 | 24 | class ResearchAgent: 25 | """LangChain-based research agent with enhanced citation tracking.""" 26 | def __init__( 27 | self, 28 | llm: Optional[ChatOpenAI] = None, 29 | searcher: Optional[UnifiedSearcher] = None, 30 | scraper: Optional[WebScraper] = None, 31 | temperature: float = 0, 32 | max_depth: int = 2, 33 | breadth: int = 4, 34 | max_urls_per_query: int = 3, 35 | proxy: Optional[str] = None 36 | ): 37 | self.llm = llm or ChatOpenAI( 38 | temperature=temperature, 39 | model="gpt-4" 40 | ) 41 | self.searcher = searcher or UnifiedSearcher() 42 | self.scraper = scraper or WebScraper(proxy=proxy) 43 | self.citation_manager = CitationManager() # Initialize citation manager 44 | # Research parameters 45 | self.max_depth = max_depth 46 | self.breadth = breadth 47 | self.max_urls_per_query = max_urls_per_query 48 | 49 | self.system_prompt = ChatPromptTemplate.from_template(SYSTEM_PROMPTS["research_agent"]) 50 | self.reflection_prompt = ChatPromptTemplate.from_template(USER_PROMPTS["reflection"]) 51 | self.query_gen_prompt = ChatPromptTemplate.from_template(USER_PROMPTS["query_generation"]) 52 | 53 | self.tools = self._setup_tools() 54 | 55 | self.agent = initialize_agent( 56 | tools=self.tools, 57 | llm=self.llm, 58 | agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, 59 | verbose=True 60 | ) 61 | 62 | def _setup_tools(self) -> List[Tool]: 63 | """Setup agent tools.""" 64 | return [ 65 | Tool( 66 | name="search", 67 | func=self.searcher.search_sync, 68 | description="Search multiple sources for 
information about a topic" 69 | ), 70 | DuckDuckGoSearchResults( 71 | name="ddg_results", 72 | description="Get detailed search results from DuckDuckGo" 73 | ), 74 | DuckDuckGoSearchRun( 75 | name="ddg_search", 76 | description="Search DuckDuckGo for a quick answer" 77 | ), 78 | Tool( 79 | name="reflect", 80 | func=self._reflect_on_findings, 81 | description="Analyze and reflect on current research findings" 82 | ), 83 | Tool( 84 | name="generate_queries", 85 | func=self._generate_subqueries, 86 | description="Generate targeted subqueries for deeper research" 87 | ) 88 | ] 89 | 90 | async def _reflect_on_findings(self, findings: str) -> str: 91 | """Analyze research findings.""" 92 | reflection_chain = self.reflection_prompt | self.llm | StrOutputParser() 93 | return await reflection_chain.ainvoke({"findings": findings}) 94 | 95 | async def _generate_subqueries( 96 | self, 97 | query: str, 98 | findings: str, 99 | questions: str 100 | ) -> List[str]: 101 | """Generate subqueries for deeper research.""" 102 | query_chain = self.query_gen_prompt | self.llm | StrOutputParser() 103 | result = await query_chain.ainvoke({ 104 | "query": query, 105 | "findings": findings, 106 | "questions": questions, 107 | "breadth": self.breadth 108 | }) 109 | 110 | queries = [q.strip() for q in result.split("\n") if q.strip()] 111 | return queries[:self.breadth] 112 | 113 | async def _extract_urls_from_results( 114 | self, 115 | search_results: List[SearchResult], 116 | max_urls: int = 3 117 | ) -> List[str]: 118 | """Extract top URLs from search results.""" 119 | urls = [] 120 | seen = set() 121 | 122 | for result in search_results: 123 | if len(urls) >= max_urls: 124 | break 125 | 126 | url = result.url 127 | if url and url not in seen and url.startswith('http'): 128 | urls.append(url) 129 | seen.add(url) 130 | 131 | return urls 132 | 133 | async def _analyze_content( 134 | self, 135 | query: str, 136 | content: List[ScrapedContent] 137 | ) -> Dict[str, Any]: 138 | """Analyze scraped content and track learnings with citation manager.""" 139 | # Prepare content for analysis 140 | content_text = "" 141 | for item in content: 142 | 143 | source_info = SourceInfo( 144 | url=item.url, 145 | title=item.title, 146 | content_type=item.content_type, 147 | access_time=time.time(), 148 | domain=item.url.split("//")[1].split("/")[0] if "//" in item.url else "unknown", 149 | reliability_score=0.8, # Default score, could be more dynamic 150 | metadata=item.metadata 151 | ) 152 | self.citation_manager.add_source(source_info) 153 | 154 | content_text += f"\nSource: {item.url}\nTitle: {item.title}\n" 155 | content_text += f"Content Summary:\n{item.text[:2000]}...\n" 156 | 157 | # Use the content analysis prompt from centralized prompts 158 | analysis_prompt = ChatPromptTemplate.from_messages([ 159 | ("system", SYSTEM_PROMPTS["content_analysis"]), 160 | ("user", USER_PROMPTS["content_analysis"]) 161 | ]) 162 | 163 | analysis_chain = analysis_prompt | self.llm | StrOutputParser() 164 | analysis = await analysis_chain.ainvoke({"query": query, "content": content_text}) 165 | 166 | for item in content: 167 | # Use citation manager to extract and register learnings 168 | learning_hashes = self.citation_manager.extract_learning_from_text( 169 | analysis, # Use the analysis as the source of learnings 170 | item.url, 171 | context=f"Analysis for query: {query}" 172 | ) 173 | 174 | return { 175 | "analysis": analysis, 176 | "sources": [c.url for c in content], 177 | "learnings": len(self.citation_manager.learnings) # Track number of 
learnings 178 | } 179 | 180 | async def research( 181 | self, 182 | query: str, 183 | depth: Optional[int] = None, 184 | engines: List[str] = ["google", "duckduckgo"] 185 | ) -> ResearchResult: 186 | """Execute the research process with enhanced citation tracking.""" 187 | depth = depth if depth is not None else self.max_depth 188 | 189 | context = { 190 | "query": query, 191 | "depth": depth, 192 | "breadth": self.breadth, 193 | "findings": "", 194 | "sources": [], 195 | "subqueries": [], 196 | "content_analysis": [], 197 | "learnings_by_source": {} # Track learnings by source 198 | } 199 | 200 | # Initial system prompt to set up the research 201 | system_chain = self.system_prompt | self.llm | StrOutputParser() 202 | context["findings"] = await system_chain.ainvoke(context) 203 | 204 | # Iterative deepening research process 205 | for current_depth in range(depth): 206 | # Reflect on current findings 207 | reflection = await self._reflect_on_findings(context["findings"]) 208 | 209 | new_queries = await self._generate_subqueries( 210 | query=query, 211 | findings=context["findings"], 212 | questions=reflection 213 | ) 214 | context["subqueries"].extend(new_queries) 215 | 216 | for subquery in new_queries: 217 | agent_result = await self.agent.arun( 218 | f"Research this specific aspect: {subquery}\n\n" 219 | f"Current findings: {context['findings']}\n\n" 220 | "Think step by step about what tools to use and how to verify the information." 221 | ) 222 | 223 | # Perform the search 224 | search_results = await self.searcher.search( 225 | subquery, 226 | engines=engines 227 | ) 228 | 229 | urls_to_scrape = await self._extract_urls_from_results( 230 | search_results, 231 | self.max_urls_per_query 232 | ) 233 | 234 | # Scrape and analyze content 235 | if urls_to_scrape: 236 | scraped_content = await self.scraper.scrape_urls( 237 | urls_to_scrape, 238 | dynamic=True, 239 | force_refresh=False # Use cache when available 240 | ) 241 | 242 | if scraped_content: 243 | # Analyze the content 244 | analysis = await self._analyze_content(subquery, scraped_content) 245 | context["content_analysis"].append({ 246 | "subquery": subquery, 247 | "analysis": analysis["analysis"], 248 | "sources": analysis["sources"], 249 | "learnings": analysis.get("learnings", 0) 250 | }) 251 | 252 | for r in search_results: 253 | if isinstance(r, SearchResult): 254 | context["sources"].append(r.to_dict()) 255 | elif isinstance(r, dict): 256 | context["sources"].append(r) 257 | else: 258 | print(f"Warning: Skipping non-serializable search result: {type(r)}") 259 | 260 | context["findings"] += f"\n\nFindings for '{subquery}':\n{agent_result}" 261 | 262 | if context["content_analysis"]: 263 | latest_analysis = context["content_analysis"][-1] 264 | context["findings"] += f"\n\nDetailed Analysis:\n{latest_analysis['analysis']}" 265 | 266 | # Final reflection and summary 267 | final_reflection = await self._reflect_on_findings(context["findings"]) 268 | 269 | # Prepare detailed sources with content analysis 270 | detailed_sources = [] 271 | for source in context["sources"]: 272 | # Source is already a dictionary at this point 273 | source_dict = source.copy() # Make a copy to avoid modifying the original 274 | 275 | for analysis in context["content_analysis"]: 276 | if source.get("url", "") in analysis["sources"]: 277 | source_dict["detailed_analysis"] = analysis["analysis"] 278 | 279 | if source.get("url") in self.citation_manager.source_to_learnings: 280 | source_url = source.get("url") 281 | learning_ids = 
self.citation_manager.source_to_learnings.get(source_url, []) 282 | source_dict["tracked_learnings"] = len(learning_ids) 283 | context["learnings_by_source"][source_url] = len(learning_ids) 284 | 285 | detailed_sources.append(source_dict) 286 | 287 | citation_stats = { 288 | "total_sources": len(self.citation_manager.sources), 289 | "total_learnings": len(self.citation_manager.learnings), 290 | "source_reliability": self.citation_manager._calculate_source_reliability() 291 | } 292 | 293 | return ResearchResult( 294 | query=query, 295 | summary=final_reflection, 296 | sources=detailed_sources, 297 | subqueries=context["subqueries"], 298 | depth=depth, 299 | content_analysis=context["content_analysis"], 300 | citation_stats=citation_stats 301 | ) 302 | 303 | def research_sync( 304 | self, 305 | query: str, 306 | depth: Optional[int] = None, 307 | engines: List[str] = ["google", "duckduckgo"] 308 | ) -> ResearchResult: 309 | """Synchronous research wrapper.""" 310 | return asyncio.run(self.research(query, depth, engines)) 311 | -------------------------------------------------------------------------------- /shandu/agents/graph/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Graph building module for research LangGraph. 3 | """ 4 | from .builder import build_graph 5 | from .wrapper import create_node_wrapper 6 | 7 | __all__ = ['build_graph', 'create_node_wrapper'] -------------------------------------------------------------------------------- /shandu/agents/graph/builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Graph builder for research graph. 3 | """ 4 | from langgraph.graph import Graph, StateGraph 5 | from ..processors.content_processor import AgentState 6 | from ..utils.agent_utils import should_continue 7 | 8 | def build_graph( 9 | initialize_node, 10 | reflect_node, 11 | generate_queries_node, 12 | search_node, 13 | smart_source_selection, 14 | format_citations_node, 15 | generate_initial_report_node, 16 | enhance_report_node, 17 | expand_key_sections_node, 18 | report_node 19 | ) -> Graph: 20 | """ 21 | Build the research workflow graph with all nodes. 
22 | 23 | Args: 24 | All node functions for the research graph 25 | 26 | Returns: 27 | Compiled graph ready for execution 28 | """ 29 | workflow = StateGraph(AgentState) 30 | 31 | workflow.add_node("initialize", initialize_node) 32 | workflow.add_node("reflect", reflect_node) 33 | workflow.add_node("generate_queries", generate_queries_node) 34 | workflow.add_node("search", search_node) 35 | workflow.add_node("smart_source_selection", smart_source_selection) 36 | workflow.add_node("format_citations", format_citations_node) 37 | workflow.add_node("generate_initial_report", generate_initial_report_node) 38 | workflow.add_node("enhance_report", enhance_report_node) 39 | workflow.add_node("expand_key_sections", expand_key_sections_node) 40 | workflow.add_node("report", report_node) 41 | 42 | workflow.add_edge("initialize", "generate_queries") 43 | workflow.add_edge("reflect", "generate_queries") 44 | workflow.add_edge("generate_queries", "search") 45 | workflow.add_conditional_edges("search", should_continue, { 46 | "continue": "reflect", 47 | "end": "smart_source_selection" 48 | }) 49 | 50 | workflow.add_edge("smart_source_selection", "format_citations") 51 | workflow.add_edge("format_citations", "generate_initial_report") 52 | workflow.add_edge("generate_initial_report", "enhance_report") 53 | workflow.add_edge("enhance_report", "expand_key_sections") 54 | workflow.add_edge("expand_key_sections", "report") 55 | 56 | workflow.set_entry_point("initialize") 57 | workflow.set_finish_point("report") 58 | 59 | return workflow.compile() -------------------------------------------------------------------------------- /shandu/agents/graph/wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper functions to handle async functions in LangGraph nodes. 3 | This module provides thread-safe handling of asyncio event loops. 
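Illustrative sketch of how `create_node_wrapper` (defined below) turns an async node into a plain callable, mirroring its use in `ResearchGraph._build_graph`:

```python
import asyncio

from shandu.agents.graph import create_node_wrapper

# A toy async node: takes a state dict and returns an updated copy.
async def toy_node(state: dict) -> dict:
    await asyncio.sleep(0.1)
    return {**state, "status": "toy step done"}

sync_node = create_node_wrapper(toy_node)

# The wrapper reuses (or creates) an event loop for the current thread,
# so ordinary synchronous code can invoke the async node.
print(sync_node({"status": "starting"}))
```

When the current loop is already running, the wrapper instead hands the coroutine to a one-off worker thread with its own loop, which avoids nesting event loops.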
4 | """ 5 | import asyncio 6 | import threading 7 | from typing import Callable, Any, Awaitable, TypeVar, Dict 8 | from concurrent.futures import ThreadPoolExecutor 9 | 10 | T = TypeVar('T') 11 | 12 | # Thread-local storage for per-thread event loops 13 | _thread_local = threading.local() 14 | # Lock for thread safety when manipulating event loops 15 | _loop_lock = threading.Lock() 16 | # Track active event loops per thread 17 | _thread_loops: Dict[int, asyncio.AbstractEventLoop] = {} 18 | 19 | def get_or_create_event_loop(): 20 | """Get the event loop for the current thread or create a new one if it doesn't exist.""" 21 | thread_id = threading.get_ident() 22 | 23 | # First check thread-local storage to see if we already have a loop 24 | if hasattr(_thread_local, 'loop'): 25 | # Make sure the loop is still valid (not closed) 26 | if not _thread_local.loop.is_closed(): 27 | return _thread_local.loop 28 | 29 | # Try to get the current event loop 30 | try: 31 | loop = asyncio.get_event_loop() 32 | if not loop.is_closed(): 33 | # Store in thread local for faster access next time 34 | _thread_local.loop = loop 35 | with _loop_lock: 36 | _thread_loops[thread_id] = loop 37 | return loop 38 | except RuntimeError: 39 | # No event loop exists for this thread, or it was closed 40 | pass 41 | 42 | # Need to create a new loop 43 | with _loop_lock: 44 | # Double-check if we have a valid loop for this thread 45 | if thread_id in _thread_loops and not _thread_loops[thread_id].is_closed(): 46 | _thread_local.loop = _thread_loops[thread_id] 47 | return _thread_loops[thread_id] 48 | 49 | loop = asyncio.new_event_loop() 50 | asyncio.set_event_loop(loop) 51 | _thread_local.loop = loop 52 | _thread_loops[thread_id] = loop 53 | return loop 54 | 55 | def run_async_in_new_loop(async_fn, *args, **kwargs): 56 | """Run an async function in a new event loop in the current thread.""" 57 | loop = get_or_create_event_loop() 58 | try: 59 | return loop.run_until_complete(async_fn(*args, **kwargs)) 60 | except Exception as e: 61 | 62 | raise e 63 | 64 | def create_node_wrapper(async_fn: Callable[..., Awaitable[T]]) -> Callable[..., T]: 65 | """ 66 | Creates a wrapper that safely executes an async function, ensuring proper event loop handling 67 | across different threading scenarios. 68 | """ 69 | def wrapped_function(*args, **kwargs): 70 | # Use our reliable get_or_create_event_loop function 71 | loop = get_or_create_event_loop() 72 | 73 | if loop.is_running(): 74 | # We're in a running event loop - create a task in ThreadPoolExecutor 75 | # This approach prevents nesting of event loops 76 | with ThreadPoolExecutor(max_workers=1) as executor: 77 | future = executor.submit(run_async_in_new_loop, async_fn, *args, **kwargs) 78 | return future.result() 79 | else: 80 | # We have an event loop but it's not running, use it directly 81 | try: 82 | return loop.run_until_complete(async_fn(*args, **kwargs)) 83 | except Exception as e: 84 | # Log the error if needed 85 | from ...utils.logger import log_error 86 | log_error(f"Error in async execution", e, 87 | context=f"Function: {async_fn.__name__}") 88 | # Re-raise to maintain original behavior 89 | raise 90 | 91 | return wrapped_function 92 | -------------------------------------------------------------------------------- /shandu/agents/langgraph_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Research agent implementation using LangGraph. 
3 | """ 4 | import time 5 | import asyncio 6 | from datetime import datetime 7 | from typing import List, Dict, Optional, Any, Callable 8 | from langchain_openai import ChatOpenAI 9 | from langchain_core.messages import HumanMessage 10 | from rich.console import Console 11 | from rich.panel import Panel 12 | from ..search.search import UnifiedSearcher, SearchResult 13 | from ..scraper import WebScraper, ScrapedContent 14 | from ..research.researcher import ResearchResult 15 | from ..config import config, get_current_date 16 | from .processors import AgentState 17 | from .utils.agent_utils import ( 18 | get_user_input, 19 | clarify_query, 20 | display_research_progress, 21 | is_shutdown_requested 22 | ) 23 | from .nodes import ( 24 | initialize_node, 25 | reflect_node, 26 | generate_queries_node, 27 | search_node, 28 | smart_source_selection, 29 | format_citations_node, 30 | generate_initial_report_node, 31 | enhance_report_node, 32 | expand_key_sections_node, 33 | report_node 34 | ) 35 | from .graph import build_graph, create_node_wrapper 36 | 37 | console = Console() 38 | 39 | class ResearchGraph: 40 | """Research workflow graph implementation.""" 41 | def __init__( 42 | self, 43 | llm: Optional[ChatOpenAI] = None, 44 | searcher: Optional[UnifiedSearcher] = None, 45 | scraper: Optional[WebScraper] = None, 46 | temperature: float = 0.5, 47 | date: Optional[str] = None 48 | ): 49 | api_base = config.get("api", "base_url") 50 | api_key = config.get("api", "api_key") 51 | model = config.get("api", "model") 52 | 53 | self.llm = llm or ChatOpenAI( 54 | base_url=api_base, 55 | api_key=api_key, 56 | model=model, 57 | temperature=temperature, 58 | max_tokens=16384 # Significantly increased max tokens to support much more comprehensive responses 59 | ) 60 | self.searcher = searcher or UnifiedSearcher() 61 | self.scraper = scraper or WebScraper() 62 | self.date = date or get_current_date() 63 | self.progress_callback = None 64 | self.include_objective = False 65 | self.detail_level = "high" 66 | self.graph = self._build_graph() 67 | 68 | def _build_graph(self): 69 | """Build the research graph.""" 70 | 71 | init_node = create_node_wrapper(lambda state: initialize_node(self.llm, self.date, self.progress_callback, state)) 72 | reflect = create_node_wrapper(lambda state: reflect_node(self.llm, self.progress_callback, state)) 73 | gen_queries = create_node_wrapper(lambda state: generate_queries_node(self.llm, self.progress_callback, state)) 74 | search = create_node_wrapper(lambda state: search_node(self.llm, self.searcher, self.scraper, self.progress_callback, state)) 75 | source_selection = create_node_wrapper(lambda state: smart_source_selection(self.llm, self.progress_callback, state)) 76 | citations = create_node_wrapper(lambda state: format_citations_node(self.llm, self.progress_callback, state)) 77 | initial_report = create_node_wrapper(lambda state: generate_initial_report_node(self.llm, self.include_objective, self.progress_callback, state)) 78 | enhance = create_node_wrapper(lambda state: enhance_report_node(self.llm, self.progress_callback, state)) 79 | expand_sections = create_node_wrapper(lambda state: expand_key_sections_node(self.llm, self.progress_callback, state)) 80 | final_report = create_node_wrapper(lambda state: report_node(self.llm, self.progress_callback, state)) 81 | 82 | # Build graph with these node functions 83 | return build_graph( 84 | init_node, 85 | reflect, 86 | gen_queries, 87 | search, 88 | source_selection, 89 | citations, 90 | initial_report, 91 | enhance, 92 | 
expand_sections, 93 | final_report 94 | ) 95 | 96 | async def research( 97 | self, 98 | query: str, 99 | depth: int = 2, 100 | breadth: int = 4, 101 | progress_callback: Optional[Callable[[AgentState], None]] = None, 102 | include_objective: bool = False, 103 | detail_level: str = "high" 104 | ) -> ResearchResult: 105 | """Execute research process on a query.""" 106 | self.progress_callback = progress_callback 107 | self.include_objective = include_objective 108 | self.detail_level = detail_level 109 | 110 | depth = max(1, min(5, depth)) # Ensure depth is between 1 and 5 111 | breadth = max(1, min(10, breadth)) # Ensure breadth is between 1 and 10 112 | 113 | state = AgentState( 114 | messages=[HumanMessage(content=f"Starting research on: {query}")], 115 | query=query, 116 | depth=depth, 117 | breadth=breadth, 118 | current_depth=0, 119 | findings="", 120 | sources=[], 121 | selected_sources=[], 122 | formatted_citations="", 123 | subqueries=[], 124 | content_analysis=[], 125 | start_time=time.time(), 126 | chain_of_thought=[], 127 | status="Starting", 128 | current_date=get_current_date(), 129 | detail_level=detail_level, 130 | identified_themes="", 131 | initial_report="", 132 | enhanced_report="", 133 | final_report="" 134 | ) 135 | 136 | try: 137 | # Invoke the graph with increased recursion limit 138 | final_state = await self.graph.ainvoke(state, {"recursion_limit": 50}) 139 | 140 | elapsed_time = time.time() - final_state["start_time"] 141 | minutes, seconds = divmod(int(elapsed_time), 60) 142 | 143 | return ResearchResult( 144 | query=query, 145 | summary=final_state["findings"], 146 | sources=final_state["sources"], 147 | subqueries=final_state["subqueries"], 148 | depth=depth, 149 | content_analysis=final_state["content_analysis"], 150 | chain_of_thought=final_state["chain_of_thought"], 151 | research_stats={ 152 | "elapsed_time": elapsed_time, 153 | "elapsed_time_formatted": f"{minutes}m {seconds}s", 154 | "sources_count": len(final_state["sources"]), 155 | "subqueries_count": len(final_state["subqueries"]), 156 | "depth": depth, 157 | "breadth": breadth, 158 | "detail_level": detail_level 159 | } 160 | ) 161 | except KeyboardInterrupt: 162 | console.print("\n[yellow]Research interrupted by user. 
Generating report with current findings...[/]") 163 | 164 | elapsed_time = time.time() - state["start_time"] 165 | minutes, seconds = divmod(int(elapsed_time), 60) 166 | 167 | return ResearchResult( 168 | query=query, 169 | summary=state["findings"] + "\n\n*Note: Research was interrupted before completion.*", 170 | sources=state["sources"], 171 | subqueries=state["subqueries"], 172 | depth=state["current_depth"], 173 | content_analysis=state["content_analysis"], 174 | chain_of_thought=state["chain_of_thought"], 175 | research_stats={ 176 | "elapsed_time": elapsed_time, 177 | "elapsed_time_formatted": f"{minutes}m {seconds}s", 178 | "sources_count": len(state["sources"]), 179 | "subqueries_count": len(state["subqueries"]), 180 | "depth": state["current_depth"], 181 | "breadth": breadth, 182 | "detail_level": detail_level, 183 | "interrupted": True 184 | } 185 | ) 186 | 187 | def research_sync( 188 | self, 189 | query: str, 190 | depth: int = 2, 191 | breadth: int = 4, 192 | progress_callback: Optional[Callable[[AgentState], None]] = None, 193 | include_objective: bool = False, 194 | detail_level: str = "high" 195 | ) -> ResearchResult: 196 | """Synchronous wrapper for research.""" 197 | try: 198 | return asyncio.run(self.research(query, depth, breadth, progress_callback, include_objective, detail_level)) 199 | except KeyboardInterrupt: 200 | console.print("\n[yellow]Research interrupted by user.[/]") 201 | raise 202 | -------------------------------------------------------------------------------- /shandu/agents/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Node functions for the research graph workflow. 3 | Each node represents a discrete step in the research process. 4 | """ 5 | from .initialize import initialize_node 6 | from .reflect import reflect_node 7 | from .generate_queries import generate_queries_node 8 | from .search import search_node 9 | from .source_selection import smart_source_selection 10 | from .citations import format_citations_node 11 | from .report_generation import ( 12 | generate_initial_report_node, 13 | enhance_report_node, 14 | expand_key_sections_node, 15 | report_node 16 | ) 17 | 18 | __all__ = [ 19 | 'initialize_node', 20 | 'reflect_node', 21 | 'generate_queries_node', 22 | 'search_node', 23 | 'smart_source_selection', 24 | 'format_citations_node', 25 | 'generate_initial_report_node', 26 | 'enhance_report_node', 27 | 'expand_key_sections_node', 28 | 'report_node' 29 | ] -------------------------------------------------------------------------------- /shandu/agents/nodes/citations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Citation formatting node for research graph with advanced tracking capabilities. 
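A small sketch of the citation plumbing this node relies on; field names mirror how `SourceInfo` is constructed in the node below, and the snippet is illustrative rather than the canonical API:

```python
import time

from shandu.agents.utils.citation_manager import CitationManager, SourceInfo

manager = CitationManager()
source = SourceInfo(
    url="https://example.org/battery-report",
    title="Example battery report",
    snippet="Overview of solid-state cell chemistry.",
    source_type="web",
    content_type="article",
    access_time=time.time(),
    domain="example.org",
    reliability_score=0.8,
    metadata={},
)
manager.add_source(source)

# The legacy registry is kept in sync so reports can use numbered citations.
citation_id = manager.citation_registry.register_citation(source.url)
print(citation_id)
```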
3 | """ 4 | import time 5 | from rich.console import Console 6 | from pydantic import BaseModel, Field 7 | from ..processors.content_processor import AgentState 8 | from ..processors.report_generator import format_citations 9 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 10 | from ..utils.citation_manager import CitationManager, SourceInfo 11 | from ..utils.citation_registry import CitationRegistry 12 | 13 | console = Console() 14 | 15 | class FormattedCitations(BaseModel): 16 | """Structured output for formatted citations.""" 17 | citations: list[str] = Field( 18 | description="List of properly formatted citations", 19 | min_items=1 20 | ) 21 | 22 | async def format_citations_node(llm, progress_callback, state: AgentState) -> AgentState: 23 | """ 24 | Format citations for selected sources to ensure consistent referencing. 25 | 26 | This enhanced version uses the new CitationManager to track relationships 27 | between sources and specific learnings from them. 28 | """ 29 | state["status"] = "Processing and formatting citations" 30 | console.print("[bold blue]Processing source citations with enhanced attribution...[/]") 31 | 32 | selected_urls = state["selected_sources"] 33 | if not selected_urls: 34 | log_chain_of_thought(state, "No sources selected for citations") 35 | return state 36 | 37 | if "citation_manager" not in state: 38 | state["citation_manager"] = CitationManager() 39 | # For backward compatibility 40 | state["citation_registry"] = state["citation_manager"].citation_registry 41 | 42 | citation_manager = state["citation_manager"] 43 | 44 | # Register each source with the citation manager 45 | for url in selected_urls: 46 | 47 | source_meta = next((s for s in state["sources"] if s.get("url") == url), {}) 48 | 49 | source_info = SourceInfo( 50 | url=url, 51 | title=source_meta.get("title", ""), 52 | snippet=source_meta.get("snippet", ""), 53 | source_type="web", 54 | content_type=source_meta.get("content_type", "article"), 55 | access_time=time.time(), 56 | domain=url.split("//")[1].split("/")[0] if "//" in url else "unknown", 57 | reliability_score=0.8, # Default score, could be more dynamic 58 | metadata=source_meta 59 | ) 60 | 61 | citation_manager.add_source(source_info) 62 | 63 | # For backward compatibility, also register with citation registry 64 | citation_id = citation_manager.citation_registry.register_citation(url) 65 | citation_manager.citation_registry.update_citation_metadata(citation_id, { 66 | "title": source_meta.get("title", ""), 67 | "url": url, 68 | "snippet": source_meta.get("snippet", ""), 69 | "source": source_meta.get("source", "") 70 | }) 71 | 72 | formatted_citations = await format_citations( 73 | llm, 74 | selected_urls, 75 | state["sources"], 76 | citation_registry=citation_manager.citation_registry 77 | ) 78 | 79 | state["formatted_citations"] = formatted_citations 80 | log_chain_of_thought(state, f"Processed and formatted citations for {len(selected_urls)} sources") 81 | 82 | if progress_callback: 83 | await _call_progress_callback(progress_callback, state) 84 | return state 85 | -------------------------------------------------------------------------------- /shandu/agents/nodes/generate_queries.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query generation node for research graph. 
3 | """ 4 | import os 5 | import re 6 | from rich.console import Console 7 | from langchain_core.messages import AIMessage, HumanMessage 8 | from langchain_core.prompts import ChatPromptTemplate 9 | from pydantic import BaseModel, Field 10 | from ..processors.content_processor import AgentState 11 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 12 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 13 | 14 | console = Console() 15 | 16 | # Structured output model for query generation 17 | class SearchQueries(BaseModel): 18 | """Structured output for search query generation.""" 19 | queries: list[str] = Field( 20 | description="List of search queries to investigate the topic further", 21 | min_items=1 22 | ) 23 | rationale: str = Field( 24 | description="Explanation of why these queries were selected and how they will help the research" 25 | ) 26 | 27 | async def generate_queries_node(llm, progress_callback, state: AgentState) -> AgentState: 28 | """Generate targeted search queries based on current findings using structured output.""" 29 | state["status"] = "Generating research queries" 30 | console.print("[bold yellow]Generating targeted search queries...[/]") 31 | 32 | try: 33 | # Use a completely direct approach to avoid template issues 34 | direct_prompt = f"""Generate {state['breadth']} specific search queries to investigate the topic: 35 | 36 | Main Query: {state['query']} 37 | 38 | Requirements: 39 | 1. Generate exactly {state['breadth']} search queries 40 | 2. Queries should be natural and conversational (like what someone would type in Google) 41 | 3. Each query should target specific facts, data points, or perspectives 42 | 4. Keep queries direct and concise - avoid complex academic phrasing 43 | 44 | Today's date: {state['current_date']} 45 | 46 | Current Research Findings: 47 | {state['findings'][:2000]} 48 | 49 | Return ONLY the search queries themselves, one per line, with no additional text, numbering, or explanation. 50 | """ 51 | # Send the prompt directly to the model 52 | response = await llm.ainvoke(direct_prompt) 53 | 54 | new_queries = [line.strip() for line in response.content.split("\n") if line.strip()] 55 | # Remove any numbering, bullet points, or other formatting 56 | new_queries = [re.sub(r'^[\d\s\-\*•\.\)]+\s*', '', line).strip() for line in new_queries] 57 | # Remove phrases like "Here are...", "I'll search for..." etc. 58 | new_queries = [re.sub(r'^(here are|i will|i\'ll|let me|these are|i recommend|completed:|search for:).*?:', '', line, flags=re.IGNORECASE).strip() for line in new_queries] 59 | # Filter out any empty lines or lines that don't look like actual queries 60 | new_queries = [q for q in new_queries if q and len(q.split()) >= 2 and not q.lower().startswith(("query", "search", "investigate", "explore", "research"))] 61 | # Limit to the specified breadth 62 | new_queries = new_queries[:state["breadth"]] 63 | 64 | log_chain_of_thought(state, f"Generated {len(new_queries)} search queries for investigation") 65 | 66 | except Exception as e: 67 | from ...utils.logger import log_error 68 | log_error("Error in structured query generation", e, 69 | context=f"Query: {state['query']}, Function: generate_queries_node") 70 | console.print(f"[dim red]Error in structured query generation: {str(e)}. Using simpler approach.[/dim red]") 71 | try: 72 | # Even simpler fallback approach 73 | response = await llm.ainvoke(f"Generate {state['breadth']} simple search queries for {state['query']}. 
Return only the queries, one per line.") 74 | 75 | new_queries = [line.strip() for line in response.content.split("\n") if line.strip()] 76 | # Remove any numbering, bullet points, or other formatting 77 | new_queries = [re.sub(r'^[\d\s\-\*•\.\)]+\s*', '', line).strip() for line in new_queries] 78 | # Remove phrases like "Here are...", "I'll search for..." etc. 79 | new_queries = [re.sub(r'^(here are|i will|i\'ll|let me|these are|i recommend|completed:|search for:).*?:', '', line, flags=re.IGNORECASE).strip() for line in new_queries] 80 | # Filter out any empty lines or lines that don't look like actual queries 81 | new_queries = [q for q in new_queries if q and len(q.split()) >= 2 and not q.lower().startswith(("query", "search", "investigate", "explore", "research"))] 82 | # Limit to the specified breadth 83 | new_queries = new_queries[:state["breadth"]] 84 | except Exception as e2: 85 | console.print(f"[dim red]Error in fallback query generation: {str(e2)}. Using default queries.[/dim red]") 86 | 87 | new_queries = [ 88 | f"{state['query']} latest research", 89 | f"{state['query']} examples", 90 | f"{state['query']} applications" 91 | ][:state["breadth"]] 92 | 93 | if not new_queries and state["query"]: 94 | new_queries = [state["query"]] 95 | 96 | state["messages"].append(HumanMessage(content="Generating new research directions...")) 97 | state["messages"].append(AIMessage(content="Generated queries:\n" + "\n".join(new_queries))) 98 | state["subqueries"].extend(new_queries) 99 | 100 | console.print("[bold green]Generated search queries:[/]") 101 | for i, query in enumerate(new_queries, 1): 102 | console.print(f" {i}. {query}") 103 | 104 | log_chain_of_thought(state, f"Generated {len(new_queries)} search queries for investigation") 105 | if progress_callback: 106 | await _call_progress_callback(progress_callback, state) 107 | return state 108 | -------------------------------------------------------------------------------- /shandu/agents/nodes/initialize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Initialize node for research graph. 
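The node asks for a free-form markdown plan and then pulls each section out with regexes; a condensed sketch of that extraction (pattern copied from the code below):

```python
import re

plan_text = "\n".join([
    "## Objectives",
    "- Map the current landscape",
    "- Identify open problems",
    "## Methodology",
    "Systematic review of recent literature.",
])

match = re.search(
    r'(?:objectives|goals|aims)(?:\s*:|\s*\n)([^#]*?)(?:#|$)',
    plan_text.lower(), re.IGNORECASE | re.DOTALL,
)
objectives = []
if match:
    objectives = [line.strip().strip('-*').strip()
                  for line in match.group(1).strip().split('\n')
                  if line.strip() and not line.strip().startswith('#')]
print(objectives)  # ['map the current landscape', 'identify open problems']
```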
3 | """ 4 | import os 5 | import time 6 | from rich.console import Console 7 | from rich.panel import Panel 8 | from langchain_core.messages import AIMessage, HumanMessage 9 | from langchain_core.prompts import ChatPromptTemplate 10 | from pydantic import BaseModel, Field 11 | from ..processors.content_processor import AgentState 12 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 13 | from ...config import get_current_date 14 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 15 | 16 | console = Console() 17 | 18 | class ResearchPlan(BaseModel): 19 | """Structured output for research plan.""" 20 | objectives: list[str] = Field( 21 | description="Clear objectives for the research", 22 | min_items=1 23 | ) 24 | key_areas: list[str] = Field( 25 | description="Key areas to investigate", 26 | min_items=1 27 | ) 28 | methodology: str = Field( 29 | description="Approach to conducting the research" 30 | ) 31 | expected_outcomes: list[str] = Field( 32 | description="Expected outcomes of the research", 33 | min_items=1 34 | ) 35 | 36 | async def initialize_node(llm, date, progress_callback, state: AgentState) -> AgentState: 37 | """Initialize the research process with a research plan using structured output.""" 38 | console.print(Panel(f"[bold blue]Starting Research:[/] {state['query']}", title="Research Process", border_style="blue")) 39 | state["start_time"] = time.time() 40 | state["status"] = "Initializing research" 41 | state["current_date"] = date or get_current_date() 42 | 43 | try: 44 | # Use a completely direct approach to avoid template issues 45 | direct_prompt = f"""You are an expert research agent tasked with creating a comprehensive research plan. Current date: {state['current_date']} 46 | 47 | Please create a detailed research plan for this query: {state['query']} 48 | 49 | Your plan must include the following sections clearly labeled: 50 | 51 | ## Objectives 52 | - List 3-5 clear objectives for the research 53 | 54 | ## Key Areas to Investigate 55 | - List 4-6 specific areas or aspects that need to be researched 56 | 57 | ## Methodology 58 | - Describe the approach to conducting this research 59 | - Include information sources and analysis methods 60 | 61 | ## Expected Outcomes 62 | - List 3-5 expected results or deliverables from this research 63 | 64 | Format your response with clear section headings and bullet points for clarity. Be specific and detailed in your planning. 
65 | """ 66 | # Send the direct prompt to the model 67 | response = await llm.ainvoke(direct_prompt) 68 | 69 | research_text = response.content 70 | 71 | import re 72 | objectives = [] 73 | key_areas = [] 74 | methodology = "" 75 | expected_outcomes = [] 76 | 77 | objectives_section = re.search(r'(?:objectives|goals|aims)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 78 | if objectives_section: 79 | objectives_text = objectives_section.group(1).strip() 80 | objectives = [line.strip().strip('-*').strip() for line in objectives_text.split('\n') if line.strip() and not line.strip().startswith('#')] 81 | 82 | areas_section = re.search(r'(?:key areas|areas to investigate|investigation areas)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 83 | if areas_section: 84 | areas_text = areas_section.group(1).strip() 85 | key_areas = [line.strip().strip('-*').strip() for line in areas_text.split('\n') if line.strip() and not line.strip().startswith('#')] 86 | 87 | methodology_section = re.search(r'(?:methodology|approach|method)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 88 | if methodology_section: 89 | methodology = methodology_section.group(1).strip() 90 | 91 | outcomes_section = re.search(r'(?:expected outcomes|outcomes|results|expected results)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 92 | if outcomes_section: 93 | outcomes_text = outcomes_section.group(1).strip() 94 | expected_outcomes = [line.strip().strip('-*').strip() for line in outcomes_text.split('\n') if line.strip() and not line.strip().startswith('#')] 95 | 96 | if not objectives: 97 | objectives = ["Understand the key aspects of " + state['query']] 98 | if not key_areas: 99 | key_areas = ["Primary concepts and definitions", "Current applications and examples", "Future trends and developments"] 100 | if not methodology: 101 | methodology = "Systematic review of available literature and analysis of current applications and examples." 102 | if not expected_outcomes: 103 | expected_outcomes = ["Comprehensive understanding of " + state['query'], "Identification of key challenges and opportunities"] 104 | 105 | formatted_plan = "# Research Plan\n\n" 106 | 107 | formatted_plan += "## Objectives\n\n" 108 | for objective in objectives: 109 | formatted_plan += f"- {objective}\n" 110 | 111 | formatted_plan += "\n## Key Areas to Investigate\n\n" 112 | for area in key_areas: 113 | formatted_plan += f"- {area}\n" 114 | 115 | formatted_plan += f"\n## Methodology\n\n{methodology}\n" 116 | 117 | formatted_plan += "\n## Expected Outcomes\n\n" 118 | for outcome in expected_outcomes: 119 | formatted_plan += f"- {outcome}\n" 120 | 121 | state["messages"].append(HumanMessage(content=f"Planning research on: {state['query']}")) 122 | state["messages"].append(AIMessage(content=formatted_plan)) 123 | state["findings"] = f"{formatted_plan}\n\n# Initial Findings\n\n" 124 | 125 | except Exception as e: 126 | from ...utils.logger import log_error 127 | log_error("Error in structured plan generation", e, 128 | context=f"Query: {state['query']}, Function: initialize_node") 129 | console.print(f"[dim red]Error in structured plan generation: {str(e)}. Using simpler approach.[/dim red]") 130 | try: 131 | # Even simpler fallback approach 132 | response = await llm.ainvoke(f"""Create a research plan for: {state['query']} 133 | 134 | Include: 135 | 1. Main objectives 136 | 2. Key areas to investigate 137 | 3. 
Approach/methodology 138 | 4. Expected outcomes 139 | 140 | Keep it concise and practical. 141 | """) 142 | 143 | cleaned_plan = response.content.replace("**", "").replace("# ", "").replace("## ", "") 144 | 145 | state["messages"].append(HumanMessage(content=f"Planning research on: {state['query']}")) 146 | state["messages"].append(AIMessage(content=cleaned_plan)) 147 | state["findings"] = f"# Research Plan\n\n{cleaned_plan}\n\n# Initial Findings\n\n" 148 | except Exception as e2: 149 | console.print(f"[dim red]Error in fallback plan generation: {str(e2)}. Using minimal plan.[/dim red]") 150 | 151 | minimal_plan = f"Research plan for: {state['query']}\n\n- Investigate key aspects\n- Analyze relevant sources\n- Synthesize findings" 152 | 153 | state["messages"].append(HumanMessage(content=f"Planning research on: {state['query']}")) 154 | state["messages"].append(AIMessage(content=minimal_plan)) 155 | state["findings"] = f"# Research Plan\n\n{minimal_plan}\n\n# Initial Findings\n\n" 156 | 157 | log_chain_of_thought(state, f"Created research plan for query: {state['query']}") 158 | if progress_callback: 159 | await _call_progress_callback(progress_callback, state) 160 | return state 161 | -------------------------------------------------------------------------------- /shandu/agents/nodes/reflect.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reflection node for research graph. 3 | """ 4 | import os 5 | from rich.console import Console 6 | from langchain_core.messages import AIMessage, HumanMessage 7 | from langchain_core.prompts import ChatPromptTemplate 8 | from pydantic import BaseModel, Field 9 | from ..processors.content_processor import AgentState 10 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 11 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS, safe_format 12 | 13 | console = Console() 14 | 15 | # Structured output model for reflection 16 | class ResearchReflection(BaseModel): 17 | """Structured output for research reflection.""" 18 | key_insights: list[str] = Field( 19 | description="Key insights gained from the research so far", 20 | min_items=1 21 | ) 22 | knowledge_gaps: list[str] = Field( 23 | description="Identified gaps in the current research", 24 | min_items=1 25 | ) 26 | next_steps: list[str] = Field( 27 | description="Recommended next steps for the research", 28 | min_items=1 29 | ) 30 | reflection_summary: str = Field( 31 | description="Overall reflection on the current state of the research" 32 | ) 33 | 34 | async def reflect_node(llm, progress_callback, state: AgentState) -> AgentState: 35 | """Reflect on current findings to identify gaps and opportunities using structured output.""" 36 | state["status"] = "Reflecting on findings" 37 | console.print("[bold yellow]Reflecting on current findings...[/]") 38 | 39 | try: 40 | # Use safe_format instead of manual escaping 41 | current_date = state['current_date'] 42 | findings = state['findings'][:3000] 43 | 44 | direct_prompt = safe_format("""Analyze the following research findings and provide a detailed reflection. 
Today's date: {current_date} 45 | 46 | Research Findings: 47 | {findings} 48 | 49 | Your reflection must include these sections clearly labeled: 50 | 51 | ## Key Insights 52 | - List the most important discoveries and insights from the research 53 | - Evaluate the evidence strength for each insight 54 | 55 | ## Knowledge Gaps 56 | - Identify specific questions that remain unanswered 57 | - Explain why these gaps are significant 58 | 59 | ## Next Steps 60 | - Suggest specific areas for deeper investigation 61 | - Recommend research methods to address the knowledge gaps 62 | 63 | ## Overall Reflection 64 | - Provide a comprehensive assessment of the research progress 65 | - Evaluate the overall quality and reliability of the findings 66 | 67 | Format your response with clear section headings and bullet points for clarity.""", current_date=current_date, findings=findings) 68 | # Send the prompt directly to the model 69 | response = await llm.ainvoke(direct_prompt) 70 | 71 | reflection_text = response.content 72 | 73 | import re 74 | key_insights = [] 75 | knowledge_gaps = [] 76 | next_steps = [] 77 | reflection_summary = "" 78 | 79 | insights_section = re.search(r'(?:key insights|insights|key findings)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 80 | if insights_section: 81 | insights_text = insights_section.group(1).strip() 82 | key_insights = [line.strip().strip('-*').strip() for line in insights_text.split('\n') if line.strip() and not line.strip().startswith('#')] 83 | 84 | gaps_section = re.search(r'(?:knowledge gaps|gaps|questions|unanswered questions)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 85 | if gaps_section: 86 | gaps_text = gaps_section.group(1).strip() 87 | knowledge_gaps = [line.strip().strip('-*').strip() for line in gaps_text.split('\n') if line.strip() and not line.strip().startswith('#')] 88 | 89 | steps_section = re.search(r'(?:next steps|steps|recommendations|future directions)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 90 | if steps_section: 91 | steps_text = steps_section.group(1).strip() 92 | next_steps = [line.strip().strip('-*').strip() for line in steps_text.split('\n') if line.strip() and not line.strip().startswith('#')] 93 | 94 | summary_section = re.search(r'(?:overall reflection|reflection summary|summary|conclusion)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 95 | if summary_section: 96 | reflection_summary = summary_section.group(1).strip() 97 | 98 | if not key_insights: 99 | key_insights = ["Research is progressing on " + state['query']] 100 | if not knowledge_gaps: 101 | knowledge_gaps = ["Further details needed on specific aspects"] 102 | if not next_steps: 103 | next_steps = ["Continue investigating primary aspects", "Search for more specific examples"] 104 | if not reflection_summary: 105 | reflection_summary = "The research is making progress and has uncovered valuable information, but further investigation is needed in key areas." 
106 | 107 | formatted_reflection = "## Key Insights\n\n" 108 | for insight in key_insights: 109 | formatted_reflection += f"- {insight}\n" 110 | 111 | formatted_reflection += "\n## Knowledge Gaps\n\n" 112 | for gap in knowledge_gaps: 113 | formatted_reflection += f"- {gap}\n" 114 | 115 | formatted_reflection += "\n## Next Steps\n\n" 116 | for step in next_steps: 117 | formatted_reflection += f"- {step}\n" 118 | 119 | formatted_reflection += f"\n## Overall Reflection\n\n{reflection_summary}\n" 120 | 121 | state["messages"].append(HumanMessage(content="Analyzing current findings...")) 122 | state["messages"].append(AIMessage(content=formatted_reflection)) 123 | state["findings"] += f"\n\n## Reflection on Current Findings\n\n{formatted_reflection}\n\n" 124 | 125 | except Exception as e: 126 | from ...utils.logger import log_error 127 | log_error("Error in structured reflection", e, 128 | context=f"Function: reflect_node") 129 | console.print(f"[dim red]Error in structured reflection: {str(e)}. Using simpler approach.[/dim red]") 130 | try: 131 | # Use safe_format in the fallback case too 132 | fallback_findings = state['findings'][:2000] 133 | 134 | fallback_prompt = safe_format("""Reflect on these research findings: 135 | 136 | {findings} 137 | 138 | Include: 139 | 1. Key insights 140 | 2. Knowledge gaps 141 | 3. Next steps 142 | 4. Overall assessment 143 | """, findings=fallback_findings) 144 | 145 | response = await llm.ainvoke(fallback_prompt) 146 | 147 | reflection_content = response.content 148 | 149 | state["messages"].append(HumanMessage(content="Analyzing current findings...")) 150 | state["messages"].append(AIMessage(content=reflection_content)) 151 | state["findings"] += f"\n\n## Reflection on Current Findings\n\n{reflection_content}\n\n" 152 | except Exception as e2: 153 | console.print(f"[dim red]Error in fallback reflection: {str(e2)}. Using minimal reflection.[/dim red]") 154 | 155 | minimal_reflection = "## Research Reflection\n\nThe research is progressing. Further investigation is needed to develop a more comprehensive understanding of the topic." 156 | 157 | state["messages"].append(HumanMessage(content="Analyzing current findings...")) 158 | state["messages"].append(AIMessage(content=minimal_reflection)) 159 | state["findings"] += f"\n\n## Reflection on Current Findings\n\n{minimal_reflection}\n\n" 160 | 161 | log_chain_of_thought(state, "Completed reflection on current findings") 162 | if progress_callback: 163 | await _call_progress_callback(progress_callback, state) 164 | return state 165 | -------------------------------------------------------------------------------- /shandu/agents/nodes/search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search node for research graph. 
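The node fans out one task per recent sub-query and gathers them concurrently; a toy, self-contained sketch of that pattern (the real `process_query` performs search, relevance filtering, scraping, and analysis):

```python
import asyncio

async def process_query(query: str, idx: int) -> str:
    # Stand-in for: search -> relevance filter -> scrape -> analyze.
    await asyncio.sleep(0.1)
    return f"analysis for {query!r} (task {idx})"

async def run_search_round(queries: list[str]) -> list[str]:
    tasks = [process_query(q, i) for i, q in enumerate(queries)]
    # All sub-queries are processed concurrently, as in search_node below.
    return await asyncio.gather(*tasks)

print(asyncio.run(run_search_round(["perovskite solar cells", "grid-scale storage"])))
```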
3 | """ 4 | import asyncio 5 | import time 6 | import random 7 | import logging 8 | from typing import List, Dict, Any, Optional, Set 9 | from concurrent.futures import ThreadPoolExecutor 10 | from rich.console import Console 11 | from langchain_core.messages import AIMessage, HumanMessage 12 | from langchain_core.prompts import ChatPromptTemplate 13 | from pydantic import BaseModel, Field 14 | from ..processors.content_processor import AgentState, is_relevant_url, process_scraped_item, analyze_content 15 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback, is_shutdown_requested 16 | from ...search.search import SearchResult 17 | 18 | console = Console() 19 | 20 | # Structured output model for search results 21 | class SearchResultAnalysis(BaseModel): 22 | """Structured output for search result analysis.""" 23 | relevant_urls: list[str] = Field( 24 | description="List of URLs that are relevant to the query", 25 | min_items=0 26 | ) 27 | analysis: str = Field( 28 | description="Analysis of the search results" 29 | ) 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | async def search_node(llm, searcher, scraper, progress_callback, state: AgentState) -> AgentState: 34 | """ 35 | Search for information based on the current subqueries. 36 | 37 | Args: 38 | llm: Language model to use 39 | searcher: Search engine to use 40 | scraper: Web scraper to use 41 | progress_callback: Callback function for progress updates 42 | state: Current agent state 43 | 44 | Returns: 45 | Updated agent state 46 | """ 47 | if is_shutdown_requested(): 48 | state["status"] = "Shutdown requested, skipping search" 49 | log_chain_of_thought(state, "Shutdown requested, skipping search") 50 | return state 51 | 52 | state["status"] = f"Searching for information (Depth {state['current_depth']})" 53 | 54 | breadth = state["breadth"] 55 | if len(state["subqueries"]) > 0: 56 | recent_queries = state["subqueries"][-breadth:] 57 | else: 58 | recent_queries = [state["query"]] 59 | 60 | async def process_query(query, query_idx): 61 | if is_shutdown_requested(): 62 | log_chain_of_thought(state, f"Shutdown requested, stopping search after {query_idx} queries") 63 | return 64 | 65 | logger.info(f"Processing query {query_idx+1}/{len(recent_queries)}: {query}") 66 | console.print(f"Executing search for: {query}") 67 | state["status"] = f"Searching for: {query}" 68 | 69 | # Search for the query using multiple engines for better results 70 | try: 71 | # Use multiple engines in parallel for more diverse results 72 | engines = ["google", "duckduckgo"] # Using primary engines 73 | if query_idx % 2 == 0: # Add Wikipedia for every other query 74 | engines.append("wikipedia") 75 | 76 | search_results = await searcher.search(query, engines=engines) 77 | if not search_results: 78 | logger.warning(f"No search results found for: {query}") 79 | log_chain_of_thought(state, f"No search results found for '{query}'") 80 | return 81 | 82 | except Exception as e: 83 | console.print(f"[red]Error during search: {e}[/]") 84 | log_chain_of_thought(state, f"Error during search for '{query}': {str(e)}") 85 | return 86 | 87 | # Filter relevant URLs in batches to avoid overwhelming the LLM 88 | relevant_urls = [] 89 | url_batches = [search_results[i:i+10] for i in range(0, len(search_results), 10)] 90 | 91 | for batch in url_batches: 92 | if is_shutdown_requested(): 93 | break 94 | 95 | relevance_tasks = [] 96 | for result in batch: 97 | relevance_task = is_relevant_url(llm, result.url, result.title, result.snippet, query) 98 | 
relevance_tasks.append((result, relevance_task)) 99 | 100 | # Wait for all relevance checks in this batch 101 | for result, relevance_task in relevance_tasks: 102 | try: 103 | is_relevant = await relevance_task 104 | if is_relevant: 105 | relevant_urls.append(result) 106 | 107 | state["sources"].append({ 108 | "url": result.url, 109 | "title": result.title, 110 | "snippet": result.snippet, 111 | "source": result.source, 112 | "query": query 113 | }) 114 | except Exception as e: 115 | logger.error(f"Error checking relevance for {result.url}: {e}") 116 | 117 | if not relevant_urls: 118 | log_chain_of_thought(state, f"No relevant URLs found for '{query}'") 119 | return 120 | 121 | # Limit the number of URLs to scrape for efficiency 122 | # Group the candidates by their originating search engine (sort on the source field) 123 | # before truncating to the top N 124 | relevant_urls.sort(key=lambda r: r.source) 125 | relevant_urls = relevant_urls[:8] # Increased from 5 to 8 for better coverage 126 | 127 | # Scrape the relevant URLs all at once using our improved scraper 128 | urls_to_scrape = [result.url for result in relevant_urls] 129 | 130 | # The new scraper implementation handles concurrency internally 131 | # It will use semaphores to limit concurrent scraping and handle timeouts 132 | try: 133 | scraped_contents = await scraper.scrape_urls( 134 | urls_to_scrape, 135 | dynamic=False, # Avoid dynamic rendering for speed unless specifically needed 136 | force_refresh=False # Use caching if available 137 | ) 138 | except Exception as e: 139 | logger.error(f"Error scraping URLs for query '{query}': {e}") 140 | log_chain_of_thought(state, f"Error scraping URLs for query '{query}': {str(e)}") 141 | return 142 | 143 | processed_items = [] 144 | successful_scrapes = [item for item in scraped_contents if item.is_successful()] 145 | 146 | for item in successful_scrapes: 147 | if is_shutdown_requested(): 148 | break 149 | 150 | logger.info(f"Processing scraped content from: {item.url}") 151 | content_preview = item.text[:100] + "..." 
if len(item.text) > 100 else item.text 152 | logger.debug(f"Content preview: {content_preview}") 153 | 154 | processed_item = await process_scraped_item(llm, item, query, item.text) 155 | processed_items.append(processed_item) 156 | 157 | if not processed_items: 158 | log_chain_of_thought(state, f"No content could be extracted from URLs for '{query}'") 159 | return 160 | 161 | # Prepare content for analysis in a structured way 162 | combined_content = "" 163 | for item in processed_items: 164 | 165 | combined_content += f"\n\n## SOURCE: {item['item'].url}\n" 166 | combined_content += f"## TITLE: {item['item'].title or 'No title'}\n" 167 | combined_content += f"## RELIABILITY: {item['rating']}\n" 168 | combined_content += f"## CONTENT START\n{item['content']}\n## CONTENT END\n" 169 | 170 | analysis = await analyze_content(llm, query, combined_content) 171 | 172 | state["content_analysis"].append({ 173 | "query": query, 174 | "sources": [item["item"].url for item in processed_items], 175 | "analysis": analysis 176 | }) 177 | 178 | state["findings"] += f"\n\n## Analysis for: {query}\n\n{analysis}\n\n" 179 | 180 | log_chain_of_thought(state, f"Analyzed content for query: {query}") 181 | if progress_callback: 182 | await _call_progress_callback(progress_callback, state) 183 | 184 | tasks = [] 185 | for idx, query in enumerate(recent_queries): 186 | tasks.append(process_query(query, idx)) 187 | 188 | # Use gather to process all queries concurrently but with proper control 189 | await asyncio.gather(*tasks) 190 | 191 | state["current_depth"] += 1 192 | log_chain_of_thought(state, f"Completed depth {state['current_depth']} of {state['depth']}") 193 | 194 | if progress_callback and state.get("status") != "Searching": 195 | state["status"] = "Searching completed" 196 | await _call_progress_callback(progress_callback, state) 197 | 198 | return state 199 | -------------------------------------------------------------------------------- /shandu/agents/nodes/source_selection.py: -------------------------------------------------------------------------------- 1 | """Source selection node with robust error handling and retry logic.""" 2 | import os 3 | import re 4 | import time 5 | import asyncio 6 | import random 7 | from typing import List, Dict, Any, Optional 8 | from rich.console import Console 9 | from rich.progress import Progress, SpinnerColumn, TextColumn 10 | from langchain_core.prompts import ChatPromptTemplate 11 | from pydantic import BaseModel, Field 12 | from ..processors.content_processor import AgentState 13 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 14 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 15 | 16 | console = Console() 17 | 18 | # Maximum retry attempts for source selection 19 | MAX_RETRIES = 3 20 | 21 | # Structured output model for source selection 22 | class SourceSelection(BaseModel): 23 | """Structured output for source selection.""" 24 | selected_sources: list[str] = Field( 25 | description="List of URLs for the most valuable sources to include in the report", 26 | min_items=1 27 | ) 28 | selection_rationale: str = Field( 29 | description="Explanation of why these sources were selected" 30 | ) 31 | 32 | # Exponential backoff function for retries 33 | async def backoff_retry(attempt: int) -> None: 34 | """Simple exponential backoff.""" 35 | if attempt > 0: 36 | # Exponential backoff with jitter to avoid thundering herd 37 | delay = min(30, (2 ** attempt) + (random.random() * 0.5)) 38 | console.print(f"[yellow]Backing off for 
{delay:.1f} seconds before retry...[/]") 39 | await asyncio.sleep(delay) 40 | 41 | def extract_urls_from_text(text: str, all_source_urls: List[str]) -> List[str]: 42 | """ 43 | Extract URLs from the model response text. 44 | 45 | Args: 46 | text: The text to extract URLs from 47 | all_source_urls: List of all possible source URLs 48 | 49 | Returns: 50 | List of extracted URLs 51 | """ 52 | selected_urls = [] 53 | lines = text.split('\n') 54 | 55 | # Iterate through each line looking for URLs 56 | for line in lines: 57 | for url in all_source_urls: 58 | if url in line: 59 | if url not in selected_urls: 60 | selected_urls.append(url) 61 | break 62 | 63 | return selected_urls 64 | 65 | async def select_sources_with_llm(llm, all_source_urls: List[str], sources_text: str, query: str) -> List[str]: 66 | """ 67 | Try to select sources using LLM with retry logic. 68 | 69 | Args: 70 | llm: The language model 71 | all_source_urls: List of all source URLs 72 | sources_text: Formatted text of all sources 73 | query: The research query 74 | 75 | Returns: 76 | List of selected URLs 77 | """ 78 | selected_urls = [] 79 | 80 | with Progress( 81 | SpinnerColumn(), 82 | TextColumn("[bold blue]Selecting sources..."), 83 | console=console 84 | ) as progress: 85 | task = progress.add_task("Selecting", total=1) 86 | 87 | # Try using a standard source selection approach first 88 | for attempt in range(MAX_RETRIES): 89 | try: 90 | await backoff_retry(attempt) 91 | 92 | # Use a direct, simplified prompt 93 | direct_prompt = f"""Select the 15-20 most valuable sources for this research report. 94 | 95 | RESEARCH TOPIC: {query} 96 | 97 | SOURCES TO EVALUATE: 98 | {sources_text[:15000]} # Limit text length to avoid token issues 99 | 100 | INSTRUCTIONS: 101 | - Select 15-20 of the most valuable sources from the list 102 | - Return ONLY the exact URLs of your selected sources 103 | - List the URLs in order of importance, one URL per line 104 | - Do not include any explanations, just the URLs 105 | """ 106 | # Try with a smaller timeout and token limit 107 | retry_llm = llm.with_config({"timeout": 30, "max_tokens": 1024}) 108 | response = await retry_llm.ainvoke(direct_prompt) 109 | selected_urls = extract_urls_from_text(response.content, all_source_urls) 110 | 111 | # If we got some results, we're done 112 | if selected_urls: 113 | progress.update(task, completed=1) 114 | break 115 | 116 | except Exception as e: 117 | console.print(f"[yellow]Source selection attempt {attempt+1} failed: {str(e)}[/]") 118 | 119 | # Only log the first error in detail 120 | if attempt == 0: 121 | from ...utils.logger import log_error 122 | log_error("Error in source selection", e, 123 | context=f"Query: {query}, Function: select_sources_with_llm") 124 | 125 | # If this was the last attempt, continue to fallback mechanisms 126 | if attempt == MAX_RETRIES - 1: 127 | console.print("[yellow]All source selection attempts failed, using fallback approach[/]") 128 | 129 | progress.update(task, completed=1) 130 | 131 | return selected_urls 132 | 133 | async def smart_source_selection(llm, progress_callback, state: AgentState) -> AgentState: 134 | """Select relevant sources for the report using robust error handling.""" 135 | state["status"] = "Selecting most valuable sources" 136 | console.print("[bold blue]Selecting most relevant and high-quality sources...[/]") 137 | 138 | # Collect all unique source URLs 139 | all_source_urls = [] 140 | for analysis in state["content_analysis"]: 141 | if "sources" in analysis and isinstance(analysis["sources"], 
list): 142 | for url in analysis["sources"]: 143 | if url not in all_source_urls: 144 | all_source_urls.append(url) 145 | 146 | console.print(f"[green]Found {len(all_source_urls)} total sources to evaluate[/]") 147 | 148 | # If we have too many sources, use smart selection to filter them 149 | if len(all_source_urls) > 25: 150 | # Prepare formatted source text 151 | sources_text = "" 152 | for i, url in enumerate(all_source_urls, 1): 153 | source_meta = next((s for s in state["sources"] if s.get("url") == url), {}) 154 | 155 | sources_text += f"Source {i}:\nURL: {url}\n" 156 | if source_meta.get("title"): 157 | sources_text += f"Title: {source_meta.get('title')}\n" 158 | if source_meta.get("snippet"): 159 | sources_text += f"Summary: {source_meta.get('snippet')}\n" 160 | if source_meta.get("date"): 161 | sources_text += f"Date: {source_meta.get('date')}\n" 162 | sources_text += "\n" 163 | 164 | # Try LLM-based selection with retry logic 165 | selected_urls = await select_sources_with_llm( 166 | llm, 167 | all_source_urls, 168 | sources_text, 169 | state['query'] 170 | ) 171 | 172 | # Fallback: If all attempts fail, use a simplified ranking based on source metadata 173 | if not selected_urls: 174 | console.print("[yellow]Using fallback source selection based on metadata ranking[/]") 175 | 176 | # Prioritize sources with titles and snippets 177 | ranked_sources = [] 178 | for url in all_source_urls: 179 | source_meta = next((s for s in state["sources"] if s.get("url") == url), {}) 180 | 181 | # Simple ranking based on metadata completeness 182 | score = 0 183 | if source_meta.get("title"): 184 | score += 2 185 | if source_meta.get("snippet"): 186 | score += 1 187 | if source_meta.get("date"): 188 | score += 1 189 | 190 | ranked_sources.append((url, score)) 191 | 192 | # Sort by score in descending order 193 | ranked_sources.sort(key=lambda x: x[1], reverse=True) 194 | 195 | # Take top 15-20 sources 196 | max_sources = min(20, len(ranked_sources)) 197 | selected_urls = [url for url, _ in ranked_sources[:max_sources]] 198 | 199 | # Always ensure we have sources 200 | if not selected_urls and all_source_urls: 201 | # Last resort: take the first 15-20 sources 202 | selected_urls = all_source_urls[:min(20, len(all_source_urls))] 203 | 204 | # Store the selected sources 205 | state["selected_sources"] = selected_urls 206 | log_chain_of_thought( 207 | state, 208 | f"Selected {len(selected_urls)} most relevant sources from {len(all_source_urls)} total sources" 209 | ) 210 | else: 211 | # If we don't have too many sources, use all of them 212 | state["selected_sources"] = all_source_urls 213 | log_chain_of_thought(state, f"Using all {len(all_source_urls)} sources for final report") 214 | 215 | if progress_callback: 216 | await _call_progress_callback(progress_callback, state) 217 | return state 218 | -------------------------------------------------------------------------------- /shandu/agents/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Content processing and report generation modules for research agents. 
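Re-exports the shared AgentState definition together with the content analysis and report generation helpers imported below.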
3 | """ 4 | from .content_processor import ( 5 | AgentState, 6 | is_relevant_url, 7 | process_scraped_item, 8 | analyze_content 9 | ) 10 | from .report_generator import ( 11 | generate_title, 12 | format_citations, 13 | extract_themes, 14 | generate_initial_report, 15 | enhance_report, 16 | expand_key_sections 17 | ) 18 | 19 | __all__ = [ 20 | 'AgentState', 21 | 'is_relevant_url', 22 | 'process_scraped_item', 23 | 'analyze_content', 24 | 'generate_title', 25 | 'format_citations', 26 | 'extract_themes', 27 | 'generate_initial_report', 28 | 'enhance_report', 29 | 'expand_key_sections' 30 | ] -------------------------------------------------------------------------------- /shandu/agents/processors/content_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Content processing utilities for research agents. 3 | Contains functionality for handling search results, extracting content, and analyzing information. 4 | """ 5 | 6 | import os 7 | from typing import List, Dict, Optional, Any, Union, TypedDict, Sequence 8 | from dataclasses import dataclass 9 | import json 10 | import time 11 | import asyncio 12 | import re 13 | from datetime import datetime 14 | from rich.console import Console 15 | from langchain_core.messages import AIMessage, HumanMessage, BaseMessage 16 | from langchain_core.prompts import ChatPromptTemplate 17 | from langchain_core.output_parsers import StrOutputParser 18 | from pydantic import BaseModel, Field 19 | from langchain_openai import ChatOpenAI 20 | from ...search.search import SearchResult 21 | from ...scraper import WebScraper, ScrapedContent 22 | 23 | console = Console() 24 | 25 | class AgentState(TypedDict): 26 | messages: Sequence[Union[HumanMessage, AIMessage]] 27 | query: str 28 | depth: int 29 | breadth: int 30 | current_depth: int 31 | findings: str 32 | sources: List[Dict[str, Any]] 33 | selected_sources: List[str] 34 | formatted_citations: str 35 | subqueries: List[str] 36 | content_analysis: List[Dict[str, Any]] 37 | start_time: float 38 | chain_of_thought: List[str] 39 | status: str 40 | current_date: str 41 | detail_level: str 42 | identified_themes: str 43 | initial_report: str 44 | enhanced_report: str 45 | final_report: str 46 | 47 | # Structured output models 48 | class UrlRelevanceResult(BaseModel): 49 | """Structured output for URL relevance check.""" 50 | is_relevant: bool = Field(description="Whether the URL is relevant to the query") 51 | reason: str = Field(description="Reason for the relevance decision") 52 | 53 | class ContentRating(BaseModel): 54 | """Structured output for content reliability rating.""" 55 | rating: str = Field(description="Reliability rating: HIGH, MEDIUM, or LOW") 56 | justification: str = Field(description="Justification for the rating") 57 | extracted_content: str = Field(description="Extracted relevant content from the source") 58 | 59 | class ContentAnalysis(BaseModel): 60 | """Structured output for content analysis.""" 61 | key_findings: List[str] = Field(description="List of key findings from the content") 62 | main_themes: List[str] = Field(description="Main themes identified in the content") 63 | analysis: str = Field(description="Comprehensive analysis of the content") 64 | source_evaluation: str = Field(description="Evaluation of the sources' credibility and relevance") 65 | 66 | async def is_relevant_url(llm: ChatOpenAI, url: str, title: str, snippet: str, query: str) -> bool: 67 | """ 68 | Check if a URL is relevant to the query using structured output. 
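    A cheap domain blocklist check runs first; otherwise a structured LLM call decides relevance,
    with a plain-text prompt as a fallback if structured output fails.

    Args:
        llm: Chat model used for the relevance check
        url: URL of the search result
        title: Title of the search result
        snippet: Snippet text of the search result
        query: The research query being evaluated

    Returns:
        True if the result is judged relevant to the query, False otherwise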
69 | """ 70 | # First use simple heuristics to avoid LLM calls for obviously irrelevant domains 71 | irrelevant_domains = [ 72 | "pinterest", "instagram", "facebook", "twitter", "youtube", "tiktok", 73 | "reddit", "quora", "linkedin", "amazon.com", "ebay.com", "etsy.com", 74 | "walmart.com", "target.com" 75 | ] 76 | if any(domain in url.lower() for domain in irrelevant_domains): 77 | return False 78 | 79 | # Escape any literal curly braces in the inputs 80 | safe_url = url.replace("{", "{{").replace("}", "}}") 81 | safe_title = title.replace("{", "{{").replace("}", "}}") 82 | safe_snippet = snippet.replace("{", "{{").replace("}", "}}") 83 | safe_query = query.replace("{", "{{").replace("}", "}}") 84 | 85 | # Use structured output for relevance check 86 | structured_llm = llm.with_structured_output(UrlRelevanceResult) 87 | system_prompt = ( 88 | "You are evaluating search results for relevance to a specific query.\n\n" 89 | "DETERMINE if the search result is RELEVANT or NOT RELEVANT to answering the query.\n" 90 | "Consider the title, URL, and snippet to make your determination.\n\n" 91 | "Provide a structured response with your decision and reasoning.\n" 92 | ) 93 | user_content = ( 94 | f"Query: {safe_query}\n\n" 95 | f"Search Result:\nTitle: {safe_title}\nURL: {safe_url}\nSnippet: {safe_snippet}\n\n" 96 | "Is this result relevant to the query?" 97 | ) 98 | # Build the prompt chain by piping the prompt into the structured LLM. 99 | prompt = ChatPromptTemplate.from_messages([ 100 | {"role": "system", "content": system_prompt}, 101 | {"role": "user", "content": user_content} 102 | ]) 103 | mapping = {"query": query, "title": title, "url": url, "snippet": snippet} 104 | try: 105 | # Chain the prompt and structured LLM; then call invoke with the mapping 106 | chain = prompt | structured_llm 107 | result = await chain.ainvoke(mapping) 108 | return result.is_relevant 109 | except Exception as e: 110 | from ...utils.logger import log_error 111 | log_error("Error in structured relevance check", e, 112 | context=f"Query: {query}, Function: is_relevant_url") 113 | console.print(f"[dim red]Error in structured relevance check: {str(e)}. Using simpler approach.[/dim red]") 114 | # Escape any literal curly braces in the fallback prompt 115 | safe_fb_url = url.replace("{", "{{").replace("}", "}}") 116 | safe_fb_title = title.replace("{", "{{").replace("}", "}}") 117 | safe_fb_snippet = snippet.replace("{", "{{").replace("}", "}}") 118 | safe_fb_query = query.replace("{", "{{").replace("}", "}}") 119 | 120 | simple_prompt = ( 121 | f"Evaluate if this search result is RELEVANT or NOT RELEVANT to the query.\n" 122 | "Answer with ONLY \"RELEVANT\" or \"NOT RELEVANT\".\n\n" 123 | f"Query: {safe_fb_query}\n" 124 | f"Title: {safe_fb_title}\n" 125 | f"URL: {safe_fb_url}\n" 126 | f"Snippet: {safe_fb_snippet}" 127 | ) 128 | response = await llm.ainvoke(simple_prompt) 129 | result_text = response.content 130 | return "RELEVANT" in result_text.upper() 131 | 132 | async def process_scraped_item(llm: ChatOpenAI, item: ScrapedContent, subquery: str, main_content: str) -> Dict[str, Any]: 133 | """ 134 | Process a scraped item to evaluate reliability and extract content using structured output. 
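    Rates source reliability (HIGH/MEDIUM/LOW) and extracts the most relevant content via structured
    output, falling back to a plain prompt with regex parsing if the structured call fails.

    Args:
        llm: Chat model used for the evaluation
        item: The scraped content item to process
        subquery: The query this content was gathered for
        main_content: Raw text of the scraped page

    Returns:
        Dict with "item", "rating", "justification", and "content" keys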
135 | """ 136 | try: 137 | # Escape any literal curly braces in the content to avoid format string errors 138 | safe_content = main_content[:8000].replace("{", "{{").replace("}", "}}") 139 | safe_url = item.url.replace("{", "{{").replace("}", "}}") 140 | safe_title = item.title.replace("{", "{{").replace("}", "}}") 141 | safe_subquery = subquery.replace("{", "{{").replace("}", "}}") 142 | 143 | structured_llm = llm.with_structured_output(ContentRating) 144 | system_prompt = ( 145 | "You are analyzing web content for reliability and extracting the most relevant information.\n\n" 146 | "Evaluate the RELIABILITY of the content using these criteria:\n" 147 | "1. Source credibility and expertise\n" 148 | "2. Evidence quality\n" 149 | "3. Consistency with known facts\n" 150 | "4. Publication date recency\n" 151 | "5. Presence of citations or references\n\n" 152 | "Rate the source as \"HIGH\", \"MEDIUM\", or \"LOW\" reliability with a brief justification.\n\n" 153 | "Then, EXTRACT the most relevant and valuable content related to the query.\n" 154 | ) 155 | user_message = ( 156 | f"Analyze this web content:\n\n" 157 | f"URL: {safe_url}\n" 158 | f"Title: {safe_title}\n" 159 | f"Query: {safe_subquery}\n\n" 160 | "Content:\n" 161 | f"{safe_content}" 162 | ) 163 | prompt = ChatPromptTemplate.from_messages([ 164 | {"role": "system", "content": system_prompt}, 165 | {"role": "user", "content": user_message} 166 | ]) 167 | mapping = {"url": item.url, "title": item.title, "subquery": subquery} 168 | # Chain the prompt with the structured LLM 169 | chain = prompt | structured_llm 170 | result = await chain.ainvoke(mapping) 171 | return { 172 | "item": item, 173 | "rating": result.rating, 174 | "justification": result.justification, 175 | "content": result.extracted_content 176 | } 177 | except Exception as e: 178 | from ...utils.logger import log_error 179 | log_error("Error in structured content processing", e, 180 | context=f"Query: {subquery}, Function: process_scraped_item") 181 | console.print(f"[dim red]Error in structured content processing: {str(e)}. 
Using simpler approach.[/dim red]") 182 | current_file = os.path.basename(__file__) 183 | # Escape any literal curly braces in the fallback content 184 | safe_shorter_content = main_content[:5000].replace("{", "{{").replace("}", "}}") 185 | safe_fb_url = item.url.replace("{", "{{").replace("}", "}}") 186 | safe_fb_title = item.title.replace("{", "{{").replace("}", "}}") 187 | safe_fb_subquery = subquery.replace("{", "{{").replace("}", "}}") 188 | 189 | simple_prompt = ( 190 | f"Analyze web content for reliability (HIGH/MEDIUM/LOW) and extract relevant information.\n" 191 | "Format your response as:\n" 192 | "RELIABILITY: [rating]\n" 193 | "JUSTIFICATION: [brief explanation]\n" 194 | "EXTRACTED_CONTENT: [relevant content]\n\n" 195 | f"URL: {safe_fb_url}\n" 196 | f"Title: {safe_fb_title}\n" 197 | f"Query: {safe_fb_subquery}\n\n" 198 | "Content:\n" 199 | f"{safe_shorter_content}" 200 | ) 201 | response = await llm.ainvoke(simple_prompt) 202 | content = response.content 203 | rating = "MEDIUM" # Default fallback rating 204 | justification = "" 205 | extracted_content = content 206 | 207 | if "RELIABILITY:" in content: 208 | reliability_match = re.search(r"RELIABILITY:\s*(HIGH|MEDIUM|LOW)", content) 209 | if reliability_match: 210 | rating = reliability_match.group(1) 211 | if "JUSTIFICATION:" in content: 212 | justification_match = re.search(r"JUSTIFICATION:\s*(.+?)(?=\n\n|EXTRACTED_CONTENT:|$)", content, re.DOTALL) 213 | if justification_match: 214 | justification = justification_match.group(1).strip() 215 | if "EXTRACTED_CONTENT:" in content: 216 | content_match = re.search(r"EXTRACTED_CONTENT:\s*(.+?)(?=$)", content, re.DOTALL) 217 | if content_match: 218 | extracted_content = content_match.group(1).strip() 219 | 220 | return { 221 | "item": item, 222 | "rating": rating, 223 | "justification": justification, 224 | "content": extracted_content 225 | } 226 | 227 | async def analyze_content(llm: ChatOpenAI, subquery: str, content_text: str) -> str: 228 | """ 229 | Analyze content from multiple sources and synthesize the information using structured output. 230 | """ 231 | try: 232 | structured_llm = llm.with_structured_output(ContentAnalysis) 233 | system_prompt = ( 234 | "You are analyzing and synthesizing information from multiple web sources.\n\n" 235 | "Your task is to:\n" 236 | "1. Identify the most important and relevant information related to the query\n" 237 | "2. Extract key findings and main themes\n" 238 | "3. Organize the information into a coherent analysis\n" 239 | "4. Evaluate the credibility and relevance of the sources\n" 240 | "5. Maintain source attributions when presenting facts or claims\n\n" 241 | "Create a thorough, well-structured analysis that captures the most valuable insights.\n" 242 | ) 243 | user_message = ( 244 | f"Analyze the following content related to the query: \"{subquery}\"\n\n" 245 | f"{content_text}\n\n" 246 | "Provide a comprehensive analysis that synthesizes the most relevant information " 247 | "from these sources, organized into a well-structured format with key findings." 
248 | ) 249 | # Escape any literal curly braces in the content to avoid format string errors 250 | system_prompt_escaped = system_prompt.replace("{", "{{").replace("}", "}}") 251 | user_message_escaped = user_message.replace("{", "{{").replace("}", "}}") 252 | 253 | prompt = ChatPromptTemplate.from_messages([ 254 | {"role": "system", "content": system_prompt_escaped}, 255 | {"role": "user", "content": user_message_escaped} 256 | ]) 257 | mapping = {"query": subquery} 258 | # Chain the prompt with the structured LLM (using a modified config if needed) 259 | chain = prompt | structured_llm.with_config({"timeout": 180}) 260 | result = await chain.ainvoke(mapping) 261 | formatted_analysis = "### Key Findings\n\n" 262 | for i, finding in enumerate(result.key_findings, 1): 263 | formatted_analysis += f"{i}. {finding}\n" 264 | formatted_analysis += "\n### Main Themes\n\n" 265 | for i, theme in enumerate(result.main_themes, 1): 266 | formatted_analysis += f"{i}. {theme}\n" 267 | formatted_analysis += f"\n### Analysis\n\n{result.analysis}\n" 268 | formatted_analysis += f"\n### Source Evaluation\n\n{result.source_evaluation}\n" 269 | return formatted_analysis 270 | except Exception as e: 271 | from ...utils.logger import log_error 272 | log_error("Error in structured content analysis", e, 273 | context=f"Query: {subquery}, Function: analyze_content") 274 | console.print(f"[dim red]Error in structured content analysis: {str(e)}. Using simpler approach.[/dim red]") 275 | # Escape any literal curly braces in the fallback content 276 | safe_ac_subquery = subquery.replace("{", "{{").replace("}", "}}") 277 | safe_ac_content = content_text[:5000].replace("{", "{{").replace("}", "}}") 278 | 279 | simple_prompt = ( 280 | f"Analyze and synthesize information from multiple web sources.\n" 281 | "Provide a concise but comprehensive analysis of the content related to the query.\n\n" 282 | f"Analyze content related to: {safe_ac_subquery}\n\n" 283 | f"{safe_ac_content}" 284 | ) 285 | simple_llm = llm.with_config({"timeout": 60}) 286 | response = await simple_llm.ainvoke(simple_prompt) 287 | return response.content 288 | -------------------------------------------------------------------------------- /shandu/agents/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for research agents. 
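Re-exports the interactive input, query clarification, progress display, and chain-of-thought logging helpers defined in agent_utils.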
3 | """ 4 | from .agent_utils import ( 5 | get_user_input, 6 | should_continue, 7 | log_chain_of_thought, 8 | display_research_progress, 9 | _call_progress_callback, 10 | clarify_query 11 | ) 12 | 13 | __all__ = [ 14 | 'get_user_input', 15 | 'should_continue', 16 | 'log_chain_of_thought', 17 | 'display_research_progress', 18 | '_call_progress_callback', 19 | 'clarify_query' 20 | ] -------------------------------------------------------------------------------- /shandu/agents/utils/agent_utils.py: -------------------------------------------------------------------------------- 1 | """Agent utility functions.""" 2 | from typing import List, Dict, Optional, Any, Callable, Union, TypedDict, Sequence 3 | from dataclasses import dataclass 4 | import time 5 | import re 6 | import asyncio 7 | import signal 8 | import threading 9 | import sys 10 | import os 11 | from datetime import datetime 12 | from rich.console import Console 13 | from rich.tree import Tree 14 | from rich.progress import Progress, SpinnerColumn, TextColumn 15 | from rich.markup import escape 16 | from langchain_core.messages import AIMessage, HumanMessage, BaseMessage 17 | from langchain_core.prompts import ChatPromptTemplate 18 | from pydantic import BaseModel, Field 19 | from ..processors.content_processor import AgentState 20 | 21 | console = Console() 22 | 23 | # Global shutdown flag for graceful termination 24 | _shutdown_requested = False 25 | _shutdown_lock = threading.Lock() 26 | _shutdown_counter = 0 27 | _MAX_SHUTDOWN_ATTEMPTS = 3 28 | 29 | def setup_signal_handlers(): 30 | """Set up signal handlers for graceful shutdown.""" 31 | def signal_handler(sig, frame): 32 | global _shutdown_requested, _shutdown_counter 33 | with _shutdown_lock: 34 | _shutdown_requested = True 35 | _shutdown_counter += 1 36 | 37 | if _shutdown_counter == 1: 38 | console.print("\n[yellow]Shutdown requested. Completing current operations...[/]") 39 | elif _shutdown_counter == 2: 40 | console.print("\n[orange]Second shutdown request. Canceling operations...[/]") 41 | elif _shutdown_counter >= _MAX_SHUTDOWN_ATTEMPTS: 42 | console.print("\n[bold red]Forced exit requested. Exiting immediately.[/]") 43 | # Force exit after multiple attempts 44 | os._exit(1) 45 | 46 | signal.signal(signal.SIGINT, signal_handler) 47 | signal.signal(signal.SIGTERM, signal_handler) 48 | 49 | # Call this at application startup 50 | setup_signal_handlers() 51 | 52 | def is_shutdown_requested() -> bool: 53 | """Check if shutdown has been requested.""" 54 | with _shutdown_lock: 55 | return _shutdown_requested 56 | 57 | def get_shutdown_level() -> int: 58 | """Get the current shutdown level (number of attempts).""" 59 | with _shutdown_lock: 60 | return _shutdown_counter 61 | 62 | def get_user_input(prompt: str) -> str: 63 | """Get formatted user input with shutdown handling.""" 64 | console.print(prompt, style="yellow") 65 | 66 | if is_shutdown_requested(): 67 | console.print("[yellow]Shutdown requested, skipping user input...[/]") 68 | return "any" # Return a generic answer to allow the process to continue to shutdown 69 | 70 | try: 71 | 72 | return input("> ").strip() 73 | except (KeyboardInterrupt, EOFError): 74 | 75 | with _shutdown_lock: 76 | global _shutdown_requested 77 | _shutdown_requested = True 78 | console.print("\n[yellow]Input interrupted. 
Proceeding with shutdown...[/]") 79 | return "any" # Return a generic answer to allow the process to continue to shutdown 80 | 81 | def should_continue(state: AgentState) -> str: 82 | """Check if research should continue.""" 83 | # First check if shutdown was requested 84 | if is_shutdown_requested(): 85 | # If this is a forceful shutdown (second attempt or higher) 86 | if get_shutdown_level() >= 2: 87 | console.print("[bold red]Forceful shutdown requested. Ending research immediately.[/]") 88 | return "end" 89 | 90 | # For first shutdown request, try to complete gracefully 91 | console.print("[yellow]Shutdown requested. Completing current depth before ending.[/]") 92 | 93 | # If we're already at the end of a depth cycle, end now 94 | if state.get("current_depth", 0) >= state.get("depth", 1): 95 | return "end" 96 | 97 | # Otherwise, allow the current depth to complete 98 | return "continue" 99 | 100 | if "iteration_count" not in state: 101 | state["iteration_count"] = 1 102 | else: 103 | state["iteration_count"] += 1 104 | 105 | # This is separate from depth/breadth and ensures we won't get stuck 106 | if state["iteration_count"] >= 25: 107 | console.print("[yellow]Maximum iterations reached. Ending research to prevent infinite loop.[/]") 108 | return "end" 109 | 110 | # Then check if we've reached the desired depth 111 | if state["current_depth"] < state["depth"]: 112 | return "continue" 113 | 114 | return "end" 115 | 116 | def log_chain_of_thought(state: AgentState, thought: str) -> None: 117 | """ 118 | Log a thought to the agent's chain of thought with timestamp. 119 | 120 | Args: 121 | state: The current agent state 122 | thought: The thought to log 123 | """ 124 | # Sanitize the thought to prevent Rich markup issues 125 | sanitized_thought = thought 126 | # Remove any square brackets that could be misinterpreted as markup 127 | sanitized_thought = re.sub(r'\[[^\]]*\]', '', sanitized_thought) 128 | # Remove any orphaned brackets or tags 129 | sanitized_thought = re.sub(r'\[\/?[^\]]*\]?', '', sanitized_thought) 130 | sanitized_thought = re.sub(r'\[\]', '', sanitized_thought) 131 | 132 | timestamp = datetime.now().strftime("%H:%M:%S") 133 | state["chain_of_thought"].append(f"[{timestamp}] {sanitized_thought}") 134 | 135 | def display_research_progress(state: AgentState) -> Tree: 136 | """ 137 | Create a rich tree display of current research progress. 
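    The tree switches between a Research view (depth, sources, subqueries) and a Report Generation
    view (selected sources, report stages) based on the current status, and always shows the most
    recent chain-of-thought entries.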
138 | 139 | Args: 140 | state: The current agent state 141 | 142 | Returns: 143 | Rich Tree object for display 144 | """ 145 | elapsed_time = time.time() - state["start_time"] 146 | minutes, seconds = divmod(int(elapsed_time), 60) 147 | 148 | # Sanitize status to prevent markup errors 149 | status_raw = state["status"] 150 | status = re.sub(r'\[[^\]]*\]', '', status_raw) # Remove any potential markup 151 | status = escape(status) # Escape any remaining characters 152 | 153 | phase = "Research" if "depth" in status.lower() or any(word in status.lower() for word in ["searching", "querying", "reflecting", "analyzing"]) else "Report Generation" 154 | 155 | tree = Tree(f"[bold blue]{phase} Progress: {status}") 156 | 157 | stats_node = tree.add(f"[cyan]Stats") 158 | stats_node.add(f"[blue]Time Elapsed:[/] {minutes}m {seconds}s") 159 | 160 | if phase == "Research": 161 | # Display research-specific stats 162 | stats_node.add(f"[blue]Current Depth:[/] {state['current_depth']}/{state['depth']}") 163 | stats_node.add(f"[blue]Sources Found:[/] {len(state['sources'])}") 164 | stats_node.add(f"[blue]Subqueries Explored:[/] {len(state['subqueries'])}") 165 | 166 | # Show current research paths - with safety checks 167 | if state["subqueries"]: 168 | queries_node = tree.add("[green]Current Research Paths") 169 | # Safely get the last N queries based on breadth 170 | breadth = max(1, state.get("breadth", 1)) # Ensure breadth is at least 1 171 | 172 | # Limit to actual number of queries available 173 | display_count = min(breadth, len(state["subqueries"])) 174 | 175 | if display_count > 0: 176 | for i in range(-display_count, 0): # Get the last 'display_count' elements 177 | if i + len(state["subqueries"]) >= 0: # Safety check 178 | query_text = state["subqueries"][i] 179 | # Sanitize the query text 180 | query_text = re.sub(r'\[[^\]]*\]', '', query_text) 181 | query_text = escape(query_text) 182 | queries_node.add(query_text) 183 | else: 184 | # Display report generation specific stats 185 | stats_node.add(f"[blue]Sources Selected:[/] {len(state.get('selected_sources', []))}") 186 | 187 | # Show report generation progress 188 | report_progress = tree.add("[green]Report Generation Progress") 189 | if state.get("selected_sources"): 190 | report_progress.add("[green]✓[/green] Sources selected") 191 | if state.get("formatted_citations"): 192 | report_progress.add("[green]✓[/green] Citations formatted") 193 | if state.get("initial_report"): 194 | report_progress.add("[green]✓[/green] Initial report generated") 195 | if state.get("enhanced_report"): 196 | report_progress.add("[green]✓[/green] Report enhanced with details") 197 | if state.get("final_report"): 198 | report_progress.add("[green]✓[/green] Key sections expanded") 199 | 200 | # Show recent thoughts regardless of phase 201 | if state["chain_of_thought"]: 202 | thoughts_node = tree.add("[yellow]Recent Thoughts") 203 | for thought in state["chain_of_thought"][-3:]: 204 | thoughts_node.add(thought) 205 | 206 | # Show latest findings only in research phase 207 | if phase == "Research" and state["findings"]: 208 | findings_node = tree.add("[magenta]Latest Findings") 209 | sections = state["findings"].split("\n\n") 210 | for section in sections[-2:]: 211 | if section.strip(): 212 | # Sanitize findings text to prevent markup errors 213 | section_text = section.strip()[:100] + "..." 
if len(section.strip()) > 100 else section.strip() 214 | # Remove any square brackets that could be misinterpreted as markup 215 | section_text = re.sub(r'\[[^\]]*\]', '', section_text) 216 | # Remove any orphaned brackets or tags 217 | section_text = re.sub(r'\[\/?[^\]]*\]?', '', section_text) 218 | section_text = re.sub(r'\[\]', '', section_text) 219 | # Escape any remaining characters that could be misinterpreted 220 | section_text = escape(section_text) 221 | findings_node.add(section_text) 222 | 223 | # Show shutdown status if requested 224 | if is_shutdown_requested(): 225 | tree.add(f"[bold red]Shutdown requested. Attempt {get_shutdown_level()}/{_MAX_SHUTDOWN_ATTEMPTS}") 226 | 227 | return tree 228 | 229 | async def _call_progress_callback(callback: Optional[Callable], state: AgentState) -> None: 230 | """ 231 | Call the progress callback with the current state if provided. 232 | 233 | Args: 234 | callback: The callback function 235 | state: The current agent state 236 | """ 237 | # Sanitize state values that will be displayed to prevent Rich markup errors 238 | if "status" in state: 239 | state["status"] = escape(re.sub(r'\[[^\]]*\]', '', state["status"])) 240 | 241 | if callback: 242 | try: 243 | if asyncio.iscoroutinefunction(callback): 244 | await callback(state) 245 | else: 246 | callback(state) 247 | except Exception as e: 248 | # Sanitize the error message before displaying 249 | error_msg = str(e) 250 | error_msg = re.sub(r'\[[^\]]*\]', '', error_msg) 251 | error_msg = re.sub(r'\[\/?[^\]]*\]?', '', error_msg) 252 | error_msg = escape(error_msg) 253 | console.print(f"[dim red]Error in progress callback: {error_msg}[/dim red]") 254 | 255 | # Structured output model for query clarification 256 | class ClarificationQuestions(BaseModel): 257 | """Structured output for query clarification questions.""" 258 | questions: list[str] = Field( 259 | description="List of clarifying questions to better understand the research needs", 260 | min_items=1, 261 | max_items=3 262 | ) 263 | 264 | class RefinedQuery(BaseModel): 265 | """Structured output for refined query.""" 266 | query: str = Field(description="The refined, comprehensive research query") 267 | explanation: str = Field(description="Explanation of how the query was refined based on the Q&A") 268 | 269 | async def clarify_query(query: str, llm, date: Optional[str] = None, system_prompt: str = "", user_prompt: str = "") -> str: 270 | """Interactive query clarification process with structured output.""" 271 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 272 | 273 | current_date = date or datetime.now().strftime("%Y-%m-%d") 274 | console.print(f"[bold blue]Initial Query:[/] {query}") 275 | 276 | if not system_prompt: 277 | # Use direct string with current_date instead of format 278 | clarify_prompt = SYSTEM_PROMPTS.get("clarify_query", "") 279 | system_prompt = f"""You must generate clarifying questions to refine the research query with strict adherence to: 280 | - Eliciting specific details about user goals, scope, and knowledge level. 281 | - Avoiding extraneous or trivial queries. 282 | - Providing precisely 4-5 targeted questions. 283 | 284 | Today's date: {current_date}. 285 | 286 | These questions must seek to clarify the exact focal points, the depth of detail, constraints, and user background knowledge. 
Provide them succinctly and plainly, with no added commentary.""" 287 | 288 | if not user_prompt: 289 | user_prompt = USER_PROMPTS.get("clarify_query", "") 290 | 291 | try: 292 | # Use a simpler approach to avoid issues with prompt templates 293 | try: 294 | # Direct approach without structured output 295 | response = await llm.ainvoke(f""" 296 | {system_prompt} 297 | 298 | Generate 3-5 direct, specific questions to better understand the research needs for the query: "{query}" 299 | 300 | Focus on: 301 | 1. Clarifying the specific areas the user wants to explore 302 | 2. The level of detail needed 303 | 3. Specific sources or perspectives to include 304 | 4. Time frame or context relevant to the query 305 | 306 | IMPORTANT: Provide ONLY the questions themselves, without any introduction or preamble. 307 | Each question should be clear, direct, and standalone. 308 | """) 309 | 310 | questions = [q.strip() for q in response.content.split("\n") if q.strip() and "?" in q] 311 | 312 | # Limit to top 3-5 questions 313 | questions = questions[:5] 314 | except Exception as e: 315 | console.print(f"[dim red]Error in question generation: {str(e)}. Using default questions.[/dim red]") 316 | questions = [] 317 | except Exception as e: 318 | from ...utils.logger import log_error 319 | log_error("Error in clarify_query", e, 320 | context=f"Query: {query}, Function: clarify_query") 321 | console.print(f"[dim red]Error in structured question generation: {str(e)}. Using simpler approach.[/dim red]") 322 | try: 323 | # Direct approach without structured output 324 | response = await llm.ainvoke(f"Generate 3 direct clarifying questions for the research query: {query}") 325 | 326 | questions = [q.strip() for q in response.content.split("\n") if q.strip() and "?" in q] 327 | except Exception as e2: 328 | console.print(f"[dim red]Error in fallback question generation: {str(e2)}. Using default questions.[/dim red]") 329 | questions = [] 330 | 331 | # If we couldn't extract questions, create some generic ones 332 | if not questions: 333 | questions = [ 334 | "What specific application or area of this topic are you most interested in?", 335 | "What is the intended audience or purpose of this research?", 336 | "Are you interested in current applications, future trends, ethical considerations, or a combination of these aspects?" 337 | ] 338 | 339 | # Limit to 3 questions 340 | questions = questions[:3] 341 | 342 | answers = [] 343 | for q in questions: 344 | 345 | if is_shutdown_requested(): 346 | console.print("[yellow]Shutdown requested, using generic answers...[/]") 347 | answers.append("any") # Use a generic answer 348 | continue 349 | 350 | answer = get_user_input(q) 351 | answers.append(answer) 352 | 353 | qa_text = "\n".join([f"Q: {q}\nA: {a}" for q, a in zip(questions, answers)]) 354 | 355 | refine_system_prompt = f"""You must refine the research query into a strict, focused direction based on user-provided answers. Today's date: {current_date}. 356 | 357 | REQUIREMENTS: 358 | - DO NOT present any "Research Framework" or "Objective" headings. 359 | - Provide a concise topic statement followed by 2-3 paragraphs integrating all key points from the user. 360 | - Preserve all critical details mentioned by the user. 
361 | - The format must be simple plain text with no extraneous headings or bullet points.""" 362 | 363 | refine_user_prompt = USER_PROMPTS.get("refine_query", "") 364 | 365 | try: 366 | # Use direct approach without structured output 367 | response = await llm.ainvoke(f""" 368 | {refine_system_prompt} 369 | 370 | Original query: {query} 371 | Follow-up questions and answers: 372 | {qa_text} 373 | 374 | Based on this information, create a comprehensive, well-structured research query. 375 | The query should be clear, focused, and incorporate all relevant information from the answers. 376 | """) 377 | 378 | refined_context_raw = response.content 379 | 380 | refined_context = refined_context_raw.replace("**", "").replace("# ", "").replace("## ", "") 381 | refined_context = re.sub(r'^(?:Based on our discussion,|Following our conversation,|As per our discussion,).*?(?:refined topic:|research the following:|exploring|analyze):\s*', '', refined_context, flags=re.IGNORECASE) 382 | refined_context = re.sub(r'Based on our discussion.*?(?=\.)\.', '', refined_context, flags=re.IGNORECASE) 383 | except Exception as e: 384 | from ...utils.logger import log_error 385 | log_error("Error in clarify_query", e, 386 | context=f"Query: {query}, Function: clarify_query") 387 | console.print(f"[dim red]Error in structured query refinement: {str(e)}. Using simpler approach.[/dim red]") 388 | #current_file = os.path.basename(__file__) 389 | #with open('example.txt', 'a') as file: 390 | # Append the current file's name and some text 391 | #file.write(f'This line was written by: {current_file}\n') 392 | #file.write(f'Error {e}.\n') 393 | # Fallback to non-structured approach 394 | try: 395 | # Direct approach without structured output 396 | response = await llm.ainvoke(f""" 397 | Original query: {query} 398 | 399 | Follow-up questions and answers: 400 | {qa_text} 401 | 402 | Based on this information, create a comprehensive, well-structured research query. 403 | """) 404 | 405 | refined_context_raw = response.content 406 | 407 | refined_context = refined_context_raw.replace("**", "").replace("# ", "").replace("## ", "") 408 | refined_context = re.sub(r'^(?:Based on our discussion,|Following our conversation,|As per our discussion,).*?(?:refined topic:|research the following:|exploring|analyze):\s*', '', refined_context, flags=re.IGNORECASE) 409 | refined_context = re.sub(r'Based on our discussion.*?(?=\.)\.', '', refined_context, flags=re.IGNORECASE) 410 | except Exception as e2: 411 | console.print(f"[dim red]Error in fallback query refinement: {str(e2)}. Using original query.[/dim red]") 412 | refined_context = query 413 | 414 | console.print(f"\n[bold green]Refined Research Query:[/]\n{refined_context}") 415 | return refined_context 416 | -------------------------------------------------------------------------------- /shandu/agents/utils/citation_registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | Citation registry to track and manage citations throughout the report generation process. 3 | """ 4 | from typing import Dict, Any, List, Optional, Set, Union 5 | 6 | class CitationRegistry: 7 | """ 8 | Registry that tracks all citations used in a report, ensuring that in-text citations 9 | match the sources in the references section. 
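    Illustrative usage (the example URL is a placeholder):

        registry = CitationRegistry()
        cid = registry.register_citation("https://example.com/article")
        draft = f"A key finding from recent work [{cid}]."
        assert registry.validate_citations(draft)["valid"]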
10 | """ 11 | def __init__(self): 12 | self.citations = {} # Maps citation_id to source metadata 13 | self.id_to_url = {} # Maps citation_id to source URL for quick lookups 14 | self.url_to_id = {} # Maps source URL to citation_id for deduplication 15 | self.next_id = 1 # Next available citation ID 16 | self.citation_contexts = {} # Stores context for each citation use 17 | 18 | def register_citation(self, source_url: str, context: str = "") -> int: 19 | """ 20 | Register a citation and return its ID. 21 | 22 | Args: 23 | source_url: The URL of the source being cited 24 | context: Optional context about how the citation is being used 25 | 26 | Returns: 27 | int: The citation ID to use in the report 28 | """ 29 | 30 | if source_url in self.url_to_id: 31 | citation_id = self.url_to_id[source_url] 32 | 33 | if context and context not in self.citation_contexts.get(citation_id, []): 34 | if citation_id not in self.citation_contexts: 35 | self.citation_contexts[citation_id] = [] 36 | self.citation_contexts[citation_id].append(context) 37 | return citation_id 38 | 39 | # Register new citation 40 | citation_id = self.next_id 41 | self.citations[citation_id] = { 42 | "url": source_url, 43 | "id": citation_id 44 | } 45 | self.id_to_url[citation_id] = source_url 46 | self.url_to_id[source_url] = citation_id 47 | 48 | # Store context if provided 49 | if context: 50 | self.citation_contexts[citation_id] = [context] 51 | 52 | self.next_id += 1 53 | return citation_id 54 | 55 | def get_citation_url(self, citation_id: int) -> Optional[str]: 56 | """Get the URL associated with a citation ID.""" 57 | return self.id_to_url.get(citation_id) 58 | 59 | def get_citation_info(self, citation_id: int) -> Optional[Dict[str, Any]]: 60 | """Get the full citation info for a citation ID.""" 61 | return self.citations.get(citation_id) 62 | 63 | def get_all_citations(self) -> Dict[int, Dict[str, Any]]: 64 | """Return all registered citations.""" 65 | return self.citations 66 | 67 | def get_all_citation_urls(self) -> List[str]: 68 | """Return all unique cited URLs in order of first citation.""" 69 | return [self.id_to_url[cid] for cid in sorted(self.id_to_url.keys())] 70 | 71 | def get_citation_contexts(self, citation_id: int) -> List[str]: 72 | """Get the contexts in which a citation was used.""" 73 | return self.citation_contexts.get(citation_id, []) 74 | 75 | def bulk_register_sources(self, source_urls: List[str]) -> None: 76 | """Pre-register a list of sources without assigning contexts.""" 77 | for url in source_urls: 78 | if url not in self.url_to_id: 79 | self.register_citation(url) 80 | 81 | def update_citation_metadata(self, citation_id: int, metadata: Dict[str, Any]) -> None: 82 | """Update metadata for a citation (e.g., add title, date, etc.).""" 83 | if citation_id in self.citations: 84 | self.citations[citation_id].update(metadata) 85 | 86 | def validate_citations(self, text: str) -> Dict[str, Any]: 87 | """ 88 | Validate all citations in a text against the registry. 
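        Citations are expected in bracketed numeric form (e.g. [3]); any ID that is not registered,
        or that exceeds the highest registered ID, is reported as invalid.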
89 | 90 | Args: 91 | text: The text content to validate citations in 92 | 93 | Returns: 94 | Dict containing validation results with keys: 95 | - valid: Boolean indicating if all citations are valid 96 | - invalid_citations: Set of invalid citation IDs 97 | - missing_citations: Set of citation IDs in the registry not used in the text 98 | - used_citations: Set of citation IDs that are actually used in the text 99 | - out_of_range_citations: Set of citation IDs that exceed the maximum registered ID 100 | """ 101 | import re 102 | 103 | citation_pattern = re.compile(r'\[(\d+)\]') 104 | used_citations = set(int(cid) for cid in citation_pattern.findall(text) if cid.isdigit()) 105 | 106 | registry_ids = set(self.citations.keys()) 107 | invalid_citations = used_citations - registry_ids 108 | missing_citations = registry_ids - used_citations 109 | 110 | # Identify citations that exceed the maximum registered ID 111 | max_id = max(registry_ids) if registry_ids else 0 112 | out_of_range_citations = {cid for cid in used_citations if cid > max_id} 113 | 114 | invalid_citations = invalid_citations.union(out_of_range_citations) 115 | 116 | return { 117 | "valid": len(invalid_citations) == 0, 118 | "invalid_citations": invalid_citations, 119 | "missing_citations": missing_citations, 120 | "used_citations": used_citations, 121 | "out_of_range_citations": out_of_range_citations, 122 | "max_valid_id": max_id 123 | } 124 | -------------------------------------------------------------------------------- /shandu/config.py: -------------------------------------------------------------------------------- 1 | """Configuration management module.""" 2 | import os 3 | import json 4 | from typing import Dict, Any, Optional 5 | from pathlib import Path 6 | import datetime 7 | 8 | DEFAULT_CONFIG = { 9 | "api": { 10 | "base_url": "https://api.openai.com/v1", 11 | "api_key": "", 12 | "model": "gpt-4", 13 | "temperature": 0 14 | }, 15 | "search": { 16 | "engines": ["duckduckgo", "google"], 17 | "max_results": 10, 18 | "region": "wt-wt", 19 | "safesearch": "moderate", 20 | "user_agent": "Research 1.0" 21 | }, 22 | "research": { 23 | "default_depth": 2, 24 | "default_breadth": 4, 25 | "max_depth": 5, 26 | "max_breadth": 10, 27 | "max_urls_per_query": 3 28 | }, 29 | "scraper": { 30 | "timeout": 30, 31 | "max_retries": 3, 32 | "chunk_size": 1000, 33 | "chunk_overlap": 200, 34 | "proxy": None 35 | }, 36 | "display": { 37 | "verbose": False, 38 | "show_progress": True, 39 | "show_chain_of_thought": True 40 | } 41 | } 42 | 43 | class Config: 44 | """Configuration manager.""" 45 | 46 | def __init__(self): 47 | self._config = DEFAULT_CONFIG.copy() 48 | self._config_path = os.path.expanduser("~/.shandu/config.json") 49 | self._load_config() 50 | self._load_env_vars() 51 | 52 | def _load_config(self): 53 | """Load config from file.""" 54 | config_path = Path(self._config_path) 55 | if config_path.exists(): 56 | try: 57 | with open(config_path) as f: 58 | file_config = json.load(f) 59 | self._update_nested_dict(self._config, file_config) 60 | except Exception as e: 61 | print(f"Error loading config file: {e}") 62 | 63 | def _load_env_vars(self): 64 | """Load config from environment variables.""" 65 | if os.environ.get("OPENAI_API_BASE"): 66 | self._config["api"]["base_url"] = os.environ["OPENAI_API_BASE"] 67 | if os.environ.get("OPENAI_API_KEY"): 68 | self._config["api"]["api_key"] = os.environ["OPENAI_API_KEY"] 69 | if os.environ.get("OPENAI_MODEL_NAME"): 70 | self._config["api"]["model"] = os.environ["OPENAI_MODEL_NAME"] 71 | 72 | 
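        # Optional proxy and user-agent overrides for the scraper and search modules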
if os.environ.get("SHANDU_PROXY"): 73 | self._config["scraper"]["proxy"] = os.environ["SHANDU_PROXY"] 74 | 75 | if os.environ.get("USER_AGENT"): 76 | self._config["search"]["user_agent"] = os.environ["USER_AGENT"] 77 | 78 | def _update_nested_dict(self, d: Dict, u: Dict): 79 | """Update nested dictionary.""" 80 | for k, v in u.items(): 81 | if isinstance(v, dict) and k in d and isinstance(d[k], dict): 82 | self._update_nested_dict(d[k], v) 83 | else: 84 | d[k] = v 85 | 86 | def save(self): 87 | """Save config to file.""" 88 | config_path = Path(self._config_path) 89 | config_path.parent.mkdir(exist_ok=True, parents=True) 90 | with open(config_path, "w") as f: 91 | json.dump(self._config, f, indent=2) 92 | 93 | def get(self, section: str, key: str, default: Any = None) -> Any: 94 | """Get config value.""" 95 | try: 96 | return self._config[section][key] 97 | except KeyError: 98 | return default 99 | 100 | def set(self, section: str, key: str, value: Any): 101 | """Set config value.""" 102 | if section not in self._config: 103 | self._config[section] = {} 104 | self._config[section][key] = value 105 | 106 | def get_section(self, section: str) -> Dict[str, Any]: 107 | """Get config section.""" 108 | return self._config.get(section, {}).copy() 109 | 110 | def get_all(self) -> Dict[str, Any]: 111 | """Get all config.""" 112 | return self._config.copy() 113 | 114 | config = Config() 115 | 116 | def get_current_date() -> str: 117 | """Get current date.""" 118 | return datetime.datetime.now().strftime("%Y-%m-%d") 119 | 120 | def get_current_datetime() -> str: 121 | """Get current date and time.""" 122 | return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 123 | 124 | def get_user_agent() -> str: 125 | """Get user agent string.""" 126 | configured_agent = config.get("search", "user_agent", None) 127 | if configured_agent and configured_agent != "Research 1.0": 128 | return configured_agent 129 | 130 | try: 131 | from fake_useragent import UserAgent 132 | ua = UserAgent() 133 | return ua.random 134 | except ImportError: 135 | import random 136 | fake_user_agents = [ 137 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 138 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", 139 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0", 140 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", 141 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 142 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0" 143 | ] 144 | return random.choice(fake_user_agents) 145 | except Exception as e: 146 | print(f"Error generating user agent: {e}") 147 | return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" 148 | -------------------------------------------------------------------------------- /shandu/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Centralized prompts for Shandu deep research system. 3 | All prompts used throughout the system are defined here for easier maintenance. 
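Use the safe_format() helper below when interpolating values that may themselves contain literal curly braces.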
4 | """ 5 | from typing import Dict, Any 6 | 7 | # Utility function to safely format prompts with content that may contain curly braces 8 | def safe_format(template: str, **kwargs: Any) -> str: 9 | """ 10 | Safely format a template string, escaping any curly braces in the values. 11 | This prevents ValueError when content contains unexpected curly braces. 12 | """ 13 | # Escape any curly braces in the values 14 | safe_kwargs = {k: v.replace('{', '{{').replace('}', '}}') if isinstance(v, str) else v 15 | for k, v in kwargs.items()} 16 | return template.format(**safe_kwargs) 17 | 18 | # System prompts 19 | SYSTEM_PROMPTS: Dict[str, str] = { 20 | "research_agent": """You are an expert research agent with a strict mandate to investigate topics in exhaustive detail. Adhere to the following instructions without deviation: 21 | 22 | 1. You MUST break down complex queries into smaller subqueries to thoroughly explore each component. 23 | 2. You MUST consult and analyze multiple sources for comprehensive information. 24 | 3. You MUST verify and cross-check findings from all sources for accuracy. 25 | 4. You MUST provide deep insights and structured reasoning through self-reflection. 26 | 5. You MUST produce meticulously detailed research reports. 27 | 28 | REQUIRED CONDUCT: 29 | - Assume user statements referring to events beyond your known timeline are correct if explicitly indicated as new information. 30 | - The user is highly experienced, so maintain a sophisticated level of detail. 31 | - Provide thoroughly organized and carefully reasoned responses. 32 | - Anticipate additional angles and solutions beyond the immediate scope. 33 | - NEVER make unwarranted assumptions. If information is uncertain, state so clearly. 34 | - ALWAYS correct mistakes promptly and without hesitation. 35 | - NEVER rely on authoritative claims alone. Base responses on thorough analysis of the content. 36 | - Acknowledge new or unconventional technologies and ideas but label speculative elements clearly. 37 | 38 | When examining any sources, you must carefully seek: 39 | - Primary sources and official data 40 | - Recent, up-to-date materials 41 | - Expert analyses with strong evidence 42 | - Cross-verification of major claims 43 | 44 | You must strictly address the current query as follows: 45 | Current query: {{query}} 46 | Research depth: {{depth}} 47 | Research breadth: {{breadth}}""", 48 | 49 | "initialize": """You are an expert research agent with a strict mandate to devise a comprehensive research plan. You must adhere to the following directives without exception: 50 | 51 | Current date: {{current_date}} 52 | 53 | Your mission is to produce a meticulous research plan for the given query. You must: 54 | 1. Rigorously decompose the query into key subtopics and objectives. 55 | 2. Identify robust potential information sources and potential angles of investigation. 56 | 3. Weigh multiple perspectives and acknowledge any biases explicitly. 57 | 4. Devise reliable strategies for verifying gathered information from diverse sources. 58 | 59 | Your response must appear as plain text with clear section headings, but no special formatting or extraneous commentary. Remain strictly methodical and thorough throughout.""", 60 | 61 | "reflection": """You are strictly required to analyze the assembled research findings in detail to generate well-founded insights. Today's date: {{current_date}} 62 | 63 | You must: 64 | - Conduct a thorough, critical, and balanced assessment. 
65 | - Identify patterns, contradictions, and content that is not directly relevant. 66 | - Evaluate the reliability of sources, accounting for potential biases. 67 | - Highlight areas necessitating further information, with recommendations for refining focus. 68 | 69 | Ensure that you identify subtle insights and potential oversights, emphasizing depth and rigor in your analysis.""", 70 | 71 | "query_generation": """You must generate specific, targeted search queries with unwavering precision to investigate discrete aspects of a research topic. Today's date: {{current_date}}. 72 | 73 | You are required to: 74 | - Craft queries in everyday language, avoiding academic or overly formal phrasing. 75 | - Ensure queries are succinct but laser-focused on pinpointing needed information. 76 | - Avoid any extraneous formatting or labeling (like numbering or categories). 77 | - Provide direct, natural-sounding queries that a real person would input into a search engine.""", 78 | 79 | "url_relevance": """You must evaluate whether the provided search result directly addresses the given query. If it does, respond with "RELEVANT". Otherwise, respond with "IRRELEVANT". Provide no additional words or statements beyond this single-word response.""", 80 | 81 | "content_analysis": """You must meticulously analyze the provided web content regarding "{{query}}" to produce a structured, in-depth examination. Your analysis must: 82 | 83 | 1. Thoroughly identify and explain major themes. 84 | 2. Extract relevant evidence, statistics, and data points in a clear, organized format. 85 | 3. Integrate details from multiple sources into cohesive, thematic sections. 86 | 4. Eliminate contradictions and duplications. 87 | 5. Evaluate source reliability briefly but directly. 88 | 6. Present extensive exploration of key concepts with robust detail. 89 | 90 | Present your findings in a methodically organized, well-structured format using clear headings, bullet points, and direct quotes where necessary.""", 91 | 92 | "source_reliability": """You must examine this source in two strictly delineated parts: 93 | 94 | PART 1 – RELIABILITY ASSESSMENT: 95 | Rate reliability as HIGH, MEDIUM, or LOW based on domain reputation, author expertise, citations, objectivity, and recency. Provide a concise rationale (1-2 sentences). 96 | 97 | PART 2 – EXTRACTED CONTENT: 98 | Deliver an exhaustive extraction of all relevant data, statistics, opinions, methodologies, and context directly related to the query. Do not omit any critical information. Be thorough yet organized.""", 99 | 100 | "report_generation": """You must compile a comprehensive research report. Today's date: {{current_date}}. 101 | 102 | MANDATORY REQUIREMENTS: 103 | 1. DO NOT begin with a "Research Framework," "Objective," or any meta-commentary. Start with a # Title. 104 | 2. The structure must be entirely dynamic with headings that reflect the content naturally. 105 | 3. Substantiate factual statements with appropriate references. 106 | 4. Provide detailed paragraphs for every major topic or section. 107 | 108 | MARKDOWN ENFORCEMENT: 109 | - Use headings (#, ##, ###) carefully to maintain a hierarchical structure. 110 | - Incorporate tables, bolding, italics, code blocks, blockquotes, and horizontal rules as appropriate. 111 | - Maintain significant spacing for readability. 112 | 113 | CONTENT VOLUME AND DEPTH: 114 | - Each main section should be comprehensive and detailed. 
115 | - Offer thorough historical context, theoretical underpinnings, practical applications, and future perspectives. 116 | - Provide a high level of detail, including multiple examples and case studies. 117 | 118 | REFERENCES: 119 | - Include well-chosen references that support key claims. 120 | - Cite them in bracketed numeric form [1], [2], etc., with a single reference list at the end. 121 | 122 | STRICT META AND FORMATTING RULES: 123 | - Never include extraneous statements about your process, the research framework, or time taken. 124 | - The final document should read as a polished, standalone publication of the highest scholarly caliber. 125 | {{objective_instruction}}""", 126 | 127 | "clarify_query": """You must generate clarifying questions to refine the research query with strict adherence to: 128 | - Eliciting specific details about user goals, scope, and knowledge level. 129 | - Avoiding extraneous or trivial queries. 130 | - Providing precisely 4-5 targeted questions. 131 | 132 | Today's date: {{current_date}}. 133 | 134 | These questions must seek to clarify the exact focal points, the depth of detail, constraints, and user background knowledge. Provide them succinctly and plainly, with no added commentary.""", 135 | 136 | "refine_query": """You must refine the research query into a strict, focused direction based on user-provided answers. Today's date: {{current_date}}. 137 | 138 | REQUIREMENTS: 139 | - DO NOT present any "Research Framework" or "Objective" headings. 140 | - Provide a concise topic statement followed by 2-3 paragraphs integrating all key points from the user. 141 | - Preserve all critical details mentioned by the user. 142 | - The format must be simple plain text with no extraneous headings or bullet points.""", 143 | 144 | "report_enhancement": """You must enhance an existing research report for greater depth and clarity. Today's date: {{current_date}}. 145 | 146 | MANDATORY ENHANCEMENT DIRECTIVES: 147 | 1. Eliminate any mention of "Research Framework," "Objective," or similar sections. 148 | 2. Start with a # heading for the report title, with no meta-commentary. 149 | 3. Use references that provide valuable supporting evidence. 150 | 4. Transform each section into a thorough analysis with comprehensive paragraphs. 151 | 5. Use markdown formatting, including headings, bold, italics, code blocks, blockquotes, tables, and horizontal rules, to create a highly readable, visually structured document. 152 | 6. Omit any mention of time spent or processes used to generate the report. 153 | 154 | CONTENT ENHANCEMENT: 155 | - Improve depth and clarity throughout. 156 | - Provide more examples, historical backgrounds, theoretical frameworks, and future directions. 157 | - Compare multiple viewpoints and delve into technical complexities. 158 | - Maintain cohesive narrative flow and do not introduce contradictory information. 159 | 160 | Your final product must be an authoritative work that exhibits academic-level depth, thoroughness, and clarity.""", 161 | 162 | "section_expansion": """You must significantly expand the specified section of the research report. Strictly adhere to the following: 163 | 164 | - Add newly written paragraphs of in-depth analysis and context. 165 | - Employ extensive markdown for headings, tables, bold highlights, italics, code blocks, blockquotes, and lists. 166 | - Include comprehensive examples, case studies, historical trajectories, theoretical frameworks, and nuanced viewpoints. 
167 | 168 | Transform this section into an authoritative, stand-alone piece that could be published independently, demonstrating meticulous scholarship and thorough reasoning. 169 | 170 | Section to expand: {{section}}""", 171 | 172 | "smart_source_selection": """You must carefully select the most critical 15-25 sources from a large set. Your selection must follow these strict standards: 173 | 174 | 1. DIRECT RELEVANCE: The source must explicitly address the core research question. 175 | 2. INFORMATION DENSITY: The source must provide significant unique data. 176 | 3. CREDIBILITY: The source must be authoritative and reliable. 177 | 4. RECENCY: The source must be updated enough for the topic. 178 | 5. DIVERSITY: The source must offer unique perspectives or insights. 179 | 6. DEPTH: The source must present thorough, detailed analysis. 180 | 181 | Present only the URLs of the selected sources, ordered by overall value, with no justifications or commentary.""", 182 | 183 | "citation_formatter": """You must format each source into a rigorous citation that includes: 184 | - Publication or website name 185 | - Author(s) if available 186 | - Title of the article or page 187 | - Publication date if available 188 | - URL 189 | 190 | Number each citation in sequential bracketed format [n]. Maintain consistency and do not add any extra explanations or remarks. Provide citations only, with correct, clear structure.""", 191 | 192 | "multi_step_synthesis": """You must perform a multi-step synthesis of research findings. Current date: {{current_date}}. 193 | 194 | In this step ({{step_number}} of {{total_steps}}), you are strictly required to: 195 | {{current_step}} 196 | 197 | Guidelines: 198 | 1. Integrate information from multiple sources into a coherent narrative on the specified aspect. 199 | 2. Identify patterns and connections relevant to this focus. 200 | 3. Develop a thorough, evidence-backed analysis with examples. 201 | 4. Note any contradictions or open questions. 202 | 5. Build upon prior steps to move toward a comprehensive final report. 203 | 204 | Your synthesis must be precise, deeply reasoned, and self-consistent. Provide multiple paragraphs of thorough explanation.""" 205 | } 206 | 207 | # User prompts 208 | USER_PROMPTS: Dict[str, str] = { 209 | "reflection": """You must deliver a deeply detailed analysis of current findings, strictly following these points: 210 | 211 | 1. Clearly state the key insights discovered, assessing evidence strength. 212 | 2. Identify critical unanswered questions and explain their significance. 213 | 3. Evaluate the reliability and biases of sources. 214 | 4. Pinpoint areas needing deeper inquiry, suggesting investigative methods. 215 | 5. Highlight subtle patterns or connections among sources. 216 | 6. Disregard irrelevant or tangential information. 217 | 218 | Ensure your analysis is methodical, multi-perspectival, and strictly evidence-based. Provide structured paragraphs with logical progression.""", 219 | 220 | "query_generation": """Generate {{breadth}} strictly focused search queries to investigate the main query: {{query}} 221 | 222 | Informed by the current findings and reflection: {{findings}} 223 | 224 | INSTRUCTIONS FOR YOUR QUERIES: 225 | 1. Each query must be phrased in natural, conversational language. 226 | 2. Keep them concise, typically under 10 words. 227 | 3. Address explicit knowledge gaps identified in the reflection. 228 | 4. Do not number or list them. Place each query on its own line. 229 | 5. Avoid academic or formal language. 
230 | 231 | Provide only the queries, nothing else.""", 232 | 233 | "url_relevance": """You must judge if the following search result directly addresses the query. If yes, respond "RELEVANT"; if no, respond "IRRELEVANT". Supply only that single word. 234 | 235 | Query: {{query}} 236 | Title: {{title}} 237 | URL: {{url}} 238 | Snippet: {{snippet}}""", 239 | 240 | "content_analysis": """You must carefully analyze the provided content for "{{query}}" and produce a comprehensive thematic report. The content is: 241 | 242 | {{content}} 243 | 244 | Your analysis must include: 245 | 1. Clear identification of major themes. 246 | 2. Exhaustive extraction of facts, statistics, and data. 247 | 3. Organized sections that integrate multiple sources. 248 | 4. Background context for significance. 249 | 5. Comparison of differing perspectives or methodologies. 250 | 6. Detailed case studies and examples. 251 | 252 | Use markdown headings and bullet points for clarity. Include direct quotes for notable expert statements. Bold key findings or statistics for emphasis. Focus on thoroughness and precision.""", 253 | 254 | "source_reliability": """Source URL: {{url}} 255 | Title: {{title}} 256 | Query: {{query}} 257 | Content: {{content}} 258 | 259 | You must respond in two segments: 260 | 261 | RELIABILITY: 262 | - Rate the source as HIGH, MEDIUM, or LOW. In 1-2 sentences, justify your rating using domain authority, author credentials, objectivity, and methodological soundness. 263 | 264 | EXTRACTED_CONTENT: 265 | - Provide every relevant data point, example, statistic, or expert opinion from the source. Organize logically and maintain fidelity to the source's meaning. 266 | 267 | No additional commentary is permitted beyond these two required sections.""", 268 | 269 | "report_generation": """You must produce an all-encompassing research report for the query: {{query}} 270 | 271 | Analyzed Findings: {{analyzed_findings}} 272 | Number of sources: {{num_sources}} 273 | 274 | MANDATORY REQUIREMENTS: 275 | - The final document must exceed 15,000 words, with no exceptions. 276 | - Do NOT include a "Research Framework" or "Objective" heading. 277 | - Start with a descriptive title using #, then proceed to a detailed introduction. 278 | - Restrict references to a maximum of 15-25 carefully selected sources. 279 | - Each major topic requires 7-10 paragraphs of deep analysis. 280 | 281 | STRUCTURE: 282 | 1. Title 283 | 2. Introduction (500-800 words minimum) 284 | 3. Main Body: 5-10 major sections, each at least 1,000-1,500 words, subdivided into 3-5 subsections. 285 | 4. Conclusion (800-1,000 words) summarizing insights and projecting future directions. 286 | 5. References: 15-25 high-quality sources, numbered [1], [2], etc. 287 | 288 | CONTENT DEMANDS: 289 | - Provide extensive details, including examples, comparisons, and historical context. 290 | - Discuss theories, practical applications, and prospective developments. 291 | - Weave in data from your analysis but do not rely on repeated citations. 292 | - Maintain an authoritative tone with thorough arguments, disclaimers for speculation, and consistent use of markdown elements. 293 | 294 | Deliver a final product that stands as a definitive, publishable resource on this topic.""", 295 | 296 | "initialize": """Formulate a comprehensive plan for researching: 297 | {{query}} 298 | 299 | You must: 300 | 1. Identify 5-7 major aspects of the topic. 301 | 2. Specify key questions for each aspect. 302 | 3. Propose relevant sources (academic, governmental, etc.). 303 | 4. 
Outline the methodological approach for thorough coverage. 304 | 5. Anticipate potential obstacles and suggest mitigating strategies. 305 | 6. Highlight possible cross-cutting themes. 306 | 307 | Present your response as plain text with simple section headings. Remain direct and systematic, without superfluous elaboration or meta commentary.""", 308 | 309 | "clarify_query": """You must generate 4-5 follow-up questions to further pinpoint the research scope for "{{query}}". These questions must: 310 | 311 | 1. Narrow down or clarify the exact topic aspects the user prioritizes. 312 | 2. Determine the technical depth or simplicity required. 313 | 3. Identify relevant time frames, geographies, or perspectives. 314 | 4. Probe for the user's background knowledge and specific interests. 315 | 316 | Keep each question concise and purposeful. Avoid extraneous details or explanations.""", 317 | 318 | "refine_query": """Original query: {{query}} 319 | Follow-up Q&A: 320 | {{qa}} 321 | 322 | You must finalize a refined research direction by: 323 | 324 | 1. Stating a concise topic statement without additional labels. 325 | 2. Expanding it in 2-3 paragraphs that incorporate all relevant user concerns, constraints, and goals. 326 | 327 | Remember: 328 | - Never refer to any "Research Framework" or structural headings. 329 | - Write in natural, flowing text without bullet points. 330 | - Provide no meta commentary about the research process.""", 331 | 332 | "report_enhancement": """You must enhance the following research report to dramatically increase its depth and scope: 333 | 334 | {{initial_report}} 335 | 336 | REQUIRED: 337 | - At least double the existing word count. 338 | - Expand each section with additional paragraphs of analysis, examples, and context. 339 | - Keep references consistent but do not add more than the existing cited sources. 340 | - Use advanced markdown formatting, maintain logical flow, and strictly avoid contradictory information. 341 | 342 | Aim for a polished and authoritative final version with thoroughly developed arguments in every section.""", 343 | 344 | "section_expansion": """Expand the following research report section significantly: 345 | 346 | {{section}} 347 | 348 | MANDATORY: 349 | 1. Add 3-5 new paragraphs with deeper analysis, examples, or data. 350 | 2. Incorporate alternative perspectives, historical background, or technical details. 351 | 3. Retain the original content but build upon it. 352 | 353 | Maintain the same style and referencing system, avoiding contradictions or redundant text. Ensure the expansion is coherent and stands as a robust discourse on the topic.""", 354 | 355 | "smart_source_selection": """Your mission is to filter sources for the research on {{query}} to only the most essential 15-20. The sources are: 356 | 357 | {{sources}} 358 | 359 | SELECTION CRITERIA: 360 | 1. Relevance to the core question. 361 | 2. Credibility and authority. 362 | 3. Uniqueness of perspective or data. 363 | 4. Depth of analysis offered. 364 | 365 | Provide the final list of chosen sources, ranked by priority, and include a brief rationale for each. 
Summaries must be concise and free from extraneous commentary.""", 366 | 367 | "citation_formatter": """Format the following sources into standardized references: 368 | 369 | {{sources}} 370 | 371 | Each citation must: 372 | - Include publication name or website 373 | - List author(s) if available 374 | - Provide the title 375 | - Give the publication date if available 376 | - Show the URL 377 | 378 | Use a numbered [n] format for each entry. Maintain consistency and brevity, without additional remarks beyond these essential details.""", 379 | 380 | "multi_step_synthesis": """You must perform a targeted synthesis step for the multi-step process. For this specific portion: 381 | 382 | {{current_step}} 383 | 384 | Relevant findings: 385 | {{findings}} 386 | 387 | Instructions: 388 | 1. Integrate the above findings cohesively, focusing on {{current_step}}. 389 | 2. Identify patterns, discrepancies, or important details relevant to the broader topic. 390 | 3. Provide thorough explanations, citing data where pertinent. 391 | 4. Connect this step to the overall research direction. 392 | 393 | This is step {{step_number}} of {{total_steps}} in a multi-layered synthesis. Produce a clear, detailed discussion of your progress here, strictly guided by the given instructions.""" 394 | } 395 | -------------------------------------------------------------------------------- /shandu/research/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Research module for Shandu deep research system. 3 | """ 4 | 5 | from .researcher import DeepResearcher, ResearchResult 6 | 7 | __all__ = ["DeepResearcher", "ResearchResult"] 8 | -------------------------------------------------------------------------------- /shandu/research/researcher.py: -------------------------------------------------------------------------------- 1 | """Research module implementation.""" 2 | from typing import List, Dict, Optional, Any, Union 3 | from dataclasses import dataclass, field 4 | from datetime import datetime 5 | import json 6 | from pathlib import Path 7 | import os 8 | 9 | @dataclass 10 | class ResearchResult: 11 | """Container for research results with enhanced citation tracking.""" 12 | query: str 13 | summary: str 14 | sources: List[Dict[str, Any]] 15 | subqueries: List[str] 16 | depth: int 17 | content_analysis: Optional[List[Dict[str, Any]]] = None 18 | chain_of_thought: Optional[List[str]] = None 19 | research_stats: Optional[Dict[str, Any]] = None 20 | citation_stats: Optional[Dict[str, Any]] = None # New field for tracking citation statistics 21 | timestamp: datetime = field(default_factory=datetime.now) 22 | 23 | def to_markdown(self, include_chain_of_thought: bool = False, include_objective: bool = False) -> str: 24 | """Convert research results to markdown format including citation statistics.""" 25 | stats = self.research_stats or {} 26 | elapsed_time = stats.get("elapsed_time_formatted", "Unknown") 27 | sources_count = stats.get("sources_count", len(self.sources)) 28 | subqueries_count = stats.get("subqueries_count", len(self.subqueries)) 29 | 30 | citation_stats = self.citation_stats or {} 31 | total_sources = citation_stats.get("total_sources", sources_count) 32 | total_learnings = citation_stats.get("total_learnings", 0) 33 | 34 | summary = self.summary 35 | 36 | lines = summary.split("\n") 37 | 38 | # Remove specific artifacts that can appear in the output 39 | cleaned_lines = [] 40 | for line in lines: 41 | # Skip lines with these patterns 42 | if 
(line.strip().startswith("*Generated on:") or 43 | line.strip().startswith("Completed:") or 44 | "Here are" in line and ("search queries" in line or "queries to investigate" in line) or 45 | line.strip() == "Research Framework:" or 46 | "Key Findings:" in line or 47 | "Key aspects to focus on:" in line): 48 | continue 49 | cleaned_lines.append(line) 50 | 51 | summary = "\n".join(cleaned_lines) 52 | 53 | # Fix the "Research Report: **Objective:**" formatting issue 54 | if summary.startswith("# Research Report: **Objective:**"): 55 | summary = summary.replace("# Research Report: **Objective:**", "# Research Report") 56 | 57 | # Remove objective section if not requested 58 | if not include_objective and "**Objective:**" in summary: 59 | # Split by sections 60 | parts = summary.split("## ") 61 | filtered_parts = [] 62 | 63 | for part in parts: 64 | # Keep executive summary or empty parts 65 | if part.startswith("Executive Summary") or not part.strip(): 66 | filtered_parts.append(part) 67 | continue 68 | 69 | # Skip objective section 70 | if "**Objective:**" in part and "**Key Aspects to Focus On:**" in part: 71 | continue 72 | 73 | # Keep other sections 74 | filtered_parts.append(part) 75 | 76 | # Reconstruct the summary 77 | if filtered_parts: 78 | if not filtered_parts[0].startswith("Executive Summary"): 79 | summary = "## ".join(filtered_parts) 80 | else: 81 | summary = filtered_parts[0] + "## " + "## ".join(filtered_parts[1:]) 82 | 83 | md = [ 84 | f"# {self.query}\n", 85 | f"{summary}\n" 86 | ] 87 | 88 | md.append("## Research Process\n") 89 | md.append(f"- **Depth**: {self.depth}") 90 | md.append(f"- **Breadth**: {stats.get('breadth', 'Not specified')}") 91 | md.append(f"- **Time Taken**: {elapsed_time}") 92 | md.append(f"- **Subqueries Explored**: {subqueries_count}") 93 | md.append(f"- **Sources Analyzed**: {sources_count}") 94 | 95 | if total_learnings > 0: 96 | md.append(f"- **Total Learnings Extracted**: {total_learnings}") 97 | md.append(f"- **Source Coverage**: {total_sources} sources with {total_learnings} tracked information points") 98 | 99 | source_reliability = citation_stats.get("source_reliability", {}) 100 | if source_reliability: 101 | md.append(f"- **Source Quality**: {len(source_reliability)} domains assessed for reliability\n") 102 | else: 103 | md.append("") 104 | else: 105 | md.append("") 106 | 107 | if include_chain_of_thought and self.chain_of_thought: 108 | md.append("## Research Process: Chain of Thought\n") 109 | significant_thoughts = [] 110 | 111 | for thought in self.chain_of_thought: 112 | # Skip generic or repetitive thoughts and output artifacts 113 | if any(x in thought.lower() for x in [ 114 | "searching for", "selected relevant url", "completed", 115 | "here are", "generated search queries", "queries to investigate" 116 | ]): 117 | continue 118 | significant_thoughts.append(thought) 119 | 120 | if len(significant_thoughts) > 20: 121 | selected_thoughts = ( 122 | significant_thoughts[:5] + 123 | significant_thoughts[len(significant_thoughts)//2-2:len(significant_thoughts)//2+3] + 124 | significant_thoughts[-5:] 125 | ) 126 | else: 127 | selected_thoughts = significant_thoughts 128 | 129 | for thought in selected_thoughts: 130 | md.append(f"- {thought}") 131 | md.append("") 132 | 133 | return "\n".join(md) 134 | 135 | def to_dict(self) -> Dict[str, Any]: 136 | """Convert to dictionary format.""" 137 | result = { 138 | "query": self.query, 139 | "summary": self.summary, 140 | "sources": self.sources, 141 | "subqueries": self.subqueries, 142 | "depth": 
self.depth, 143 | "content_analysis": self.content_analysis, 144 | "chain_of_thought": self.chain_of_thought, 145 | "research_stats": self.research_stats, 146 | "timestamp": self.timestamp.isoformat() 147 | } 148 | 149 | if self.citation_stats: 150 | result["citation_stats"] = self.citation_stats 151 | 152 | return result 153 | 154 | def save_to_file(self, filepath: str, include_chain_of_thought: bool = False, include_objective: bool = False) -> None: 155 | """Save research results to a file.""" 156 | directory = os.path.dirname(filepath) 157 | if directory: 158 | os.makedirs(directory, exist_ok=True) 159 | 160 | _, ext = os.path.splitext(filepath) 161 | ext = ext.lower() 162 | 163 | if ext == '.md': 164 | # Save as markdown 165 | with open(filepath, 'w', encoding='utf-8') as f: 166 | f.write(self.to_markdown(include_chain_of_thought, include_objective)) 167 | elif ext == '.json': 168 | # Save as JSON 169 | with open(filepath, 'w', encoding='utf-8') as f: 170 | json.dump(self.to_dict(), f, indent=2, default=str) 171 | else: 172 | # Default to markdown 173 | with open(filepath, 'w', encoding='utf-8') as f: 174 | f.write(self.to_markdown(include_chain_of_thought, include_objective)) 175 | 176 | @classmethod 177 | def from_dict(cls, data: Dict[str, Any]) -> 'ResearchResult': 178 | """Create a ResearchResult from a dictionary.""" 179 | if 'timestamp' in data and isinstance(data['timestamp'], str): 180 | data['timestamp'] = datetime.fromisoformat(data['timestamp']) 181 | 182 | return cls(**data) 183 | 184 | @classmethod 185 | def load_from_file(cls, filepath: str) -> 'ResearchResult': 186 | """Load research results from a file.""" 187 | with open(filepath, 'r', encoding='utf-8') as f: 188 | data = json.load(f) 189 | 190 | return cls.from_dict(data) 191 | 192 | class DeepResearcher: 193 | """Research orchestrator.""" 194 | def __init__( 195 | self, 196 | output_dir: Optional[str] = None, 197 | save_results: bool = True, 198 | auto_save_interval: Optional[int] = None 199 | ): 200 | """Initialize the researcher.""" 201 | self.output_dir = output_dir or os.path.expanduser("~/shandu_research") 202 | self.save_results = save_results 203 | self.auto_save_interval = auto_save_interval 204 | 205 | if self.save_results: 206 | os.makedirs(self.output_dir, exist_ok=True) 207 | 208 | def get_output_path(self, query: str, format: str = 'md') -> str: 209 | """Get output path for research results.""" 210 | sanitized = "".join(c if c.isalnum() or c in " -_" else "_" for c in query) 211 | sanitized = sanitized[:50] 212 | 213 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 214 | filename = f"{sanitized}_{timestamp}.{format}" 215 | 216 | return os.path.join(self.output_dir, filename) 217 | 218 | async def research( 219 | self, 220 | query: str, 221 | strategy: str = 'langgraph', 222 | **kwargs 223 | ) -> ResearchResult: 224 | """Perform research using the specified strategy.""" 225 | from ..agents.langgraph_agent import ResearchGraph 226 | from ..agents.agent import ResearchAgent 227 | 228 | result = None 229 | 230 | if strategy == 'langgraph': 231 | graph = ResearchGraph() 232 | result = await graph.research(query, **kwargs) 233 | elif strategy == 'agent': 234 | agent = ResearchAgent() 235 | result = await agent.research(query, **kwargs) 236 | else: 237 | raise ValueError(f"Unknown research strategy: {strategy}") 238 | 239 | if self.save_results and result: 240 | md_path = self.get_output_path(query, 'md') 241 | result.save_to_file(md_path) 242 | 243 | json_path = self.get_output_path(query, 'json') 244 | 
result.save_to_file(json_path) 245 | 246 | return result 247 | 248 | def research_sync( 249 | self, 250 | query: str, 251 | strategy: str = 'langgraph', 252 | **kwargs 253 | ) -> ResearchResult: 254 | """Synchronous research wrapper.""" 255 | import asyncio 256 | return asyncio.run(self.research(query, strategy, **kwargs)) 257 | -------------------------------------------------------------------------------- /shandu/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scraper module for Shandu deep research system. 3 | """ 4 | 5 | from .scraper import WebScraper, ScrapedContent 6 | 7 | __all__ = ["WebScraper", "ScrapedContent"] 8 | -------------------------------------------------------------------------------- /shandu/search/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search module for Shandu deep research system. 3 | """ 4 | 5 | from .search import UnifiedSearcher, SearchResult 6 | 7 | __all__ = ["UnifiedSearcher", "SearchResult"] 8 | -------------------------------------------------------------------------------- /shandu/search/ai_search.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional, Any, Union 2 | import asyncio 3 | import time 4 | from dataclasses import dataclass, field 5 | from datetime import datetime 6 | from langchain_core.prompts import ChatPromptTemplate 7 | from langchain_openai import ChatOpenAI 8 | from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults 9 | from .search import UnifiedSearcher, SearchResult 10 | from ..config import config 11 | from ..scraper import WebScraper, ScrapedContent 12 | from ..agents.utils.citation_manager import CitationManager, SourceInfo 13 | 14 | @dataclass 15 | class AISearchResult: 16 | """Container for AI-enhanced search results with enriched output and citation tracking.""" 17 | query: str 18 | summary: str 19 | sources: List[Dict[str, Any]] 20 | citation_stats: Optional[Dict[str, Any]] = None 21 | timestamp: datetime = field(default_factory=datetime.now)  # per-instance default, matching ResearchResult 22 | 23 | def to_markdown(self) -> str: 24 | """Convert to markdown format with improved readability.""" 25 | timestamp_str = self.timestamp.strftime('%Y-%m-%d %H:%M:%S') 26 | md = [ 27 | f"# {self.query}", 28 | "## Summary", 29 | self.summary, 30 | "## Sources" 31 | ] 32 | for i, source in enumerate(self.sources, 1): 33 | title = source.get('title', 'Untitled') 34 | url = source.get('url', '') 35 | snippet = source.get('snippet', '') 36 | source_type = source.get('source', 'Unknown') 37 | md.append(f"### {i}. 
{title}") 38 | if url: 39 | md.append(f"- **URL:** [{url}]({url})") 40 | if source_type: 41 | md.append(f"- **Source:** {source_type}") 42 | if snippet: 43 | md.append(f"- **Snippet:** {snippet}") 44 | md.append("") 45 | 46 | if self.citation_stats: 47 | md.append("## Research Process") 48 | md.append(f"- **Sources Analyzed**: {self.citation_stats.get('total_sources', len(self.sources))}") 49 | md.append(f"- **Key Information Points**: {self.citation_stats.get('total_learnings', 0)}") 50 | if self.citation_stats.get('source_reliability'): 51 | md.append(f"- **Source Quality**: {len(self.citation_stats.get('source_reliability', {}))} domains assessed") 52 | md.append("") 53 | 54 | return "\n".join(md) 55 | 56 | def to_dict(self) -> Dict[str, Any]: 57 | """Convert to dictionary format.""" 58 | result = { 59 | "query": self.query, 60 | "summary": self.summary, 61 | "sources": self.sources, 62 | "timestamp": self.timestamp.isoformat() 63 | } 64 | if self.citation_stats: 65 | result["citation_stats"] = self.citation_stats 66 | return result 67 | 68 | class AISearcher: 69 | """ 70 | AI-powered search functionality. 71 | Combines search results with AI analysis for any type of query. 72 | Enhanced with scraping capability, detailed outputs, source citations, and learning extraction. 73 | """ 74 | def __init__( 75 | self, 76 | llm: Optional[ChatOpenAI] = None, 77 | searcher: Optional[UnifiedSearcher] = None, 78 | scraper: Optional[WebScraper] = None, 79 | citation_manager: Optional[CitationManager] = None, 80 | max_results: int = 10, 81 | max_pages_to_scrape: int = 3 82 | ): 83 | api_base = config.get("api", "base_url") 84 | api_key = config.get("api", "api_key") 85 | model = config.get("api", "model") 86 | self.llm = llm or ChatOpenAI( 87 | base_url=api_base, 88 | api_key=api_key, 89 | model=model, 90 | temperature=0.4, 91 | max_tokens=8192 92 | ) 93 | self.searcher = searcher or UnifiedSearcher(max_results=max_results) 94 | self.scraper = scraper or WebScraper() 95 | self.citation_manager = citation_manager or CitationManager() 96 | self.max_results = max_results 97 | self.max_pages_to_scrape = max_pages_to_scrape 98 | 99 | self.ddg_search = DuckDuckGoSearchRun() 100 | self.ddg_results = DuckDuckGoSearchResults(output_format="list") 101 | 102 | async def search( 103 | self, 104 | query: str, 105 | engines: Optional[List[str]] = None, 106 | detailed: bool = False, 107 | enable_scraping: bool = True, 108 | use_ddg_tools: bool = True 109 | ) -> AISearchResult: 110 | """ 111 | Perform AI-enhanced search with detailed outputs and source citations. 
112 | 113 | Args: 114 | query: Search query (can be about any topic) 115 | engines: List of search engines to use 116 | detailed: Whether to generate a detailed analysis 117 | enable_scraping: Whether to scrape content from top results 118 | use_ddg_tools: Whether to use DuckDuckGo tools from langchain_community 119 | 120 | Returns: 121 | AISearchResult object with a comprehensive summary and cited sources 122 | """ 123 | timestamp = datetime.now() 124 | sources = [] 125 | 126 | # Use DuckDuckGo tools if enabled 127 | if use_ddg_tools and (not engines or 'duckduckgo' in engines): 128 | try: 129 | 130 | ddg_structured_results = self.ddg_results.invoke(query) 131 | for result in ddg_structured_results[:self.max_results]: 132 | source_info = { 133 | "title": result.get("title", "Untitled"), 134 | "url": result.get("link", ""), 135 | "snippet": result.get("snippet", ""), 136 | "source": "DuckDuckGo" 137 | } 138 | sources.append(source_info) 139 | 140 | # Register source with citation manager 141 | self._register_source_with_citation_manager(source_info) 142 | except Exception as e: 143 | print(f"Error using DuckDuckGoSearchResults: {e}") 144 | 145 | # Use UnifiedSearcher as a fallback or if DuckDuckGo tools are disabled 146 | if not sources or not use_ddg_tools: 147 | search_results = await self.searcher.search(query, engines) 148 | 149 | # Collect all sources 150 | for result in search_results: 151 | if isinstance(result, SearchResult): 152 | result_dict = result.to_dict() 153 | sources.append(result_dict) 154 | 155 | # Register source with citation manager 156 | self._register_source_with_citation_manager(result_dict) 157 | elif isinstance(result, dict): 158 | sources.append(result) 159 | 160 | # Register source with citation manager 161 | self._register_source_with_citation_manager(result) 162 | 163 | # Scrape additional content if enabled 164 | if enable_scraping: 165 | urls_to_scrape = [] 166 | for source in sources: 167 | if source.get('url') and len(urls_to_scrape) < self.max_pages_to_scrape: 168 | urls_to_scrape.append(source['url']) 169 | if urls_to_scrape: 170 | print(f"Scraping {len(urls_to_scrape)} pages for deeper insights...") 171 | scraped_results = await self.scraper.scrape_urls(urls_to_scrape, dynamic=True) 172 | for scraped in scraped_results: 173 | if hasattr(scraped, 'is_successful') and scraped.is_successful(): 174 | try: 175 | main_content = scraped.text 176 | if hasattr(self.scraper, 'extract_main_content'): 177 | main_content = await self.scraper.extract_main_content(scraped) 178 | if "unexpected error" in main_content.lower(): 179 | continue 180 | preview = main_content[:500] + ("...(truncated)" if len(main_content) > 1500 else "") 181 | source_info = { 182 | "title": scraped.title, 183 | "url": scraped.url, 184 | "snippet": preview, 185 | "source": "Scraped Content" 186 | } 187 | sources.append(source_info) 188 | 189 | # Register source with citation manager and extract learnings 190 | source_id = self._register_source_with_citation_manager(source_info) 191 | if source_id and main_content: 192 | self.citation_manager.extract_learning_from_text( 193 | main_content, 194 | scraped.url, 195 | context=f"Search query: {query}" 196 | ) 197 | except Exception as e: 198 | print(f"Error processing scraped content from {scraped.url}: {e}") 199 | 200 | # Prepare sources with improved citation format 201 | aggregated_text = "" 202 | for i, source in enumerate(sources, 1): 203 | 204 | url = source.get('url', '') 205 | domain = url.split("//")[1].split("/")[0] if "//" in url else 
"Unknown Source" 206 | # Capitalize first letter of domain for a more professional look 207 | domain_name = domain.split('.')[0].capitalize() if '.' in domain else domain 208 | 209 | aggregated_text += ( 210 | f"[{i}] {domain_name}\n" 211 | f"Title: {source.get('title', 'Untitled')}\n" 212 | f"URL: {url}\n" 213 | f"Snippet: {source.get('snippet', '')}\n\n" 214 | ) 215 | 216 | current_date = timestamp.strftime('%Y-%m-%d') 217 | if detailed: 218 | detail_instruction = ( 219 | "Provide a detailed analysis with in-depth explanations, " 220 | "specific examples, relevant background, and additional insights " 221 | "to enhance understanding of the topic." 222 | ) 223 | else: 224 | detail_instruction = "Provide a concise yet informative summary, focusing on the key points and essential information." 225 | 226 | final_prompt = f"""You are Shandu, an expert analyst. Based on the following sources retrieved on {current_date} for the query "{query}", {detail_instruction} 227 | 228 | - If the query is a question, answer it directly with a thorough explanation. 229 | - If it's a topic, provide a well-rounded overview with supporting details. 230 | - Use bullet points or numbered lists to organize information clearly. 231 | - If there are conflicting views or uncertainties, discuss them explicitly. 232 | - When providing information, cite the source by using the number in square brackets, like [1], to indicate where the information was sourced. 233 | - ONLY use the citation numbers provided in the sources below. 234 | - DO NOT include years or dates in your citations, just use the bracketed number like [1]. 235 | - Ensure the response is engaging, detailed, and written in plain text suitable for all readers. 236 | 237 | Sources: 238 | 239 | {aggregated_text} 240 | """ 241 | 242 | final_output = await self.llm.ainvoke(final_prompt) 243 | 244 | citation_stats = None 245 | if sources: 246 | citation_stats = { 247 | "total_sources": len(self.citation_manager.sources), 248 | "total_learnings": len(self.citation_manager.learnings), 249 | "source_reliability": self.citation_manager._calculate_source_reliability() 250 | } 251 | 252 | return AISearchResult( 253 | query=query, 254 | summary=final_output.content.strip(), 255 | sources=sources, 256 | citation_stats=citation_stats, 257 | timestamp=timestamp 258 | ) 259 | 260 | def _register_source_with_citation_manager(self, source: Dict[str, Any]) -> Optional[str]: 261 | """Register a source with the citation manager and return its ID.""" 262 | try: 263 | url = source.get('url', '') 264 | if not url: 265 | return None 266 | 267 | title = source.get('title', 'Untitled') 268 | snippet = source.get('snippet', '') 269 | source_type = source.get('source', 'web') 270 | 271 | domain = url.split("//")[1].split("/")[0] if "//" in url else "unknown" 272 | 273 | source_info = SourceInfo( 274 | url=url, 275 | title=title, 276 | snippet=snippet, 277 | source_type=source_type, 278 | content_type="article", 279 | access_time=time.time(), 280 | domain=domain, 281 | reliability_score=0.8, # Default score 282 | metadata=source 283 | ) 284 | 285 | return self.citation_manager.add_source(source_info) 286 | 287 | except Exception as e: 288 | print(f"Error registering source with citation manager: {e}") 289 | return None 290 | 291 | def search_sync( 292 | self, 293 | query: str, 294 | engines: Optional[List[str]] = None, 295 | detailed: bool = False, 296 | enable_scraping: bool = True, 297 | use_ddg_tools: bool = True 298 | ) -> AISearchResult: 299 | """Synchronous version of the search 
method.""" 300 | return asyncio.run(self.search(query, engines, detailed, enable_scraping, use_ddg_tools)) 301 | -------------------------------------------------------------------------------- /shandu/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging utilities for Shandu. 3 | """ 4 | import os 5 | import logging 6 | import traceback 7 | import inspect 8 | from datetime import datetime 9 | from pathlib import Path 10 | 11 | log_dir = os.path.expanduser("~/.shandu/logs") 12 | Path(log_dir).mkdir(parents=True, exist_ok=True) 13 | 14 | logger = logging.getLogger("shandu") 15 | logger.setLevel(logging.DEBUG) 16 | 17 | current_date = datetime.now().strftime("%Y-%m-%d") 18 | log_file = os.path.join(log_dir, f"shandu_{current_date}.log") 19 | file_handler = logging.FileHandler(log_file, encoding='utf-8') 20 | file_handler.setLevel(logging.DEBUG) 21 | 22 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s') 23 | file_handler.setFormatter(formatter) 24 | 25 | logger.addHandler(file_handler) 26 | 27 | def get_caller_filename(): 28 | """ 29 | Get the filename of the caller. 30 | 31 | Returns: 32 | str: The filename of the caller. 33 | """ 34 | 35 | stack = inspect.stack() 36 | # The caller is the third frame in the stack (index 2) 37 | caller_frame = stack[2] 38 | 39 | caller_filename = os.path.basename(caller_frame.filename) 40 | return caller_filename 41 | 42 | def log_error(message, error, context=None): 43 | """ 44 | Log an error with detailed information. 45 | 46 | Args: 47 | message: The error message 48 | error: The exception object 49 | context: Additional context information (optional) 50 | """ 51 | caller_filename = get_caller_filename() 52 | error_details = f"[{caller_filename}] {message}: {str(error)}" 53 | if context: 54 | error_details += f" | Context: {context}" 55 | 56 | error_details += f"\nTraceback: {traceback.format_exc()}" 57 | 58 | logger.error(error_details) 59 | 60 | def log_warning(message, context=None): 61 | """ 62 | Log a warning with context information. 63 | 64 | Args: 65 | message: The warning message 66 | context: Additional context information (optional) 67 | """ 68 | caller_filename = get_caller_filename() 69 | warning_details = f"[{caller_filename}] {message}" 70 | if context: 71 | warning_details += f" | Context: {context}" 72 | 73 | logger.warning(warning_details) 74 | 75 | def log_info(message, context=None): 76 | """ 77 | Log an info message with context information. 
78 | 79 | Args: 80 | message: The info message 81 | context: Additional context information (optional) 82 | """ 83 | caller_filename = get_caller_filename() 84 | info_details = f"[{caller_filename}] {message}" 85 | if context: 86 | info_details += f" | Context: {context}" 87 | 88 | logger.info(info_details) -------------------------------------------------------------------------------- /tests/test_citation_registry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from shandu.agents.utils.citation_registry import CitationRegistry 3 | 4 | class TestCitationRegistry(unittest.TestCase): 5 | """Basic tests for the CitationRegistry class.""" 6 | 7 | def test_citation_registration(self): 8 | """Test that citations can be registered and retrieved correctly.""" 9 | registry = CitationRegistry() 10 | 11 | # Register a few citations 12 | cid1 = registry.register_citation("https://example.com/article1") 13 | cid2 = registry.register_citation("https://example.com/article2") 14 | cid3 = registry.register_citation("https://example.com/article3") 15 | 16 | # Test citation IDs are sequential 17 | self.assertEqual(cid1, 1) 18 | self.assertEqual(cid2, 2) 19 | self.assertEqual(cid3, 3) 20 | 21 | # Test URL to ID mapping works 22 | self.assertEqual(registry.url_to_id["https://example.com/article1"], 1) 23 | self.assertEqual(registry.url_to_id["https://example.com/article2"], 2) 24 | 25 | # Test ID to URL mapping works 26 | self.assertEqual(registry.id_to_url[1], "https://example.com/article1") 27 | self.assertEqual(registry.id_to_url[2], "https://example.com/article2") 28 | 29 | # Test getting citation info 30 | self.assertEqual(registry.get_citation_info(1)["url"], "https://example.com/article1") 31 | self.assertEqual(registry.get_citation_info(2)["url"], "https://example.com/article2") 32 | 33 | def test_bulk_registration(self): 34 | """Test bulk registration of citations.""" 35 | registry = CitationRegistry() 36 | 37 | urls = [ 38 | "https://example.com/article1", 39 | "https://example.com/article2", 40 | "https://example.com/article3" 41 | ] 42 | 43 | registry.bulk_register_sources(urls) 44 | 45 | # Check all URLs were registered 46 | self.assertEqual(len(registry.citations), 3) 47 | 48 | # Check URL to ID mappings 49 | self.assertIn("https://example.com/article1", registry.url_to_id) 50 | self.assertIn("https://example.com/article2", registry.url_to_id) 51 | self.assertIn("https://example.com/article3", registry.url_to_id) 52 | 53 | def test_citation_validation(self): 54 | """Test citation validation in text.""" 55 | registry = CitationRegistry() 56 | 57 | # Register a few citations 58 | registry.register_citation("https://example.com/article1") 59 | registry.register_citation("https://example.com/article2") 60 | 61 | # Text with valid and invalid citations 62 | text = """ 63 | This is a test with valid citation [1] and another valid citation [2]. 64 | This is an invalid citation [3] that doesn't exist. 65 | Here's another mention of [1] and an out-of-range [5]. 
66 | """ 67 | 68 | result = registry.validate_citations(text) 69 | 70 | # Check validation results 71 | self.assertFalse(result["valid"]) 72 | self.assertIn(3, result["invalid_citations"]) 73 | self.assertIn(5, result["invalid_citations"]) 74 | self.assertEqual(len(result["used_citations"]), 2) 75 | self.assertIn(1, result["used_citations"]) 76 | self.assertIn(2, result["used_citations"]) 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /tests/test_report_generator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import AsyncMock, patch 3 | import asyncio 4 | from shandu.agents.processors.report_generator import format_citations 5 | from shandu.agents.utils.citation_registry import CitationRegistry 6 | 7 | class TestReportGenerator(unittest.TestCase): 8 | """Basic tests for report generation functions.""" 9 | 10 | def setUp(self): 11 | """Set up test cases.""" 12 | self.mock_llm = AsyncMock() 13 | self.mock_llm.ainvoke = AsyncMock() 14 | 15 | # Sample citation data 16 | self.sample_sources = [ 17 | {"url": "https://example.com/article1", "title": "Test Article 1", "date": "2023-01-01"}, 18 | {"url": "https://github.com/user/repo", "title": "Sample Repository", "date": "2024-02-15"} 19 | ] 20 | 21 | # Create a citation registry 22 | self.registry = CitationRegistry() 23 | self.registry.register_citation("https://example.com/article1") 24 | self.registry.register_citation("https://github.com/user/repo") 25 | 26 | # Add metadata to the citations 27 | self.registry.update_citation_metadata(1, { 28 | "title": "Test Article 1", 29 | "date": "2023-01-01" 30 | }) 31 | self.registry.update_citation_metadata(2, { 32 | "title": "Sample Repository", 33 | "date": "2024-02-15" 34 | }) 35 | 36 | def test_format_citations_sync(self): 37 | """Test format_citations function synchronously by running the async function.""" 38 | # Set up the mock to return properly formatted citations 39 | self.mock_llm.ainvoke.return_value.content = """ 40 | [1] *example.com*, "Test Article 1", https://example.com/article1 41 | [2] *github.com*, "Sample Repository", https://github.com/user/repo 42 | """ 43 | 44 | # Run the async function in a synchronous context 45 | formatted_citations = asyncio.run(format_citations( 46 | self.mock_llm, 47 | ["https://example.com/article1", "https://github.com/user/repo"], 48 | self.sample_sources, 49 | self.registry 50 | )) 51 | 52 | # Check the results 53 | self.assertIn("*example.com*", formatted_citations) 54 | self.assertIn("\"Test Article 1\"", formatted_citations) 55 | self.assertIn("https://example.com/article1", formatted_citations) 56 | 57 | # Verify the correct format (no date in citations) 58 | self.assertNotIn("2023-01-01", formatted_citations) 59 | self.assertNotIn("2024-02-15", formatted_citations) 60 | 61 | # Ensure citation numbers are properly formatted 62 | self.assertIn("[1]", formatted_citations) 63 | self.assertIn("[2]", formatted_citations) 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | --------------------------------------------------------------------------------