├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples ├── gemini-flash2.5.md ├── gemini.md ├── gpt4.0-mini.md ├── mistral_large.md └── o3-mini-high.md ├── requirements.txt ├── setup.py ├── shandu ├── README.md ├── __init__.py ├── agents │ ├── __init__.py │ ├── agent.py │ ├── graph │ │ ├── __init__.py │ │ ├── builder.py │ │ └── wrapper.py │ ├── langgraph_agent.py │ ├── nodes │ │ ├── __init__.py │ │ ├── citations.py │ │ ├── generate_queries.py │ │ ├── initialize.py │ │ ├── reflect.py │ │ ├── report_generation.py │ │ ├── search.py │ │ └── source_selection.py │ ├── processors │ │ ├── __init__.py │ │ ├── content_processor.py │ │ └── report_generator.py │ └── utils │ │ ├── __init__.py │ │ ├── agent_utils.py │ │ ├── citation_manager.py │ │ └── citation_registry.py ├── cli.py ├── config.py ├── prompts.py ├── research │ ├── __init__.py │ └── researcher.py ├── scraper │ ├── __init__.py │ └── scraper.py ├── search │ ├── __init__.py │ ├── ai_search.py │ └── search.py └── utils │ └── logger.py └── tests ├── test_citation_registry.py └── test_report_generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | dist/ 8 | build/ 9 | *.egg-info/ 10 | 11 | # Virtual environments 12 | venv/ 13 | env/ 14 | ENV/ 15 | 16 | # IDE files 17 | .idea/ 18 | .vscode/ 19 | *.swp 20 | *.swo 21 | 22 | # OS specific files 23 | .DS_Store 24 | Thumbs.db 25 | 26 | # Environment variables 27 | .env 28 | 29 | # Logs 30 | *.log 31 | 32 | # Test files 33 | test_*.md 34 | .pytest_cache/ 35 | .coverage 36 | htmlcov/ 37 | helper.txt 38 | # Jupyter Notebooks 39 | .ipynb_checkpoints 40 | *.ipynb 41 | 42 | # Cache directories 43 | .cache/ 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dušan Jolović 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | include requirements.txt 4 | recursive-include examples *.md 5 | recursive-include shandu *.md 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Shandu 2.0: Advanced AI Research System with Robust Report Generation 2 | 3 | Shandu is a cutting-edge AI research assistant that performs in-depth, multi-source research on any topic using advanced language models, intelligent web scraping, and iterative exploration to generate comprehensive, well-structured reports with proper citations. 4 | 5 | [![MIT License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) 6 | [![Python 3.9+](https://img.shields.io/badge/Python-3.9+-blue.svg)](https://www.python.org/downloads/) 7 | 8 | ## 🔍 What is Shandu? 9 | 10 | Shandu is an intelligent, LLM-powered research system that automates the comprehensive research process - from initial query clarification to in-depth content analysis and report generation. Built on LangGraph's state-based workflow, it recursively explores topics with sophisticated algorithms for source evaluation, content extraction, and knowledge synthesis. 11 | 12 | ### Key Use Cases 13 | 14 | - **Academic Research**: Generate literature reviews, background information, and complex topic analyses 15 | - **Market Intelligence**: Analyze industry trends, competitor strategies, and market opportunities 16 | - **Content Creation**: Produce well-researched articles, blog posts, and reports with proper citations 17 | - **Technology Exploration**: Track emerging technologies, innovations, and technical developments 18 | - **Policy Analysis**: Research regulations, compliance requirements, and policy implications 19 | - **Competitive Analysis**: Compare products, services, and company strategies across industries 20 | 21 | ## 🚀 What's New in Version 2.0 22 | 23 | Shandu 2.0 introduces a major redesign of the report generation pipeline to produce more coherent, reliable reports: 24 | 25 | - **Modular Report Generation**: Process reports in self-contained sections, enhancing overall system reliability 26 | - **Robust Error Recovery**: Automatic retry mechanisms with intelligent fallbacks prevent the system from getting stuck 27 | - **Section-By-Section Processing**: Each section is processed independently, allowing for better error isolation 28 | - **Progress Tracking**: Detailed progress tracking helps identify exactly where the process is at each stage 29 | - **Enhanced Citation Management**: More reliable citation handling ensures proper attribution throughout reports 30 | - **Intelligent Parallelization**: Key processes run in parallel where possible for improved performance 31 | - **Comprehensive Fallback Mechanisms**: If any step fails, the system gracefully degrades rather than halting 32 | 33 | ## ⚙️ How Shandu Works 34 | 35 | ```mermaid 36 | flowchart TB 37 | subgraph Input 38 | Q[User Query] 39 | B[Breadth Parameter] 40 | D[Depth Parameter] 41 | end 42 | 43 | subgraph Research[Research Phase] 44 | direction TB 45 | DR[Deep Research] 46 | SQ[SERP Queries] 47 | PR[Process Results] 48 | NL[(Sources & Learnings)] 49 | ND[(Directions)] 50 | end 51 | 52 | subgraph Report[Report Generation] 53 | direction TB 54 | TG[Title Generation] 
55 | TE[Theme Extraction] 56 | IR[Initial Report] 57 | ES[Section Enhancement] 58 | EX[Section Expansion] 59 | FR[Final Report] 60 | end 61 | 62 | %% Main Flow 63 | Q & B & D --> DR 64 | DR --> SQ --> PR 65 | PR --> NL 66 | PR --> ND 67 | 68 | DP{depth > 0?} 69 | NL & ND --> DP 70 | 71 | RD["Next Direction: 72 | - Prior Goals 73 | - New Questions 74 | - Learnings"] 75 | 76 | %% Circular Flow 77 | DP -->|Yes| RD 78 | RD -->|New Context| DR 79 | 80 | %% To Report Generation 81 | DP -->|No| TG 82 | TG --> TE --> IR --> ES --> EX --> FR 83 | 84 | %% Styling 85 | classDef input fill:#7bed9f,stroke:#2ed573,color:black 86 | classDef process fill:#70a1ff,stroke:#1e90ff,color:black 87 | classDef recursive fill:#ffa502,stroke:#ff7f50,color:black 88 | classDef output fill:#ff4757,stroke:#ff6b81,color:white 89 | classDef storage fill:#a8e6cf,stroke:#3b7a57,color:black 90 | 91 | class Q,B,D input 92 | class DR,SQ,PR,TG,TE,IR,ES,EX process 93 | class DP,RD recursive 94 | class FR output 95 | class NL,ND storage 96 | ``` 97 | 98 | ## 🌟 Key Features 99 | 100 | - **Intelligent State-based Workflow**: Leverages LangGraph for a structured, step-by-step research process 101 | - **Iterative Deep Exploration**: Recursively explores topics with dynamic depth and breadth parameters 102 | - **Multi-source Information Synthesis**: Analyzes data from search engines, web content, and knowledge bases 103 | - **Enhanced Web Scraping**: Features dynamic JS rendering, content extraction, and ethical scraping practices 104 | - **Smart Source Evaluation**: Automatically assesses source credibility, relevance, and information value 105 | - **Content Analysis Pipeline**: Uses advanced NLP to extract key information, identify patterns, and synthesize findings 106 | - **Sectional Report Generation**: Creates detailed reports by processing individual sections for maximum reliability 107 | - **Parallel Processing Architecture**: Implements concurrent operations for efficient multi-query execution 108 | - **Adaptive Search Strategy**: Dynamically adjusts search queries based on discovered information 109 | - **Full Citation Management**: Properly attributes all sources with formatted citations in multiple styles 110 | 111 | ## 🏁 Quick Start 112 | 113 | ```bash 114 | # Install from PyPI 115 | pip install shandu 116 | 117 | # Install from source 118 | git clone https://github.com/jolovicdev/shandu.git 119 | cd shandu 120 | pip install -e . 121 | 122 | # Configure API settings (supports various LLM providers) 123 | shandu configure 124 | 125 | # Run comprehensive research 126 | shandu research "Your research query" --depth 2 --breadth 4 --output report.md 127 | 128 | # Quick AI-powered search with web scraping 129 | shandu aisearch "Who is the current sitting president of United States?" --detailed 130 | ``` 131 | 132 | ## 📚 Detailed Usage 133 | 134 | ### Research Command 135 | 136 | ```bash 137 | shandu research "Your research query" \ 138 | --depth 3 \ # How deep to explore (1-5, default: 2) 139 | --breadth 5 \ # How many parallel queries (2-10, default: 4) 140 | --output report.md \ # Save to file instead of terminal 141 | --verbose # Show detailed progress 142 | ``` 143 | 144 | ### Example Reports 145 | 146 | You can find example reports in the examples directory: 147 | 148 | 1. 
**The Intersection of Quantum Computing, Synthetic Biology, and Climate Modeling** 149 | ```bash 150 | shandu research "The Intersection of Quantum Computing, Synthetic Biology, and Climate Modeling" --depth 3 --breadth 3 --output examples/o3-mini-high.md 151 | ``` 152 | 153 | ## 💻 Python API 154 | 155 | ```python 156 | from shandu.agents import ResearchGraph 157 | from langchain_openai import ChatOpenAI 158 | 159 | # Initialize with custom LLM if desired 160 | llm = ChatOpenAI(model="gpt-4") 161 | 162 | # Initialize the research graph 163 | researcher = ResearchGraph( 164 | llm=llm, 165 | temperature=0.5 166 | ) 167 | 168 | # Perform deep research 169 | results = researcher.research_sync( 170 | query="Your research query", 171 | depth=3, # How deep to go with recursive research 172 | breadth=4, # How many parallel queries to explore 173 | detail_level="high" 174 | ) 175 | 176 | # Print or save results 177 | print(results.to_markdown()) 178 | ``` 179 | 180 | ## 🧩 Advanced Architecture 181 | 182 | ### Research Pipeline 183 | 184 | Shandu's research pipeline consists of these key stages: 185 | 186 | 1. **Query Clarification**: Interactive questions to understand research needs 187 | 2. **Research Planning**: Strategic planning for comprehensive topic coverage 188 | 3. **Iterative Exploration**: 189 | - Smart query generation based on knowledge gaps 190 | - Multi-engine search with parallelized execution 191 | - Relevance filtering of search results 192 | - Intelligent web scraping with content extraction 193 | - Source credibility assessment 194 | - Information analysis and synthesis 195 | - Reflection on findings to identify gaps 196 | 197 | ### Report Generation Pipeline 198 | 199 | Shandu 2.0 introduces a robust, modular report generation pipeline: 200 | 201 | 1. **Data Preparation**: Registration of all sources and their metadata for proper citation 202 | 2. **Title Generation**: Creating a concise, professional title (with retry mechanisms) 203 | 3. **Theme Extraction**: Identifying key themes to organize the report structure 204 | 4. **Citation Formatting**: Properly formatting all citations for reference 205 | 5. **Initial Report Generation**: Creating a comprehensive draft report 206 | 6. **Section Enhancement**: Individually processing each section to add detail and depth 207 | 7. **Key Section Expansion**: Identifying and expanding the most important sections 208 | 8. 
**Report Finalization**: Final processing and validation of the complete report 209 | 210 | Each step includes: 211 | - Comprehensive error handling 212 | - Automatic retries with exponential backoff 213 | - Intelligent fallbacks when issues occur 214 | - Progress tracking for transparency 215 | - Validation to ensure quality output 216 | 217 | ## 🔌 Supported Search Engines & Sources 218 | 219 | - Google Search 220 | - DuckDuckGo 221 | - Wikipedia 222 | - ArXiv (academic papers) 223 | - Custom search engines can be added 224 | 225 | ## 📊 Technical Capabilities 226 | 227 | - **Dynamic JS Rendering**: Handles JavaScript-heavy websites 228 | - **Content Extraction**: Identifies and extracts main content from web pages 229 | - **Parallel Processing**: Concurrent execution of searches and scraping 230 | - **Caching**: Efficient caching of search results and scraped content 231 | - **Rate Limiting**: Respectful access to web resources 232 | - **Robots.txt Compliance**: Ethical web scraping practices 233 | - **Flexible Output Formats**: Markdown, JSON, plain text 234 | 235 | ## 📜 License 236 | 237 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 238 | -------------------------------------------------------------------------------- /examples/gpt4.0-mini.md: -------------------------------------------------------------------------------- 1 | # Autonomous Vehicles: Safety and Smart City Integration 2 | 3 | ## Executive Summary 4 | 5 | The advent of autonomous vehicles (AVs) represents a transformative shift in urban mobility, promising enhanced safety, efficiency, and environmental sustainability. This report delves into the current state of AV technology, its safety implications, infrastructure requirements for integration into smart cities, and the broader impacts on urban mobility and the environment. Through a comprehensive analysis of technological advancements, safety records, infrastructure needs, and policy frameworks, this report aims to provide actionable recommendations for urban planners and policymakers. The findings indicate that while AVs hold significant potential to reduce greenhouse gas emissions and improve traffic flow, challenges such as public perception, regulatory hurdles, and infrastructure readiness must be addressed to facilitate their widespread adoption. 6 | 7 | ## Introduction 8 | 9 | The integration of autonomous vehicles into urban environments is a critical component of the evolution towards smart cities. As cities grapple with increasing traffic congestion, pollution, and safety concerns, AVs offer a promising solution that could revolutionize urban transportation. This report aims to analyze the multifaceted aspects of AV technology, focusing on safety features, infrastructure requirements, and the implications for urban mobility and environmental sustainability. By synthesizing findings from various sources, this report seeks to provide a comprehensive understanding of the current landscape of AVs and their potential role in shaping the future of urban transportation. 10 | 11 | ## Technological Advancements in Autonomous Vehicles 12 | 13 | ### Sensor Systems 14 | 15 | The backbone of AV technology lies in its sophisticated sensor systems, which include LiDAR, radar, cameras, and ultrasonic sensors. These systems work in tandem to create a comprehensive understanding of the vehicle's surroundings, enabling safe navigation and obstacle detection. 
Recent advancements have led to improved sensor accuracy and reliability, allowing AVs to operate effectively in diverse environmental conditions, including low visibility scenarios such as fog and heavy rain. 16 | 17 | ### Artificial Intelligence Algorithms 18 | 19 | Artificial intelligence (AI) plays a pivotal role in processing the vast amounts of data collected by AV sensors. Machine learning algorithms are employed to enhance decision-making capabilities, enabling vehicles to predict and respond to dynamic traffic conditions. Innovations in deep learning have significantly improved the ability of AVs to recognize and interpret complex scenarios, such as pedestrian movements and unexpected road hazards, thereby enhancing safety and performance metrics. 20 | 21 | ### Vehicle-to-Everything (V2X) Communication 22 | 23 | V2X communication technology facilitates real-time data exchange between vehicles, infrastructure, and other road users. This connectivity is crucial for the effective operation of AVs within smart city frameworks. By sharing information about traffic conditions, road hazards, and other relevant data, V2X communication enhances situational awareness and enables coordinated responses to traffic events, ultimately improving safety and efficiency on urban roadways. 24 | 25 | ## Safety Implications of Autonomous Vehicles 26 | 27 | ### Safety Records Compared to Traditional Vehicles 28 | 29 | The safety implications of AVs are a critical area of investigation. Preliminary data suggests that AVs may have lower accident rates compared to traditional vehicles. For instance, studies indicate that AVs are less prone to human error, which is a leading cause of traffic accidents. However, the transition period, during which AVs share the road with human-driven vehicles, presents unique challenges that must be addressed to ensure overall safety on urban roadways [3][5]. 30 | 31 | ### Risk Factors and Public Perception 32 | 33 | Despite the potential safety benefits, public perception of AVs remains mixed. Concerns about the reliability of AV technology, particularly in complex urban environments, contribute to skepticism regarding their safety. Addressing these concerns through transparent communication, public education, and robust safety testing is essential for fostering public trust in AV technology. 34 | 35 | ## Infrastructure Requirements for AV Integration 36 | 37 | ### Smart Traffic Management Systems 38 | 39 | The successful integration of AVs into urban environments necessitates the development of smart traffic management systems. These systems utilize real-time data analytics to optimize traffic flow, reduce congestion, and enhance safety. Adaptive traffic signals, for example, can adjust their timing based on current traffic conditions, allowing for smoother vehicle movement and reduced wait times at intersections [10][15]. 40 | 41 | ### Dedicated Lanes and Charging Station Networks 42 | 43 | To accommodate the unique operational characteristics of AVs, dedicated lanes may be required in certain urban areas. These lanes can facilitate smoother traffic flow and reduce the likelihood of conflicts with human-driven vehicles. Additionally, the establishment of a comprehensive network of charging stations is essential to support the anticipated increase in electric AVs, further contributing to environmental sustainability [12][14]. 
44 | 45 | ## Role of Smart Traffic Systems 46 | 47 | ### Adaptive Traffic Signals 48 | 49 | Adaptive traffic signals are a key component of smart traffic systems, allowing for real-time adjustments based on traffic conditions. By integrating AVs into these systems, cities can enhance traffic management and improve overall safety. For instance, signals can prioritize AVs during peak hours, reducing congestion and improving travel times for all road users. 50 | 51 | ### Real-Time Data Analytics 52 | 53 | The use of real-time data analytics in traffic management enables cities to respond proactively to changing conditions. By analyzing traffic patterns, cities can implement congestion management strategies that optimize the flow of AVs and traditional vehicles alike. This approach not only enhances safety but also contributes to a more efficient urban transportation network [11][15]. 54 | 55 | ## Urban Mobility and Environmental Impact 56 | 57 | ### Enhancing Urban Mobility 58 | 59 | AVs have the potential to significantly enhance urban mobility by reducing traffic congestion and improving accessibility. By optimizing routing and minimizing delays, AVs can facilitate smoother transportation for all users, including pedestrians and cyclists. Furthermore, the integration of AVs into public transportation systems can provide last-mile connectivity, enhancing overall mobility options within urban areas [2][6]. 60 | 61 | ### Environmental Benefits 62 | 63 | The environmental benefits associated with the adoption of AVs are substantial. As AV technology matures, it is expected that a significant proportion of these vehicles will be electric, leading to a reduction in greenhouse gas emissions from the transportation sector. Additionally, AVs can contribute to more efficient land use by reducing the need for extensive parking infrastructure, allowing for the repurposing of urban spaces for green areas and community development [1][4]. 64 | 65 | ## Policy and Regulatory Frameworks 66 | 67 | ### Existing Policies and Regulations 68 | 69 | The regulatory landscape for AVs is still evolving, with various jurisdictions implementing different policies to govern their operation. Current regulations often focus on safety standards, testing protocols, and liability issues. However, there are significant gaps in the regulatory framework that need to be addressed to facilitate the safe integration of AVs into urban environments [3][10]. 70 | 71 | ### Identifying Gaps and Challenges 72 | 73 | Key challenges include the need for standardized testing protocols, liability frameworks for accidents involving AVs, and guidelines for data privacy and security. Policymakers must work collaboratively with industry stakeholders to develop comprehensive regulations that address these challenges while promoting innovation and public safety [6][12]. 74 | 75 | ## Recommendations for Urban Planners and Policymakers 76 | 77 | ### Infrastructure Development 78 | 79 | Urban planners should prioritize the development of infrastructure that supports AV integration, including dedicated lanes, charging stations, and smart traffic management systems. Investments in these areas will be crucial for facilitating the safe and efficient operation of AVs within urban environments. 80 | 81 | ### Public Engagement and Education 82 | 83 | To address public concerns regarding AV safety, policymakers should implement public engagement initiatives that educate citizens about the benefits and safety features of AV technology. 
Transparent communication and community involvement in the planning process can help build trust and acceptance of AVs in urban settings. 84 | 85 | ### Collaborative Policy Frameworks 86 | 87 | Policymakers should foster collaboration between government agencies, industry stakeholders, and academic institutions to develop comprehensive policy frameworks that address the unique challenges posed by AVs. This collaborative approach will ensure that regulations are informed by the latest technological advancements and best practices in urban mobility. 88 | 89 | ## Conclusion 90 | 91 | The integration of autonomous vehicles into urban environments presents both opportunities and challenges. While AVs have the potential to enhance safety, improve traffic flow, and reduce environmental impacts, significant barriers must be overcome to facilitate their widespread adoption. By focusing on infrastructure development, public engagement, and collaborative policy frameworks, urban planners and policymakers can create a conducive environment for the successful integration of AVs into smart cities. The findings of this report underscore the importance of a holistic approach to AV integration, one that prioritizes safety, efficiency, and sustainability in urban mobility. 92 | 93 | # References 94 | 95 | [1] *www.eesi.org*, "https://www.eesi.org/papers/view/issue-brief-autonomous-vehicles-state-of-the-technology-and-potential-role-as-a-climate-solution", https://www.eesi.org/papers/view/issue-brief-autonomous-vehicles-state-of-the-technology-and-potential-role-as-a-climate-solution 96 | [2] *www.mdpi.com*, "How Autonomous Vehicles Shape Urban Traffic Sustainability - MDPI", https://www.mdpi.com/2071-1050/17/6/2589 97 | [3] *www.nhtsa.gov*, "https://www.nhtsa.gov/vehicle-safety/automated-vehicles-safety", https://www.nhtsa.gov/vehicle-safety/automated-vehicles-safety 98 | [4] *www.nature.com*, "https://www.nature.com/articles/s41467-024-48526-4", https://www.nature.com/articles/s41467-024-48526-4 99 | [5] *arstechnica.com*, "https://arstechnica.com/cars/2023/09/are-self-driving-cars-already-safer-than-human-drivers/", https://arstechnica.com/cars/2023/09/are-self-driving-cars-already-safer-than-human-drivers/ 100 | [6] *www.forbes.com*, "https://www.forbes.com/sites/technology/article/self-driving-cars/", https://www.forbes.com/sites/technology/article/self-driving-cars/ 101 | [7] *www.hashstudioz.com*, "The Role of Autonomous Vehicles and AI in Smart Transportation Systems", https://www.hashstudioz.com/blog/ai-in-transportation-from-self-driving-cars-to-smart-traffic/ 102 | [8] *maddevs.io*, "https://maddevs.io/blog/transportation-and-ai/", https://maddevs.io/blog/transportation-and-ai/ 103 | [9] *arxiv.org*, "https://arxiv.org/html/2410.10929v6", https://arxiv.org/html/2410.10929v6 104 | [10] *www.mckinsey.com*, "https://www.mckinsey.com/industries/infrastructure/our-insights/a-new-look-at-autonomous-vehicle-infrastructure", https://www.mckinsey.com/industries/infrastructure/our-insights/a-new-look-at-autonomous-vehicle-infrastructure 105 | [11] *medium.com*, "https://medium.com/@codebykrishna/the-future-of-transportation-autonomous-vehicles-and-smart-cities-c42f205bd46a", https://medium.com/@codebykrishna/the-future-of-transportation-autonomous-vehicles-and-smart-cities-c42f205bd46a 106 | [12] *www.nae.edu*, "https://www.nae.edu/290948/Smart-Infrastructure-for-Autonomous-Driving-in-Urban-Areas", https://www.nae.edu/290948/Smart-Infrastructure-for-Autonomous-Driving-in-Urban-Areas 107 | [13] 
*www.mdpi.com*, "https://www.mdpi.com/2227-7080/11/5/117", https://www.mdpi.com/2227-7080/11/5/117 108 | [14] *www.saam.swiss*, "https://www.saam.swiss/autonomous-vehicle-safety-future-of-safe-mobility/", https://www.saam.swiss/autonomous-vehicle-safety-future-of-safe-mobility/ 109 | [15] *www.wistronchina.com*, "https://www.wistronchina.com/smart-traffic-the-future-of-urban-mobility/", https://www.wistronchina.com/smart-traffic-the-future-of-urban-mobility/ 110 | [16] *www.weforum.org*, "The Role of Autonomous Vehicles and AI in Smart Transportation Systems", https://www.weforum.org/stories/2024/10/how-will-autonomous-vehicles-shape-urban-mobility/ 111 | 112 | 113 | ## Research Process 114 | 115 | - **Depth**: 3 116 | - **Breadth**: 3 117 | - **Time Taken**: 3m 56s 118 | - **Subqueries Explored**: 9 119 | - **Sources Analyzed**: 85 120 | -------------------------------------------------------------------------------- /examples/mistral_large.md: -------------------------------------------------------------------------------- 1 | # The revival of indigenous agricultural practices, such as agroforestry, seed saving, and polyculture, holds significant potential for creating sustainable food systems. These practices have demonstrated benefits for soil health, water conservation, and carbon sequestration, making them critical components in addressing contemporary environmental challenges. By focusing on these specific methods, the research can provide an in-depth analysis of how they contribute to sustainability and resilience in food production. 2 | 3 | In addition to the ecological advantages, it is essential to explore the barriers that hinder the widespread adoption of these indigenous practices. Key obstacles include issues related to land rights, the need for policy support, and the preservation of cultural knowledge. Understanding these challenges can help in developing strategies to overcome them and promote the integration of indigenous agricultural practices into mainstream farming. 4 | 5 | Given the global relevance of these practices, the research will not be limited to a specific geographic region or indigenous community. Instead, it will aim to draw insights from a diverse range of contexts, thereby enriching the analysis with a broad perspective. This approach will allow for a comprehensive examination of how indigenous agricultural practices can be effectively revived and supported across different settings. 6 | 7 | # Reviving Indigenous Practices for Sustainable Agriculture 8 | 9 | ## Executive Summary 10 | 11 | This report provides a comprehensive analysis of the ecological benefits, barriers, successful implementations, policy frameworks, cultural preservation, and socio-economic impacts of reviving indigenous agricultural practices. By focusing on agroforestry, seed saving, and polyculture, the report aims to highlight the importance of these practices for sustainable food systems and offer actionable recommendations for their broader adoption. The findings are supported by extensive literature review, qualitative research, and case studies from diverse geographic regions. 12 | 13 | ## Introduction 14 | 15 | Indigenous agricultural practices have long been recognized for their ecological and socio-economic benefits. However, their widespread adoption faces numerous challenges. This report investigates the ecological advantages, barriers to adoption, successful case studies, policy frameworks, and cultural preservation strategies related to these practices. 
The report also assesses the broader impacts on local communities and proposes strategies to integrate indigenous agricultural practices into mainstream farming. 16 | 17 | ## Ecological Benefits 18 | 19 | ### Agroforestry 20 | 21 | Agroforestry, which integrates trees with crops and livestock, offers multiple ecological benefits. It enhances soil health by improving nutrient cycling and reducing erosion. Trees in agroforestry systems provide organic matter, which enriches the soil and supports microbial activity. This, in turn, enhances soil fertility and water retention capacity [6]. 22 | 23 | Water conservation is another significant benefit of agroforestry. Trees help to reduce runoff and increase water infiltration, thereby recharging groundwater reserves. This is particularly crucial in regions prone to water scarcity. Agroforestry systems also moderate soil temperatures, which can improve crop productivity and resilience to climate change [6]. 24 | 25 | Carbon sequestration is a critical ecological benefit of agroforestry. Trees act as carbon sinks, absorbing carbon dioxide from the atmosphere and storing it in their biomass. This process helps mitigate greenhouse gas emissions and contributes to climate change mitigation [6]. 26 | 27 | ### Seed Saving 28 | 29 | Seed saving is a traditional practice that involves collecting, storing, and replanting seeds from one harvest to the next. This practice promotes genetic diversity and resilience in crops. By preserving local seed varieties, farmers can ensure that their crops are adapted to local conditions, which enhances their resilience to pests, diseases, and climate variability [1]. 30 | 31 | Seed saving also reduces the need for external inputs such as commercial seeds and pesticides, thereby lowering the environmental footprint of agriculture. It supports biodiversity conservation by preserving rare and heirloom varieties that might otherwise be lost to commercial monocultures [1]. 32 | 33 | ### Polyculture 34 | 35 | Polyculture, the practice of growing multiple crops together, offers numerous ecological benefits. It enhances soil health by promoting nutrient cycling and reducing the need for synthetic fertilizers. Polyculture systems often include leguminous plants that fix nitrogen in the soil, benefiting other crops in the system [2]. 36 | 37 | Water conservation is another advantage of polyculture. By intercropping, farmers can reduce water usage and improve water-use efficiency. Polyculture also supports biodiversity by creating habitats for beneficial insects and wildlife, which can help control pests and diseases naturally [2]. 38 | 39 | Carbon sequestration is another ecological benefit of polyculture. By maintaining diverse and continuous plant cover, polyculture systems can sequester more carbon in the soil compared to monocultures. This helps mitigate greenhouse gas emissions and contributes to climate change mitigation [2]. 40 | 41 | ## Barriers to Adoption 42 | 43 | ### Legal and Policy Challenges 44 | 45 | One of the primary barriers to adopting indigenous agricultural practices is the lack of supportive legal and policy frameworks. Many countries lack policies that recognize and protect indigenous land rights, which are crucial for the continuation of traditional agricultural practices. Without secure land tenure, indigenous communities may face displacement or conversion of their lands to other uses, such as commercial agriculture or urban development [1]. 
46 | 47 | Another policy challenge is the dominance of industrial agriculture, which often receives government subsidies and support. This creates an uneven playing field, making it difficult for small-scale farmers practicing indigenous agriculture to compete [1]. 48 | 49 | ### Cultural and Knowledge Preservation 50 | 51 | The preservation and transmission of indigenous agricultural knowledge are essential for the continuation of these practices. However, many indigenous communities face challenges in maintaining their traditional knowledge due to a lack of documentation, generational gaps, and cultural assimilation [1]. 52 | 53 | Educational systems often prioritize modern agricultural techniques over traditional methods, leading to a loss of indigenous knowledge. Additionally, the migration of younger generations to urban areas can result in a decline in the number of practitioners of traditional agriculture [1]. 54 | 55 | ### Economic and Market Constraints 56 | 57 | Economic constraints also hinder the adoption of indigenous agricultural practices. Many small-scale farmers lack access to markets, credit, and other financial resources necessary to implement these practices. The high initial investment required for agroforestry, for example, can be a significant barrier for resource-poor farmers [1]. 58 | 59 | Market demand for diverse and traditional crops is often limited, making it challenging for farmers to sell their products. The dominance of commercial monocultures in the market further marginalizes indigenous agricultural practices [1]. 60 | 61 | ## Success Stories and Case Studies 62 | 63 | ### Polyculture in Brazilian Drylands 64 | 65 | In the Brazilian drylands, polyculture practices have been successfully implemented to enhance food security and agricultural productivity. By integrating various crops such as beans, corn, and squash, farmers have improved soil health, reduced water usage, and increased crop yields. This approach has also provided diverse and nutritious food sources for local communities, contributing to their food security [2]. 66 | 67 | ### Tiger Prawn and Barramundi Polyculture in Indonesia 68 | 69 | In Indonesia, polyculture practices involving tiger prawn and barramundi have significantly boosted farmers' productivity. By integrating these aquatic species, farmers have improved the efficiency of water and nutrient use, leading to higher yields and income. This practice has also contributed to the conservation of local aquatic biodiversity and provided a sustainable source of protein for local communities [3]. 70 | 71 | ### Banana Polyculture in Uganda 72 | 73 | In Uganda, banana polyculture practices have been successfully implemented to enhance agricultural productivity and resilience. By intercropping bananas with other crops such as beans and coffee, farmers have improved soil health, reduced pest and disease incidence, and increased crop yields. This approach has also contributed to the conservation of local biodiversity and provided diverse food sources for local communities [4]. 74 | 75 | ### Polyculture and Food Security 76 | 77 | Polyculture practices have been identified as a key strategy for enhancing food security. By promoting crop diversity and resilience, polyculture can help ensure a stable and nutritious food supply, even in the face of climate variability and other environmental challenges. This approach has been successfully implemented in various regions, contributing to improved food security and community well-being [5]. 
78 | 79 | ## Policy and Legal Frameworks 80 | 81 | ### Supportive Policies for Indigenous Agriculture 82 | 83 | To promote the adoption of indigenous agricultural practices, supportive policies and legal frameworks are essential. Governments can play a crucial role by recognizing and protecting indigenous land rights, providing financial and technical support, and promoting cultural knowledge preservation. Policies that incentivize sustainable agricultural practices, such as agroforestry and seed saving, can also help overcome economic barriers and encourage farmers to adopt these practices [1]. 84 | 85 | ### International Initiatives 86 | 87 | International initiatives, such as the United Nations Declaration on the Rights of Indigenous Peoples, provide a framework for protecting indigenous land rights and promoting traditional agricultural practices. These initiatives emphasize the importance of cultural preservation and the role of indigenous knowledge in sustainable development [1]. 88 | 89 | ### National and Local Policies 90 | 91 | At the national and local levels, policies that support small-scale farmers and promote agricultural diversity can help overcome barriers to adopting indigenous agricultural practices. For example, policies that provide access to markets, credit, and other financial resources can help farmers implement these practices. Additionally, policies that promote education and training in traditional agricultural methods can help preserve and transmit indigenous knowledge [1]. 92 | 93 | ## Cultural Knowledge Preservation 94 | 95 | ### Strategies for Preserving Indigenous Knowledge 96 | 97 | Preserving and transmitting indigenous agricultural knowledge is crucial for the continuation of these practices. Strategies for cultural knowledge preservation include documentation, education, and community engagement. Documenting traditional agricultural practices through written and oral histories can help preserve this knowledge for future generations [1]. 98 | 99 | Education and training programs that incorporate indigenous agricultural methods can help transmit this knowledge to younger generations. Community engagement initiatives, such as workshops and knowledge-sharing events, can also help preserve and promote indigenous agricultural practices [1]. 100 | 101 | ### Role of Indigenous Leaders and Practitioners 102 | 103 | Indigenous leaders and practitioners play a vital role in preserving and transmitting traditional agricultural knowledge. By sharing their expertise and experiences, they can help educate and inspire others to adopt these practices. Supporting indigenous leaders and practitioners through financial and technical assistance can also help ensure the continuation of traditional agricultural practices [1]. 104 | 105 | ### Cultural Institutions and Organizations 106 | 107 | Cultural institutions and organizations can play a crucial role in preserving and promoting indigenous agricultural knowledge. By providing resources and support for documentation, education, and community engagement, these institutions can help ensure the continuation of traditional agricultural practices. Collaboration between cultural institutions, indigenous communities, and other stakeholders can also help promote the adoption of these practices [1]. 108 | 109 | ## Socio-Economic Impacts 110 | 111 | ### Food Security 112 | 113 | Reviving indigenous agricultural practices can significantly enhance food security. 
By promoting crop diversity and resilience, these practices can help ensure a stable and nutritious food supply, even in the face of environmental challenges. This approach has been successfully implemented in various regions, contributing to improved food security and community well-being [5]. 114 | 115 | ### Economic Stability 116 | 117 | Indigenous agricultural practices can also contribute to economic stability. By providing diverse and sustainable food sources, these practices can help farmers reduce their dependence on external inputs and markets. This can lead to increased self-sufficiency and economic resilience for local communities [5]. 118 | 119 | ### Cultural Preservation 120 | 121 | Preserving and promoting indigenous agricultural knowledge is crucial for cultural preservation. These practices are often deeply rooted in local traditions and values, and their continuation can help maintain cultural identity and heritage. By supporting indigenous agricultural practices, communities can also preserve their cultural landscapes and natural environments [1]. 122 | 123 | ## Strategies and Recommendations 124 | 125 | ### Overcoming Barriers to Adoption 126 | 127 | To overcome the barriers to adopting indigenous agricultural practices, a multi-faceted approach is needed. Supportive policies and legal frameworks that recognize and protect indigenous land rights, provide financial and technical support, and promote cultural knowledge preservation are essential. Additionally, education and training programs that incorporate traditional agricultural methods can help transmit this knowledge to younger generations [1]. 128 | 129 | ### Promoting Indigenous Agriculture 130 | 131 | Promoting indigenous agricultural practices requires collaboration between governments, NGOs, cultural institutions, and indigenous communities. By working together, these stakeholders can develop and implement strategies to overcome barriers and encourage the adoption of these practices. Supporting indigenous leaders and practitioners through financial and technical assistance can also help ensure the continuation of traditional agricultural practices [1]. 132 | 133 | ### Integrating Indigenous Practices into Mainstream Farming 134 | 135 | Integrating indigenous agricultural practices into mainstream farming requires a shift in mindset and approach. Policies that incentivize sustainable agricultural practices, such as agroforestry and seed saving, can help encourage farmers to adopt these practices. Additionally, promoting education and training in traditional agricultural methods can help transmit this knowledge to a broader audience [1]. 136 | 137 | ## Conclusion 138 | 139 | Reviving indigenous agricultural practices offers numerous ecological, socio-economic, and cultural benefits. By promoting soil health, water conservation, carbon sequestration, and biodiversity, these practices contribute to sustainable food systems. However, their widespread adoption faces numerous challenges, including legal and policy barriers, cultural knowledge preservation, and economic constraints. 140 | 141 | Success stories from diverse regions highlight the potential of indigenous agricultural practices to enhance food security, economic stability, and cultural preservation. Supportive policies and legal frameworks, along with strategies for cultural knowledge preservation and education, are essential for overcoming these barriers and promoting the adoption of these practices. 
142 | 143 | By working together, governments, NGOs, cultural institutions, and indigenous communities can develop and implement strategies to integrate indigenous agricultural practices into mainstream farming. This collaborative effort is crucial for ensuring the continuation of these practices and their benefits for future generations. 144 | 145 | # References 146 | 147 | [1] *www.firstnations.org*, "https://www.firstnations.org/wp-content/uploads/publication-attachments/2015-Fact-Sheet-11-Seed-Saving-and-Seed-Sovereignty.pdf", https://www.firstnations.org/wp-content/uploads/publication-attachments/2015-Fact-Sheet-11-Seed-Saving-and-Seed-Sovereignty.pdf 148 | [2] *dry-net.org*, "https://dry-net.org/initiatives/polyculture-in-brazilian-drylands/", https://dry-net.org/initiatives/polyculture-in-brazilian-drylands/ 149 | [3] *thefishsite.com*, "https://thefishsite.com/articles/how-tiger-prawn-and-barramundi-polyculture-helps-boost-farmers-productivity-indonesia", https://thefishsite.com/articles/how-tiger-prawn-and-barramundi-polyculture-helps-boost-farmers-productivity-indonesia 150 | [4] *alliancebioversityciat.org*, "https://alliancebioversityciat.org/stories/embracing-banana-polyculture-uganda", https://alliancebioversityciat.org/stories/embracing-banana-polyculture-uganda 151 | [5] *www.foodunfolded.com*, "https://www.foodunfolded.com/article/is-polyculture-the-key-to-food-security", https://www.foodunfolded.com/article/is-polyculture-the-key-to-food-security 152 | [6] *tracextech.com*, "https://tracextech.com/carbon-sequestration-in-agroforestry/", https://tracextech.com/carbon-sequestration-in-agroforestry/ 153 | 154 | 155 | ## Research Process 156 | 157 | - **Depth**: 3 158 | - **Breadth**: 3 159 | - **Time Taken**: 5m 15s 160 | - **Subqueries Explored**: 9 161 | - **Sources Analyzed**: 52 162 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | langchain>=0.1.0 3 | langchain-core>=0.1.0 4 | langchain-openai>=0.0.5 5 | langchain-community>=0.0.13 6 | langgraph>=0.0.20 7 | langchain-text-splitters>=0.0.1 8 | pydantic>=2.0.0 9 | click>=8.0.0 10 | rich>=13.0.0 11 | aiohttp>=3.8.0 12 | asyncio>=3.4.3 13 | beautifulsoup4>=4.12.0 14 | trafilatura>=1.6.0 15 | fake_useragent>=1.2.0 16 | playwright>=1.40.0 17 | tiktoken>=0.5.0 18 | 19 | # Search engines 20 | googlesearch-python>=1.2.3 21 | wikipedia>=1.4.0 22 | arxiv>=2.0.0 23 | 24 | # Utilities 25 | python-dotenv>=1.0.0 26 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup script for Shandu deep research system. 
3 | """ 4 | from setuptools import setup, find_packages 5 | import os 6 | 7 | # Read long description from README.md 8 | with open("shandu/README.md", "r", encoding="utf-8") as fh: 9 | long_description = fh.read() 10 | 11 | # Read requirements 12 | with open("requirements.txt") as f: 13 | requirements = [line.strip() for line in f if line.strip() and not line.startswith("#")] 14 | 15 | setup( 16 | name="shandu", 17 | version="2.0.0", 18 | description="Deep research system with LangChain and LangGraph", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | author="Dušan Jolović", 22 | author_email="jolovic@pm.me", 23 | url="https://github.com/jolovicdev/shandu", 24 | packages=find_packages(), 25 | install_requires=requirements, 26 | entry_points={ 27 | "console_scripts": [ 28 | "shandu=shandu.cli:cli", 29 | ], 30 | }, 31 | python_requires=">=3.9", 32 | classifiers=[ 33 | "Development Status :: 3 - Alpha", 34 | "Intended Audience :: Science/Research", 35 | "Intended Audience :: Developers", 36 | "License :: OSI Approved :: MIT License", 37 | "Programming Language :: Python :: 3", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python :: 3.10", 40 | "Programming Language :: Python :: 3.11", 41 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 42 | "Topic :: Software Development :: Libraries :: Python Modules", 43 | ], 44 | keywords="research, ai, llm, langchain, langgraph, deepresearch, deepsearch, search", 45 | project_urls={ 46 | "Source": "https://github.com/jolovicdev/shandu", 47 | }, 48 | include_package_data=True, 49 | ) 50 | -------------------------------------------------------------------------------- /shandu/README.md: -------------------------------------------------------------------------------- 1 | # Shandu: Advanced Research System Architecture 2 | 3 | This directory contains the core architecture of the Shandu deep research system. Our modular design separates concerns and enables future extensibility while maintaining clean, testable code. 4 | 5 | ## 📊 System Architecture 6 | 7 | Shandu implements a sophisticated state-based workflow using LangGraph and LangChain to create a robust, extensible research system: 8 | 9 | ``` 10 | shandu/ 11 | ├── __init__.py # Package initialization 12 | ├── cli.py # Command-line interface 13 | ├── config.py # Configuration management 14 | ├── prompts.py # Centralized prompt templates 15 | ├── agents/ # Research agent implementations 16 | │ ├── __init__.py 17 | │ ├── agent.py # LangChain-based agent 18 | │ ├── langgraph_agent.py # LangGraph state-based agent 19 | │ ├── graph/ # Graph workflow components 20 | │ │ ├── __init__.py 21 | │ │ ├── builder.py # Graph construction 22 | │ │ └── wrapper.py # Async function wrappers 23 | │ ├── nodes/ # Graph node implementations 24 | │ │ ├── __init__.py 25 | │ │ ├── initialize.py # Research initialization 26 | │ │ ├── reflect.py # Research reflection 27 | │ │ ├── search.py # Content search and analysis 28 | │ │ └── ... 
# Other node implementations 29 | │ ├── processors/ # Content processing 30 | │ │ ├── __init__.py 31 | │ │ ├── content_processor.py # Content extraction 32 | │ │ └── report_generator.py # Report generation 33 | │ └── utils/ # Agent utilities 34 | │ ├── __init__.py 35 | │ └── agent_utils.py # Helper functions 36 | ├── research/ # Research orchestration 37 | │ ├── __init__.py 38 | │ └── researcher.py # Result management 39 | ├── scraper/ # Web scraping functionality 40 | │ ├── __init__.py 41 | │ └── scraper.py # Ethical web scraper 42 | └── search/ # Search functionality 43 | ├── __init__.py 44 | ├── ai_search.py # AI-powered search 45 | └── search.py # Multi-engine search 46 | ``` 47 | 48 | ## 🔄 LangGraph Research Workflow 49 | 50 | Shandu's research process follows a sophisticated state-based workflow: 51 | 52 | 1. **Initialize**: Define research query, parameters, and create a research plan 53 | 2. **Reflect**: Analyze current findings and identify knowledge gaps 54 | 3. **Generate Queries**: Create targeted search queries based on analysis 55 | 4. **Search**: Execute search queries and collect results 56 | 5. **Smart Source Selection**: Filter and prioritize the most valuable sources 57 | 6. **Format Citations**: Prepare properly formatted citations for all sources 58 | 7. **Generate Initial Report**: Create a first draft of the research report 59 | 8. **Enhance Report**: Add depth, detail, and proper structure 60 | 9. **Expand Key Sections**: Further develop important sections through multi-step synthesis 61 | 10. **Finalize Report**: Apply final formatting and quality checks 62 | 63 | ## 🧠 Advanced Technical Features 64 | 65 | ### State-Based Research With LangGraph 66 | 67 | Our LangGraph implementation provides several key advantages: 68 | 69 | - **Clear State Transitions**: Each research phase has well-defined inputs and outputs 70 | - **Conditional Logic**: Dynamically determines next steps based on current state 71 | - **Circular Flow**: Supports recursive exploration until depth conditions are met 72 | - **Parallel Processing**: Handles concurrent operations for efficiency 73 | - **Error Resilience**: Continues functioning even if individual steps encounter issues 74 | 75 | ### Enhanced Content Processing 76 | 77 | Shandu implements sophisticated content processing: 78 | 79 | - **Content Relevance Filtering**: Uses AI to determine if content is relevant to the research query 80 | - **Source Reliability Assessment**: Evaluates sources for credibility and authority 81 | - **Main Content Extraction**: Identifies and extracts the primary content from web pages 82 | - **Content Analysis Pipeline**: Multi-step analysis for key information extraction 83 | - **Theme Identification**: Automatically discovers and organizes thematic elements 84 | 85 | ### Advanced Report Generation 86 | 87 | Our multi-step report generation process ensures high-quality output: 88 | 89 | 1. **Theme Extraction**: Identifies key themes across all research 90 | 2. **Initial Report Generation**: Creates a structured first draft 91 | 3. **Report Enhancement**: Adds depth, citations, and improved organization 92 | 4. **Key Section Expansion**: Further develops the most important sections 93 | 5. **Citation Management**: Ensures proper attribution of all sources 94 | 6. **Final Cleanup**: Removes artifacts and ensures consistent formatting 95 | 96 | ## 💻 API Details 97 | 98 | ### ResearchGraph Class 99 | 100 | ```python 101 | class ResearchGraph: 102 | """ 103 | State-based research workflow using LangGraph. 
104 | Provides a structured approach to deep research with multiple stages. 105 | """ 106 | def __init__( 107 | self, 108 | llm: Optional[ChatOpenAI] = None, 109 | searcher: Optional[UnifiedSearcher] = None, 110 | scraper: Optional[WebScraper] = None, 111 | temperature: float = 0.5, 112 | date: Optional[str] = None 113 | ) 114 | 115 | async def research( 116 | self, 117 | query: str, 118 | depth: int = 2, 119 | breadth: int = 4, 120 | progress_callback: Optional[Callable] = None, 121 | include_objective: bool = False, 122 | detail_level: str = "high" 123 | ) -> ResearchResult 124 | 125 | def research_sync( 126 | self, 127 | query: str, 128 | depth: int = 2, 129 | breadth: int = 4, 130 | progress_callback: Optional[Callable] = None, 131 | include_objective: bool = False, 132 | detail_level: str = "high" 133 | ) -> ResearchResult 134 | ``` 135 | 136 | ### AISearcher Class 137 | 138 | ```python 139 | class AISearcher: 140 | """ 141 | AI-powered search with content scraping for deeper insights. 142 | """ 143 | def __init__( 144 | self, 145 | llm: Optional[ChatOpenAI] = None, 146 | searcher: Optional[UnifiedSearcher] = None, 147 | scraper: Optional[WebScraper] = None, 148 | max_results: int = 10, 149 | max_pages_to_scrape: int = 3 150 | ) 151 | 152 | async def search( 153 | self, 154 | query: str, 155 | engines: Optional[List[str]] = None, 156 | detailed: bool = False, 157 | enable_scraping: bool = True 158 | ) -> AISearchResult 159 | ``` 160 | 161 | ## 🔌 Integration Points 162 | 163 | Shandu is designed for easy integration: 164 | 165 | - **CLI Interface**: Command-line tools for direct usage 166 | - **Python API**: Clean, well-documented API for integration into other applications 167 | - **Extensible Components**: Easy to add new search engines, scrapers, or processing steps 168 | - **Custom LLM Support**: Works with any LangChain-compatible LLM 169 | - **Callback System**: Progress tracking and event hooks 170 | 171 | ## 🔍 Implementation Details 172 | 173 | ### Prompt Engineering 174 | 175 | Shandu uses carefully crafted prompts for: 176 | 177 | - Query clarification 178 | - Research planning 179 | - Content analysis 180 | - Source evaluation 181 | - Report generation 182 | - Citation formatting 183 | 184 | ### Async Processing 185 | 186 | Extensive use of async/await patterns for: 187 | 188 | - Parallel search execution 189 | - Concurrent web scraping 190 | - Efficient content processing 191 | - Responsive UI updates 192 | 193 | ### Caching System 194 | 195 | Multi-level caching for: 196 | 197 | - Search results 198 | - Scraped content 199 | - Content analysis 200 | - LLM responses 201 | 202 | ## 🔬 Research Algorithm 203 | 204 | Our research algorithm optimizes for: 205 | 206 | 1. **Breadth**: Exploring multiple relevant sub-topics 207 | 2. **Depth**: Drilling down into important details 208 | 3. **Convergence**: Focusing on the most relevant information 209 | 4. **Coverage**: Ensuring comprehensive topic exploration 210 | 5. **Source Quality**: Prioritizing reliable, authoritative sources 211 | 6. **Synthesis**: Creating coherent, well-structured reports 212 | 213 | For more information on using Shandu, see the main [README.md](../README.md) file. -------------------------------------------------------------------------------- /shandu/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shandu Deep Research System 3 | A powerful research tool combining multiple search engines with LangChain integration. 
4 | 5 | Copyright (c) 2025 Dušan Jolović 6 | Licensed under the MIT License. See LICENSE file for details. 7 | """ 8 | 9 | from .search.search import UnifiedSearcher, SearchResult 10 | from .research.researcher import DeepResearcher, ResearchResult 11 | from .agents.agent import ResearchAgent 12 | 13 | __version__ = "2.0.0" 14 | __all__ = [ 15 | "UnifiedSearcher", 16 | "SearchResult", 17 | "DeepResearcher", 18 | "ResearchResult", 19 | "ResearchAgent" 20 | ] 21 | -------------------------------------------------------------------------------- /shandu/agents/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents module for Shandu deep research system. 3 | """ 4 | 5 | from .agent import ResearchAgent 6 | 7 | __all__ = ["ResearchAgent"] 8 | -------------------------------------------------------------------------------- /shandu/agents/agent.py: -------------------------------------------------------------------------------- 1 | """Agent module for Shandu research system.""" 2 | from typing import List, Dict, Optional, Union, Any 3 | from dataclasses import dataclass 4 | from datetime import datetime 5 | import asyncio 6 | import json 7 | import time 8 | 9 | from langchain_core.prompts import ChatPromptTemplate 10 | from langchain_core.output_parsers import StrOutputParser 11 | from langchain_core.runnables import RunnablePassthrough 12 | from langchain_openai import ChatOpenAI 13 | from langchain.agents import AgentType, initialize_agent 14 | from langchain.chains import LLMChain 15 | from langchain.prompts import PromptTemplate 16 | from langchain_community.tools import Tool, DuckDuckGoSearchResults, DuckDuckGoSearchRun 17 | 18 | from ..search.search import UnifiedSearcher, SearchResult 19 | from ..research.researcher import ResearchResult 20 | from ..scraper import WebScraper, ScrapedContent 21 | from ..prompts import SYSTEM_PROMPTS, USER_PROMPTS 22 | from .utils.citation_manager import CitationManager, SourceInfo, Learning 23 | 24 | class ResearchAgent: 25 | """LangChain-based research agent with enhanced citation tracking.""" 26 | def __init__( 27 | self, 28 | llm: Optional[ChatOpenAI] = None, 29 | searcher: Optional[UnifiedSearcher] = None, 30 | scraper: Optional[WebScraper] = None, 31 | temperature: float = 0, 32 | max_depth: int = 2, 33 | breadth: int = 4, 34 | max_urls_per_query: int = 3, 35 | proxy: Optional[str] = None 36 | ): 37 | self.llm = llm or ChatOpenAI( 38 | temperature=temperature, 39 | model="gpt-4" 40 | ) 41 | self.searcher = searcher or UnifiedSearcher() 42 | self.scraper = scraper or WebScraper(proxy=proxy) 43 | self.citation_manager = CitationManager() # Initialize citation manager 44 | # Research parameters 45 | self.max_depth = max_depth 46 | self.breadth = breadth 47 | self.max_urls_per_query = max_urls_per_query 48 | 49 | self.system_prompt = ChatPromptTemplate.from_template(SYSTEM_PROMPTS["research_agent"]) 50 | self.reflection_prompt = ChatPromptTemplate.from_template(USER_PROMPTS["reflection"]) 51 | self.query_gen_prompt = ChatPromptTemplate.from_template(USER_PROMPTS["query_generation"]) 52 | 53 | self.tools = self._setup_tools() 54 | 55 | self.agent = initialize_agent( 56 | tools=self.tools, 57 | llm=self.llm, 58 | agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, 59 | verbose=True 60 | ) 61 | 62 | def _setup_tools(self) -> List[Tool]: 63 | """Setup agent tools.""" 64 | return [ 65 | Tool( 66 | name="search", 67 | func=self.searcher.search_sync, 68 | description="Search multiple sources for 
information about a topic" 69 | ), 70 | DuckDuckGoSearchResults( 71 | name="ddg_results", 72 | description="Get detailed search results from DuckDuckGo" 73 | ), 74 | DuckDuckGoSearchRun( 75 | name="ddg_search", 76 | description="Search DuckDuckGo for a quick answer" 77 | ), 78 | Tool( 79 | name="reflect", 80 | func=self._reflect_on_findings, 81 | description="Analyze and reflect on current research findings" 82 | ), 83 | Tool( 84 | name="generate_queries", 85 | func=self._generate_subqueries, 86 | description="Generate targeted subqueries for deeper research" 87 | ) 88 | ] 89 | 90 | async def _reflect_on_findings(self, findings: str) -> str: 91 | """Analyze research findings.""" 92 | reflection_chain = self.reflection_prompt | self.llm | StrOutputParser() 93 | return await reflection_chain.ainvoke({"findings": findings}) 94 | 95 | async def _generate_subqueries( 96 | self, 97 | query: str, 98 | findings: str, 99 | questions: str 100 | ) -> List[str]: 101 | """Generate subqueries for deeper research.""" 102 | query_chain = self.query_gen_prompt | self.llm | StrOutputParser() 103 | result = await query_chain.ainvoke({ 104 | "query": query, 105 | "findings": findings, 106 | "questions": questions, 107 | "breadth": self.breadth 108 | }) 109 | 110 | queries = [q.strip() for q in result.split("\n") if q.strip()] 111 | return queries[:self.breadth] 112 | 113 | async def _extract_urls_from_results( 114 | self, 115 | search_results: List[SearchResult], 116 | max_urls: int = 3 117 | ) -> List[str]: 118 | """Extract top URLs from search results.""" 119 | urls = [] 120 | seen = set() 121 | 122 | for result in search_results: 123 | if len(urls) >= max_urls: 124 | break 125 | 126 | url = result.url 127 | if url and url not in seen and url.startswith('http'): 128 | urls.append(url) 129 | seen.add(url) 130 | 131 | return urls 132 | 133 | async def _analyze_content( 134 | self, 135 | query: str, 136 | content: List[ScrapedContent] 137 | ) -> Dict[str, Any]: 138 | """Analyze scraped content and track learnings with citation manager.""" 139 | # Prepare content for analysis 140 | content_text = "" 141 | for item in content: 142 | 143 | source_info = SourceInfo( 144 | url=item.url, 145 | title=item.title, 146 | content_type=item.content_type, 147 | access_time=time.time(), 148 | domain=item.url.split("//")[1].split("/")[0] if "//" in item.url else "unknown", 149 | reliability_score=0.8, # Default score, could be more dynamic 150 | metadata=item.metadata 151 | ) 152 | self.citation_manager.add_source(source_info) 153 | 154 | content_text += f"\nSource: {item.url}\nTitle: {item.title}\n" 155 | content_text += f"Content Summary:\n{item.text[:2000]}...\n" 156 | 157 | # Use the content analysis prompt from centralized prompts 158 | analysis_prompt = ChatPromptTemplate.from_messages([ 159 | ("system", SYSTEM_PROMPTS["content_analysis"]), 160 | ("user", USER_PROMPTS["content_analysis"]) 161 | ]) 162 | 163 | analysis_chain = analysis_prompt | self.llm | StrOutputParser() 164 | analysis = await analysis_chain.ainvoke({"query": query, "content": content_text}) 165 | 166 | for item in content: 167 | # Use citation manager to extract and register learnings 168 | learning_hashes = self.citation_manager.extract_learning_from_text( 169 | analysis, # Use the analysis as the source of learnings 170 | item.url, 171 | context=f"Analysis for query: {query}" 172 | ) 173 | 174 | return { 175 | "analysis": analysis, 176 | "sources": [c.url for c in content], 177 | "learnings": len(self.citation_manager.learnings) # Track number of 
learnings 178 | } 179 | 180 | async def research( 181 | self, 182 | query: str, 183 | depth: Optional[int] = None, 184 | engines: List[str] = ["google", "duckduckgo"] 185 | ) -> ResearchResult: 186 | """Execute the research process with enhanced citation tracking.""" 187 | depth = depth if depth is not None else self.max_depth 188 | 189 | context = { 190 | "query": query, 191 | "depth": depth, 192 | "breadth": self.breadth, 193 | "findings": "", 194 | "sources": [], 195 | "subqueries": [], 196 | "content_analysis": [], 197 | "learnings_by_source": {} # Track learnings by source 198 | } 199 | 200 | # Initial system prompt to set up the research 201 | system_chain = self.system_prompt | self.llm | StrOutputParser() 202 | context["findings"] = await system_chain.ainvoke(context) 203 | 204 | # Iterative deepening research process 205 | for current_depth in range(depth): 206 | # Reflect on current findings 207 | reflection = await self._reflect_on_findings(context["findings"]) 208 | 209 | new_queries = await self._generate_subqueries( 210 | query=query, 211 | findings=context["findings"], 212 | questions=reflection 213 | ) 214 | context["subqueries"].extend(new_queries) 215 | 216 | for subquery in new_queries: 217 | agent_result = await self.agent.arun( 218 | f"Research this specific aspect: {subquery}\n\n" 219 | f"Current findings: {context['findings']}\n\n" 220 | "Think step by step about what tools to use and how to verify the information." 221 | ) 222 | 223 | # Perform the search 224 | search_results = await self.searcher.search( 225 | subquery, 226 | engines=engines 227 | ) 228 | 229 | urls_to_scrape = await self._extract_urls_from_results( 230 | search_results, 231 | self.max_urls_per_query 232 | ) 233 | 234 | # Scrape and analyze content 235 | if urls_to_scrape: 236 | scraped_content = await self.scraper.scrape_urls( 237 | urls_to_scrape, 238 | dynamic=True, 239 | force_refresh=False # Use cache when available 240 | ) 241 | 242 | if scraped_content: 243 | # Analyze the content 244 | analysis = await self._analyze_content(subquery, scraped_content) 245 | context["content_analysis"].append({ 246 | "subquery": subquery, 247 | "analysis": analysis["analysis"], 248 | "sources": analysis["sources"], 249 | "learnings": analysis.get("learnings", 0) 250 | }) 251 | 252 | for r in search_results: 253 | if isinstance(r, SearchResult): 254 | context["sources"].append(r.to_dict()) 255 | elif isinstance(r, dict): 256 | context["sources"].append(r) 257 | else: 258 | print(f"Warning: Skipping non-serializable search result: {type(r)}") 259 | 260 | context["findings"] += f"\n\nFindings for '{subquery}':\n{agent_result}" 261 | 262 | if context["content_analysis"]: 263 | latest_analysis = context["content_analysis"][-1] 264 | context["findings"] += f"\n\nDetailed Analysis:\n{latest_analysis['analysis']}" 265 | 266 | # Final reflection and summary 267 | final_reflection = await self._reflect_on_findings(context["findings"]) 268 | 269 | # Prepare detailed sources with content analysis 270 | detailed_sources = [] 271 | for source in context["sources"]: 272 | # Source is already a dictionary at this point 273 | source_dict = source.copy() # Make a copy to avoid modifying the original 274 | 275 | for analysis in context["content_analysis"]: 276 | if source.get("url", "") in analysis["sources"]: 277 | source_dict["detailed_analysis"] = analysis["analysis"] 278 | 279 | if source.get("url") in self.citation_manager.source_to_learnings: 280 | source_url = source.get("url") 281 | learning_ids = 
self.citation_manager.source_to_learnings.get(source_url, []) 282 | source_dict["tracked_learnings"] = len(learning_ids) 283 | context["learnings_by_source"][source_url] = len(learning_ids) 284 | 285 | detailed_sources.append(source_dict) 286 | 287 | citation_stats = { 288 | "total_sources": len(self.citation_manager.sources), 289 | "total_learnings": len(self.citation_manager.learnings), 290 | "source_reliability": self.citation_manager._calculate_source_reliability() 291 | } 292 | 293 | return ResearchResult( 294 | query=query, 295 | summary=final_reflection, 296 | sources=detailed_sources, 297 | subqueries=context["subqueries"], 298 | depth=depth, 299 | content_analysis=context["content_analysis"], 300 | citation_stats=citation_stats 301 | ) 302 | 303 | def research_sync( 304 | self, 305 | query: str, 306 | depth: Optional[int] = None, 307 | engines: List[str] = ["google", "duckduckgo"] 308 | ) -> ResearchResult: 309 | """Synchronous research wrapper.""" 310 | return asyncio.run(self.research(query, depth, engines)) 311 | -------------------------------------------------------------------------------- /shandu/agents/graph/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Graph building module for research LangGraph. 3 | """ 4 | from .builder import build_graph 5 | from .wrapper import create_node_wrapper 6 | 7 | __all__ = ['build_graph', 'create_node_wrapper'] -------------------------------------------------------------------------------- /shandu/agents/graph/builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Graph builder for research graph. 3 | """ 4 | from langgraph.graph import Graph, StateGraph 5 | from ..processors.content_processor import AgentState 6 | from ..utils.agent_utils import should_continue 7 | 8 | def build_graph( 9 | initialize_node, 10 | reflect_node, 11 | generate_queries_node, 12 | search_node, 13 | smart_source_selection, 14 | format_citations_node, 15 | generate_initial_report_node, 16 | enhance_report_node, 17 | expand_key_sections_node, 18 | report_node 19 | ) -> Graph: 20 | """ 21 | Build the research workflow graph with all nodes. 
22 | 23 | Args: 24 | All node functions for the research graph 25 | 26 | Returns: 27 | Compiled graph ready for execution 28 | """ 29 | workflow = StateGraph(AgentState) 30 | 31 | workflow.add_node("initialize", initialize_node) 32 | workflow.add_node("reflect", reflect_node) 33 | workflow.add_node("generate_queries", generate_queries_node) 34 | workflow.add_node("search", search_node) 35 | workflow.add_node("smart_source_selection", smart_source_selection) 36 | workflow.add_node("format_citations", format_citations_node) 37 | workflow.add_node("generate_initial_report", generate_initial_report_node) 38 | workflow.add_node("enhance_report", enhance_report_node) 39 | workflow.add_node("expand_key_sections", expand_key_sections_node) 40 | workflow.add_node("report", report_node) 41 | 42 | workflow.add_edge("initialize", "generate_queries") 43 | workflow.add_edge("reflect", "generate_queries") 44 | workflow.add_edge("generate_queries", "search") 45 | workflow.add_conditional_edges("search", should_continue, { 46 | "continue": "reflect", 47 | "end": "smart_source_selection" 48 | }) 49 | 50 | workflow.add_edge("smart_source_selection", "format_citations") 51 | workflow.add_edge("format_citations", "generate_initial_report") 52 | workflow.add_edge("generate_initial_report", "enhance_report") 53 | workflow.add_edge("enhance_report", "expand_key_sections") 54 | workflow.add_edge("expand_key_sections", "report") 55 | 56 | workflow.set_entry_point("initialize") 57 | workflow.set_finish_point("report") 58 | 59 | return workflow.compile() -------------------------------------------------------------------------------- /shandu/agents/graph/wrapper.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper functions to handle async functions in LangGraph nodes. 3 | This module provides thread-safe handling of asyncio event loops. 
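Illustrative sketch of how `create_node_wrapper` (defined below) turns an async node into a plain callable, mirroring its use in `ResearchGraph._build_graph`:

```python
import asyncio

from shandu.agents.graph import create_node_wrapper

# A toy async node: takes a state dict and returns an updated copy.
async def toy_node(state: dict) -> dict:
    await asyncio.sleep(0.1)
    return {**state, "status": "toy step done"}

sync_node = create_node_wrapper(toy_node)

# The wrapper reuses (or creates) an event loop for the current thread,
# so ordinary synchronous code can invoke the async node.
print(sync_node({"status": "starting"}))
```

When the current loop is already running, the wrapper instead hands the coroutine to a one-off worker thread with its own loop, which avoids nesting event loops.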
4 | """ 5 | import asyncio 6 | import threading 7 | from typing import Callable, Any, Awaitable, TypeVar, Dict 8 | from concurrent.futures import ThreadPoolExecutor 9 | 10 | T = TypeVar('T') 11 | 12 | # Thread-local storage for per-thread event loops 13 | _thread_local = threading.local() 14 | # Lock for thread safety when manipulating event loops 15 | _loop_lock = threading.Lock() 16 | # Track active event loops per thread 17 | _thread_loops: Dict[int, asyncio.AbstractEventLoop] = {} 18 | 19 | def get_or_create_event_loop(): 20 | """Get the event loop for the current thread or create a new one if it doesn't exist.""" 21 | thread_id = threading.get_ident() 22 | 23 | # First check thread-local storage to see if we already have a loop 24 | if hasattr(_thread_local, 'loop'): 25 | # Make sure the loop is still valid (not closed) 26 | if not _thread_local.loop.is_closed(): 27 | return _thread_local.loop 28 | 29 | # Try to get the current event loop 30 | try: 31 | loop = asyncio.get_event_loop() 32 | if not loop.is_closed(): 33 | # Store in thread local for faster access next time 34 | _thread_local.loop = loop 35 | with _loop_lock: 36 | _thread_loops[thread_id] = loop 37 | return loop 38 | except RuntimeError: 39 | # No event loop exists for this thread, or it was closed 40 | pass 41 | 42 | # Need to create a new loop 43 | with _loop_lock: 44 | # Double-check if we have a valid loop for this thread 45 | if thread_id in _thread_loops and not _thread_loops[thread_id].is_closed(): 46 | _thread_local.loop = _thread_loops[thread_id] 47 | return _thread_loops[thread_id] 48 | 49 | loop = asyncio.new_event_loop() 50 | asyncio.set_event_loop(loop) 51 | _thread_local.loop = loop 52 | _thread_loops[thread_id] = loop 53 | return loop 54 | 55 | def run_async_in_new_loop(async_fn, *args, **kwargs): 56 | """Run an async function in a new event loop in the current thread.""" 57 | loop = get_or_create_event_loop() 58 | try: 59 | return loop.run_until_complete(async_fn(*args, **kwargs)) 60 | except Exception as e: 61 | 62 | raise e 63 | 64 | def create_node_wrapper(async_fn: Callable[..., Awaitable[T]]) -> Callable[..., T]: 65 | """ 66 | Creates a wrapper that safely executes an async function, ensuring proper event loop handling 67 | across different threading scenarios. 68 | """ 69 | def wrapped_function(*args, **kwargs): 70 | # Use our reliable get_or_create_event_loop function 71 | loop = get_or_create_event_loop() 72 | 73 | if loop.is_running(): 74 | # We're in a running event loop - create a task in ThreadPoolExecutor 75 | # This approach prevents nesting of event loops 76 | with ThreadPoolExecutor(max_workers=1) as executor: 77 | future = executor.submit(run_async_in_new_loop, async_fn, *args, **kwargs) 78 | return future.result() 79 | else: 80 | # We have an event loop but it's not running, use it directly 81 | try: 82 | return loop.run_until_complete(async_fn(*args, **kwargs)) 83 | except Exception as e: 84 | # Log the error if needed 85 | from ...utils.logger import log_error 86 | log_error(f"Error in async execution", e, 87 | context=f"Function: {async_fn.__name__}") 88 | # Re-raise to maintain original behavior 89 | raise 90 | 91 | return wrapped_function 92 | -------------------------------------------------------------------------------- /shandu/agents/langgraph_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Research agent implementation using LangGraph. 
3 | """ 4 | import time 5 | import asyncio 6 | from datetime import datetime 7 | from typing import List, Dict, Optional, Any, Callable 8 | from langchain_openai import ChatOpenAI 9 | from langchain_core.messages import HumanMessage 10 | from rich.console import Console 11 | from rich.panel import Panel 12 | from ..search.search import UnifiedSearcher, SearchResult 13 | from ..scraper import WebScraper, ScrapedContent 14 | from ..research.researcher import ResearchResult 15 | from ..config import config, get_current_date 16 | from .processors import AgentState 17 | from .utils.agent_utils import ( 18 | get_user_input, 19 | clarify_query, 20 | display_research_progress, 21 | is_shutdown_requested 22 | ) 23 | from .nodes import ( 24 | initialize_node, 25 | reflect_node, 26 | generate_queries_node, 27 | search_node, 28 | smart_source_selection, 29 | format_citations_node, 30 | generate_initial_report_node, 31 | enhance_report_node, 32 | expand_key_sections_node, 33 | report_node 34 | ) 35 | from .graph import build_graph, create_node_wrapper 36 | 37 | console = Console() 38 | 39 | class ResearchGraph: 40 | """Research workflow graph implementation.""" 41 | def __init__( 42 | self, 43 | llm: Optional[ChatOpenAI] = None, 44 | searcher: Optional[UnifiedSearcher] = None, 45 | scraper: Optional[WebScraper] = None, 46 | temperature: float = 0.5, 47 | date: Optional[str] = None 48 | ): 49 | api_base = config.get("api", "base_url") 50 | api_key = config.get("api", "api_key") 51 | model = config.get("api", "model") 52 | 53 | self.llm = llm or ChatOpenAI( 54 | base_url=api_base, 55 | api_key=api_key, 56 | model=model, 57 | temperature=temperature, 58 | max_tokens=16384 # Significantly increased max tokens to support much more comprehensive responses 59 | ) 60 | self.searcher = searcher or UnifiedSearcher() 61 | self.scraper = scraper or WebScraper() 62 | self.date = date or get_current_date() 63 | self.progress_callback = None 64 | self.include_objective = False 65 | self.detail_level = "high" 66 | self.graph = self._build_graph() 67 | 68 | def _build_graph(self): 69 | """Build the research graph.""" 70 | 71 | init_node = create_node_wrapper(lambda state: initialize_node(self.llm, self.date, self.progress_callback, state)) 72 | reflect = create_node_wrapper(lambda state: reflect_node(self.llm, self.progress_callback, state)) 73 | gen_queries = create_node_wrapper(lambda state: generate_queries_node(self.llm, self.progress_callback, state)) 74 | search = create_node_wrapper(lambda state: search_node(self.llm, self.searcher, self.scraper, self.progress_callback, state)) 75 | source_selection = create_node_wrapper(lambda state: smart_source_selection(self.llm, self.progress_callback, state)) 76 | citations = create_node_wrapper(lambda state: format_citations_node(self.llm, self.progress_callback, state)) 77 | initial_report = create_node_wrapper(lambda state: generate_initial_report_node(self.llm, self.include_objective, self.progress_callback, state)) 78 | enhance = create_node_wrapper(lambda state: enhance_report_node(self.llm, self.progress_callback, state)) 79 | expand_sections = create_node_wrapper(lambda state: expand_key_sections_node(self.llm, self.progress_callback, state)) 80 | final_report = create_node_wrapper(lambda state: report_node(self.llm, self.progress_callback, state)) 81 | 82 | # Build graph with these node functions 83 | return build_graph( 84 | init_node, 85 | reflect, 86 | gen_queries, 87 | search, 88 | source_selection, 89 | citations, 90 | initial_report, 91 | enhance, 92 | 
expand_sections, 93 | final_report 94 | ) 95 | 96 | async def research( 97 | self, 98 | query: str, 99 | depth: int = 2, 100 | breadth: int = 4, 101 | progress_callback: Optional[Callable[[AgentState], None]] = None, 102 | include_objective: bool = False, 103 | detail_level: str = "high" 104 | ) -> ResearchResult: 105 | """Execute research process on a query.""" 106 | self.progress_callback = progress_callback 107 | self.include_objective = include_objective 108 | self.detail_level = detail_level 109 | 110 | depth = max(1, min(5, depth)) # Ensure depth is between 1 and 5 111 | breadth = max(1, min(10, breadth)) # Ensure breadth is between 1 and 10 112 | 113 | state = AgentState( 114 | messages=[HumanMessage(content=f"Starting research on: {query}")], 115 | query=query, 116 | depth=depth, 117 | breadth=breadth, 118 | current_depth=0, 119 | findings="", 120 | sources=[], 121 | selected_sources=[], 122 | formatted_citations="", 123 | subqueries=[], 124 | content_analysis=[], 125 | start_time=time.time(), 126 | chain_of_thought=[], 127 | status="Starting", 128 | current_date=get_current_date(), 129 | detail_level=detail_level, 130 | identified_themes="", 131 | initial_report="", 132 | enhanced_report="", 133 | final_report="" 134 | ) 135 | 136 | try: 137 | # Invoke the graph with increased recursion limit 138 | final_state = await self.graph.ainvoke(state, {"recursion_limit": 50}) 139 | 140 | elapsed_time = time.time() - final_state["start_time"] 141 | minutes, seconds = divmod(int(elapsed_time), 60) 142 | 143 | return ResearchResult( 144 | query=query, 145 | summary=final_state["findings"], 146 | sources=final_state["sources"], 147 | subqueries=final_state["subqueries"], 148 | depth=depth, 149 | content_analysis=final_state["content_analysis"], 150 | chain_of_thought=final_state["chain_of_thought"], 151 | research_stats={ 152 | "elapsed_time": elapsed_time, 153 | "elapsed_time_formatted": f"{minutes}m {seconds}s", 154 | "sources_count": len(final_state["sources"]), 155 | "subqueries_count": len(final_state["subqueries"]), 156 | "depth": depth, 157 | "breadth": breadth, 158 | "detail_level": detail_level 159 | } 160 | ) 161 | except KeyboardInterrupt: 162 | console.print("\n[yellow]Research interrupted by user. 
Generating report with current findings...[/]") 163 | 164 | elapsed_time = time.time() - state["start_time"] 165 | minutes, seconds = divmod(int(elapsed_time), 60) 166 | 167 | return ResearchResult( 168 | query=query, 169 | summary=state["findings"] + "\n\n*Note: Research was interrupted before completion.*", 170 | sources=state["sources"], 171 | subqueries=state["subqueries"], 172 | depth=state["current_depth"], 173 | content_analysis=state["content_analysis"], 174 | chain_of_thought=state["chain_of_thought"], 175 | research_stats={ 176 | "elapsed_time": elapsed_time, 177 | "elapsed_time_formatted": f"{minutes}m {seconds}s", 178 | "sources_count": len(state["sources"]), 179 | "subqueries_count": len(state["subqueries"]), 180 | "depth": state["current_depth"], 181 | "breadth": breadth, 182 | "detail_level": detail_level, 183 | "interrupted": True 184 | } 185 | ) 186 | 187 | def research_sync( 188 | self, 189 | query: str, 190 | depth: int = 2, 191 | breadth: int = 4, 192 | progress_callback: Optional[Callable[[AgentState], None]] = None, 193 | include_objective: bool = False, 194 | detail_level: str = "high" 195 | ) -> ResearchResult: 196 | """Synchronous wrapper for research.""" 197 | try: 198 | return asyncio.run(self.research(query, depth, breadth, progress_callback, include_objective, detail_level)) 199 | except KeyboardInterrupt: 200 | console.print("\n[yellow]Research interrupted by user.[/]") 201 | raise 202 | -------------------------------------------------------------------------------- /shandu/agents/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Node functions for the research graph workflow. 3 | Each node represents a discrete step in the research process. 4 | """ 5 | from .initialize import initialize_node 6 | from .reflect import reflect_node 7 | from .generate_queries import generate_queries_node 8 | from .search import search_node 9 | from .source_selection import smart_source_selection 10 | from .citations import format_citations_node 11 | from .report_generation import ( 12 | generate_initial_report_node, 13 | enhance_report_node, 14 | expand_key_sections_node, 15 | report_node 16 | ) 17 | 18 | __all__ = [ 19 | 'initialize_node', 20 | 'reflect_node', 21 | 'generate_queries_node', 22 | 'search_node', 23 | 'smart_source_selection', 24 | 'format_citations_node', 25 | 'generate_initial_report_node', 26 | 'enhance_report_node', 27 | 'expand_key_sections_node', 28 | 'report_node' 29 | ] -------------------------------------------------------------------------------- /shandu/agents/nodes/citations.py: -------------------------------------------------------------------------------- 1 | """ 2 | Citation formatting node for research graph with advanced tracking capabilities. 
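A small sketch of the citation plumbing this node relies on; field names mirror how `SourceInfo` is constructed in the node below, and the snippet is illustrative rather than the canonical API:

```python
import time

from shandu.agents.utils.citation_manager import CitationManager, SourceInfo

manager = CitationManager()
source = SourceInfo(
    url="https://example.org/battery-report",
    title="Example battery report",
    snippet="Overview of solid-state cell chemistry.",
    source_type="web",
    content_type="article",
    access_time=time.time(),
    domain="example.org",
    reliability_score=0.8,
    metadata={},
)
manager.add_source(source)

# The legacy registry is kept in sync so reports can use numbered citations.
citation_id = manager.citation_registry.register_citation(source.url)
print(citation_id)
```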
3 | """ 4 | import time 5 | from rich.console import Console 6 | from pydantic import BaseModel, Field 7 | from ..processors.content_processor import AgentState 8 | from ..processors.report_generator import format_citations 9 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 10 | from ..utils.citation_manager import CitationManager, SourceInfo 11 | from ..utils.citation_registry import CitationRegistry 12 | 13 | console = Console() 14 | 15 | class FormattedCitations(BaseModel): 16 | """Structured output for formatted citations.""" 17 | citations: list[str] = Field( 18 | description="List of properly formatted citations", 19 | min_items=1 20 | ) 21 | 22 | async def format_citations_node(llm, progress_callback, state: AgentState) -> AgentState: 23 | """ 24 | Format citations for selected sources to ensure consistent referencing. 25 | 26 | This enhanced version uses the new CitationManager to track relationships 27 | between sources and specific learnings from them. 28 | """ 29 | state["status"] = "Processing and formatting citations" 30 | console.print("[bold blue]Processing source citations with enhanced attribution...[/]") 31 | 32 | selected_urls = state["selected_sources"] 33 | if not selected_urls: 34 | log_chain_of_thought(state, "No sources selected for citations") 35 | return state 36 | 37 | if "citation_manager" not in state: 38 | state["citation_manager"] = CitationManager() 39 | # For backward compatibility 40 | state["citation_registry"] = state["citation_manager"].citation_registry 41 | 42 | citation_manager = state["citation_manager"] 43 | 44 | # Register each source with the citation manager 45 | for url in selected_urls: 46 | 47 | source_meta = next((s for s in state["sources"] if s.get("url") == url), {}) 48 | 49 | source_info = SourceInfo( 50 | url=url, 51 | title=source_meta.get("title", ""), 52 | snippet=source_meta.get("snippet", ""), 53 | source_type="web", 54 | content_type=source_meta.get("content_type", "article"), 55 | access_time=time.time(), 56 | domain=url.split("//")[1].split("/")[0] if "//" in url else "unknown", 57 | reliability_score=0.8, # Default score, could be more dynamic 58 | metadata=source_meta 59 | ) 60 | 61 | citation_manager.add_source(source_info) 62 | 63 | # For backward compatibility, also register with citation registry 64 | citation_id = citation_manager.citation_registry.register_citation(url) 65 | citation_manager.citation_registry.update_citation_metadata(citation_id, { 66 | "title": source_meta.get("title", ""), 67 | "url": url, 68 | "snippet": source_meta.get("snippet", ""), 69 | "source": source_meta.get("source", "") 70 | }) 71 | 72 | formatted_citations = await format_citations( 73 | llm, 74 | selected_urls, 75 | state["sources"], 76 | citation_registry=citation_manager.citation_registry 77 | ) 78 | 79 | state["formatted_citations"] = formatted_citations 80 | log_chain_of_thought(state, f"Processed and formatted citations for {len(selected_urls)} sources") 81 | 82 | if progress_callback: 83 | await _call_progress_callback(progress_callback, state) 84 | return state 85 | -------------------------------------------------------------------------------- /shandu/agents/nodes/generate_queries.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query generation node for research graph. 
3 | """ 4 | import os 5 | import re 6 | from rich.console import Console 7 | from langchain_core.messages import AIMessage, HumanMessage 8 | from langchain_core.prompts import ChatPromptTemplate 9 | from pydantic import BaseModel, Field 10 | from ..processors.content_processor import AgentState 11 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 12 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 13 | 14 | console = Console() 15 | 16 | # Structured output model for query generation 17 | class SearchQueries(BaseModel): 18 | """Structured output for search query generation.""" 19 | queries: list[str] = Field( 20 | description="List of search queries to investigate the topic further", 21 | min_items=1 22 | ) 23 | rationale: str = Field( 24 | description="Explanation of why these queries were selected and how they will help the research" 25 | ) 26 | 27 | async def generate_queries_node(llm, progress_callback, state: AgentState) -> AgentState: 28 | """Generate targeted search queries based on current findings using structured output.""" 29 | state["status"] = "Generating research queries" 30 | console.print("[bold yellow]Generating targeted search queries...[/]") 31 | 32 | try: 33 | # Use a completely direct approach to avoid template issues 34 | direct_prompt = f"""Generate {state['breadth']} specific search queries to investigate the topic: 35 | 36 | Main Query: {state['query']} 37 | 38 | Requirements: 39 | 1. Generate exactly {state['breadth']} search queries 40 | 2. Queries should be natural and conversational (like what someone would type in Google) 41 | 3. Each query should target specific facts, data points, or perspectives 42 | 4. Keep queries direct and concise - avoid complex academic phrasing 43 | 44 | Today's date: {state['current_date']} 45 | 46 | Current Research Findings: 47 | {state['findings'][:2000]} 48 | 49 | Return ONLY the search queries themselves, one per line, with no additional text, numbering, or explanation. 50 | """ 51 | # Send the prompt directly to the model 52 | response = await llm.ainvoke(direct_prompt) 53 | 54 | new_queries = [line.strip() for line in response.content.split("\n") if line.strip()] 55 | # Remove any numbering, bullet points, or other formatting 56 | new_queries = [re.sub(r'^[\d\s\-\*•\.\)]+\s*', '', line).strip() for line in new_queries] 57 | # Remove phrases like "Here are...", "I'll search for..." etc. 58 | new_queries = [re.sub(r'^(here are|i will|i\'ll|let me|these are|i recommend|completed:|search for:).*?:', '', line, flags=re.IGNORECASE).strip() for line in new_queries] 59 | # Filter out any empty lines or lines that don't look like actual queries 60 | new_queries = [q for q in new_queries if q and len(q.split()) >= 2 and not q.lower().startswith(("query", "search", "investigate", "explore", "research"))] 61 | # Limit to the specified breadth 62 | new_queries = new_queries[:state["breadth"]] 63 | 64 | log_chain_of_thought(state, f"Generated {len(new_queries)} search queries for investigation") 65 | 66 | except Exception as e: 67 | from ...utils.logger import log_error 68 | log_error("Error in structured query generation", e, 69 | context=f"Query: {state['query']}, Function: generate_queries_node") 70 | console.print(f"[dim red]Error in structured query generation: {str(e)}. Using simpler approach.[/dim red]") 71 | try: 72 | # Even simpler fallback approach 73 | response = await llm.ainvoke(f"Generate {state['breadth']} simple search queries for {state['query']}. 
Return only the queries, one per line.") 74 | 75 | new_queries = [line.strip() for line in response.content.split("\n") if line.strip()] 76 | # Remove any numbering, bullet points, or other formatting 77 | new_queries = [re.sub(r'^[\d\s\-\*•\.\)]+\s*', '', line).strip() for line in new_queries] 78 | # Remove phrases like "Here are...", "I'll search for..." etc. 79 | new_queries = [re.sub(r'^(here are|i will|i\'ll|let me|these are|i recommend|completed:|search for:).*?:', '', line, flags=re.IGNORECASE).strip() for line in new_queries] 80 | # Filter out any empty lines or lines that don't look like actual queries 81 | new_queries = [q for q in new_queries if q and len(q.split()) >= 2 and not q.lower().startswith(("query", "search", "investigate", "explore", "research"))] 82 | # Limit to the specified breadth 83 | new_queries = new_queries[:state["breadth"]] 84 | except Exception as e2: 85 | console.print(f"[dim red]Error in fallback query generation: {str(e2)}. Using default queries.[/dim red]") 86 | 87 | new_queries = [ 88 | f"{state['query']} latest research", 89 | f"{state['query']} examples", 90 | f"{state['query']} applications" 91 | ][:state["breadth"]] 92 | 93 | if not new_queries and state["query"]: 94 | new_queries = [state["query"]] 95 | 96 | state["messages"].append(HumanMessage(content="Generating new research directions...")) 97 | state["messages"].append(AIMessage(content="Generated queries:\n" + "\n".join(new_queries))) 98 | state["subqueries"].extend(new_queries) 99 | 100 | console.print("[bold green]Generated search queries:[/]") 101 | for i, query in enumerate(new_queries, 1): 102 | console.print(f" {i}. {query}") 103 | 104 | log_chain_of_thought(state, f"Generated {len(new_queries)} search queries for investigation") 105 | if progress_callback: 106 | await _call_progress_callback(progress_callback, state) 107 | return state 108 | -------------------------------------------------------------------------------- /shandu/agents/nodes/initialize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Initialize node for research graph. 
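The node asks for a free-form markdown plan and then pulls each section out with regexes; a condensed sketch of that extraction (pattern copied from the code below):

```python
import re

plan_text = "\n".join([
    "## Objectives",
    "- Map the current landscape",
    "- Identify open problems",
    "## Methodology",
    "Systematic review of recent literature.",
])

match = re.search(
    r'(?:objectives|goals|aims)(?:\s*:|\s*\n)([^#]*?)(?:#|$)',
    plan_text.lower(), re.IGNORECASE | re.DOTALL,
)
objectives = []
if match:
    objectives = [line.strip().strip('-*').strip()
                  for line in match.group(1).strip().split('\n')
                  if line.strip() and not line.strip().startswith('#')]
print(objectives)  # ['map the current landscape', 'identify open problems']
```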
3 | """ 4 | import os 5 | import time 6 | from rich.console import Console 7 | from rich.panel import Panel 8 | from langchain_core.messages import AIMessage, HumanMessage 9 | from langchain_core.prompts import ChatPromptTemplate 10 | from pydantic import BaseModel, Field 11 | from ..processors.content_processor import AgentState 12 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 13 | from ...config import get_current_date 14 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 15 | 16 | console = Console() 17 | 18 | class ResearchPlan(BaseModel): 19 | """Structured output for research plan.""" 20 | objectives: list[str] = Field( 21 | description="Clear objectives for the research", 22 | min_items=1 23 | ) 24 | key_areas: list[str] = Field( 25 | description="Key areas to investigate", 26 | min_items=1 27 | ) 28 | methodology: str = Field( 29 | description="Approach to conducting the research" 30 | ) 31 | expected_outcomes: list[str] = Field( 32 | description="Expected outcomes of the research", 33 | min_items=1 34 | ) 35 | 36 | async def initialize_node(llm, date, progress_callback, state: AgentState) -> AgentState: 37 | """Initialize the research process with a research plan using structured output.""" 38 | console.print(Panel(f"[bold blue]Starting Research:[/] {state['query']}", title="Research Process", border_style="blue")) 39 | state["start_time"] = time.time() 40 | state["status"] = "Initializing research" 41 | state["current_date"] = date or get_current_date() 42 | 43 | try: 44 | # Use a completely direct approach to avoid template issues 45 | direct_prompt = f"""You are an expert research agent tasked with creating a comprehensive research plan. Current date: {state['current_date']} 46 | 47 | Please create a detailed research plan for this query: {state['query']} 48 | 49 | Your plan must include the following sections clearly labeled: 50 | 51 | ## Objectives 52 | - List 3-5 clear objectives for the research 53 | 54 | ## Key Areas to Investigate 55 | - List 4-6 specific areas or aspects that need to be researched 56 | 57 | ## Methodology 58 | - Describe the approach to conducting this research 59 | - Include information sources and analysis methods 60 | 61 | ## Expected Outcomes 62 | - List 3-5 expected results or deliverables from this research 63 | 64 | Format your response with clear section headings and bullet points for clarity. Be specific and detailed in your planning. 
65 | """ 66 | # Send the direct prompt to the model 67 | response = await llm.ainvoke(direct_prompt) 68 | 69 | research_text = response.content 70 | 71 | import re 72 | objectives = [] 73 | key_areas = [] 74 | methodology = "" 75 | expected_outcomes = [] 76 | 77 | objectives_section = re.search(r'(?:objectives|goals|aims)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 78 | if objectives_section: 79 | objectives_text = objectives_section.group(1).strip() 80 | objectives = [line.strip().strip('-*').strip() for line in objectives_text.split('\n') if line.strip() and not line.strip().startswith('#')] 81 | 82 | areas_section = re.search(r'(?:key areas|areas to investigate|investigation areas)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 83 | if areas_section: 84 | areas_text = areas_section.group(1).strip() 85 | key_areas = [line.strip().strip('-*').strip() for line in areas_text.split('\n') if line.strip() and not line.strip().startswith('#')] 86 | 87 | methodology_section = re.search(r'(?:methodology|approach|method)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 88 | if methodology_section: 89 | methodology = methodology_section.group(1).strip() 90 | 91 | outcomes_section = re.search(r'(?:expected outcomes|outcomes|results|expected results)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', research_text.lower(), re.IGNORECASE | re.DOTALL) 92 | if outcomes_section: 93 | outcomes_text = outcomes_section.group(1).strip() 94 | expected_outcomes = [line.strip().strip('-*').strip() for line in outcomes_text.split('\n') if line.strip() and not line.strip().startswith('#')] 95 | 96 | if not objectives: 97 | objectives = ["Understand the key aspects of " + state['query']] 98 | if not key_areas: 99 | key_areas = ["Primary concepts and definitions", "Current applications and examples", "Future trends and developments"] 100 | if not methodology: 101 | methodology = "Systematic review of available literature and analysis of current applications and examples." 102 | if not expected_outcomes: 103 | expected_outcomes = ["Comprehensive understanding of " + state['query'], "Identification of key challenges and opportunities"] 104 | 105 | formatted_plan = "# Research Plan\n\n" 106 | 107 | formatted_plan += "## Objectives\n\n" 108 | for objective in objectives: 109 | formatted_plan += f"- {objective}\n" 110 | 111 | formatted_plan += "\n## Key Areas to Investigate\n\n" 112 | for area in key_areas: 113 | formatted_plan += f"- {area}\n" 114 | 115 | formatted_plan += f"\n## Methodology\n\n{methodology}\n" 116 | 117 | formatted_plan += "\n## Expected Outcomes\n\n" 118 | for outcome in expected_outcomes: 119 | formatted_plan += f"- {outcome}\n" 120 | 121 | state["messages"].append(HumanMessage(content=f"Planning research on: {state['query']}")) 122 | state["messages"].append(AIMessage(content=formatted_plan)) 123 | state["findings"] = f"{formatted_plan}\n\n# Initial Findings\n\n" 124 | 125 | except Exception as e: 126 | from ...utils.logger import log_error 127 | log_error("Error in structured plan generation", e, 128 | context=f"Query: {state['query']}, Function: initialize_node") 129 | console.print(f"[dim red]Error in structured plan generation: {str(e)}. Using simpler approach.[/dim red]") 130 | try: 131 | # Even simpler fallback approach 132 | response = await llm.ainvoke(f"""Create a research plan for: {state['query']} 133 | 134 | Include: 135 | 1. Main objectives 136 | 2. Key areas to investigate 137 | 3. 
Approach/methodology 138 | 4. Expected outcomes 139 | 140 | Keep it concise and practical. 141 | """) 142 | 143 | cleaned_plan = response.content.replace("**", "").replace("# ", "").replace("## ", "") 144 | 145 | state["messages"].append(HumanMessage(content=f"Planning research on: {state['query']}")) 146 | state["messages"].append(AIMessage(content=cleaned_plan)) 147 | state["findings"] = f"# Research Plan\n\n{cleaned_plan}\n\n# Initial Findings\n\n" 148 | except Exception as e2: 149 | console.print(f"[dim red]Error in fallback plan generation: {str(e2)}. Using minimal plan.[/dim red]") 150 | 151 | minimal_plan = f"Research plan for: {state['query']}\n\n- Investigate key aspects\n- Analyze relevant sources\n- Synthesize findings" 152 | 153 | state["messages"].append(HumanMessage(content=f"Planning research on: {state['query']}")) 154 | state["messages"].append(AIMessage(content=minimal_plan)) 155 | state["findings"] = f"# Research Plan\n\n{minimal_plan}\n\n# Initial Findings\n\n" 156 | 157 | log_chain_of_thought(state, f"Created research plan for query: {state['query']}") 158 | if progress_callback: 159 | await _call_progress_callback(progress_callback, state) 160 | return state 161 | -------------------------------------------------------------------------------- /shandu/agents/nodes/reflect.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reflection node for research graph. 3 | """ 4 | import os 5 | from rich.console import Console 6 | from langchain_core.messages import AIMessage, HumanMessage 7 | from langchain_core.prompts import ChatPromptTemplate 8 | from pydantic import BaseModel, Field 9 | from ..processors.content_processor import AgentState 10 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 11 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS, safe_format 12 | 13 | console = Console() 14 | 15 | # Structured output model for reflection 16 | class ResearchReflection(BaseModel): 17 | """Structured output for research reflection.""" 18 | key_insights: list[str] = Field( 19 | description="Key insights gained from the research so far", 20 | min_items=1 21 | ) 22 | knowledge_gaps: list[str] = Field( 23 | description="Identified gaps in the current research", 24 | min_items=1 25 | ) 26 | next_steps: list[str] = Field( 27 | description="Recommended next steps for the research", 28 | min_items=1 29 | ) 30 | reflection_summary: str = Field( 31 | description="Overall reflection on the current state of the research" 32 | ) 33 | 34 | async def reflect_node(llm, progress_callback, state: AgentState) -> AgentState: 35 | """Reflect on current findings to identify gaps and opportunities using structured output.""" 36 | state["status"] = "Reflecting on findings" 37 | console.print("[bold yellow]Reflecting on current findings...[/]") 38 | 39 | try: 40 | # Use safe_format instead of manual escaping 41 | current_date = state['current_date'] 42 | findings = state['findings'][:3000] 43 | 44 | direct_prompt = safe_format("""Analyze the following research findings and provide a detailed reflection. 
Today's date: {current_date} 45 | 46 | Research Findings: 47 | {findings} 48 | 49 | Your reflection must include these sections clearly labeled: 50 | 51 | ## Key Insights 52 | - List the most important discoveries and insights from the research 53 | - Evaluate the evidence strength for each insight 54 | 55 | ## Knowledge Gaps 56 | - Identify specific questions that remain unanswered 57 | - Explain why these gaps are significant 58 | 59 | ## Next Steps 60 | - Suggest specific areas for deeper investigation 61 | - Recommend research methods to address the knowledge gaps 62 | 63 | ## Overall Reflection 64 | - Provide a comprehensive assessment of the research progress 65 | - Evaluate the overall quality and reliability of the findings 66 | 67 | Format your response with clear section headings and bullet points for clarity.""", current_date=current_date, findings=findings) 68 | # Send the prompt directly to the model 69 | response = await llm.ainvoke(direct_prompt) 70 | 71 | reflection_text = response.content 72 | 73 | import re 74 | key_insights = [] 75 | knowledge_gaps = [] 76 | next_steps = [] 77 | reflection_summary = "" 78 | 79 | insights_section = re.search(r'(?:key insights|insights|key findings)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 80 | if insights_section: 81 | insights_text = insights_section.group(1).strip() 82 | key_insights = [line.strip().strip('-*').strip() for line in insights_text.split('\n') if line.strip() and not line.strip().startswith('#')] 83 | 84 | gaps_section = re.search(r'(?:knowledge gaps|gaps|questions|unanswered questions)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 85 | if gaps_section: 86 | gaps_text = gaps_section.group(1).strip() 87 | knowledge_gaps = [line.strip().strip('-*').strip() for line in gaps_text.split('\n') if line.strip() and not line.strip().startswith('#')] 88 | 89 | steps_section = re.search(r'(?:next steps|steps|recommendations|future directions)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 90 | if steps_section: 91 | steps_text = steps_section.group(1).strip() 92 | next_steps = [line.strip().strip('-*').strip() for line in steps_text.split('\n') if line.strip() and not line.strip().startswith('#')] 93 | 94 | summary_section = re.search(r'(?:overall reflection|reflection summary|summary|conclusion)(?:\s*:|\s*\n)([^#]*?)(?:#|$)', reflection_text.lower(), re.IGNORECASE | re.DOTALL) 95 | if summary_section: 96 | reflection_summary = summary_section.group(1).strip() 97 | 98 | if not key_insights: 99 | key_insights = ["Research is progressing on " + state['query']] 100 | if not knowledge_gaps: 101 | knowledge_gaps = ["Further details needed on specific aspects"] 102 | if not next_steps: 103 | next_steps = ["Continue investigating primary aspects", "Search for more specific examples"] 104 | if not reflection_summary: 105 | reflection_summary = "The research is making progress and has uncovered valuable information, but further investigation is needed in key areas." 
106 | 107 | formatted_reflection = "## Key Insights\n\n" 108 | for insight in key_insights: 109 | formatted_reflection += f"- {insight}\n" 110 | 111 | formatted_reflection += "\n## Knowledge Gaps\n\n" 112 | for gap in knowledge_gaps: 113 | formatted_reflection += f"- {gap}\n" 114 | 115 | formatted_reflection += "\n## Next Steps\n\n" 116 | for step in next_steps: 117 | formatted_reflection += f"- {step}\n" 118 | 119 | formatted_reflection += f"\n## Overall Reflection\n\n{reflection_summary}\n" 120 | 121 | state["messages"].append(HumanMessage(content="Analyzing current findings...")) 122 | state["messages"].append(AIMessage(content=formatted_reflection)) 123 | state["findings"] += f"\n\n## Reflection on Current Findings\n\n{formatted_reflection}\n\n" 124 | 125 | except Exception as e: 126 | from ...utils.logger import log_error 127 | log_error("Error in structured reflection", e, 128 | context=f"Function: reflect_node") 129 | console.print(f"[dim red]Error in structured reflection: {str(e)}. Using simpler approach.[/dim red]") 130 | try: 131 | # Use safe_format in the fallback case too 132 | fallback_findings = state['findings'][:2000] 133 | 134 | fallback_prompt = safe_format("""Reflect on these research findings: 135 | 136 | {findings} 137 | 138 | Include: 139 | 1. Key insights 140 | 2. Knowledge gaps 141 | 3. Next steps 142 | 4. Overall assessment 143 | """, findings=fallback_findings) 144 | 145 | response = await llm.ainvoke(fallback_prompt) 146 | 147 | reflection_content = response.content 148 | 149 | state["messages"].append(HumanMessage(content="Analyzing current findings...")) 150 | state["messages"].append(AIMessage(content=reflection_content)) 151 | state["findings"] += f"\n\n## Reflection on Current Findings\n\n{reflection_content}\n\n" 152 | except Exception as e2: 153 | console.print(f"[dim red]Error in fallback reflection: {str(e2)}. Using minimal reflection.[/dim red]") 154 | 155 | minimal_reflection = "## Research Reflection\n\nThe research is progressing. Further investigation is needed to develop a more comprehensive understanding of the topic." 156 | 157 | state["messages"].append(HumanMessage(content="Analyzing current findings...")) 158 | state["messages"].append(AIMessage(content=minimal_reflection)) 159 | state["findings"] += f"\n\n## Reflection on Current Findings\n\n{minimal_reflection}\n\n" 160 | 161 | log_chain_of_thought(state, "Completed reflection on current findings") 162 | if progress_callback: 163 | await _call_progress_callback(progress_callback, state) 164 | return state 165 | -------------------------------------------------------------------------------- /shandu/agents/nodes/search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search node for research graph. 
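The node fans out one task per recent sub-query and gathers them concurrently; a toy, self-contained sketch of that pattern (the real `process_query` performs search, relevance filtering, scraping, and analysis):

```python
import asyncio

async def process_query(query: str, idx: int) -> str:
    # Stand-in for: search -> relevance filter -> scrape -> analyze.
    await asyncio.sleep(0.1)
    return f"analysis for {query!r} (task {idx})"

async def run_search_round(queries: list[str]) -> list[str]:
    tasks = [process_query(q, i) for i, q in enumerate(queries)]
    # All sub-queries are processed concurrently, as in search_node below.
    return await asyncio.gather(*tasks)

print(asyncio.run(run_search_round(["perovskite solar cells", "grid-scale storage"])))
```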
3 | """ 4 | import asyncio 5 | import time 6 | import random 7 | import logging 8 | from typing import List, Dict, Any, Optional, Set 9 | from concurrent.futures import ThreadPoolExecutor 10 | from rich.console import Console 11 | from langchain_core.messages import AIMessage, HumanMessage 12 | from langchain_core.prompts import ChatPromptTemplate 13 | from pydantic import BaseModel, Field 14 | from ..processors.content_processor import AgentState, is_relevant_url, process_scraped_item, analyze_content 15 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback, is_shutdown_requested 16 | from ...search.search import SearchResult 17 | 18 | console = Console() 19 | 20 | # Structured output model for search results 21 | class SearchResultAnalysis(BaseModel): 22 | """Structured output for search result analysis.""" 23 | relevant_urls: list[str] = Field( 24 | description="List of URLs that are relevant to the query", 25 | min_items=0 26 | ) 27 | analysis: str = Field( 28 | description="Analysis of the search results" 29 | ) 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | async def search_node(llm, searcher, scraper, progress_callback, state: AgentState) -> AgentState: 34 | """ 35 | Search for information based on the current subqueries. 36 | 37 | Args: 38 | llm: Language model to use 39 | searcher: Search engine to use 40 | scraper: Web scraper to use 41 | progress_callback: Callback function for progress updates 42 | state: Current agent state 43 | 44 | Returns: 45 | Updated agent state 46 | """ 47 | if is_shutdown_requested(): 48 | state["status"] = "Shutdown requested, skipping search" 49 | log_chain_of_thought(state, "Shutdown requested, skipping search") 50 | return state 51 | 52 | state["status"] = f"Searching for information (Depth {state['current_depth']})" 53 | 54 | breadth = state["breadth"] 55 | if len(state["subqueries"]) > 0: 56 | recent_queries = state["subqueries"][-breadth:] 57 | else: 58 | recent_queries = [state["query"]] 59 | 60 | async def process_query(query, query_idx): 61 | if is_shutdown_requested(): 62 | log_chain_of_thought(state, f"Shutdown requested, stopping search after {query_idx} queries") 63 | return 64 | 65 | logger.info(f"Processing query {query_idx+1}/{len(recent_queries)}: {query}") 66 | console.print(f"Executing search for: {query}") 67 | state["status"] = f"Searching for: {query}" 68 | 69 | # Search for the query using multiple engines for better results 70 | try: 71 | # Use multiple engines in parallel for more diverse results 72 | engines = ["google", "duckduckgo"] # Using primary engines 73 | if query_idx % 2 == 0: # Add Wikipedia for every other query 74 | engines.append("wikipedia") 75 | 76 | search_results = await searcher.search(query, engines=engines) 77 | if not search_results: 78 | logger.warning(f"No search results found for: {query}") 79 | log_chain_of_thought(state, f"No search results found for '{query}'") 80 | return 81 | 82 | except Exception as e: 83 | console.print(f"[red]Error during search: {e}[/]") 84 | log_chain_of_thought(state, f"Error during search for '{query}': {str(e)}") 85 | return 86 | 87 | # Filter relevant URLs in batches to avoid overwhelming the LLM 88 | relevant_urls = [] 89 | url_batches = [search_results[i:i+10] for i in range(0, len(search_results), 10)] 90 | 91 | for batch in url_batches: 92 | if is_shutdown_requested(): 93 | break 94 | 95 | relevance_tasks = [] 96 | for result in batch: 97 | relevance_task = is_relevant_url(llm, result.url, result.title, result.snippet, query) 98 | 
relevance_tasks.append((result, relevance_task)) 99 | 100 | # Wait for all relevance checks in this batch 101 | for result, relevance_task in relevance_tasks: 102 | try: 103 | is_relevant = await relevance_task 104 | if is_relevant: 105 | relevant_urls.append(result) 106 | 107 | state["sources"].append({ 108 | "url": result.url, 109 | "title": result.title, 110 | "snippet": result.snippet, 111 | "source": result.source, 112 | "query": query 113 | }) 114 | except Exception as e: 115 | logger.error(f"Error checking relevance for {result.url}: {e}") 116 | 117 | if not relevant_urls: 118 | log_chain_of_thought(state, f"No relevant URLs found for '{query}'") 119 | return 120 | 121 | # Limit the number of URLs to scrape for efficiency 122 | # Group the candidates by their originating search engine (sort on the source field) 123 | # before truncating to the top N 124 | relevant_urls.sort(key=lambda r: r.source) 125 | relevant_urls = relevant_urls[:8] # Increased from 5 to 8 for better coverage 126 | 127 | # Scrape the relevant URLs all at once using our improved scraper 128 | urls_to_scrape = [result.url for result in relevant_urls] 129 | 130 | # The new scraper implementation handles concurrency internally 131 | # It will use semaphores to limit concurrent scraping and handle timeouts 132 | try: 133 | scraped_contents = await scraper.scrape_urls( 134 | urls_to_scrape, 135 | dynamic=False, # Avoid dynamic rendering for speed unless specifically needed 136 | force_refresh=False # Use caching if available 137 | ) 138 | except Exception as e: 139 | logger.error(f"Error scraping URLs for query '{query}': {e}") 140 | log_chain_of_thought(state, f"Error scraping URLs for query '{query}': {str(e)}") 141 | return 142 | 143 | processed_items = [] 144 | successful_scrapes = [item for item in scraped_contents if item.is_successful()] 145 | 146 | for item in successful_scrapes: 147 | if is_shutdown_requested(): 148 | break 149 | 150 | logger.info(f"Processing scraped content from: {item.url}") 151 | content_preview = item.text[:100] + "..." 
if len(item.text) > 100 else item.text 152 | logger.debug(f"Content preview: {content_preview}") 153 | 154 | processed_item = await process_scraped_item(llm, item, query, item.text) 155 | processed_items.append(processed_item) 156 | 157 | if not processed_items: 158 | log_chain_of_thought(state, f"No content could be extracted from URLs for '{query}'") 159 | return 160 | 161 | # Prepare content for analysis in a structured way 162 | combined_content = "" 163 | for item in processed_items: 164 | 165 | combined_content += f"\n\n## SOURCE: {item['item'].url}\n" 166 | combined_content += f"## TITLE: {item['item'].title or 'No title'}\n" 167 | combined_content += f"## RELIABILITY: {item['rating']}\n" 168 | combined_content += f"## CONTENT START\n{item['content']}\n## CONTENT END\n" 169 | 170 | analysis = await analyze_content(llm, query, combined_content) 171 | 172 | state["content_analysis"].append({ 173 | "query": query, 174 | "sources": [item["item"].url for item in processed_items], 175 | "analysis": analysis 176 | }) 177 | 178 | state["findings"] += f"\n\n## Analysis for: {query}\n\n{analysis}\n\n" 179 | 180 | log_chain_of_thought(state, f"Analyzed content for query: {query}") 181 | if progress_callback: 182 | await _call_progress_callback(progress_callback, state) 183 | 184 | tasks = [] 185 | for idx, query in enumerate(recent_queries): 186 | tasks.append(process_query(query, idx)) 187 | 188 | # Use gather to process all queries concurrently but with proper control 189 | await asyncio.gather(*tasks) 190 | 191 | state["current_depth"] += 1 192 | log_chain_of_thought(state, f"Completed depth {state['current_depth']} of {state['depth']}") 193 | 194 | if progress_callback and state.get("status") != "Searching": 195 | state["status"] = "Searching completed" 196 | await _call_progress_callback(progress_callback, state) 197 | 198 | return state 199 | -------------------------------------------------------------------------------- /shandu/agents/nodes/source_selection.py: -------------------------------------------------------------------------------- 1 | """Source selection node with robust error handling and retry logic.""" 2 | import os 3 | import re 4 | import time 5 | import asyncio 6 | import random 7 | from typing import List, Dict, Any, Optional 8 | from rich.console import Console 9 | from rich.progress import Progress, SpinnerColumn, TextColumn 10 | from langchain_core.prompts import ChatPromptTemplate 11 | from pydantic import BaseModel, Field 12 | from ..processors.content_processor import AgentState 13 | from ..utils.agent_utils import log_chain_of_thought, _call_progress_callback 14 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 15 | 16 | console = Console() 17 | 18 | # Maximum retry attempts for source selection 19 | MAX_RETRIES = 3 20 | 21 | # Structured output model for source selection 22 | class SourceSelection(BaseModel): 23 | """Structured output for source selection.""" 24 | selected_sources: list[str] = Field( 25 | description="List of URLs for the most valuable sources to include in the report", 26 | min_items=1 27 | ) 28 | selection_rationale: str = Field( 29 | description="Explanation of why these sources were selected" 30 | ) 31 | 32 | # Exponential backoff function for retries 33 | async def backoff_retry(attempt: int) -> None: 34 | """Simple exponential backoff.""" 35 | if attempt > 0: 36 | # Exponential backoff with jitter to avoid thundering herd 37 | delay = min(30, (2 ** attempt) + (random.random() * 0.5)) 38 | console.print(f"[yellow]Backing off for 
{delay:.1f} seconds before retry...[/]") 39 | await asyncio.sleep(delay) 40 | 41 | def extract_urls_from_text(text: str, all_source_urls: List[str]) -> List[str]: 42 | """ 43 | Extract URLs from the model response text. 44 | 45 | Args: 46 | text: The text to extract URLs from 47 | all_source_urls: List of all possible source URLs 48 | 49 | Returns: 50 | List of extracted URLs 51 | """ 52 | selected_urls = [] 53 | lines = text.split('\n') 54 | 55 | # Iterate through each line looking for URLs 56 | for line in lines: 57 | for url in all_source_urls: 58 | if url in line: 59 | if url not in selected_urls: 60 | selected_urls.append(url) 61 | break 62 | 63 | return selected_urls 64 | 65 | async def select_sources_with_llm(llm, all_source_urls: List[str], sources_text: str, query: str) -> List[str]: 66 | """ 67 | Try to select sources using LLM with retry logic. 68 | 69 | Args: 70 | llm: The language model 71 | all_source_urls: List of all source URLs 72 | sources_text: Formatted text of all sources 73 | query: The research query 74 | 75 | Returns: 76 | List of selected URLs 77 | """ 78 | selected_urls = [] 79 | 80 | with Progress( 81 | SpinnerColumn(), 82 | TextColumn("[bold blue]Selecting sources..."), 83 | console=console 84 | ) as progress: 85 | task = progress.add_task("Selecting", total=1) 86 | 87 | # Try using a standard source selection approach first 88 | for attempt in range(MAX_RETRIES): 89 | try: 90 | await backoff_retry(attempt) 91 | 92 | # Use a direct, simplified prompt 93 | direct_prompt = f"""Select the 15-20 most valuable sources for this research report. 94 | 95 | RESEARCH TOPIC: {query} 96 | 97 | SOURCES TO EVALUATE: 98 | {sources_text[:15000]} # Limit text length to avoid token issues 99 | 100 | INSTRUCTIONS: 101 | - Select 15-20 of the most valuable sources from the list 102 | - Return ONLY the exact URLs of your selected sources 103 | - List the URLs in order of importance, one URL per line 104 | - Do not include any explanations, just the URLs 105 | """ 106 | # Try with a smaller timeout and token limit 107 | retry_llm = llm.with_config({"timeout": 30, "max_tokens": 1024}) 108 | response = await retry_llm.ainvoke(direct_prompt) 109 | selected_urls = extract_urls_from_text(response.content, all_source_urls) 110 | 111 | # If we got some results, we're done 112 | if selected_urls: 113 | progress.update(task, completed=1) 114 | break 115 | 116 | except Exception as e: 117 | console.print(f"[yellow]Source selection attempt {attempt+1} failed: {str(e)}[/]") 118 | 119 | # Only log the first error in detail 120 | if attempt == 0: 121 | from ...utils.logger import log_error 122 | log_error("Error in source selection", e, 123 | context=f"Query: {query}, Function: select_sources_with_llm") 124 | 125 | # If this was the last attempt, continue to fallback mechanisms 126 | if attempt == MAX_RETRIES - 1: 127 | console.print("[yellow]All source selection attempts failed, using fallback approach[/]") 128 | 129 | progress.update(task, completed=1) 130 | 131 | return selected_urls 132 | 133 | async def smart_source_selection(llm, progress_callback, state: AgentState) -> AgentState: 134 | """Select relevant sources for the report using robust error handling.""" 135 | state["status"] = "Selecting most valuable sources" 136 | console.print("[bold blue]Selecting most relevant and high-quality sources...[/]") 137 | 138 | # Collect all unique source URLs 139 | all_source_urls = [] 140 | for analysis in state["content_analysis"]: 141 | if "sources" in analysis and isinstance(analysis["sources"], 
list): 142 | for url in analysis["sources"]: 143 | if url not in all_source_urls: 144 | all_source_urls.append(url) 145 | 146 | console.print(f"[green]Found {len(all_source_urls)} total sources to evaluate[/]") 147 | 148 | # If we have too many sources, use smart selection to filter them 149 | if len(all_source_urls) > 25: 150 | # Prepare formatted source text 151 | sources_text = "" 152 | for i, url in enumerate(all_source_urls, 1): 153 | source_meta = next((s for s in state["sources"] if s.get("url") == url), {}) 154 | 155 | sources_text += f"Source {i}:\nURL: {url}\n" 156 | if source_meta.get("title"): 157 | sources_text += f"Title: {source_meta.get('title')}\n" 158 | if source_meta.get("snippet"): 159 | sources_text += f"Summary: {source_meta.get('snippet')}\n" 160 | if source_meta.get("date"): 161 | sources_text += f"Date: {source_meta.get('date')}\n" 162 | sources_text += "\n" 163 | 164 | # Try LLM-based selection with retry logic 165 | selected_urls = await select_sources_with_llm( 166 | llm, 167 | all_source_urls, 168 | sources_text, 169 | state['query'] 170 | ) 171 | 172 | # Fallback: If all attempts fail, use a simplified ranking based on source metadata 173 | if not selected_urls: 174 | console.print("[yellow]Using fallback source selection based on metadata ranking[/]") 175 | 176 | # Prioritize sources with titles and snippets 177 | ranked_sources = [] 178 | for url in all_source_urls: 179 | source_meta = next((s for s in state["sources"] if s.get("url") == url), {}) 180 | 181 | # Simple ranking based on metadata completeness 182 | score = 0 183 | if source_meta.get("title"): 184 | score += 2 185 | if source_meta.get("snippet"): 186 | score += 1 187 | if source_meta.get("date"): 188 | score += 1 189 | 190 | ranked_sources.append((url, score)) 191 | 192 | # Sort by score in descending order 193 | ranked_sources.sort(key=lambda x: x[1], reverse=True) 194 | 195 | # Take top 15-20 sources 196 | max_sources = min(20, len(ranked_sources)) 197 | selected_urls = [url for url, _ in ranked_sources[:max_sources]] 198 | 199 | # Always ensure we have sources 200 | if not selected_urls and all_source_urls: 201 | # Last resort: take the first 15-20 sources 202 | selected_urls = all_source_urls[:min(20, len(all_source_urls))] 203 | 204 | # Store the selected sources 205 | state["selected_sources"] = selected_urls 206 | log_chain_of_thought( 207 | state, 208 | f"Selected {len(selected_urls)} most relevant sources from {len(all_source_urls)} total sources" 209 | ) 210 | else: 211 | # If we don't have too many sources, use all of them 212 | state["selected_sources"] = all_source_urls 213 | log_chain_of_thought(state, f"Using all {len(all_source_urls)} sources for final report") 214 | 215 | if progress_callback: 216 | await _call_progress_callback(progress_callback, state) 217 | return state 218 | -------------------------------------------------------------------------------- /shandu/agents/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Content processing and report generation modules for research agents. 
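Re-exports the shared AgentState definition together with the content analysis and report generation helpers imported below.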
3 | """ 4 | from .content_processor import ( 5 | AgentState, 6 | is_relevant_url, 7 | process_scraped_item, 8 | analyze_content 9 | ) 10 | from .report_generator import ( 11 | generate_title, 12 | format_citations, 13 | extract_themes, 14 | generate_initial_report, 15 | enhance_report, 16 | expand_key_sections 17 | ) 18 | 19 | __all__ = [ 20 | 'AgentState', 21 | 'is_relevant_url', 22 | 'process_scraped_item', 23 | 'analyze_content', 24 | 'generate_title', 25 | 'format_citations', 26 | 'extract_themes', 27 | 'generate_initial_report', 28 | 'enhance_report', 29 | 'expand_key_sections' 30 | ] -------------------------------------------------------------------------------- /shandu/agents/processors/content_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Content processing utilities for research agents. 3 | Contains functionality for handling search results, extracting content, and analyzing information. 4 | """ 5 | 6 | import os 7 | from typing import List, Dict, Optional, Any, Union, TypedDict, Sequence 8 | from dataclasses import dataclass 9 | import json 10 | import time 11 | import asyncio 12 | import re 13 | from datetime import datetime 14 | from rich.console import Console 15 | from langchain_core.messages import AIMessage, HumanMessage, BaseMessage 16 | from langchain_core.prompts import ChatPromptTemplate 17 | from langchain_core.output_parsers import StrOutputParser 18 | from pydantic import BaseModel, Field 19 | from langchain_openai import ChatOpenAI 20 | from ...search.search import SearchResult 21 | from ...scraper import WebScraper, ScrapedContent 22 | 23 | console = Console() 24 | 25 | class AgentState(TypedDict): 26 | messages: Sequence[Union[HumanMessage, AIMessage]] 27 | query: str 28 | depth: int 29 | breadth: int 30 | current_depth: int 31 | findings: str 32 | sources: List[Dict[str, Any]] 33 | selected_sources: List[str] 34 | formatted_citations: str 35 | subqueries: List[str] 36 | content_analysis: List[Dict[str, Any]] 37 | start_time: float 38 | chain_of_thought: List[str] 39 | status: str 40 | current_date: str 41 | detail_level: str 42 | identified_themes: str 43 | initial_report: str 44 | enhanced_report: str 45 | final_report: str 46 | 47 | # Structured output models 48 | class UrlRelevanceResult(BaseModel): 49 | """Structured output for URL relevance check.""" 50 | is_relevant: bool = Field(description="Whether the URL is relevant to the query") 51 | reason: str = Field(description="Reason for the relevance decision") 52 | 53 | class ContentRating(BaseModel): 54 | """Structured output for content reliability rating.""" 55 | rating: str = Field(description="Reliability rating: HIGH, MEDIUM, or LOW") 56 | justification: str = Field(description="Justification for the rating") 57 | extracted_content: str = Field(description="Extracted relevant content from the source") 58 | 59 | class ContentAnalysis(BaseModel): 60 | """Structured output for content analysis.""" 61 | key_findings: List[str] = Field(description="List of key findings from the content") 62 | main_themes: List[str] = Field(description="Main themes identified in the content") 63 | analysis: str = Field(description="Comprehensive analysis of the content") 64 | source_evaluation: str = Field(description="Evaluation of the sources' credibility and relevance") 65 | 66 | async def is_relevant_url(llm: ChatOpenAI, url: str, title: str, snippet: str, query: str) -> bool: 67 | """ 68 | Check if a URL is relevant to the query using structured output. 
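    A cheap domain blocklist check runs first; otherwise a structured LLM call decides relevance,
    with a plain-text prompt as a fallback if structured output fails.

    Args:
        llm: Chat model used for the relevance check
        url: URL of the search result
        title: Title of the search result
        snippet: Snippet text of the search result
        query: The research query being evaluated

    Returns:
        True if the result is judged relevant to the query, False otherwise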
69 | """ 70 | # First use simple heuristics to avoid LLM calls for obviously irrelevant domains 71 | irrelevant_domains = [ 72 | "pinterest", "instagram", "facebook", "twitter", "youtube", "tiktok", 73 | "reddit", "quora", "linkedin", "amazon.com", "ebay.com", "etsy.com", 74 | "walmart.com", "target.com" 75 | ] 76 | if any(domain in url.lower() for domain in irrelevant_domains): 77 | return False 78 | 79 | # Escape any literal curly braces in the inputs 80 | safe_url = url.replace("{", "{{").replace("}", "}}") 81 | safe_title = title.replace("{", "{{").replace("}", "}}") 82 | safe_snippet = snippet.replace("{", "{{").replace("}", "}}") 83 | safe_query = query.replace("{", "{{").replace("}", "}}") 84 | 85 | # Use structured output for relevance check 86 | structured_llm = llm.with_structured_output(UrlRelevanceResult) 87 | system_prompt = ( 88 | "You are evaluating search results for relevance to a specific query.\n\n" 89 | "DETERMINE if the search result is RELEVANT or NOT RELEVANT to answering the query.\n" 90 | "Consider the title, URL, and snippet to make your determination.\n\n" 91 | "Provide a structured response with your decision and reasoning.\n" 92 | ) 93 | user_content = ( 94 | f"Query: {safe_query}\n\n" 95 | f"Search Result:\nTitle: {safe_title}\nURL: {safe_url}\nSnippet: {safe_snippet}\n\n" 96 | "Is this result relevant to the query?" 97 | ) 98 | # Build the prompt chain by piping the prompt into the structured LLM. 99 | prompt = ChatPromptTemplate.from_messages([ 100 | {"role": "system", "content": system_prompt}, 101 | {"role": "user", "content": user_content} 102 | ]) 103 | mapping = {"query": query, "title": title, "url": url, "snippet": snippet} 104 | try: 105 | # Chain the prompt and structured LLM; then call invoke with the mapping 106 | chain = prompt | structured_llm 107 | result = await chain.ainvoke(mapping) 108 | return result.is_relevant 109 | except Exception as e: 110 | from ...utils.logger import log_error 111 | log_error("Error in structured relevance check", e, 112 | context=f"Query: {query}, Function: is_relevant_url") 113 | console.print(f"[dim red]Error in structured relevance check: {str(e)}. Using simpler approach.[/dim red]") 114 | # Escape any literal curly braces in the fallback prompt 115 | safe_fb_url = url.replace("{", "{{").replace("}", "}}") 116 | safe_fb_title = title.replace("{", "{{").replace("}", "}}") 117 | safe_fb_snippet = snippet.replace("{", "{{").replace("}", "}}") 118 | safe_fb_query = query.replace("{", "{{").replace("}", "}}") 119 | 120 | simple_prompt = ( 121 | f"Evaluate if this search result is RELEVANT or NOT RELEVANT to the query.\n" 122 | "Answer with ONLY \"RELEVANT\" or \"NOT RELEVANT\".\n\n" 123 | f"Query: {safe_fb_query}\n" 124 | f"Title: {safe_fb_title}\n" 125 | f"URL: {safe_fb_url}\n" 126 | f"Snippet: {safe_fb_snippet}" 127 | ) 128 | response = await llm.ainvoke(simple_prompt) 129 | result_text = response.content 130 | return "RELEVANT" in result_text.upper() 131 | 132 | async def process_scraped_item(llm: ChatOpenAI, item: ScrapedContent, subquery: str, main_content: str) -> Dict[str, Any]: 133 | """ 134 | Process a scraped item to evaluate reliability and extract content using structured output. 
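    Rates source reliability (HIGH/MEDIUM/LOW) and extracts the most relevant content via structured
    output, falling back to a plain prompt with regex parsing if the structured call fails.

    Args:
        llm: Chat model used for the evaluation
        item: The scraped content item to process
        subquery: The query this content was gathered for
        main_content: Raw text of the scraped page

    Returns:
        Dict with "item", "rating", "justification", and "content" keys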
135 | """ 136 | try: 137 | # Escape any literal curly braces in the content to avoid format string errors 138 | safe_content = main_content[:8000].replace("{", "{{").replace("}", "}}") 139 | safe_url = item.url.replace("{", "{{").replace("}", "}}") 140 | safe_title = item.title.replace("{", "{{").replace("}", "}}") 141 | safe_subquery = subquery.replace("{", "{{").replace("}", "}}") 142 | 143 | structured_llm = llm.with_structured_output(ContentRating) 144 | system_prompt = ( 145 | "You are analyzing web content for reliability and extracting the most relevant information.\n\n" 146 | "Evaluate the RELIABILITY of the content using these criteria:\n" 147 | "1. Source credibility and expertise\n" 148 | "2. Evidence quality\n" 149 | "3. Consistency with known facts\n" 150 | "4. Publication date recency\n" 151 | "5. Presence of citations or references\n\n" 152 | "Rate the source as \"HIGH\", \"MEDIUM\", or \"LOW\" reliability with a brief justification.\n\n" 153 | "Then, EXTRACT the most relevant and valuable content related to the query.\n" 154 | ) 155 | user_message = ( 156 | f"Analyze this web content:\n\n" 157 | f"URL: {safe_url}\n" 158 | f"Title: {safe_title}\n" 159 | f"Query: {safe_subquery}\n\n" 160 | "Content:\n" 161 | f"{safe_content}" 162 | ) 163 | prompt = ChatPromptTemplate.from_messages([ 164 | {"role": "system", "content": system_prompt}, 165 | {"role": "user", "content": user_message} 166 | ]) 167 | mapping = {"url": item.url, "title": item.title, "subquery": subquery} 168 | # Chain the prompt with the structured LLM 169 | chain = prompt | structured_llm 170 | result = await chain.ainvoke(mapping) 171 | return { 172 | "item": item, 173 | "rating": result.rating, 174 | "justification": result.justification, 175 | "content": result.extracted_content 176 | } 177 | except Exception as e: 178 | from ...utils.logger import log_error 179 | log_error("Error in structured content processing", e, 180 | context=f"Query: {subquery}, Function: process_scraped_item") 181 | console.print(f"[dim red]Error in structured content processing: {str(e)}. 
Using simpler approach.[/dim red]") 182 | current_file = os.path.basename(__file__) 183 | # Escape any literal curly braces in the fallback content 184 | safe_shorter_content = main_content[:5000].replace("{", "{{").replace("}", "}}") 185 | safe_fb_url = item.url.replace("{", "{{").replace("}", "}}") 186 | safe_fb_title = item.title.replace("{", "{{").replace("}", "}}") 187 | safe_fb_subquery = subquery.replace("{", "{{").replace("}", "}}") 188 | 189 | simple_prompt = ( 190 | f"Analyze web content for reliability (HIGH/MEDIUM/LOW) and extract relevant information.\n" 191 | "Format your response as:\n" 192 | "RELIABILITY: [rating]\n" 193 | "JUSTIFICATION: [brief explanation]\n" 194 | "EXTRACTED_CONTENT: [relevant content]\n\n" 195 | f"URL: {safe_fb_url}\n" 196 | f"Title: {safe_fb_title}\n" 197 | f"Query: {safe_fb_subquery}\n\n" 198 | "Content:\n" 199 | f"{safe_shorter_content}" 200 | ) 201 | response = await llm.ainvoke(simple_prompt) 202 | content = response.content 203 | rating = "MEDIUM" # Default fallback rating 204 | justification = "" 205 | extracted_content = content 206 | 207 | if "RELIABILITY:" in content: 208 | reliability_match = re.search(r"RELIABILITY:\s*(HIGH|MEDIUM|LOW)", content) 209 | if reliability_match: 210 | rating = reliability_match.group(1) 211 | if "JUSTIFICATION:" in content: 212 | justification_match = re.search(r"JUSTIFICATION:\s*(.+?)(?=\n\n|EXTRACTED_CONTENT:|$)", content, re.DOTALL) 213 | if justification_match: 214 | justification = justification_match.group(1).strip() 215 | if "EXTRACTED_CONTENT:" in content: 216 | content_match = re.search(r"EXTRACTED_CONTENT:\s*(.+?)(?=$)", content, re.DOTALL) 217 | if content_match: 218 | extracted_content = content_match.group(1).strip() 219 | 220 | return { 221 | "item": item, 222 | "rating": rating, 223 | "justification": justification, 224 | "content": extracted_content 225 | } 226 | 227 | async def analyze_content(llm: ChatOpenAI, subquery: str, content_text: str) -> str: 228 | """ 229 | Analyze content from multiple sources and synthesize the information using structured output. 230 | """ 231 | try: 232 | structured_llm = llm.with_structured_output(ContentAnalysis) 233 | system_prompt = ( 234 | "You are analyzing and synthesizing information from multiple web sources.\n\n" 235 | "Your task is to:\n" 236 | "1. Identify the most important and relevant information related to the query\n" 237 | "2. Extract key findings and main themes\n" 238 | "3. Organize the information into a coherent analysis\n" 239 | "4. Evaluate the credibility and relevance of the sources\n" 240 | "5. Maintain source attributions when presenting facts or claims\n\n" 241 | "Create a thorough, well-structured analysis that captures the most valuable insights.\n" 242 | ) 243 | user_message = ( 244 | f"Analyze the following content related to the query: \"{subquery}\"\n\n" 245 | f"{content_text}\n\n" 246 | "Provide a comprehensive analysis that synthesizes the most relevant information " 247 | "from these sources, organized into a well-structured format with key findings." 
248 | ) 249 | # Escape any literal curly braces in the content to avoid format string errors 250 | system_prompt_escaped = system_prompt.replace("{", "{{").replace("}", "}}") 251 | user_message_escaped = user_message.replace("{", "{{").replace("}", "}}") 252 | 253 | prompt = ChatPromptTemplate.from_messages([ 254 | {"role": "system", "content": system_prompt_escaped}, 255 | {"role": "user", "content": user_message_escaped} 256 | ]) 257 | mapping = {"query": subquery} 258 | # Chain the prompt with the structured LLM (using a modified config if needed) 259 | chain = prompt | structured_llm.with_config({"timeout": 180}) 260 | result = await chain.ainvoke(mapping) 261 | formatted_analysis = "### Key Findings\n\n" 262 | for i, finding in enumerate(result.key_findings, 1): 263 | formatted_analysis += f"{i}. {finding}\n" 264 | formatted_analysis += "\n### Main Themes\n\n" 265 | for i, theme in enumerate(result.main_themes, 1): 266 | formatted_analysis += f"{i}. {theme}\n" 267 | formatted_analysis += f"\n### Analysis\n\n{result.analysis}\n" 268 | formatted_analysis += f"\n### Source Evaluation\n\n{result.source_evaluation}\n" 269 | return formatted_analysis 270 | except Exception as e: 271 | from ...utils.logger import log_error 272 | log_error("Error in structured content analysis", e, 273 | context=f"Query: {subquery}, Function: analyze_content") 274 | console.print(f"[dim red]Error in structured content analysis: {str(e)}. Using simpler approach.[/dim red]") 275 | # Escape any literal curly braces in the fallback content 276 | safe_ac_subquery = subquery.replace("{", "{{").replace("}", "}}") 277 | safe_ac_content = content_text[:5000].replace("{", "{{").replace("}", "}}") 278 | 279 | simple_prompt = ( 280 | f"Analyze and synthesize information from multiple web sources.\n" 281 | "Provide a concise but comprehensive analysis of the content related to the query.\n\n" 282 | f"Analyze content related to: {safe_ac_subquery}\n\n" 283 | f"{safe_ac_content}" 284 | ) 285 | simple_llm = llm.with_config({"timeout": 60}) 286 | response = await simple_llm.ainvoke(simple_prompt) 287 | return response.content 288 | -------------------------------------------------------------------------------- /shandu/agents/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for research agents. 
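Re-exports the interactive input, query clarification, progress display, and chain-of-thought logging helpers defined in agent_utils.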
3 | """ 4 | from .agent_utils import ( 5 | get_user_input, 6 | should_continue, 7 | log_chain_of_thought, 8 | display_research_progress, 9 | _call_progress_callback, 10 | clarify_query 11 | ) 12 | 13 | __all__ = [ 14 | 'get_user_input', 15 | 'should_continue', 16 | 'log_chain_of_thought', 17 | 'display_research_progress', 18 | '_call_progress_callback', 19 | 'clarify_query' 20 | ] -------------------------------------------------------------------------------- /shandu/agents/utils/agent_utils.py: -------------------------------------------------------------------------------- 1 | """Agent utility functions.""" 2 | from typing import List, Dict, Optional, Any, Callable, Union, TypedDict, Sequence 3 | from dataclasses import dataclass 4 | import time 5 | import re 6 | import asyncio 7 | import signal 8 | import threading 9 | import sys 10 | import os 11 | from datetime import datetime 12 | from rich.console import Console 13 | from rich.tree import Tree 14 | from rich.progress import Progress, SpinnerColumn, TextColumn 15 | from rich.markup import escape 16 | from langchain_core.messages import AIMessage, HumanMessage, BaseMessage 17 | from langchain_core.prompts import ChatPromptTemplate 18 | from pydantic import BaseModel, Field 19 | from ..processors.content_processor import AgentState 20 | 21 | console = Console() 22 | 23 | # Global shutdown flag for graceful termination 24 | _shutdown_requested = False 25 | _shutdown_lock = threading.Lock() 26 | _shutdown_counter = 0 27 | _MAX_SHUTDOWN_ATTEMPTS = 3 28 | 29 | def setup_signal_handlers(): 30 | """Set up signal handlers for graceful shutdown.""" 31 | def signal_handler(sig, frame): 32 | global _shutdown_requested, _shutdown_counter 33 | with _shutdown_lock: 34 | _shutdown_requested = True 35 | _shutdown_counter += 1 36 | 37 | if _shutdown_counter == 1: 38 | console.print("\n[yellow]Shutdown requested. Completing current operations...[/]") 39 | elif _shutdown_counter == 2: 40 | console.print("\n[orange]Second shutdown request. Canceling operations...[/]") 41 | elif _shutdown_counter >= _MAX_SHUTDOWN_ATTEMPTS: 42 | console.print("\n[bold red]Forced exit requested. Exiting immediately.[/]") 43 | # Force exit after multiple attempts 44 | os._exit(1) 45 | 46 | signal.signal(signal.SIGINT, signal_handler) 47 | signal.signal(signal.SIGTERM, signal_handler) 48 | 49 | # Call this at application startup 50 | setup_signal_handlers() 51 | 52 | def is_shutdown_requested() -> bool: 53 | """Check if shutdown has been requested.""" 54 | with _shutdown_lock: 55 | return _shutdown_requested 56 | 57 | def get_shutdown_level() -> int: 58 | """Get the current shutdown level (number of attempts).""" 59 | with _shutdown_lock: 60 | return _shutdown_counter 61 | 62 | def get_user_input(prompt: str) -> str: 63 | """Get formatted user input with shutdown handling.""" 64 | console.print(prompt, style="yellow") 65 | 66 | if is_shutdown_requested(): 67 | console.print("[yellow]Shutdown requested, skipping user input...[/]") 68 | return "any" # Return a generic answer to allow the process to continue to shutdown 69 | 70 | try: 71 | 72 | return input("> ").strip() 73 | except (KeyboardInterrupt, EOFError): 74 | 75 | with _shutdown_lock: 76 | global _shutdown_requested 77 | _shutdown_requested = True 78 | console.print("\n[yellow]Input interrupted. 
Proceeding with shutdown...[/]") 79 | return "any" # Return a generic answer to allow the process to continue to shutdown 80 | 81 | def should_continue(state: AgentState) -> str: 82 | """Check if research should continue.""" 83 | # First check if shutdown was requested 84 | if is_shutdown_requested(): 85 | # If this is a forceful shutdown (second attempt or higher) 86 | if get_shutdown_level() >= 2: 87 | console.print("[bold red]Forceful shutdown requested. Ending research immediately.[/]") 88 | return "end" 89 | 90 | # For first shutdown request, try to complete gracefully 91 | console.print("[yellow]Shutdown requested. Completing current depth before ending.[/]") 92 | 93 | # If we're already at the end of a depth cycle, end now 94 | if state.get("current_depth", 0) >= state.get("depth", 1): 95 | return "end" 96 | 97 | # Otherwise, allow the current depth to complete 98 | return "continue" 99 | 100 | if "iteration_count" not in state: 101 | state["iteration_count"] = 1 102 | else: 103 | state["iteration_count"] += 1 104 | 105 | # This is separate from depth/breadth and ensures we won't get stuck 106 | if state["iteration_count"] >= 25: 107 | console.print("[yellow]Maximum iterations reached. Ending research to prevent infinite loop.[/]") 108 | return "end" 109 | 110 | # Then check if we've reached the desired depth 111 | if state["current_depth"] < state["depth"]: 112 | return "continue" 113 | 114 | return "end" 115 | 116 | def log_chain_of_thought(state: AgentState, thought: str) -> None: 117 | """ 118 | Log a thought to the agent's chain of thought with timestamp. 119 | 120 | Args: 121 | state: The current agent state 122 | thought: The thought to log 123 | """ 124 | # Sanitize the thought to prevent Rich markup issues 125 | sanitized_thought = thought 126 | # Remove any square brackets that could be misinterpreted as markup 127 | sanitized_thought = re.sub(r'\[[^\]]*\]', '', sanitized_thought) 128 | # Remove any orphaned brackets or tags 129 | sanitized_thought = re.sub(r'\[\/?[^\]]*\]?', '', sanitized_thought) 130 | sanitized_thought = re.sub(r'\[\]', '', sanitized_thought) 131 | 132 | timestamp = datetime.now().strftime("%H:%M:%S") 133 | state["chain_of_thought"].append(f"[{timestamp}] {sanitized_thought}") 134 | 135 | def display_research_progress(state: AgentState) -> Tree: 136 | """ 137 | Create a rich tree display of current research progress. 
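    The tree switches between a Research view (depth, sources, subqueries) and a Report Generation
    view (selected sources, report stages) based on the current status, and always shows the most
    recent chain-of-thought entries.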
138 | 139 | Args: 140 | state: The current agent state 141 | 142 | Returns: 143 | Rich Tree object for display 144 | """ 145 | elapsed_time = time.time() - state["start_time"] 146 | minutes, seconds = divmod(int(elapsed_time), 60) 147 | 148 | # Sanitize status to prevent markup errors 149 | status_raw = state["status"] 150 | status = re.sub(r'\[[^\]]*\]', '', status_raw) # Remove any potential markup 151 | status = escape(status) # Escape any remaining characters 152 | 153 | phase = "Research" if "depth" in status.lower() or any(word in status.lower() for word in ["searching", "querying", "reflecting", "analyzing"]) else "Report Generation" 154 | 155 | tree = Tree(f"[bold blue]{phase} Progress: {status}") 156 | 157 | stats_node = tree.add(f"[cyan]Stats") 158 | stats_node.add(f"[blue]Time Elapsed:[/] {minutes}m {seconds}s") 159 | 160 | if phase == "Research": 161 | # Display research-specific stats 162 | stats_node.add(f"[blue]Current Depth:[/] {state['current_depth']}/{state['depth']}") 163 | stats_node.add(f"[blue]Sources Found:[/] {len(state['sources'])}") 164 | stats_node.add(f"[blue]Subqueries Explored:[/] {len(state['subqueries'])}") 165 | 166 | # Show current research paths - with safety checks 167 | if state["subqueries"]: 168 | queries_node = tree.add("[green]Current Research Paths") 169 | # Safely get the last N queries based on breadth 170 | breadth = max(1, state.get("breadth", 1)) # Ensure breadth is at least 1 171 | 172 | # Limit to actual number of queries available 173 | display_count = min(breadth, len(state["subqueries"])) 174 | 175 | if display_count > 0: 176 | for i in range(-display_count, 0): # Get the last 'display_count' elements 177 | if i + len(state["subqueries"]) >= 0: # Safety check 178 | query_text = state["subqueries"][i] 179 | # Sanitize the query text 180 | query_text = re.sub(r'\[[^\]]*\]', '', query_text) 181 | query_text = escape(query_text) 182 | queries_node.add(query_text) 183 | else: 184 | # Display report generation specific stats 185 | stats_node.add(f"[blue]Sources Selected:[/] {len(state.get('selected_sources', []))}") 186 | 187 | # Show report generation progress 188 | report_progress = tree.add("[green]Report Generation Progress") 189 | if state.get("selected_sources"): 190 | report_progress.add("[green]✓[/green] Sources selected") 191 | if state.get("formatted_citations"): 192 | report_progress.add("[green]✓[/green] Citations formatted") 193 | if state.get("initial_report"): 194 | report_progress.add("[green]✓[/green] Initial report generated") 195 | if state.get("enhanced_report"): 196 | report_progress.add("[green]✓[/green] Report enhanced with details") 197 | if state.get("final_report"): 198 | report_progress.add("[green]✓[/green] Key sections expanded") 199 | 200 | # Show recent thoughts regardless of phase 201 | if state["chain_of_thought"]: 202 | thoughts_node = tree.add("[yellow]Recent Thoughts") 203 | for thought in state["chain_of_thought"][-3:]: 204 | thoughts_node.add(thought) 205 | 206 | # Show latest findings only in research phase 207 | if phase == "Research" and state["findings"]: 208 | findings_node = tree.add("[magenta]Latest Findings") 209 | sections = state["findings"].split("\n\n") 210 | for section in sections[-2:]: 211 | if section.strip(): 212 | # Sanitize findings text to prevent markup errors 213 | section_text = section.strip()[:100] + "..." 
if len(section.strip()) > 100 else section.strip() 214 | # Remove any square brackets that could be misinterpreted as markup 215 | section_text = re.sub(r'\[[^\]]*\]', '', section_text) 216 | # Remove any orphaned brackets or tags 217 | section_text = re.sub(r'\[\/?[^\]]*\]?', '', section_text) 218 | section_text = re.sub(r'\[\]', '', section_text) 219 | # Escape any remaining characters that could be misinterpreted 220 | section_text = escape(section_text) 221 | findings_node.add(section_text) 222 | 223 | # Show shutdown status if requested 224 | if is_shutdown_requested(): 225 | tree.add(f"[bold red]Shutdown requested. Attempt {get_shutdown_level()}/{_MAX_SHUTDOWN_ATTEMPTS}") 226 | 227 | return tree 228 | 229 | async def _call_progress_callback(callback: Optional[Callable], state: AgentState) -> None: 230 | """ 231 | Call the progress callback with the current state if provided. 232 | 233 | Args: 234 | callback: The callback function 235 | state: The current agent state 236 | """ 237 | # Sanitize state values that will be displayed to prevent Rich markup errors 238 | if "status" in state: 239 | state["status"] = escape(re.sub(r'\[[^\]]*\]', '', state["status"])) 240 | 241 | if callback: 242 | try: 243 | if asyncio.iscoroutinefunction(callback): 244 | await callback(state) 245 | else: 246 | callback(state) 247 | except Exception as e: 248 | # Sanitize the error message before displaying 249 | error_msg = str(e) 250 | error_msg = re.sub(r'\[[^\]]*\]', '', error_msg) 251 | error_msg = re.sub(r'\[\/?[^\]]*\]?', '', error_msg) 252 | error_msg = escape(error_msg) 253 | console.print(f"[dim red]Error in progress callback: {error_msg}[/dim red]") 254 | 255 | # Structured output model for query clarification 256 | class ClarificationQuestions(BaseModel): 257 | """Structured output for query clarification questions.""" 258 | questions: list[str] = Field( 259 | description="List of clarifying questions to better understand the research needs", 260 | min_items=1, 261 | max_items=3 262 | ) 263 | 264 | class RefinedQuery(BaseModel): 265 | """Structured output for refined query.""" 266 | query: str = Field(description="The refined, comprehensive research query") 267 | explanation: str = Field(description="Explanation of how the query was refined based on the Q&A") 268 | 269 | async def clarify_query(query: str, llm, date: Optional[str] = None, system_prompt: str = "", user_prompt: str = "") -> str: 270 | """Interactive query clarification process with structured output.""" 271 | from ...prompts import SYSTEM_PROMPTS, USER_PROMPTS 272 | 273 | current_date = date or datetime.now().strftime("%Y-%m-%d") 274 | console.print(f"[bold blue]Initial Query:[/] {query}") 275 | 276 | if not system_prompt: 277 | # Use direct string with current_date instead of format 278 | clarify_prompt = SYSTEM_PROMPTS.get("clarify_query", "") 279 | system_prompt = f"""You must generate clarifying questions to refine the research query with strict adherence to: 280 | - Eliciting specific details about user goals, scope, and knowledge level. 281 | - Avoiding extraneous or trivial queries. 282 | - Providing precisely 4-5 targeted questions. 283 | 284 | Today's date: {current_date}. 285 | 286 | These questions must seek to clarify the exact focal points, the depth of detail, constraints, and user background knowledge. 
Provide them succinctly and plainly, with no added commentary.""" 287 | 288 | if not user_prompt: 289 | user_prompt = USER_PROMPTS.get("clarify_query", "") 290 | 291 | try: 292 | # Use a simpler approach to avoid issues with prompt templates 293 | try: 294 | # Direct approach without structured output 295 | response = await llm.ainvoke(f""" 296 | {system_prompt} 297 | 298 | Generate 3-5 direct, specific questions to better understand the research needs for the query: "{query}" 299 | 300 | Focus on: 301 | 1. Clarifying the specific areas the user wants to explore 302 | 2. The level of detail needed 303 | 3. Specific sources or perspectives to include 304 | 4. Time frame or context relevant to the query 305 | 306 | IMPORTANT: Provide ONLY the questions themselves, without any introduction or preamble. 307 | Each question should be clear, direct, and standalone. 308 | """) 309 | 310 | questions = [q.strip() for q in response.content.split("\n") if q.strip() and "?" in q] 311 | 312 | # Limit to top 3-5 questions 313 | questions = questions[:5] 314 | except Exception as e: 315 | console.print(f"[dim red]Error in question generation: {str(e)}. Using default questions.[/dim red]") 316 | questions = [] 317 | except Exception as e: 318 | from ...utils.logger import log_error 319 | log_error("Error in clarify_query", e, 320 | context=f"Query: {query}, Function: clarify_query") 321 | console.print(f"[dim red]Error in structured question generation: {str(e)}. Using simpler approach.[/dim red]") 322 | try: 323 | # Direct approach without structured output 324 | response = await llm.ainvoke(f"Generate 3 direct clarifying questions for the research query: {query}") 325 | 326 | questions = [q.strip() for q in response.content.split("\n") if q.strip() and "?" in q] 327 | except Exception as e2: 328 | console.print(f"[dim red]Error in fallback question generation: {str(e2)}. Using default questions.[/dim red]") 329 | questions = [] 330 | 331 | # If we couldn't extract questions, create some generic ones 332 | if not questions: 333 | questions = [ 334 | "What specific application or area of this topic are you most interested in?", 335 | "What is the intended audience or purpose of this research?", 336 | "Are you interested in current applications, future trends, ethical considerations, or a combination of these aspects?" 337 | ] 338 | 339 | # Limit to 3 questions 340 | questions = questions[:3] 341 | 342 | answers = [] 343 | for q in questions: 344 | 345 | if is_shutdown_requested(): 346 | console.print("[yellow]Shutdown requested, using generic answers...[/]") 347 | answers.append("any") # Use a generic answer 348 | continue 349 | 350 | answer = get_user_input(q) 351 | answers.append(answer) 352 | 353 | qa_text = "\n".join([f"Q: {q}\nA: {a}" for q, a in zip(questions, answers)]) 354 | 355 | refine_system_prompt = f"""You must refine the research query into a strict, focused direction based on user-provided answers. Today's date: {current_date}. 356 | 357 | REQUIREMENTS: 358 | - DO NOT present any "Research Framework" or "Objective" headings. 359 | - Provide a concise topic statement followed by 2-3 paragraphs integrating all key points from the user. 360 | - Preserve all critical details mentioned by the user. 
361 | - The format must be simple plain text with no extraneous headings or bullet points.""" 362 | 363 | refine_user_prompt = USER_PROMPTS.get("refine_query", "") 364 | 365 | try: 366 | # Use direct approach without structured output 367 | response = await llm.ainvoke(f""" 368 | {refine_system_prompt} 369 | 370 | Original query: {query} 371 | Follow-up questions and answers: 372 | {qa_text} 373 | 374 | Based on this information, create a comprehensive, well-structured research query. 375 | The query should be clear, focused, and incorporate all relevant information from the answers. 376 | """) 377 | 378 | refined_context_raw = response.content 379 | 380 | refined_context = refined_context_raw.replace("**", "").replace("# ", "").replace("## ", "") 381 | refined_context = re.sub(r'^(?:Based on our discussion,|Following our conversation,|As per our discussion,).*?(?:refined topic:|research the following:|exploring|analyze):\s*', '', refined_context, flags=re.IGNORECASE) 382 | refined_context = re.sub(r'Based on our discussion.*?(?=\.)\.', '', refined_context, flags=re.IGNORECASE) 383 | except Exception as e: 384 | from ...utils.logger import log_error 385 | log_error("Error in clarify_query", e, 386 | context=f"Query: {query}, Function: clarify_query") 387 | console.print(f"[dim red]Error in structured query refinement: {str(e)}. Using simpler approach.[/dim red]") 388 | #current_file = os.path.basename(__file__) 389 | #with open('example.txt', 'a') as file: 390 | # Append the current file's name and some text 391 | #file.write(f'This line was written by: {current_file}\n') 392 | #file.write(f'Error {e}.\n') 393 | # Fallback to non-structured approach 394 | try: 395 | # Direct approach without structured output 396 | response = await llm.ainvoke(f""" 397 | Original query: {query} 398 | 399 | Follow-up questions and answers: 400 | {qa_text} 401 | 402 | Based on this information, create a comprehensive, well-structured research query. 403 | """) 404 | 405 | refined_context_raw = response.content 406 | 407 | refined_context = refined_context_raw.replace("**", "").replace("# ", "").replace("## ", "") 408 | refined_context = re.sub(r'^(?:Based on our discussion,|Following our conversation,|As per our discussion,).*?(?:refined topic:|research the following:|exploring|analyze):\s*', '', refined_context, flags=re.IGNORECASE) 409 | refined_context = re.sub(r'Based on our discussion.*?(?=\.)\.', '', refined_context, flags=re.IGNORECASE) 410 | except Exception as e2: 411 | console.print(f"[dim red]Error in fallback query refinement: {str(e2)}. Using original query.[/dim red]") 412 | refined_context = query 413 | 414 | console.print(f"\n[bold green]Refined Research Query:[/]\n{refined_context}") 415 | return refined_context 416 | -------------------------------------------------------------------------------- /shandu/agents/utils/citation_registry.py: -------------------------------------------------------------------------------- 1 | """ 2 | Citation registry to track and manage citations throughout the report generation process. 3 | """ 4 | from typing import Dict, Any, List, Optional, Set, Union 5 | 6 | class CitationRegistry: 7 | """ 8 | Registry that tracks all citations used in a report, ensuring that in-text citations 9 | match the sources in the references section. 
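    Illustrative usage (the example URL is a placeholder):

        registry = CitationRegistry()
        cid = registry.register_citation("https://example.com/article")
        draft = f"A key finding from recent work [{cid}]."
        assert registry.validate_citations(draft)["valid"]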
10 | """ 11 | def __init__(self): 12 | self.citations = {} # Maps citation_id to source metadata 13 | self.id_to_url = {} # Maps citation_id to source URL for quick lookups 14 | self.url_to_id = {} # Maps source URL to citation_id for deduplication 15 | self.next_id = 1 # Next available citation ID 16 | self.citation_contexts = {} # Stores context for each citation use 17 | 18 | def register_citation(self, source_url: str, context: str = "") -> int: 19 | """ 20 | Register a citation and return its ID. 21 | 22 | Args: 23 | source_url: The URL of the source being cited 24 | context: Optional context about how the citation is being used 25 | 26 | Returns: 27 | int: The citation ID to use in the report 28 | """ 29 | 30 | if source_url in self.url_to_id: 31 | citation_id = self.url_to_id[source_url] 32 | 33 | if context and context not in self.citation_contexts.get(citation_id, []): 34 | if citation_id not in self.citation_contexts: 35 | self.citation_contexts[citation_id] = [] 36 | self.citation_contexts[citation_id].append(context) 37 | return citation_id 38 | 39 | # Register new citation 40 | citation_id = self.next_id 41 | self.citations[citation_id] = { 42 | "url": source_url, 43 | "id": citation_id 44 | } 45 | self.id_to_url[citation_id] = source_url 46 | self.url_to_id[source_url] = citation_id 47 | 48 | # Store context if provided 49 | if context: 50 | self.citation_contexts[citation_id] = [context] 51 | 52 | self.next_id += 1 53 | return citation_id 54 | 55 | def get_citation_url(self, citation_id: int) -> Optional[str]: 56 | """Get the URL associated with a citation ID.""" 57 | return self.id_to_url.get(citation_id) 58 | 59 | def get_citation_info(self, citation_id: int) -> Optional[Dict[str, Any]]: 60 | """Get the full citation info for a citation ID.""" 61 | return self.citations.get(citation_id) 62 | 63 | def get_all_citations(self) -> Dict[int, Dict[str, Any]]: 64 | """Return all registered citations.""" 65 | return self.citations 66 | 67 | def get_all_citation_urls(self) -> List[str]: 68 | """Return all unique cited URLs in order of first citation.""" 69 | return [self.id_to_url[cid] for cid in sorted(self.id_to_url.keys())] 70 | 71 | def get_citation_contexts(self, citation_id: int) -> List[str]: 72 | """Get the contexts in which a citation was used.""" 73 | return self.citation_contexts.get(citation_id, []) 74 | 75 | def bulk_register_sources(self, source_urls: List[str]) -> None: 76 | """Pre-register a list of sources without assigning contexts.""" 77 | for url in source_urls: 78 | if url not in self.url_to_id: 79 | self.register_citation(url) 80 | 81 | def update_citation_metadata(self, citation_id: int, metadata: Dict[str, Any]) -> None: 82 | """Update metadata for a citation (e.g., add title, date, etc.).""" 83 | if citation_id in self.citations: 84 | self.citations[citation_id].update(metadata) 85 | 86 | def validate_citations(self, text: str) -> Dict[str, Any]: 87 | """ 88 | Validate all citations in a text against the registry. 
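        Citations are expected in bracketed numeric form (e.g. [3]); any ID that is not registered,
        or that exceeds the highest registered ID, is reported as invalid.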
89 | 90 | Args: 91 | text: The text content to validate citations in 92 | 93 | Returns: 94 | Dict containing validation results with keys: 95 | - valid: Boolean indicating if all citations are valid 96 | - invalid_citations: Set of invalid citation IDs 97 | - missing_citations: Set of citation IDs in the registry not used in the text 98 | - used_citations: Set of citation IDs that are actually used in the text 99 | - out_of_range_citations: Set of citation IDs that exceed the maximum registered ID 100 | """ 101 | import re 102 | 103 | citation_pattern = re.compile(r'\[(\d+)\]') 104 | used_citations = set(int(cid) for cid in citation_pattern.findall(text) if cid.isdigit()) 105 | 106 | registry_ids = set(self.citations.keys()) 107 | invalid_citations = used_citations - registry_ids 108 | missing_citations = registry_ids - used_citations 109 | 110 | # Identify citations that exceed the maximum registered ID 111 | max_id = max(registry_ids) if registry_ids else 0 112 | out_of_range_citations = {cid for cid in used_citations if cid > max_id} 113 | 114 | invalid_citations = invalid_citations.union(out_of_range_citations) 115 | 116 | return { 117 | "valid": len(invalid_citations) == 0, 118 | "invalid_citations": invalid_citations, 119 | "missing_citations": missing_citations, 120 | "used_citations": used_citations, 121 | "out_of_range_citations": out_of_range_citations, 122 | "max_valid_id": max_id 123 | } 124 | -------------------------------------------------------------------------------- /shandu/config.py: -------------------------------------------------------------------------------- 1 | """Configuration management module.""" 2 | import os 3 | import json 4 | from typing import Dict, Any, Optional 5 | from pathlib import Path 6 | import datetime 7 | 8 | DEFAULT_CONFIG = { 9 | "api": { 10 | "base_url": "https://api.openai.com/v1", 11 | "api_key": "", 12 | "model": "gpt-4", 13 | "temperature": 0 14 | }, 15 | "search": { 16 | "engines": ["duckduckgo", "google"], 17 | "max_results": 10, 18 | "region": "wt-wt", 19 | "safesearch": "moderate", 20 | "user_agent": "Research 1.0" 21 | }, 22 | "research": { 23 | "default_depth": 2, 24 | "default_breadth": 4, 25 | "max_depth": 5, 26 | "max_breadth": 10, 27 | "max_urls_per_query": 3 28 | }, 29 | "scraper": { 30 | "timeout": 30, 31 | "max_retries": 3, 32 | "chunk_size": 1000, 33 | "chunk_overlap": 200, 34 | "proxy": None 35 | }, 36 | "display": { 37 | "verbose": False, 38 | "show_progress": True, 39 | "show_chain_of_thought": True 40 | } 41 | } 42 | 43 | class Config: 44 | """Configuration manager.""" 45 | 46 | def __init__(self): 47 | self._config = DEFAULT_CONFIG.copy() 48 | self._config_path = os.path.expanduser("~/.shandu/config.json") 49 | self._load_config() 50 | self._load_env_vars() 51 | 52 | def _load_config(self): 53 | """Load config from file.""" 54 | config_path = Path(self._config_path) 55 | if config_path.exists(): 56 | try: 57 | with open(config_path) as f: 58 | file_config = json.load(f) 59 | self._update_nested_dict(self._config, file_config) 60 | except Exception as e: 61 | print(f"Error loading config file: {e}") 62 | 63 | def _load_env_vars(self): 64 | """Load config from environment variables.""" 65 | if os.environ.get("OPENAI_API_BASE"): 66 | self._config["api"]["base_url"] = os.environ["OPENAI_API_BASE"] 67 | if os.environ.get("OPENAI_API_KEY"): 68 | self._config["api"]["api_key"] = os.environ["OPENAI_API_KEY"] 69 | if os.environ.get("OPENAI_MODEL_NAME"): 70 | self._config["api"]["model"] = os.environ["OPENAI_MODEL_NAME"] 71 | 72 | 
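        # Optional proxy and user-agent overrides for the scraper and search modules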
if os.environ.get("SHANDU_PROXY"): 73 | self._config["scraper"]["proxy"] = os.environ["SHANDU_PROXY"] 74 | 75 | if os.environ.get("USER_AGENT"): 76 | self._config["search"]["user_agent"] = os.environ["USER_AGENT"] 77 | 78 | def _update_nested_dict(self, d: Dict, u: Dict): 79 | """Update nested dictionary.""" 80 | for k, v in u.items(): 81 | if isinstance(v, dict) and k in d and isinstance(d[k], dict): 82 | self._update_nested_dict(d[k], v) 83 | else: 84 | d[k] = v 85 | 86 | def save(self): 87 | """Save config to file.""" 88 | config_path = Path(self._config_path) 89 | config_path.parent.mkdir(exist_ok=True, parents=True) 90 | with open(config_path, "w") as f: 91 | json.dump(self._config, f, indent=2) 92 | 93 | def get(self, section: str, key: str, default: Any = None) -> Any: 94 | """Get config value.""" 95 | try: 96 | return self._config[section][key] 97 | except KeyError: 98 | return default 99 | 100 | def set(self, section: str, key: str, value: Any): 101 | """Set config value.""" 102 | if section not in self._config: 103 | self._config[section] = {} 104 | self._config[section][key] = value 105 | 106 | def get_section(self, section: str) -> Dict[str, Any]: 107 | """Get config section.""" 108 | return self._config.get(section, {}).copy() 109 | 110 | def get_all(self) -> Dict[str, Any]: 111 | """Get all config.""" 112 | return self._config.copy() 113 | 114 | config = Config() 115 | 116 | def get_current_date() -> str: 117 | """Get current date.""" 118 | return datetime.datetime.now().strftime("%Y-%m-%d") 119 | 120 | def get_current_datetime() -> str: 121 | """Get current date and time.""" 122 | return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 123 | 124 | def get_user_agent() -> str: 125 | """Get user agent string.""" 126 | configured_agent = config.get("search", "user_agent", None) 127 | if configured_agent and configured_agent != "Research 1.0": 128 | return configured_agent 129 | 130 | try: 131 | from fake_useragent import UserAgent 132 | ua = UserAgent() 133 | return ua.random 134 | except ImportError: 135 | import random 136 | fake_user_agents = [ 137 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 138 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", 139 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0", 140 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", 141 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 142 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0" 143 | ] 144 | return random.choice(fake_user_agents) 145 | except Exception as e: 146 | print(f"Error generating user agent: {e}") 147 | return "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" 148 | -------------------------------------------------------------------------------- /shandu/prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | Centralized prompts for Shandu deep research system. 3 | All prompts used throughout the system are defined here for easier maintenance. 
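Use the safe_format() helper below when interpolating values that may themselves contain literal curly braces.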
4 | """ 5 | from typing import Dict, Any 6 | 7 | # Utility function to safely format prompts with content that may contain curly braces 8 | def safe_format(template: str, **kwargs: Any) -> str: 9 | """ 10 | Safely format a template string, escaping any curly braces in the values. 11 | This prevents ValueError when content contains unexpected curly braces. 12 | """ 13 | # Escape any curly braces in the values 14 | safe_kwargs = {k: v.replace('{', '{{').replace('}', '}}') if isinstance(v, str) else v 15 | for k, v in kwargs.items()} 16 | return template.format(**safe_kwargs) 17 | 18 | # System prompts 19 | SYSTEM_PROMPTS: Dict[str, str] = { 20 | "research_agent": """You are an expert research agent with a strict mandate to investigate topics in exhaustive detail. Adhere to the following instructions without deviation: 21 | 22 | 1. You MUST break down complex queries into smaller subqueries to thoroughly explore each component. 23 | 2. You MUST consult and analyze multiple sources for comprehensive information. 24 | 3. You MUST verify and cross-check findings from all sources for accuracy. 25 | 4. You MUST provide deep insights and structured reasoning through self-reflection. 26 | 5. You MUST produce meticulously detailed research reports. 27 | 28 | REQUIRED CONDUCT: 29 | - Assume user statements referring to events beyond your known timeline are correct if explicitly indicated as new information. 30 | - The user is highly experienced, so maintain a sophisticated level of detail. 31 | - Provide thoroughly organized and carefully reasoned responses. 32 | - Anticipate additional angles and solutions beyond the immediate scope. 33 | - NEVER make unwarranted assumptions. If information is uncertain, state so clearly. 34 | - ALWAYS correct mistakes promptly and without hesitation. 35 | - NEVER rely on authoritative claims alone. Base responses on thorough analysis of the content. 36 | - Acknowledge new or unconventional technologies and ideas but label speculative elements clearly. 37 | 38 | When examining any sources, you must carefully seek: 39 | - Primary sources and official data 40 | - Recent, up-to-date materials 41 | - Expert analyses with strong evidence 42 | - Cross-verification of major claims 43 | 44 | You must strictly address the current query as follows: 45 | Current query: {{query}} 46 | Research depth: {{depth}} 47 | Research breadth: {{breadth}}""", 48 | 49 | "initialize": """You are an expert research agent with a strict mandate to devise a comprehensive research plan. You must adhere to the following directives without exception: 50 | 51 | Current date: {{current_date}} 52 | 53 | Your mission is to produce a meticulous research plan for the given query. You must: 54 | 1. Rigorously decompose the query into key subtopics and objectives. 55 | 2. Identify robust potential information sources and potential angles of investigation. 56 | 3. Weigh multiple perspectives and acknowledge any biases explicitly. 57 | 4. Devise reliable strategies for verifying gathered information from diverse sources. 58 | 59 | Your response must appear as plain text with clear section headings, but no special formatting or extraneous commentary. Remain strictly methodical and thorough throughout.""", 60 | 61 | "reflection": """You are strictly required to analyze the assembled research findings in detail to generate well-founded insights. Today's date: {{current_date}} 62 | 63 | You must: 64 | - Conduct a thorough, critical, and balanced assessment. 
65 | - Identify patterns, contradictions, and content that is not directly relevant. 66 | - Evaluate the reliability of sources, accounting for potential biases. 67 | - Highlight areas necessitating further information, with recommendations for refining focus. 68 | 69 | Ensure that you identify subtle insights and potential oversights, emphasizing depth and rigor in your analysis.""", 70 | 71 | "query_generation": """You must generate specific, targeted search queries with unwavering precision to investigate discrete aspects of a research topic. Today's date: {{current_date}}. 72 | 73 | You are required to: 74 | - Craft queries in everyday language, avoiding academic or overly formal phrasing. 75 | - Ensure queries are succinct but laser-focused on pinpointing needed information. 76 | - Avoid any extraneous formatting or labeling (like numbering or categories). 77 | - Provide direct, natural-sounding queries that a real person would input into a search engine.""", 78 | 79 | "url_relevance": """You must evaluate whether the provided search result directly addresses the given query. If it does, respond with "RELEVANT". Otherwise, respond with "IRRELEVANT". Provide no additional words or statements beyond this single-word response.""", 80 | 81 | "content_analysis": """You must meticulously analyze the provided web content regarding "{{query}}" to produce a structured, in-depth examination. Your analysis must: 82 | 83 | 1. Thoroughly identify and explain major themes. 84 | 2. Extract relevant evidence, statistics, and data points in a clear, organized format. 85 | 3. Integrate details from multiple sources into cohesive, thematic sections. 86 | 4. Eliminate contradictions and duplications. 87 | 5. Evaluate source reliability briefly but directly. 88 | 6. Present extensive exploration of key concepts with robust detail. 89 | 90 | Present your findings in a methodically organized, well-structured format using clear headings, bullet points, and direct quotes where necessary.""", 91 | 92 | "source_reliability": """You must examine this source in two strictly delineated parts: 93 | 94 | PART 1 – RELIABILITY ASSESSMENT: 95 | Rate reliability as HIGH, MEDIUM, or LOW based on domain reputation, author expertise, citations, objectivity, and recency. Provide a concise rationale (1-2 sentences). 96 | 97 | PART 2 – EXTRACTED CONTENT: 98 | Deliver an exhaustive extraction of all relevant data, statistics, opinions, methodologies, and context directly related to the query. Do not omit any critical information. Be thorough yet organized.""", 99 | 100 | "report_generation": """You must compile a comprehensive research report. Today's date: {{current_date}}. 101 | 102 | MANDATORY REQUIREMENTS: 103 | 1. DO NOT begin with a "Research Framework," "Objective," or any meta-commentary. Start with a # Title. 104 | 2. The structure must be entirely dynamic with headings that reflect the content naturally. 105 | 3. Substantiate factual statements with appropriate references. 106 | 4. Provide detailed paragraphs for every major topic or section. 107 | 108 | MARKDOWN ENFORCEMENT: 109 | - Use headings (#, ##, ###) carefully to maintain a hierarchical structure. 110 | - Incorporate tables, bolding, italics, code blocks, blockquotes, and horizontal rules as appropriate. 111 | - Maintain significant spacing for readability. 112 | 113 | CONTENT VOLUME AND DEPTH: 114 | - Each main section should be comprehensive and detailed. 
115 | - Offer thorough historical context, theoretical underpinnings, practical applications, and future perspectives. 116 | - Provide a high level of detail, including multiple examples and case studies. 117 | 118 | REFERENCES: 119 | - Include well-chosen references that support key claims. 120 | - Cite them in bracketed numeric form [1], [2], etc., with a single reference list at the end. 121 | 122 | STRICT META AND FORMATTING RULES: 123 | - Never include extraneous statements about your process, the research framework, or time taken. 124 | - The final document should read as a polished, standalone publication of the highest scholarly caliber. 125 | {{objective_instruction}}""", 126 | 127 | "clarify_query": """You must generate clarifying questions to refine the research query with strict adherence to: 128 | - Eliciting specific details about user goals, scope, and knowledge level. 129 | - Avoiding extraneous or trivial queries. 130 | - Providing precisely 4-5 targeted questions. 131 | 132 | Today's date: {{current_date}}. 133 | 134 | These questions must seek to clarify the exact focal points, the depth of detail, constraints, and user background knowledge. Provide them succinctly and plainly, with no added commentary.""", 135 | 136 | "refine_query": """You must refine the research query into a strict, focused direction based on user-provided answers. Today's date: {{current_date}}. 137 | 138 | REQUIREMENTS: 139 | - DO NOT present any "Research Framework" or "Objective" headings. 140 | - Provide a concise topic statement followed by 2-3 paragraphs integrating all key points from the user. 141 | - Preserve all critical details mentioned by the user. 142 | - The format must be simple plain text with no extraneous headings or bullet points.""", 143 | 144 | "report_enhancement": """You must enhance an existing research report for greater depth and clarity. Today's date: {{current_date}}. 145 | 146 | MANDATORY ENHANCEMENT DIRECTIVES: 147 | 1. Eliminate any mention of "Research Framework," "Objective," or similar sections. 148 | 2. Start with a # heading for the report title, with no meta-commentary. 149 | 3. Use references that provide valuable supporting evidence. 150 | 4. Transform each section into a thorough analysis with comprehensive paragraphs. 151 | 5. Use markdown formatting, including headings, bold, italics, code blocks, blockquotes, tables, and horizontal rules, to create a highly readable, visually structured document. 152 | 6. Omit any mention of time spent or processes used to generate the report. 153 | 154 | CONTENT ENHANCEMENT: 155 | - Improve depth and clarity throughout. 156 | - Provide more examples, historical backgrounds, theoretical frameworks, and future directions. 157 | - Compare multiple viewpoints and delve into technical complexities. 158 | - Maintain cohesive narrative flow and do not introduce contradictory information. 159 | 160 | Your final product must be an authoritative work that exhibits academic-level depth, thoroughness, and clarity.""", 161 | 162 | "section_expansion": """You must significantly expand the specified section of the research report. Strictly adhere to the following: 163 | 164 | - Add newly written paragraphs of in-depth analysis and context. 165 | - Employ extensive markdown for headings, tables, bold highlights, italics, code blocks, blockquotes, and lists. 166 | - Include comprehensive examples, case studies, historical trajectories, theoretical frameworks, and nuanced viewpoints. 
167 | 168 | Transform this section into an authoritative, stand-alone piece that could be published independently, demonstrating meticulous scholarship and thorough reasoning. 169 | 170 | Section to expand: {{section}}""", 171 | 172 | "smart_source_selection": """You must carefully select the most critical 15-25 sources from a large set. Your selection must follow these strict standards: 173 | 174 | 1. DIRECT RELEVANCE: The source must explicitly address the core research question. 175 | 2. INFORMATION DENSITY: The source must provide significant unique data. 176 | 3. CREDIBILITY: The source must be authoritative and reliable. 177 | 4. RECENCY: The source must be updated enough for the topic. 178 | 5. DIVERSITY: The source must offer unique perspectives or insights. 179 | 6. DEPTH: The source must present thorough, detailed analysis. 180 | 181 | Present only the URLs of the selected sources, ordered by overall value, with no justifications or commentary.""", 182 | 183 | "citation_formatter": """You must format each source into a rigorous citation that includes: 184 | - Publication or website name 185 | - Author(s) if available 186 | - Title of the article or page 187 | - Publication date if available 188 | - URL 189 | 190 | Number each citation in sequential bracketed format [n]. Maintain consistency and do not add any extra explanations or remarks. Provide citations only, with correct, clear structure.""", 191 | 192 | "multi_step_synthesis": """You must perform a multi-step synthesis of research findings. Current date: {{current_date}}. 193 | 194 | In this step ({{step_number}} of {{total_steps}}), you are strictly required to: 195 | {{current_step}} 196 | 197 | Guidelines: 198 | 1. Integrate information from multiple sources into a coherent narrative on the specified aspect. 199 | 2. Identify patterns and connections relevant to this focus. 200 | 3. Develop a thorough, evidence-backed analysis with examples. 201 | 4. Note any contradictions or open questions. 202 | 5. Build upon prior steps to move toward a comprehensive final report. 203 | 204 | Your synthesis must be precise, deeply reasoned, and self-consistent. Provide multiple paragraphs of thorough explanation.""" 205 | } 206 | 207 | # User prompts 208 | USER_PROMPTS: Dict[str, str] = { 209 | "reflection": """You must deliver a deeply detailed analysis of current findings, strictly following these points: 210 | 211 | 1. Clearly state the key insights discovered, assessing evidence strength. 212 | 2. Identify critical unanswered questions and explain their significance. 213 | 3. Evaluate the reliability and biases of sources. 214 | 4. Pinpoint areas needing deeper inquiry, suggesting investigative methods. 215 | 5. Highlight subtle patterns or connections among sources. 216 | 6. Disregard irrelevant or tangential information. 217 | 218 | Ensure your analysis is methodical, multi-perspectival, and strictly evidence-based. Provide structured paragraphs with logical progression.""", 219 | 220 | "query_generation": """Generate {{breadth}} strictly focused search queries to investigate the main query: {{query}} 221 | 222 | Informed by the current findings and reflection: {{findings}} 223 | 224 | INSTRUCTIONS FOR YOUR QUERIES: 225 | 1. Each query must be phrased in natural, conversational language. 226 | 2. Keep them concise, typically under 10 words. 227 | 3. Address explicit knowledge gaps identified in the reflection. 228 | 4. Do not number or list them. Place each query on its own line. 229 | 5. Avoid academic or formal language. 
230 | 231 | Provide only the queries, nothing else.""", 232 | 233 | "url_relevance": """You must judge if the following search result directly addresses the query. If yes, respond "RELEVANT"; if no, respond "IRRELEVANT". Supply only that single word. 234 | 235 | Query: {{query}} 236 | Title: {{title}} 237 | URL: {{url}} 238 | Snippet: {{snippet}}""", 239 | 240 | "content_analysis": """You must carefully analyze the provided content for "{{query}}" and produce a comprehensive thematic report. The content is: 241 | 242 | {{content}} 243 | 244 | Your analysis must include: 245 | 1. Clear identification of major themes. 246 | 2. Exhaustive extraction of facts, statistics, and data. 247 | 3. Organized sections that integrate multiple sources. 248 | 4. Background context for significance. 249 | 5. Comparison of differing perspectives or methodologies. 250 | 6. Detailed case studies and examples. 251 | 252 | Use markdown headings and bullet points for clarity. Include direct quotes for notable expert statements. Bold key findings or statistics for emphasis. Focus on thoroughness and precision.""", 253 | 254 | "source_reliability": """Source URL: {{url}} 255 | Title: {{title}} 256 | Query: {{query}} 257 | Content: {{content}} 258 | 259 | You must respond in two segments: 260 | 261 | RELIABILITY: 262 | - Rate the source as HIGH, MEDIUM, or LOW. In 1-2 sentences, justify your rating using domain authority, author credentials, objectivity, and methodological soundness. 263 | 264 | EXTRACTED_CONTENT: 265 | - Provide every relevant data point, example, statistic, or expert opinion from the source. Organize logically and maintain fidelity to the source's meaning. 266 | 267 | No additional commentary is permitted beyond these two required sections.""", 268 | 269 | "report_generation": """You must produce an all-encompassing research report for the query: {{query}} 270 | 271 | Analyzed Findings: {{analyzed_findings}} 272 | Number of sources: {{num_sources}} 273 | 274 | MANDATORY REQUIREMENTS: 275 | - The final document must exceed 15,000 words, with no exceptions. 276 | - Do NOT include a "Research Framework" or "Objective" heading. 277 | - Start with a descriptive title using #, then proceed to a detailed introduction. 278 | - Restrict references to a maximum of 15-25 carefully selected sources. 279 | - Each major topic requires 7-10 paragraphs of deep analysis. 280 | 281 | STRUCTURE: 282 | 1. Title 283 | 2. Introduction (500-800 words minimum) 284 | 3. Main Body: 5-10 major sections, each at least 1,000-1,500 words, subdivided into 3-5 subsections. 285 | 4. Conclusion (800-1,000 words) summarizing insights and projecting future directions. 286 | 5. References: 15-25 high-quality sources, numbered [1], [2], etc. 287 | 288 | CONTENT DEMANDS: 289 | - Provide extensive details, including examples, comparisons, and historical context. 290 | - Discuss theories, practical applications, and prospective developments. 291 | - Weave in data from your analysis but do not rely on repeated citations. 292 | - Maintain an authoritative tone with thorough arguments, disclaimers for speculation, and consistent use of markdown elements. 293 | 294 | Deliver a final product that stands as a definitive, publishable resource on this topic.""", 295 | 296 | "initialize": """Formulate a comprehensive plan for researching: 297 | {{query}} 298 | 299 | You must: 300 | 1. Identify 5-7 major aspects of the topic. 301 | 2. Specify key questions for each aspect. 302 | 3. Propose relevant sources (academic, governmental, etc.). 303 | 4. 
Outline the methodological approach for thorough coverage. 304 | 5. Anticipate potential obstacles and suggest mitigating strategies. 305 | 6. Highlight possible cross-cutting themes. 306 | 307 | Present your response as plain text with simple section headings. Remain direct and systematic, without superfluous elaboration or meta commentary.""", 308 | 309 | "clarify_query": """You must generate 4-5 follow-up questions to further pinpoint the research scope for "{{query}}". These questions must: 310 | 311 | 1. Narrow down or clarify the exact topic aspects the user prioritizes. 312 | 2. Determine the technical depth or simplicity required. 313 | 3. Identify relevant time frames, geographies, or perspectives. 314 | 4. Probe for the user's background knowledge and specific interests. 315 | 316 | Keep each question concise and purposeful. Avoid extraneous details or explanations.""", 317 | 318 | "refine_query": """Original query: {{query}} 319 | Follow-up Q&A: 320 | {{qa}} 321 | 322 | You must finalize a refined research direction by: 323 | 324 | 1. Stating a concise topic statement without additional labels. 325 | 2. Expanding it in 2-3 paragraphs that incorporate all relevant user concerns, constraints, and goals. 326 | 327 | Remember: 328 | - Never refer to any "Research Framework" or structural headings. 329 | - Write in natural, flowing text without bullet points. 330 | - Provide no meta commentary about the research process.""", 331 | 332 | "report_enhancement": """You must enhance the following research report to dramatically increase its depth and scope: 333 | 334 | {{initial_report}} 335 | 336 | REQUIRED: 337 | - At least double the existing word count. 338 | - Expand each section with additional paragraphs of analysis, examples, and context. 339 | - Keep references consistent but do not add more than the existing cited sources. 340 | - Use advanced markdown formatting, maintain logical flow, and strictly avoid contradictory information. 341 | 342 | Aim for a polished and authoritative final version with thoroughly developed arguments in every section.""", 343 | 344 | "section_expansion": """Expand the following research report section significantly: 345 | 346 | {{section}} 347 | 348 | MANDATORY: 349 | 1. Add 3-5 new paragraphs with deeper analysis, examples, or data. 350 | 2. Incorporate alternative perspectives, historical background, or technical details. 351 | 3. Retain the original content but build upon it. 352 | 353 | Maintain the same style and referencing system, avoiding contradictions or redundant text. Ensure the expansion is coherent and stands as a robust discourse on the topic.""", 354 | 355 | "smart_source_selection": """Your mission is to filter sources for the research on {{query}} to only the most essential 15-20. The sources are: 356 | 357 | {{sources}} 358 | 359 | SELECTION CRITERIA: 360 | 1. Relevance to the core question. 361 | 2. Credibility and authority. 362 | 3. Uniqueness of perspective or data. 363 | 4. Depth of analysis offered. 364 | 365 | Provide the final list of chosen sources, ranked by priority, and include a brief rationale for each. 
Summaries must be concise and free from extraneous commentary.""", 366 | 367 | "citation_formatter": """Format the following sources into standardized references: 368 | 369 | {{sources}} 370 | 371 | Each citation must: 372 | - Include publication name or website 373 | - List author(s) if available 374 | - Provide the title 375 | - Give the publication date if available 376 | - Show the URL 377 | 378 | Use a numbered [n] format for each entry. Maintain consistency and brevity, without additional remarks beyond these essential details.""", 379 | 380 | "multi_step_synthesis": """You must perform a targeted synthesis step for the multi-step process. For this specific portion: 381 | 382 | {{current_step}} 383 | 384 | Relevant findings: 385 | {{findings}} 386 | 387 | Instructions: 388 | 1. Integrate the above findings cohesively, focusing on {{current_step}}. 389 | 2. Identify patterns, discrepancies, or important details relevant to the broader topic. 390 | 3. Provide thorough explanations, citing data where pertinent. 391 | 4. Connect this step to the overall research direction. 392 | 393 | This is step {{step_number}} of {{total_steps}} in a multi-layered synthesis. Produce a clear, detailed discussion of your progress here, strictly guided by the given instructions.""" 394 | } 395 | -------------------------------------------------------------------------------- /shandu/research/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Research module for Shandu deep research system. 3 | """ 4 | 5 | from .researcher import DeepResearcher, ResearchResult 6 | 7 | __all__ = ["DeepResearcher", "ResearchResult"] 8 | -------------------------------------------------------------------------------- /shandu/research/researcher.py: -------------------------------------------------------------------------------- 1 | """Research module implementation.""" 2 | from typing import List, Dict, Optional, Any, Union 3 | from dataclasses import dataclass, field 4 | from datetime import datetime 5 | import json 6 | from pathlib import Path 7 | import os 8 | 9 | @dataclass 10 | class ResearchResult: 11 | """Container for research results with enhanced citation tracking.""" 12 | query: str 13 | summary: str 14 | sources: List[Dict[str, Any]] 15 | subqueries: List[str] 16 | depth: int 17 | content_analysis: Optional[List[Dict[str, Any]]] = None 18 | chain_of_thought: Optional[List[str]] = None 19 | research_stats: Optional[Dict[str, Any]] = None 20 | citation_stats: Optional[Dict[str, Any]] = None # New field for tracking citation statistics 21 | timestamp: datetime = field(default_factory=datetime.now) 22 | 23 | def to_markdown(self, include_chain_of_thought: bool = False, include_objective: bool = False) -> str: 24 | """Convert research results to markdown format including citation statistics.""" 25 | stats = self.research_stats or {} 26 | elapsed_time = stats.get("elapsed_time_formatted", "Unknown") 27 | sources_count = stats.get("sources_count", len(self.sources)) 28 | subqueries_count = stats.get("subqueries_count", len(self.subqueries)) 29 | 30 | citation_stats = self.citation_stats or {} 31 | total_sources = citation_stats.get("total_sources", sources_count) 32 | total_learnings = citation_stats.get("total_learnings", 0) 33 | 34 | summary = self.summary 35 | 36 | lines = summary.split("\n") 37 | 38 | # Remove specific artifacts that can appear in the output 39 | cleaned_lines = [] 40 | for line in lines: 41 | # Skip lines with these patterns 42 | if 
(line.strip().startswith("*Generated on:") or 43 | line.strip().startswith("Completed:") or 44 | "Here are" in line and ("search queries" in line or "queries to investigate" in line) or 45 | line.strip() == "Research Framework:" or 46 | "Key Findings:" in line or 47 | "Key aspects to focus on:" in line): 48 | continue 49 | cleaned_lines.append(line) 50 | 51 | summary = "\n".join(cleaned_lines) 52 | 53 | # Fix the "Research Report: **Objective:**" formatting issue 54 | if summary.startswith("# Research Report: **Objective:**"): 55 | summary = summary.replace("# Research Report: **Objective:**", "# Research Report") 56 | 57 | # Remove objective section if not requested 58 | if not include_objective and "**Objective:**" in summary: 59 | # Split by sections 60 | parts = summary.split("## ") 61 | filtered_parts = [] 62 | 63 | for part in parts: 64 | # Keep executive summary or empty parts 65 | if part.startswith("Executive Summary") or not part.strip(): 66 | filtered_parts.append(part) 67 | continue 68 | 69 | # Skip objective section 70 | if "**Objective:**" in part and "**Key Aspects to Focus On:**" in part: 71 | continue 72 | 73 | # Keep other sections 74 | filtered_parts.append(part) 75 | 76 | # Reconstruct the summary 77 | if filtered_parts: 78 | if not filtered_parts[0].startswith("Executive Summary"): 79 | summary = "## ".join(filtered_parts) 80 | else: 81 | summary = filtered_parts[0] + "## " + "## ".join(filtered_parts[1:]) 82 | 83 | md = [ 84 | f"# {self.query}\n", 85 | f"{summary}\n" 86 | ] 87 | 88 | md.append("## Research Process\n") 89 | md.append(f"- **Depth**: {self.depth}") 90 | md.append(f"- **Breadth**: {stats.get('breadth', 'Not specified')}") 91 | md.append(f"- **Time Taken**: {elapsed_time}") 92 | md.append(f"- **Subqueries Explored**: {subqueries_count}") 93 | md.append(f"- **Sources Analyzed**: {sources_count}") 94 | 95 | if total_learnings > 0: 96 | md.append(f"- **Total Learnings Extracted**: {total_learnings}") 97 | md.append(f"- **Source Coverage**: {total_sources} sources with {total_learnings} tracked information points") 98 | 99 | source_reliability = citation_stats.get("source_reliability", {}) 100 | if source_reliability: 101 | md.append(f"- **Source Quality**: {len(source_reliability)} domains assessed for reliability\n") 102 | else: 103 | md.append("") 104 | else: 105 | md.append("") 106 | 107 | if include_chain_of_thought and self.chain_of_thought: 108 | md.append("## Research Process: Chain of Thought\n") 109 | significant_thoughts = [] 110 | 111 | for thought in self.chain_of_thought: 112 | # Skip generic or repetitive thoughts and output artifacts 113 | if any(x in thought.lower() for x in [ 114 | "searching for", "selected relevant url", "completed", 115 | "here are", "generated search queries", "queries to investigate" 116 | ]): 117 | continue 118 | significant_thoughts.append(thought) 119 | 120 | if len(significant_thoughts) > 20: 121 | selected_thoughts = ( 122 | significant_thoughts[:5] + 123 | significant_thoughts[len(significant_thoughts)//2-2:len(significant_thoughts)//2+3] + 124 | significant_thoughts[-5:] 125 | ) 126 | else: 127 | selected_thoughts = significant_thoughts 128 | 129 | for thought in selected_thoughts: 130 | md.append(f"- {thought}") 131 | md.append("") 132 | 133 | return "\n".join(md) 134 | 135 | def to_dict(self) -> Dict[str, Any]: 136 | """Convert to dictionary format.""" 137 | result = { 138 | "query": self.query, 139 | "summary": self.summary, 140 | "sources": self.sources, 141 | "subqueries": self.subqueries, 142 | "depth": 
self.depth, 143 | "content_analysis": self.content_analysis, 144 | "chain_of_thought": self.chain_of_thought, 145 | "research_stats": self.research_stats, 146 | "timestamp": self.timestamp.isoformat() 147 | } 148 | 149 | if self.citation_stats: 150 | result["citation_stats"] = self.citation_stats 151 | 152 | return result 153 | 154 | def save_to_file(self, filepath: str, include_chain_of_thought: bool = False, include_objective: bool = False) -> None: 155 | """Save research results to a file.""" 156 | directory = os.path.dirname(filepath) 157 | if directory: 158 | os.makedirs(directory, exist_ok=True) 159 | 160 | _, ext = os.path.splitext(filepath) 161 | ext = ext.lower() 162 | 163 | if ext == '.md': 164 | # Save as markdown 165 | with open(filepath, 'w', encoding='utf-8') as f: 166 | f.write(self.to_markdown(include_chain_of_thought, include_objective)) 167 | elif ext == '.json': 168 | # Save as JSON 169 | with open(filepath, 'w', encoding='utf-8') as f: 170 | json.dump(self.to_dict(), f, indent=2, default=str) 171 | else: 172 | # Default to markdown 173 | with open(filepath, 'w', encoding='utf-8') as f: 174 | f.write(self.to_markdown(include_chain_of_thought, include_objective)) 175 | 176 | @classmethod 177 | def from_dict(cls, data: Dict[str, Any]) -> 'ResearchResult': 178 | """Create a ResearchResult from a dictionary.""" 179 | if 'timestamp' in data and isinstance(data['timestamp'], str): 180 | data['timestamp'] = datetime.fromisoformat(data['timestamp']) 181 | 182 | return cls(**data) 183 | 184 | @classmethod 185 | def load_from_file(cls, filepath: str) -> 'ResearchResult': 186 | """Load research results from a file.""" 187 | with open(filepath, 'r', encoding='utf-8') as f: 188 | data = json.load(f) 189 | 190 | return cls.from_dict(data) 191 | 192 | class DeepResearcher: 193 | """Research orchestrator.""" 194 | def __init__( 195 | self, 196 | output_dir: Optional[str] = None, 197 | save_results: bool = True, 198 | auto_save_interval: Optional[int] = None 199 | ): 200 | """Initialize the researcher.""" 201 | self.output_dir = output_dir or os.path.expanduser("~/shandu_research") 202 | self.save_results = save_results 203 | self.auto_save_interval = auto_save_interval 204 | 205 | if self.save_results: 206 | os.makedirs(self.output_dir, exist_ok=True) 207 | 208 | def get_output_path(self, query: str, format: str = 'md') -> str: 209 | """Get output path for research results.""" 210 | sanitized = "".join(c if c.isalnum() or c in " -_" else "_" for c in query) 211 | sanitized = sanitized[:50] 212 | 213 | timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') 214 | filename = f"{sanitized}_{timestamp}.{format}" 215 | 216 | return os.path.join(self.output_dir, filename) 217 | 218 | async def research( 219 | self, 220 | query: str, 221 | strategy: str = 'langgraph', 222 | **kwargs 223 | ) -> ResearchResult: 224 | """Perform research using the specified strategy.""" 225 | from ..agents.langgraph_agent import ResearchGraph 226 | from ..agents.agent import ResearchAgent 227 | 228 | result = None 229 | 230 | if strategy == 'langgraph': 231 | graph = ResearchGraph() 232 | result = await graph.research(query, **kwargs) 233 | elif strategy == 'agent': 234 | agent = ResearchAgent() 235 | result = await agent.research(query, **kwargs) 236 | else: 237 | raise ValueError(f"Unknown research strategy: {strategy}") 238 | 239 | if self.save_results and result: 240 | md_path = self.get_output_path(query, 'md') 241 | result.save_to_file(md_path) 242 | 243 | json_path = self.get_output_path(query, 'json') 244 | 
result.save_to_file(json_path) 245 | 246 | return result 247 | 248 | def research_sync( 249 | self, 250 | query: str, 251 | strategy: str = 'langgraph', 252 | **kwargs 253 | ) -> ResearchResult: 254 | """Synchronous research wrapper.""" 255 | import asyncio 256 | return asyncio.run(self.research(query, strategy, **kwargs)) 257 | -------------------------------------------------------------------------------- /shandu/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scraper module for Shandu deep research system. 3 | """ 4 | 5 | from .scraper import WebScraper, ScrapedContent 6 | 7 | __all__ = ["WebScraper", "ScrapedContent"] 8 | -------------------------------------------------------------------------------- /shandu/search/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Search module for Shandu deep research system. 3 | """ 4 | 5 | from .search import UnifiedSearcher, SearchResult 6 | 7 | __all__ = ["UnifiedSearcher", "SearchResult"] 8 | -------------------------------------------------------------------------------- /shandu/search/ai_search.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Optional, Any, Union 2 | import asyncio 3 | import time 4 | from dataclasses import dataclass, field 5 | from datetime import datetime 6 | from langchain_core.prompts import ChatPromptTemplate 7 | from langchain_openai import ChatOpenAI 8 | from langchain_community.tools import DuckDuckGoSearchRun, DuckDuckGoSearchResults 9 | from .search import UnifiedSearcher, SearchResult 10 | from ..config import config 11 | from ..scraper import WebScraper, ScrapedContent 12 | from ..agents.utils.citation_manager import CitationManager, SourceInfo 13 | 14 | @dataclass 15 | class AISearchResult: 16 | """Container for AI-enhanced search results with enriched output and citation tracking.""" 17 | query: str 18 | summary: str 19 | sources: List[Dict[str, Any]] 20 | citation_stats: Optional[Dict[str, Any]] = None 21 | timestamp: datetime = field(default_factory=datetime.now)  # per-instance default, matching ResearchResult 22 | 23 | def to_markdown(self) -> str: 24 | """Convert to markdown format with improved readability.""" 25 | timestamp_str = self.timestamp.strftime('%Y-%m-%d %H:%M:%S') 26 | md = [ 27 | f"# {self.query}", 28 | "## Summary", 29 | self.summary, 30 | "## Sources" 31 | ] 32 | for i, source in enumerate(self.sources, 1): 33 | title = source.get('title', 'Untitled') 34 | url = source.get('url', '') 35 | snippet = source.get('snippet', '') 36 | source_type = source.get('source', 'Unknown') 37 | md.append(f"### {i}. 
{title}") 38 | if url: 39 | md.append(f"- **URL:** [{url}]({url})") 40 | if source_type: 41 | md.append(f"- **Source:** {source_type}") 42 | if snippet: 43 | md.append(f"- **Snippet:** {snippet}") 44 | md.append("") 45 | 46 | if self.citation_stats: 47 | md.append("## Research Process") 48 | md.append(f"- **Sources Analyzed**: {self.citation_stats.get('total_sources', len(self.sources))}") 49 | md.append(f"- **Key Information Points**: {self.citation_stats.get('total_learnings', 0)}") 50 | if self.citation_stats.get('source_reliability'): 51 | md.append(f"- **Source Quality**: {len(self.citation_stats.get('source_reliability', {}))} domains assessed") 52 | md.append("") 53 | 54 | return "\n".join(md) 55 | 56 | def to_dict(self) -> Dict[str, Any]: 57 | """Convert to dictionary format.""" 58 | result = { 59 | "query": self.query, 60 | "summary": self.summary, 61 | "sources": self.sources, 62 | "timestamp": self.timestamp.isoformat() 63 | } 64 | if self.citation_stats: 65 | result["citation_stats"] = self.citation_stats 66 | return result 67 | 68 | class AISearcher: 69 | """ 70 | AI-powered search functionality. 71 | Combines search results with AI analysis for any type of query. 72 | Enhanced with scraping capability, detailed outputs, source citations, and learning extraction. 73 | """ 74 | def __init__( 75 | self, 76 | llm: Optional[ChatOpenAI] = None, 77 | searcher: Optional[UnifiedSearcher] = None, 78 | scraper: Optional[WebScraper] = None, 79 | citation_manager: Optional[CitationManager] = None, 80 | max_results: int = 10, 81 | max_pages_to_scrape: int = 3 82 | ): 83 | api_base = config.get("api", "base_url") 84 | api_key = config.get("api", "api_key") 85 | model = config.get("api", "model") 86 | self.llm = llm or ChatOpenAI( 87 | base_url=api_base, 88 | api_key=api_key, 89 | model=model, 90 | temperature=0.4, 91 | max_tokens=8192 92 | ) 93 | self.searcher = searcher or UnifiedSearcher(max_results=max_results) 94 | self.scraper = scraper or WebScraper() 95 | self.citation_manager = citation_manager or CitationManager() 96 | self.max_results = max_results 97 | self.max_pages_to_scrape = max_pages_to_scrape 98 | 99 | self.ddg_search = DuckDuckGoSearchRun() 100 | self.ddg_results = DuckDuckGoSearchResults(output_format="list") 101 | 102 | async def search( 103 | self, 104 | query: str, 105 | engines: Optional[List[str]] = None, 106 | detailed: bool = False, 107 | enable_scraping: bool = True, 108 | use_ddg_tools: bool = True 109 | ) -> AISearchResult: 110 | """ 111 | Perform AI-enhanced search with detailed outputs and source citations. 
112 | 113 | Args: 114 | query: Search query (can be about any topic) 115 | engines: List of search engines to use 116 | detailed: Whether to generate a detailed analysis 117 | enable_scraping: Whether to scrape content from top results 118 | use_ddg_tools: Whether to use DuckDuckGo tools from langchain_community 119 | 120 | Returns: 121 | AISearchResult object with a comprehensive summary and cited sources 122 | """ 123 | timestamp = datetime.now() 124 | sources = [] 125 | 126 | # Use DuckDuckGo tools if enabled 127 | if use_ddg_tools and (not engines or 'duckduckgo' in engines): 128 | try: 129 | 130 | ddg_structured_results = self.ddg_results.invoke(query) 131 | for result in ddg_structured_results[:self.max_results]: 132 | source_info = { 133 | "title": result.get("title", "Untitled"), 134 | "url": result.get("link", ""), 135 | "snippet": result.get("snippet", ""), 136 | "source": "DuckDuckGo" 137 | } 138 | sources.append(source_info) 139 | 140 | # Register source with citation manager 141 | self._register_source_with_citation_manager(source_info) 142 | except Exception as e: 143 | print(f"Error using DuckDuckGoSearchResults: {e}") 144 | 145 | # Use UnifiedSearcher as a fallback or if DuckDuckGo tools are disabled 146 | if not sources or not use_ddg_tools: 147 | search_results = await self.searcher.search(query, engines) 148 | 149 | # Collect all sources 150 | for result in search_results: 151 | if isinstance(result, SearchResult): 152 | result_dict = result.to_dict() 153 | sources.append(result_dict) 154 | 155 | # Register source with citation manager 156 | self._register_source_with_citation_manager(result_dict) 157 | elif isinstance(result, dict): 158 | sources.append(result) 159 | 160 | # Register source with citation manager 161 | self._register_source_with_citation_manager(result) 162 | 163 | # Scrape additional content if enabled 164 | if enable_scraping: 165 | urls_to_scrape = [] 166 | for source in sources: 167 | if source.get('url') and len(urls_to_scrape) < self.max_pages_to_scrape: 168 | urls_to_scrape.append(source['url']) 169 | if urls_to_scrape: 170 | print(f"Scraping {len(urls_to_scrape)} pages for deeper insights...") 171 | scraped_results = await self.scraper.scrape_urls(urls_to_scrape, dynamic=True) 172 | for scraped in scraped_results: 173 | if hasattr(scraped, 'is_successful') and scraped.is_successful(): 174 | try: 175 | main_content = scraped.text 176 | if hasattr(self.scraper, 'extract_main_content'): 177 | main_content = await self.scraper.extract_main_content(scraped) 178 | if "unexpected error" in main_content.lower(): 179 | continue 180 | preview = main_content[:500] + ("...(truncated)" if len(main_content) > 1500 else "") 181 | source_info = { 182 | "title": scraped.title, 183 | "url": scraped.url, 184 | "snippet": preview, 185 | "source": "Scraped Content" 186 | } 187 | sources.append(source_info) 188 | 189 | # Register source with citation manager and extract learnings 190 | source_id = self._register_source_with_citation_manager(source_info) 191 | if source_id and main_content: 192 | self.citation_manager.extract_learning_from_text( 193 | main_content, 194 | scraped.url, 195 | context=f"Search query: {query}" 196 | ) 197 | except Exception as e: 198 | print(f"Error processing scraped content from {scraped.url}: {e}") 199 | 200 | # Prepare sources with improved citation format 201 | aggregated_text = "" 202 | for i, source in enumerate(sources, 1): 203 | 204 | url = source.get('url', '') 205 | domain = url.split("//")[1].split("/")[0] if "//" in url else 
"Unknown Source" 206 | # Capitalize first letter of domain for a more professional look 207 | domain_name = domain.split('.')[0].capitalize() if '.' in domain else domain 208 | 209 | aggregated_text += ( 210 | f"[{i}] {domain_name}\n" 211 | f"Title: {source.get('title', 'Untitled')}\n" 212 | f"URL: {url}\n" 213 | f"Snippet: {source.get('snippet', '')}\n\n" 214 | ) 215 | 216 | current_date = timestamp.strftime('%Y-%m-%d') 217 | if detailed: 218 | detail_instruction = ( 219 | "Provide a detailed analysis with in-depth explanations, " 220 | "specific examples, relevant background, and additional insights " 221 | "to enhance understanding of the topic." 222 | ) 223 | else: 224 | detail_instruction = "Provide a concise yet informative summary, focusing on the key points and essential information." 225 | 226 | final_prompt = f"""You are Shandu, an expert analyst. Based on the following sources retrieved on {current_date} for the query "{query}", {detail_instruction} 227 | 228 | - If the query is a question, answer it directly with a thorough explanation. 229 | - If it's a topic, provide a well-rounded overview with supporting details. 230 | - Use bullet points or numbered lists to organize information clearly. 231 | - If there are conflicting views or uncertainties, discuss them explicitly. 232 | - When providing information, cite the source by using the number in square brackets, like [1], to indicate where the information was sourced. 233 | - ONLY use the citation numbers provided in the sources below. 234 | - DO NOT include years or dates in your citations, just use the bracketed number like [1]. 235 | - Ensure the response is engaging, detailed, and written in plain text suitable for all readers. 236 | 237 | Sources: 238 | 239 | {aggregated_text} 240 | """ 241 | 242 | final_output = await self.llm.ainvoke(final_prompt) 243 | 244 | citation_stats = None 245 | if sources: 246 | citation_stats = { 247 | "total_sources": len(self.citation_manager.sources), 248 | "total_learnings": len(self.citation_manager.learnings), 249 | "source_reliability": self.citation_manager._calculate_source_reliability() 250 | } 251 | 252 | return AISearchResult( 253 | query=query, 254 | summary=final_output.content.strip(), 255 | sources=sources, 256 | citation_stats=citation_stats, 257 | timestamp=timestamp 258 | ) 259 | 260 | def _register_source_with_citation_manager(self, source: Dict[str, Any]) -> Optional[str]: 261 | """Register a source with the citation manager and return its ID.""" 262 | try: 263 | url = source.get('url', '') 264 | if not url: 265 | return None 266 | 267 | title = source.get('title', 'Untitled') 268 | snippet = source.get('snippet', '') 269 | source_type = source.get('source', 'web') 270 | 271 | domain = url.split("//")[1].split("/")[0] if "//" in url else "unknown" 272 | 273 | source_info = SourceInfo( 274 | url=url, 275 | title=title, 276 | snippet=snippet, 277 | source_type=source_type, 278 | content_type="article", 279 | access_time=time.time(), 280 | domain=domain, 281 | reliability_score=0.8, # Default score 282 | metadata=source 283 | ) 284 | 285 | return self.citation_manager.add_source(source_info) 286 | 287 | except Exception as e: 288 | print(f"Error registering source with citation manager: {e}") 289 | return None 290 | 291 | def search_sync( 292 | self, 293 | query: str, 294 | engines: Optional[List[str]] = None, 295 | detailed: bool = False, 296 | enable_scraping: bool = True, 297 | use_ddg_tools: bool = True 298 | ) -> AISearchResult: 299 | """Synchronous version of the search 
method.""" 300 | return asyncio.run(self.search(query, engines, detailed, enable_scraping, use_ddg_tools)) 301 | -------------------------------------------------------------------------------- /shandu/utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging utilities for Shandu. 3 | """ 4 | import os 5 | import logging 6 | import traceback 7 | import inspect 8 | from datetime import datetime 9 | from pathlib import Path 10 | 11 | log_dir = os.path.expanduser("~/.shandu/logs") 12 | Path(log_dir).mkdir(parents=True, exist_ok=True) 13 | 14 | logger = logging.getLogger("shandu") 15 | logger.setLevel(logging.DEBUG) 16 | 17 | current_date = datetime.now().strftime("%Y-%m-%d") 18 | log_file = os.path.join(log_dir, f"shandu_{current_date}.log") 19 | file_handler = logging.FileHandler(log_file, encoding='utf-8') 20 | file_handler.setLevel(logging.DEBUG) 21 | 22 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s') 23 | file_handler.setFormatter(formatter) 24 | 25 | logger.addHandler(file_handler) 26 | 27 | def get_caller_filename(): 28 | """ 29 | Get the filename of the caller. 30 | 31 | Returns: 32 | str: The filename of the caller. 33 | """ 34 | 35 | stack = inspect.stack() 36 | # The caller is the third frame in the stack (index 2) 37 | caller_frame = stack[2] 38 | 39 | caller_filename = os.path.basename(caller_frame.filename) 40 | return caller_filename 41 | 42 | def log_error(message, error, context=None): 43 | """ 44 | Log an error with detailed information. 45 | 46 | Args: 47 | message: The error message 48 | error: The exception object 49 | context: Additional context information (optional) 50 | """ 51 | caller_filename = get_caller_filename() 52 | error_details = f"[{caller_filename}] {message}: {str(error)}" 53 | if context: 54 | error_details += f" | Context: {context}" 55 | 56 | error_details += f"\nTraceback: {traceback.format_exc()}" 57 | 58 | logger.error(error_details) 59 | 60 | def log_warning(message, context=None): 61 | """ 62 | Log a warning with context information. 63 | 64 | Args: 65 | message: The warning message 66 | context: Additional context information (optional) 67 | """ 68 | caller_filename = get_caller_filename() 69 | warning_details = f"[{caller_filename}] {message}" 70 | if context: 71 | warning_details += f" | Context: {context}" 72 | 73 | logger.warning(warning_details) 74 | 75 | def log_info(message, context=None): 76 | """ 77 | Log an info message with context information. 
78 | 79 | Args: 80 | message: The info message 81 | context: Additional context information (optional) 82 | """ 83 | caller_filename = get_caller_filename() 84 | info_details = f"[{caller_filename}] {message}" 85 | if context: 86 | info_details += f" | Context: {context}" 87 | 88 | logger.info(info_details) -------------------------------------------------------------------------------- /tests/test_citation_registry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from shandu.agents.utils.citation_registry import CitationRegistry 3 | 4 | class TestCitationRegistry(unittest.TestCase): 5 | """Basic tests for the CitationRegistry class.""" 6 | 7 | def test_citation_registration(self): 8 | """Test that citations can be registered and retrieved correctly.""" 9 | registry = CitationRegistry() 10 | 11 | # Register a few citations 12 | cid1 = registry.register_citation("https://example.com/article1") 13 | cid2 = registry.register_citation("https://example.com/article2") 14 | cid3 = registry.register_citation("https://example.com/article3") 15 | 16 | # Test citation IDs are sequential 17 | self.assertEqual(cid1, 1) 18 | self.assertEqual(cid2, 2) 19 | self.assertEqual(cid3, 3) 20 | 21 | # Test URL to ID mapping works 22 | self.assertEqual(registry.url_to_id["https://example.com/article1"], 1) 23 | self.assertEqual(registry.url_to_id["https://example.com/article2"], 2) 24 | 25 | # Test ID to URL mapping works 26 | self.assertEqual(registry.id_to_url[1], "https://example.com/article1") 27 | self.assertEqual(registry.id_to_url[2], "https://example.com/article2") 28 | 29 | # Test getting citation info 30 | self.assertEqual(registry.get_citation_info(1)["url"], "https://example.com/article1") 31 | self.assertEqual(registry.get_citation_info(2)["url"], "https://example.com/article2") 32 | 33 | def test_bulk_registration(self): 34 | """Test bulk registration of citations.""" 35 | registry = CitationRegistry() 36 | 37 | urls = [ 38 | "https://example.com/article1", 39 | "https://example.com/article2", 40 | "https://example.com/article3" 41 | ] 42 | 43 | registry.bulk_register_sources(urls) 44 | 45 | # Check all URLs were registered 46 | self.assertEqual(len(registry.citations), 3) 47 | 48 | # Check URL to ID mappings 49 | self.assertIn("https://example.com/article1", registry.url_to_id) 50 | self.assertIn("https://example.com/article2", registry.url_to_id) 51 | self.assertIn("https://example.com/article3", registry.url_to_id) 52 | 53 | def test_citation_validation(self): 54 | """Test citation validation in text.""" 55 | registry = CitationRegistry() 56 | 57 | # Register a few citations 58 | registry.register_citation("https://example.com/article1") 59 | registry.register_citation("https://example.com/article2") 60 | 61 | # Text with valid and invalid citations 62 | text = """ 63 | This is a test with valid citation [1] and another valid citation [2]. 64 | This is an invalid citation [3] that doesn't exist. 65 | Here's another mention of [1] and an out-of-range [5]. 
66 | """ 67 | 68 | result = registry.validate_citations(text) 69 | 70 | # Check validation results 71 | self.assertFalse(result["valid"]) 72 | self.assertIn(3, result["invalid_citations"]) 73 | self.assertIn(5, result["invalid_citations"]) 74 | self.assertEqual(len(result["used_citations"]), 2) 75 | self.assertIn(1, result["used_citations"]) 76 | self.assertIn(2, result["used_citations"]) 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /tests/test_report_generator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import AsyncMock, patch 3 | import asyncio 4 | from shandu.agents.processors.report_generator import format_citations 5 | from shandu.agents.utils.citation_registry import CitationRegistry 6 | 7 | class TestReportGenerator(unittest.TestCase): 8 | """Basic tests for report generation functions.""" 9 | 10 | def setUp(self): 11 | """Set up test cases.""" 12 | self.mock_llm = AsyncMock() 13 | self.mock_llm.ainvoke = AsyncMock() 14 | 15 | # Sample citation data 16 | self.sample_sources = [ 17 | {"url": "https://example.com/article1", "title": "Test Article 1", "date": "2023-01-01"}, 18 | {"url": "https://github.com/user/repo", "title": "Sample Repository", "date": "2024-02-15"} 19 | ] 20 | 21 | # Create a citation registry 22 | self.registry = CitationRegistry() 23 | self.registry.register_citation("https://example.com/article1") 24 | self.registry.register_citation("https://github.com/user/repo") 25 | 26 | # Add metadata to the citations 27 | self.registry.update_citation_metadata(1, { 28 | "title": "Test Article 1", 29 | "date": "2023-01-01" 30 | }) 31 | self.registry.update_citation_metadata(2, { 32 | "title": "Sample Repository", 33 | "date": "2024-02-15" 34 | }) 35 | 36 | def test_format_citations_sync(self): 37 | """Test format_citations function synchronously by running the async function.""" 38 | # Set up the mock to return properly formatted citations 39 | self.mock_llm.ainvoke.return_value.content = """ 40 | [1] *example.com*, "Test Article 1", https://example.com/article1 41 | [2] *github.com*, "Sample Repository", https://github.com/user/repo 42 | """ 43 | 44 | # Run the async function in a synchronous context 45 | formatted_citations = asyncio.run(format_citations( 46 | self.mock_llm, 47 | ["https://example.com/article1", "https://github.com/user/repo"], 48 | self.sample_sources, 49 | self.registry 50 | )) 51 | 52 | # Check the results 53 | self.assertIn("*example.com*", formatted_citations) 54 | self.assertIn("\"Test Article 1\"", formatted_citations) 55 | self.assertIn("https://example.com/article1", formatted_citations) 56 | 57 | # Verify the correct format (no date in citations) 58 | self.assertNotIn("2023-01-01", formatted_citations) 59 | self.assertNotIn("2024-02-15", formatted_citations) 60 | 61 | # Ensure citation numbers are properly formatted 62 | self.assertIn("[1]", formatted_citations) 63 | self.assertIn("[2]", formatted_citations) 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | --------------------------------------------------------------------------------