├── src └── deep_research │ ├── core │ ├── exception.py │ ├── __init__.py │ ├── config.py │ ├── research.py │ └── report.py │ ├── cli │ ├── __init__.py │ └── main.py │ ├── utils │ ├── __init__.py │ ├── log_util.py │ └── research_helper.py │ └── services │ ├── __init__.py │ ├── search_template.py │ ├── search_service.py │ ├── persistence_service.py │ └── ai_service.py ├── output └── readme.md ├── demo └── China's Credit Card Consumer Market: Comprehensive Analysis Report (2015-2030).pdf ├── setup.py ├── requirements.txt ├── .gitignore ├── LICENSE ├── .env.example └── README.md /src/deep_research/core/exception.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /output/readme.md: -------------------------------------------------------------------------------- 1 | # All data will be generated to this output folder. -------------------------------------------------------------------------------- /src/deep_research/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """Deep Research CLI - A tool for conducting deep research""" 2 | 3 | __version__ = "0.1.0" -------------------------------------------------------------------------------- /src/deep_research/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Deep Research CLI - A tool for conducting deep research""" 2 | 3 | __version__ = "0.1.0" -------------------------------------------------------------------------------- /src/deep_research/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Deep Research CLI - A tool for conducting deep research""" 2 | 3 | __version__ = "0.1.0" -------------------------------------------------------------------------------- /src/deep_research/services/__init__.py: -------------------------------------------------------------------------------- 1 | """Deep Research CLI - A tool for conducting deep research""" 2 | 3 | __version__ = "0.1.0" -------------------------------------------------------------------------------- /demo/China's Credit Card Consumer Market: Comprehensive Analysis Report (2015-2030).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/iamshaynez/deep-research-cli/HEAD/demo/China's Credit Card Consumer Market: Comprehensive Analysis Report (2015-2030).pdf -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="deep-research-cli", 5 | version="0.1.0", 6 | packages=find_packages(where="src"), 7 | package_dir={"":"src"}, 8 | install_requires=[ 9 | # Add your dependencies here 10 | ], 11 | ) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | click>=8.0.0 3 | rich>=10.0.0 4 | python-dotenv>=1.0.0 5 | 6 | # AI and LLM Integration 7 | openai>=1.0.0 8 | tavily-python>=0.2.0 9 | 10 | # Utilities 11 | requests>=2.31.0 12 | pydantic>=2.0.0 13 | python-dateutil>=2.8.2 14 | tqdm>=4.66.0 15 | json_repair>=0.35.0 16 | 17 | # Logging and Error Handling 18 | loguru>=0.7.0 19 | 20 | 
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | output/
2 | 
3 | src/deep_research/cli/exec.py
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 | 
9 | # Distribution / packaging
10 | dist/
11 | build/
12 | *.egg-info/
13 | 
14 | # Virtual environments
15 | venv/
16 | env/
17 | .env/
18 | .venv/
19 | 
20 | # IDE specific files
21 | .idea/
22 | .vscode/
23 | *.swp
24 | *.swo
25 | 
26 | # Testing
27 | .coverage
28 | htmlcov/
29 | .pytest_cache/
30 | .tox/
31 | 
32 | # Jupyter Notebook
33 | .ipynb_checkpoints
34 | 
35 | # Environment variables
36 | .env
37 | .env.local
38 | 
39 | # macOS specific
40 | .DS_Store
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Xiaowen.Z
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /.env.example: --------------------------------------------------------------------------------
1 | # Mandatory Variables
2 | # 1. LLM model settings. OpenRouter is recommended since it integrates multiple model providers.
3 | OPENAI_KEY=
4 | OPENAI_BASE=https://openrouter.ai/api/v1
5 | 
6 | # 2. Tavily API Token
7 | TAVILY_API_KEY=
8 | 
9 | # Optional Variables
10 | 
11 | # a. LLM model choice; refer to config.py for the default settings.
12 | # SMART_MODEL = "deepseek/deepseek-r1"
13 | # NORMAL_MODEL = "deepseek/deepseek-r1-distill-llama-70b"
14 | # LONG_MODEL = "google/gemini-2.0-flash-001"
15 | # REPORT_MODEL = "google/gemini-2.0-pro-exp-02-05:free"
16 | 
17 | # b. Log level in the console, default INFO
18 | # DEEP_RESEARCH_LOG_LEVEL=INFO
19 | 
20 | # c. Language used to create reports. Default = Chinese. Injected into prompts, so use plain English words.
21 | # REPORT_LANG=Chinese
22 | 
23 | # d. RESEARCH_PLAN_PROMPT, an additional prompt injected into the research plan creation process
24 | # RESEARCH_PLAN_PROMPT=
25 | 
26 | # e. REPORT_PROMPT, an additional prompt injected into the report creation process to influence the report content
27 | # REPORT_PROMPT=
28 | 
29 | # f. 
NUMBER_OF_QUERIES_IN_CATEGORY, number of queries in each category, default 3 30 | # NUMBER_OF_QUERIES_IN_CATEGORY= -------------------------------------------------------------------------------- /src/deep_research/core/config.py: -------------------------------------------------------------------------------- 1 | """Configuration module for deep-research-cli""" 2 | 3 | import os 4 | from typing import Optional 5 | from dotenv import load_dotenv 6 | 7 | # Load environment variables 8 | load_dotenv() 9 | 10 | class Config: 11 | """Configuration class for model settings""" 12 | SMART_MODEL = os.getenv('SMART_MODEL', "deepseek/deepseek-r1") 13 | NORMAL_MODEL = os.getenv('NORMAL_MODEL', "deepseek/deepseek-r1-distill-llama-70b") 14 | LONG_MODEL = os.getenv('LONG_MODEL', "google/gemini-2.0-flash-001") 15 | REPORT_MODEL = os.getenv('REPORT_MODEL', "google/gemini-2.0-pro-exp-02-05:free") 16 | 17 | REPORT_LANG = os.getenv('REPORT_LANG', "Chinese") 18 | 19 | class LLMConfig: 20 | """Configuration class for LLM models""" 21 | def __init__(self): 22 | self.api_key = os.getenv('OPENAI_KEY') 23 | self.api_base = os.getenv('OPENAI_BASE') 24 | self.smart_model = os.getenv('SMART_MODEL', Config.SMART_MODEL) 25 | self.normal_model = os.getenv('NORMAL_MODEL', Config.NORMAL_MODEL) 26 | self.long_model = os.getenv('LONG_MODEL', Config.LONG_MODEL) 27 | 28 | if not self.api_key: 29 | raise ValueError('OPENAI_KEY environment variable is not set') 30 | 31 | class SearchConfig: 32 | """Configuration class for Tavily Search API""" 33 | def __init__(self): 34 | self.api_key = os.getenv('TAVILY_API_KEY') 35 | if not self.api_key: 36 | raise ValueError('TAVILY_API_KEY environment variable is not set') -------------------------------------------------------------------------------- /src/deep_research/services/search_template.py: -------------------------------------------------------------------------------- 1 | """Module for managing search parameter templates""" 2 | 3 | from typing import Dict, Any 4 | 5 | class SearchTemplate: 6 | """Class for managing search parameter templates with predefined configurations""" 7 | 8 | # Predefined search templates 9 | TEMPLATES = { 10 | "basic": { 11 | "search_depth": "basic", 12 | "include_raw_content": True, 13 | "max_results": 5 14 | }, 15 | "advanced": { 16 | "search_depth": "advanced", 17 | "max_results": 10, 18 | "include_domains": [], 19 | "exclude_domains": [], 20 | "include_raw_content": True, 21 | }, 22 | "news": { 23 | "search_depth": "advanced", 24 | "max_results": 8, 25 | "include_raw_content": True, 26 | "topic": "news" 27 | }, 28 | "academic": { 29 | "search_depth": "advanced", 30 | "max_results": 15, 31 | "include_raw_content": True, 32 | "include_domains": [".edu", ".org", "scholar.google.com"], 33 | "topic": "general" 34 | } 35 | } 36 | 37 | def load_template(self, name: str) -> Dict[str, Any]: 38 | """Load a predefined search parameter template 39 | 40 | Args: 41 | name: Name of the template to load 42 | 43 | Returns: 44 | Dictionary containing the template parameters or error message 45 | """ 46 | if name in self.TEMPLATES: 47 | return {"success": True, "data": self.TEMPLATES[name].copy()} 48 | return {"success": False, "error": f"Template not found: {name}"} 49 | 50 | def apply_template(self, name: str, **override_params: Any) -> Dict[str, Any]: 51 | """Load a template and optionally override specific parameters 52 | 53 | Args: 54 | name: Name of the template to load 55 | **override_params: Optional parameters to override in the template 56 | 57 | Returns: 58 | 
Dictionary containing the final parameters or error message
59 |         """
60 |         result = self.load_template(name)
61 |         if result["success"]:
62 |             params = result["data"]
63 |             params.update(override_params)
64 |             return {"success": True, "data": params}
65 |         return result
-------------------------------------------------------------------------------- /src/deep_research/cli/main.py: --------------------------------------------------------------------------------
1 | import argparse
2 | from deep_research.core.research import Research
3 | from deep_research.core.report import Report
4 | from deep_research.utils.log_util import LogUtil
5 | from dotenv import load_dotenv
6 | import traceback
7 | 
8 | # Load environment variables
9 | load_dotenv()
10 | 
11 | def main():
12 |     logger = LogUtil().logger
13 |     logger.info("Starting Deep Research CLI")
14 |     parser = argparse.ArgumentParser(
15 |         description="Deep Research CLI - A tool for conducting deep research",
16 |         formatter_class=argparse.RawDescriptionHelpFormatter,
17 |         epilog=\
18 | """
19 | Examples:
20 |   # Create a new research project from a topic
21 |   python -m deep_research.cli.main --topic "Impact of AI on healthcare in 2024"
22 | 
23 |   # Load an existing research project and regenerate its report
24 |   python -m deep_research.cli.main --research-id RS_20240214_123456
25 | """
26 |     )
27 | 
28 |     group = parser.add_mutually_exclusive_group(required=True)
29 |     group.add_argument('--topic', '-t', help='Research topic in any language')
30 |     group.add_argument('--research-id', '-r', help='ID of existing research to load')
31 |     parser.add_argument('--report-method', '-p', default='generate_research_report_detailed',
32 |                         choices=['generate_research_report', 'generate_research_report_detailed', 'generate_wechat_article'],
33 |                         help='Report generation method to use (default: generate_research_report_detailed)')
34 | 
35 |     args = parser.parse_args()
36 | 
37 |     try:
38 |         if args.topic:
39 |             logger.info(f"Creating new research with topic: {args.topic}")
40 |             research = Research(topic=args.topic)
41 |             logger.info(f"Created new research with ID: {research.id}")
42 |             logger.info("Starting research execution")
43 |             research.execute()
44 |             logger.info(f"Completed research execution.
All data saved to output directory [/output/{research.id}]") 45 | else: 46 | logger.info(f"Loading existing research with ID: {args.research_id}") 47 | research = Research(research_id=args.research_id) 48 | logger.info(f"Loaded existing research with ID: {research.id}") 49 | 50 | # Create report instance and generate report 51 | logger.info(f"Generating report using method: {args.report_method}") 52 | report = Report(research.id) 53 | report_method = getattr(report, args.report_method) 54 | report_content = report_method() 55 | logger.info(f"Successfully generated report using {args.report_method}") 56 | 57 | except Exception as e: 58 | logger.error(f"Error during research execution: {str(e)}") 59 | logger.error("Full error stack trace:") 60 | logger.error(traceback.format_exc()) 61 | exit(1) 62 | 63 | if __name__ == "__main__": 64 | main() -------------------------------------------------------------------------------- /src/deep_research/utils/log_util.py: -------------------------------------------------------------------------------- 1 | """Module for handling logging operations""" 2 | 3 | import logging 4 | import os 5 | from datetime import datetime 6 | from typing import Optional, Union 7 | from rich.logging import RichHandler 8 | from rich.console import Console 9 | from rich.theme import Theme 10 | 11 | # Environment variable name for log level 12 | LOG_LEVEL_ENV = "DEEP_RESEARCH_LOG_LEVEL" 13 | 14 | class LogUtil: 15 | """Class for managing logging operations""" 16 | 17 | _instance = None 18 | _initialized = False 19 | _console = None 20 | 21 | def __new__(cls): 22 | """Singleton pattern implementation""" 23 | if cls._instance is None: 24 | cls._instance = super(LogUtil, cls).__new__(cls) 25 | return cls._instance 26 | 27 | def __init__(self): 28 | """Initialize logging configuration""" 29 | if not LogUtil._initialized: 30 | # Initialize Rich console with custom theme 31 | self._console = Console(theme=Theme({ 32 | "info": "cyan", 33 | "warning": "yellow", 34 | "error": "red", 35 | "critical": "red bold", 36 | "debug": "dim cyan" 37 | })) 38 | 39 | # Get log level from environment variable or default to INFO 40 | log_level_str = os.getenv(LOG_LEVEL_ENV, 'INFO').upper() 41 | try: 42 | log_level = getattr(logging, log_level_str) 43 | except AttributeError: 44 | log_level = logging.INFO 45 | print(f"Invalid log level {log_level_str}, defaulting to INFO") 46 | 47 | self.logger = logging.getLogger('deep_research') 48 | self.logger.setLevel(log_level) 49 | LogUtil._initialized = True 50 | self._setup_logging() 51 | 52 | def _setup_logging(self): 53 | """Setup logging configuration with Rich handler""" 54 | # Create Rich handler with custom format 55 | rich_handler = RichHandler( 56 | console=self._console, 57 | rich_tracebacks=True, 58 | tracebacks_show_locals=True, 59 | show_time=True, 60 | show_path=True, 61 | enable_link_path=True 62 | ) 63 | rich_handler.setLevel(logging.DEBUG) 64 | 65 | # Set format to include minimal timestamp since Rich handler adds its own 66 | rich_handler.setFormatter(logging.Formatter('%(message)s', datefmt='[%X]')) 67 | 68 | # Remove any existing handlers and add Rich handler 69 | self.logger.handlers = [] 70 | self.logger.addHandler(rich_handler) 71 | 72 | def debug(self, message: str, *args, **kwargs): 73 | """Log debug message with Rich formatting 74 | 75 | Args: 76 | message: The message to log 77 | *args: Variable length argument list 78 | **kwargs: Arbitrary keyword arguments 79 | """ 80 | self.logger.debug(message, *args, **kwargs) 81 | 82 | def 
info(self, message: str, *args, **kwargs): 83 | """Log info message with Rich formatting 84 | 85 | Args: 86 | message: The message to log 87 | *args: Variable length argument list 88 | **kwargs: Arbitrary keyword arguments 89 | """ 90 | self.logger.info(message, *args, **kwargs) 91 | 92 | def warning(self, message: str, *args, **kwargs): 93 | """Log warning message with Rich formatting 94 | 95 | Args: 96 | message: The message to log 97 | *args: Variable length argument list 98 | **kwargs: Arbitrary keyword arguments 99 | """ 100 | self.logger.warning(message, *args, **kwargs) 101 | 102 | def error(self, message: str, *args, **kwargs): 103 | """Log error message with Rich formatting 104 | 105 | Args: 106 | message: The message to log 107 | *args: Variable length argument list 108 | **kwargs: Arbitrary keyword arguments 109 | """ 110 | self.logger.error(message, *args, **kwargs) 111 | 112 | def critical(self, message: str, *args, **kwargs): 113 | """Log critical message with Rich formatting 114 | 115 | Args: 116 | message: The message to log 117 | *args: Variable length argument list 118 | **kwargs: Arbitrary keyword arguments 119 | """ 120 | self.logger.critical(message, *args, **kwargs) 121 | 122 | @staticmethod 123 | def get_logger() -> 'LogUtil': 124 | """Get singleton instance of LogUtil 125 | 126 | Returns: 127 | LogUtil instance 128 | """ 129 | return LogUtil() 130 | 131 | if __name__ == "__main__": 132 | # Create logger instance 133 | logger = LogUtil.get_logger() 134 | 135 | # Example usage of different log levels 136 | logger.debug("This is a debug message") 137 | logger.info("This is an info message") 138 | logger.warning("This is a warning message") 139 | logger.error("This is an error message") 140 | logger.critical("This is a critical message") -------------------------------------------------------------------------------- /src/deep_research/services/search_service.py: -------------------------------------------------------------------------------- 1 | """Utility module for Tavily search API interactions 2 | 3 | This module provides a client interface for interacting with the Tavily Search API, 4 | with support for templated searches and configurable parameters. 5 | """ 6 | 7 | from typing import Optional, Dict, Any, List 8 | from deep_research.core.config import SearchConfig 9 | from deep_research.services.search_template import SearchTemplate 10 | from deep_research.utils.log_util import LogUtil 11 | 12 | class SearchClient: 13 | """Client for interacting with Tavily Search API 14 | 15 | This class provides methods to perform searches using the Tavily API, 16 | with support for template-based parameter configurations and error handling. 
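    A minimal usage sketch (illustrative, not part of the original source;
    it assumes TAVILY_API_KEY is configured and uses the predefined "news"
    template from SearchTemplate.TEMPLATES):

        client = SearchClient()
        results = client.search_with_template(
            query="latest developments in solid-state batteries",
            template_name="news",
            max_results=3,  # override the template's default of 8
        )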
17 | """ 18 | def __init__(self, config: Optional[SearchConfig] = None, template_dir: Optional[str] = None): 19 | """Initialize the SearchClient with configuration and templates 20 | 21 | Args: 22 | config: Optional SearchConfig instance for API configuration 23 | template_dir: Optional directory path for custom templates 24 | 25 | Raises: 26 | ImportError: If tavily-python package is not installed 27 | ValueError: If TAVILY_API_KEY is not set 28 | """ 29 | # Initialize logger 30 | self.logger = LogUtil.get_logger() 31 | self.logger.debug("Initializing SearchClient") 32 | 33 | # Set up configuration and template 34 | self.config = config or SearchConfig() 35 | self.template = SearchTemplate() 36 | 37 | try: 38 | from tavily import TavilyClient 39 | self.client = TavilyClient(api_key=self.config.api_key) 40 | self.logger.info("Successfully initialized Tavily client") 41 | except ImportError: 42 | self.logger.error("Failed to import tavily package") 43 | raise ImportError( 44 | 'Tavily package is not installed. ' 45 | 'Please install it with: pip install tavily-python' 46 | ) 47 | 48 | def search_with_template( 49 | self, 50 | query: str, 51 | template_name: str, 52 | **override_params: Any 53 | ) -> Dict[str, Any]: 54 | """Perform a search using a parameter template 55 | 56 | Args: 57 | query: The search query string 58 | template_name: Name of the template to use 59 | **override_params: Parameters to override from the template 60 | 61 | Returns: 62 | The search results as a dictionary 63 | """ 64 | self.logger.debug(f"Applying template '{template_name}' for query: {query}") 65 | 66 | # Apply template and get parameters 67 | template_result = self.template.apply_template(template_name, **override_params) 68 | if not template_result["success"]: 69 | error_msg = f"Template error: {template_result.get('error', 'Unknown error')}" 70 | self.logger.error(error_msg) 71 | return {"error": error_msg} 72 | 73 | # Prepare search parameters 74 | params = template_result["data"] 75 | params["query"] = query # Ensure query is included in params 76 | self.logger.debug(f"Search parameters prepared: {params}") 77 | 78 | return self.search(**params) 79 | 80 | def search( 81 | self, 82 | query: str, 83 | search_depth: str = "basic", 84 | max_results: int = 5, 85 | **kwargs: Any 86 | ) -> Dict[str, Any]: 87 | """Perform a search using the Tavily API 88 | 89 | Args: 90 | query: The search query string 91 | search_depth: The depth of search ('basic' or 'advanced') 92 | max_results: Maximum number of results to return 93 | **kwargs: Additional parameters to pass to the API 94 | 95 | Returns: 96 | The search results as a dictionary 97 | """ 98 | self.logger.info(f"Executing search query: {query} with depth: {search_depth}") 99 | 100 | try: 101 | # Execute search request 102 | response = self.client.search( 103 | query=query, 104 | search_depth=search_depth, 105 | max_results=max_results, 106 | **kwargs 107 | ) 108 | self.logger.debug(f"Search completed successfully with {len(response.get('results', []))} results") 109 | return response 110 | except Exception as e: 111 | error_msg = str(e) 112 | self.logger.error(f"Search failed: {error_msg}") 113 | return {"error": error_msg} 114 | 115 | 116 | if __name__ == '__main__': 117 | # Example usage of SearchClient 118 | logger = LogUtil.get_logger() 119 | try: 120 | logger.info("Starting example search") 121 | client = SearchClient() 122 | response = client.search_with_template( 123 | query="What is DeepSeek R1", 124 | template_name="advanced" 125 | ) 126 | 127 | 
print(response)
128 |         logger.info("Example search completed")
129 | 
130 |     except Exception as e:
131 |         logger.error(f"Example failed: {str(e)}")
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # Deep Research CLI
2 | 
3 | A command-line Python tool for conducting comprehensive research on any topic using AI and advanced search capabilities. This tool automates the research process by generating structured research plans, executing targeted searches, and producing detailed reports in multiple formats, similar to 'ChatGPT Deep Research'.
4 | 
5 | > !!!!! THIS PROJECT IS AT A VERY EARLY STAGE OF DEVELOPMENT. NOTHING IS GUARANTEED TO WORK. :)
6 | 
7 | ## Features
8 | 
9 | ![](https://cdn.sa.net/2025/02/09/3UPtxEc6eDK4RvA.png)
10 | 
11 | - **Multi-language Support**: Research topics can be input in any language
12 | - **Automated Research Planning**: Generates comprehensive research plans with categorized queries
13 | - **Advanced Search Integration**: Utilizes the Tavily Search API for high-quality search results
14 | - **Multiple Report Formats**:
15 |   - Detailed Research Reports
16 |   - Concise Research Summaries
17 |   - WeChat Article Format
18 | - **Flexible Model Selection**: Supports various AI models for report generation
19 | - **Structured Output**: Organizes research results and reports in a clear directory structure
20 | 
21 | ## Installation
22 | 
23 | 1. Clone the repository:
24 | ```bash
25 | git clone https://github.com/yourusername/deep-research-cli.git
26 | cd deep-research-cli
27 | ```
28 | 
29 | 2. Install dependencies:
30 | ```bash
31 | pip install -r requirements.txt
32 | ```
33 | 
34 | 3. Install the package in editable mode:
35 | ```bash
36 | pip install -e .
37 | ```
38 | 
39 | 4. Configure environment variables:
40 | ```bash
41 | cp .env.example .env
42 | ```
43 | Edit the `.env` file and add your API keys:
44 | ```
45 | OPENAI_KEY=your_openai_api_key
46 | OPENAI_BASE=your_openai_api_base_url
47 | TAVILY_API_KEY=your_tavily_api_key
48 | ```
49 | 
50 | ## Usage
51 | 
52 | ### Starting New Research
53 | 
54 | ```bash
55 | # Create a new research project
56 | python -m deep_research.cli.main --topic "Impact of AI on healthcare in 2024"
57 | ```
58 | 
59 | ### Generating Reports from Existing Research
60 | 
61 | ```bash
62 | # Generate a detailed report from existing research
63 | python -m deep_research.cli.main --research-id RS_20240214_123456 --report-method generate_research_report_detailed
64 | 
65 | # Generate a WeChat article
66 | python -m deep_research.cli.main --research-id RS_20240214_123456 --report-method generate_wechat_article
67 | ```
68 | 
69 | ### Command Line Arguments
70 | 
71 | - `--topic, -t`: Research topic in any language
72 | - `--research-id, -r`: ID of existing research to load
73 | - `--report-method, -p`: Report generation method to use
74 |   (default: `generate_research_report_detailed`)
75 |   - `generate_research_report`: Basic research report
76 |   - `generate_research_report_detailed`: Detailed research report
77 |   - `generate_wechat_article`: WeChat article format
78 | 
79 | ## Research Process
80 | 
81 | 1. **Topic Analysis**: Translates and analyzes the research topic
82 | 2. **Plan Generation**: Creates a structured research plan with categories and queries
83 | 3. **Data Collection**: Executes searches for each query and category
84 | 4. **Report Generation**: Processes collected data into comprehensive reports
85 | 5. 
**Reference Management**: Generates reference links for all sources
86 | 
87 | ## Output Structure
88 | 
89 | All progress data is saved to the `output` folder, including search queries, search results, and generated reports.
90 | 
91 | ```
92 | output/
93 | └── RS_[DATE]_[TIME]/
94 |     ├── RS_[DATE]_[TIME]_meta.json          # Research metadata
95 |     ├── [CATEGORY]/                         # Category-specific results
96 |     │   └── [QUERY].json                    # Search results
97 |     ├── [CATEGORY]_report.json              # Category reports
98 |     ├── RS_[DATE]_[TIME]_reference.md       # Reference links
99 |     └── RS_[DATE]_[TIME]_[MODEL]_[TYPE].md  # Generated reports
100 | ```
101 | 
102 | ## Configuration (.env.example)
103 | 
104 | ```
105 | # Mandatory Variables
106 | # 1. LLM model settings. OpenRouter is recommended since it integrates multiple model providers.
107 | OPENAI_KEY=
108 | OPENAI_BASE=https://openrouter.ai/api/v1
109 | 
110 | # 2. Tavily API Token
111 | TAVILY_API_KEY=
112 | 
113 | # Optional Variables
114 | 
115 | # a. LLM model choice; refer to config.py for the default settings.
116 | # SMART_MODEL = "deepseek/deepseek-r1"
117 | # NORMAL_MODEL = "deepseek/deepseek-r1-distill-llama-70b"
118 | # LONG_MODEL = "google/gemini-2.0-flash-001"
119 | # REPORT_MODEL = "google/gemini-2.0-pro-exp-02-05:free"
120 | 
121 | # b. Log level in the console, default INFO
122 | # DEEP_RESEARCH_LOG_LEVEL=INFO
123 | 
124 | # c. Language used to create reports. Default = Chinese. Injected into prompts, so use plain English words.
125 | # REPORT_LANG=Chinese
126 | 
127 | # d. RESEARCH_PLAN_PROMPT, an additional prompt injected into the research plan creation process
128 | # RESEARCH_PLAN_PROMPT=
129 | 
130 | # e. REPORT_PROMPT, an additional prompt injected into the report creation process to influence the report content
131 | # REPORT_PROMPT=
132 | 
133 | # f. NUMBER_OF_QUERIES_IN_CATEGORY, number of queries in each category, default 3
134 | # NUMBER_OF_QUERIES_IN_CATEGORY=3
135 | 
136 | ```
137 | 
138 | ## Research Cost Estimation
139 | 
140 | > !!!!!! A LOT OF ROOM FOR OPTIMIZATION
141 | 
142 | - LLM cost: ~$0.1 / research (with the default models; you can always switch to free models)
143 | - Tavily Search: 100 credits / research = $0.8 (Tavily provides 1,000 free credits per month)
144 | 
145 | ## License
146 | 
147 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
-------------------------------------------------------------------------------- /src/deep_research/services/persistence_service.py: --------------------------------------------------------------------------------
1 | """Utility module for JSON data persistence operations
2 | 
3 | This module provides a client interface for handling JSON data persistence operations,
4 | including saving and loading JSON files, managing research metadata, and handling
5 | search results. It implements proper error handling and directory management.
6 | 
7 | Typical usage example:
8 |     client = PersistenceClient()
9 |     client.save_json({"key": "value"}, "data.json")
10 |     data = client.load_json("data.json")
11 | """
12 | 
13 | import json
14 | import os
15 | from typing import Any, Dict, Optional, List
16 | from ..utils.log_util import LogUtil
17 | 
18 | class PersistenceClient:
19 |     """Client for handling JSON data persistence operations
20 | 
21 |     This class provides methods to save and load JSON data, manage research metadata,
22 |     and handle search results. It includes proper error handling, directory creation,
23 |     and path sanitization.
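    A short sketch of the error-handling contract (illustrative; the file
    path is hypothetical):

        client = PersistenceClient()
        result = client.save_json({"key": "value"}, "demo/data.json")
        if not result["success"]:
            print(result["error"])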
24 | 25 | Attributes: 26 | base_dir: The base directory for all file operations 27 | logger: Logger instance for tracking operations 28 | """ 29 | 30 | def __init__(self, base_dir: Optional[str] = None): 31 | """Initialize the persistence client 32 | 33 | Args: 34 | base_dir: Optional base directory for storing files. If not provided, 35 | uses the current working directory 36 | """ 37 | self.base_dir = base_dir or os.getcwd() 38 | self.logger = LogUtil.get_logger() # Initialize logger 39 | self.logger.info(f"Initialized PersistenceClient with base directory: {self.base_dir}") 40 | 41 | def save_json(self, data: Any, file_path: str) -> Dict[str, Any]: 42 | """Save data to a JSON file 43 | 44 | This method ensures the target directory exists before writing and handles 45 | any potential errors during the save operation. 46 | 47 | Args: 48 | data: The data to save (must be JSON serializable) 49 | file_path: Path to the file where data will be saved 50 | 51 | Returns: 52 | A dictionary indicating success or error 53 | """ 54 | full_path = os.path.join(self.base_dir, file_path) 55 | self.logger.debug(f"Attempting to save JSON data to: {full_path}") 56 | 57 | try: 58 | # Ensure the directory exists 59 | os.makedirs(os.path.dirname(full_path), exist_ok=True) 60 | self.logger.debug(f"Ensured directory exists: {os.path.dirname(full_path)}") 61 | 62 | # Write data to file 63 | with open(full_path, 'w', encoding='utf-8') as f: 64 | json.dump(data, f, ensure_ascii=False, indent=2) 65 | 66 | self.logger.info(f"Successfully saved JSON data to: {file_path}") 67 | return {"success": True, "message": f"Data successfully saved to {file_path}"} 68 | except Exception as e: 69 | self.logger.error(f"Failed to save JSON data to {file_path}: {str(e)}") 70 | return {"success": False, "error": str(e)} 71 | 72 | 73 | def load_json(self, file_path: str) -> Dict[str, Any]: 74 | """Load data from a JSON file 75 | 76 | This method attempts to read and parse a JSON file, handling any potential 77 | file access or JSON parsing errors. 78 | 79 | Args: 80 | file_path: Path to the file to load data from 81 | 82 | Returns: 83 | A dictionary containing the loaded data 84 | 85 | Raises: 86 | ValueError: If the file cannot be read or parsed 87 | """ 88 | full_path = os.path.join(self.base_dir, file_path) 89 | self.logger.debug(f"Attempting to load JSON data from: {full_path}") 90 | 91 | try: 92 | with open(full_path, 'r', encoding='utf-8') as f: 93 | data = json.load(f) 94 | self.logger.info(f"Successfully loaded JSON data from: {file_path}") 95 | return data 96 | except FileNotFoundError: 97 | self.logger.error(f"File not found: {file_path}") 98 | raise ValueError(f"File not found: {file_path}") 99 | except json.JSONDecodeError as e: 100 | self.logger.error(f"Invalid JSON format in {file_path}: {str(e)}") 101 | raise ValueError(f"Invalid JSON format in {file_path}: {str(e)}") 102 | except Exception as e: 103 | self.logger.error(f"Failed to load JSON data from {file_path}: {str(e)}") 104 | raise ValueError(f"Failed to load JSON data from {file_path}: {str(e)}") 105 | 106 | 107 | 108 | def save_file(self, file_path: str, content: str) -> Dict[str, Any]: 109 | """Save content to a file 110 | 111 | This method ensures the target directory exists before writing and handles 112 | any potential errors during the save operation. It's useful for saving 113 | non-JSON content like text files or configuration files. 
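        For instance (an illustrative call; the path and content are
        hypothetical):

            client.save_file("output/RS_example/notes.md", "# Notes\n")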
114 | 115 | Args: 116 | file_path: Path to the file where content will be saved 117 | content: The content to save to the file 118 | 119 | Returns: 120 | A dictionary indicating success or error 121 | """ 122 | full_path = os.path.join(self.base_dir, file_path) 123 | self.logger.debug(f"Attempting to save content to: {full_path}") 124 | 125 | try: 126 | # Ensure the directory exists 127 | os.makedirs(os.path.dirname(full_path), exist_ok=True) 128 | self.logger.debug(f"Ensured directory exists: {os.path.dirname(full_path)}") 129 | 130 | # Write content to file 131 | with open(full_path, 'w', encoding='utf-8') as f: 132 | f.write(content) 133 | 134 | self.logger.info(f"Successfully saved content to: {file_path}") 135 | return {"success": True, "message": f"Content successfully saved to {file_path}"} 136 | except Exception as e: 137 | self.logger.error(f"Failed to save content to {file_path}: {str(e)}") 138 | return {"success": False, "error": str(e)} 139 | 140 | if __name__ == '__main__': 141 | # Example usage of PersistenceClient 142 | client = PersistenceClient() 143 | 144 | # Example data 145 | test_data = { 146 | "name": "Test User", 147 | "age": 30, 148 | "interests": ["programming", "reading"] 149 | } 150 | 151 | # Save data 152 | save_result = client.save_json(test_data, "test_data.json") 153 | print("Save result:", save_result) 154 | 155 | # Load data 156 | load_result = client.load_json("test_data.json") 157 | print("\nLoad result:", load_result) -------------------------------------------------------------------------------- /src/deep_research/services/ai_service.py: -------------------------------------------------------------------------------- 1 | """Utility module for OpenAI LLM model interactions 2 | 3 | This module provides a client interface for interacting with OpenAI-compatible LLM models. 4 | It handles different types of completions (normal, smart, long) and supports various response formats. 5 | """ 6 | 7 | import os 8 | import json 9 | import json_repair 10 | from typing import Optional, Dict, Any 11 | from dotenv import load_dotenv 12 | 13 | # Load environment variables 14 | load_dotenv() 15 | 16 | from ..core.config import LLMConfig 17 | from ..utils.log_util import LogUtil 18 | 19 | class LLMClient: 20 | """Client for interacting with OpenAI LLM models 21 | 22 | This class provides methods to interact with different types of LLM models, 23 | handling authentication, API calls, and response processing. 24 | """ 25 | def __init__(self, config: Optional[LLMConfig] = None): 26 | self.config = config or LLMConfig() 27 | self.logger = LogUtil.get_logger() 28 | 29 | try: 30 | import openai 31 | self.client = openai.OpenAI( 32 | api_key=self.config.api_key, 33 | base_url=self.config.api_base 34 | ) 35 | self.logger.info("Successfully initialized OpenAI client") 36 | except ImportError: 37 | self.logger.critical("OpenAI package is not installed") 38 | raise ImportError( 39 | 'OpenAI package is not installed. ' 40 | 'Please install it with: pip install openai' 41 | ) 42 | 43 | def chat_completion( 44 | self, 45 | messages: list[Dict[str, str]], 46 | model: Optional[str] = None, 47 | response_format: str = 'json', 48 | **kwargs: Any 49 | ) -> Dict[str, Any]: 50 | """Send a chat completion request to the OpenAI API 51 | 52 | This method handles the core interaction with the OpenAI API, including 53 | response format handling and token limit management for certain models. 
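        A minimal call sketch (illustrative; the model name is an assumption
        and depends on what your provider exposes):

            client = LLMClient()
            answer = client.chat_completion(
                messages=[{"role": "user", "content": "Summarize RAG in one line."}],
                model="deepseek/deepseek-r1-distill-llama-70b",
                response_format='markdown',
            )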
54 | 
55 |         Args:
56 |             messages: List of message dictionaries with 'role' and 'content'
57 |             model: Optional model override, defaults to normal_model
58 |             response_format: Response format, either 'json' (default) or 'markdown'
59 |             **kwargs: Additional parameters to pass to the API
60 | 
61 |         Returns:
62 |             The response content as a JSON object or markdown string
63 |         """
64 |         # Prepare request parameters
65 |         params = {
66 |             "model": model or self.config.normal_model,
67 |             "messages": messages
68 |         }
69 | 
70 |         self.logger.debug(f"Sending chat completion request with model: {params['model']}")
71 | 
72 |         try:
73 |             response = self.client.chat.completions.create(
74 |                 **params,
75 |                 **kwargs
76 |             )
77 |             self.logger.debug("Successfully received response from API")
78 |             content = response.choices[0].message.content
79 | 
80 |             # Handle markdown format with token-limit continuation (use the resolved model name, since `model` may be None)
81 |             if response_format == 'markdown' and 'google' in params['model'].lower():
82 |                 # Check whether the response was truncated; native_finish_reason is an OpenRouter-specific field
83 |                 if getattr(response.choices[0], 'native_finish_reason', None) == 'MAX_TOKENS':
84 |                     self.logger.info("Response truncated due to token limit, continuing conversation")
85 |                     # Append the partial response to messages and continue the conversation
86 |                     messages.append({"role": "assistant", "content": content})
87 |                     messages.append({"role": "user", "content": "Please continue from where you left off."})
88 |                     # Recursively get the rest of the response
89 |                     continuation = self.chat_completion(
90 |                         messages=messages,
91 |                         model=params['model'],
92 |                         response_format='markdown',
93 |                         **kwargs
94 |                     )
95 |                     # Combine the current content with the continuation
96 |                     return content + continuation
97 |                 return content
98 | 
99 |             # Handle JSON format
100 |             if response_format == 'json':
101 |                 try:
102 |                     repaired_json = json_repair.loads(content)
103 |                     return repaired_json
104 |                 except Exception as e:
105 |                     self.logger.error(f"Failed to parse JSON response: {str(e)}")
106 |                     raise
107 | 
108 |             return content
109 |         except Exception as e:
110 |             self.logger.error(f"Error in chat completion: {str(e)}")
111 |             raise
112 | 
113 |     def smart_completion(
114 |         self,
115 |         messages: list[Dict[str, str]],
116 |         response_format: str = 'json',
117 |         **kwargs: Any
118 |     ) -> Dict[str, Any]:
119 |         """Use the smart model (e.g. GPT-4) for chat completion
120 | 
121 |         This method is optimized for tasks requiring higher intelligence and reasoning.
122 | 
123 |         Args:
124 |             messages: List of message dictionaries
125 |             response_format: Response format, either 'json' (default) or 'markdown'
126 |             **kwargs: Additional parameters to pass to the API
127 | 
128 |         Returns:
129 |             The response content as a JSON object
130 |         """
131 |         self.logger.info(f"Using smart model: {self.config.smart_model}")
132 |         return self.chat_completion(
133 |             messages=messages,
134 |             model=self.config.smart_model,
135 |             response_format=response_format,
136 |             stream=False,
137 |             **kwargs
138 |         )
139 | 
140 |     def long_completion(
141 |         self,
142 |         messages: list[Dict[str, str]],
143 |         response_format: str = 'json',
144 |         **kwargs: Any
145 |     ) -> Dict[str, Any]:
146 |         """Use the long context model for chat completion
147 | 
148 |         This method is designed for handling longer conversations or inputs
149 |         that require more context window.
150 | 
151 |         Args:
152 |             messages: List of message dictionaries
153 |             response_format: Response format, either 'json' (default) or
'markdown'
154 |             **kwargs: Additional parameters to pass to the API
155 | 
156 |         Returns:
157 |             The response content as a JSON object
158 |         """
159 |         self.logger.info(f"Using long context model: {self.config.long_model}")
160 |         return self.chat_completion(
161 |             messages=messages,
162 |             model=self.config.long_model,
163 |             response_format=response_format,
164 |             stream=False,
165 |             **kwargs
166 |         )
167 | 
168 | 
169 | if __name__ == '__main__':
170 |     # Example usage of LLMClient
171 |     try:
172 |         client = LLMClient()
173 | 
174 |         # Test normal completion
175 |         messages = [
176 |             {"role": "user", "content": "What is Python?"}
177 |         ]
178 |         print("\nNormal Completion Test:")
179 |         response = client.chat_completion(messages)
180 |         print(response)
181 | 
182 |     except Exception as e:
183 |         print(f"Error: {str(e)}")
-------------------------------------------------------------------------------- /src/deep_research/core/research.py: --------------------------------------------------------------------------------
1 | """Module for handling research operations
2 | 
3 | This module provides the core functionality for managing research operations, including:
4 | - Research initialization and metadata management
5 | - Topic translation and research plan generation
6 | - Search execution and result management
7 | - Report generation and link compilation
8 | 
9 | The Research class serves as the main entry point for conducting research tasks,
10 | handling both new research topics and loading existing research data.
11 | """
12 | 
13 | from datetime import datetime
14 | from typing import List, Dict, Any
15 | from deep_research.utils.research_helper import ResearchHelper
16 | from deep_research.core.config import Config
17 | from deep_research.utils.log_util import LogUtil
18 | 
19 | class Research:
20 |     """Class for managing research operations
21 | 
22 |     This class orchestrates the entire research process, from initialization to execution.
23 |     It handles topic translation, research plan generation, search execution, and report
24 |     generation. The class can either start a new research project from a topic or load
25 |     an existing research project using its ID.
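    Example (an illustrative sketch; it requires OPENAI_KEY and
    TAVILY_API_KEY to be configured, and the research ID below is
    hypothetical):

        research = Research(topic="Impact of AI on healthcare")
        research.execute()

        # ...or reload a previous run by its ID:
        research = Research(research_id="RS_20240214_123456")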
26 |     """
27 | 
28 |     def __init__(self, topic: str = None, research_id: str = None):
29 |         """Initialize a new research instance
30 | 
31 |         Args:
32 |             topic: The research topic in any language (optional if research_id is provided)
33 |             research_id: Existing research ID to load data from (optional)
34 | 
35 |         Raises:
36 |             ValueError: If neither topic nor research_id is provided, or if both are provided
37 |         """
38 |         self._logger = LogUtil().logger
39 |         self._logger.info(f"Initializing Research instance with topic='{topic}', research_id='{research_id}'")
40 | 
41 |         if topic and research_id:
42 |             error_msg = "Cannot provide both topic and research_id"
43 |             self._logger.error(error_msg)
44 |             raise ValueError(error_msg)
45 |         elif not topic and not research_id:
46 |             error_msg = "Must provide either topic or research_id"
47 |             self._logger.error(error_msg)
48 |             raise ValueError(error_msg)
49 | 
50 |         # Generate new research ID if not provided
51 |         self.research_id = research_id or 'RS_' + datetime.now().strftime("%Y%m%d_%H%M%S")
52 |         self._helper = ResearchHelper(self.research_id)
53 |         self._logger.debug(f"Created research instance with ID: {self.research_id}")
54 | 
55 |         if topic:
56 |             self._init_from_topic(topic)
57 |         else:
58 |             self._init_from_id()
59 | 
60 |     def _init_from_topic(self, topic: str):
61 |         """Initialize research instance with a new topic
62 | 
63 |         Args:
64 |             topic: The research topic in any language
65 |         """
66 |         self._logger.info(f"Initializing new research from topic: {topic}")
67 |         self.topic = topic
68 |         self.english_topic = self._translate_topic()
69 |         self.research_content = None
70 |         self.research_plan = None
71 | 
72 |         self.create_research_detail(self.english_topic)
73 | 
74 |     def _init_from_id(self):
75 |         """Initialize research instance from existing research ID"""
76 |         self._logger.info(f"Loading existing research from ID: {self.research_id}")
77 |         try:
78 |             data = self._helper.load_research_metadata()
79 |             self.topic = data['topic']
80 |             self.english_topic = data['english_topic']
81 |             self.research_content = data['research_content']
82 |             self.research_plan = data['research_plan']
83 |         except Exception as e:
84 |             error_msg = f"Failed to load research data for ID {self.research_id}: {str(e)}"
85 |             self._logger.error(error_msg)
86 |             raise ValueError(error_msg)
87 | 
88 |     def _translate_topic(self) -> str:
89 |         """Translate the research topic to English
90 | 
91 |         Returns:
92 |             The translated topic in English
93 |         """
94 |         self._logger.debug(f"Translating topic to English: {self.topic}")
95 |         return self._helper.translate_to_english(self.topic)
96 | 
97 |     def create_research_detail(self, english_topic):
98 |         """Generate research content and a research plan for the topic
99 | 
100 |         Generates the research content and plan, stores them on the
101 |         instance, and saves the research metadata.
102 |         """
103 |         self._logger.info(f"Generating research details for topic: {english_topic}")
104 |         self.research_content = self._helper.generate_research_content(english_topic)
105 |         json_plan = self._helper.generate_research_plan(self.research_content)
106 |         self.research_plan = json_plan['research_plan']
107 |         self.save()
108 | 
109 |     def save(self) -> None:
110 |         """Save research metadata to a JSON file in the output directory
111 | 
112 |         Creates a new folder with the research ID and saves metadata as JSON.
113 |         The directory will be created automatically if it doesn't exist.
114 |         """
115 |         self._logger.debug(f"Saving research metadata for ID: {self.research_id}")
116 |         metadata = {
117 |             'topic': self.topic,
118 |             'research_id': self.research_id,
119 |             'english_topic': self.english_topic,
120 |             'research_content': self.research_content,
121 |             'research_plan': self.research_plan
122 |         }
123 |         self._helper.save_research_metadata(metadata)
124 | 
125 |     @property
126 |     def id(self) -> str:
127 |         """Get the research's unique ID
128 | 
129 |         Returns:
130 |             The research's unique ID string
131 |         """
132 |         return self.research_id
133 | 
134 |     def execute_search(self) -> None:
135 |         """Execute searches for all queries in the research plan
136 | 
137 |         Iterates through the research plan, performs advanced searches for each query,
138 |         and saves results in category-specific directories.
139 |         """
140 |         self._logger.info("Starting search execution for all research categories")
141 |         if not self.research_plan:
142 |             error_msg = "No research plan available"
143 |             self._logger.error(error_msg)
144 |             raise ValueError(error_msg)
145 | 
146 |         self._logger.debug(f'Research plan: {self.research_plan}')
147 |         for category_data in self.research_plan:
148 |             category = category_data.get('category')
149 |             if not category:
150 |                 continue
151 | 
152 |             for query in category_data.get('queries_list', []):
153 |                 if not query:
154 |                     continue
155 | 
156 |                 search_results = self._helper.search_advanced(query)
157 |                 self._helper.save_search_results(category, query, search_results)
158 | 
159 |     def get_category_reports(self) -> List[Dict[str, str]]:
160 |         """Retrieve all generated category reports
161 | 
162 |         Returns:
163 |             A list of dictionaries containing the report content for each category
164 |         """
165 |         self._logger.debug("Retrieving all category reports")
166 |         return self._helper.read_category_reports()
167 | 
168 |     def generate_category_report(self, category: str):
169 |         """Generate and save a research report for a specific category
170 | 
171 |         Args:
172 |             category: The category name to generate the report for
173 | 
174 |         Returns:
175 |             A dictionary containing the generated report
176 |         """
177 |         self._logger.info(f"Generating report for category: {category}")
178 |         category_results = self._helper.read_category_results(category)
179 |         report = self._helper.generate_category_report(
180 |             research_content=self.research_content,
181 |             category=category,
182 |             category_resources=category_results
183 |         )
184 |         self._helper.save_category_report(category, report)
185 |         return report
186 | 
187 |     def generate_all_category_links(self) -> List[str]:
188 |         """Generate reference links for all categories in the research plan
189 |         Iterates through each category in the research plan and generates a link for each one.
190 |         Returns:
191 |             A list of strings containing the generated links for each category
192 |         """
193 |         self._logger.info("Generating reference links for all categories")
194 |         if not self.research_plan:
195 |             error_msg = "No research plan available"
196 |             self._logger.error(error_msg)
197 |             raise ValueError(error_msg)
198 | 
199 |         links = []
200 |         for category_data in self.research_plan:
201 |             category = category_data.get('category')
202 |             if not category:
203 |                 continue
204 | 
205 |             try:
206 |                 category_results = self._helper.read_category_results(category)
207 |                 for result in category_results:
208 |                     title = result['title']
209 |                     url = result['url']
210 |                     link = f"[{title}]({url})"
211 |                     links.append(link)
212 |             except Exception as e:
213 |                 self._logger.error(f"Error generating link for category {category}: {str(e)}")
214 |                 continue
215 | 
216 |         reference_content = "## Reference\n\n"
217 |         reference_content += '\n'.join(f'- {item}' for item in links)
218 |         file_path = f'output/{self.research_id}/{self.research_id}_reference.md'
219 | 
220 |         from deep_research.services.persistence_service import PersistenceClient
221 |         persistence_client = PersistenceClient()
222 |         persistence_client.save_file(file_path, reference_content)
223 | 
224 |         return links
225 | 
226 |     def generate_all_category_reports(self) -> List[Dict[str, Any]]:
227 |         """Generate reports for all categories in the research plan
228 | 
229 |         Iterates through each category in the research plan and generates a report for each one.
230 | 
231 |         Returns:
232 |             A list of dictionaries containing the generated reports for each category
233 |         """
234 |         self._logger.info("Generating reports for all research categories")
235 |         if not self.research_plan:
236 |             error_msg = "No research plan available"
237 |             self._logger.error(error_msg)
238 |             raise ValueError(error_msg)
239 | 
240 |         reports = []
241 |         for category_data in self.research_plan:
242 |             category = category_data.get('category')
243 |             if not category:
244 |                 continue
245 | 
246 |             try:
247 |                 report = self.generate_category_report(category)
248 |                 reports.append(report)
249 |             except Exception as e:
250 |                 self._logger.error(f"Error generating report for category {category}: {str(e)}")
251 |                 continue
252 | 
253 |         return reports
254 | 
255 |     def execute(self):
256 |         """Execute the research process
257 |         This method orchestrates the entire research process, including search, report generation, and saving.
258 |         """
259 |         self._logger.info(f"Starting complete research execution for ID: {self.research_id}")
260 |         # Step 1: Execute searches
261 |         self.execute_search()
262 |         # Step 2: Generate reports
263 |         self.generate_all_category_reports()
264 |         # Step 3: Generate links
265 |         self.generate_all_category_links()
266 | 
267 | 
268 | if __name__ == "__main__":
269 |     research = Research(research_id="RS_20250210_175342")
270 | 
271 | 
272 | 
-------------------------------------------------------------------------------- /src/deep_research/core/report.py: --------------------------------------------------------------------------------
1 | """Module for handling report generation operations
2 | 
3 | This module provides functionality for generating various types of research reports,
4 | including WeChat articles, research reports, and detailed research reports.
5 | It uses LLM models for content generation and handles file operations for saving reports.
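Typical usage example (illustrative; the research ID is hypothetical and
must point to an existing run under output/):
    report = Report("RS_20240214_123456")
    content = report.generate_research_report_detailed()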
6 | """ 7 | 8 | from typing import Dict, Any, List 9 | from deep_research.core.config import Config 10 | from deep_research.services.persistence_service import PersistenceClient 11 | from deep_research.utils.research_helper import ResearchHelper 12 | from deep_research.services.ai_service import LLMClient 13 | from deep_research.utils.log_util import LogUtil 14 | import time 15 | import os 16 | from datetime import datetime 17 | 18 | class Report: 19 | """Class for managing report generation operations 20 | 21 | This class handles the generation of different types of research reports, 22 | including WeChat articles and comprehensive research reports. It manages 23 | the loading of research data, report generation using LLM models, and 24 | saving of generated reports. 25 | """ 26 | 27 | def __init__(self, research_id: str): 28 | """Initialize a new report instance 29 | 30 | Args: 31 | research_id: The ID of the research to generate reports for 32 | 33 | Raises: 34 | ValueError: If research data cannot be loaded for the given ID 35 | """ 36 | self.research_id = research_id 37 | self._persistence_client = PersistenceClient() 38 | self._research_helper = ResearchHelper(research_id) 39 | self._logger = LogUtil().logger 40 | self._logger.info(f"Initializing Report instance for research ID: {research_id}") 41 | self._load_research_data() 42 | 43 | def _load_research_data(self) -> None: 44 | """Load research metadata from local storage 45 | 46 | Raises: 47 | ValueError: If research data cannot be loaded or is invalid 48 | """ 49 | file_path = f'output/{self.research_id}/{self.research_id}_meta.json' 50 | try: 51 | self._logger.debug(f"Loading research data from {file_path}") 52 | data = self._persistence_client.load_json(file_path) 53 | self.research_content = data['research_content'] 54 | self.research_plan = data['research_plan'] 55 | self._logger.info("Successfully loaded research data") 56 | except Exception as e: 57 | error_msg = f"Failed to load research data for ID {self.research_id}: {str(e)}" 58 | self._logger.error(error_msg) 59 | raise ValueError(error_msg) 60 | 61 | 62 | def generate_wechat_article(self, model: str = Config.SMART_MODEL) -> str: 63 | """Generate a WeChat article based on research data 64 | 65 | This method generates a user-friendly article suitable for WeChat platform, 66 | incorporating research findings in an engaging and accessible format. 
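        For example (an illustrative call, mirroring this module's __main__
        usage):

            article = report.generate_wechat_article(model="deepseek/deepseek-r1")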
67 | 
68 |         Args:
69 |             model: The LLM model to use for report generation
70 | 
71 |         Returns:
72 |             The generated article content as a string
73 | 
74 |         Raises:
75 |             ValueError: If no research plan is available
76 |         """
77 |         if not self.research_plan:
78 |             self._logger.error("No research plan available for WeChat article generation")
79 |             raise ValueError("No research plan available")
80 | 
81 |         self._logger.info(f"Starting WeChat article generation using model: {model}")
82 |         reports = self._research_helper.read_category_reports()
83 | 
84 | 
85 |         client = LLMClient()
86 |         messages = [
87 |             {"role": "system", "content": f'''你的任务是根据课题和收集到的资料,编写一篇内容详实的微信公众号文章。
88 | 
89 | 要求:
90 | 
91 | - {os.getenv('REPORT_WECHAT_PROMPT', "")}
92 | - {os.getenv('REPORT_PROMPT', "")}
93 | - 要有非常明确的态度和观点。
94 | - 要有一个吸引眼球但不要太肤浅的标题
95 | - 文本要具有亲和力,每个段落要有感情,要有深度,并添加你非常详细的解释涉及到的原理或者知识的背景。
96 | - 不要虚构任何案例和内容,遵照文献里的内容进行交叉比对,思考深度含义。需要非常详细的观点称述,逻辑过程推演和讲解。
97 | - 不要有太多奇怪的比喻。
98 | - 内容有情绪,有代入感,文字通俗,偶尔使用 emoji。
99 | - 文章最后要把所有相关的科学的依据罗列出来。
100 | 
101 | 
102 | ---
103 | 
104 | 话题和相关信息:
105 | {self.research_content}
106 | 
107 | 
108 | 收集的资料和全文报告:
109 | {reports}
110 | 
111 | ---
112 | 输出文章格式:
113 | 
114 | - Use [{Config.REPORT_LANG}]
115 | - Markdown format
116 | - with Key highlighted using ** bold
117 | - Title #
118 | - Section ## with insights
119 | - Subsection ### with lengthy explanation on each section
120 | 
121 | Provide your output in markdown format. 简体中文编写。
122 | 
123 | '''}
124 |         ]
125 |         try:
126 |             report_content = client.chat_completion(messages, model=model, response_format='markdown')
127 |             model_name = self._research_helper.sanitize_filename(model)
128 |             file_path = f'output/{self.research_id}/{self.research_id}_{model_name}_wechat.md'
129 |             self._persistence_client.save_file(file_path, report_content)
130 |             self._logger.info(f"Successfully generated and saved WeChat article to {file_path}")
131 |             return report_content
132 |         except Exception as e:
133 |             error_msg = f"Failed to generate WeChat article: {str(e)}"
134 |             self._logger.error(error_msg)
135 |             raise
136 | 
137 | 
138 |     def generate_research_report(self, model=Config.SMART_MODEL) -> Dict[str, Any]:
139 |         """Generate a comprehensive research report
140 | 
141 |         This method creates a detailed research report that includes analysis,
142 |         insights, and explanations based on the collected literature.
143 | 
144 |         Args:
145 |             model: The LLM model to use for report generation
146 | 
147 |         Returns:
148 |             The generated report content
149 | 
150 |         Raises:
151 |             ValueError: If no research plan is available
152 |         """
153 |         if not self.research_plan:
154 |             self._logger.error("No research plan available for research report generation")
155 |             raise ValueError("No research plan available")
156 | 
157 |         self._logger.info(f"Starting research report generation using model: {model}")
158 |         reports = self._research_helper.read_category_reports()
159 | 
160 | 
161 |         client = LLMClient()
162 |         messages = [
163 |             {"role": "system", "content": f'''Your Task: Based on the provided literature and materials, your goal is to compile a comprehensive and detailed investigative report.
164 | The report should provide extensive analysis, insights, and explanations to ensure sufficient length and depth.
165 | 
166 | Instructions:
167 | 
168 | - {os.getenv('REPORT_PROMPT', "")}
169 | - Always focus on the research goal.
170 | - Integrate the Literature: First, you need to integrate all the content from the provided literature.
Avoid deleting or simplifying the information; instead, reorganize it logically with explain content for each. 171 | - Numbers and Statistics: Always leave the reference source together. 172 | - Develop Insights: carefully analyze the content and the research topic to develop meaningful insights. These insights should go beyond what is explicitly mentioned in the literature and uncover new perspectives or implications. 173 | - Do not mention numbers you don't have evidence to support or skip sections without actual numbers from literature. 174 | - Use Tables or Mermaid Graphs to illustrate but only if needed. 175 | 176 | --- 177 | 178 | Research Topic: 179 | {self.research_content} 180 | 181 | 182 | Collected Literatures: 183 | {reports} 184 | 185 | --- 186 | Report format: 187 | 188 | - Use [{Config.REPORT_LANG}] 189 | - Markdown format 190 | - with Key highlighted using ** bold 191 | - Title # 192 | - Section ## with insights 193 | - Subsection ### with lengthy explaination on each section 194 | 195 | 196 | Provide your output in markdown format. 197 | 198 | '''} 199 | ] 200 | try: 201 | report_content = client.chat_completion(messages, model=model, response_format='markdown') 202 | model_name = self._research_helper.sanitize_filename(model) 203 | file_path = f'output/{self.research_id}/{self.research_id}_{model_name}_research.md' 204 | persistence_client = PersistenceClient() 205 | persistence_client.save_file(file_path, report_content) 206 | self._logger.info(f"Successfully generated and saved research report to {file_path}") 207 | return report_content 208 | except Exception as e: 209 | error_msg = f"Failed to generate research report: {str(e)}" 210 | self._logger.error(error_msg) 211 | raise 212 | 213 | 214 | def generate_research_report_detailed(self, model=Config.REPORT_MODEL) -> Dict[str, Any]: 215 | """Generate a detailed research report 216 | 217 | This method creates a comprehensive and detailed research report with 218 | extensive analysis and insights from the collected literature. 219 | 220 | Args: 221 | model: The LLM model to use for report generation 222 | 223 | Returns: 224 | The generated detailed report content 225 | 226 | Raises: 227 | ValueError: If no research plan is available 228 | """ 229 | if not self.research_plan: 230 | self._logger.error("No research plan available for detailed research report generation") 231 | raise ValueError("No research plan available") 232 | 233 | self._logger.info(f"Starting detailed research report generation using model: {model}") 234 | reports = self._research_helper.read_category_reports() 235 | 236 | 237 | client = LLMClient() 238 | messages = [ 239 | {"role": "system", "content": f'''Your Task: Based on the provided literature and materials, your goal is to compile a comprehensive and detailed investigative report. 240 | 241 | Instructions: 242 | 243 | - {os.getenv('REPORT_PROMPT', "")} 244 | - Always Focus on the research goal. 245 | - Integrate the Literature: First, you need to integrate all the content from the provided literature. 246 | - Numbers and Statistics: Always leave the reference source together. 247 | - Comprehensive and detail, organized structured report with logical section order, do not summarize. 248 | - Conclusion with deepen insights 249 | - Use Tables or Mermaid Graphs to illustrate but if needed. 
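The detailed variant below follows the same call shape as the two generators above. As a hedged illustration of how a caller is meant to drive them (the research ID is hypothetical and presumes a completed run under `output/`):

```python
from deep_research.core.report import Report

# Hypothetical ID: output/RS_20250210_214128/RS_20250210_214128_meta.json must already exist
report = Report("RS_20250210_214128")

article_md = report.generate_wechat_article()   # persisted as <id>_<model>_wechat.md
report_md = report.generate_research_report()   # persisted as <id>_<model>_research.md
```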
214 |     def generate_research_report_detailed(self, model: str = Config.REPORT_MODEL) -> str:
215 |         """Generate a detailed research report
216 | 
217 |         This method creates a comprehensive and detailed research report with
218 |         extensive analysis and insights from the collected literature.
219 | 
220 |         Args:
221 |             model: The LLM model to use for report generation
222 | 
223 |         Returns:
224 |             The generated detailed report content as a markdown string
225 | 
226 |         Raises:
227 |             ValueError: If no research plan is available
228 |         """
229 |         if not self.research_plan:
230 |             self._logger.error("No research plan available for detailed research report generation")
231 |             raise ValueError("No research plan available")
232 | 
233 |         self._logger.info(f"Starting detailed research report generation using model: {model}")
234 |         reports = self._research_helper.read_category_reports()
235 | 
236 | 
237 |         client = LLMClient()
238 |         messages = [
239 |             {"role": "system", "content": f'''Your Task: Based on the provided literature and materials, your goal is to compile a comprehensive and detailed investigative report.
240 | 
241 | Instructions:
242 | 
243 | - {os.getenv('REPORT_PROMPT', "")}
244 | - Always focus on the research goal.
245 | - Integrate the Literature: First, integrate all the content from the provided literature.
246 | - Numbers and Statistics: Always include the reference source alongside them.
247 | - Be comprehensive and detailed: produce an organized, structured report with a logical section order; do not summarize.
248 | - Conclude with deepened insights.
249 | - Use tables or Mermaid graphs to illustrate, but only if needed.
250 | 
251 | ---
252 | 
253 | Research Topic:
254 | {self.research_content}
255 | 
256 | 
257 | Collected Literature:
258 | {reports}
259 | 
260 | ---
261 | Report format:
262 | 
263 | - Use [{Config.REPORT_LANG}]
264 | - Markdown format
265 | - with key points highlighted using ** bold
266 | - Title #
267 | - Section ## with insights
268 | - Subsection ### with detailed content
269 | 
270 | Provide your output in markdown format.
271 | 
272 | '''}
273 |         ]
274 |         try:
275 |             report_content = client.chat_completion(messages, model=model, response_format='markdown')
276 |             model_name = self._research_helper.sanitize_filename(model)
277 |             file_path = f'output/{self.research_id}/{self.research_id}_{model_name}_detail_research.md'
278 |             # Reuse the persistence client created in __init__
279 |             self._persistence_client.save_file(file_path, report_content)
280 |             self._logger.info(f"Successfully generated and saved detailed research report to {file_path}")
281 |             return report_content
282 |         except Exception as e:
283 |             error_msg = f"Failed to generate detailed research report: {str(e)}"
284 |             self._logger.error(error_msg)
285 |             raise
286 | 
287 | 
288 | if __name__ == "__main__":
289 |     research_id = "RS_20250210_214128"  # Example research ID
290 |     report = Report(research_id)
291 |     article_content = report.generate_wechat_article(model='deepseek/deepseek-r1')
292 | 
--------------------------------------------------------------------------------
/src/deep_research/utils/research_helper.py:
--------------------------------------------------------------------------------
1 | 
2 | from typing import Any, Dict, List, Optional
3 | import os
4 | import glob
5 | from deep_research.services.ai_service import LLMClient
6 | from deep_research.services.search_service import SearchClient
7 | from deep_research.services.persistence_service import PersistenceClient
8 | from deep_research.core.config import Config
9 | from deep_research.utils.log_util import LogUtil
10 | from dotenv import load_dotenv
11 | # Load environment variables
12 | load_dotenv()
13 | 
14 | class ResearchHelper:
15 |     """Class for managing research helper operations
16 | 
17 |     This class provides utility methods for handling various research operations,
18 |     including translation, content generation, search execution, and result management.
19 |     It integrates with several services (LLM, Search, Persistence) to provide
20 |     comprehensive research assistance functionality.
21 |     """
22 | 
23 |     def __init__(self, research_id: Optional[str] = None):
24 |         """Initialize a new research helper instance
25 | 
26 |         Args:
27 |             research_id: The ID of the research to work with
28 |         """
29 |         self._logger = LogUtil().logger
30 |         self._logger.info(f"Initializing ResearchHelper with research_id: {research_id}")
31 |         self.research_id = research_id
32 |         self._llm_client = LLMClient()
33 |         self._search_client = SearchClient()
34 |         self._persistence_client = PersistenceClient()
35 | 
36 |     def translate_to_english(self, text: str, **kwargs: Any) -> str:
37 |         """Translate any text to English using the LLM model
38 | 
39 |         Args:
40 |             text: The text to translate to English
41 |             **kwargs: Additional parameters to pass to the API
42 | 
43 |         Returns:
44 |             The English translation as a plain string
45 |         """
46 |         self._logger.debug(f"Translating text to English: {text}")
47 |         messages = [
48 |             {"role": "user", "content": f"""You are a professional translator. Translate the [{text}] to English. Only return the translated text in JSON format: {{"response": ""}}"""}
49 |         ]
50 |         try:
51 |             result = self._llm_client.chat_completion(messages, **kwargs)['response']
52 |             self._logger.info("Successfully translated text to English")
53 |             return result
54 |         except Exception as e:
55 |             self._logger.error(f"Failed to translate text: {str(e)}")
56 |             raise
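Because the prompt above asks the model for a `{"response": ""}` JSON object and the helper unwraps it, callers receive a bare string. A minimal sketch (the input text is illustrative):

```python
helper = ResearchHelper("RS_20250210_214128")  # hypothetical research ID

# Non-English text in, plain English string out
english = helper.translate_to_english("中国信用卡消费市场分析")
print(english)  # e.g. "Analysis of China's credit card consumer market"
```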
57 | 
58 |     def generate_research_content(self, text: str, **kwargs: Any) -> Dict[str, str]:
59 |         """Generate research content based on the topic
60 | 
61 |         Args:
62 |             text: The research topic
63 |             **kwargs: Additional parameters to pass to the API
64 | 
65 |         Returns:
66 |             A dictionary containing the research content
67 |         """
68 |         self._logger.debug(f"Generating research content for topic: {text}")
69 |         messages = [
70 |             {"role": "user", "content": f'''You are a research expert who provides a comprehensive framework of search keywords so the user can gather information for research purposes.
71 | 
72 | The user will provide a topic or target for research.
73 | 
74 | You will think about the topic or target, dive deep into the core question, and define the scope, goal, and significance of the research to establish a solid background for the whole research.
75 | 
76 | Return your result in JSON format:
77 | 
78 | ```
79 | {{
80 |   "original_topic": "",
81 |   "core_research_topic": "",
82 |   "research_scope": "",
83 |   "research_target": ""
84 | }}
85 | ```
86 | Research Topic: [{text}]'''}
87 |         ]
88 |         try:
89 |             result = self._llm_client.smart_completion(messages, **kwargs)
90 |             self._logger.info("Successfully generated research content")
91 |             return result
92 |         except Exception as e:
93 |             self._logger.error(f"Failed to generate research content: {str(e)}")
94 |             raise
95 | 
96 |     def generate_research_plan(self, research_content: Dict[str, str], **kwargs: Any) -> Dict[str, str]:
97 |         """Generate a research plan based on the research content
98 | 
99 |         Args:
100 |             research_content: The research content to base the plan on
101 |             **kwargs: Additional parameters to pass to the API
102 | 
103 |         Returns:
104 |             A dictionary containing the research plan
105 |         """
106 |         self._logger.debug("Generating research plan")
107 |         messages = [
108 |             {"role": "user", "content": f'''You are a research planner who provides a comprehensive framework of search queries so the user can gather information for research purposes.
109 | 
110 | Based on the research information below:
111 | 
112 | - {os.getenv('RESEARCH_PLAN_PROMPT', "")}
113 | - Work out a comprehensive list of queries for collecting information from search engines, covering every aspect of the research goal.
114 | - Each query needs to be specific to the research topic and category, with descriptive phrasing and keywords that narrow the results.
115 | - No more than {os.getenv('NUMBER_OF_QUERIES_IN_CATEGORY', "3")} queries per list.
116 | 
117 | ```
118 | {research_content}
119 | ```
120 | Provide the research plan in the following JSON format:
121 | 
122 | ```
123 | {{
124 |   "research_plan": [
125 |     {{
126 |       "category": "",
127 |       "category_research_goal": "",
128 |       "queries_list": ["", ""]
129 |     }},
130 |     {{
131 |       "category": "",
132 |       "category_research_goal": "",
133 |       "queries_list": ["", ""]
134 |     }}
135 |   ]
136 | }}
137 | ```
138 | 
139 | Rethink until you consider the plan comprehensive enough to find the answer or support the research; adjust or append if anything is still missing.
140 | Provide output in pure JSON format.
141 | '''}
142 |         ]
143 |         try:
144 |             result = self._llm_client.smart_completion(messages, **kwargs)
145 |             self._logger.info("Successfully generated research plan")
146 |             return result
147 |         except Exception as e:
148 |             self._logger.error(f"Failed to generate research plan: {str(e)}")
149 |             raise
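The two prompt-driven methods above compose: the content dict feeds the planner, and the planner's JSON (whose shape is pinned by the schema in the prompt) drives the search loop. A sketch assuming, as the surrounding code implies, that `smart_completion` returns the parsed JSON as a dict:

```python
helper = ResearchHelper("RS_20250210_214128")  # hypothetical ID

content = helper.generate_research_content("China's credit card consumer market")
plan = helper.generate_research_plan(content)

# Walk the structure defined by the prompt's JSON schema
for cat in plan["research_plan"]:
    print(cat["category"], "->", cat["queries_list"])
```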
150 | 
151 |     def save_research_metadata(self, metadata: Dict[str, Any]) -> None:
152 |         """Save research metadata to a JSON file
153 | 
154 |         Args:
155 |             metadata: The metadata to save
156 |         """
157 |         output_file = f'output/{self.research_id}/{self.research_id}_meta.json'
158 |         self._logger.debug(f"Saving research metadata to {output_file}")
159 |         try:
160 |             # save_json persists the metadata; this method intentionally returns None
161 |             self._persistence_client.save_json(metadata, output_file)
162 |             self._logger.info("Successfully saved research metadata")
163 |         except Exception as e:
164 |             self._logger.error(f"Failed to save research metadata: {str(e)}")
165 |             raise
166 | 
167 |     def load_research_metadata(self) -> Dict[str, Any]:
168 |         """Load research metadata from a JSON file
169 | 
170 |         Returns:
171 |             A dictionary containing the research metadata
172 |         """
173 |         file_path = f'output/{self.research_id}/{self.research_id}_meta.json'
174 |         self._logger.debug(f"Loading research metadata from {file_path}")
175 |         try:
176 |             result = self._persistence_client.load_json(file_path)
177 |             self._logger.info("Successfully loaded research metadata")
178 |             return result
179 |         except Exception as e:
180 |             self._logger.error(f"Failed to load research metadata: {str(e)}")
181 |             raise
182 | 
183 |     def search_advanced(self, query: str, **kwargs: Any) -> Dict[str, Any]:
184 |         """Perform an advanced search
185 | 
186 |         Args:
187 |             query: The search query
188 |             **kwargs: Accepted for interface consistency; not currently forwarded to the search client
189 | 
190 |         Returns:
191 |             The search results
192 |         """
193 |         self._logger.debug(f"Executing advanced search with query: {query}")
194 |         try:
195 |             response = self._search_client.search_with_template(
196 |                 query=query,
197 |                 template_name="advanced"
198 |             )
199 |             self._logger.info("Successfully executed advanced search")
200 |             return response
201 |         except Exception as e:
202 |             self._logger.error(f"Failed to execute advanced search: {str(e)}")
203 |             raise
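Putting the helpers together, one plausible per-category collection loop looks like this (the category and query are illustrative; `save_search_results`, `read_category_results`, and the report methods are defined just below):

```python
helper = ResearchHelper("RS_20250210_214128")  # hypothetical ID
content = helper.load_research_metadata()["research_content"]  # written earlier by the pipeline

category = "Market Size"                                   # illustrative plan category
query = "china credit card transaction volume statistics"  # illustrative query

results = helper.search_advanced(query)
helper.save_search_results(category, query, results)

# Fold everything collected for this category into one markdown report
resources = helper.read_category_results(category)
report_md = helper.generate_category_report(content, category, resources)
helper.save_category_report(category, report_md)
```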
204 | 
205 |     def generate_category_report(self, research_content: Dict[str, str], category: str, category_resources: List[Dict[str, str]]) -> str:
206 |         """Generate a report for a specific category
207 | 
208 |         Args:
209 |             research_content: The research content
210 |             category: The category to generate a report for
211 |             category_resources: The resources for the category
212 | 
213 |         Returns:
214 |             The generated report
215 |         """
216 |         self._logger.info(f"Generating summary report for category: {category}")
217 |         messages = [
218 |             {"role": "user", "content": f'''You are a pro researcher. The current research topic is:
219 | 
220 | {research_content}
221 | 
222 | Under the sub research category [{category}]
223 | 
224 | Please read all the collected resources and integrate them into one comprehensive report.
225 | 
226 | Follow the points below:
227 | 
228 | - Based on the specified theme, integrate the relevant literature and materials into a comprehensive report.
229 | - The report should be lengthy and thorough, with each section fully elaborated to ensure no detail from the literature is overlooked.
230 | - Every section is written in a detailed and elaborate manner, with no omission of information.
231 | - The analysis integrates all relevant literature, avoiding any gaps or oversights.
232 | - Structure the report into clear sections.
233 | 
234 | Collected resources to read:
235 | ```
236 | {category_resources}
237 | ```
238 | 
239 | Provide output in Markdown format.
240 | '''}
241 |         ]
242 |         try:
243 |             result = self._llm_client.long_completion(messages, response_format='markdown')
244 |             self._logger.info(f"Successfully generated report for category: {category}")
245 |             return result
246 |         except Exception as e:
247 |             self._logger.error(f"Failed to generate category report: {str(e)}")
248 |             raise
249 | 
250 |     @staticmethod
251 |     def sanitize_filename(name: str) -> str:
252 |         """Sanitize a string to be used as a filename
253 | 
254 |         Args:
255 |             name: The string to sanitize
256 | 
257 |         Returns:
258 |             A sanitized string safe for use as a filename
259 |         """
260 |         import re
261 |         sanitized = re.sub(r'[^\w\s-]', '', name)  # drop anything but word chars, spaces, hyphens
262 |         sanitized = re.sub(r'[-\s]+', '_', sanitized)  # collapse runs of spaces/hyphens to "_"
263 |         return sanitized
264 | 
265 |     def save_search_results(self, category: str, query: str, results: Dict[str, Any]) -> Dict[str, Any]:
266 |         """Save search results to a JSON file
267 | 
268 |         Args:
269 |             category: The category name
270 |             query: The search query
271 |             results: The search results to save
272 | 
273 |         Returns:
274 |             A dictionary indicating success or error
275 |         """
276 |         sanitized_category = self.sanitize_filename(category)
277 |         sanitized_query = self.sanitize_filename(query)
278 |         output_path = f'output/{self.research_id}/{sanitized_category}'
279 |         output_file = f'{output_path}/{sanitized_query}.json'
280 | 
281 |         self._logger.debug(f"Saving search results to {output_file}")
282 |         try:
283 |             os.makedirs(output_path, exist_ok=True)
284 |             result = self._persistence_client.save_json(results, output_file)
285 |             self._logger.info("Successfully saved search results")
286 |             return result
287 |         except Exception as e:
288 |             self._logger.error(f"Failed to save search results: {str(e)}")
289 |             raise
290 | 
291 |     def save_category_report(self, category: str, report: str) -> None:
292 |         """Save a category report to a JSON file
293 | 
294 |         Args:
295 |             category: The category name
296 |             report: The report content
297 |         """
298 |         report_json = {
299 |             "category": category,
300 |             "report": report
301 |         }
302 |         sanitized_category = self.sanitize_filename(category)
303 |         file_path = f'output/{self.research_id}/{sanitized_category}_report.json'
304 | 
305 |         self._logger.debug(f"Saving category report to {file_path}")
306 |         try:
307 |             self._persistence_client.save_json(report_json, file_path)
308 |             self._logger.info(f"Successfully saved report for category: {category}")
309 |         except Exception as e:
310 |             self._logger.error(f"Failed to save category report: {str(e)}")
311 |             raise
312 | 
313 |     def read_category_results(self, category: str) -> List[Dict[str, str]]:
314 |         """Read results for a specific category
315 | 
316 |         Args:
317 |             category: The category name
318 | 
319 |         Returns:
320 |             A list of results for the category
321 |         """
322 |         sanitized_category = self.sanitize_filename(category)
323 |         category_path = f'output/{self.research_id}/{sanitized_category}'
324 | 
325 |         self._logger.debug(f"Reading category results from {category_path}")
326 |         results = []
327 | 
328 |         if not os.path.exists(category_path):
329 |             self._logger.warning(f"Category path does not exist: {category_path}")
330 |             return results
331 | 
332 |         json_files = glob.glob(os.path.join(category_path, '*.json'))
333 |         for json_file in json_files:
334 |             try:
335 |                 data = self._persistence_client.load_json(os.path.relpath(json_file))
336 |                 for result in data.get('results', []):
337 | title = result.get('title', '') 338 | content = result.get('raw_content') or result.get('content', '') 339 | url = result.get('url', '') 340 | if title and content and result.get('score', 0) > 0.6: 341 | results.append({ 342 | 'title': title, 343 | 'url': url, 344 | 'content': content 345 | }) 346 | except Exception as e: 347 | self._logger.error(f"Error processing file {json_file}: {str(e)}") 348 | continue 349 | 350 | self._logger.info(f"Successfully read {len(results)} results for category: {category}") 351 | return results 352 | 353 | def read_category_reports(self) -> List[Dict[str, str]]: 354 | """Read all category reports 355 | 356 | Returns: 357 | A list of category reports 358 | """ 359 | report_path = f'output/{self.research_id}' 360 | self._logger.debug(f"Reading category reports from {report_path}") 361 | reports = [] 362 | 363 | if not os.path.exists(report_path): 364 | self._logger.warning(f"Report path does not exist: {report_path}") 365 | return reports 366 | 367 | json_files = glob.glob(os.path.join(report_path, '*_report.json')) 368 | for json_file in json_files: 369 | try: 370 | data = self._persistence_client.load_json(os.path.relpath(json_file)) 371 | reports.append(data) 372 | except Exception as e: 373 | self._logger.error(f"Error processing file {json_file}: {str(e)}") 374 | continue 375 | 376 | self._logger.info(f"Successfully read {len(reports)} category reports") 377 | return reports 378 | 379 | --------------------------------------------------------------------------------
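Taken together, a hedged end-to-end sketch of how `ResearchHelper` and `Report` compose once a run's artifacts exist under `output/<research_id>/` (the ID is illustrative):

```python
from deep_research.utils.research_helper import ResearchHelper
from deep_research.core.report import Report

research_id = "RS_20250210_214128"  # hypothetical completed run

helper = ResearchHelper(research_id)
print(f"{len(helper.read_category_reports())} category reports on disk")

report = Report(research_id)                 # loads <id>_meta.json or raises ValueError
report.generate_research_report_detailed()   # writes <id>_<model>_detail_research.md
```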