├── .env Example
├── .gitignore
├── Architecture.png
├── DATAGEN.jpg
├── LICENSE
├── README.md
├── agent
│   ├── code_agent.py
│   ├── hypothesis_agent.py
│   ├── note_agent.py
│   ├── process_agent.py
│   ├── quality_review_agent.py
│   ├── refiner_agent.py
│   ├── report_agent.py
│   ├── search_agent.py
│   └── visualization_agent.py
├── core
│   ├── language_models.py
│   ├── node.py
│   ├── router.py
│   ├── state.py
│   └── workflow.py
├── create_agent.py
├── load_cfg.py
├── logger.py
├── main.ipynb
├── main.py
├── requirements.txt
└── tools
    ├── FileEdit.py
    ├── __init__.py
    ├── basetool.py
    └── internet.py

/.env Example:
--------------------------------------------------------------------------------
# Your data storage path (required)
WORKING_DIRECTORY=./data_storage/

# Anaconda installation path (required)
CONDA_PATH=/home/user/anaconda3

# Conda environment name (required)
CONDA_ENV=data_assistant

# ChromeDriver executable path (required)
CHROMEDRIVER_PATH=./chromedriver-linux64/chromedriver

# Firecrawl API key (optional)
# Note: If this key is missing, query capabilities may be reduced
FIRECRAWL_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# OpenAI API key (required)
# Warning: This key is essential; the program will not run without it
OPENAI_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

# LangChain API key (optional)
# Used for monitoring processing runs
LANGCHAIN_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
*.log
*.csv
data_storage/*
*__pycache__/
chromedriver/*
--------------------------------------------------------------------------------
/Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/starpig1129/DATAGEN/cb6f37318e1f78000ea4c0f56ec569bb59956aef/Architecture.png
--------------------------------------------------------------------------------
/DATAGEN.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/starpig1129/DATAGEN/cb6f37318e1f78000ea4c0f56ec569bb59956aef/DATAGEN.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 starpig1129

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DATAGEN (Previously AI-Data-Analysis-MultiAgent)

![DATAGEN Banner](DATAGEN.jpg "DATAGEN Banner")

## About DATAGEN

DATAGEN is a powerful brand name that represents our vision of leveraging artificial intelligence technology for data generation and analysis. The name combines "DATA" and "GEN" (generation), perfectly embodying the core functionality of this project - automated data analysis and research through a multi-agent system.

Visit us at [DATAGEN Digital](https://datagen.digital/) (website under development) to learn more about our vision and services.

![System Architecture](Architecture.png)

## Overview

DATAGEN is an advanced AI-powered data analysis and research platform that utilizes multiple specialized agents to streamline tasks such as data analysis, visualization, and report generation. Our platform leverages cutting-edge technologies including LangChain, OpenAI's GPT models, and LangGraph to handle complex research processes, integrating diverse AI architectures for optimal performance.

## Key Features

### Intelligent Analysis Core
- **Advanced Hypothesis Engine**
  - AI-driven hypothesis generation and validation
  - Automated research direction optimization
  - Real-time hypothesis refinement
- **Enterprise Data Processing**
  - Robust data cleaning and transformation
  - Scalable analysis pipelines
  - Automated quality assurance
- **Dynamic Visualization Suite**
  - Interactive data visualization
  - Custom report generation
  - Automated insight extraction

### Advanced Technical Architecture
- **Multi-Agent Intelligence**
  - Specialized agents for diverse tasks
  - Intelligent task distribution
  - Real-time coordination and optimization
- **Smart Memory Management**
  - State-of-the-art Note Taker agent
  - Efficient context retention system
  - Seamless workflow integration
- **Adaptive Processing Pipeline**
  - Dynamic workflow adjustment
  - Automated resource optimization
  - Real-time performance monitoring

## Why DATAGEN Stands Out

DATAGEN revolutionizes data analysis through its innovative multi-agent architecture and intelligent automation capabilities:

1. **Advanced Multi-Agent System**
   - Specialized agents working in harmony
   - Intelligent task distribution and coordination
   - Real-time adaptation to complex analysis requirements

2. **Smart Context Management**
   - Pioneering Note Taker agent for state tracking
   - Efficient memory utilization and context retention
   - Seamless integration across analysis phases

3. **Enterprise-Grade Performance**
   - Robust and scalable architecture
   - Consistent and reliable outcomes
   - Production-ready implementation

## System Requirements

- Python 3.10 or higher
- Jupyter Notebook environment

## Installation

1. Clone the repository:
   ```bash
   git clone https://github.com/starpig1129/DATAGEN.git
   ```
2. Create and activate a Conda virtual environment:
   ```bash
   conda create -n data_assistant python=3.10
   conda activate data_assistant
   ```
3. Install dependencies:
   ```bash
   pip install -r requirements.txt
   ```
4. Set up environment variables:
   **Rename `.env Example` to `.env` and fill in all the values** (the variable names below match the `.env Example` file shipped with the repository):
   ```sh
   # Your data storage path (required)
   WORKING_DIRECTORY=./data_storage/

   # Anaconda installation path (required)
   CONDA_PATH=/home/user/anaconda3

   # Conda environment name (required)
   CONDA_ENV=data_assistant

   # ChromeDriver executable path (required)
   CHROMEDRIVER_PATH=./chromedriver-linux64/chromedriver

   # Firecrawl API key (optional)
   # Note: If this key is missing, query capabilities may be reduced
   FIRECRAWL_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

   # OpenAI API key (required)
   # Warning: This key is essential; the program will not run without it
   OPENAI_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

   # LangChain API key (optional)
   # Used for monitoring processing runs
   LANGCHAIN_API_KEY=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
   ```

## Usage

### Using Jupyter Notebook

1. Start Jupyter Notebook (run `jupyter notebook` from the project root).

2. Place your dataset (e.g., `YourDataName.csv`) in the `data_storage` directory.

3. Open the `main.ipynb` file.

4. Run all cells to initialize the system and create the workflow.

5. In the last cell, you can customize the research task by modifying the `userInput` variable.

6. Run the final few cells to execute the research process and view the results.

### Using Python Script

You can also run the system directly using `main.py`:

1. Place your data file (e.g., `YourDataName.csv`) in the `data_storage` directory.

2. Run the script:
   ```bash
   python main.py
   ```

3. By default, it will process `OnlineSalesData.csv`. To analyze a different dataset, modify the `user_input` variable in the `main()` function of `main.py`:
   ```python
   user_input = '''
   datapath:YourDataName.csv
   Use machine learning to perform data analysis and write complete graphical reports
   '''
   ```

## Main Components

- `hypothesis_agent`: Generates research hypotheses
- `process_agent`: Supervises the entire research process
- `visualization_agent`: Creates data visualizations
- `code_agent`: Writes data analysis code
- `searcher_agent`: Conducts literature and web searches
- `report_agent`: Writes research reports
- `quality_review_agent`: Performs quality reviews
- `note_agent`: Records the research process

## Workflow

The system uses LangGraph to create a state graph that manages the entire research process. The workflow includes the following steps:

1. Hypothesis generation
2. Human choice (continue or regenerate the hypothesis)
3. Processing (including data analysis, visualization, search, and report writing)
4. Quality review
5. Revision as needed

## Customization

You can customize the system behavior by modifying the agent creation and workflow definition in `main.ipynb`.
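
If you want to drive the system from your own script instead of `main.ipynb`, the snippet below is a minimal sketch of how the pieces in `core/` fit together. It assumes your `.env` has already been loaded (e.g., via `python-dotenv`) so that `OPENAI_API_KEY` and the other required variables are set; the exact wiring in `main.py` may differ, and `main.py` remains the authoritative entry point.

```python
from langchain_core.messages import HumanMessage

from core.language_models import LanguageModelManager
from core.workflow import WorkflowManager

# Build the three shared models (llm / power_llm / json_llm) and the graph.
models = LanguageModelManager().get_models()
graph = WorkflowManager(models, "./data_storage/").get_graph()

user_input = """
datapath:YourDataName.csv
Use machine learning to perform data analysis and write complete graphical reports
"""

# The initial state keys mirror core/state.py; unset fields start empty.
initial_state = {
    "messages": [HumanMessage(content=user_input)],
    "hypothesis": "", "process": "", "process_decision": "",
    "visualization_state": "", "searcher_state": "", "code_state": "",
    "report_section": "", "quality_review": "",
    "needs_revision": False, "sender": "",
}

# Stream intermediate events from the compiled LangGraph graph.
for event in graph.stream(initial_state, {"recursion_limit": 100}):
    print(event)
```

Note that the workflow pauses for console input at the `HumanChoice` and `HumanReview` nodes, so run it in an interactive terminal.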

## Notes

- Ensure you have sufficient OpenAI API credits, as the system will make multiple API calls.
- The system may take some time to complete the entire research process, depending on the complexity of the task.
- **WARNING**: The agent system may modify the data being analyzed. It is highly recommended to back up your data before using this system.

## Known Issues and Planned Improvements

1. OpenAI Internal Server Error (Error code: 500)
2. NoteTaker efficiency improvements
3. Overall runtime optimization
4. Refiner output quality

## Contributing

Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change.

## Strategic Partnership

[![CTL GROUP](https://img.shields.io/badge/DATAGEN-Strategic_Partner-blue)](https://datagen.digital/)

We are excited to announce our upcoming strategic partnership with CTL GROUP, an innovative AI-Powered Crypto Intelligence Platform currently in development. This collaboration will bring together advanced AI research capabilities and crypto market intelligence:

### Upcoming Partnership Features
- **AI Crypto Research Integration**
  - Automated market research and analysis system
  - Advanced whale tracking capabilities
  - Real-time sentiment analysis tools
  - Comprehensive trading insights and strategies

- **Platform Features** (Coming Soon)
  - State-of-the-art AI-powered crypto insights
  - Smart trading strategy development
  - Advanced whale & on-chain activity monitoring
  - Interactive community engagement tools

- **Token Integration Benefits** (Coming Soon)
  - Dynamic staking rewards system
  - Premium tools and features access
  - Innovative passive income opportunities
  - Exclusive platform privileges

The platform is currently under development. Follow our progress on [GitHub](https://github.com/ctlgroupdev).

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=starpig1129/DATAGEN&type=Date)](https://star-history.com/#starpig1129/DATAGEN&Date)

## Other Projects

Here are some of my other notable projects:

### ShareLMAPI
ShareLMAPI is a local language model sharing API that uses FastAPI to provide interfaces, allowing different programs or devices to share the same local model, thereby reducing resource consumption. It supports streaming generation and various model configuration methods.
- GitHub: [ShareLMAPI](https://github.com/starpig1129/ShareLMAPI)

### PigPig: Advanced Multi-modal LLM Discord Bot
A powerful Discord bot based on multi-modal Large Language Models (LLM), designed to interact with users through natural language.
It combines advanced AI capabilities with practical features, offering a rich experience for Discord communities.
227 | - GitHub: [ai-discord-bot-PigPig](https://github.com/starpig1129/ai-discord-bot-PigPig) 228 | -------------------------------------------------------------------------------- /agent/code_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.basetool import execute_code, execute_command 3 | from tools.FileEdit import read_document 4 | 5 | def create_code_agent(power_llm, members, working_directory): 6 | """Create the code agent""" 7 | tools = [read_document, execute_code, execute_command] 8 | system_prompt = """ 9 | You are an expert Python programmer specializing in data processing and analysis. Your main responsibilities include: 10 | 11 | 1. Writing clean, efficient Python code for data manipulation, cleaning, and transformation. 12 | 2. Implementing statistical methods and machine learning algorithms as needed. 13 | 3. Debugging and optimizing existing code for performance improvements. 14 | 4. Adhering to PEP 8 standards and ensuring code readability with meaningful variable and function names. 15 | 16 | Constraints: 17 | - Focus solely on data processing tasks; do not generate visualizations or write non-Python code. 18 | - Provide only valid, executable Python code, including necessary comments for complex logic. 19 | - Avoid unnecessary complexity; prioritize readability and efficiency. 20 | """ 21 | return create_agent( 22 | power_llm, 23 | tools, 24 | system_prompt, 25 | members, 26 | working_directory 27 | ) 28 | -------------------------------------------------------------------------------- /agent/hypothesis_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.FileEdit import collect_data 3 | from langchain_community.tools import WikipediaQueryRun 4 | from langchain_community.utilities import WikipediaAPIWrapper 5 | from tools.internet import google_search, scrape_webpages_with_fallback 6 | from langchain.agents import load_tools 7 | 8 | def create_hypothesis_agent(llm, members, working_directory): 9 | """Create the hypothesis agent""" 10 | wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()) 11 | base_tools = [ 12 | collect_data, 13 | wikipedia, 14 | google_search, 15 | scrape_webpages_with_fallback 16 | ] + load_tools(["arxiv"],) 17 | 18 | system_prompt = ''' 19 | As an esteemed expert in data analysis, your task is to formulate a set of research hypotheses and outline the steps to be taken based on the information table provided. Utilize statistics, machine learning, deep learning, and artificial intelligence in developing these hypotheses. Your hypotheses should be precise, achievable, professional, and innovative. To ensure the feasibility and uniqueness of your hypotheses, thoroughly investigate relevant information. For each hypothesis, include ample references to support your claims. 20 | 21 | Upon analyzing the information table, you are required to: 22 | 23 | 1. Formulate research hypotheses that leverage statistics, machine learning, deep learning, and AI techniques. 24 | 2. Outline the steps involved in testing these hypotheses. 25 | 3. Verify the feasibility and uniqueness of each hypothesis through a comprehensive literature review. 26 | 27 | At the conclusion of your analysis, present the complete research hypotheses, elaborate on their uniqueness and feasibility, and provide relevant references to support your assertions. 
Please answer in a structured way to enhance readability.
Provide only the research hypothesis itself.
    '''

    return create_agent(
        llm,
        base_tools,
        system_prompt,
        members,
        working_directory
    )
--------------------------------------------------------------------------------
/agent/note_agent.py:
--------------------------------------------------------------------------------
from create_agent import create_note_agent as base_create_note_agent
from tools.FileEdit import read_document

def create_note_agent(json_llm):
    """Create the note agent"""
    tools = [read_document]
    system_prompt = '''
You are a meticulous research process note-taker. Your main responsibility is to observe, summarize, and document the actions and findings of the research team. Your tasks include:

1. Observing and recording key activities, decisions, and discussions among team members.
2. Summarizing complex information into clear, concise, and accurate notes.
3. Organizing notes in a structured format that ensures easy retrieval and reference.
4. Highlighting significant insights, breakthroughs, challenges, or any deviations from the research plan.
5. Responding only in JSON format to ensure structured documentation.

Your output should be well-organized and easy to integrate with other project documentation.
    '''
    return base_create_note_agent(
        json_llm,
        tools,
        system_prompt
    )
--------------------------------------------------------------------------------
/agent/process_agent.py:
--------------------------------------------------------------------------------
from create_agent import create_supervisor

def create_process_agent(power_llm):
    """Create the process/supervisor agent"""
    system_prompt = """
You are a research supervisor responsible for overseeing and coordinating a comprehensive data analysis project, resulting in a complete and cohesive research report. Your primary tasks include:

1. Validating and refining the research hypothesis to ensure it is clear, specific, and testable.
2. Orchestrating a thorough data analysis process, with all code well-documented and reproducible.
3. Compiling and refining a research report that includes:
    - Introduction
    - Hypothesis
    - Methodology
    - Results, accompanied by relevant visualizations
    - Discussion
    - Conclusion
    - References

**Step-by-Step Process:**
1. **Planning:** Define clear objectives and expected outcomes for each phase of the project.
2. **Task Assignment:** Assign specific tasks to the appropriate agents ("Visualization," "Search," "Coder," "Report").
3. **Review and Integration:** Critically review and integrate outputs from each agent, ensuring consistency, quality, and relevance.
4. **Feedback:** Provide feedback and further instructions as needed to refine outputs.
5. **Final Compilation:** Ensure all components are logically connected and meet high academic standards.

**Agent Guidelines:**
- **Visualization Agent:** Develop and explain data visualizations that effectively communicate key findings.
- **Search Agent:** Collect and summarize relevant information, and compile a comprehensive list of references.
- **Coder Agent:** Write and document efficient Python code for data analysis, ensuring that the code is clean and reproducible.
30 | - **Report Agent:** Draft, refine, and finalize the research report, integrating inputs from all agents and ensuring the narrative is clear and cohesive. 31 | 32 | **Workflow:** 33 | 1. Plan the overall analysis and reporting process. 34 | 2. Assign tasks to the appropriate agents and oversee their progress. 35 | 3. Continuously review and integrate the outputs from each agent, ensuring that each contributes effectively to the final report. 36 | 4. Adjust the analysis and reporting process based on emerging results and insights. 37 | 5. Compile the final report, ensuring all sections are complete and well-integrated. 38 | 39 | **Completion Criteria:** 40 | Respond with "FINISH" only when: 41 | 1. The hypothesis has been thoroughly tested and validated. 42 | 2. The data analysis is complete, with all code documented and reproducible. 43 | 3. All required visualizations have been created, properly labeled, and explained. 44 | 4. The research report is comprehensive, logically structured, and includes all necessary sections. 45 | 5. The reference list is complete and accurately cited. 46 | 6. All components are cohesively integrated into a polished final report. 47 | 48 | Ensure that the final report delivers a clear, insightful analysis, addressing all aspects of the hypothesis and meeting the highest academic standards. 49 | """ 50 | 51 | member = ["Visualization", "Search", "Coder", "Report"] 52 | return create_supervisor( 53 | power_llm, 54 | system_prompt, 55 | member 56 | ) 57 | -------------------------------------------------------------------------------- /agent/quality_review_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.FileEdit import create_document, read_document, edit_document 3 | 4 | def create_quality_review_agent(llm, members, working_directory): 5 | """Create the quality review agent""" 6 | tools = [create_document, read_document, edit_document] 7 | system_prompt = ''' 8 | You are a meticulous quality control expert responsible for reviewing and ensuring the high standard of all research outputs. Your tasks include: 9 | 10 | 1. Critically evaluating the content, methodology, and conclusions of research reports. 11 | 2. Checking for consistency, accuracy, and clarity in all documents. 12 | 3. Identifying areas that need improvement or further elaboration. 13 | 4. Ensuring adherence to scientific writing standards and ethical guidelines. 14 | 15 | After your review, if revisions are needed, respond with 'REVISION' as a prefix, set needs_revision=True, and provide specific feedback on parts that need improvement. If no revisions are necessary, respond with 'CONTINUE' as a prefix and set needs_revision=False. 
16 | ''' 17 | return create_agent( 18 | llm, 19 | tools, 20 | system_prompt, 21 | members, 22 | working_directory 23 | ) 24 | -------------------------------------------------------------------------------- /agent/refiner_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.FileEdit import create_document, read_document, edit_document 3 | from langchain_community.tools import WikipediaQueryRun 4 | from langchain_community.utilities import WikipediaAPIWrapper 5 | from tools.internet import google_search, scrape_webpages_with_fallback 6 | from langchain.agents import load_tools 7 | 8 | def create_refiner_agent(power_llm, members, working_directory): 9 | """Create the refiner agent""" 10 | wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()) 11 | tools = [ 12 | create_document, 13 | read_document, 14 | edit_document, 15 | wikipedia, 16 | google_search, 17 | scrape_webpages_with_fallback 18 | ] + load_tools(["arxiv"],) 19 | 20 | system_prompt = ''' 21 | You are an expert AI report refiner tasked with optimizing and enhancing research reports. Your responsibilities include: 22 | 23 | 1. Thoroughly reviewing the entire research report, focusing on content, structure, and readability. 24 | 2. Identifying and emphasizing key findings, insights, and conclusions. 25 | 3. Restructuring the report to improve clarity, coherence, and logical flow. 26 | 4. Ensuring that all sections are well-integrated and support the primary research hypothesis. 27 | 5. Condensing redundant or repetitive content while preserving essential details. 28 | 6. Enhancing the overall readability, ensuring the report is engaging and impactful. 29 | 30 | Refinement Guidelines: 31 | - Maintain the scientific accuracy and integrity of the original content. 32 | - Ensure all critical points from the original report are preserved and clearly articulated. 33 | - Improve the logical progression of ideas and arguments. 34 | - Highlight the most significant results and their implications for the research hypothesis. 35 | - Ensure that the refined report aligns with the initial research objectives and hypothesis. 36 | 37 | After refining the report, submit it for final human review, ensuring it is ready for publication or presentation. 38 | ''' 39 | return create_agent( 40 | power_llm, 41 | tools, 42 | system_prompt, 43 | members, 44 | working_directory 45 | ) 46 | -------------------------------------------------------------------------------- /agent/report_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.FileEdit import create_document, read_document, edit_document 3 | 4 | def create_report_agent(power_llm, members, working_directory): 5 | """Create the report agent""" 6 | tools = [create_document, read_document, edit_document] 7 | 8 | system_prompt = """ 9 | You are an experienced scientific writer tasked with drafting comprehensive research reports. Your primary duties include: 10 | 11 | 1. Clearly stating the research hypothesis and objectives in the introduction. 12 | 2. Detailing the methodology used, including data collection and analysis techniques. 13 | 3. Structuring the report into coherent sections (e.g., Introduction, Methodology, Results, Discussion, Conclusion). 14 | 4. Synthesizing information from various sources into a unified narrative. 15 | 5. 
Integrating relevant data visualizations and ensuring they are appropriately referenced and explained. 16 | 17 | Constraints: 18 | - Focus solely on report writing; do not perform data analysis or create visualizations. 19 | - Maintain an objective, academic tone throughout the report. 20 | - Cite all sources using APA style and ensure that all findings are supported by evidence. 21 | """ 22 | return create_agent( 23 | power_llm, 24 | tools, 25 | system_prompt, 26 | members, 27 | working_directory 28 | ) 29 | -------------------------------------------------------------------------------- /agent/search_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.FileEdit import create_document, read_document, collect_data 3 | from langchain_community.tools import WikipediaQueryRun 4 | from langchain_community.utilities import WikipediaAPIWrapper 5 | from tools.internet import google_search, scrape_webpages_with_fallback 6 | from langchain.agents import load_tools 7 | 8 | def create_search_agent(llm, members, working_directory): 9 | """Create the search agent""" 10 | wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()) 11 | tools = [ 12 | create_document, 13 | read_document, 14 | collect_data, 15 | wikipedia, 16 | google_search, 17 | scrape_webpages_with_fallback 18 | ] + load_tools(["arxiv"],) 19 | 20 | system_prompt = """ 21 | You are a skilled research assistant responsible for gathering and summarizing relevant information. Your main tasks include: 22 | 23 | 1. Conducting thorough literature reviews using academic databases and reputable online sources. 24 | 2. Summarizing key findings in a clear, concise manner. 25 | 3. Providing citations for all sources, prioritizing peer-reviewed and academically reputable materials. 26 | 27 | Constraints: 28 | - Focus exclusively on information retrieval and summarization; do not engage in data analysis or processing. 29 | - Present information in an organized format, with clear attributions to sources. 30 | - Evaluate the credibility of sources and prioritize high-quality, reliable information. 31 | """ 32 | return create_agent( 33 | llm, 34 | tools, 35 | system_prompt, 36 | members, 37 | working_directory 38 | ) 39 | -------------------------------------------------------------------------------- /agent/visualization_agent.py: -------------------------------------------------------------------------------- 1 | from create_agent import create_agent 2 | from tools.basetool import execute_code, execute_command 3 | from tools.FileEdit import read_document 4 | 5 | def create_visualization_agent(llm, members, working_directory): 6 | """Create the visualization agent""" 7 | tools = [read_document, execute_code, execute_command] 8 | 9 | system_prompt = """ 10 | You are a data visualization expert tasked with creating insightful visual representations of data. Your primary responsibilities include: 11 | 12 | 1. Designing appropriate visualizations that clearly communicate data trends and patterns. 13 | 2. Selecting the most suitable chart types (e.g., bar charts, scatter plots, heatmaps) for different data types and analytical purposes. 14 | 3. Providing executable Python code (using libraries such as matplotlib, seaborn, or plotly) that generates these visualizations. 15 | 4. Including well-defined titles, axis labels, legends, and saving the visualizations as files. 16 | 5. Offering brief but clear interpretations of the visual findings. 
17 | 18 | **File Saving Guidelines:** 19 | - Save all visualizations as files with descriptive and meaningful filenames. 20 | - Ensure filenames are structured to easily identify the content (e.g., 'sales_trends_2024.png' for a sales trend chart). 21 | - Confirm that the saved files are organized in the working directory, making them easy for other agents to locate and use. 22 | 23 | **Constraints:** 24 | - Focus solely on visualization tasks; do not perform data analysis or preprocessing. 25 | - Ensure all visual elements are suitable for the target audience, with attention to color schemes and design principles. 26 | - Avoid over-complicating visualizations; aim for clarity and simplicity. 27 | """ 28 | return create_agent( 29 | llm, 30 | tools, 31 | system_prompt, 32 | members, 33 | working_directory 34 | ) 35 | -------------------------------------------------------------------------------- /core/language_models.py: -------------------------------------------------------------------------------- 1 | from langchain_openai import ChatOpenAI 2 | from logger import setup_logger 3 | 4 | class LanguageModelManager: 5 | def __init__(self): 6 | """Initialize the language model manager""" 7 | self.logger = setup_logger() 8 | self.llm = None 9 | self.power_llm = None 10 | self.json_llm = None 11 | self.initialize_llms() 12 | 13 | def initialize_llms(self): 14 | """Initialize language models""" 15 | try: 16 | self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=4096) 17 | self.power_llm = ChatOpenAI(model="gpt-4o", temperature=0.5, max_tokens=4096) 18 | self.json_llm = ChatOpenAI( 19 | model="gpt-4o", 20 | model_kwargs={"response_format": {"type": "json_object"}}, 21 | temperature=0, 22 | max_tokens=4096 23 | ) 24 | self.logger.info("Language models initialized successfully.") 25 | except Exception as e: 26 | self.logger.error(f"Error initializing language models: {str(e)}") 27 | raise 28 | 29 | def get_models(self): 30 | """Return all initialized language models""" 31 | return { 32 | "llm": self.llm, 33 | "power_llm": self.power_llm, 34 | "json_llm": self.json_llm 35 | } 36 | -------------------------------------------------------------------------------- /core/node.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | from langchain_core.messages import AIMessage, HumanMessage, BaseMessage,ToolMessage 3 | from openai import InternalServerError 4 | from core.state import State 5 | import logging 6 | import json 7 | import re 8 | import os 9 | from pathlib import Path 10 | from langchain.agents import AgentExecutor 11 | # Set up logger 12 | logger = logging.getLogger(__name__) 13 | 14 | def agent_node(state: State, agent: AgentExecutor, name: str) -> State: 15 | """ 16 | Process an agent's action and update the state accordingly. 
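    Invokes the agent executor on the current state, appends the agent's
    output to the message history as an AIMessage, and mirrors that output
    into the agent-specific state field (e.g. hypothesis, process_decision,
    visualization_state). On failure, returns a dict containing a single
    error AIMessage.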
17 | """ 18 | logger.info(f"Processing agent: {name}") 19 | try: 20 | result = agent.invoke(state) 21 | logger.debug(f"Agent {name} result: {result}") 22 | 23 | output = result["output"] if isinstance(result, dict) and "output" in result else str(result) 24 | 25 | ai_message = AIMessage(content=output, name=name) 26 | state["messages"].append(ai_message) 27 | state["sender"] = name 28 | 29 | if name == "hypothesis_agent" and not state["hypothesis"]: 30 | state["hypothesis"] = ai_message 31 | logger.info("Hypothesis updated") 32 | elif name == "process_agent": 33 | state["process_decision"] = ai_message 34 | logger.info("Process decision updated") 35 | elif name == "visualization_agent": 36 | state["visualization_state"] = ai_message 37 | logger.info("Visualization state updated") 38 | elif name == "searcher_agent": 39 | state["searcher_state"] = ai_message 40 | logger.info("Searcher state updated") 41 | elif name == "report_agent": 42 | state["report_section"] = ai_message 43 | logger.info("Report section updated") 44 | elif name == "quality_review_agent": 45 | state["quality_review"] = ai_message 46 | state["needs_revision"] = "revision needed" in output.lower() 47 | logger.info(f"Quality review updated. Needs revision: {state['needs_revision']}") 48 | 49 | logger.info(f"Agent {name} processing completed") 50 | return state 51 | except Exception as e: 52 | logger.error(f"Error occurred while processing agent {name}: {str(e)}", exc_info=True) 53 | error_message = AIMessage(content=f"Error: {str(e)}", name=name) 54 | return {"messages": [error_message]} 55 | 56 | def human_choice_node(state: State) -> State: 57 | """ 58 | Handle human input to choose the next step in the process. 59 | If regenerating hypothesis, prompt for specific areas to modify. 60 | """ 61 | logger.info("Prompting for human choice") 62 | print("Please choose the next step:") 63 | print("1. Regenerate hypothesis") 64 | print("2. Continue the research process") 65 | 66 | while True: 67 | choice = input("Please enter your choice (1 or 2): ") 68 | if choice in ["1", "2"]: 69 | break 70 | logger.warning(f"Invalid input received: {choice}") 71 | print("Invalid input, please try again.") 72 | 73 | if choice == "1": 74 | modification_areas = input("Please specify which parts of the hypothesis you want to modify: ") 75 | content = f"Regenerate hypothesis. Areas to modify: {modification_areas}" 76 | state["hypothesis"] = "" 77 | state["modification_areas"] = modification_areas 78 | logger.info("Hypothesis cleared for regeneration") 79 | logger.info(f"Areas to modify: {modification_areas}") 80 | else: 81 | content = "Continue the research process" 82 | state["process"] = "Continue the research process" 83 | logger.info("Continuing research process") 84 | 85 | human_message = HumanMessage(content=content) 86 | 87 | state["messages"].append(human_message) 88 | state["sender"] = 'human' 89 | 90 | logger.info("Human choice processed") 91 | return state 92 | 93 | def create_message(message: dict[str], name: str) -> BaseMessage: 94 | """ 95 | Create a BaseMessage object based on the message type. 
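    A dict whose "type" is "human" becomes a HumanMessage; anything else
    is wrapped as an AIMessage attributed to `name`.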
96 | """ 97 | content = message.get("content", "") 98 | message_type = message.get("type", "").lower() 99 | 100 | logger.debug(f"Creating message of type {message_type} for {name}") 101 | return HumanMessage(content=content) if message_type == "human" else AIMessage(content=content, name=name) 102 | 103 | def note_agent_node(state: State, agent: AgentExecutor, name: str) -> State: 104 | """ 105 | Process the note agent's action and update the entire state. 106 | """ 107 | logger.info(f"Processing note agent: {name}") 108 | try: 109 | current_messages = state.get("messages", []) 110 | 111 | head_messages, tail_messages = [], [] 112 | 113 | if len(current_messages) > 6: 114 | head_messages = current_messages[:2] 115 | tail_messages = current_messages[-2:] 116 | state = {**state, "messages": current_messages[2:-2]} 117 | logger.debug("Trimmed messages for processing") 118 | 119 | result = agent.invoke(state) 120 | logger.debug(f"Note agent {name} result: {result}") 121 | output = result["output"] if isinstance(result, dict) and "output" in result else str(result) 122 | 123 | cleaned_output = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', output) 124 | parsed_output = json.loads(cleaned_output) 125 | logger.debug(f"Parsed output: {parsed_output}") 126 | 127 | new_messages = [create_message(msg, name) for msg in parsed_output.get("messages", [])] 128 | 129 | messages = new_messages if new_messages else current_messages 130 | 131 | combined_messages = head_messages + messages + tail_messages 132 | 133 | updated_state: State = { 134 | "messages": combined_messages, 135 | "hypothesis": str(parsed_output.get("hypothesis", state.get("hypothesis", ""))), 136 | "process": str(parsed_output.get("process", state.get("process", ""))), 137 | "process_decision": str(parsed_output.get("process_decision", state.get("process_decision", ""))), 138 | "visualization_state": str(parsed_output.get("visualization_state", state.get("visualization_state", ""))), 139 | "searcher_state": str(parsed_output.get("searcher_state", state.get("searcher_state", ""))), 140 | "code_state": str(parsed_output.get("code_state", state.get("code_state", ""))), 141 | "report_section": str(parsed_output.get("report_section", state.get("report_section", ""))), 142 | "quality_review": str(parsed_output.get("quality_review", state.get("quality_review", ""))), 143 | "needs_revision": bool(parsed_output.get("needs_revision", state.get("needs_revision", False))), 144 | "sender": 'note_agent' 145 | } 146 | 147 | logger.info("Updated state successfully") 148 | return updated_state 149 | 150 | except json.JSONDecodeError as e: 151 | logger.error(f"JSON decode error: {e}", exc_info=True) 152 | return _create_error_state(state, AIMessage(content=f"Error parsing output: {output}", name=name), name, "JSON decode error") 153 | 154 | except InternalServerError as e: 155 | logger.error(f"OpenAI Internal Server Error: {e}", exc_info=True) 156 | return _create_error_state(state, AIMessage(content=f"OpenAI Error: {str(e)}", name=name), name, "OpenAI error") 157 | 158 | except Exception as e: 159 | logger.error(f"Unexpected error in note_agent_node: {e}", exc_info=True) 160 | return _create_error_state(state, AIMessage(content=f"Unexpected error: {str(e)}", name=name), name, "Unexpected error") 161 | 162 | def _create_error_state(state: State, error_message: AIMessage, name: str, error_type: str) -> State: 163 | """ 164 | Create an error state when an exception occurs. 
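    Preserves every existing state field, appends `error_message` to the
    message history, and marks 'note_agent' as the sender.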
165 | """ 166 | logger.info(f"Creating error state for {name}: {error_type}") 167 | error_state:State = { 168 | "messages": state.get("messages", []) + [error_message], 169 | "hypothesis": str(state.get("hypothesis", "")), 170 | "process": str(state.get("process", "")), 171 | "process_decision": str(state.get("process_decision", "")), 172 | "visualization_state": str(state.get("visualization_state", "")), 173 | "searcher_state": str(state.get("searcher_state", "")), 174 | "code_state": str(state.get("code_state", "")), 175 | "report_section": str(state.get("report_section", "")), 176 | "quality_review": str(state.get("quality_review", "")), 177 | "needs_revision": bool(state.get("needs_revision", False)), 178 | "sender": 'note_agent' 179 | } 180 | return error_state 181 | 182 | def human_review_node(state: State) -> State: 183 | """ 184 | Display current state to the user and update the state based on user input. 185 | Includes error handling for robustness. 186 | """ 187 | try: 188 | print("Current research progress:") 189 | print(state) 190 | print("\nDo you need additional analysis or modifications?") 191 | 192 | while True: 193 | user_input = input("Enter 'yes' to continue analysis, or 'no' to end the research: ").lower() 194 | if user_input in ['yes', 'no']: 195 | break 196 | print("Invalid input. Please enter 'yes' or 'no'.") 197 | 198 | if user_input == 'yes': 199 | while True: 200 | additional_request = input("Please enter your additional analysis request: ").strip() 201 | if additional_request: 202 | state["messages"].append(HumanMessage(content=additional_request)) 203 | state["needs_revision"] = True 204 | break 205 | print("Request cannot be empty. Please try again.") 206 | else: 207 | state["needs_revision"] = False 208 | 209 | state["sender"] = "human" 210 | logger.info("Human review completed successfully.") 211 | return state 212 | 213 | except KeyboardInterrupt: 214 | logger.warning("Human review interrupted by user.") 215 | return None 216 | 217 | except Exception as e: 218 | logger.error(f"An error occurred during human review: {str(e)}", exc_info=True) 219 | return None 220 | 221 | def refiner_node(state: State, agent: AgentExecutor, name: str) -> State: 222 | """ 223 | Read MD file contents and PNG file names from the specified storage path, 224 | add them as report materials to a new message, 225 | then process with the agent and update the original state. 226 | If token limit is exceeded, use only MD file names instead of full content. 
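    The storage path is read from the STORAGE_PATH environment variable,
    defaulting to ./data_storage/.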
    """
    try:
        # Get storage path
        storage_path = Path(os.getenv('STORAGE_PATH', './data_storage/'))

        # Collect materials
        materials = []
        md_files = list(storage_path.glob("*.md"))
        png_files = list(storage_path.glob("*.png"))

        # Process MD files
        for md_file in md_files:
            with open(md_file, "r", encoding="utf-8") as f:
                materials.append(f"MD file '{md_file.name}':\n{f.read()}")

        # Process PNG files
        materials.extend(f"PNG file: '{png_file.name}'" for png_file in png_files)

        # Combine materials
        combined_materials = "\n\n".join(materials)
        report_content = f"Report materials:\n{combined_materials}"

        # Create refiner state; use HumanMessage here because BaseMessage is
        # abstract in langchain_core and cannot be instantiated directly
        refiner_state = state.copy()
        refiner_state["messages"] = [HumanMessage(content=report_content)]

        try:
            # Attempt to invoke agent with full content
            result = agent.invoke(refiner_state)
        except Exception as token_error:
            # If the invocation fails (typically because the token limit is
            # exceeded), retry with only the file names
            logger.warning(f"Token limit likely exceeded ({token_error}). Retrying with MD file names only.")
            md_file_names = [f"MD file: '{md_file.name}'" for md_file in md_files]
            png_file_names = [f"PNG file: '{png_file.name}'" for png_file in png_files]

            simplified_materials = "\n".join(md_file_names + png_file_names)
            simplified_report_content = f"Report materials (file names only):\n{simplified_materials}"

            refiner_state["messages"] = [HumanMessage(content=simplified_report_content)]
            result = agent.invoke(refiner_state)

        # Update original state; AgentExecutor returns a dict, so extract the
        # "output" field rather than wrapping the raw dict in a message
        output = result["output"] if isinstance(result, dict) and "output" in result else str(result)
        state["messages"].append(AIMessage(content=output, name=name))
        state["sender"] = name

        logger.info("Refiner node processing completed")
        return state
    except Exception as e:
        logger.error(f"Error occurred while processing refiner node: {str(e)}", exc_info=True)
        state["messages"].append(AIMessage(content=f"Error: {str(e)}", name=name))
        return state

logger.info("Agent processing module initialized")
--------------------------------------------------------------------------------
/core/router.py:
--------------------------------------------------------------------------------
from core.state import State
from typing import Literal, Union, Dict, List, Optional
from langchain_core.messages import AIMessage
import logging
import json

# Set up logger
logger = logging.getLogger(__name__)

# Define types for node routing
NodeType = Literal['Visualization', 'Search', 'Coder', 'Report', 'Process', 'NoteTaker', 'Hypothesis', 'QualityReview']
ProcessNodeType = Literal['Coder', 'Search', 'Visualization', 'Report', 'Process', 'Refiner']

def hypothesis_router(state: State) -> NodeType:
    """
    Route based on the presence of a hypothesis in the state.

    Args:
        state (State): The current state of the system.

    Returns:
        NodeType: 'Hypothesis' if no hypothesis exists, otherwise 'Process'.
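
    Accepts the hypothesis as either an AIMessage or a plain string; any
    other type is treated as an empty hypothesis.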
23 | """ 24 | logger.info("Entering hypothesis_router") 25 | hypothesis: Union[AIMessage, str, None] = state.get("hypothesis") 26 | 27 | try: 28 | if isinstance(hypothesis, AIMessage): 29 | hypothesis_content = hypothesis.content 30 | logger.debug("Hypothesis is an AIMessage") 31 | elif isinstance(hypothesis, str): 32 | hypothesis_content = hypothesis 33 | logger.debug("Hypothesis is a string") 34 | else: 35 | hypothesis_content = "" 36 | logger.warning(f"Unexpected hypothesis type: {type(hypothesis)}") 37 | 38 | if not isinstance(hypothesis_content, str): 39 | hypothesis_content = str(hypothesis_content) 40 | logger.warning("Converting hypothesis content to string") 41 | except Exception as e: 42 | logger.error(f"Error processing hypothesis: {e}") 43 | hypothesis_content = "" 44 | 45 | result = "Hypothesis" if not hypothesis_content.strip() else "Process" 46 | logger.info(f"hypothesis_router decision: {result}") 47 | return result 48 | 49 | def QualityReview_router(state: State) -> NodeType: 50 | """ 51 | Route based on the quality review outcome and process decision. 52 | 53 | Args: 54 | state (State): The current state of the system. 55 | 56 | Returns: 57 | NodeType: The next node to route to based on the quality review and process decision. 58 | """ 59 | logger.info("Entering QualityReview_router") 60 | messages = state.get("messages", []) 61 | last_message = messages[-1] if messages else None 62 | 63 | # Check if revision is needed 64 | if (last_message and 'REVISION' in str(last_message.content)) or state.get("needs_revision", False): 65 | previous_node = state.get("last_sender", "") 66 | revision_routes = { 67 | "Visualization": "Visualization", 68 | "Search": "Search", 69 | "Coder": "Coder", 70 | "Report": "Report" 71 | } 72 | result = revision_routes.get(previous_node, "NoteTaker") 73 | logger.info(f"Revision needed. Routing to: {result}") 74 | return result 75 | 76 | else: 77 | return "NoteTaker" 78 | 79 | 80 | def process_router(state: State) -> ProcessNodeType: 81 | """ 82 | Route based on the process decision in the state. 83 | 84 | Args: 85 | state (State): The current state of the system. 86 | 87 | Returns: 88 | ProcessNodeType: The next process node to route to based on the process decision. 89 | """ 90 | logger.info("Entering process_router") 91 | process_decision: Union[AIMessage, Dict, str, None] = state.get("process_decision", "") 92 | 93 | decision_str: str = "" 94 | 95 | try: 96 | if isinstance(process_decision, AIMessage): 97 | logger.debug("Process decision is an AIMessage") 98 | try: 99 | decision_dict = json.loads(process_decision.content.replace("'", '"')) 100 | decision_str = str(decision_dict.get('next', '')) 101 | except json.JSONDecodeError as e: 102 | logger.warning(f"JSON parse error: {e}. Using content directly.") 103 | decision_str = process_decision.content 104 | elif isinstance(process_decision, dict): 105 | decision_str = str(process_decision.get('next', '')) 106 | else: 107 | decision_str = str(process_decision) 108 | except Exception as e: 109 | logger.error(f"Error processing decision: {e}") 110 | decision_str = "" 111 | 112 | # Define valid decisions 113 | valid_decisions = {"Coder", "Search", "Visualization", "Report"} 114 | 115 | if decision_str in valid_decisions: 116 | logger.info(f"Valid process decision: {decision_str}") 117 | return decision_str 118 | 119 | if decision_str == "FINISH": 120 | logger.info("Process decision is FINISH. 
Ending process.") 121 | return "Refiner" 122 | 123 | # If decision_str is empty or not a valid decision, return "Process" 124 | if not decision_str or decision_str not in valid_decisions: 125 | logger.warning(f"Invalid or empty process decision: {decision_str}. Defaulting to 'Process'.") 126 | return "Process" 127 | 128 | # Default to "Process" 129 | logger.info("Defaulting to 'Process'") 130 | return "Process" 131 | 132 | logger.info("Router module initialized") 133 | -------------------------------------------------------------------------------- /core/state.py: -------------------------------------------------------------------------------- 1 | from langchain_core.messages import BaseMessage 2 | from typing import Sequence, TypedDict 3 | from pydantic import BaseModel, Field 4 | 5 | class State(TypedDict): 6 | """TypedDict for the entire state structure.""" 7 | # The sequence of messages exchanged in the conversation 8 | messages: Sequence[BaseMessage] 9 | 10 | # The complete content of the research hypothesis 11 | hypothesis: str = "" 12 | 13 | # The complete content of the research process 14 | process: str = "" 15 | 16 | # next process 17 | process_decision: str = "" 18 | 19 | # The current state of data visualization planning and execution 20 | visualization_state: str = "" 21 | 22 | # The current state of the search process, including queries and results 23 | searcher_state: str = "" 24 | 25 | # The current state of Coder development, including scripts and outputs 26 | code_state: str = "" 27 | 28 | # The content of the report sections being written 29 | report_section: str = "" 30 | 31 | # The feedback and comments from the quality review process 32 | quality_review: str = "" 33 | 34 | # A boolean flag indicating if the current output requires revision 35 | needs_revision: bool = False 36 | 37 | # The identifier of the agent who sent the last message 38 | sender: str = "" 39 | 40 | class NoteState(BaseModel): 41 | """Pydantic model for the entire state structure.""" 42 | messages: Sequence[BaseMessage] = Field(default_factory=list, description="List of message dictionaries") 43 | hypothesis: str = Field(default="", description="Current research hypothesis") 44 | process: str = Field(default="", description="Current research process") 45 | process_decision: str = Field(default="", description="Decision about the next process step") 46 | visualization_state: str = Field(default="", description="Current state of data visualization") 47 | searcher_state: str = Field(default="", description="Current state of the search process") 48 | code_state: str = Field(default="", description="Current state of code development") 49 | report_section: str = Field(default="", description="Content of the report sections") 50 | quality_review: str = Field(default="", description="Feedback from quality review") 51 | needs_revision: bool = Field(default=False, description="Flag indicating if revision is needed") 52 | sender: str = Field(default="", description="Identifier of the last message sender") 53 | 54 | class Config: 55 | arbitrary_types_allowed = True # Allow BaseMessage type without explicit validator 56 | -------------------------------------------------------------------------------- /core/workflow.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | from langgraph.graph import StateGraph, END, START 3 | from langgraph.checkpoint.memory import MemorySaver 4 | from core.state import State 5 | from core.node import agent_node, 
human_choice_node, note_agent_node, human_review_node, refiner_node 6 | from core.router import QualityReview_router, hypothesis_router, process_router 7 | from agent.hypothesis_agent import create_hypothesis_agent 8 | from agent.process_agent import create_process_agent 9 | from agent.visualization_agent import create_visualization_agent 10 | from agent.code_agent import create_code_agent 11 | from agent.search_agent import create_search_agent 12 | from agent.report_agent import create_report_agent 13 | from agent.quality_review_agent import create_quality_review_agent 14 | from agent.note_agent import create_note_agent 15 | from agent.refiner_agent import create_refiner_agent 16 | 17 | class WorkflowManager: 18 | def __init__(self, language_models, working_directory): 19 | """ 20 | Initialize the workflow manager with language models and working directory. 21 | 22 | Args: 23 | language_models (dict): Dictionary containing language model instances 24 | working_directory (str): Path to the working directory 25 | """ 26 | self.language_models = language_models 27 | self.working_directory = working_directory 28 | self.workflow = None 29 | self.memory = None 30 | self.graph = None 31 | self.members = ["Hypothesis", "Process", "Visualization", "Search", "Coder", "Report", "QualityReview", "Refiner"] 32 | self.agents = self.create_agents() 33 | self.setup_workflow() 34 | 35 | def create_agents(self): 36 | """Create all system agents""" 37 | # Get language models 38 | llm = self.language_models["llm"] 39 | power_llm = self.language_models["power_llm"] 40 | json_llm = self.language_models["json_llm"] 41 | 42 | # Create agents dictionary 43 | agents = {} 44 | 45 | # Create each agent using their respective creation functions 46 | agents["hypothesis_agent"] = create_hypothesis_agent( 47 | llm, 48 | self.members, 49 | self.working_directory 50 | ) 51 | 52 | agents["process_agent"] = create_process_agent(power_llm) 53 | 54 | agents["visualization_agent"] = create_visualization_agent( 55 | llm, 56 | self.members, 57 | self.working_directory 58 | ) 59 | 60 | agents["code_agent"] = create_code_agent( 61 | power_llm, 62 | self.members, 63 | self.working_directory 64 | ) 65 | 66 | agents["searcher_agent"] = create_search_agent( 67 | llm, 68 | self.members, 69 | self.working_directory 70 | ) 71 | 72 | agents["report_agent"] = create_report_agent( 73 | power_llm, 74 | self.members, 75 | self.working_directory 76 | ) 77 | 78 | agents["quality_review_agent"] = create_quality_review_agent( 79 | llm, 80 | self.members, 81 | self.working_directory 82 | ) 83 | 84 | agents["note_agent"] = create_note_agent(json_llm) 85 | 86 | agents["refiner_agent"] = create_refiner_agent( 87 | power_llm, 88 | self.members, 89 | self.working_directory 90 | ) 91 | 92 | return agents 93 | 94 | def setup_workflow(self): 95 | """Set up the workflow graph""" 96 | self.workflow = StateGraph(State) 97 | 98 | # Add nodes 99 | self.workflow.add_node("Hypothesis", lambda state: agent_node(state, self.agents["hypothesis_agent"], "hypothesis_agent")) 100 | self.workflow.add_node("Process", lambda state: agent_node(state, self.agents["process_agent"], "process_agent")) 101 | self.workflow.add_node("Visualization", lambda state: agent_node(state, self.agents["visualization_agent"], "visualization_agent")) 102 | self.workflow.add_node("Search", lambda state: agent_node(state, self.agents["searcher_agent"], "searcher_agent")) 103 | self.workflow.add_node("Coder", lambda state: agent_node(state, self.agents["code_agent"], "code_agent")) 104 | 
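        # Reporting, quality-review, note-taking, human-in-the-loop, and refiner nodes follow the same registration pattern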
self.workflow.add_node("Report", lambda state: agent_node(state, self.agents["report_agent"], "report_agent")) 105 | self.workflow.add_node("QualityReview", lambda state: agent_node(state, self.agents["quality_review_agent"], "quality_review_agent")) 106 | self.workflow.add_node("NoteTaker", lambda state: note_agent_node(state, self.agents["note_agent"], "note_agent")) 107 | self.workflow.add_node("HumanChoice", human_choice_node) 108 | self.workflow.add_node("HumanReview", human_review_node) 109 | self.workflow.add_node("Refiner", lambda state: refiner_node(state, self.agents["refiner_agent"], "refiner_agent")) 110 | 111 | # Add edges 112 | self.workflow.add_edge(START, "Hypothesis") 113 | self.workflow.add_edge("Hypothesis", "HumanChoice") 114 | 115 | self.workflow.add_conditional_edges( 116 | "HumanChoice", 117 | hypothesis_router, 118 | { 119 | "Hypothesis": "Hypothesis", 120 | "Process": "Process" 121 | } 122 | ) 123 | 124 | self.workflow.add_conditional_edges( 125 | "Process", 126 | process_router, 127 | { 128 | "Coder": "Coder", 129 | "Search": "Search", 130 | "Visualization": "Visualization", 131 | "Report": "Report", 132 | "Process": "Process", 133 | "Refiner": "Refiner", 134 | } 135 | ) 136 | 137 | for member in ["Visualization", 'Search', 'Coder', 'Report']: 138 | self.workflow.add_edge(member, "QualityReview") 139 | 140 | self.workflow.add_conditional_edges( 141 | "QualityReview", 142 | QualityReview_router, 143 | { 144 | 'Visualization': "Visualization", 145 | 'Search': "Search", 146 | 'Coder': "Coder", 147 | 'Report': "Report", 148 | 'NoteTaker': "NoteTaker", 149 | } 150 | ) 151 | 152 | self.workflow.add_edge("NoteTaker", "Process") 153 | self.workflow.add_edge("Refiner", "HumanReview") 154 | 155 | self.workflow.add_conditional_edges( 156 | "HumanReview", 157 | lambda state: "Process" if state and state.get("needs_revision", False) else "END", 158 | { 159 | "Process": "Process", 160 | "END": END 161 | } 162 | ) 163 | 164 | # Compile workflow 165 | self.memory = MemorySaver() 166 | self.graph = self.workflow.compile() 167 | 168 | def get_graph(self): 169 | """Return the compiled workflow graph""" 170 | return self.graph 171 | -------------------------------------------------------------------------------- /create_agent.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import create_openai_functions_agent, AgentExecutor 2 | from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder 3 | from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser 4 | from langchain_openai import ChatOpenAI 5 | from typing import List 6 | from langchain.tools import tool 7 | import os 8 | from logger import setup_logger 9 | 10 | # Set up logger 11 | logger = setup_logger() 12 | 13 | @tool 14 | def list_directory_contents(directory: str = './data_storage/') -> str: 15 | """ 16 | List the contents of the specified directory. 17 | 18 | Args: 19 | directory (str): The path to the directory to list. Defaults to the data storage directory. 20 | 21 | Returns: 22 | str: A string representation of the directory contents. 
23 | """ 24 | try: 25 | logger.info(f"Listing contents of directory: {directory}") 26 | contents = os.listdir(directory) 27 | logger.debug(f"Directory contents: {contents}") 28 | return f"Directory contents :\n" + "\n".join(contents) 29 | except Exception as e: 30 | logger.error(f"Error listing directory contents: {str(e)}") 31 | return f"Error listing directory contents: {str(e)}" 32 | 33 | def create_agent( 34 | llm: ChatOpenAI, 35 | tools: list[tool], 36 | system_message: str, 37 | team_members: list[str], 38 | working_directory: str = './data_storage/' 39 | ) -> AgentExecutor: 40 | """ 41 | Create an agent with the given language model, tools, system message, and team members. 42 | 43 | Parameters: 44 | llm (ChatOpenAI): The language model to use for the agent. 45 | tools (list[tool]): A list of tools the agent can use. 46 | system_message (str): A message defining the agent's role and tasks. 47 | team_members (list[str]): A list of team member roles for collaboration. 48 | working_directory (str): The directory where the agent's data will be stored. 49 | 50 | Returns: 51 | AgentExecutor: An executor that manages the agent's task execution. 52 | """ 53 | 54 | logger.info("Creating agent") 55 | 56 | # Ensure the ListDirectoryContents tool is available 57 | if list_directory_contents not in tools: 58 | tools.append(list_directory_contents) 59 | 60 | # Prepare the tool names and team members for the system prompt 61 | tool_names = ", ".join([tool.name for tool in tools]) 62 | team_members_str = ", ".join(team_members) 63 | 64 | # List the initial contents of the working directory 65 | initial_directory_contents = list_directory_contents(working_directory) 66 | 67 | # Create the system prompt for the agent 68 | system_prompt = ( 69 | "You are a specialized AI assistant in a data analysis team. " 70 | "Your role is to complete specific tasks in the research process. " 71 | "Use the provided tools to make progress on your task. " 72 | "If you can't fully complete a task, explain what you've done and what's needed next. " 73 | "Always aim for accurate and clear outputs. " 74 | f"You have access to the following tools: {tool_names}. " 75 | f"Your specific role: {system_message}\n" 76 | "Work autonomously according to your specialty, using the tools available to you. " 77 | "Do not ask for clarification. " 78 | "Your other team members (and other teams) will collaborate with you based on their specialties. " 79 | f"You are chosen for a reason! You are one of the following team members: {team_members_str}.\n" 80 | f"The initial contents of your working directory are:\n{initial_directory_contents}\n" 81 | "Use the ListDirectoryContents tool to check for updates in the directory contents when needed." 
82 | ) 83 | 84 | # Define the prompt structure with placeholders for dynamic content 85 | prompt = ChatPromptTemplate.from_messages([ 86 | ("system", system_prompt), 87 | MessagesPlaceholder(variable_name="messages"), 88 | ("ai", "hypothesis: {hypothesis}"), 89 | ("ai", "process: {process}"), 90 | ("ai", "process_decision: {process_decision}"), 91 | ("ai", "visualization_state: {visualization_state}"), 92 | ("ai", "searcher_state: {searcher_state}"), 93 | ("ai", "code_state: {code_state}"), 94 | ("ai", "report_section: {report_section}"), 95 | ("ai", "quality_review: {quality_review}"), 96 | ("ai", "needs_revision: {needs_revision}"), 97 | MessagesPlaceholder(variable_name="agent_scratchpad"), 98 | ]) 99 | 100 | # Create the agent using the defined prompt and tools 101 | agent = create_openai_functions_agent(llm=llm, tools=tools, prompt=prompt) 102 | 103 | logger.info("Agent created successfully") 104 | 105 | # Return an executor to manage the agent's task execution 106 | return AgentExecutor.from_agent_and_tools(agent=agent, tools=tools, verbose=False) 107 | 108 | 109 | def create_supervisor(llm: ChatOpenAI, system_prompt: str, members: list[str]) -> AgentExecutor: 110 | # Log the start of supervisor creation 111 | logger.info("Creating supervisor") 112 | 113 | # Define options for routing, including FINISH and team members 114 | options = ["FINISH"] + members 115 | 116 | # Define the function for routing and task assignment 117 | function_def = { 118 | "name": "route", 119 | "description": "Select the next role and assign a task.", 120 | "parameters": { 121 | "title": "routeSchema", 122 | "type": "object", 123 | "properties": { 124 | "next": { 125 | "title": "Next", 126 | "anyOf": [ 127 | {"enum": options}, 128 | ], 129 | }, 130 | "task": { 131 | "title": "Task", 132 | "type": "string", 133 | "description": "The task to be performed by the selected agent" 134 | } 135 | }, 136 | "required": ["next", "task"], 137 | }, 138 | } 139 | 140 | # Create the prompt template 141 | prompt = ChatPromptTemplate.from_messages( 142 | [ 143 | ("system", system_prompt), 144 | MessagesPlaceholder(variable_name="messages"), 145 | ( 146 | "system", 147 | "Given the conversation above, who should act next? " 148 | "Or should we FINISH? Select one of: {options}. " 149 | "Additionally, specify the task that the selected role should perform." 150 | ), 151 | ] 152 | ).partial(options=str(options), team_members=", ".join(members)) 153 | 154 | # Log successful creation of supervisor 155 | logger.info("Supervisor created successfully") 156 | 157 | # Return the chained operations 158 | return ( 159 | prompt 160 | | llm.bind_functions(functions=[function_def], function_call="route") 161 | | JsonOutputFunctionsParser() 162 | ) 163 | 164 | from core.state import NoteState 165 | from langchain.output_parsers import PydanticOutputParser 166 | 167 | def create_note_agent( 168 | llm: ChatOpenAI, 169 | tools: list, 170 | system_prompt: str, 171 | ) -> AgentExecutor: 172 | """ 173 | Create a Note Agent that updates the entire state. 
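
    The note agent is expected to reply with a JSON object matching the NoteState
    schema: format instructions from PydanticOutputParser are appended to the system
    prompt below, with braces escaped so they are not treated as template variables.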
174 | """ 175 | logger.info("Creating note agent") 176 | parser = PydanticOutputParser(pydantic_object=NoteState) 177 | output_format = parser.get_format_instructions() 178 | escaped_output_format = output_format.replace("{", "{{").replace("}", "}}") 179 | prompt = ChatPromptTemplate.from_messages([ 180 | ("system", system_prompt+"\n\nPlease format your response as a JSON object with the following structure:\n"+escaped_output_format), 181 | MessagesPlaceholder(variable_name="messages"), 182 | MessagesPlaceholder(variable_name="agent_scratchpad"), 183 | ]) 184 | logger.debug(f"Note agent prompt: {prompt}") 185 | agent = create_openai_functions_agent(llm=llm, tools=tools, prompt=prompt) 186 | logger.info("Note agent created successfully") 187 | return AgentExecutor.from_agent_and_tools( 188 | agent=agent, 189 | tools=tools, 190 | verbose=False, 191 | ) 192 | 193 | logger.info("Agent creation module initialized") -------------------------------------------------------------------------------- /load_cfg.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | # Load environment variables 4 | load_dotenv() 5 | 6 | # Set up API keys and environment variables 7 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 8 | LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY') 9 | FIRECRAWL_API_KEY = os.getenv('FIRECRAWL_API_KEY') 10 | # Get working directory from environment variable 11 | WORKING_DIRECTORY = os.getenv('WORKING_DIRECTORY', './data_storage/') 12 | # Get Conda-related paths from environment variables 13 | CONDA_PATH = os.getenv('CONDA_PATH', '/home/user/anaconda3') 14 | CONDA_ENV = os.getenv('CONDA_ENV', 'base') 15 | # Get ChromeDriver 16 | CHROMEDRIVER_PATH = os.getenv('CHROMEDRIVER_PATH', './chromedriver/chromedriver') -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | # Configure logging 3 | def setup_logger(log_file:str='agent.log'): 4 | logger = logging.getLogger(__name__) 5 | logger.setLevel(logging.DEBUG) 6 | 7 | # Remove any existing handlers to prevent duplicates 8 | if logger.hasHandlers(): 9 | logger.handlers.clear() 10 | 11 | # File handler 12 | file_handler = logging.FileHandler(log_file) 13 | file_handler.setLevel(logging.DEBUG) 14 | 15 | # Console handler 16 | console_handler = logging.StreamHandler() 17 | console_handler.setLevel(logging.INFO) 18 | 19 | # Formatter 20 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 21 | file_handler.setFormatter(formatter) 22 | console_handler.setFormatter(formatter) 23 | 24 | # Add handlers 25 | logger.addHandler(file_handler) 26 | logger.addHandler(console_handler) 27 | 28 | return logger 29 | -------------------------------------------------------------------------------- /main.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "from logger import setup_logger\n", 11 | "from langchain_openai import ChatOpenAI\n", 12 | "from langgraph.graph import StateGraph\n", 13 | "from load_cfg import OPENAI_API_KEY,LANGCHAIN_API_KEY,WORKING_DIRECTORY\n", 14 | "# Set environment variables\n", 15 | "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n", 16 | "os.environ[\"LANGCHAIN_API_KEY\"] = 
LANGCHAIN_API_KEY\n", 17 | "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", 18 | "os.environ[\"LANGCHAIN_PROJECT\"] = \"Multi-Agent Data Analysis System\"\n", 19 | "\n", 20 | "# Set up logger\n", 21 | "logger = setup_logger()\n", 22 | "\n", 23 | "# Initialize language models\n", 24 | "try:\n", 25 | " llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0, max_tokens=4096)\n", 26 | " power_llm = ChatOpenAI(model=\"gpt-4o\", temperature=0.5, max_tokens=4096)\n", 27 | " json_llm = ChatOpenAI(\n", 28 | " model=\"gpt-4o\",\n", 29 | " model_kwargs={\"response_format\": {\"type\": \"json_object\"}},\n", 30 | " temperature=0,\n", 31 | " max_tokens=4096\n", 32 | " )\n", 33 | " logger.info(\"Language models initialized successfully.\")\n", 34 | "except Exception as e:\n", 35 | " logger.error(f\"Error initializing language models: {str(e)}\")\n", 36 | " raise\n", 37 | "\n", 38 | "# Ensure working directory exists\n", 39 | "if not os.path.exists(WORKING_DIRECTORY):\n", 40 | " os.makedirs(WORKING_DIRECTORY)\n", 41 | " logger.info(f\"Created working directory: {WORKING_DIRECTORY}\")\n", 42 | "\n", 43 | "logger.info(\"Initialization complete.\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from core.state import State\n", 53 | "from core.node import agent_node,human_choice_node,note_agent_node,human_review_node,refiner_node\n", 54 | "from create_agent import create_agent,create_supervisor\n", 55 | "from core.router import QualityReview_router,hypothesis_router,process_router" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# Create state graph for the workflow\n", 65 | "workflow = StateGraph(State)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "members = [\"Hypothesis\",\"Process\",\"Visualization\", \"Search\", \"Coder\", \"Report\", \"QualityReview\",\"Refiner\"]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "from tools.internet import google_search,scrape_webpages_with_fallback\n", 84 | "from tools.basetool import execute_code,execute_command\n", 85 | "from tools.FileEdit import create_document,read_document,edit_document,collect_data\n", 86 | "from langchain.agents import load_tools\n", 87 | "from langchain_community.tools import WikipediaQueryRun\n", 88 | "from langchain_community.utilities import WikipediaAPIWrapper\n", 89 | "wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())\n", 90 | "hypothesis_agent = create_agent(\n", 91 | "llm, \n", 92 | "[collect_data,wikipedia,google_search,scrape_webpages_with_fallback]+load_tools([\"arxiv\"],),\n", 93 | "'''\n", 94 | "As an esteemed expert in data analysis, your task is to formulate a set of research hypotheses and outline the steps to be taken based on the information table provided. Utilize statistics, machine learning, deep learning, and artificial intelligence in developing these hypotheses. Your hypotheses should be precise, achievable, professional, and innovative. To ensure the feasibility and uniqueness of your hypotheses, thoroughly investigate relevant information. For each hypothesis, include ample references to support your claims.\n", 95 | "\n", 96 | "Upon analyzing the information table, you are required to:\n", 97 | "\n", 98 | "1. 
Formulate research hypotheses that leverage statistics, machine learning, deep learning, and AI techniques.\n", 99 | "2. Outline the steps involved in testing these hypotheses.\n", 100 | "3. Verify the feasibility and uniqueness of each hypothesis through a comprehensive literature review.\n", 101 | "\n", 102 | "At the conclusion of your analysis, present the complete research hypotheses, elaborate on their uniqueness and feasibility, and provide relevant references to support your assertions. Please answer in a structured way to enhance readability.\n", 103 | "Answer with just one research hypothesis.\n", 104 | "''',\n", 105 | "members,WORKING_DIRECTORY)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "process_agent = create_supervisor(\n", 115 | " power_llm,\n", 116 | " \"\"\"\n", 117 | " You are a research supervisor responsible for overseeing and coordinating a comprehensive data analysis project, resulting in a complete and cohesive research report. Your primary tasks include:\n", 118 | "\n", 119 | " 1. Validating and refining the research hypothesis to ensure it is clear, specific, and testable.\n", 120 | " 2. Orchestrating a thorough data analysis process, with all code well-documented and reproducible.\n", 121 | " 3. Compiling and refining a research report that includes:\n", 122 | " - Introduction\n", 123 | " - Hypothesis\n", 124 | " - Methodology\n", 125 | " - Results, accompanied by relevant visualizations\n", 126 | " - Discussion\n", 127 | " - Conclusion\n", 128 | " - References\n", 129 | "\n", 130 | " **Step-by-Step Process:**\n", 131 | " 1. **Planning:** Define clear objectives and expected outcomes for each phase of the project.\n", 132 | " 2. **Task Assignment:** Assign specific tasks to the appropriate agents (\"Visualization,\" \"Search,\" \"Coder,\" \"Report\").\n", 133 | " 3. **Review and Integration:** Critically review and integrate outputs from each agent, ensuring consistency, quality, and relevance.\n", 134 | " 4. **Feedback:** Provide feedback and further instructions as needed to refine outputs.\n", 135 | " 5. **Final Compilation:** Ensure all components are logically connected and meet high academic standards.\n", 136 | "\n", 137 | " **Agent Guidelines:**\n", 138 | " - **Visualization Agent:** Develop and explain data visualizations that effectively communicate key findings.\n", 139 | " - **Search Agent:** Collect and summarize relevant information, and compile a comprehensive list of references.\n", 140 | " - **Coder Agent:** Write and document efficient Python code for data analysis, ensuring that the code is clean and reproducible.\n", 141 | " - **Report Agent:** Draft, refine, and finalize the research report, integrating inputs from all agents and ensuring the narrative is clear and cohesive.\n", 142 | "\n", 143 | " **Workflow:**\n", 144 | " 1. Plan the overall analysis and reporting process.\n", 145 | " 2. Assign tasks to the appropriate agents and oversee their progress.\n", 146 | " 3. Continuously review and integrate the outputs from each agent, ensuring that each contributes effectively to the final report.\n", 147 | " 4. Adjust the analysis and reporting process based on emerging results and insights.\n", 148 | " 5. Compile the final report, ensuring all sections are complete and well-integrated.\n", 149 | "\n", 150 | " **Completion Criteria:**\n", 151 | " Respond with \"FINISH\" only when:\n", 152 | " 1. 
The hypothesis has been thoroughly tested and validated.\n", 153 | " 2. The data analysis is complete, with all code documented and reproducible.\n", 154 | " 3. All required visualizations have been created, properly labeled, and explained.\n", 155 | " 4. The research report is comprehensive, logically structured, and includes all necessary sections.\n", 156 | " 5. The reference list is complete and accurately cited.\n", 157 | " 6. All components are cohesively integrated into a polished final report.\n", 158 | "\n", 159 | " Ensure that the final report delivers a clear, insightful analysis, addressing all aspects of the hypothesis and meeting the highest academic standards.\n", 160 | " \"\"\",\n", 161 | " [\"Visualization\", \"Search\", \"Coder\", \"Report\"],\n", 162 | ")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "visualization_agent = create_agent(\n", 172 | " llm, \n", 173 | " [read_document, execute_code, execute_command],\n", 174 | " \"\"\"\n", 175 | " You are a data visualization expert tasked with creating insightful visual representations of data. Your primary responsibilities include:\n", 176 | " \n", 177 | " 1. Designing appropriate visualizations that clearly communicate data trends and patterns.\n", 178 | " 2. Selecting the most suitable chart types (e.g., bar charts, scatter plots, heatmaps) for different data types and analytical purposes.\n", 179 | " 3. Providing executable Python code (using libraries such as matplotlib, seaborn, or plotly) that generates these visualizations.\n", 180 | " 4. Including well-defined titles, axis labels, legends, and saving the visualizations as files.\n", 181 | " 5. Offering brief but clear interpretations of the visual findings.\n", 182 | "\n", 183 | " **File Saving Guidelines:**\n", 184 | " - Save all visualizations as files with descriptive and meaningful filenames.\n", 185 | " - Ensure filenames are structured to easily identify the content (e.g., 'sales_trends_2024.png' for a sales trend chart).\n", 186 | " - Confirm that the saved files are organized in the working directory, making them easy for other agents to locate and use.\n", 187 | "\n", 188 | " **Constraints:**\n", 189 | " - Focus solely on visualization tasks; do not perform data analysis or preprocessing.\n", 190 | " - Ensure all visual elements are suitable for the target audience, with attention to color schemes and design principles.\n", 191 | " - Avoid over-complicating visualizations; aim for clarity and simplicity.\n", 192 | " \"\"\",\n", 193 | " members,WORKING_DIRECTORY\n", 194 | " )\n", 195 | "\n", 196 | "code_agent = create_agent(\n", 197 | " power_llm,\n", 198 | " [read_document,execute_code, execute_command],\n", 199 | " \"\"\"\n", 200 | " You are an expert Python programmer specializing in data processing and analysis. Your main responsibilities include:\n", 201 | "\n", 202 | " 1. Writing clean, efficient Python code for data manipulation, cleaning, and transformation.\n", 203 | " 2. Implementing statistical methods and machine learning algorithms as needed.\n", 204 | " 3. Debugging and optimizing existing code for performance improvements.\n", 205 | " 4. 
Adhering to PEP 8 standards and ensuring code readability with meaningful variable and function names.\n", 206 | "\n", 207 | " Constraints:\n", 208 | " - Focus solely on data processing tasks; do not generate visualizations or write non-Python code.\n", 209 | " - Provide only valid, executable Python code, including necessary comments for complex logic.\n", 210 | " - Avoid unnecessary complexity; prioritize readability and efficiency.\n", 211 | " \"\"\",\n", 212 | " members,WORKING_DIRECTORY\n", 213 | ")\n", 214 | "\n", 215 | "searcher_agent= create_agent(\n", 216 | " llm,\n", 217 | " [create_document,read_document, collect_data,wikipedia,google_search,scrape_webpages_with_fallback]+load_tools([\"arxiv\"],),\n", 218 | " \"\"\"\n", 219 | " You are a skilled research assistant responsible for gathering and summarizing relevant information. Your main tasks include:\n", 220 | "\n", 221 | " 1. Conducting thorough literature reviews using academic databases and reputable online sources.\n", 222 | " 2. Summarizing key findings in a clear, concise manner.\n", 223 | " 3. Providing citations for all sources, prioritizing peer-reviewed and academically reputable materials.\n", 224 | "\n", 225 | " Constraints:\n", 226 | " - Focus exclusively on information retrieval and summarization; do not engage in data analysis or processing.\n", 227 | " - Present information in an organized format, with clear attributions to sources.\n", 228 | " - Evaluate the credibility of sources and prioritize high-quality, reliable information.\n", 229 | " \"\"\",\n", 230 | " members,WORKING_DIRECTORY\n", 231 | " )\n", 232 | "\n", 233 | "report_agent = create_agent(\n", 234 | " power_llm, \n", 235 | " [create_document, read_document, edit_document], \n", 236 | " \"\"\"\n", 237 | " You are an experienced scientific writer tasked with drafting comprehensive research reports. Your primary duties include:\n", 238 | "\n", 239 | " 1. Clearly stating the research hypothesis and objectives in the introduction.\n", 240 | " 2. Detailing the methodology used, including data collection and analysis techniques.\n", 241 | " 3. Structuring the report into coherent sections (e.g., Introduction, Methodology, Results, Discussion, Conclusion).\n", 242 | " 4. Synthesizing information from various sources into a unified narrative.\n", 243 | " 5. Integrating relevant data visualizations and ensuring they are appropriately referenced and explained.\n", 244 | "\n", 245 | " Constraints:\n", 246 | " - Focus solely on report writing; do not perform data analysis or create visualizations.\n", 247 | " - Maintain an objective, academic tone throughout the report.\n", 248 | " - Cite all sources using APA style and ensure that all findings are supported by evidence.\n", 249 | " \"\"\",\n", 250 | " members,WORKING_DIRECTORY\n", 251 | ")" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "quality_review_agent=create_agent(\n", 261 | " llm, \n", 262 | " [create_document,read_document,edit_document], \n", 263 | " '''\n", 264 | " You are a meticulous quality control expert responsible for reviewing and ensuring the high standard of all research outputs. Your tasks include:\n", 265 | "\n", 266 | " 1. Critically evaluating the content, methodology, and conclusions of research reports.\n", 267 | " 2. Checking for consistency, accuracy, and clarity in all documents.\n", 268 | " 3. Identifying areas that need improvement or further elaboration.\n", 269 | " 4. 
Ensuring adherence to scientific writing standards and ethical guidelines.\n", 270 | "\n", 271 | " After your review, if revisions are needed, respond with 'REVISION' as a prefix, set needs_revision=True, and provide specific feedback on parts that need improvement. If no revisions are necessary, respond with 'CONTINUE' as a prefix and set needs_revision=False.\n", 272 | " ''',\n", 273 | " members,WORKING_DIRECTORY\n", 274 | " )\n", 275 | " " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "from create_agent import create_note_agent\n", 285 | "note_agent=create_note_agent(\n", 286 | " json_llm, \n", 287 | " [read_document], \n", 288 | " '''\n", 289 | " You are a meticulous research process note-taker. Your main responsibility is to observe, summarize, and document the actions and findings of the research team. Your tasks include:\n", 290 | "\n", 291 | " 1. Observing and recording key activities, decisions, and discussions among team members.\n", 292 | " 2. Summarizing complex information into clear, concise, and accurate notes.\n", 293 | " 3. Organizing notes in a structured format that ensures easy retrieval and reference.\n", 294 | " 4. Highlighting significant insights, breakthroughs, challenges, or any deviations from the research plan.\n", 295 | " 5. Responding only in JSON format to ensure structured documentation.\n", 296 | "\n", 297 | " Your output should be well-organized and easy to integrate with other project documentation.\n", 298 | " ''',\n", 299 | " )" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "refiner_agent = create_agent(\n", 309 | " power_llm, \n", 310 | " [read_document, edit_document,create_document,collect_data,wikipedia,google_search,scrape_webpages_with_fallback]+load_tools([\"arxiv\"],),\n", 311 | " '''\n", 312 | " You are an expert AI report refiner tasked with optimizing and enhancing research reports. Your responsibilities include:\n", 313 | "\n", 314 | " 1. Thoroughly reviewing the entire research report, focusing on content, structure, and readability.\n", 315 | " 2. Identifying and emphasizing key findings, insights, and conclusions.\n", 316 | " 3. Restructuring the report to improve clarity, coherence, and logical flow.\n", 317 | " 4. Ensuring that all sections are well-integrated and support the primary research hypothesis.\n", 318 | " 5. Condensing redundant or repetitive content while preserving essential details.\n", 319 | " 6. 
Enhancing the overall readability, ensuring the report is engaging and impactful.\n", 320 | "\n", 321 | " Refinement Guidelines:\n", 322 | " - Maintain the scientific accuracy and integrity of the original content.\n", 323 | " - Ensure all critical points from the original report are preserved and clearly articulated.\n", 324 | " - Improve the logical progression of ideas and arguments.\n", 325 | " - Highlight the most significant results and their implications for the research hypothesis.\n", 326 | " - Ensure that the refined report aligns with the initial research objectives and hypothesis.\n", 327 | "\n", 328 | " After refining the report, submit it for final human review, ensuring it is ready for publication or presentation.\n", 329 | " ''',\n", 330 | " members, \n", 331 | " WORKING_DIRECTORY\n", 332 | ")" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "workflow.add_node(\"Hypothesis\", lambda state: agent_node(state, hypothesis_agent, \"hypothesis_agent\"))\n", 342 | "workflow.add_node(\"Process\", lambda state: agent_node(state, process_agent, \"process_agent\"))\n", 343 | "workflow.add_node(\"Visualization\", lambda state: agent_node(state, visualization_agent, \"visualization_agent\"))\n", 344 | "workflow.add_node(\"Search\", lambda state: agent_node(state, searcher_agent, \"searcher_agent\"))\n", 345 | "workflow.add_node(\"Coder\", lambda state: agent_node(state, code_agent, \"code_agent\"))\n", 346 | "workflow.add_node(\"Report\", lambda state: agent_node(state, report_agent, \"report_agent\"))\n", 347 | "workflow.add_node(\"QualityReview\", lambda state: agent_node(state, quality_review_agent, \"quality_review_agent\"))\n", 348 | "workflow.add_node(\"NoteTaker\", lambda state: note_agent_node(state, note_agent, \"note_agent\"))\n", 349 | "workflow.add_node(\"HumanChoice\", human_choice_node)\n", 350 | "workflow.add_node(\"HumanReview\", human_review_node)\n", 351 | "workflow.add_node(\"Refiner\", lambda state: refiner_node(state, refiner_agent, \"refiner_agent\"))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "from langgraph.graph import END, START\n", 361 | "\n", 362 | "workflow.add_edge(\"Hypothesis\", \"HumanChoice\")\n", 363 | "workflow.add_conditional_edges(\n", 364 | " \"HumanChoice\",\n", 365 | " hypothesis_router,\n", 366 | " {\n", 367 | " \"Hypothesis\": \"Hypothesis\",\n", 368 | " \"Process\": \"Process\"\n", 369 | " }\n", 370 | ")\n", 371 | "\n", 372 | "workflow.add_conditional_edges(\n", 373 | " \"Process\",\n", 374 | " process_router,\n", 375 | " {\n", 376 | " \"Coder\": \"Coder\",\n", 377 | " \"Search\": \"Search\",\n", 378 | " \"Visualization\": \"Visualization\",\n", 379 | " \"Report\": \"Report\",\n", 380 | " \"Process\": \"Process\",\n", 381 | " \"Refiner\": \"Refiner\",\n", 382 | " }\n", 383 | ")\n", 384 | "\n", 385 | "for member in [\"Visualization\",'Search','Coder','Report']:\n", 386 | " workflow.add_edge(member, \"QualityReview\")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "workflow.add_conditional_edges(\n", 396 | " \"QualityReview\",\n", 397 | " QualityReview_router,\n", 398 | " {\n", 399 | " 'Visualization': \"Visualization\",\n", 400 | " 'Search': \"Search\",\n", 401 | " 'Coder': \"Coder\",\n", 402 | " 'Report': \"Report\",\n", 403 | " 
'NoteTaker': \"NoteTaker\",\n", 404 | " }\n", 405 | ")\n", 406 | "workflow.add_edge(\"NoteTaker\", \"Process\")" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "workflow.add_edge(\"Refiner\", \"HumanReview\")" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "# Add an edge from HumanReview to Process\n", 425 | "workflow.add_conditional_edges(\n", 426 | " \"HumanReview\",\n", 427 | " lambda state: \"Process\" if state and state.get(\"needs_revision\", False) else \"END\",\n", 428 | " {\n", 429 | " \"Process\": \"Process\",\n", 430 | " \"END\": END\n", 431 | " }\n", 432 | ")" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "from langgraph.checkpoint.memory import MemorySaver\n", 442 | "workflow.add_edge(START, \"Hypothesis\")\n", 443 | "memory = MemorySaver()\n", 444 | "graph = workflow.compile()" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "from IPython.display import Image, display\n", 454 | "\n", 455 | "#display(Image(graph.get_graph().draw_mermaid_png()))" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "#OnlineSalesData.csv is my data set for demo\n", 465 | "userInput = '''\n", 466 | "datapath:OnlineSalesData.csv\n", 467 | "Use machine learning to perform data analysis and write complete graphical reports\n", 468 | "'''\n", 469 | "#Here you can describe how you want your data to be processed" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": {}, 476 | "outputs": [], 477 | "source": [ 478 | "from langchain_core.messages import HumanMessage\n", 479 | "events = graph.stream(\n", 480 | " {\n", 481 | " \"messages\": [\n", 482 | " HumanMessage(\n", 483 | " content=userInput\n", 484 | " ),\n", 485 | " ],\n", 486 | " \"hypothesis\": \"\",\n", 487 | " \"process_decision\":\"\",\n", 488 | " \"process\": \"\",\n", 489 | " \"visualization_state\": \"\",\n", 490 | " \"searcher_state\": \"\",\n", 491 | " \"code_state\": \"\",\n", 492 | " \"report_section\": \"\",\n", 493 | " \"quality_review\": \"\",\n", 494 | " \"needs_revision\": False,\n", 495 | " \"last_sender\": \"\",\n", 496 | " },\n", 497 | " {\"configurable\": {\"thread_id\": \"1\"}, \"recursion_limit\": 3000},\n", 498 | " stream_mode=\"values\",\n", 499 | " debug=False\n", 500 | ")" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "def print_stream(stream):\n", 510 | " for s in stream:\n", 511 | " message = s[\"messages\"][-1]\n", 512 | " if isinstance(message, tuple):\n", 513 | " print(message,end='',flush=True)\n", 514 | " else:\n", 515 | " message.pretty_print()" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "print_stream(events)" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "kernelspec": { 530 | "display_name": "DataAnalysis", 531 | "language": "python", 532 | "name": "python3" 533 | }, 534 | "language_info": { 535 | "codemirror_mode": { 536 | "name": "ipython", 537 | 
"version": 3 538 | }, 539 | "file_extension": ".py", 540 | "mimetype": "text/x-python", 541 | "name": "python", 542 | "nbconvert_exporter": "python", 543 | "pygments_lexer": "ipython3", 544 | "version": "3.10.13" 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 2 549 | } 550 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from typing import Dict, Any 5 | from logger import setup_logger 6 | from langchain_core.messages import HumanMessage 7 | 8 | from load_cfg import OPENAI_API_KEY, LANGCHAIN_API_KEY, WORKING_DIRECTORY 9 | from core.workflow import WorkflowManager 10 | from core.language_models import LanguageModelManager 11 | 12 | class MultiAgentSystem: 13 | def __init__(self): 14 | self.logger = setup_logger() 15 | self.setup_environment() 16 | self.lm_manager = LanguageModelManager() 17 | self.workflow_manager = WorkflowManager( 18 | language_models=self.lm_manager.get_models(), 19 | working_directory=WORKING_DIRECTORY 20 | ) 21 | 22 | def setup_environment(self): 23 | """Initialize environment variables""" 24 | os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY 25 | os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY 26 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 27 | os.environ["LANGCHAIN_PROJECT"] = "Multi-Agent Data Analysis System" 28 | 29 | if not os.path.exists(WORKING_DIRECTORY): 30 | os.makedirs(WORKING_DIRECTORY) 31 | self.logger.info(f"Created working directory: {WORKING_DIRECTORY}") 32 | 33 | def run(self, user_input: str) -> None: 34 | """Run the multi-agent system with user input""" 35 | graph = self.workflow_manager.get_graph() 36 | events = graph.stream( 37 | { 38 | "messages": [HumanMessage(content=user_input)], 39 | "hypothesis": "", 40 | "process_decision": "", 41 | "process": "", 42 | "visualization_state": "", 43 | "searcher_state": "", 44 | "code_state": "", 45 | "report_section": "", 46 | "quality_review": "", 47 | "needs_revision": False, 48 | "last_sender": "", 49 | }, 50 | {"configurable": {"thread_id": "1"}, "recursion_limit": 3000}, 51 | stream_mode="values", 52 | debug=False 53 | ) 54 | 55 | for event in events: 56 | message = event["messages"][-1] 57 | if isinstance(message, tuple): 58 | print(message, end='', flush=True) 59 | else: 60 | message.pretty_print() 61 | 62 | def main(): 63 | """Main entry point""" 64 | system = MultiAgentSystem() 65 | 66 | # Example usage 67 | user_input = ''' 68 | datapath:OnlineSalesData.csv 69 | Use machine learning to perform data analysis and write complete graphical reports 70 | ''' 71 | system.run(user_input) 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | arxiv==2.1.3 2 | beautifulsoup4==4.13.3 3 | langchain==0.3.12 4 | langchain-community==0.3.12 5 | langchain-openai==0.2.13 6 | langgraph==0.2.73 7 | pandas==2.2.2 8 | python-dotenv==1.0.1 9 | selenium==4.27.1 10 | wikipedia==1.4.0 11 | firecrawl-py==0.0.20 12 | openai==1.55.3 13 | -------------------------------------------------------------------------------- /tools/FileEdit.py: -------------------------------------------------------------------------------- 1 | import os 2 | from langchain_core.tools import tool 3 | import pandas as pd 4 | from typing import Dict, Optional, Annotated, List 5 | from 
logger import setup_logger 6 | from load_cfg import WORKING_DIRECTORY 7 | 8 | # Set up logger 9 | logger = setup_logger() 10 | 11 | # Ensure the working directory exists 12 | if not os.path.exists(WORKING_DIRECTORY): 13 | os.makedirs(WORKING_DIRECTORY) 14 | logger.info(f"Created working directory: {WORKING_DIRECTORY}") 15 | 16 | def normalize_path(file_path: str) -> str: 17 | """ 18 | Normalize file path for cross-platform compatibility. 19 | 20 | Args: 21 | file_path (str): The file path to normalize 22 | 23 | Returns: 24 | str: Normalized file path 25 | """ 26 | if WORKING_DIRECTORY not in file_path: 27 | file_path = os.path.join(WORKING_DIRECTORY, file_path) 28 | return os.path.normpath(file_path) 29 | 30 | @tool 31 | def collect_data(data_path: Annotated[str, "Path to the CSV file"] = './data.csv'): 32 | """ 33 | Collect data from a CSV file. 34 | 35 | This function attempts to read a CSV file using different encodings. 36 | 37 | Returns: 38 | pandas.DataFrame: The data read from the CSV file. 39 | 40 | Raises: 41 | ValueError: If unable to read the file with any of the provided encodings. 42 | """ 43 | data_path = normalize_path(data_path) 44 | logger.info(f"Attempting to read CSV file: {data_path}") 45 | encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252'] 46 | for encoding in encodings: 47 | try: 48 | data = pd.read_csv(data_path, encoding=encoding) 49 | logger.info(f"Successfully read CSV file with encoding: {encoding}") 50 | return data 51 | except Exception as e: 52 | logger.warning(f"Error with encoding {encoding}: {e}") 53 | logger.error("Unable to read file with provided encodings") 54 | raise ValueError("Unable to read file with provided encodings") 55 | 56 | @tool 57 | def create_document( 58 | points: Annotated[List[str], "List of points to be included in the document"], 59 | file_name: Annotated[str, "Name of the file to save the document"] 60 | ) -> str: 61 | """ 62 | Create and save a text document in Markdown format. 63 | 64 | This function takes a list of points and writes them as numbered items in a Markdown file. 65 | 66 | Returns: 67 | str: A message indicating where the outline was saved or an error message. 68 | """ 69 | try: 70 | file_path = normalize_path(file_name) 71 | logger.info(f"Creating document: {file_path}") 72 | with open(file_path, "w", encoding='utf-8') as file: 73 | for i, point in enumerate(points): 74 | file.write(f"{i + 1}. {point}\n") 75 | logger.info(f"Document created successfully: {file_path}") 76 | return f"Outline saved to {file_path}" 77 | except Exception as e: 78 | logger.error(f"Error while saving outline: {str(e)}") 79 | return f"Error while saving outline: {str(e)}" 80 | 81 | @tool 82 | def read_document( 83 | file_name: Annotated[str, "Name of the file to read"], 84 | start: Annotated[Optional[int], "Starting line number to read from"] = None, 85 | end: Annotated[Optional[int], "Ending line number to read to"] = None 86 | ) -> str: 87 | """ 88 | Read the specified document. 89 | 90 | This function reads a document from the specified file and returns its content. 91 | Optionally, it can return a specific range of lines. 92 | 93 | Returns: 94 | str: The content of the document or an error message. 
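
        Example (illustrative; file names are resolved against the working directory):
            text = read_document.invoke({"file_name": "notes.md", "start": 0, "end": 10})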
95 | """ 96 | try: 97 | file_path = normalize_path(file_name) 98 | logger.info(f"Reading document: {file_path}") 99 | with open(file_path, "r", encoding='utf-8') as file: 100 | lines = file.readlines() 101 | if start is None: 102 | start = 0 103 | content = "\n".join(lines[start:end]) 104 | logger.info(f"Document read successfully: {file_path}") 105 | return content 106 | except FileNotFoundError: 107 | logger.error(f"File not found: {file_name}") 108 | return f"Error: The file {file_name} was not found." 109 | except Exception as e: 110 | logger.error(f"Error while reading document: {str(e)}") 111 | return f"Error while reading document: {str(e)}" 112 | 113 | @tool 114 | def write_document( 115 | content: Annotated[str, "Content to be written to the document"], 116 | file_name: Annotated[str, "Name of the file to save the document"] 117 | ) -> str: 118 | """ 119 | Create and save a Markdown document. 120 | 121 | This function takes a string of content and writes it to a file. 122 | """ 123 | try: 124 | file_path = normalize_path(file_name) 125 | logger.info(f"Writing document: {file_path}") 126 | with open(file_path, "w", encoding='utf-8') as file: 127 | file.write(content) 128 | logger.info(f"Document written successfully: {file_path}") 129 | return f"Document saved to {file_path}" 130 | except Exception as e: 131 | logger.error(f"Error while saving document: {str(e)}") 132 | return f"Error while saving document: {str(e)}" 133 | 134 | @tool 135 | def edit_document( 136 | file_name: Annotated[str, "Name of the file to edit"], 137 | inserts: Annotated[Dict[int, str], "Dictionary of line numbers and text to insert"] 138 | ) -> str: 139 | """ 140 | Edit a document by inserting text at specific line numbers. 141 | 142 | This function reads an existing document, inserts new text at specified line numbers, 143 | and saves the modified document. 144 | 145 | Args: 146 | file_name (str): Name of the file to edit. 147 | inserts (Dict[int, str]): Dictionary where keys are line numbers and values are text to insert. 148 | 149 | Returns: 150 | str: A message indicating the result of the operation. 151 | 152 | Example: 153 | file_name = "example.txt" 154 | inserts = { 155 | 1: "This is the first line to insert.", 156 | 3: "This is the third line to insert." 157 | } 158 | result = edit_document(file_name=file_name, inserts=inserts) 159 | print(result) 160 | # Output: "Document edited and saved to /path/to/example.txt" 161 | """ 162 | try: 163 | file_path = normalize_path(file_name) 164 | logger.info(f"Editing document: {file_path}") 165 | with open(file_path, "r", encoding='utf-8') as file: 166 | lines = file.readlines() 167 | 168 | sorted_inserts = sorted(inserts.items()) 169 | 170 | for line_number, text in sorted_inserts: 171 | if 1 <= line_number <= len(lines) + 1: 172 | lines.insert(line_number - 1, text + "\n") 173 | else: 174 | logger.error(f"Line number out of range: {line_number}") 175 | return f"Error: Line number {line_number} is out of range." 176 | 177 | with open(file_path, "w", encoding='utf-8') as file: 178 | file.writelines(lines) 179 | 180 | logger.info(f"Document edited successfully: {file_path}") 181 | return f"Document edited and saved to {file_path}" 182 | except FileNotFoundError: 183 | logger.error(f"File not found: {file_name}") 184 | return f"Error: The file {file_name} was not found." 
185 | except Exception as e: 186 | logger.error(f"Error while editing document: {str(e)}") 187 | return f"Error while editing document: {str(e)}" 188 | 189 | logger.info("Document management tools initialized") 190 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/starpig1129/DATAGEN/cb6f37318e1f78000ea4c0f56ec569bb59956aef/tools/__init__.py -------------------------------------------------------------------------------- /tools/basetool.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import platform 4 | from typing import Annotated 5 | import subprocess 6 | from langchain_core.tools import tool 7 | from logger import setup_logger 8 | from load_cfg import WORKING_DIRECTORY,CONDA_PATH,CONDA_ENV 9 | 10 | # Initialize logger 11 | logger = setup_logger() 12 | 13 | # Ensure the storage directory exists 14 | if not os.path.exists(WORKING_DIRECTORY): 15 | os.makedirs(WORKING_DIRECTORY) 16 | logger.info(f"Created storage directory: {WORKING_DIRECTORY}") 17 | 18 | def get_platform_specific_command(command: str) -> tuple: 19 | """ 20 | Get platform-specific command execution details. 21 | Returns a tuple of (shell_command, shell_type, executable) 22 | """ 23 | system = platform.system().lower() 24 | if system == "windows": 25 | # Windows-specific command 26 | conda_commands = [ 27 | f"call {os.path.join(CONDA_PATH, 'Scripts', 'activate.bat')}", 28 | f"conda activate {CONDA_ENV}", 29 | command 30 | ] 31 | return (" && ".join(conda_commands), True, None) 32 | else: 33 | # Unix-like systems (Linux, macOS) 34 | conda_commands = [ 35 | f"source {os.path.join(CONDA_PATH, 'etc/profile.d/conda.sh')}", 36 | f"conda activate {CONDA_ENV}", 37 | command 38 | ] 39 | return (" && ".join(conda_commands), True, "/bin/bash") 40 | 41 | @tool 42 | def execute_code( 43 | input_code: Annotated[str, "The Python code to execute."], 44 | codefile_name: Annotated[str, "The Python code file name or full path."] = 'code.py' 45 | ): 46 | """ 47 | Execute Python code in a specified conda environment and return the result. 48 | 49 | This function takes Python code as input, writes it to a file, executes it in the specified 50 | conda environment, and returns the output or any errors encountered during execution. 51 | 52 | Args: 53 | input_code (str): The Python code to be executed. 54 | codefile_name (str): The name of the file to save the code in, or the full path. 55 | 56 | Returns: 57 | dict: A dictionary containing the execution result, output, and file path. 
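
        Example (an illustrative sketch; assumes the conda environment from .env is set up):
            result = execute_code.invoke({
                "input_code": "print('hello')",
                "codefile_name": "hello.py"
            })
            # result["output"] holds the captured stdout on success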
58 | """ 59 | try: 60 | # Ensure WORKING_DIRECTORY exists 61 | os.makedirs(WORKING_DIRECTORY, exist_ok=True) 62 | 63 | # Handle codefile_name, ensuring it's a valid path 64 | if os.path.isabs(codefile_name): 65 | code_file_path = codefile_name 66 | else: 67 | if WORKING_DIRECTORY not in codefile_name: 68 | code_file_path = os.path.join(WORKING_DIRECTORY, codefile_name) 69 | else: 70 | code_file_path = codefile_name 71 | 72 | # Normalize the path for the current platform 73 | code_file_path = os.path.normpath(code_file_path) 74 | 75 | logger.info(f"Code will be written to file: {code_file_path}") 76 | 77 | # Write the code to the file with UTF-8 encoding 78 | with open(code_file_path, 'w', encoding='utf-8') as code_file: 79 | code_file.write(input_code) 80 | 81 | logger.info(f"Code has been written to file: {code_file_path}") 82 | 83 | # Get platform-specific command 84 | python_cmd = f"python {codefile_name}" 85 | full_command, shell, executable = get_platform_specific_command(python_cmd) 86 | 87 | logger.info(f"Executing command: {full_command}") 88 | 89 | # Execute the code 90 | result = subprocess.run( 91 | full_command, 92 | shell=shell, 93 | capture_output=True, 94 | text=True, 95 | executable=executable, 96 | cwd=WORKING_DIRECTORY 97 | ) 98 | 99 | # Capture standard output and error output 100 | output = result.stdout 101 | error_output = result.stderr 102 | 103 | if result.returncode == 0: 104 | logger.info("Code executed successfully") 105 | return { 106 | "result": "Code executed successfully", 107 | "output": output + "\n\nIf you have completed all tasks, respond with FINAL ANSWER.", 108 | "file_path": code_file_path 109 | } 110 | else: 111 | logger.error(f"Code execution failed: {error_output}") 112 | return { 113 | "result": "Failed to execute", 114 | "error": error_output, 115 | "file_path": code_file_path 116 | } 117 | except Exception as e: 118 | logger.exception("An error occurred while executing code") 119 | return { 120 | "result": "Error occurred", 121 | "error": str(e), 122 | "file_path": code_file_path if 'code_file_path' in locals() else "Unknown" 123 | } 124 | 125 | @tool 126 | def execute_command( 127 | command: Annotated[str, "Command to be executed."] 128 | ) -> Annotated[str, "Output of the command."]: 129 | """ 130 | Execute a command in a specified Conda environment and return its output. 131 | 132 | This function activates a Conda environment, executes the given command, 133 | and returns the output or any errors encountered during execution. 134 | Please use pip to install the package. 135 | 136 | Args: 137 | command (str): The command to be executed in the Conda environment. 138 | 139 | Returns: 140 | str: The output of the command or an error message. 
141 | """ 142 | try: 143 | # Get platform-specific command 144 | full_command, shell, executable = get_platform_specific_command(command) 145 | 146 | logger.info(f"Executing command: {command}") 147 | 148 | # Execute the command and capture the output 149 | result = subprocess.run( 150 | full_command, 151 | shell=shell, 152 | check=True, 153 | stdout=subprocess.PIPE, 154 | stderr=subprocess.PIPE, 155 | text=True, 156 | executable=executable, 157 | cwd=WORKING_DIRECTORY 158 | ) 159 | logger.info("Command executed successfully") 160 | return result.stdout 161 | except subprocess.CalledProcessError as e: 162 | logger.error(f"Error executing command: {e.stderr}") 163 | return f"Error: {e.stderr}" 164 | 165 | logger.info("Module initialized successfully") 166 | -------------------------------------------------------------------------------- /tools/internet.py: -------------------------------------------------------------------------------- 1 | import os 2 | from langchain_core.tools import tool 3 | from langchain_community.document_loaders import WebBaseLoader, FireCrawlLoader 4 | from selenium import webdriver 5 | from selenium.webdriver.chrome.options import Options 6 | from selenium.webdriver.chrome.service import Service 7 | from typing import Annotated, List 8 | from bs4 import BeautifulSoup 9 | from logger import setup_logger 10 | from load_cfg import FIRECRAWL_API_KEY,CHROMEDRIVER_PATH 11 | # Set up logger 12 | logger = setup_logger() 13 | 14 | @tool 15 | def google_search(query: Annotated[str, "The search query to use"]) -> str: 16 | """ 17 | Perform a Google search based on the given query and return the top 5 results. 18 | 19 | This function uses Selenium to perform a headless Google search and BeautifulSoup to parse the results. 20 | 21 | Args: 22 | query (str): The search query to use. 23 | 24 | Returns: 25 | str: A string containing the titles, snippets, and links of the top 5 search results. 26 | 27 | Raises: 28 | Exception: If there's an error during the search process. 29 | """ 30 | try: 31 | logger.info(f"Performing Google search for query: {query}") 32 | chrome_options = Options() 33 | chrome_options.add_argument("--headless") 34 | chrome_options.add_argument("--no-sandbox") 35 | chrome_options.add_argument("--disable-dev-shm-usage") 36 | service = Service(CHROMEDRIVER_PATH) 37 | 38 | with webdriver.Chrome(options=chrome_options, service=service) as driver: 39 | url = f"https://www.google.com/search?q={query}" 40 | logger.debug(f"Accessing URL: {url}") 41 | driver.get(url) 42 | html = driver.page_source 43 | 44 | soup = BeautifulSoup(html, 'html.parser') 45 | search_results = soup.select('.g') 46 | search = "" 47 | for result in search_results[:5]: 48 | title_element = result.select_one('h3') 49 | title = title_element.text if title_element else 'No Title' 50 | snippet_element = result.select_one('.VwiC3b') 51 | snippet = snippet_element.text if snippet_element else 'No Snippet' 52 | link_element = result.select_one('a') 53 | link = link_element['href'] if link_element else 'No Link' 54 | search += f"{title}\n{snippet}\n{link}\n\n" 55 | 56 | logger.info("Google search completed successfully") 57 | return search 58 | except Exception as e: 59 | logger.error(f"Error during Google search: {str(e)}") 60 | return f'Error: {e}' 61 | @tool 62 | def scrape_webpages(urls: Annotated[List[str], "List of URLs to scrape"]) -> str: 63 | """ 64 | Scrape the provided web pages for detailed information using WebBaseLoader. 
65 | 66 | This function uses the WebBaseLoader to load and scrape the content of the provided URLs. 67 | 68 | Args: 69 | urls (List[str]): A list of URLs to scrape. 70 | 71 | Returns: 72 | str: A string containing the concatenated content of all scraped web pages. 73 | 74 | Raises: 75 | Exception: If there's an error during the scraping process. 76 | """ 77 | try: 78 | logger.info(f"Scraping webpages: {urls}") 79 | loader = WebBaseLoader(urls) 80 | docs = loader.load() 81 | content = "\n\n".join([f'\n{doc.page_content}\n' for doc in docs]) 82 | logger.info("Webpage scraping completed successfully") 83 | return content 84 | except Exception as e: 85 | logger.error(f"Error during webpage scraping: {str(e)}") 86 | raise # Re-raise the exception to be caught by the calling function 87 | @tool 88 | def FireCrawl_scrape_webpages(urls: Annotated[List[str], "List of URLs to scrape"]) -> str: 89 | """ 90 | Scrape the provided web pages for detailed information using FireCrawlLoader. 91 | 92 | This function uses the FireCrawlLoader to load and scrape the content of the provided URLs. 93 | 94 | Args: 95 | urls (List[str]): A list of URLs to scrape. 96 | 97 | Returns: 98 | str: The concatenated page content of all scraped URLs. 99 | 100 | Raises: 101 | Exception: If there's an error during the scraping process or if the API key is not set. 102 | """ 103 | if not FIRECRAWL_API_KEY: 104 | raise ValueError("FireCrawl API key is not set") 105 | 106 | try: 107 | logger.info(f"Scraping webpages using FireCrawl: {urls}") 108 | docs = [] 109 | # FireCrawlLoader takes a single URL, so scrape each page in turn 110 | for url in urls: 111 | loader = FireCrawlLoader(api_key=FIRECRAWL_API_KEY, url=url, mode="scrape") 112 | docs.extend(loader.load()) 113 | result = "\n\n".join(doc.page_content for doc in docs) 114 | logger.info("FireCrawl scraping completed successfully") 115 | return result 116 | except Exception as e: 117 | logger.error(f"Error during FireCrawl scraping: {str(e)}") 118 | raise # Re-raise the exception to be caught by the calling function 119 | @tool 120 | def scrape_webpages_with_fallback(urls: Annotated[List[str], "List of URLs to scrape"]) -> str: 121 | """ 122 | Attempt to scrape webpages using FireCrawl, falling back to WebBaseLoader if unsuccessful. 123 | 124 | Args: 125 | urls (List[str]): A list of URLs to scrape. 126 | 127 | Returns: 128 | str: The scraped content from either FireCrawl or WebBaseLoader. 129 | """ 130 | try: 131 | return FireCrawl_scrape_webpages.invoke({"urls": urls}) 132 | except Exception as e: 133 | logger.warning(f"FireCrawl scraping failed: {str(e)}. Falling back to WebBaseLoader.") 134 | try: 135 | return scrape_webpages.invoke({"urls": urls}) 136 | except Exception as e: 137 | logger.error(f"Both scraping methods failed. Error: {str(e)}") 138 | return f"Error: Unable to scrape webpages using both methods. {str(e)}" 139 | 140 | logger.info("Web scraping tools initialized") --------------------------------------------------------------------------------
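
A minimal usage sketch (mirroring the entry point in main.py; assumes the .env values above are configured and an OnlineSalesData.csv file is available in the working directory):

    from main import MultiAgentSystem

    system = MultiAgentSystem()
    system.run('''
datapath:OnlineSalesData.csv
Use machine learning to perform data analysis and write complete graphical reports
''')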